├── .gitignore
├── 13-jss.Rproj
├── README.md
├── case-study
│   ├── case-study.r
│   ├── counts-all.tex
│   ├── counts-disease.tex
│   ├── counts-prop.tex
│   ├── counts.tex
│   ├── deaths.rds
│   ├── icd-main.csv
│   ├── n-dist-log.pdf
│   ├── n-dist-raw.pdf
│   ├── n-dist-resid.pdf
│   ├── overall.pdf
│   ├── raw.tex
│   ├── unusual-big.pdf
│   └── unusual-sml.pdf
├── correspondence
│   ├── .gitignore
│   ├── 0-submission.pdf
│   ├── 1-review-1.pdf
│   ├── 1-review-2.pdf
│   ├── 1-review-3.txt
│   ├── 1-review-ae.txt
│   ├── 2-response.md
│   ├── 3-post-response.md
│   └── 3-review-post.txt
├── data
│   ├── billboard-clean.tex
│   ├── billboard-rank.tex
│   ├── billboard-raw.tex
│   ├── billboard-song.tex
│   ├── billboard.csv
│   ├── billboard.r
│   ├── melt-output.tex
│   ├── melt-raw.tex
│   ├── melt.r
│   ├── pew-clean.tex
│   ├── pew-raw.tex
│   ├── pew.r
│   ├── pew.sav
│   ├── preg-raw-1.tex
│   ├── preg-raw-2.tex
│   ├── preg-tidy.tex
│   ├── preg.R
│   ├── read-fwf.r
│   ├── tb-clean-1.tex
│   ├── tb-clean-2.tex
│   ├── tb-raw.tex
│   ├── tb.csv
│   ├── tb.r
│   ├── weather-clean-1.tex
│   ├── weather-clean-2.tex
│   ├── weather-raw.tex
│   ├── weather.r
│   ├── weather.txt
│   └── xtable.r
├── jss.bst
├── jss.cls
├── jss.dtx
├── jsslogo.jpg
├── model-1.tex
├── model-2.tex
├── references.bib
├── t-test.r
└── tidy-data.tex

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | tidy-data.pdf
 2 | .Rhistory
 3 | *.zip
 4 | .Rproj.user

--------------------------------------------------------------------------------
/13-jss.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Tidy data
 2 | 
 3 | Files available in this package:
 4 | 
 5 | * `tidy-data.tex` and `tidy-data.pdf`: the LaTeX source used to generate the
 6 |   paper, and the resulting PDF.
 7 | 
 8 | * `data/`: raw datasets, the code to tidy them, and the results, as used in
 9 |   Section 3. Source the individual `.R` files to recreate the tidied data.
10 | 
11 | * `t-test.r`: code used to generate Table 14 (`model-1.tex` and `model-2.tex`),
12 |   comparing the data needed for a paired t-test with a mixed effects model.
13 | 
14 | * `case-study/`: the code and data for the case study in Section 5.
15 |   Run `case-study.r` to recreate all tables and plots.
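For example, a fresh checkout can be rebuilt from the repository root as follows (a minimal sketch; it assumes the packages the scripts load — reshape2, plyr, stringr, ggplot2, lubridate, foreign, xtable, and MASS — are installed):

```r
# Recreate the tidied datasets and LaTeX tables used in Section 3.
setwd("data")
source("billboard.r")  # writes billboard-*.tex
source("pew.r")        # writes pew-*.tex
source("tb.r")         # writes tb-*.tex
source("weather.r")    # writes weather-*.tex
source("preg.R")       # writes preg-*.tex

# Recreate the case study tables and plots from Section 5.
setwd("../case-study")
source("case-study.r")
```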
--------------------------------------------------------------------------------
/case-study/case-study.r:
--------------------------------------------------------------------------------
 1 | options(stringsAsFactors = FALSE)
 2 | library(reshape2)
 3 | library(ggplot2)
 4 | library(plyr)
 5 | library(stringr)
 6 | library(MASS)
 7 | source("../data/xtable.r")
 8 | 
 9 | if (!file.exists("deaths.rds")) {
10 |   src <- "https://github.com/hadley/mexico-mortality/raw/master/deaths/deaths08.csv.bz2"
11 |   download.file(src, "deaths08.csv.bz2", quiet = TRUE)
12 | 
13 |   deaths <- read.csv("deaths08.csv.bz2")
14 |   unlink("deaths08.csv.bz2")
15 |   deaths$hod[deaths$hod == 99] <- NA  # 99 codes an unknown hour of death
16 |   deaths$hod[deaths$hod == 24] <- 0   # midnight is recoded from 24 to 0...
17 |   deaths$hod[deaths$hod == 0] <- NA   # ...and hour 0 is then treated as missing (so recoded midnights are dropped too)
18 |   deaths$hod <- as.integer(deaths$hod)
19 |   deaths <- arrange(deaths, yod, mod, dod, hod, cod)
20 |   deaths <- deaths[c("yod", "mod", "dod", "hod", "cod")]
21 | 
22 |   saveRDS(deaths, "deaths.rds")
23 | }
24 | 
25 | deaths <- readRDS("deaths.rds")
26 | 
27 | ok <- subset(deaths, yod == 2008 & mod != 0 & dod != 0)
28 | xtable(ok[c(1, 1:14 * 2000), c("yod", "mod", "dod", "hod", "cod")],
29 |   "raw.tex")
30 | 
31 | codes <- read.csv("icd-main.csv")
32 | codes$disease <- sapply(codes$disease, function(x)
33 |   str_c(strwrap(x, width = 30), collapse = "\n"))
34 | names(codes)[1] <- "cod"
35 | codes <- codes[!duplicated(codes$cod), ]
36 | 
37 | # Display overall hourly deaths
38 | hod_all <- subset(count(deaths, "hod"), !is.na(hod))
39 | qplot(hod, freq, data = hod_all, geom = "line") +
40 |   scale_y_continuous("Number of deaths", labels = function(x) format(x, big.mark = ",")) +
41 |   xlab("Hour of day")
42 | ggsave("overall.pdf", width = 10, height = 6)
43 | 
44 | # Count deaths per hour, per disease
45 | hod2 <- count(deaths, c("cod", "hod"))
46 | hod2 <- subset(hod2, !is.na(hod))
47 | hod2 <- join(hod2, codes)
48 | hod2 <- ddply(hod2, "cod", transform, prop = freq / sum(freq))  # within-cause hourly proportions
49 | 
50 | # Compare to overall abundance
51 | overall <- ddply(hod2, "hod", summarise, freq_all = sum(freq))
52 | overall <- mutate(overall, prop_all = freq_all / sum(freq_all))
53 | 
54 | hod2 <- join(overall, hod2, by = "hod")
55 | 
56 | # Pick better subset of rows to show
57 | cods <- join(arrange(count(deaths, "cod"), desc(freq)), codes)
58 | mutate(tail(subset(cods, freq > 100), 30), disease = str_sub(disease, 1, 30))  # inspected interactively; result not stored
59 | 
60 | hod3 <- subset(hod2, cod %in% c("I21", "N18", "E84", "B16") & hod >= 8 & hod <= 12)[1:15, c("hod", "cod", "disease", "freq", "prop", "freq_all", "prop_all")]
61 | 
62 | xtable(hod3[c("hod", "cod", "freq")], "counts.tex")
63 | xtable(hod3[c("disease")], "counts-disease.tex")
64 | xtable(hod3[5], "counts-prop.tex")
65 | xtable(hod3[6:7], "counts-all.tex")
66 | 
67 | devi <- ddply(hod2, "cod", summarise, n = sum(freq),
68 |   dist = mean((prop - prop_all)^2))  # mean squared deviation from the overall hourly pattern
69 | devi <- subset(devi, n > 50)
70 | 
71 | # Find outliers
72 | xlog10 <- scale_x_log10(
73 |   breaks = c(100, 1000, 10000),
74 |   labels = c(100, 1000, 10000),
75 |   minor_breaks = log10(outer(1:9, 10^(1:5), "*")))
76 | ylog10 <- scale_y_log10(
77 |   breaks = 10 ^ -c(3, 4, 5),
78 |   labels = c("0.001", "0.0001", "0.00001"),
79 |   minor_breaks = log10(outer(1:9, 10^-(3:6), "*")))
80 | 
81 | qplot(n, dist, data = devi)
82 | ggsave("n-dist-raw.pdf", width = 6, height = 6)
83 | qplot(n, dist, data = devi) +
84 |   geom_smooth(method = "rlm", se = FALSE) +
85 |   xlog10 +
86 |   ylog10
87 | ggsave("n-dist-log.pdf", width = 6, height = 6)
88 | 
89 | devi$resid <- resid(rlm(log(dist) ~ log(n), data = devi))  # robust fit; large residuals flag unusual causes
90 | coef(rlm(log(dist) ~ 
log(n), data = devi)) 91 | ggplot(devi, aes(n, resid)) + 92 | geom_hline(yintercept = 1.5, colour = "grey50") + 93 | geom_point() + 94 | xlog10 95 | ggsave("n-dist-resid.pdf", width = 6, height = 6) 96 | 97 | unusual <- subset(devi, resid > 1.5) 98 | hod_unusual_big <- match_df(hod2, subset(unusual, n > 350)) 99 | hod_unusual_sml <- match_df(hod2, subset(unusual, n <= 350)) 100 | 101 | # Visualise unusual causes of death 102 | ggplot(hod_unusual_big, aes(hod, prop)) + 103 | geom_line(aes(y = prop_all), data = overall, colour = "grey50") + 104 | geom_line() + 105 | facet_wrap(~ disease, ncol = 3) 106 | ggsave("unusual-big.pdf", width = 8, height = 6) 107 | last_plot() %+% hod_unusual_sml 108 | ggsave("unusual-sml.pdf", width = 8, height = 4) 109 | -------------------------------------------------------------------------------- /case-study/counts-all.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{rr} 2 | \toprule 3 | freq\_all & prop\_all \\ 4 | \midrule 5 | 21915 & 0.04 \\ 6 | 21915 & 0.04 \\ 7 | 21915 & 0.04 \\ 8 | 21915 & 0.04 \\ 9 | 22401 & 0.04 \\ 10 | 22401 & 0.04 \\ 11 | 22401 & 0.04 \\ 12 | 22401 & 0.04 \\ 13 | 24321 & 0.05 \\ 14 | 24321 & 0.05 \\ 15 | 24321 & 0.05 \\ 16 | 24321 & 0.05 \\ 17 | 23843 & 0.05 \\ 18 | 23843 & 0.05 \\ 19 | 23843 & 0.05 \\ 20 | \bottomrule 21 | \end{tabular} 22 | -------------------------------------------------------------------------------- /case-study/counts-disease.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{l} 2 | \toprule 3 | disease \\ 4 | \midrule 5 | Acute hepatitis B \\ 6 | Cystic fibrosis \\ 7 | Acute myocardial infarction \\ 8 | Chronic renal failure \\ 9 | Acute hepatitis B \\ 10 | Cystic fibrosis \\ 11 | Acute myocardial infarction \\ 12 | Chronic renal failure \\ 13 | Acute hepatitis B \\ 14 | Cystic fibrosis \\ 15 | Acute myocardial infarction \\ 16 | Chronic renal failure \\ 17 | Acute hepatitis B \\ 18 | Cystic fibrosis \\ 19 | Acute myocardial infarction \\ 20 | \bottomrule 21 | \end{tabular} 22 | -------------------------------------------------------------------------------- /case-study/counts-prop.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{r} 2 | \toprule 3 | prop \\ 4 | \midrule 5 | 0.04 \\ 6 | 0.03 \\ 7 | 0.05 \\ 8 | 0.04 \\ 9 | 0.07 \\ 10 | 0.01 \\ 11 | 0.05 \\ 12 | 0.04 \\ 13 | 0.10 \\ 14 | 0.07 \\ 15 | 0.05 \\ 16 | 0.04 \\ 17 | 0.06 \\ 18 | 0.03 \\ 19 | 0.05 \\ 20 | \bottomrule 21 | \end{tabular} 22 | -------------------------------------------------------------------------------- /case-study/counts.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{rlr} 2 | \toprule 3 | hod & cod & freq \\ 4 | \midrule 5 | 8 & B16 & 4 \\ 6 | 8 & E84 & 3 \\ 7 | 8 & I21 & 2205 \\ 8 | 8 & N18 & 315 \\ 9 | 9 & B16 & 7 \\ 10 | 9 & E84 & 1 \\ 11 | 9 & I21 & 2209 \\ 12 | 9 & N18 & 333 \\ 13 | 10 & B16 & 10 \\ 14 | 10 & E84 & 7 \\ 15 | 10 & I21 & 2434 \\ 16 | 10 & N18 & 343 \\ 17 | 11 & B16 & 6 \\ 18 | 11 & E84 & 3 \\ 19 | 11 & I21 & 2128 \\ 20 | \bottomrule 21 | \end{tabular} 22 | -------------------------------------------------------------------------------- /case-study/deaths.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadley/tidy-data/5623c7a302977e8c88f623a08556d5c08e940d8c/case-study/deaths.rds 
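`deaths.rds` above is the cached copy of the cleaned mortality data that `case-study.r` creates on its first run; a minimal sketch of fetching and inspecting it directly (using the URL listed above):

```r
# Fetch the cached dataset and look at its structure.
download.file(
  "https://raw.githubusercontent.com/hadley/tidy-data/5623c7a302977e8c88f623a08556d5c08e940d8c/case-study/deaths.rds",
  "deaths.rds", mode = "wb")
deaths <- readRDS("deaths.rds")
str(deaths)  # yod, mod, dod, hod: year/month/day/hour of death; cod: cause-of-death code
```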
-------------------------------------------------------------------------------- /case-study/n-dist-log.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadley/tidy-data/5623c7a302977e8c88f623a08556d5c08e940d8c/case-study/n-dist-log.pdf -------------------------------------------------------------------------------- /case-study/n-dist-raw.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadley/tidy-data/5623c7a302977e8c88f623a08556d5c08e940d8c/case-study/n-dist-raw.pdf -------------------------------------------------------------------------------- /case-study/n-dist-resid.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadley/tidy-data/5623c7a302977e8c88f623a08556d5c08e940d8c/case-study/n-dist-resid.pdf -------------------------------------------------------------------------------- /case-study/overall.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadley/tidy-data/5623c7a302977e8c88f623a08556d5c08e940d8c/case-study/overall.pdf -------------------------------------------------------------------------------- /case-study/raw.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{rrrrl} 2 | \toprule 3 | yod & mod & dod & hod & cod \\ 4 | \midrule 5 | 2008 & 1 & 1 & 1 & B20 \\ 6 | 2008 & 1 & 2 & 4 & I67 \\ 7 | 2008 & 1 & 3 & 8 & I50 \\ 8 | 2008 & 1 & 4 & 12 & I50 \\ 9 | 2008 & 1 & 5 & 16 & K70 \\ 10 | 2008 & 1 & 6 & 18 & I21 \\ 11 | 2008 & 1 & 7 & 20 & I21 \\ 12 | 2008 & 1 & 8 & --- & K74 \\ 13 | 2008 & 1 & 10 & 5 & K74 \\ 14 | 2008 & 1 & 11 & 9 & I21 \\ 15 | 2008 & 1 & 12 & 15 & I25 \\ 16 | 2008 & 1 & 13 & 20 & R54 \\ 17 | 2008 & 1 & 15 & 2 & I61 \\ 18 | 2008 & 1 & 16 & 7 & I21 \\ 19 | 2008 & 1 & 17 & 13 & I21 \\ 20 | \bottomrule 21 | \end{tabular} 22 | -------------------------------------------------------------------------------- /case-study/unusual-big.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadley/tidy-data/5623c7a302977e8c88f623a08556d5c08e940d8c/case-study/unusual-big.pdf -------------------------------------------------------------------------------- /case-study/unusual-sml.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadley/tidy-data/5623c7a302977e8c88f623a08556d5c08e940d8c/case-study/unusual-sml.pdf -------------------------------------------------------------------------------- /correspondence/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /correspondence/0-submission.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadley/tidy-data/5623c7a302977e8c88f623a08556d5c08e940d8c/correspondence/0-submission.pdf -------------------------------------------------------------------------------- /correspondence/1-review-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadley/tidy-data/5623c7a302977e8c88f623a08556d5c08e940d8c/correspondence/1-review-1.pdf 
--------------------------------------------------------------------------------
/correspondence/1-review-2.pdf:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/hadley/tidy-data/5623c7a302977e8c88f623a08556d5c08e940d8c/correspondence/1-review-2.pdf

--------------------------------------------------------------------------------
/correspondence/1-review-3.txt:
--------------------------------------------------------------------------------
 1 | Review of “Tidy Data” by Hadley Wickham
 2 | 
 3 | I think this is a useful and interesting paper, which, considering the widespread use of the author’s ddply and ggplot R functions (and the many questions about them on the R newsgroup), should be of interest to a large number of readers.
 4 | 
 5 | The author is obviously intimately familiar with the tools he’s using in this paper, and he should remember that the readers of this paper are not nearly as familiar with them as he is.
 6 | 
 7 | Section 2
 8 | 
 9 | “When using tables for communication, this so-called “Alabama first” (Wainer 2000) ordering should be abandoned, and replaced with an ordering based on a meaningful variable.”
10 | 
11 | There doesn’t seem to be any justification or explanation for this statement.
12 | 
13 | Section 3.1
14 | 
15 | Since most readers of this paper will want to apply these ideas using R, I think it would be a good idea to show how R datasets containing the Pew and Billboard data would look, as well as the R code that would be used to melt the datasets. This is especially true for the Billboard data, because additional cleaning was performed. Similar comments apply to the other datasets in the paper.
16 | 
17 | Section 3.5
18 | 
19 | I feel the examples in this section need to be explained in more detail. The referenced external links contain a large number of files, and it’s not really clear which files are relevant to the discussion in this section.
20 | 
21 | Section 4.1
22 | 
23 | “Compare this to the difficulty of combining datasets stored in arrays; these typically require painstaking alignment before matrix operations can be used, and errors can be very hard to detect.”
24 | 
25 | Would anyone seriously suggest using matrices to store data which would be combined with other data sets? The use of an index to combine multiple datasets doesn’t really seem to be related to tidy data.
26 | 
27 | Section 4.3
28 | 
29 | I think readers of this paper might find examples from Stata more relevant than those from SPSS.
30 | 
31 | Section 5
32 | 
33 | I was not able to download the data for the case study from https://raw.github.com/hadley/tidy-data/master/case-study/deaths.rdata; I received a 403 Forbidden error.
34 | 
35 | I think there should be a more thorough discussion explaining the goal of analyzing this data set. I’m not familiar with the term “time course”, and the single sentence
36 | 
37 | “The case study uses individual-level mortality data from Mexico, with the goal of finding causes of death that have notably different time patterns within a day.”
38 | 
39 | really didn’t clarify the goal sufficiently for me. Perhaps an example of an unusual time course would help to clarify things.

--------------------------------------------------------------------------------
/correspondence/1-review-ae.txt:
--------------------------------------------------------------------------------
 1 | JSS 1090: Tidy Data
 2 | 
 3 | Hadley is well-known and well-regarded in the field and
 4 | at JSS. I was fortunate to find three very different
 5 | reviewers and, of course, looked through the paper myself.
 6 | I do not provide my own review.
 7 | 
 8 | This paper has obvious potential. I think the reviewers
 9 | have many good comments (though of course Hadley may disagree
10 | with some of them and act accordingly) that will improve the
11 | paper. I'm simply going to assume that Hadley will revise
12 | and resubmit.

--------------------------------------------------------------------------------
/correspondence/2-response.md:
--------------------------------------------------------------------------------
 1 | I really appreciate the thoughtful comments of all the reviewers. I've
 2 | worked hard to make the paper easier to understand, and proofread it
 3 | several more times. I include a point-by-point summary of the reviewers'
 4 | comments and my rebuttal below. Minor problems fixed without needing
 5 | further discussion are listed at the end.
 6 | 
 7 | Reviewer 1
 8 | ==========
 9 | 
10 | > p. 1 last full sentence: "The reorganization makes...because it
11 | > conforms to a standard that facilitates well an initial exploration
12 | > and analysis of the data; you don't..."
13 | 
14 | I'm not quite sure what the reviewer meant here, but I've rewritten the
15 | entire paragraph for clarity.
16 | 
17 | > p. 2 line 2: Is "reformulating" an approximation for "munging"?
18 | 
19 | Yes, but I think munging better conveys the inelegance of this
20 | operation :)
21 | 
22 | > p. 3 bottom. Consider having a new bold subtitle as a guidepost for
23 | > the reader: Defining tidy data:
24 | 
25 | I've completely reorganised this section and it should be much more
26 | clear.
27 | 
28 | > p. 14 last paragraph. A difficulty is identified; does the author
29 | > suggest a solution? Reader isn't clear where we are left as we enter
30 | > the Case Study.
31 | 
32 | I'm not currently aware of any packages that resolve this problem, and
33 | I've added a note to that effect.
34 | 
35 | > p. 21 down 13: might be clearer if "...an efficient equivalent to
36 | > join." – bolding or perhaps quotes.
37 | 
38 | I've removed those comments because I no longer think that they are
39 | true.
40 | 
41 | > This reviewer notes (in a spirit of collegiality) that the best
42 | > writers about data analysis (from a previous generation) – Fred
43 | > Mosteller, John Tukey, David Hoaglin, Paul Velleman, Frank Anscombe
44 | > (as illustrations) – correctly used data as plural form (and
45 | > occasionally datum as singular). From that perspective, "the data is
46 | > ..." grates more than a little ... but regrettably this is a battle
47 | > that is now lost. Sigh... :) :)
48 | 
49 | I tried changing "data is" to "data are", and found the result grating.
50 | I think you're right that the battle is lost.
51 | 
52 | Reviewer 2
53 | ==========
54 | 
55 | > While the topic is important and the message is clear, I found the
56 | > tone of the paper to be too conversational. The author overuses
57 | > contractions, ambiguous pronouns ("It's often said..."), and casually
58 | > addresses the reader in second-person form. (R2)
59 | 
60 | I have largely kept the conversational style of the paper, while trying
61 | to make it easier to understand. I don't think it's necessary for a
62 | scientific paper to be too formal. This work appeals more to
63 | practitioners of data analysis than academic statisticians, so I think
64 | it is particularly important that the writing is approachable.
 65 | 
 66 | > The author also switches between the use of italics, boldface, quotes,
 67 | > and the LaTeX verbatim environment constantly, which tends to be
 68 | > distracting at times. (R2) - The author uses "colvars" in italics when
 69 | > first naming the terminology, and then subsequently mentions the Pew
 70 | > dataset contains one colvar, without italics. (R2) - The author does
 71 | > not use formatting on the column names in the caption for Table 9 but
 72 | > does use \verbatim to reference column names in Table 10's caption.
 73 | > (R2)
 74 | 
 75 | Thanks for pointing this out. Throughout the text I now use boldface
 76 | consistently for the first definition of a new term, and verbatim for
 77 | variable names.
 78 | 
 79 | > The author overuses colons. In some cases, I found hyphens to be more
 80 | > appropriate, and in others, I would think semicolons might be more
 81 | > suitable.
 82 | 
 83 | I've reviewed all colons and changed many to other forms of punctuation.
 84 | 
 85 | Reviewer 3
 86 | ==========
 87 | 
 88 | > The author is obviously intimately familiar with the tools he's using
 89 | > in this paper, and he should remember that the readers of this paper
 90 | > are not nearly as familiar with them as he is. (R3)
 91 | 
 92 | I have tried to clarify terms and explain things more clearly
 93 | throughout the paper.
 94 | 
 95 | > Section 2: "When using tables for communication, this so-called
 96 | > "Alabama first" (Wainer 2000) ordering should be abandoned, and
 97 | > replaced with an ordering based on a meaningful variable."
 98 | >
 99 | > There doesn't seem to be any justification or explanation for this
100 | > statement. (R3)
101 | 
102 | Agreed, and I've removed it.
103 | 
104 | > Section 3.1: Since most readers of this paper will want to apply these
105 | > ideas using R, I think it would be a good idea to show how R
106 | > datasets containing the Pew and Billboard data would look, as well as
107 | > the R code that would be used to melt the datasets. This is especially
108 | > true for the Billboard data, because additional cleaning was
109 | > performed. Similar comments apply to the other datasets in the paper.
110 | > (R3)
111 | 
112 | I think including the code in the paper would make it too long, but it
113 | is all available at the paper website at
114 | https://github.com/hadley/tidy-data
115 | 
116 | > Section 3.5: I feel the examples in this section need to be explained
117 | > in more detail. The referenced external links contain a large number
118 | > of files, and it's not really clear which files are relevant to the
119 | > discussion in this section. (R3)
120 | 
121 | I've added a brief snippet of R code to illustrate how you might tackle
122 | this.
123 | 
124 | > Section 4.1: "Compare this to the difficulty of combining datasets
125 | > stored in arrays; these typically require painstaking alignment before
126 | > matrix operations can be used, and errors can be very hard to detect."
127 | > Would anyone seriously suggest using matrices to store data which
128 | > would be combined with other data sets? The use of an index to combine
129 | > multiple datasets doesn't really seem to be related to tidy data. (R3)
130 | 
131 | Right, and I've now removed this sentence.
132 | 
133 | > Section 5: I was not able to download the data for the case study from
134 | > https://raw.github.com/hadley/tidy-data/master/case-study/deaths.rdata;
135 | > I received a 403 Forbidden error. (R3)
136 | 
137 | A smaller version that you can download is now at
138 | https://raw.github.com/hadley/tidy-data/master/case-study/deaths.rds
139 | 
140 | > I think there should be a more thorough discussion explaining the goal
141 | > of analyzing this data set. I'm not familiar with the term "time
142 | > course", and the single sentence "The case study uses individual-level
143 | > mortality data from Mexico, with the goal of finding causes of death
144 | > that have notably different time patterns within a day." really didn't
145 | > clarify the goal sufficiently for me. Perhaps an example of an unusual
146 | > time course would help to clarify things. (R3)
147 | 
148 | I've added a figure showing the temporal pattern over all causes of
149 | death, and tweaked the explanation. Hopefully this is now clear.
150 | 
151 | Minor problems
152 | ==============
153 | 
154 | The following minor problems reported by the reviewers were resolved
155 | without need for discussion.
156 | 
157 | > Abstract: "take in and take out" "input and output"
158 | 
159 | > p. 1 up 6: "subset" -\> "component"
160 | 
161 | > p. 2 line 9: -\> "an extension..."
162 | 
163 | > p. 2, 5 up from subtitle: "...techniques with real examples." --
164 | > could do nicely.
165 | 
166 | > p. 2, 1 up from subtitle "...misses and what other approaches might
167 | > be fruitful to pursue."
168 | 
169 | > p. 3 2/3 down "...were were..."
170 | 
171 | > p. 10, up 2. Sentence is garbled.
172 | 
173 | > p. 11 bottom: (left) and (right) may not be the apt descriptors...
174 | 
175 | > p. 12, line 9: comma after file name?
176 | 
177 | > p. 12: This reviewer would prefer to avoid "hopefully" about 13 lines
178 | > down. More substantively, the sentence is garbled.
179 | 
180 | > p. 13 middle "...by the by preposition." Should the second "by" be
181 | > bolded? Or in quotes? The same question arises again.
182 | 
183 | > p. 17 middle -\> "diseases we work with have..."
184 | 
185 | > p. 19 last line "...seem like they should ..." -\> "may"
186 | 
187 | > Section 4.3: I think readers of this paper might find examples from
188 | > Stata more relevant than those from SPSS. (R3)
189 | 
190 | > The last sentence in the first paragraph of the discussion needs to be
191 | > rephrased for clarity. (R2)
192 | 
193 | > Some tables also have unreadable characters in them, such as the
194 | > degree sign in Table 12's top panel (row 6's artist). Same issue with
195 | > Table 3's 5th religion. Table 12's caption also describes there being
196 | > a left and right dataset, whereas the subtables are clearly placed
197 | > above and below. (R2)
198 | 
199 | > Abstract: "variables are stored in columns, observations in rows, and
200 | > a single type of..." -- this list isn't logically homogeneous.
201 | 
202 | > p.5, 3.1 -- "The Pew Center is an American ... that collects data on
203 | > attitudes to topics RANGING from religion to the internet"
204 | 
205 | > "Billboard" should always be capitalized as it is a proper noun.
206 | 
207 | > p.13, section 4.2 - "Tidy visualization tools ONLY NEED to be..."
208 | 
209 | > p.16, 4-th line from bottom: "Next, we work out THE overall..."
210 | 
211 | > p.16, 2nd line from the bottom: "Then finally, WE join.."
212 | 
213 | > p.19, "The causes of death fall INTO three main groups: ... " There
214 | > should be a hyphen between "transportation" and "related".
215 | 
216 | > p. 19, 2nd paragraph of discussion: "This makes it easy"; remove
217 | > "is".
218 | 
219 | > p. 21, last sentence of first paragraph: "and A BETTER KNOWLEDGE OF
220 | > how we can best design tools..."
221 | 
222 | > p. 21, last paragraph before Acknowledgements "verifying experimental
223 | > design, AND filling in..."
224 | 
225 | > Author information, last page "Adjunct ASSISTANT Professor"

--------------------------------------------------------------------------------
/correspondence/3-post-response.md:
--------------------------------------------------------------------------------
 1 | > \proglang, \pkg and \code have been used for highlighting throughout
 2 | > the paper (including titles and references), except where explicitly escaped.
 3 | 
 4 | Done.
 5 | 
 6 | > All table row/column headers should be in sentence style
 7 | > (i.e. first letter capitalized).
 8 | 
 9 | All table columns represent variable names in data frames. I think it is confusing to have them in a different case, so I have left them as is.
10 | 
11 | > For bullet lists/itemized lists please use either a comma, semi-colon, or
12 | > period at the end of each item (e.g., see p. 3 item 2 and p. 5).
13 | 
14 | I have switched to period style throughout.
15 | 
16 | > Tables should fit in the normal text width of the manuscript,
17 | > e.g. see p. 6, 8, 11.
18 | 
19 | Adjusted.
20 | 
21 | > If using "e.g." and "i.e." add a comma after the period (see top of p. 4).
22 | 
23 | Fixed.
24 | 
25 | > References:
26 | > o Springer-Verlag (not: Springer)
27 | 
28 | Done.
29 | 
30 | > o Please make sure that all software packages are \cite{}'d properly.
31 | 
32 | Done.
33 | 
34 | > o All references should be in title style.
35 | 
36 | Done.
37 | 
38 | > o See FAQ for specific reference instructions.
39 | 
40 | Done.

--------------------------------------------------------------------------------
/correspondence/3-review-post.txt:
--------------------------------------------------------------------------------
 1 | JSS 1090: Wickham
 2 | 
 3 | Tidy Data
 4 | 
 5 | For further instruction on JSS style requirements please see the JSS style manual (in particular section 2.1 Style Checklist) at http://www.jstatsoft.org/downloads/JSSstyle.zip
 6 | Also see FAQ at: http://www.jstatsoft.org/style
 7 | For further references please see RECENT JSS papers for detailed documentation and examples.
 8 | 
 9 | Manuscript:
10 | o As a reminder, please make sure that:
11 |   - \proglang, \pkg and \code have been used for highlighting throughout
12 |     the paper (including titles and references), except where explicitly escaped.
13 | 
14 | o All table row/column headers should be in sentence style (i.e. first letter capitalized).
15 | 
16 | o For bullet lists/itemized lists please use either a comma, semi-colon, or period at the end of each item (e.g., see p. 3 item 2 and p. 5).
17 | 
18 | o Tables should fit in the normal text width of the manuscript, e.g. see p. 6, 8, 11.
19 | 
20 | o If using "e.g." and "i.e." add a comma after the period (see top of p. 4).
21 | 
22 | 
23 | References:
24 | o Springer-Verlag (not: Springer)
25 | o Please make sure that all software packages are \cite{}'d properly.
26 | o All references should be in title style.
27 | o See FAQ for specific reference instructions.
28 | 
29 | 
30 | Code:
31 | As a reminder, please make sure that all files needed to replicate the code/examples within the manuscript are included in a standalone replication script.
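For this repository, a hypothetical standalone runner satisfying that requirement could look like the sketch below (the file name `replication.R` and the ordering are illustrative, not part of the submission):

```r
# replication.R (hypothetical): regenerate every table and figure from a fresh checkout.
scripts <- c("data/billboard.r", "data/pew.r", "data/tb.r", "data/weather.r",
             "data/preg.R", "t-test.r", "case-study/case-study.r")
for (s in scripts) {
  message("Running ", s)
  source(s, chdir = TRUE)  # chdir: each script writes its outputs relative to its own folder
}
```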
32 | -------------------------------------------------------------------------------- /data/billboard-clean.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{rllllrr} 2 | \toprule 3 | year & artist & time & track & date & week & rank \\ 4 | \midrule 5 | 2000 & 2 Pac & 4:22 & Baby Don't Cry & 2000-02-26 & 1 & 87 \\ 6 | 2000 & 2 Pac & 4:22 & Baby Don't Cry & 2000-03-04 & 2 & 82 \\ 7 | 2000 & 2 Pac & 4:22 & Baby Don't Cry & 2000-03-11 & 3 & 72 \\ 8 | 2000 & 2 Pac & 4:22 & Baby Don't Cry & 2000-03-18 & 4 & 77 \\ 9 | 2000 & 2 Pac & 4:22 & Baby Don't Cry & 2000-03-25 & 5 & 87 \\ 10 | 2000 & 2 Pac & 4:22 & Baby Don't Cry & 2000-04-01 & 6 & 94 \\ 11 | 2000 & 2 Pac & 4:22 & Baby Don't Cry & 2000-04-08 & 7 & 99 \\ 12 | 2000 & 2Ge+her & 3:15 & The Hardest Part Of ... & 2000-09-02 & 1 & 91 \\ 13 | 2000 & 2Ge+her & 3:15 & The Hardest Part Of ... & 2000-09-09 & 2 & 87 \\ 14 | 2000 & 2Ge+her & 3:15 & The Hardest Part Of ... & 2000-09-16 & 3 & 92 \\ 15 | 2000 & 3 Doors Down & 3:53 & Kryptonite & 2000-04-08 & 1 & 81 \\ 16 | 2000 & 3 Doors Down & 3:53 & Kryptonite & 2000-04-15 & 2 & 70 \\ 17 | 2000 & 3 Doors Down & 3:53 & Kryptonite & 2000-04-22 & 3 & 68 \\ 18 | 2000 & 3 Doors Down & 3:53 & Kryptonite & 2000-04-29 & 4 & 67 \\ 19 | 2000 & 3 Doors Down & 3:53 & Kryptonite & 2000-05-06 & 5 & 66 \\ 20 | \bottomrule 21 | \end{tabular} 22 | -------------------------------------------------------------------------------- /data/billboard-rank.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{rlr} 2 | \toprule 3 | id & date & rank \\ 4 | \midrule 5 | 1 & 2000-02-26 & 87 \\ 6 | 1 & 2000-03-04 & 82 \\ 7 | 1 & 2000-03-11 & 72 \\ 8 | 1 & 2000-03-18 & 77 \\ 9 | 1 & 2000-03-25 & 87 \\ 10 | 1 & 2000-04-01 & 94 \\ 11 | 1 & 2000-04-08 & 99 \\ 12 | 2 & 2000-09-02 & 91 \\ 13 | 2 & 2000-09-09 & 87 \\ 14 | 2 & 2000-09-16 & 92 \\ 15 | 3 & 2000-04-08 & 81 \\ 16 | 3 & 2000-04-15 & 70 \\ 17 | 3 & 2000-04-22 & 68 \\ 18 | 3 & 2000-04-29 & 67 \\ 19 | 3 & 2000-05-06 & 66 \\ 20 | \bottomrule 21 | \end{tabular} 22 | -------------------------------------------------------------------------------- /data/billboard-raw.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{rllllrrr} 2 | \toprule 3 | year & artist & track & time & date.entered & wk1 & wk2 & wk3 \\ 4 | \midrule 5 | 2000 & 2 Pac & Baby Don't Cry & 4:22 & 2000-02-26 & 87 & 82 & 72 \\ 6 | 2000 & 2Ge+her & The Hardest Part Of ... & 3:15 & 2000-09-02 & 91 & 87 & 92 \\ 7 | 2000 & 3 Doors Down & Kryptonite & 3:53 & 2000-04-08 & 81 & 70 & 68 \\ 8 | 2000 & 98\verb|^|0 & Give Me Just One Nig... & 3:24 & 2000-08-19 & 51 & 39 & 34 \\ 9 | 2000 & A*Teens & Dancing Queen & 3:44 & 2000-07-08 & 97 & 97 & 96 \\ 10 | 2000 & Aaliyah & I Don't Wanna & 4:15 & 2000-01-29 & 84 & 62 & 51 \\ 11 | 2000 & Aaliyah & Try Again & 4:03 & 2000-03-18 & 59 & 53 & 38 \\ 12 | 2000 & Adams, Yolanda & Open My Heart & 5:30 & 2000-08-26 & 76 & 76 & 74 \\ 13 | \bottomrule 14 | \end{tabular} 15 | -------------------------------------------------------------------------------- /data/billboard-song.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{rlll} 2 | \toprule 3 | id & artist & track & time \\ 4 | \midrule 5 | 1 & 2 Pac & Baby Don't Cry & 4:22 \\ 6 | 2 & 2Ge+her & The Hardest Part Of ... 
& 3:15 \\
 7 | 3 & 3 Doors Down & Kryptonite & 3:53 \\
 8 | 4 & 3 Doors Down & Loser & 4:24 \\
 9 | 5 & 504 Boyz & Wobble Wobble & 3:35 \\
10 | 6 & 98\verb|^|0 & Give Me Just One Nig... & 3:24 \\
11 | 7 & A*Teens & Dancing Queen & 3:44 \\
12 | 8 & Aaliyah & I Don't Wanna & 4:15 \\
13 | 9 & Aaliyah & Try Again & 4:03 \\
14 | 10 & Adams, Yolanda & Open My Heart & 5:30 \\
15 | 11 & Adkins, Trace & More & 3:05 \\
16 | 12 & Aguilera, Christina & Come On Over Baby & 3:38 \\
17 | 13 & Aguilera, Christina & I Turn To You & 4:00 \\
18 | 14 & Aguilera, Christina & What A Girl Wants & 3:18 \\
19 | 15 & Alice Deejay & Better Off Alone & 6:50 \\
20 | \bottomrule
21 | \end{tabular}

--------------------------------------------------------------------------------
/data/billboard.csv:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/hadley/tidy-data/5623c7a302977e8c88f623a08556d5c08e940d8c/data/billboard.csv

--------------------------------------------------------------------------------
/data/billboard.r:
--------------------------------------------------------------------------------
 1 | options(stringsAsFactors = FALSE)
 2 | library(lubridate)
 3 | library(reshape2)
 4 | library(stringr)
 5 | library(plyr)
 6 | source("xtable.r")
 7 | 
 8 | raw <- read.csv("billboard.csv")
 9 | raw <- raw[, c("year", "artist.inverted", "track", "time", "date.entered", grep("week$", names(raw), value = TRUE))]  # id columns plus the x1st.week ... x76th.week rank columns (same set as the original explicit list)
10 | names(raw)[2] <- "artist"
11 | 
12 | raw$artist <- iconv(raw$artist, "MAC", "ASCII//translit")
13 | raw$track <- str_replace(raw$track, " \\(.*?\\)", "")
14 | names(raw)[-(1:5)] <- str_c("wk", 1:76)
15 | raw <- arrange(raw, year, artist, track)
16 | 
17 | long_name <- nchar(raw$track) > 20
18 | raw$track[long_name] <- paste0(substr(raw$track[long_name], 0, 20), "...")
19 | 
20 | 
21 | xtable(raw[c(1:3, 6:10), 1:8], "billboard-raw.tex")
22 | 
23 | clean <- melt(raw, id = 1:5, na.rm = TRUE)
24 | clean$week <- as.integer(str_replace_all(clean$variable, "[^0-9]+", ""))
25 | clean$variable <- NULL
26 | 
27 | clean$date.entered <- ymd(clean$date.entered)
28 | clean$date <- clean$date.entered + weeks(clean$week - 1)
29 | clean$date.entered <- NULL
30 | clean <- rename(clean, c("value" = "rank"))
31 | clean <- arrange(clean, year, artist, track, time, week)
32 | clean <- clean[c("year", "artist", "time", "track", "date", "week", "rank")]
33 | 
34 | clean_out <- mutate(clean,
35 |   date = as.character(date))
36 | xtable(clean_out[1:15, ], "billboard-clean.tex")
37 | 
38 | # 
Normalisation -------------------------------------------------------------- 39 | 40 | song <- unrowname(unique(clean[c("artist", "track", "time")])) 41 | song$id <- 1:nrow(song) 42 | 43 | narrow <- song[1:15, c("id","artist", "track", "time")] 44 | xtable(narrow, "billboard-song.tex") 45 | 46 | rank <- join(clean, song, match = "first") 47 | rank <- rank[c("id", "date", "rank")] 48 | rank$date <- as.character(rank$date) 49 | xtable(rank[1:15, ], "billboard-rank.tex") 50 | -------------------------------------------------------------------------------- /data/melt-output.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{llr} 2 | \toprule 3 | row & column & value \\ 4 | \midrule 5 | A & a & 1 \\ 6 | B & a & 2 \\ 7 | C & a & 3 \\ 8 | A & b & 4 \\ 9 | B & b & 5 \\ 10 | C & b & 6 \\ 11 | A & c & 7 \\ 12 | B & c & 8 \\ 13 | C & c & 9 \\ 14 | \bottomrule 15 | \end{tabular} 16 | -------------------------------------------------------------------------------- /data/melt-raw.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{lrrr} 2 | \toprule 3 | row & a & b & c \\ 4 | \midrule 5 | A & 1 & 4 & 7 \\ 6 | B & 2 & 5 & 8 \\ 7 | C & 3 & 6 & 9 \\ 8 | \bottomrule 9 | \end{tabular} 10 | -------------------------------------------------------------------------------- /data/melt.r: -------------------------------------------------------------------------------- 1 | library(reshape2) 2 | source("xtable.r") 3 | 4 | df <- data.frame(row = LETTERS[1:3], a = 1:3, b = 4:6, c = 7:9) 5 | xtable(df, "melt-raw.tex") 6 | 7 | dfm <- melt(df, id = "row") 8 | names(dfm)[2] <- "column" 9 | xtable(dfm, "melt-output.tex") 10 | -------------------------------------------------------------------------------- /data/pew-clean.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{llr} 2 | \toprule 3 | religion & income & freq \\ 4 | \midrule 5 | Agnostic & $<$\$10k & 27 \\ 6 | Agnostic & \$10-20k & 34 \\ 7 | Agnostic & \$20-30k & 60 \\ 8 | Agnostic & \$30-40k & 81 \\ 9 | Agnostic & \$40-50k & 76 \\ 10 | Agnostic & \$50-75k & 137 \\ 11 | Agnostic & \$75-100k & 122 \\ 12 | Agnostic & \$100-150k & 109 \\ 13 | Agnostic & $>$150k & 84 \\ 14 | Agnostic & Don't know/refused & 96 \\ 15 | \bottomrule 16 | \end{tabular} 17 | -------------------------------------------------------------------------------- /data/pew-raw.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{lrrrrrr} 2 | \toprule 3 | religion & $<$\$10k & \$10-20k & \$20-30k & \$30-40k & \$40-50k & \$50-75k \\ 4 | \midrule 5 | Agnostic & 27 & 34 & 60 & 81 & 76 & 137 \\ 6 | Atheist & 12 & 27 & 37 & 52 & 35 & 70 \\ 7 | Buddhist & 27 & 21 & 30 & 34 & 33 & 58 \\ 8 | Catholic & 418 & 617 & 732 & 670 & 638 & 1116 \\ 9 | Don’t know/refused & 15 & 14 & 15 & 11 & 10 & 35 \\ 10 | Evangelical Prot & 575 & 869 & 1064 & 982 & 881 & 1486 \\ 11 | Hindu & 1 & 9 & 7 & 9 & 11 & 34 \\ 12 | Historically Black Prot & 228 & 244 & 236 & 238 & 197 & 223 \\ 13 | Jehovah's Witness & 20 & 27 & 24 & 24 & 21 & 30 \\ 14 | Jewish & 19 & 19 & 25 & 25 & 30 & 95 \\ 15 | \bottomrule 16 | \end{tabular} 17 | -------------------------------------------------------------------------------- /data/pew.r: -------------------------------------------------------------------------------- 1 | library(foreign) 2 | library(stringr) 3 | library(plyr) 4 | library(reshape2) 5 | source("xtable.r") 6 | 7 | # Data from 
http://pewforum.org/Datasets/Dataset-Download.aspx 8 | 9 | # Load data ----------------------------------------------------------------- 10 | 11 | pew <- read.spss("pew.sav") 12 | pew <- as.data.frame(pew) 13 | 14 | 15 | religion <- pew[c("q16", "reltrad", "income")] 16 | religion$reltrad <- as.character(religion$reltrad) 17 | religion$reltrad <- str_replace(religion$reltrad, " Churches", "") 18 | religion$reltrad <- str_replace(religion$reltrad, " Protestant", " Prot") 19 | religion$reltrad[religion$q16 == " Atheist (do not believe in God) "] <- "Atheist" 20 | religion$reltrad[religion$q16 == " Agnostic (not sure if there is a God) "] <- "Agnostic" 21 | religion$reltrad <- str_trim(religion$reltrad) 22 | religion$reltrad <- str_replace_all(religion$reltrad, " \\(.*?\\)", "") 23 | 24 | religion$income <- c("Less than $10,000" = "<$10k", 25 | "10 to under $20,000" = "$10-20k", 26 | "20 to under $30,000" = "$20-30k", 27 | "30 to under $40,000" = "$30-40k", 28 | "40 to under $50,000" = "$40-50k", 29 | "50 to under $75,000" = "$50-75k", 30 | "75 to under $100,000" = "$75-100k", 31 | "100 to under $150,000" = "$100-150k", 32 | "$150,000 or more" = ">150k", 33 | "Don't know/Refused (VOL)" = "Don't know/refused")[religion$income] 34 | 35 | religion$income <- factor(religion$income, levels = c("<$10k", "$10-20k", "$20-30k", "$30-40k", "$40-50k", "$50-75k", 36 | "$75-100k", "$100-150k", ">150k", "Don't know/refused")) 37 | 38 | counts <- count(religion, c("reltrad", "income")) 39 | names(counts)[1] <- "religion" 40 | 41 | xtable(counts[1:10, ], file = "pew-clean.tex") 42 | 43 | # Convert into the form in which I originally saw it ------------------------- 44 | 45 | raw <- dcast(counts, religion ~ income) 46 | xtable(raw[1:10, 1:7], file = "pew-raw.tex") -------------------------------------------------------------------------------- /data/pew.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadley/tidy-data/5623c7a302977e8c88f623a08556d5c08e940d8c/data/pew.sav -------------------------------------------------------------------------------- /data/preg-raw-1.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{lrr} 2 | \toprule 3 | & treatmenta & treatmentb \\ 4 | \midrule 5 | John Smith & --- & 2 \\ 6 | Jane Doe & 16 & 11 \\ 7 | Mary Johnson & 3 & 1 \\ 8 | \bottomrule 9 | \end{tabular} 10 | -------------------------------------------------------------------------------- /data/preg-raw-2.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{lrrr} 2 | \toprule 3 | & John Smith & Jane Doe & Mary Johnson \\ 4 | \midrule 5 | treatmenta & --- & 16 & 3 \\ 6 | treatmentb & 2 & 11 & 1 \\ 7 | \bottomrule 8 | \end{tabular} 9 | -------------------------------------------------------------------------------- /data/preg-tidy.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{llr} 2 | \toprule 3 | name & trt & result \\ 4 | \midrule 5 | John Smith & a & --- \\ 6 | Jane Doe & a & 16 \\ 7 | Mary Johnson & a & 3 \\ 8 | John Smith & b & 2 \\ 9 | Jane Doe & b & 11 \\ 10 | Mary Johnson & b & 1 \\ 11 | \bottomrule 12 | \end{tabular} 13 | -------------------------------------------------------------------------------- /data/preg.R: -------------------------------------------------------------------------------- 1 | source("xtable.r") 2 | library(reshape2) 3 | set.seed(1014) 4 | 5 | preg <- 
matrix(c(NA, sample(20, 5)), ncol = 2, byrow = TRUE)
 6 | colnames(preg) <- paste0("treatment", c("a", "b"))
 7 | rownames(preg) <- c("John Smith", "Jane Doe", "Mary Johnson")
 8 | 
 9 | xtable(preg, "preg-raw-1.tex", rownames = TRUE, align = "lrr")
10 | xtable(t(preg), "preg-raw-2.tex", rownames = TRUE, align = "lrrr")
11 | 
12 | # Make tidy version
13 | 
14 | pregm <- melt(preg, id = "name")  # preg is a matrix, so melt() works from its dimnames
15 | names(pregm) <- c("name", "trt", "result")
16 | pregm$trt <- gsub("treatment", "", pregm$trt)
17 | 
18 | xtable(pregm, "preg-tidy.tex")

--------------------------------------------------------------------------------
/data/read-fwf.r:
--------------------------------------------------------------------------------
 1 | read.fwf2 <- function(path, cols) {
 2 |   raw_stations <- readLines(path)
 3 |   stations <- data.frame(matrix(ncol = 0, nrow = length(raw_stations)))
 4 | 
 5 |   for(i in 1:nrow(cols)) {
 6 |     field <- cols[i, ]
 7 |     stations[[field$name]] <- str_trim(str_sub(raw_stations, field$start, field$end))  # stringr functions; library(stringr) is loaded by weather.r before this file is sourced
 8 |   }
 9 |   stations[stations == ""] <- NA
10 |   stations[] <- lapply(stations, type.convert, as.is = TRUE)
11 | 
12 |   stations
13 | }

--------------------------------------------------------------------------------
/data/tb-clean-1.tex:
--------------------------------------------------------------------------------
 1 | % latex table generated in R 2.14.1 by xtable 1.6-0 package
 2 | % Wed Mar  7 07:12:31 2012
 3 | \begin{tabular}{lrlr}
 4 | \toprule
 5 | country & year & column & cases \\
 6 | \midrule
 7 | AD & 2000 & m014 & 0 \\
 8 | AD & 2000 & m1524 & 0 \\
 9 | AD & 2000 & m2534 & 1 \\
10 | AD & 2000 & m3544 & 0 \\
11 | AD & 2000 & m4554 & 0 \\
12 | AD & 2000 & m5564 & 0 \\
13 | AD & 2000 & m65 & 0 \\
14 | AE & 2000 & m014 & 2 \\
15 | AE & 2000 & m1524 & 4 \\
16 | AE & 2000 & m2534 & 4 \\
17 | AE & 2000 & m3544 & 6 \\
18 | AE & 2000 & m4554 & 5 \\
19 | AE & 2000 & m5564 & 12 \\
20 | AE & 2000 & m65 & 10 \\
21 | AE & 2000 & f014 & 3 \\
22 | \bottomrule
23 | \end{tabular}

--------------------------------------------------------------------------------
/data/tb-clean-2.tex:
--------------------------------------------------------------------------------
 1 | % latex table generated in R 2.14.1 by xtable 1.6-0 package
 2 | % Wed Mar  7 07:12:31 2012
 3 | \begin{tabular}{lrllr}
 4 | \toprule
 5 | country & year & sex & age & cases \\
 6 | \midrule
 7 | AD & 2000 & m & 0-14 & 0 \\
 8 | AD & 2000 & m & 15-24 & 0 \\
 9 | AD & 2000 & m & 25-34 & 1 \\
10 | AD & 2000 & m & 35-44 & 0 \\
11 | AD & 2000 & m & 45-54 & 0 \\
12 | AD & 2000 & m & 55-64 & 0 \\
13 | AD & 2000 & m & 65+ & 0 \\
14 | AE & 2000 & m & 0-14 & 2 \\
15 | AE & 2000 & m & 15-24 & 4 \\
16 | AE & 2000 & m & 25-34 & 4 \\
17 | AE & 2000 & m & 35-44 & 6 \\
18 | AE & 2000 & m & 45-54 & 5 \\
19 | AE & 2000 & m & 55-64 & 12 \\
20 | AE & 2000 & m & 65+ & 10 \\
21 | AE & 2000 & f & 0-14 & 3 \\
22 | \bottomrule
23 | \end{tabular}

--------------------------------------------------------------------------------
/data/tb-raw.tex:
--------------------------------------------------------------------------------
 1 | % latex table generated in R 2.14.1 by xtable 1.6-0 package
 2 | % Wed Mar  7 07:12:31 2012
 3 | \begin{tabular}{lrrrrrrrrrr}
 4 | \toprule
 5 | country & year & m014 & m1524 & m2534 & m3544 & m4554 & m5564 & m65 & mu & f014 \\
 6 | \midrule
 7 | AD & 2000 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & --- & --- \\
 8 | AE & 2000 & 2 & 4 & 4 & 6 & 5 & 12 & 10 & --- & 3 \\
 9 | AF & 2000 & 52 & 228 & 183 & 149 & 129 & 94 & 80 & --- & 93 \\
10 | AG & 2000 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 
--- & 1 \\ 11 | AL & 2000 & 2 & 19 & 21 & 14 & 24 & 19 & 16 & --- & 3 \\ 12 | AM & 2000 & 2 & 152 & 130 & 131 & 63 & 26 & 21 & --- & 1 \\ 13 | AN & 2000 & 0 & 0 & 1 & 2 & 0 & 0 & 0 & --- & 0 \\ 14 | AO & 2000 & 186 & 999 & 1003 & 912 & 482 & 312 & 194 & --- & 247 \\ 15 | AR & 2000 & 97 & 278 & 594 & 402 & 419 & 368 & 330 & --- & 121 \\ 16 | AS & 2000 & --- & --- & --- & --- & 1 & 1 & --- & --- & --- \\ 17 | \bottomrule 18 | \end{tabular} 19 | -------------------------------------------------------------------------------- /data/tb.r: -------------------------------------------------------------------------------- 1 | library(reshape2) 2 | library(plyr) 3 | library(stringr) 4 | source("xtable.r") 5 | options(stringsAsFactors = FALSE) 6 | 7 | # Load ----------------------------------------------------------------------- 8 | raw <- read.csv("tb.csv", na.strings = "") 9 | raw$new_sp <- NULL 10 | raw <- subset(raw, year == 2000) 11 | names(raw)[1] <- "country" 12 | 13 | names(raw) <- str_replace(names(raw), "new_sp_", "") 14 | raw$m04 <- NULL 15 | raw$m514 <- NULL 16 | raw$f04 <- NULL 17 | raw$f514 <- NULL 18 | 19 | xtable(raw[1:10, 1:11], file = "tb-raw.tex") 20 | 21 | # Melt ----------------------------------------------------------------------- 22 | 23 | clean <- melt(raw, id = c("country", "year"), na.rm = TRUE) 24 | names(clean)[3] <- "column" 25 | names(clean)[4] <- "cases" 26 | 27 | clean <- arrange(clean, country, column, year) 28 | xtable(clean[1:15, ], file = "tb-clean-1.tex") 29 | 30 | # Break up variable in to sex and age ---------------------------------------- 31 | 32 | clean$sex <- str_sub(clean$column, 1, 1) 33 | 34 | ages <- c("04" = "0-4", "514" = "5-14", "014" = "0-14", "1524" = "15-24", "2534" = "25-34", "3544" = "35-44", "4554" = "45-54", "5564" = "55-64", "65"= "65+", "u" = NA) 35 | 36 | clean$age <- factor(ages[str_sub(clean$column, 2)], levels = ages) 37 | 38 | clean <- clean[c("country", "year", "sex", "age", "cases")] 39 | 40 | xtable(clean[1:15, ], file = "tb-clean-2.tex") 41 | -------------------------------------------------------------------------------- /data/weather-clean-1.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{lllr} 2 | \toprule 3 | id & date & element & value \\ 4 | \midrule 5 | MX17004 & 2010-01-30 & tmax & 27.8 \\ 6 | MX17004 & 2010-01-30 & tmin & 14.5 \\ 7 | MX17004 & 2010-02-02 & tmax & 27.3 \\ 8 | MX17004 & 2010-02-02 & tmin & 14.4 \\ 9 | MX17004 & 2010-02-03 & tmax & 24.1 \\ 10 | MX17004 & 2010-02-03 & tmin & 14.4 \\ 11 | MX17004 & 2010-02-11 & tmax & 29.7 \\ 12 | MX17004 & 2010-02-11 & tmin & 13.4 \\ 13 | MX17004 & 2010-02-23 & tmax & 29.9 \\ 14 | MX17004 & 2010-02-23 & tmin & 10.7 \\ 15 | \bottomrule 16 | \end{tabular} 17 | -------------------------------------------------------------------------------- /data/weather-clean-2.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{llrr} 2 | \toprule 3 | id & date & tmax & tmin \\ 4 | \midrule 5 | MX17004 & 2010-01-30 & 27.8 & 14.5 \\ 6 | MX17004 & 2010-02-02 & 27.3 & 14.4 \\ 7 | MX17004 & 2010-02-03 & 24.1 & 14.4 \\ 8 | MX17004 & 2010-02-11 & 29.7 & 13.4 \\ 9 | MX17004 & 2010-02-23 & 29.9 & 10.7 \\ 10 | MX17004 & 2010-03-05 & 32.1 & 14.2 \\ 11 | MX17004 & 2010-03-10 & 34.5 & 16.8 \\ 12 | MX17004 & 2010-03-16 & 31.1 & 17.6 \\ 13 | MX17004 & 2010-04-27 & 36.3 & 16.7 \\ 14 | MX17004 & 2010-05-27 & 33.2 & 18.2 \\ 15 | \bottomrule 16 | \end{tabular} 17 | 
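For reference, `weather-clean-2` above is just the long-form `weather-clean-1` data cast back out so that `tmax` and `tmin` become columns; here is a minimal self-contained sketch of that reshape (toy rows copied from the tables above):

```r
library(reshape2)

# Long form, as in weather-clean-1: one row per id/date/element.
clean1 <- data.frame(
  id = "MX17004",
  date = c("2010-01-30", "2010-01-30", "2010-02-02", "2010-02-02"),
  element = c("tmax", "tmin", "tmax", "tmin"),
  value = c(27.8, 14.5, 27.3, 14.4))

# Wide form, as in weather-clean-2: one row per id/date, one column per element.
clean2 <- dcast(clean1, id + date ~ element, value.var = "value")
```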
-------------------------------------------------------------------------------- /data/weather-raw.tex: -------------------------------------------------------------------------------- 1 | \begin{tabular}{lrrlrrrrrrrr} 2 | \toprule 3 | id & year & month & element & d1 & d2 & d3 & d4 & d5 & d6 & d7 & d8 \\ 4 | \midrule 5 | MX17004 & 2010 & 1 & tmax & --- & --- & --- & --- & --- & --- & --- & --- \\ 6 | MX17004 & 2010 & 1 & tmin & --- & --- & --- & --- & --- & --- & --- & --- \\ 7 | MX17004 & 2010 & 2 & tmax & --- & 27.3 & 24.1 & --- & --- & --- & --- & --- \\ 8 | MX17004 & 2010 & 2 & tmin & --- & 14.4 & 14.4 & --- & --- & --- & --- & --- \\ 9 | MX17004 & 2010 & 3 & tmax & --- & --- & --- & --- & 32.1 & --- & --- & --- \\ 10 | MX17004 & 2010 & 3 & tmin & --- & --- & --- & --- & 14.2 & --- & --- & --- \\ 11 | MX17004 & 2010 & 4 & tmax & --- & --- & --- & --- & --- & --- & --- & --- \\ 12 | MX17004 & 2010 & 4 & tmin & --- & --- & --- & --- & --- & --- & --- & --- \\ 13 | MX17004 & 2010 & 5 & tmax & --- & --- & --- & --- & --- & --- & --- & --- \\ 14 | MX17004 & 2010 & 5 & tmin & --- & --- & --- & --- & --- & --- & --- & --- \\ 15 | \bottomrule 16 | \end{tabular} 17 | -------------------------------------------------------------------------------- /data/weather.r: -------------------------------------------------------------------------------- 1 | library(stringr) 2 | library(reshape2) 3 | library(plyr) 4 | source("xtable.r") 5 | source("read-fwf.r") 6 | options(stringsAsFactors = FALSE) 7 | 8 | # Define format for fixed width file 9 | cols <- data.frame( 10 | name = c("id", "year", "month", "element"), 11 | start = c(1, 12, 16, 18), 12 | end = c(11, 15, 17, 21)) 13 | 14 | names <- str_c(c("value", "mflag", "qflag", "sflag"), rep(1:31, each = 4), sep = "_") 15 | starts <- cumsum(c(22, rep(c(5, 1, 1, 1), 31))) 16 | starts <- starts[-length(starts)] 17 | ends <- c(starts[-1], starts[length(starts)] + 1) - 1 18 | 19 | values <- data.frame(name = names, start = starts, end = ends) 20 | cols <- rbind(cols, values) 21 | 22 | # Load data and subset to small example 23 | raw <- read.fwf2("weather.txt", cols) 24 | raw <- subset(raw, year == 2010 & element %in% c("TMIN", "TMAX")) 25 | raw <- raw[, c(1:4, which(str_detect(names(raw), "value")))] 26 | raw$id <- str_c(str_sub(raw$id, 1, 2), str_sub(raw$id, -5, -1)) 27 | 28 | names(raw)[-(1:4)] <- str_c("d", 1:31) 29 | raw[raw == -9999] <- NA 30 | raw[-(1:4)] <- raw[-(1:4)] / 10 31 | rownames(raw) <- NULL 32 | raw$element <- tolower(raw$element) 33 | 34 | xtable(raw[1:10, 1:12], file = "weather-raw.tex", digits = 1) 35 | 36 | # Melt and tidy 37 | 38 | clean1 <- melt(raw, id = 1:4, na.rm = T) 39 | clean1$day <- as.integer(str_replace(clean1$variable, "d", "")) 40 | clean1$date <- as.Date(ISOdate(clean1$year, clean1$month, clean1$day)) 41 | 42 | clean1 <- clean1[c("id", "date", "element", "value")] 43 | clean1 <- arrange(clean1, date, element) 44 | clean1$date <- as.character(clean1$date) # work around xtable bug 45 | xtable(clean1[1:10, ], file = "weather-clean-1.tex", digits = 1) 46 | 47 | # Cast 48 | 49 | clean2 <- dcast(clean1, ... 
~ element) 50 | xtable(clean2[1:10, ], file = "weather-clean-2.tex", digits = 1) 51 | -------------------------------------------------------------------------------- /data/xtable.r: -------------------------------------------------------------------------------- 1 | # Change defaults for xtable to be more attractive 2 | # Inspired by: http://cameron.bracken.bz/sweave-xtable-booktabs 3 | library(xtable) 4 | 5 | xtable <- function(x, file = "", ..., rownames = FALSE){ 6 | table <- xtable::xtable(x, ...) 7 | print(table, floating = F, hline.after = NULL, 8 | add.to.row = list(pos = list(-1,0, nrow(x)), 9 | command = c('\\toprule\n ','\\midrule\n ','\\bottomrule\n')), 10 | include.rownames = rownames, NA.string = "---", 11 | file = file, 12 | comment = FALSE, timestamp = FALSE 13 | ) 14 | } -------------------------------------------------------------------------------- /jss.bst: -------------------------------------------------------------------------------- 1 | %% 2 | %% This is file `jss.bst', 3 | %% generated with the docstrip utility. 4 | %% 5 | %% The original source files were: 6 | %% 7 | %% merlin.mbs (with options: `ay,nat,nm-rvx,keyxyr,dt-beg,yr-par,note-yr,tit-qq,atit-u,trnum-it,vol-bf,volp-com,num-xser,pre-edn,isbn,issn,edpar,pp,ed,xedn,xand,etal-it,revdata,eprint,url,url-blk,doi,nfss') 8 | %% 9 | %% ** BibTeX style file for JSS publications (http://www.jstatsoft.org/) 10 | %% 11 | %% Copyright 1994-2007 Patrick W Daly 12 | %% License: GPL-2 13 | % =============================================================== 14 | % IMPORTANT NOTICE: 15 | % This bibliographic style (bst) file has been generated from one or 16 | % more master bibliographic style (mbs) files, listed above, provided 17 | % with kind permission of Patrick W Daly. 18 | % 19 | % This generated file can be redistributed and/or modified under the terms 20 | % of the General Public License (Version 2). 21 | % =============================================================== 22 | % Name and version information of the main mbs file: 23 | % \ProvidesFile{merlin.mbs}[2007/04/24 4.20 (PWD, AO, DPC)] 24 | % For use with BibTeX version 0.99a or later 25 | %------------------------------------------------------------------- 26 | % This bibliography style file is intended for texts in ENGLISH 27 | % This is an author-year citation style bibliography. As such, it is 28 | % non-standard LaTeX, and requires a special package file to function properly. 29 | % Such a package is natbib.sty by Patrick W. Daly 30 | % The form of the \bibitem entries is 31 | % \bibitem[Jones et al.(1990)]{key}... 32 | % \bibitem[Jones et al.(1990)Jones, Baker, and Smith]{key}... 33 | % The essential feature is that the label (the part in brackets) consists 34 | % of the author names, as they should appear in the citation, with the year 35 | % in parentheses following. There must be no space before the opening 36 | % parenthesis! 37 | % With natbib v5.3, a full list of authors may also follow the year. 38 | % In natbib.sty, it is possible to define the type of enclosures that is 39 | % really wanted (brackets or parentheses), but in either case, there must 40 | % be parentheses in the label. 41 | % The \cite command functions as follows: 42 | % \citet{key} ==>> Jones et al. (1990) 43 | % \citet*{key} ==>> Jones, Baker, and Smith (1990) 44 | % \citep{key} ==>> (Jones et al., 1990) 45 | % \citep*{key} ==>> (Jones, Baker, and Smith, 1990) 46 | % \citep[chap. 2]{key} ==>> (Jones et al., 1990, chap. 2) 47 | % \citep[e.g.][]{key} ==>> (e.g. 
Jones et al., 1990) 48 | % \citep[e.g.][p. 32]{key} ==>> (e.g. Jones et al., p. 32) 49 | % \citeauthor{key} ==>> Jones et al. 50 | % \citeauthor*{key} ==>> Jones, Baker, and Smith 51 | % \citeyear{key} ==>> 1990 52 | %--------------------------------------------------------------------- 53 | 54 | ENTRY 55 | { address 56 | archive 57 | author 58 | booktitle 59 | chapter 60 | collaboration 61 | doi 62 | edition 63 | editor 64 | eid 65 | eprint 66 | howpublished 67 | institution 68 | isbn 69 | issn 70 | journal 71 | key 72 | month 73 | note 74 | number 75 | numpages 76 | organization 77 | pages 78 | publisher 79 | school 80 | series 81 | title 82 | type 83 | url 84 | volume 85 | year 86 | } 87 | {} 88 | { label extra.label sort.label short.list } 89 | INTEGERS { output.state before.all mid.sentence after.sentence after.block } 90 | FUNCTION {init.state.consts} 91 | { #0 'before.all := 92 | #1 'mid.sentence := 93 | #2 'after.sentence := 94 | #3 'after.block := 95 | } 96 | STRINGS { s t} 97 | FUNCTION {output.nonnull} 98 | { 's := 99 | output.state mid.sentence = 100 | { ", " * write$ } 101 | { output.state after.block = 102 | { add.period$ write$ 103 | newline$ 104 | "\newblock " write$ 105 | } 106 | { output.state before.all = 107 | 'write$ 108 | { add.period$ " " * write$ } 109 | if$ 110 | } 111 | if$ 112 | mid.sentence 'output.state := 113 | } 114 | if$ 115 | s 116 | } 117 | FUNCTION {output} 118 | { duplicate$ empty$ 119 | 'pop$ 120 | 'output.nonnull 121 | if$ 122 | } 123 | FUNCTION {output.check} 124 | { 't := 125 | duplicate$ empty$ 126 | { pop$ "empty " t * " in " * cite$ * warning$ } 127 | 'output.nonnull 128 | if$ 129 | } 130 | FUNCTION {fin.entry} 131 | { add.period$ 132 | write$ 133 | newline$ 134 | } 135 | 136 | FUNCTION {new.block} 137 | { output.state before.all = 138 | 'skip$ 139 | { after.block 'output.state := } 140 | if$ 141 | } 142 | FUNCTION {new.sentence} 143 | { output.state after.block = 144 | 'skip$ 145 | { output.state before.all = 146 | 'skip$ 147 | { after.sentence 'output.state := } 148 | if$ 149 | } 150 | if$ 151 | } 152 | FUNCTION {add.blank} 153 | { " " * before.all 'output.state := 154 | } 155 | 156 | FUNCTION {date.block} 157 | { 158 | new.block 159 | } 160 | 161 | FUNCTION {not} 162 | { { #0 } 163 | { #1 } 164 | if$ 165 | } 166 | FUNCTION {and} 167 | { 'skip$ 168 | { pop$ #0 } 169 | if$ 170 | } 171 | FUNCTION {or} 172 | { { pop$ #1 } 173 | 'skip$ 174 | if$ 175 | } 176 | FUNCTION {non.stop} 177 | { duplicate$ 178 | "}" * add.period$ 179 | #-1 #1 substring$ "." = 180 | } 181 | 182 | STRINGS {z} 183 | FUNCTION {remove.dots} 184 | { 'z := 185 | "" 186 | { z empty$ not } 187 | { z #1 #1 substring$ 188 | z #2 global.max$ substring$ 'z := 189 | duplicate$ "." 
= 'pop$ 190 | { * } 191 | if$ 192 | } 193 | while$ 194 | } 195 | FUNCTION {new.block.checkb} 196 | { empty$ 197 | swap$ empty$ 198 | and 199 | 'skip$ 200 | 'new.block 201 | if$ 202 | } 203 | FUNCTION {field.or.null} 204 | { duplicate$ empty$ 205 | { pop$ "" } 206 | 'skip$ 207 | if$ 208 | } 209 | FUNCTION {emphasize} 210 | { duplicate$ empty$ 211 | { pop$ "" } 212 | { "\emph{" swap$ * "}" * } 213 | if$ 214 | } 215 | FUNCTION {bolden} 216 | { duplicate$ empty$ 217 | { pop$ "" } 218 | { "\textbf{" swap$ * "}" * } 219 | if$ 220 | } 221 | FUNCTION {tie.or.space.prefix} 222 | { duplicate$ text.length$ #3 < 223 | { "~" } 224 | { " " } 225 | if$ 226 | swap$ 227 | } 228 | 229 | FUNCTION {capitalize} 230 | { "u" change.case$ "t" change.case$ } 231 | 232 | FUNCTION {space.word} 233 | { " " swap$ * " " * } 234 | % Here are the language-specific definitions for explicit words. 235 | % Each function has a name bbl.xxx where xxx is the English word. 236 | % The language selected here is ENGLISH 237 | FUNCTION {bbl.and} 238 | { "and"} 239 | 240 | FUNCTION {bbl.etal} 241 | { "et~al." } 242 | 243 | FUNCTION {bbl.editors} 244 | { "eds." } 245 | 246 | FUNCTION {bbl.editor} 247 | { "ed." } 248 | 249 | FUNCTION {bbl.edby} 250 | { "edited by" } 251 | 252 | FUNCTION {bbl.edition} 253 | { "edition" } 254 | 255 | FUNCTION {bbl.volume} 256 | { "volume" } 257 | 258 | FUNCTION {bbl.of} 259 | { "of" } 260 | 261 | FUNCTION {bbl.number} 262 | { "number" } 263 | 264 | FUNCTION {bbl.nr} 265 | { "no." } 266 | 267 | FUNCTION {bbl.in} 268 | { "in" } 269 | 270 | FUNCTION {bbl.pages} 271 | { "pp." } 272 | 273 | FUNCTION {bbl.page} 274 | { "p." } 275 | 276 | FUNCTION {bbl.eidpp} 277 | { "pages" } 278 | 279 | FUNCTION {bbl.chapter} 280 | { "chapter" } 281 | 282 | FUNCTION {bbl.techrep} 283 | { "Technical Report" } 284 | 285 | FUNCTION {bbl.mthesis} 286 | { "Master's thesis" } 287 | 288 | FUNCTION {bbl.phdthesis} 289 | { "Ph.D. 
thesis" } 290 | 291 | MACRO {jan} {"January"} 292 | 293 | MACRO {feb} {"February"} 294 | 295 | MACRO {mar} {"March"} 296 | 297 | MACRO {apr} {"April"} 298 | 299 | MACRO {may} {"May"} 300 | 301 | MACRO {jun} {"June"} 302 | 303 | MACRO {jul} {"July"} 304 | 305 | MACRO {aug} {"August"} 306 | 307 | MACRO {sep} {"September"} 308 | 309 | MACRO {oct} {"October"} 310 | 311 | MACRO {nov} {"November"} 312 | 313 | MACRO {dec} {"December"} 314 | 315 | MACRO {acmcs} {"ACM Computing Surveys"} 316 | 317 | MACRO {acta} {"Acta Informatica"} 318 | 319 | MACRO {cacm} {"Communications of the ACM"} 320 | 321 | MACRO {ibmjrd} {"IBM Journal of Research and Development"} 322 | 323 | MACRO {ibmsj} {"IBM Systems Journal"} 324 | 325 | MACRO {ieeese} {"IEEE Transactions on Software Engineering"} 326 | 327 | MACRO {ieeetc} {"IEEE Transactions on Computers"} 328 | 329 | MACRO {ieeetcad} 330 | {"IEEE Transactions on Computer-Aided Design of Integrated Circuits"} 331 | 332 | MACRO {ipl} {"Information Processing Letters"} 333 | 334 | MACRO {jacm} {"Journal of the ACM"} 335 | 336 | MACRO {jcss} {"Journal of Computer and System Sciences"} 337 | 338 | MACRO {scp} {"Science of Computer Programming"} 339 | 340 | MACRO {sicomp} {"SIAM Journal on Computing"} 341 | 342 | MACRO {tocs} {"ACM Transactions on Computer Systems"} 343 | 344 | MACRO {tods} {"ACM Transactions on Database Systems"} 345 | 346 | MACRO {tog} {"ACM Transactions on Graphics"} 347 | 348 | MACRO {toms} {"ACM Transactions on Mathematical Software"} 349 | 350 | MACRO {toois} {"ACM Transactions on Office Information Systems"} 351 | 352 | MACRO {toplas} {"ACM Transactions on Programming Languages and Systems"} 353 | 354 | MACRO {tcs} {"Theoretical Computer Science"} 355 | FUNCTION {bibinfo.check} 356 | { swap$ 357 | duplicate$ missing$ 358 | { 359 | pop$ pop$ 360 | "" 361 | } 362 | { duplicate$ empty$ 363 | { 364 | swap$ pop$ 365 | } 366 | { swap$ 367 | pop$ 368 | } 369 | if$ 370 | } 371 | if$ 372 | } 373 | FUNCTION {bibinfo.warn} 374 | { swap$ 375 | duplicate$ missing$ 376 | { 377 | swap$ "missing " swap$ * " in " * cite$ * warning$ pop$ 378 | "" 379 | } 380 | { duplicate$ empty$ 381 | { 382 | swap$ "empty " swap$ * " in " * cite$ * warning$ 383 | } 384 | { swap$ 385 | pop$ 386 | } 387 | if$ 388 | } 389 | if$ 390 | } 391 | FUNCTION {format.eprint} 392 | { eprint duplicate$ empty$ 393 | 'skip$ 394 | { "\eprint" 395 | archive empty$ 396 | 'skip$ 397 | { "[" * archive * "]" * } 398 | if$ 399 | "{" * swap$ * "}" * 400 | } 401 | if$ 402 | } 403 | FUNCTION {format.url} 404 | { url empty$ 405 | { "" } 406 | { "\urlprefix\url{" url * "}" * } 407 | if$ 408 | } 409 | 410 | INTEGERS { nameptr namesleft numnames } 411 | 412 | 413 | STRINGS { bibinfo} 414 | 415 | FUNCTION {format.names} 416 | { 'bibinfo := 417 | duplicate$ empty$ 'skip$ { 418 | 's := 419 | "" 't := 420 | #1 'nameptr := 421 | s num.names$ 'numnames := 422 | numnames 'namesleft := 423 | { namesleft #0 > } 424 | { s nameptr 425 | "{vv~}{ll}{ jj}{ f{}}" 426 | format.name$ 427 | remove.dots 428 | bibinfo bibinfo.check 429 | 't := 430 | nameptr #1 > 431 | { 432 | namesleft #1 > 433 | { ", " * t * } 434 | { 435 | s nameptr "{ll}" format.name$ duplicate$ "others" = 436 | { 't := } 437 | { pop$ } 438 | if$ 439 | "," * 440 | t "others" = 441 | { 442 | " " * bbl.etal emphasize * 443 | } 444 | { " " * t * } 445 | if$ 446 | } 447 | if$ 448 | } 449 | 't 450 | if$ 451 | nameptr #1 + 'nameptr := 452 | namesleft #1 - 'namesleft := 453 | } 454 | while$ 455 | } if$ 456 | } 457 | FUNCTION {format.names.ed} 458 | { 459 | 'bibinfo := 
460 | duplicate$ empty$ 'skip$ { 461 | 's := 462 | "" 't := 463 | #1 'nameptr := 464 | s num.names$ 'numnames := 465 | numnames 'namesleft := 466 | { namesleft #0 > } 467 | { s nameptr 468 | "{f{}~}{vv~}{ll}{ jj}" 469 | format.name$ 470 | remove.dots 471 | bibinfo bibinfo.check 472 | 't := 473 | nameptr #1 > 474 | { 475 | namesleft #1 > 476 | { ", " * t * } 477 | { 478 | s nameptr "{ll}" format.name$ duplicate$ "others" = 479 | { 't := } 480 | { pop$ } 481 | if$ 482 | "," * 483 | t "others" = 484 | { 485 | 486 | " " * bbl.etal emphasize * 487 | } 488 | { " " * t * } 489 | if$ 490 | } 491 | if$ 492 | } 493 | 't 494 | if$ 495 | nameptr #1 + 'nameptr := 496 | namesleft #1 - 'namesleft := 497 | } 498 | while$ 499 | } if$ 500 | } 501 | FUNCTION {format.key} 502 | { empty$ 503 | { key field.or.null } 504 | { "" } 505 | if$ 506 | } 507 | 508 | FUNCTION {format.authors} 509 | { author "author" format.names 510 | duplicate$ empty$ 'skip$ 511 | { collaboration "collaboration" bibinfo.check 512 | duplicate$ empty$ 'skip$ 513 | { " (" swap$ * ")" * } 514 | if$ 515 | * 516 | } 517 | if$ 518 | } 519 | FUNCTION {get.bbl.editor} 520 | { editor num.names$ #1 > 'bbl.editors 'bbl.editor if$ } 521 | 522 | FUNCTION {format.editors} 523 | { editor "editor" format.names duplicate$ empty$ 'skip$ 524 | { 525 | " " * 526 | get.bbl.editor 527 | "(" swap$ * ")" * 528 | * 529 | } 530 | if$ 531 | } 532 | FUNCTION {format.isbn} 533 | { isbn "isbn" bibinfo.check 534 | duplicate$ empty$ 'skip$ 535 | { 536 | new.block 537 | "ISBN " swap$ * 538 | } 539 | if$ 540 | } 541 | 542 | FUNCTION {format.issn} 543 | { issn "issn" bibinfo.check 544 | duplicate$ empty$ 'skip$ 545 | { 546 | new.block 547 | "ISSN " swap$ * 548 | } 549 | if$ 550 | } 551 | 552 | FUNCTION {format.doi} 553 | { doi "doi" bibinfo.check 554 | duplicate$ empty$ 'skip$ 555 | { 556 | new.block 557 | "\doi{" swap$ * "}" * 558 | } 559 | if$ 560 | } 561 | FUNCTION {format.note} 562 | { 563 | note empty$ 564 | { "" } 565 | { note #1 #1 substring$ 566 | duplicate$ "{" = 567 | 'skip$ 568 | { output.state mid.sentence = 569 | { "l" } 570 | { "u" } 571 | if$ 572 | change.case$ 573 | } 574 | if$ 575 | note #2 global.max$ substring$ * "note" bibinfo.check 576 | } 577 | if$ 578 | } 579 | 580 | FUNCTION {format.title} 581 | { title 582 | "title" bibinfo.check 583 | duplicate$ empty$ 'skip$ 584 | { 585 | "\enquote{" swap$ * 586 | add.period$ "}" * 587 | } 588 | if$ 589 | } 590 | FUNCTION {format.full.names} 591 | {'s := 592 | "" 't := 593 | #1 'nameptr := 594 | s num.names$ 'numnames := 595 | numnames 'namesleft := 596 | { namesleft #0 > } 597 | { s nameptr 598 | "{vv~}{ll}" format.name$ 599 | 't := 600 | nameptr #1 > 601 | { 602 | namesleft #1 > 603 | { ", " * t * } 604 | { 605 | s nameptr "{ll}" format.name$ duplicate$ "others" = 606 | { 't := } 607 | { pop$ } 608 | if$ 609 | t "others" = 610 | { 611 | " " * bbl.etal emphasize * 612 | } 613 | { 614 | numnames #2 > 615 | { "," * } 616 | 'skip$ 617 | if$ 618 | bbl.and 619 | space.word * t * 620 | } 621 | if$ 622 | } 623 | if$ 624 | } 625 | 't 626 | if$ 627 | nameptr #1 + 'nameptr := 628 | namesleft #1 - 'namesleft := 629 | } 630 | while$ 631 | } 632 | 633 | FUNCTION {author.editor.key.full} 634 | { author empty$ 635 | { editor empty$ 636 | { key empty$ 637 | { cite$ #1 #3 substring$ } 638 | 'key 639 | if$ 640 | } 641 | { editor format.full.names } 642 | if$ 643 | } 644 | { author format.full.names } 645 | if$ 646 | } 647 | 648 | FUNCTION {author.key.full} 649 | { author empty$ 650 | { key empty$ 651 | { cite$ #1 #3 substring$ 
} 652 | 'key 653 | if$ 654 | } 655 | { author format.full.names } 656 | if$ 657 | } 658 | 659 | FUNCTION {editor.key.full} 660 | { editor empty$ 661 | { key empty$ 662 | { cite$ #1 #3 substring$ } 663 | 'key 664 | if$ 665 | } 666 | { editor format.full.names } 667 | if$ 668 | } 669 | 670 | FUNCTION {make.full.names} 671 | { type$ "book" = 672 | type$ "inbook" = 673 | or 674 | 'author.editor.key.full 675 | { type$ "proceedings" = 676 | 'editor.key.full 677 | 'author.key.full 678 | if$ 679 | } 680 | if$ 681 | } 682 | 683 | FUNCTION {output.bibitem} 684 | { newline$ 685 | "\bibitem[{" write$ 686 | label write$ 687 | ")" make.full.names duplicate$ short.list = 688 | { pop$ } 689 | { * } 690 | if$ 691 | "}]{" * write$ 692 | cite$ write$ 693 | "}" write$ 694 | newline$ 695 | "" 696 | before.all 'output.state := 697 | } 698 | 699 | FUNCTION {n.dashify} 700 | { 701 | 't := 702 | "" 703 | { t empty$ not } 704 | { t #1 #1 substring$ "-" = 705 | { t #1 #2 substring$ "--" = not 706 | { "--" * 707 | t #2 global.max$ substring$ 't := 708 | } 709 | { { t #1 #1 substring$ "-" = } 710 | { "-" * 711 | t #2 global.max$ substring$ 't := 712 | } 713 | while$ 714 | } 715 | if$ 716 | } 717 | { t #1 #1 substring$ * 718 | t #2 global.max$ substring$ 't := 719 | } 720 | if$ 721 | } 722 | while$ 723 | } 724 | 725 | FUNCTION {word.in} 726 | { bbl.in capitalize 727 | " " * } 728 | 729 | FUNCTION {format.date} 730 | { year "year" bibinfo.check duplicate$ empty$ 731 | { 732 | "empty year in " cite$ * "; set to ????" * warning$ 733 | pop$ "????" 734 | } 735 | 'skip$ 736 | if$ 737 | extra.label * 738 | before.all 'output.state := 739 | " (" swap$ * ")" * 740 | } 741 | FUNCTION {format.btitle} 742 | { title "title" bibinfo.check 743 | duplicate$ empty$ 'skip$ 744 | { 745 | emphasize 746 | } 747 | if$ 748 | } 749 | FUNCTION {either.or.check} 750 | { empty$ 751 | 'pop$ 752 | { "can't use both " swap$ * " fields in " * cite$ * warning$ } 753 | if$ 754 | } 755 | FUNCTION {format.bvolume} 756 | { volume empty$ 757 | { "" } 758 | { bbl.volume volume tie.or.space.prefix 759 | "volume" bibinfo.check * * 760 | series "series" bibinfo.check 761 | duplicate$ empty$ 'pop$ 762 | { swap$ bbl.of space.word * swap$ 763 | emphasize * } 764 | if$ 765 | "volume and number" number either.or.check 766 | } 767 | if$ 768 | } 769 | FUNCTION {format.number.series} 770 | { volume empty$ 771 | { number empty$ 772 | { series field.or.null } 773 | { series empty$ 774 | { number "number" bibinfo.check } 775 | { output.state mid.sentence = 776 | { bbl.number } 777 | { bbl.number capitalize } 778 | if$ 779 | number tie.or.space.prefix "number" bibinfo.check * * 780 | bbl.in space.word * 781 | series "series" bibinfo.check * 782 | } 783 | if$ 784 | } 785 | if$ 786 | } 787 | { "" } 788 | if$ 789 | } 790 | 791 | FUNCTION {format.edition} 792 | { edition duplicate$ empty$ 'skip$ 793 | { 794 | output.state mid.sentence = 795 | { "l" } 796 | { "t" } 797 | if$ change.case$ 798 | "edition" bibinfo.check 799 | " " * bbl.edition * 800 | } 801 | if$ 802 | } 803 | INTEGERS { multiresult } 804 | FUNCTION {multi.page.check} 805 | { 't := 806 | #0 'multiresult := 807 | { multiresult not 808 | t empty$ not 809 | and 810 | } 811 | { t #1 #1 substring$ 812 | duplicate$ "-" = 813 | swap$ duplicate$ "," = 814 | swap$ "+" = 815 | or or 816 | { #1 'multiresult := } 817 | { t #2 global.max$ substring$ 't := } 818 | if$ 819 | } 820 | while$ 821 | multiresult 822 | } 823 | FUNCTION {format.pages} 824 | { pages duplicate$ empty$ 'skip$ 825 | { duplicate$ multi.page.check 826 | { 
827 | bbl.pages swap$ 828 | n.dashify 829 | } 830 | { 831 | bbl.page swap$ 832 | } 833 | if$ 834 | tie.or.space.prefix 835 | "pages" bibinfo.check 836 | * * 837 | } 838 | if$ 839 | } 840 | FUNCTION {format.journal.pages} 841 | { pages duplicate$ empty$ 'pop$ 842 | { swap$ duplicate$ empty$ 843 | { pop$ pop$ format.pages } 844 | { 845 | ", " * 846 | swap$ 847 | n.dashify 848 | "pages" bibinfo.check 849 | * 850 | } 851 | if$ 852 | } 853 | if$ 854 | } 855 | FUNCTION {format.journal.eid} 856 | { eid "eid" bibinfo.check 857 | duplicate$ empty$ 'pop$ 858 | { swap$ duplicate$ empty$ 'skip$ 859 | { 860 | ", " * 861 | } 862 | if$ 863 | swap$ * 864 | numpages empty$ 'skip$ 865 | { bbl.eidpp numpages tie.or.space.prefix 866 | "numpages" bibinfo.check * * 867 | " (" swap$ * ")" * * 868 | } 869 | if$ 870 | } 871 | if$ 872 | } 873 | FUNCTION {format.vol.num.pages} 874 | { volume field.or.null 875 | duplicate$ empty$ 'skip$ 876 | { 877 | "volume" bibinfo.check 878 | } 879 | if$ 880 | bolden 881 | number "number" bibinfo.check duplicate$ empty$ 'skip$ 882 | { 883 | swap$ duplicate$ empty$ 884 | { "there's a number but no volume in " cite$ * warning$ } 885 | 'skip$ 886 | if$ 887 | swap$ 888 | "(" swap$ * ")" * 889 | } 890 | if$ * 891 | eid empty$ 892 | { format.journal.pages } 893 | { format.journal.eid } 894 | if$ 895 | } 896 | 897 | FUNCTION {format.chapter.pages} 898 | { chapter empty$ 899 | 'format.pages 900 | { type empty$ 901 | { bbl.chapter } 902 | { type "l" change.case$ 903 | "type" bibinfo.check 904 | } 905 | if$ 906 | chapter tie.or.space.prefix 907 | "chapter" bibinfo.check 908 | * * 909 | pages empty$ 910 | 'skip$ 911 | { ", " * format.pages * } 912 | if$ 913 | } 914 | if$ 915 | } 916 | 917 | FUNCTION {format.booktitle} 918 | { 919 | booktitle "booktitle" bibinfo.check 920 | emphasize 921 | } 922 | FUNCTION {format.in.ed.booktitle} 923 | { format.booktitle duplicate$ empty$ 'skip$ 924 | { 925 | editor "editor" format.names.ed duplicate$ empty$ 'pop$ 926 | { 927 | " " * 928 | get.bbl.editor 929 | "(" swap$ * "), " * 930 | * swap$ 931 | * } 932 | if$ 933 | word.in swap$ * 934 | } 935 | if$ 936 | } 937 | FUNCTION {format.thesis.type} 938 | { type duplicate$ empty$ 939 | 'pop$ 940 | { swap$ pop$ 941 | "t" change.case$ "type" bibinfo.check 942 | } 943 | if$ 944 | } 945 | FUNCTION {format.tr.number} 946 | { number "number" bibinfo.check 947 | type duplicate$ empty$ 948 | { pop$ bbl.techrep } 949 | 'skip$ 950 | if$ 951 | "type" bibinfo.check 952 | swap$ duplicate$ empty$ 953 | { pop$ "t" change.case$ } 954 | { tie.or.space.prefix * * } 955 | if$ 956 | } 957 | FUNCTION {format.article.crossref} 958 | { 959 | word.in 960 | " \cite{" * crossref * "}" * 961 | } 962 | FUNCTION {format.book.crossref} 963 | { volume duplicate$ empty$ 964 | { "empty volume in " cite$ * "'s crossref of " * crossref * warning$ 965 | pop$ word.in 966 | } 967 | { bbl.volume 968 | capitalize 969 | swap$ tie.or.space.prefix "volume" bibinfo.check * * bbl.of space.word * 970 | } 971 | if$ 972 | " \cite{" * crossref * "}" * 973 | } 974 | FUNCTION {format.incoll.inproc.crossref} 975 | { 976 | word.in 977 | " \cite{" * crossref * "}" * 978 | } 979 | FUNCTION {format.org.or.pub} 980 | { 't := 981 | "" 982 | address empty$ t empty$ and 983 | 'skip$ 984 | { 985 | t empty$ 986 | { address "address" bibinfo.check * 987 | } 988 | { t * 989 | address empty$ 990 | 'skip$ 991 | { ", " * address "address" bibinfo.check * } 992 | if$ 993 | } 994 | if$ 995 | } 996 | if$ 997 | } 998 | FUNCTION {format.publisher.address} 999 | { publisher 
"publisher" bibinfo.warn format.org.or.pub 1000 | } 1001 | 1002 | FUNCTION {format.organization.address} 1003 | { organization "organization" bibinfo.check format.org.or.pub 1004 | } 1005 | 1006 | FUNCTION {article} 1007 | { output.bibitem 1008 | format.authors "author" output.check 1009 | author format.key output 1010 | format.date "year" output.check 1011 | date.block 1012 | format.title "title" output.check 1013 | new.block 1014 | crossref missing$ 1015 | { 1016 | journal 1017 | "journal" bibinfo.check 1018 | emphasize 1019 | "journal" output.check 1020 | format.vol.num.pages output 1021 | } 1022 | { format.article.crossref output.nonnull 1023 | format.pages output 1024 | } 1025 | if$ 1026 | format.issn output 1027 | format.doi output 1028 | new.block 1029 | format.note output 1030 | format.eprint output 1031 | format.url output 1032 | fin.entry 1033 | } 1034 | FUNCTION {book} 1035 | { output.bibitem 1036 | author empty$ 1037 | { format.editors "author and editor" output.check 1038 | editor format.key output 1039 | } 1040 | { format.authors output.nonnull 1041 | crossref missing$ 1042 | { "author and editor" editor either.or.check } 1043 | 'skip$ 1044 | if$ 1045 | } 1046 | if$ 1047 | format.date "year" output.check 1048 | date.block 1049 | format.btitle "title" output.check 1050 | crossref missing$ 1051 | { format.bvolume output 1052 | new.block 1053 | format.number.series output 1054 | format.edition output 1055 | new.sentence 1056 | format.publisher.address output 1057 | } 1058 | { 1059 | new.block 1060 | format.book.crossref output.nonnull 1061 | } 1062 | if$ 1063 | format.isbn output 1064 | format.doi output 1065 | new.block 1066 | format.note output 1067 | format.eprint output 1068 | format.url output 1069 | fin.entry 1070 | } 1071 | FUNCTION {booklet} 1072 | { output.bibitem 1073 | format.authors output 1074 | author format.key output 1075 | format.date "year" output.check 1076 | date.block 1077 | format.title "title" output.check 1078 | new.block 1079 | howpublished "howpublished" bibinfo.check output 1080 | address "address" bibinfo.check output 1081 | format.isbn output 1082 | format.doi output 1083 | new.block 1084 | format.note output 1085 | format.eprint output 1086 | format.url output 1087 | fin.entry 1088 | } 1089 | 1090 | FUNCTION {inbook} 1091 | { output.bibitem 1092 | author empty$ 1093 | { format.editors "author and editor" output.check 1094 | editor format.key output 1095 | } 1096 | { format.authors output.nonnull 1097 | crossref missing$ 1098 | { "author and editor" editor either.or.check } 1099 | 'skip$ 1100 | if$ 1101 | } 1102 | if$ 1103 | format.date "year" output.check 1104 | date.block 1105 | format.btitle "title" output.check 1106 | crossref missing$ 1107 | { 1108 | format.bvolume output 1109 | format.chapter.pages "chapter and pages" output.check 1110 | new.block 1111 | format.number.series output 1112 | format.edition output 1113 | new.sentence 1114 | format.publisher.address output 1115 | } 1116 | { 1117 | format.chapter.pages "chapter and pages" output.check 1118 | new.block 1119 | format.book.crossref output.nonnull 1120 | } 1121 | if$ 1122 | crossref missing$ 1123 | { format.isbn output } 1124 | 'skip$ 1125 | if$ 1126 | format.doi output 1127 | new.block 1128 | format.note output 1129 | format.eprint output 1130 | format.url output 1131 | fin.entry 1132 | } 1133 | 1134 | FUNCTION {incollection} 1135 | { output.bibitem 1136 | format.authors "author" output.check 1137 | author format.key output 1138 | format.date "year" output.check 1139 | date.block 1140 | 
format.title "title" output.check 1141 | new.block 1142 | crossref missing$ 1143 | { format.in.ed.booktitle "booktitle" output.check 1144 | format.bvolume output 1145 | format.number.series output 1146 | format.edition output 1147 | format.chapter.pages output 1148 | new.sentence 1149 | format.publisher.address output 1150 | format.isbn output 1151 | } 1152 | { format.incoll.inproc.crossref output.nonnull 1153 | format.chapter.pages output 1154 | } 1155 | if$ 1156 | format.doi output 1157 | new.block 1158 | format.note output 1159 | format.eprint output 1160 | format.url output 1161 | fin.entry 1162 | } 1163 | FUNCTION {inproceedings} 1164 | { output.bibitem 1165 | format.authors "author" output.check 1166 | author format.key output 1167 | format.date "year" output.check 1168 | date.block 1169 | format.title "title" output.check 1170 | new.block 1171 | crossref missing$ 1172 | { format.in.ed.booktitle "booktitle" output.check 1173 | format.bvolume output 1174 | format.number.series output 1175 | format.pages output 1176 | new.sentence 1177 | publisher empty$ 1178 | { format.organization.address output } 1179 | { organization "organization" bibinfo.check output 1180 | format.publisher.address output 1181 | } 1182 | if$ 1183 | format.isbn output 1184 | format.issn output 1185 | } 1186 | { format.incoll.inproc.crossref output.nonnull 1187 | format.pages output 1188 | } 1189 | if$ 1190 | format.doi output 1191 | new.block 1192 | format.note output 1193 | format.eprint output 1194 | format.url output 1195 | fin.entry 1196 | } 1197 | FUNCTION {conference} { inproceedings } 1198 | FUNCTION {manual} 1199 | { output.bibitem 1200 | format.authors output 1201 | author format.key output 1202 | format.date "year" output.check 1203 | date.block 1204 | format.btitle "title" output.check 1205 | organization address new.block.checkb 1206 | organization "organization" bibinfo.check output 1207 | address "address" bibinfo.check output 1208 | format.edition output 1209 | format.doi output 1210 | new.block 1211 | format.note output 1212 | format.eprint output 1213 | format.url output 1214 | fin.entry 1215 | } 1216 | 1217 | FUNCTION {mastersthesis} 1218 | { output.bibitem 1219 | format.authors "author" output.check 1220 | author format.key output 1221 | format.date "year" output.check 1222 | date.block 1223 | format.btitle 1224 | "title" output.check 1225 | new.block 1226 | bbl.mthesis format.thesis.type output.nonnull 1227 | school "school" bibinfo.warn output 1228 | address "address" bibinfo.check output 1229 | format.doi output 1230 | new.block 1231 | format.note output 1232 | format.eprint output 1233 | format.url output 1234 | fin.entry 1235 | } 1236 | 1237 | FUNCTION {misc} 1238 | { output.bibitem 1239 | format.authors output 1240 | author format.key output 1241 | format.date "year" output.check 1242 | date.block 1243 | format.title output 1244 | new.block 1245 | howpublished "howpublished" bibinfo.check output 1246 | format.doi output 1247 | new.block 1248 | format.note output 1249 | format.eprint output 1250 | format.url output 1251 | fin.entry 1252 | } 1253 | FUNCTION {phdthesis} 1254 | { output.bibitem 1255 | format.authors "author" output.check 1256 | author format.key output 1257 | format.date "year" output.check 1258 | date.block 1259 | format.btitle 1260 | "title" output.check 1261 | new.block 1262 | bbl.phdthesis format.thesis.type output.nonnull 1263 | school "school" bibinfo.warn output 1264 | address "address" bibinfo.check output 1265 | format.doi output 1266 | new.block 1267 | format.note 
output 1268 | format.eprint output 1269 | format.url output 1270 | fin.entry 1271 | } 1272 | 1273 | FUNCTION {proceedings} 1274 | { output.bibitem 1275 | format.editors output 1276 | editor format.key output 1277 | format.date "year" output.check 1278 | date.block 1279 | format.btitle "title" output.check 1280 | format.bvolume output 1281 | format.number.series output 1282 | new.sentence 1283 | publisher empty$ 1284 | { format.organization.address output } 1285 | { organization "organization" bibinfo.check output 1286 | format.publisher.address output 1287 | } 1288 | if$ 1289 | format.isbn output 1290 | format.issn output 1291 | format.doi output 1292 | new.block 1293 | format.note output 1294 | format.eprint output 1295 | format.url output 1296 | fin.entry 1297 | } 1298 | 1299 | FUNCTION {techreport} 1300 | { output.bibitem 1301 | format.authors "author" output.check 1302 | author format.key output 1303 | format.date "year" output.check 1304 | date.block 1305 | format.title 1306 | "title" output.check 1307 | new.block 1308 | format.tr.number emphasize output.nonnull 1309 | institution "institution" bibinfo.warn output 1310 | address "address" bibinfo.check output 1311 | format.doi output 1312 | new.block 1313 | format.note output 1314 | format.eprint output 1315 | format.url output 1316 | fin.entry 1317 | } 1318 | 1319 | FUNCTION {unpublished} 1320 | { output.bibitem 1321 | format.authors "author" output.check 1322 | author format.key output 1323 | format.date "year" output.check 1324 | date.block 1325 | format.title "title" output.check 1326 | format.doi output 1327 | new.block 1328 | format.note "note" output.check 1329 | format.eprint output 1330 | format.url output 1331 | fin.entry 1332 | } 1333 | 1334 | FUNCTION {default.type} { misc } 1335 | READ 1336 | FUNCTION {sortify} 1337 | { purify$ 1338 | "l" change.case$ 1339 | } 1340 | INTEGERS { len } 1341 | FUNCTION {chop.word} 1342 | { 's := 1343 | 'len := 1344 | s #1 len substring$ = 1345 | { s len #1 + global.max$ substring$ } 1346 | 's 1347 | if$ 1348 | } 1349 | FUNCTION {format.lab.names} 1350 | { 's := 1351 | "" 't := 1352 | s #1 "{vv~}{ll}" format.name$ 1353 | s num.names$ duplicate$ 1354 | #2 > 1355 | { pop$ 1356 | " " * bbl.etal emphasize * 1357 | } 1358 | { #2 < 1359 | 'skip$ 1360 | { s #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" = 1361 | { 1362 | " " * bbl.etal emphasize * 1363 | } 1364 | { bbl.and space.word * s #2 "{vv~}{ll}" format.name$ 1365 | * } 1366 | if$ 1367 | } 1368 | if$ 1369 | } 1370 | if$ 1371 | } 1372 | 1373 | FUNCTION {author.key.label} 1374 | { author empty$ 1375 | { key empty$ 1376 | { cite$ #1 #3 substring$ } 1377 | 'key 1378 | if$ 1379 | } 1380 | { author format.lab.names } 1381 | if$ 1382 | } 1383 | 1384 | FUNCTION {author.editor.key.label} 1385 | { author empty$ 1386 | { editor empty$ 1387 | { key empty$ 1388 | { cite$ #1 #3 substring$ } 1389 | 'key 1390 | if$ 1391 | } 1392 | { editor format.lab.names } 1393 | if$ 1394 | } 1395 | { author format.lab.names } 1396 | if$ 1397 | } 1398 | 1399 | FUNCTION {editor.key.label} 1400 | { editor empty$ 1401 | { key empty$ 1402 | { cite$ #1 #3 substring$ } 1403 | 'key 1404 | if$ 1405 | } 1406 | { editor format.lab.names } 1407 | if$ 1408 | } 1409 | 1410 | FUNCTION {calc.short.authors} 1411 | { type$ "book" = 1412 | type$ "inbook" = 1413 | or 1414 | 'author.editor.key.label 1415 | { type$ "proceedings" = 1416 | 'editor.key.label 1417 | 'author.key.label 1418 | if$ 1419 | } 1420 | if$ 1421 | 'short.list := 1422 | } 1423 | 1424 | FUNCTION {calc.label} 1425 | { 
calc.short.authors 1426 | short.list 1427 | "(" 1428 | * 1429 | year duplicate$ empty$ 1430 | short.list key field.or.null = or 1431 | { pop$ "" } 1432 | 'skip$ 1433 | if$ 1434 | * 1435 | 'label := 1436 | } 1437 | 1438 | FUNCTION {sort.format.names} 1439 | { 's := 1440 | #1 'nameptr := 1441 | "" 1442 | s num.names$ 'numnames := 1443 | numnames 'namesleft := 1444 | { namesleft #0 > } 1445 | { s nameptr 1446 | "{vv{ } }{ll{ }}{ f{ }}{ jj{ }}" 1447 | format.name$ 't := 1448 | nameptr #1 > 1449 | { 1450 | " " * 1451 | namesleft #1 = t "others" = and 1452 | { "zzzzz" * } 1453 | { t sortify * } 1454 | if$ 1455 | } 1456 | { t sortify * } 1457 | if$ 1458 | nameptr #1 + 'nameptr := 1459 | namesleft #1 - 'namesleft := 1460 | } 1461 | while$ 1462 | } 1463 | 1464 | FUNCTION {sort.format.title} 1465 | { 't := 1466 | "A " #2 1467 | "An " #3 1468 | "The " #4 t chop.word 1469 | chop.word 1470 | chop.word 1471 | sortify 1472 | #1 global.max$ substring$ 1473 | } 1474 | FUNCTION {author.sort} 1475 | { author empty$ 1476 | { key empty$ 1477 | { "to sort, need author or key in " cite$ * warning$ 1478 | "" 1479 | } 1480 | { key sortify } 1481 | if$ 1482 | } 1483 | { author sort.format.names } 1484 | if$ 1485 | } 1486 | FUNCTION {author.editor.sort} 1487 | { author empty$ 1488 | { editor empty$ 1489 | { key empty$ 1490 | { "to sort, need author, editor, or key in " cite$ * warning$ 1491 | "" 1492 | } 1493 | { key sortify } 1494 | if$ 1495 | } 1496 | { editor sort.format.names } 1497 | if$ 1498 | } 1499 | { author sort.format.names } 1500 | if$ 1501 | } 1502 | FUNCTION {editor.sort} 1503 | { editor empty$ 1504 | { key empty$ 1505 | { "to sort, need editor or key in " cite$ * warning$ 1506 | "" 1507 | } 1508 | { key sortify } 1509 | if$ 1510 | } 1511 | { editor sort.format.names } 1512 | if$ 1513 | } 1514 | FUNCTION {presort} 1515 | { calc.label 1516 | label sortify 1517 | " " 1518 | * 1519 | type$ "book" = 1520 | type$ "inbook" = 1521 | or 1522 | 'author.editor.sort 1523 | { type$ "proceedings" = 1524 | 'editor.sort 1525 | 'author.sort 1526 | if$ 1527 | } 1528 | if$ 1529 | #1 entry.max$ substring$ 1530 | 'sort.label := 1531 | sort.label 1532 | * 1533 | " " 1534 | * 1535 | title field.or.null 1536 | sort.format.title 1537 | * 1538 | #1 entry.max$ substring$ 1539 | 'sort.key$ := 1540 | } 1541 | 1542 | ITERATE {presort} 1543 | SORT 1544 | STRINGS { last.label next.extra } 1545 | INTEGERS { last.extra.num number.label } 1546 | FUNCTION {initialize.extra.label.stuff} 1547 | { #0 int.to.chr$ 'last.label := 1548 | "" 'next.extra := 1549 | #0 'last.extra.num := 1550 | #0 'number.label := 1551 | } 1552 | FUNCTION {forward.pass} 1553 | { last.label label = 1554 | { last.extra.num #1 + 'last.extra.num := 1555 | last.extra.num int.to.chr$ 'extra.label := 1556 | } 1557 | { "a" chr.to.int$ 'last.extra.num := 1558 | "" 'extra.label := 1559 | label 'last.label := 1560 | } 1561 | if$ 1562 | number.label #1 + 'number.label := 1563 | } 1564 | FUNCTION {reverse.pass} 1565 | { next.extra "b" = 1566 | { "a" 'extra.label := } 1567 | 'skip$ 1568 | if$ 1569 | extra.label 'next.extra := 1570 | extra.label 1571 | duplicate$ empty$ 1572 | 'skip$ 1573 | { "{\natexlab{" swap$ * "}}" * } 1574 | if$ 1575 | 'extra.label := 1576 | label extra.label * 'label := 1577 | } 1578 | EXECUTE {initialize.extra.label.stuff} 1579 | ITERATE {forward.pass} 1580 | REVERSE {reverse.pass} 1581 | FUNCTION {bib.sort.order} 1582 | { sort.label 1583 | " " 1584 | * 1585 | year field.or.null sortify 1586 | * 1587 | " " 1588 | * 1589 | title field.or.null 1590 | 
sort.format.title 1591 | * 1592 | #1 entry.max$ substring$ 1593 | 'sort.key$ := 1594 | } 1595 | ITERATE {bib.sort.order} 1596 | SORT 1597 | FUNCTION {begin.bib} 1598 | { preamble$ empty$ 1599 | 'skip$ 1600 | { preamble$ write$ newline$ } 1601 | if$ 1602 | "\begin{thebibliography}{" number.label int.to.str$ * "}" * 1603 | write$ newline$ 1604 | "\newcommand{\enquote}[1]{``#1''}" 1605 | write$ newline$ 1606 | "\providecommand{\natexlab}[1]{#1}" 1607 | write$ newline$ 1608 | "\providecommand{\url}[1]{\texttt{#1}}" 1609 | write$ newline$ 1610 | "\providecommand{\urlprefix}{URL }" 1611 | write$ newline$ 1612 | "\expandafter\ifx\csname urlstyle\endcsname\relax" 1613 | write$ newline$ 1614 | " \providecommand{\doi}[1]{doi:\discretionary{}{}{}#1}\else" 1615 | write$ newline$ 1616 | " \providecommand{\doi}{doi:\discretionary{}{}{}\begingroup \urlstyle{rm}\Url}\fi" 1617 | write$ newline$ 1618 | "\providecommand{\eprint}[2][]{\url{#2}}" 1619 | write$ newline$ 1620 | } 1621 | EXECUTE {begin.bib} 1622 | EXECUTE {init.state.consts} 1623 | ITERATE {call.type$} 1624 | FUNCTION {end.bib} 1625 | { newline$ 1626 | "\end{thebibliography}" write$ newline$ 1627 | } 1628 | EXECUTE {end.bib} 1629 | %% End of customized bst file 1630 | %% 1631 | %% End of file `jss.bst'. 1632 | -------------------------------------------------------------------------------- /jss.cls: -------------------------------------------------------------------------------- 1 | %% 2 | %% This is file `jss.cls', 3 | \def\fileversion{2.1} 4 | \def\filename{jss} 5 | \def\filedate{2012/06/07} 6 | %% 7 | %% Package `jss' to use with LaTeX2e for JSS publications (http://www.jstatsoft.org/) 8 | %% License: GPL-2 9 | %% Copyright: (C) Achim Zeileis 10 | %% Please report errors to Achim.Zeileis@R-project.org 11 | %% 12 | \NeedsTeXFormat{LaTeX2e} 13 | \ProvidesClass{jss}[\filedate\space\fileversion\space jss class by Achim Zeileis] 14 | %% options 15 | \newif\if@article 16 | \newif\if@codesnippet 17 | \newif\if@bookreview 18 | \newif\if@softwarereview 19 | \newif\if@review 20 | \newif\if@shortnames 21 | \newif\if@nojss 22 | \newif\if@notitle 23 | \newif\if@noheadings 24 | \newif\if@nofooter 25 | 26 | \@articletrue 27 | \@codesnippetfalse 28 | \@bookreviewfalse 29 | \@softwarereviewfalse 30 | \@reviewfalse 31 | \@shortnamesfalse 32 | \@nojssfalse 33 | \@notitlefalse 34 | \@noheadingsfalse 35 | \@nofooterfalse 36 | 37 | \DeclareOption{article}{\@articletrue% 38 | \@codesnippetfalse \@bookreviewfalse \@softwarereviewfalse} 39 | \DeclareOption{codesnippet}{\@articlefalse% 40 | \@codesnippettrue \@bookreviewfalse \@softwarereviewfalse} 41 | \DeclareOption{bookreview}{\@articlefalse% 42 | \@codesnippetfalse \@bookreviewtrue \@softwarereviewfalse} 43 | \DeclareOption{softwarereview}{\@articlefalse% 44 | \@codesnippetfalse \@bookreviewfalse \@softwarereviewtrue} 45 | \DeclareOption{shortnames}{\@shortnamestrue} 46 | \DeclareOption{nojss}{\@nojsstrue} 47 | \DeclareOption{notitle}{\@notitletrue} 48 | \DeclareOption{noheadings}{\@noheadingstrue} 49 | \DeclareOption{nofooter}{\@nofootertrue} 50 | 51 | \ProcessOptions 52 | \LoadClass[11pt,a4paper,twoside]{article} 53 | %% required packages 54 | \RequirePackage{graphicx,a4wide,color,ae,fancyvrb} 55 | \RequirePackage[T1]{fontenc} 56 | \IfFileExists{upquote.sty}{\RequirePackage{upquote}}{} 57 | %% bibliography 58 | \if@shortnames 59 | \usepackage[authoryear,round]{natbib} 60 | \else 61 | \usepackage[authoryear,round,longnamesfirst]{natbib} 62 | \fi 63 | \bibpunct{(}{)}{;}{a}{}{,} 64 | \bibliographystyle{jss} 65 | %% 
paragraphs 66 | \setlength{\parskip}{0.7ex plus0.1ex minus0.1ex} 67 | \setlength{\parindent}{0em} 68 | %% for all publications 69 | \newcommand{\Address}[1]{\def\@Address{#1}} 70 | \newcommand{\Plaintitle}[1]{\def\@Plaintitle{#1}} 71 | \newcommand{\Shorttitle}[1]{\def\@Shorttitle{#1}} 72 | \newcommand{\Plainauthor}[1]{\def\@Plainauthor{#1}} 73 | \newcommand{\Volume}[1]{\def\@Volume{#1}} 74 | \newcommand{\Year}[1]{\def\@Year{#1}} 75 | \newcommand{\Month}[1]{\def\@Month{#1}} 76 | \newcommand{\Issue}[1]{\def\@Issue{#1}} 77 | \newcommand{\Submitdate}[1]{\def\@Submitdate{#1}} 78 | %% for articles and code snippets 79 | \newcommand{\Acceptdate}[1]{\def\@Acceptdate{#1}} 80 | \newcommand{\Abstract}[1]{\def\@Abstract{#1}} 81 | \newcommand{\Keywords}[1]{\def\@Keywords{#1}} 82 | \newcommand{\Plainkeywords}[1]{\def\@Plainkeywords{#1}} 83 | %% for book and software reviews 84 | \newcommand{\Reviewer}[1]{\def\@Reviewer{#1}} 85 | \newcommand{\Booktitle}[1]{\def\@Booktitle{#1}} 86 | \newcommand{\Bookauthor}[1]{\def\@Bookauthor{#1}} 87 | \newcommand{\Publisher}[1]{\def\@Publisher{#1}} 88 | \newcommand{\Pubaddress}[1]{\def\@Pubaddress{#1}} 89 | \newcommand{\Pubyear}[1]{\def\@Pubyear{#1}} 90 | \newcommand{\ISBN}[1]{\def\@ISBN{#1}} 91 | \newcommand{\Pages}[1]{\def\@Pages{#1}} 92 | \newcommand{\Price}[1]{\def\@Price{#1}} 93 | \newcommand{\Plainreviewer}[1]{\def\@Plainreviewer{#1}} 94 | \newcommand{\Softwaretitle}[1]{\def\@Softwaretitle{#1}} 95 | \newcommand{\URL}[1]{\def\@URL{#1}} 96 | %% for internal use 97 | \newcommand{\Seriesname}[1]{\def\@Seriesname{#1}} 98 | \newcommand{\Hypersubject}[1]{\def\@Hypersubject{#1}} 99 | \newcommand{\Hyperauthor}[1]{\def\@Hyperauthor{#1}} 100 | \newcommand{\Footername}[1]{\def\@Footername{#1}} 101 | \newcommand{\Firstdate}[1]{\def\@Firstdate{#1}} 102 | \newcommand{\Seconddate}[1]{\def\@Seconddate{#1}} 103 | \newcommand{\Reviewauthor}[1]{\def\@Reviewauthor{#1}} 104 | %% defaults 105 | \author{Firstname Lastname\\Affiliation} 106 | \title{Title} 107 | \Abstract{---!!!---an abstract is required---!!!---} 108 | \Plainauthor{\@author} 109 | \Volume{VV} 110 | \Year{YYYY} 111 | \Month{MMMMMM} 112 | \Issue{II} 113 | \Submitdate{yyyy-mm-dd} 114 | \Acceptdate{yyyy-mm-dd} 115 | \Address{ 116 | Firstname Lastname\\ 117 | Affiliation\\ 118 | Address, Country\\ 119 | E-mail: \email{name@address}\\ 120 | URL: \url{http://link/to/webpage/} 121 | } 122 | 123 | \Reviewer{Firstname Lastname\\Affiliation} 124 | \Plainreviewer{Firstname Lastname} 125 | \Booktitle{Book Title} 126 | \Bookauthor{Book Author} 127 | \Publisher{Publisher} 128 | \Pubaddress{Publisher's Address} 129 | \Pubyear{YYY} 130 | \ISBN{x-xxxxx-xxx-x} 131 | \Pages{xv + 123} 132 | \Price{USD 69.95 (P)} 133 | \URL{http://link/to/webpage/} 134 | \if@article 135 | \Seriesname{Issue} 136 | \Hypersubject{Journal of Statistical Software} 137 | \Plaintitle{\@title} 138 | \Shorttitle{\@title} 139 | \Plainkeywords{\@Keywords} 140 | \fi 141 | 142 | \if@codesnippet 143 | \Seriesname{Code Snippet} 144 | \Hypersubject{Journal of Statistical Software -- Code Snippets} 145 | \Plaintitle{\@title} 146 | \Shorttitle{\@title} 147 | \Plainkeywords{\@Keywords} 148 | \fi 149 | 150 | \if@bookreview 151 | \Seriesname{Book Review} 152 | \Hypersubject{Journal of Statistical Software -- Book Reviews} 153 | \Plaintitle{\@Booktitle} 154 | \Shorttitle{\@Booktitle} 155 | \Reviewauthor{\@Bookauthor\\ 156 | \@Publisher, \@Pubaddress, \@Pubyear.\\ 157 | ISBN~\@ISBN. \@Pages~pp. 
\@Price.\\ 158 | \url{\@URL}} 159 | \Plainkeywords{} 160 | \@reviewtrue 161 | \fi 162 | 163 | \if@softwarereview 164 | \Seriesname{Software Review} 165 | \Hypersubject{Journal of Statistical Software -- Software Reviews} 166 | \Plaintitle{\@Softwaretitle} 167 | \Shorttitle{\@Softwaretitle} 168 | \Booktitle{\@Softwaretitle} 169 | \Reviewauthor{\@Publisher, \@Pubaddress. \@Price.\\ 170 | \url{\@URL}} 171 | \Plainkeywords{} 172 | \@reviewtrue 173 | \fi 174 | 175 | \if@review 176 | \Hyperauthor{\@Plainreviewer} 177 | \Keywords{} 178 | \Footername{Reviewer} 179 | \Firstdate{\textit{Published:} \@Submitdate} 180 | \Seconddate{} 181 | \else 182 | \Hyperauthor{\@Plainauthor} 183 | \Keywords{---!!!---at least one keyword is required---!!!---} 184 | \Footername{Affiliation} 185 | \Firstdate{\textit{Submitted:} \@Submitdate} 186 | \Seconddate{\textit{Accepted:} \@Acceptdate} 187 | \fi 188 | %% Sweave(-like) 189 | \DefineVerbatimEnvironment{Sinput}{Verbatim}{fontshape=sl} 190 | \DefineVerbatimEnvironment{Soutput}{Verbatim}{} 191 | \DefineVerbatimEnvironment{Scode}{Verbatim}{fontshape=sl} 192 | \newenvironment{Schunk}{}{} 193 | \DefineVerbatimEnvironment{Code}{Verbatim}{} 194 | \DefineVerbatimEnvironment{CodeInput}{Verbatim}{fontshape=sl} 195 | \DefineVerbatimEnvironment{CodeOutput}{Verbatim}{} 196 | \newenvironment{CodeChunk}{}{} 197 | \setkeys{Gin}{width=0.8\textwidth} 198 | %% footer 199 | \newlength{\footerskip} 200 | \setlength{\footerskip}{2.5\baselineskip plus 2ex minus 0.5ex} 201 | 202 | \newcommand{\makefooter}{% 203 | \vspace{\footerskip} 204 | 205 | \if@nojss 206 | \begin{samepage} 207 | \textbf{\large \@Footername: \nopagebreak}\\[.3\baselineskip] \nopagebreak 208 | \@Address \nopagebreak 209 | \end{samepage} 210 | \else 211 | \begin{samepage} 212 | \textbf{\large \@Footername: \nopagebreak}\\[.3\baselineskip] \nopagebreak 213 | \@Address \nopagebreak 214 | \vfill 215 | \hrule \nopagebreak 216 | \vspace{.1\baselineskip} 217 | {\fontfamily{pzc} \fontsize{13}{15} \selectfont Journal of Statistical Software} 218 | \hfill 219 | \url{http://www.jstatsoft.org/}\\ \nopagebreak 220 | published by the American Statistical Association 221 | \hfill 222 | \url{http://www.amstat.org/}\\[.3\baselineskip] \nopagebreak 223 | {Volume~\@Volume, \@Seriesname~\@Issue} 224 | \hfill 225 | \@Firstdate\\ \nopagebreak 226 | {\@Month{} \@Year} 227 | \hfill 228 | \@Seconddate \nopagebreak 229 | \vspace{.3\baselineskip} 230 | \hrule 231 | \end{samepage} 232 | \fi 233 | } 234 | \if@nofooter 235 | %% \AtEndDocument{\makefooter} 236 | \else 237 | \AtEndDocument{\makefooter} 238 | \fi 239 | %% required packages 240 | \RequirePackage{hyperref} 241 | %% new \maketitle 242 | \def\@myoddhead{ 243 | {\color{white} JSS}\\[-1.42cm] 244 | \hspace{-2em} \includegraphics[height=23mm,keepaspectratio]{jsslogo} \hfill 245 | \parbox[b][23mm]{118mm}{\hrule height 3pt 246 | \center{ 247 | {\fontfamily{pzc} \fontsize{28}{32} \selectfont Journal of Statistical Software} 248 | \vfill 249 | {\it \small \@Month{} \@Year, Volume~\@Volume, \@Seriesname~\@Issue.% 250 | \hfill \href{http://www.jstatsoft.org/}{http://www.jstatsoft.org/}}}\\[0.1cm] 251 | \hrule height 3pt}} 252 | \if@review 253 | \renewcommand{\maketitle}{ 254 | \if@nojss 255 | %% \@oddhead{\@myoddhead}\\[3\baselineskip] 256 | \else 257 | \@oddhead{\@myoddhead}\\[3\baselineskip] 258 | \fi 259 | {\large 260 | \noindent 261 | Reviewer: \@Reviewer 262 | \vspace{\baselineskip} 263 | \hrule 264 | \vspace{\baselineskip} 265 | \textbf{\@Booktitle} 266 | \begin{quotation} \noindent 267 | 
\@Reviewauthor 268 | \end{quotation} 269 | \vspace{0.7\baselineskip} 270 | \hrule 271 | \vspace{1.3\baselineskip} 272 | } 273 | 274 | \thispagestyle{empty} 275 | \if@nojss 276 | \markboth{\centerline{\@Shorttitle}}{\centerline{\@Hyperauthor}} 277 | \else 278 | \markboth{\centerline{\@Shorttitle}}{\centerline{\@Hypersubject}} 279 | \fi 280 | \pagestyle{myheadings} 281 | } 282 | \else 283 | \def\maketitle{ 284 | \if@nojss 285 | %% \@oddhead{\@myoddhead} \par 286 | \else 287 | \@oddhead{\@myoddhead} \par 288 | \fi 289 | \begingroup 290 | \def\thefootnote{\fnsymbol{footnote}} 291 | \def\@makefnmark{\hbox to 0pt{$^{\@thefnmark}$\hss}} 292 | \long\def\@makefntext##1{\parindent 1em\noindent 293 | \hbox to1.8em{\hss $\m@th ^{\@thefnmark}$}##1} 294 | \@maketitle \@thanks 295 | \endgroup 296 | \setcounter{footnote}{0} 297 | 298 | \if@noheadings 299 | %% \markboth{\centerline{\@Shorttitle}}{\centerline{\@Hypersubject}} 300 | \else 301 | \thispagestyle{empty} 302 | \if@nojss 303 | \markboth{\centerline{\@Shorttitle}}{\centerline{\@Hyperauthor}} 304 | \else 305 | \markboth{\centerline{\@Shorttitle}}{\centerline{\@Hypersubject}} 306 | \fi 307 | \pagestyle{myheadings} 308 | \fi 309 | 310 | \let\maketitle\relax \let\@maketitle\relax 311 | \gdef\@thanks{}\gdef\@author{}\gdef\@title{}\let\thanks\relax 312 | } 313 | 314 | \def\@maketitle{\vbox{\hsize\textwidth \linewidth\hsize 315 | \if@nojss 316 | %% \vskip 1in 317 | \else 318 | \vskip 1in 319 | \fi 320 | {\centering 321 | {\LARGE\bf \@title\par} 322 | \vskip 0.2in plus 1fil minus 0.1in 323 | { 324 | \def\and{\unskip\enspace{\rm and}\enspace}% 325 | \def\And{\end{tabular}\hss \egroup \hskip 1in plus 2fil 326 | \hbox to 0pt\bgroup\hss \begin{tabular}[t]{c}\large\bf\rule{\z@}{24pt}\ignorespaces}% 327 | \def\AND{\end{tabular}\hss\egroup \hfil\hfil\egroup 328 | \vskip 0.1in plus 1fil minus 0.05in 329 | \hbox to \linewidth\bgroup\rule{\z@}{10pt} \hfil\hfil 330 | \hbox to 0pt\bgroup\hss \begin{tabular}[t]{c}\large\bf\rule{\z@}{24pt}\ignorespaces} 331 | \hbox to \linewidth\bgroup\rule{\z@}{10pt} \hfil\hfil 332 | \hbox to 0pt\bgroup\hss \begin{tabular}[t]{c}\large\bf\rule{\z@}{24pt}\@author 333 | \end{tabular}\hss\egroup 334 | \hfil\hfil\egroup} 335 | \vskip 0.3in minus 0.1in 336 | \hrule 337 | \begin{abstract} 338 | \@Abstract 339 | \end{abstract}} 340 | \textit{Keywords}:~\@Keywords. 341 | \vskip 0.1in minus 0.05in 342 | \hrule 343 | \vskip 0.2in minus 0.1in 344 | }} 345 | \fi 346 | %% sections, subsections, and subsubsections 347 | \newlength{\preXLskip} 348 | \newlength{\preLskip} 349 | \newlength{\preMskip} 350 | \newlength{\preSskip} 351 | \newlength{\postMskip} 352 | \newlength{\postSskip} 353 | \setlength{\preXLskip}{1.8\baselineskip plus 0.5ex minus 0ex} 354 | \setlength{\preLskip}{1.5\baselineskip plus 0.3ex minus 0ex} 355 | \setlength{\preMskip}{1\baselineskip plus 0.2ex minus 0ex} 356 | \setlength{\preSskip}{.8\baselineskip plus 0.2ex minus 0ex} 357 | \setlength{\postMskip}{.5\baselineskip plus 0ex minus 0.1ex} 358 | \setlength{\postSskip}{.3\baselineskip plus 0ex minus 0.1ex} 359 | 360 | \newcommand{\jsssec}[2][default]{\vskip \preXLskip% 361 | \pdfbookmark[1]{#1}{Section.\thesection.#1}% 362 | \refstepcounter{section}% 363 | \centerline{\textbf{\Large \thesection. 
#2}} \nopagebreak 364 | \vskip \postMskip \nopagebreak} 365 | \newcommand{\jsssecnn}[1]{\vskip \preXLskip% 366 | \centerline{\textbf{\Large #1}} \nopagebreak 367 | \vskip \postMskip \nopagebreak} 368 | 369 | \newcommand{\jsssubsec}[2][default]{\vskip \preMskip% 370 | \pdfbookmark[2]{#1}{Subsection.\thesubsection.#1}% 371 | \refstepcounter{subsection}% 372 | \textbf{\large \thesubsection. #2} \nopagebreak 373 | \vskip \postSskip \nopagebreak} 374 | \newcommand{\jsssubsecnn}[1]{\vskip \preMskip% 375 | \textbf{\large #1} \nopagebreak 376 | \vskip \postSskip \nopagebreak} 377 | 378 | \newcommand{\jsssubsubsec}[2][default]{\vskip \preSskip% 379 | \pdfbookmark[3]{#1}{Subsubsection.\thesubsubsection.#1}% 380 | \refstepcounter{subsubsection}% 381 | {\large \textit{#2}} \nopagebreak 382 | \vskip \postSskip \nopagebreak} 383 | \newcommand{\jsssubsubsecnn}[1]{\vskip \preSskip% 384 | {\textit{\large #1}} \nopagebreak 385 | \vskip \postSskip \nopagebreak} 386 | 387 | \newcommand{\jsssimplesec}[2][default]{\vskip \preLskip% 388 | %% \pdfbookmark[1]{#1}{Section.\thesection.#1}% 389 | \refstepcounter{section}% 390 | \textbf{\large #1} \nopagebreak 391 | \vskip \postSskip \nopagebreak} 392 | \newcommand{\jsssimplesecnn}[1]{\vskip \preLskip% 393 | \textbf{\large #1} \nopagebreak 394 | \vskip \postSskip \nopagebreak} 395 | 396 | \if@review 397 | \renewcommand{\section}{\secdef \jsssimplesec \jsssimplesecnn} 398 | \renewcommand{\subsection}{\secdef \jsssimplesec \jsssimplesecnn} 399 | \renewcommand{\subsubsection}{\secdef \jsssimplesec \jsssimplesecnn} 400 | \else 401 | \renewcommand{\section}{\secdef \jsssec \jsssecnn} 402 | \renewcommand{\subsection}{\secdef \jsssubsec \jsssubsecnn} 403 | \renewcommand{\subsubsection}{\secdef \jsssubsubsec \jsssubsubsecnn} 404 | \fi 405 | %% colors 406 | \definecolor{Red}{rgb}{0.5,0,0} 407 | \definecolor{Blue}{rgb}{0,0,0.5} 408 | \if@review 409 | \hypersetup{% 410 | hyperindex = {true}, 411 | colorlinks = {true}, 412 | linktocpage = {true}, 413 | plainpages = {false}, 414 | linkcolor = {Blue}, 415 | citecolor = {Blue}, 416 | urlcolor = {Red}, 417 | pdfstartview = {Fit}, 418 | pdfpagemode = {None}, 419 | pdfview = {XYZ null null null} 420 | } 421 | \else 422 | \hypersetup{% 423 | hyperindex = {true}, 424 | colorlinks = {true}, 425 | linktocpage = {true}, 426 | plainpages = {false}, 427 | linkcolor = {Blue}, 428 | citecolor = {Blue}, 429 | urlcolor = {Red}, 430 | pdfstartview = {Fit}, 431 | pdfpagemode = {UseOutlines}, 432 | pdfview = {XYZ null null null} 433 | } 434 | \fi 435 | \if@nojss 436 | \AtBeginDocument{ 437 | \hypersetup{% 438 | pdfauthor = {\@Hyperauthor}, 439 | pdftitle = {\@Plaintitle}, 440 | pdfkeywords = {\@Plainkeywords} 441 | } 442 | } 443 | \else 444 | \AtBeginDocument{ 445 | \hypersetup{% 446 | pdfauthor = {\@Hyperauthor}, 447 | pdftitle = {\@Plaintitle}, 448 | pdfsubject = {\@Hypersubject}, 449 | pdfkeywords = {\@Plainkeywords} 450 | } 451 | } 452 | \fi 453 | \if@notitle 454 | %% \AtBeginDocument{\maketitle} 455 | \else 456 | \AtBeginDocument{\maketitle} 457 | \fi 458 | %% commands 459 | \newcommand\code{\bgroup\@makeother\_\@makeother\~\@makeother\$\@codex} 460 | \def\@codex#1{{\normalfont\ttfamily\hyphenchar\font=-1 #1}\egroup} 461 | %%\let\code=\texttt 462 | \let\proglang=\textsf 463 | \newcommand{\pkg}[1]{{\fontseries{b}\selectfont #1}} 464 | \newcommand{\email}[1]{\href{mailto:#1}{\normalfont\texttt{#1}}} 465 | \ifx\csname urlstyle\endcsname\relax 466 | \newcommand\@doi[1]{doi:\discretionary{}{}{}#1}\else 467 | 
\newcommand\@doi{doi:\discretionary{}{}{}\begingroup 468 | \urlstyle{tt}\Url}\fi 469 | \newcommand{\doi}[1]{\href{http://dx.doi.org/#1}{\normalfont\texttt{\@doi{#1}}}} 470 | \newcommand{\E}{\mathsf{E}} 471 | \newcommand{\VAR}{\mathsf{VAR}} 472 | \newcommand{\COV}{\mathsf{COV}} 473 | \newcommand{\Prob}{\mathsf{P}} 474 | \endinput 475 | %% 476 | %% End of file `jss.cls'. 477 | -------------------------------------------------------------------------------- /jss.dtx: -------------------------------------------------------------------------------- 1 | \def\fileversion{2.1} 2 | \def\filename{jss} 3 | \def\filedate{2012/06/07} 4 | % 5 | % \iffalse 6 | % 7 | %% 8 | %% Package `jss' to use with LaTeX2e for JSS publications (http://www.jstatsoft.org/) 9 | %% License: GPL-2 10 | %% Copyright: (C) Achim Zeileis 11 | %% Please report errors to Achim.Zeileis@R-project.org 12 | %% 13 | % 14 | % \fi 15 | % 16 | % \changes{0.1}{2004/08/09} 17 | % {First draft.} 18 | % 19 | % \changes{1.0}{2004/09/29} 20 | % {First release. 21 | % - new font size (11pt) 22 | % - better formatting of sections} 23 | % 24 | % \changes{1.1}{2004/10/01} 25 | % {Bug fix: sections and pdfbookmarks 26 | % (arguments were switched).} 27 | % 28 | % \changes{1.2}{2004/10/02} 29 | % {changed logo name, improved docs} 30 | % 31 | % \changes{1.3}{2004/10/05} 32 | % {fixed Shorttitle default} 33 | % 34 | % \changes{1.4}{2005/01/28} 35 | % {updated docs} 36 | % 37 | % \changes{1.5}{2005/12/09} 38 | % {now an official ASA journal} 39 | % 40 | % \changes{1.6}{2007/01/28} 41 | % {small enhancements} 42 | % 43 | % \changes{1.7}{2007/10/15} 44 | % {changed link colors, modifed hyperref inclusion for texlive} 45 | % 46 | % \changes{1.8}{2008/04/08} 47 | % {added option to omit JSS markup, slightly changed pkg markup} 48 | % 49 | % \changes{2.0}{2009/09/24} 50 | % {added GPL-2 license, new options 'notitle' and 'noheadings'} 51 | % 52 | % \changes{2.1}{2012/06/07} 53 | % {allow _ in doi, new option 'nofooter'} 54 | % 55 | % 56 | % 57 | % \MakeShortVerb{\|} 58 | % \newcommand{\foopkg}[1]{{\normalfont\fontseries{b}\selectfont #1}} 59 | % \newcommand{\enquote}[1]{``#1''} 60 | % 61 | % \title{\foopkg{jss}: A Document Class for Publications in the Journal of Statistical Software} 62 | % \author{Achim Zeileis} 63 | % 64 | % \maketitle 65 | % 66 | % \section{Introduction} \label{sec:intro} 67 | % 68 | % The \LaTeXe{} document class \foopkg{jss} is an extension of the 69 | % standard \LaTeXe{} \foopkg{article} class for publications in the 70 | % Journal of Statistical Software (JSS, \url{http://www.jstatsoft.org/}). 71 | % Additionally, the JSS-specific header/footer can be easily switched 72 | % off so that the document class can easily be used for other publications, 73 | % e.g., \textsf{R} package vignettes. 74 | % 75 | % The document class provides infrastructure for all four kinds of publications 76 | % in JSS: regular articles, code snippets, book reviews and 77 | % software reviews. Each document requires several declarations to 78 | % be made in the header (before |\begin{document}|) 79 | % which are described in Section~\ref{sec:ifa} separately 80 | % for articles/code snippets and book/software reviews 81 | % along with some general commands 82 | % which can be used in all documents. 83 | % 84 | % The final version of JSS papers should be prepared using this JSS style file; 85 | % the submission of the final version needs to include the full sources 86 | % (|.tex|, |.bib|, and all graphics). 
A quick check for the most important aspects 87 | % of the JSS style is given in Section~\ref{sec:check}; authors should make sure that all 88 | % of them are addressed in the final version. A list of frequently asked questions 89 | % (FAQ) is available online at \url{http://www.jstatsoft.org/style}; it provides 90 | % additional details and addresses typical problems. 91 | % 92 | % All documents need to be processed by pdf\TeX{}; some useful information 93 | % on this is provided in Section~\ref{sec:TeX}, which also contains some 94 | % information on using \textsc{Bib}\TeX{}. \textsc{Bib}\TeX{} together 95 | % with the style file |jss.bst| produces references 96 | % and citations in the required format. 97 | % 98 | % The actual code for the batch file (|jss.ins|), the 99 | % driver (|jss.drv|) and the class (|jss.cls|) is 100 | % briefly described in Section~\ref{sec:code}. Note that usually 101 | % you do not have to read that section when you want to prepare 102 | % a submission for JSS. 103 | % 104 | % 105 | % \section{Instructions for authors} \label{sec:ifa} 106 | % 107 | % To use the JSS styles, you have to include the class file 108 | % |jss.cls|, the logo |jsslogo.jpg| and the \textsc{Bib}\TeX{} 109 | % style \texttt{jss.bst} in your search path. This can either be 110 | % your local working directory or your |texmf| or 111 | % |localtexmf| tree. 112 | % 113 | % \LaTeX{} documents have to include |jss.cls| first via 114 | % 115 | % |\documentclass[|\textit{type}|]{jss}| 116 | % 117 | % where \textit{type} can be |article| (which is the default), 118 | % |codesnippet|, |bookreview| or |softwarereview|. 119 | % Templates with brief instructions are provided in 120 | % |article.tex|, |codesnippet.tex|, |bookreview.tex| 121 | % and |softwarereview.tex|, respectively. The corresponding 122 | % commands used for the header declarations are described 123 | % in more detail in the following. 124 | % 125 | % By using |jss.cls|, the packages \foopkg{graphicx}, \foopkg{a4wide}, 126 | % \foopkg{color}, \foopkg{hyperref}, \foopkg{ae}, \foopkg{fancyvrb} and 127 | % \foopkg{natbib} are loaded automatically. 128 | % Authors may, of course, include further packages 129 | % but should not change the page layout 130 | % or the font or font encoding. If the package \foopkg{thumbpdf} 131 | % is available, its inclusion is encouraged. 132 | % 133 | % The titles of JSS publications are capitalized, i.e., in title style, but the section 134 | % headers are not and should be in sentence style. 135 | % 136 | % Acknowledgments should be included at the end of the paper before the 137 | % references in a separate section set up via |\section*{Acknowledgments}|. 138 | % 139 | % \emph{Hint.} If you want to use markup in section headers you will usually 140 | % have to escape it for the PDF bookmarks by giving the text for the 141 | % bookmark explicitly without markup, e.g., 142 | % \begin{verbatim} 143 | % \section[Calling C++ from R]{Calling \proglang{C++} from \proglang{R}} 144 | % \end{verbatim} 145 | % 146 | % \emph{Hint.} If compilation with pdf\TeX{} fails with an error at 147 | % |\begin{document}|, the reason is almost surely that some of the 148 | % declarations in the header have not been made properly. For example, 149 | % |\Plainauthor|, |\Plaintitle| or |\Plainkeywords| might be missing 150 | % or still contain markup. 
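% For instance, a header whose plain variants are complete and free of
% markup (the values here are placeholders, matching the examples given
% below) would contain:
% \begin{verbatim}
% \title{A Capitalized Title for a Package \pkg{foo}}
% \Plaintitle{A Capitalized Title for a Package foo}
% \author{Achim Zeileis\\Universit\"at Innsbruck}
% \Plainauthor{Achim Zeileis}
% \Keywords{keywords, comma-separated, not capitalized}
% \Plainkeywords{keywords, comma-separated, not capitalized}
% \end{verbatim}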
151 | % 152 | % \emph{Hint.} If you want to use the JSS style for a non-JSS paper 153 | % (or a modification of a JSS paper, e.g., in a vignette), you can 154 | % set the option |nojss| in the |\documentclass| statement to suppress 155 | % JSS-specific layout. 156 | % 157 | % 158 | % \subsection{Style checklist} \label{sec:check} 159 | % A quick check for the most important aspects of the JSS style is given below. 160 | % Authors should make sure that all of them are addressed in the final version. 161 | % More details can be found in the remainder of this manual. 162 | % 163 | % \begin{itemize} 164 | % \item The manuscript can be compiled by pdf\TeX{}. 165 | % \item |\proglang|, |\pkg| and |\code| have been used for highlighting 166 | % throughout the paper (including titles and references), 167 | % except where explicitly escaped. 168 | % \item References are provided in a |.bib| \textsc{Bib}\TeX{} database 169 | % and included in the text by |\cite|, |\citep|, |\citet|, etc. 170 | % \item Titles and headers are formatted properly: 171 | % \begin{itemize} 172 | % \item |\title| in title style, 173 | % \item |\section| etc.\ in sentence style, 174 | % \item all titles in the \textsc{Bib}\TeX{} file in title style. 175 | % \end{itemize} 176 | % \item Figures, tables and equations are marked with a |\label| 177 | % and referred to by |\ref|, e.g., ``|Figure~\ref{...}|''. 178 | % \item Software packages are |\cite{}|d properly. 179 | % \end{itemize} 180 | % 181 | % 182 | % \subsection{Articles and code snippets} 183 | % 184 | % For JSS articles and code snippets respectively, 185 | % the following declarations have to be made 186 | % in the header of the \TeX{} sources (before |\begin{document}|). 187 | % See also the template |article.tex| or |codesnippet.tex| 188 | % respectively. 189 | % 190 | % \DescribeMacro{\author} 191 | % The command |\author| specifies the list of authors. The name 192 | % of each author should be followed by a linebreak and his 193 | % affiliation (only the university, in a single line). The authors 194 | % should be separated by |\And| (instead of |\and|), e.g., 195 | % \begin{verbatim} 196 | % \author{Achim Zeileis\\Universit\"at Innsbruck \And 197 | % Second Author\\Plus Affiliation} 198 | % \end{verbatim} 199 | % If not all authors fit into a single line, |\AND| (instead of 200 | % |\And|) should be used in front of authors that should go into 201 | % the next line. 202 | % 203 | % \DescribeMacro{\Plainauthor} 204 | % The list of authors without affiliations. It needs to be 205 | % comma-separated and must not contain any markup (bold fonts etc.), e.g., 206 | % \begin{verbatim} 207 | % \Plainauthor{Achim Zeileis, Second Author} 208 | % \end{verbatim} 209 | % 210 | % \DescribeMacro{\title} 211 | % The title of the paper. It should be capitalized and may contain 212 | % further markup (in particular markup such as |\pkg| and |\proglang|), e.g., 213 | % \begin{verbatim} 214 | % \title{A Capitalized Title for a Package \pkg{foo}} 215 | % \end{verbatim} 216 | % 217 | % \DescribeMacro{\Plaintitle} 218 | % The full title without any markup. 219 | % The default is to use |\title|, therefore it needs to be specified 220 | % only if it is different from |\title|, e.g., 221 | % \begin{verbatim} 222 | % \Plaintitle{A Capitalized Title for a Package foo} 223 | % \end{verbatim} 224 | % 225 | % \DescribeMacro{\Shorttitle} 226 | % A shorter version of the title to be used for page headings. 
227 | % The default is to use |\title|, therefore it needs to be specified 228 | % only if it is different from |\title|, e.g., 229 | % \begin{verbatim} 230 | % \Shorttitle{foo: A Capitalized Title} 231 | % \end{verbatim} 232 | % 233 | % \DescribeMacro{\Abstract} 234 | % Enter the abstract for your article here, e.g., 235 | % \begin{verbatim} 236 | % \Abstract{ 237 | % The abstract of the article. 238 | % } 239 | % \end{verbatim} 240 | % 241 | % \DescribeMacro{\Keywords} 242 | % A comma-separated list of (at least one) keyword(s) which 243 | % should not be capitalized, e.g., 244 | % |\Keywords{keywords, comma-separated, not capitalized}|. 245 | % 246 | % \DescribeMacro{\Plainkeywords} 247 | % The list of keywords without any markup. The default is to use 248 | % |\Keywords|, therefore it needs to be specified only 249 | % if it is different from |\Keywords|. 250 | % 251 | % \DescribeMacro{\Volume} 252 | % The JSS volume number in which the article is published, 253 | % e.g., |\Volume{11}|. Note: 254 | % This information will be provided upon acceptance 255 | % or added by the technical editor. Prior to acceptance, 256 | % do not use this command. 257 | % 258 | % \DescribeMacro{\Issue} 259 | % The JSS issue number in which the article is published, 260 | % e.g., |\Issue{9}|. Note: 261 | % This information will be provided upon acceptance 262 | % or added by the technical editor. Prior to acceptance, 263 | % do not use this command. 264 | % 265 | % \DescribeMacro{\Month} 266 | % The month in which the article is published, 267 | % e.g., |\Month{September}|. Note: 268 | % This information will be provided upon acceptance 269 | % or added by the technical editor. Prior to acceptance, 270 | % do not use this command. 271 | % 272 | % \DescribeMacro{\Year} 273 | % The year in which the article is published, 274 | % e.g., |\Year{2004}|. Note: 275 | % This information will be provided upon acceptance 276 | % or added by the technical editor. Prior to acceptance, 277 | % do not use this command. 278 | % 279 | % \DescribeMacro{\Submitdate} 280 | % The date of submission for the article, 281 | % e.g., |\Submitdate{2004-09-29}|. Note: 282 | % This information will be provided upon acceptance 283 | % or added by the technical editor. Prior to acceptance, 284 | % do not use this command. 285 | % 286 | % \DescribeMacro{\Acceptdate} 287 | % The date of acceptance for the article, 288 | % e.g., |\Acceptdate{2004-09-29}|. Note: 289 | % This information will be provided upon acceptance 290 | % or added by the technical editor. Prior to acceptance, 291 | % do not use this command. 292 | % 293 | % \DescribeMacro{\Address} 294 | % The address of (at least) one author should be given in 295 | % the following format 296 | % \begin{verbatim} 297 | % \Address{ 298 | % Achim Zeileis\\ 299 | % Department of Statistics and Mathematics\\ 300 | % Faculty of Economics and Statistics\\ 301 | % Universit\"at Innsbruck\\ 302 | % 6020 Innsbruck, Austria\\ 303 | % E-mail: \email{Achim.Zeileis@uibk.ac.at}\\ 304 | % URL: \url{http://eeecon.uibk.ac.at/~zeileis/} 305 | % } 306 | % \end{verbatim} 307 | % It is also possible to include your telephone and fax 308 | % number, by adding them in the format 309 | % \begin{verbatim} 310 | % Telephone: +43/512/507-7103 311 | % Fax: +43/512/507-2851 312 | % \end{verbatim} 313 | % before the e-mail address. 
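% Putting the declarations above together: the following is a minimal sketch of
% an article header (all names and values are placeholders drawn from the
% examples in this section, not a real submission).
% \begin{verbatim}
% \documentclass[article]{jss}
%
% \author{Firstname Lastname\\University}
% \Plainauthor{Firstname Lastname}
% \title{A Capitalized Title for a Package \pkg{foo}}
% \Plaintitle{A Capitalized Title for a Package foo}
% \Shorttitle{foo: A Capitalized Title}
% \Abstract{The abstract of the article.}
% \Keywords{keywords, comma-separated, not capitalized}
% \Plainkeywords{keywords, comma-separated, not capitalized}
% \Address{
%   Firstname Lastname\\
%   University\\
%   E-mail: \email{name@address}
% }
%
% \begin{document}
% ...
% \end{document}
% \end{verbatim}
% Note that |\Plaintitle| and |\Plainkeywords| are shown here only for
% completeness; they can be omitted when they do not differ from |\title|
% and |\Keywords|.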
314 | % 315 | % Furthermore, if the document is prepared using the |Sweave| 316 | % functions in \textsf{R}, something like the following line 317 | % \begin{verbatim} 318 | % %% need no \usepackage{Sweave.sty} 319 | % \end{verbatim} 320 | % (with `\%\%') needs to be included in the header. 321 | % 322 | % \subsection{Book and software reviews} 323 | % 324 | % For JSS book and software reviews, respectively, 325 | % the following declarations have to be made 326 | % in the header of the \TeX{} sources (before |\begin{document}|). 327 | % See also the template |bookreview.tex| or |softwarereview.tex| 328 | % respectively. Note that some commands might differ between 329 | % book and software reviews; this is always stated explicitly 330 | % below. 331 | % 332 | % \DescribeMacro{\Reviewer} 333 | % The command |\Reviewer| specifies the name of the reviewer 334 | % followed by a linebreak and his affiliation (only the university, 335 | % in a single line), e.g., 336 | % \begin{verbatim} 337 | % \Reviewer{Frederic Udina\\Pompeu Fabra University} 338 | % \end{verbatim} 339 | % 340 | % \DescribeMacro{\Plainreviewer} 341 | % The name of the reviewer without affiliation. 342 | % It must not contain any markup (bold fonts etc.), e.g., 343 | % \begin{verbatim} 344 | % \Plainreviewer{Frederic Udina} 345 | % \end{verbatim} 346 | % 347 | % \emph{The following five commands are required only for book reviews.} 348 | % 349 | % \DescribeMacro{\Booktitle} 350 | % The title of the book. It should be capitalized and may contain 351 | % further markup (in particular markup such as |\pkg| and |\proglang|), e.g., 352 | % \begin{verbatim} 353 | % \Booktitle{Visualizing Categorical Data} 354 | % \end{verbatim} 355 | % 356 | % \DescribeMacro{\Bookauthor} 357 | % Author(s) of the book, e.g., 358 | % \begin{verbatim} 359 | % \Bookauthor{Michael Friendly} 360 | % \end{verbatim} 361 | % If there are several authors, they should be comma-separated, 362 | % and the last author separated by |and|, e.g., 363 | % |\Bookauthor{A and B}| or |\Bookauthor{A, B and C}|. 364 | % 365 | % \DescribeMacro{\Pubyear} 366 | % Year of publication, e.g., |\Pubyear{2000}|. 367 | % 368 | % \DescribeMacro{\ISBN} 369 | % The ISBN, e.g., |\ISBN{1-58025-660-0}|. 370 | % 371 | % \DescribeMacro{\Pages} 372 | % Number of pages, both arabic and roman (if available), e.g., 373 | % |\Pages{456}| or |\Pages{xvi + 145}|. 374 | % 375 | % \emph{The following command is required only for software reviews.} 376 | % 377 | % \DescribeMacro{\Softwaretitle} 378 | % The title of the software. It should be capitalized and may contain 379 | % further markup (in particular markup such as |\pkg| and |\proglang|), e.g., 380 | % \begin{verbatim} 381 | % \Softwaretitle{\pkg{Aabel} 1.5.7} 382 | % \end{verbatim} 383 | % 384 | % \emph{The remaining commands are again required for both book and software reviews.} 385 | % 386 | % \DescribeMacro{\Publisher} 387 | % Publisher of the book/software, e.g., |\Publisher{SAS Institute Inc.}| 388 | % or\\ |\Publisher{Gigawiz Ltd. Co.}|. 389 | % 390 | % \DescribeMacro{\Pubaddress} 391 | % Address of the publisher of the book/software, e.g., |\Pubaddress{Carey, NC}|. 392 | % 393 | % \DescribeMacro{\Price} 394 | % Price of the book/software. For books this might simply be 395 | % |\Price{USD 69.95}| or\\ |\Price{USD 69.95 (P)}|, but could also distinguish between hardcover 396 | % and paperback\\ versions |\Price{USD 69.95 (P), USD 89.95 (H)}|. Analogously, 397 | % for software it could\\ be |\Price{USD 349 (standard), USD 249 (academic)}|.
398 | % 399 | % \DescribeMacro{\URL} 400 | % A URL for the book or software, e.g., 401 | % \begin{verbatim} 402 | % \URL{http://www.math.yorku.ca/SCS/vcd/} 403 | % \end{verbatim} 404 | % If no URL is available, use |\URL{}|. 405 | % 406 | % \DescribeMacro{\Plaintitle} 407 | % The full book or software title without any markup (line breaks, bold fonts etc.). 408 | % The default is to use |\Booktitle| or |\Softwaretitle| respectively, 409 | % therefore it needs to be specified 410 | % only if it is different from |\Booktitle| or |\Softwaretitle|, e.g., 411 | % \begin{verbatim} 412 | % \Plaintitle{Visualizing Categorical Data} 413 | % \end{verbatim} 414 | % 415 | % \DescribeMacro{\Shorttitle} 416 | % A shorter version of the book or software title to be used for page headings. 417 | % The default is to use |\Booktitle| or |\Softwaretitle| respectively, 418 | % therefore it needs to be specified 419 | % only if it is different from |\Booktitle| or |\Softwaretitle|, e.g., 420 | % \begin{verbatim} 421 | % \Shorttitle{Visualizing Categorical Data} 422 | % \end{verbatim} 423 | % 424 | % \DescribeMacro{\Volume} 425 | % The JSS volume number in which the review is published, 426 | % e.g., |\Volume{11}|. Note: 427 | % This information will be provided upon acceptance 428 | % or added by the technical editor. 429 | % 430 | % \DescribeMacro{\Issue} 431 | % The JSS issue number in which the review is published, 432 | % e.g., |\Issue{9}|. Note: 433 | % This information will be provided upon acceptance 434 | % or added by the technical editor. 435 | % 436 | % \DescribeMacro{\Month} 437 | % The month in which the review is published, 438 | % e.g., |\Month{September}|. Note: 439 | % This information will be provided upon acceptance 440 | % or added by the technical editor. 441 | % 442 | % \DescribeMacro{\Year} 443 | % The year in which the review is published, 444 | % e.g., |\Year{2004}|. Note: 445 | % This information will be provided upon acceptance 446 | % or added by the technical editor. 447 | % 448 | % \DescribeMacro{\Submitdate} 449 | % The date of publication for the review, 450 | % e.g., |\Submitdate{2004-09-29}|. Note: 451 | % This information will be provided upon acceptance 452 | % or added by the technical editor. 453 | % 454 | % \DescribeMacro{\Address} 455 | % The address of (at least) one author should be given in 456 | % the following format 457 | % \begin{verbatim} 458 | % \Address{ 459 | % Achim Zeileis\\ 460 | % Department of Statistics and Mathematics\\ 461 | % Faculty of Economics and Statistics\\ 462 | % Universit\"at Innsbruck\\ 463 | % 6020 Innsbruck, Austria\\ 464 | % E-mail: \email{Achim.Zeileis@uibk.ac.at}\\ 465 | % URL: \url{http://eeecon.uibk.ac.at/~zeileis/} 466 | % } 467 | % \end{verbatim} 468 | % It is also possible to include your telephone and fax 469 | % number, by adding them in the format 470 | % \begin{verbatim} 471 | % Telephone: +43/512/507-7103 472 | % Fax: +43/512/507-2851 473 | % \end{verbatim} 474 | % before the e-mail address. 475 | % 476 | % \subsection{Further commands} 477 | % 478 | % The \foopkg{jss} package provides several commands for typesetting 479 | % names related to software (programming languages, packages, code) and 480 | % mathematical formulae. 481 | % 482 | % \subsubsection*{Writing about software} 483 | % 484 | % \DescribeMacro{\proglang} 485 | % This should be used for typesetting the names of programming 486 | % languages, e.g., |\proglang{Java}|, |\proglang{C++}| or |\proglang{R}|. 
487 | % This also applies to programmable environments that have a GUI, 488 | % like |\proglang{SAS}|, |\proglang{Stata}| or |\proglang{S-PLUS}|. 489 | % 490 | % \DescribeMacro{\pkg} 491 | % This should be used for typesetting the names of packages, e.g., 492 | % |\pkg{CMregr}|, |\pkg{MATCH}| or |\pkg{strucchange}|. 493 | % 494 | % \DescribeMacro{\code} 495 | % This should be used for typesetting code chunks within 496 | % the text, e.g., |\code{plot(1:10)}|. Currently, this simply uses a typewriter 497 | % font. Although it escapes most special characters, it might still cause 498 | % problems with some of them. 499 | % In such cases the code can also be set using |\verb|, e.g., 500 | % |\verb/print("hello world")/|. 501 | % 502 | % \subsubsection*{Layout of code} 503 | % 504 | % |jss.cls| provides only very simple means of including code, mostly 505 | % borrowed from \foopkg{Sweave}. There are three verbatim environments for code: |Code|, 506 | % |CodeInput| and |CodeOutput|. Furthermore, there is an environment 507 | % |CodeChunk| which can be put around sequences of |CodeInput|s and 508 | % |CodeOutput|s to (hopefully) keep \LaTeX{} from page-breaking in the middle of 509 | % a code chunk. In short, there are two options: (a) if no distinction between 510 | % input and output is necessary, the code is placed between |\begin{Code}| 511 | % and |\end{Code}|; (b) if input and output should be distinguished, this can 512 | % be done as in the following example. 513 | % \begin{verbatim} 514 | % \begin{CodeChunk} 515 | % \begin{CodeInput} 516 | % first input first line 517 | % first input second line 518 | % \end{CodeInput} 519 | % \begin{CodeOutput} 520 | % output of first input 521 | % \end{CodeOutput} 522 | % \begin{CodeInput} 523 | % second input 524 | % \end{CodeInput} 525 | % \begin{CodeOutput} 526 | % second output 527 | % \end{CodeOutput} 528 | % \end{CodeChunk} 529 | % \end{verbatim} 530 | % An example of what this could look like is the following \textsf{R} code. The first 531 | % three lines are the input, the rest is output. 532 | % \begin{verbatim} 533 | % \begin{CodeChunk} 534 | % \begin{CodeInput} 535 | % R> data(cars) 536 | % R> fm <- lm(dist ~ speed, data = log(cars)) 537 | % R> summary(fm) 538 | % \end{CodeInput} 539 | % \begin{CodeOutput} 540 | % Call: 541 | % lm(formula = dist ~ speed, data = log(cars)) 542 | % 543 | % Residuals: 544 | % Min 1Q Median 3Q Max 545 | % -1.00215 -0.24578 -0.02898 0.20717 0.88289 546 | % 547 | % Coefficients: 548 | % Estimate Std. Error t value Pr(>|t|) 549 | % (Intercept) -0.7297 0.3758 -1.941 0.0581 . 550 | % speed 1.6024 0.1395 11.484 2.26e-15 *** 551 | % --- 552 | % Signif. codes: 0 `***' 0.001 `**' 0.01 `*' 0.05 `.' 0.1 ` ' 1 553 | % 554 | % Residual standard error: 0.4053 on 48 degrees of freedom 555 | % Multiple R-Squared: 0.7331, Adjusted R-squared: 0.7276 556 | % F-statistic: 131.9 on 1 and 48 DF, p-value: 2.259e-15 557 | % \end{CodeOutput} 558 | % \end{CodeChunk} 559 | % \end{verbatim} 560 | % If you prepare your paper using \foopkg{Sweave} (which is recommended 561 | % if you describe an \textsf{R} package), do \emph{not} include 562 | % |Sweave.sty| in your document; the necessary commands are already available within 563 | % |jss.cls|. To prevent \foopkg{Sweave} from including |Sweave.sty| 564 | % automatically, you need to include a line like 565 | % \begin{verbatim} 566 | % %% need no \usepackage{Sweave.sty} 567 | % \end{verbatim} 568 | % (with `\%\%') in the header of your document.
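% As a minimal sketch (the file name, chunk label and chunk contents are
% illustrative, not taken from a template), a Sweave document for JSS might
% start like this, with the |Sweave.sty| guard in the header:
% \begin{verbatim}
% %% foo.Rnw -- process with Sweave, then compile with pdfTeX
% \documentclass[article]{jss}
% %% need no \usepackage{Sweave.sty}
% \author{Firstname Lastname\\Affiliation}
% \title{A Capitalized Title}
% \begin{document}
% <<summary, echo=TRUE>>=
% summary(cars)
% @
% \end{document}
% \end{verbatim}
% Sweave replaces the chunk by |Sinput|/|Soutput| environments, which
% |jss.cls| already defines.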
569 | % 570 | % If this basic infrastructure for typesetting your code is not 571 | % sufficient, you can also use other \LaTeX{} packages like the 572 | % \foopkg{listings} package. 573 | % 574 | % \subsubsection*{Mathematical formulae} 575 | % 576 | % Commonly used operators like $\mathsf{E}$, $\mathsf{VAR}$, $\mathsf{COV}$, and $\mathsf{P}$ should be set 577 | % using the commands |\E|, |\VAR|, |\COV| and |\Prob|. Beyond this, \foopkg{jss} does not 578 | % provide (or enforce) a certain mathematical notation. However, using the AMS packages (\foopkg{amsmath}, 579 | % \foopkg{amssymb}, etc.) could be useful. 580 | % 581 | % 582 | % \section{Using pdf\TeX{} and \textbf{\sc Bib}\TeX{}} \label{sec:TeX} 583 | % 584 | % \subsubsection*{Using pdf\TeX{}} 585 | % 586 | % A \LaTeX{} document (|foo.tex|, say) using |jss.cls| needs to be compiled using 587 | % pdf\TeX{}; typically this is done using one of the 588 | % following commands: 589 | % \begin{verbatim} 590 | % pdflatex foo.tex 591 | % 592 | % texi2dvi --pdf foo.tex 593 | % 594 | % texi2pdf foo.tex 595 | % \end{verbatim} 596 | % If you are not using command line tools but some integrated GUI editor for 597 | % \LaTeX{} documents, you will have to press the `pdf\LaTeX{}' button 598 | % (as opposed to the `\LaTeX{}' button). 599 | % 600 | % All graphics included in the document have to be in a format pdf\TeX{} can 601 | % deal with, i.e., PDF for vector graphics or JPG/PNG/etc. for bitmaps/raster graphics. 602 | % If you cannot produce PDF graphics directly but only PS/EPS, these can 603 | % be converted using |ps2pdf| or |epstopdf| (usually preferred). 604 | % 605 | % \emph{Hint.} If you are used to compiling your documents with standard \LaTeX{} 606 | % and then getting automatic reloads of the resulting DVI document 607 | % in your DVI viewer: this is not possible with PDF documents in many 608 | % PDF viewers, so you might want to look at \foopkg{xpdf} (Linux) or \foopkg{gsview} 609 | % (Windows, see \url{http://www.cs.wisc.edu/~ghost/gsview/}), 610 | % which have a reload function. 611 | % 612 | % \emph{Hint.} If you want to use markup in section headers you will usually 613 | % have to escape it for the PDF bookmarks by giving the text for the 614 | % bookmark explicitly without markup, e.g., 615 | % \begin{verbatim} 616 | % \section[Calling C++ from R]{Calling \proglang{C++} from \proglang{R}} 617 | % \end{verbatim} 618 | % 619 | % \emph{Hint.} If you know how to produce \LaTeX{} documents that can be 620 | % processed with both \LaTeX{} and pdf\TeX{}, you can do so if you provide 621 | % an EPS substitute for |jsslogo.jpg| (e.g., an empty or converted |jsslogo.eps|). 622 | % Note, however, that the final document needs to be processed with pdf\TeX{}. 623 | % Neither this manual nor the JSS encourages or supports compilation of 624 | % JSS documents with standard \LaTeX{}. 625 | % 626 | % 627 | % \subsubsection*{References with \textbf{\sc Bib}\TeX{}} 628 | % 629 | % The format for references (e.g., articles, books, software, proceedings) 630 | % should look like this: 631 | % 632 | % \begin{quote} 633 | % Brown RL, Durbin J, Evans JM (1975). 634 | % \newblock \enquote{Techniques for Testing the Constancy of Regression 635 | % Relationships over Time.} 636 | % \newblock \emph{Journal of the Royal Statistical Society B}, \textbf{37}, 637 | % 149--163. 638 | % 639 | % Friendly M (2000). 640 | % \newblock \emph{Visualizing Categorical Data}. 641 | % \newblock SAS Institute, Carey, NC.
642 | % 643 | % {\textsf{R} Development Core Team} (2004). 644 | % \newblock \emph{\textsf{R}: {A} Language and Environment for Statistical 645 | % Computing}. 646 | % \newblock \textsf{R} Foundation for Statistical Computing, Vienna, Austria. 647 | % \newblock {ISBN} 3-900051-00-3, URL~\url{http://www.R-project.org/}. 648 | % 649 | % Urbanek S, Theus M (2003). 650 | % \newblock \enquote{\foopkg{iPlots} -- {H}igh Interaction Graphics for \textsf{R}.} 651 | % \newblock In K~Hornik, F~Leisch, A~Zeileis (eds.), \enquote{Proceedings of the 652 | % 3rd International Workshop on Distributed Statistical Computing, Vienna, 653 | % Austria,} {ISSN 1609-395X}, 654 | % URL~\url{http://www.ci.tuwien.ac.at/Conferences/DSC-2003/Proceedings/}. 655 | % \end{quote} 656 | % 657 | % \emph{Important.} Note that the titles of papers are also in title style 658 | % (as opposed to sentence style), i.e., they are capitalized. 659 | % The first word after a colon `:' is always capitalized. Furthermore, commands 660 | % like \verb/\proglang/ and \verb/\pkg/ should also be used in the 661 | % references. The names of journals or proceeding volumes should not 662 | % be abbreviated. 663 | % 664 | % The easiest way to achieve this 665 | % is to use \textsc{Bib}\TeX{} together with the style file |jss.bst|. 666 | % To do so, the references just have to be collected in a \textsc{Bib}\TeX{} file, 667 | % |foo.bib| say, which is then included at the end of the \LaTeX{} 668 | % document by |\bibliography{foo}|. 669 | % Note that, to obtain references in the format above, the |title| field 670 | % in your bib file needs to be capitalized (contrary to the folklore, 671 | % there are \textsc{Bib}\TeX{} styles that rely on this even for |@Article| 672 | % entries), i.e., the entry |title = {Visualizing Categorical Data}| is 673 | % correct, while entries like |title = {Visualizing categorical data}| 674 | % or (even worse) |title = {{Visualizing categorical data}}| are not. 675 | % 676 | % The default in |jss.cls| is to use the \foopkg{natbib} package 677 | % with options |authoryear|, |round| and |longnamesfirst|. If you cite 678 | % any article with six or more authors, citations with all names should 679 | % be avoided. This can either be done by declaring |\shortcites{...}| for 680 | % the particular references or by turning the |longnamesfirst| option off 681 | % completely. The latter can be done by using the option |shortnames| 682 | % when loading the |jss.cls| class: 683 | % \begin{verbatim} 684 | % \documentclass[article,shortnames]{jss} 685 | % \end{verbatim} 686 | % 687 | % 688 | % %\newpage 689 | % 690 | % \section{The code} \label{sec:code} 691 | % 692 | % \subsection{The batch file} 693 | % 694 | % First comes the code for creating the batch file \file{\filename.ins}, 695 | % which in turn can be used for producing the package and driver files. 696 | % 697 | % \begin{macrocode} 698 | %<*install> 699 | \begin{filecontents}{\filename.ins} 700 | % Simply TeX or LaTeX this file to extract various files from the source 701 | % file `jss.dtx'. 702 | \def\batchfile{jss.ins} 703 | \input docstrip.tex 704 | \generateFile{jss.drv}{t}{\from{jss.dtx}{driver}} 705 | \generateFile{jss.cls}{t}{\from{jss.dtx}{class}} 706 | \Msg{*******************************************************} 707 | \Msg{* For documentation, run LaTeX on jss.dtx or jss.drv.
 *} 708 | \Msg{*******************************************************} 709 | \end{filecontents} 710 | % 711 | % \end{macrocode} 712 | % 713 | % \subsection{The driver} 714 | % 715 | % Next comes the documentation driver file for \TeX{}, i.e., the file 716 | % that will produce the documentation you are currently reading. It 717 | % will be extracted from this file by the \texttt{docstrip} 718 | % program. Since it is the first code in the file, one can 719 | % alternatively process this file directly with \LaTeXe{} to obtain 720 | % the documentation. 721 | % 722 | % \begin{macrocode} 723 | %<*driver> 724 | \documentclass[a4paper]{ltxdoc} 725 | \providecommand{\file}[1]{\texttt{#1}} 726 | \providecommand{\pkg}[1]{{\fontseries{b}\selectfont #1}} 727 | \usepackage{color,hyperref,a4wide} 728 | \oddsidemargin1.2cm 729 | \textwidth14.2cm 730 | \textheight23.3cm 731 | \topmargin-.7cm 732 | \setlength{\parskip}{0.7ex plus0.1ex minus0.1ex} 733 | \setlength{\parindent}{0em} 734 | \begin{document} 735 | \OnlyDescription 736 | \DocInput{jss.dtx} 737 | \end{document} 738 | % 739 | % \end{macrocode} 740 | % 741 | % \subsection{The class} 742 | % 743 | % Next is the main part, the code for the class file. 744 | % 745 | % It requires \LaTeXe{} 746 | % \begin{macrocode} 747 | %<*class> 748 | \NeedsTeXFormat{LaTeX2e} 749 | \ProvidesClass{jss}[\filedate\space\fileversion\space jss class by Achim Zeileis] 750 | % 751 | % \end{macrocode} 752 | % and is based on the \texttt{article} class. But before we load 753 | % the class we declare and process some options. 754 | % These reflect whether we want to write an article, a code snippet, 755 | % a book review or a software review. The \texttt{shortnames} option 756 | % is for loading \texttt{natbib} \emph{without} the option 757 | % \texttt{longnamesfirst}. The \texttt{nojss} option suppresses the JSS header and footer. 758 | % The \texttt{notitle} option suppresses the automatic |\maketitle| at the 759 | % beginning of the document. The \texttt{noheadings} option suppresses headings 760 | % on the pages. The \texttt{nofooter} option suppresses the automatic |\makefooter| at the end of the document.
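% For illustration (a sketch, not an official template): a vignette derived
% from a JSS paper could combine several of these options to suppress all
% JSS-specific decoration at once,
% \begin{verbatim}
% \documentclass[article,nojss,notitle,nofooter]{jss}
% \end{verbatim}
% which keeps the fonts, code environments and bibliography style of |jss.cls|
% while omitting the JSS header, the automatic |\maketitle| and the automatic
% |\makefooter|.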
762 | % \begin{macrocode} 763 | %<*class> 764 | %% options 765 | \newif\if@article 766 | \newif\if@codesnippet 767 | \newif\if@bookreview 768 | \newif\if@softwarereview 769 | \newif\if@review 770 | \newif\if@shortnames 771 | \newif\if@nojss 772 | \newif\if@notitle 773 | \newif\if@noheadings 774 | \newif\if@nofooter 775 | 776 | \@articletrue 777 | \@codesnippetfalse 778 | \@bookreviewfalse 779 | \@softwarereviewfalse 780 | \@reviewfalse 781 | \@shortnamesfalse 782 | \@nojssfalse 783 | \@notitlefalse 784 | \@noheadingsfalse 785 | \@nofooterfalse 786 | 787 | \DeclareOption{article}{\@articletrue% 788 | \@codesnippetfalse \@bookreviewfalse \@softwarereviewfalse} 789 | \DeclareOption{codesnippet}{\@articlefalse% 790 | \@codesnippettrue \@bookreviewfalse \@softwarereviewfalse} 791 | \DeclareOption{bookreview}{\@articlefalse% 792 | \@codesnippetfalse \@bookreviewtrue \@softwarereviewfalse} 793 | \DeclareOption{softwarereview}{\@articlefalse% 794 | \@codesnippetfalse \@bookreviewfalse \@softwarereviewtrue} 795 | \DeclareOption{shortnames}{\@shortnamestrue} 796 | \DeclareOption{nojss}{\@nojsstrue} 797 | \DeclareOption{notitle}{\@notitletrue} 798 | \DeclareOption{noheadings}{\@noheadingstrue} 799 | \DeclareOption{nofooter}{\@nofootertrue} 800 | 801 | \ProcessOptions 802 | \LoadClass[11pt,a4paper,twoside]{article} 803 | % 804 | % \end{macrocode} 805 | % 806 | % A few packages are required and the font encoding is specified. 807 | % \begin{macrocode} 808 | %<*class> 809 | %% required packages 810 | \RequirePackage{graphicx,a4wide,color,ae,fancyvrb} 811 | \RequirePackage[T1]{fontenc} 812 | \IfFileExists{upquote.sty}{\RequirePackage{upquote}}{} 813 | % 814 | % \end{macrocode} 815 | % 816 | % In addition, \texttt{hyperref} is included later on. 817 | % The bibliography is generated using \texttt{natbib} and 818 | % the \textsc{Bib}\TeX{} style \file{jss.bst}. 819 | % \begin{macrocode} 820 | %<*class> 821 | %% bibliography 822 | \if@shortnames 823 | \usepackage[authoryear,round]{natbib} 824 | \else 825 | \usepackage[authoryear,round,longnamesfirst]{natbib} 826 | \fi 827 | \bibpunct{(}{)}{;}{a}{}{,} 828 | \bibliographystyle{jss} 829 | % 830 | % \end{macrocode} 831 | % 832 | % Paragraphs are not indented, instead \verb/\parskip/ is 833 | % increased. 
increased. 834 | % \begin{macrocode} 835 | %<*class> 836 | %% paragraphs 837 | \setlength{\parskip}{0.7ex plus0.1ex minus0.1ex} 838 | \setlength{\parindent}{0em} 839 | % 840 | % \end{macrocode} 841 | % 842 | % To process the meta information we need some new commands: 843 | % for all publications, 844 | % \begin{macrocode} 845 | %<*class> 846 | %% for all publications 847 | \newcommand{\Address}[1]{\def\@Address{#1}} 848 | \newcommand{\Plaintitle}[1]{\def\@Plaintitle{#1}} 849 | \newcommand{\Shorttitle}[1]{\def\@Shorttitle{#1}} 850 | \newcommand{\Plainauthor}[1]{\def\@Plainauthor{#1}} 851 | \newcommand{\Volume}[1]{\def\@Volume{#1}} 852 | \newcommand{\Year}[1]{\def\@Year{#1}} 853 | \newcommand{\Month}[1]{\def\@Month{#1}} 854 | \newcommand{\Issue}[1]{\def\@Issue{#1}} 855 | \newcommand{\Submitdate}[1]{\def\@Submitdate{#1}} 856 | % 857 | % \end{macrocode} 858 | % for articles and code snippets, 859 | % \begin{macrocode} 860 | %<*class> 861 | %% for articles and code snippets 862 | \newcommand{\Acceptdate}[1]{\def\@Acceptdate{#1}} 863 | \newcommand{\Abstract}[1]{\def\@Abstract{#1}} 864 | \newcommand{\Keywords}[1]{\def\@Keywords{#1}} 865 | \newcommand{\Plainkeywords}[1]{\def\@Plainkeywords{#1}} 866 | % 867 | % \end{macrocode} 868 | % for book and software reviews, 869 | % \begin{macrocode} 870 | %<*class> 871 | %% for book and software reviews 872 | \newcommand{\Reviewer}[1]{\def\@Reviewer{#1}} 873 | \newcommand{\Booktitle}[1]{\def\@Booktitle{#1}} 874 | \newcommand{\Bookauthor}[1]{\def\@Bookauthor{#1}} 875 | \newcommand{\Publisher}[1]{\def\@Publisher{#1}} 876 | \newcommand{\Pubaddress}[1]{\def\@Pubaddress{#1}} 877 | \newcommand{\Pubyear}[1]{\def\@Pubyear{#1}} 878 | \newcommand{\ISBN}[1]{\def\@ISBN{#1}} 879 | \newcommand{\Pages}[1]{\def\@Pages{#1}} 880 | \newcommand{\Price}[1]{\def\@Price{#1}} 881 | \newcommand{\Plainreviewer}[1]{\def\@Plainreviewer{#1}} 882 | \newcommand{\Softwaretitle}[1]{\def\@Softwaretitle{#1}} 883 | \newcommand{\URL}[1]{\def\@URL{#1}} 884 | % 885 | % \end{macrocode} 886 | % and for internal use only. 887 | % \begin{macrocode} 888 | %<*class> 889 | %% for internal use 890 | \newcommand{\Seriesname}[1]{\def\@Seriesname{#1}} 891 | \newcommand{\Hypersubject}[1]{\def\@Hypersubject{#1}} 892 | \newcommand{\Hyperauthor}[1]{\def\@Hyperauthor{#1}} 893 | \newcommand{\Footername}[1]{\def\@Footername{#1}} 894 | \newcommand{\Firstdate}[1]{\def\@Firstdate{#1}} 895 | \newcommand{\Seconddate}[1]{\def\@Seconddate{#1}} 896 | \newcommand{\Reviewauthor}[1]{\def\@Reviewauthor{#1}} 897 | % 898 | % \end{macrocode} 899 | % 900 | % Some defaults for these commands are specified, which 901 | % (hopefully) provide useful guidance when using 902 | % \file{\filename.cls}.
903 | % \begin{macrocode} 904 | %<*class> 905 | %% defaults 906 | \author{Firstname Lastname\\Affiliation} 907 | \title{Title} 908 | \Abstract{---!!!---an abstract is required---!!!---} 909 | \Plainauthor{\@author} 910 | \Volume{VV} 911 | \Year{YYYY} 912 | \Month{MMMMMM} 913 | \Issue{II} 914 | \Submitdate{yyyy-mm-dd} 915 | \Acceptdate{yyyy-mm-dd} 916 | \Address{ 917 | Firstname Lastname\\ 918 | Affiliation\\ 919 | Address, Country\\ 920 | E-mail: \email{name@address}\\ 921 | URL: \url{http://link/to/webpage/} 922 | } 923 | 924 | \Reviewer{Firstname Lastname\\Affiliation} 925 | \Plainreviewer{Firstname Lastname} 926 | \Booktitle{Book Title} 927 | \Bookauthor{Book Author} 928 | \Publisher{Publisher} 929 | \Pubaddress{Publisher's Address} 930 | \Pubyear{YYYY} 931 | \ISBN{x-xxxxx-xxx-x} 932 | \Pages{xv + 123} 933 | \Price{USD 69.95 (P)} 934 | \URL{http://link/to/webpage/} 935 | % 936 | % \end{macrocode} 937 | % 938 | % Conditional on the type of document, several other defaults 939 | % and some meta information are stored. 940 | % \begin{macrocode} 941 | %<*class> 942 | \if@article 943 | \Seriesname{Issue} 944 | \Hypersubject{Journal of Statistical Software} 945 | \Plaintitle{\@title} 946 | \Shorttitle{\@title} 947 | \Plainkeywords{\@Keywords} 948 | \fi 949 | 950 | \if@codesnippet 951 | \Seriesname{Code Snippet} 952 | \Hypersubject{Journal of Statistical Software -- Code Snippets} 953 | \Plaintitle{\@title} 954 | \Shorttitle{\@title} 955 | \Plainkeywords{\@Keywords} 956 | \fi 957 | 958 | \if@bookreview 959 | \Seriesname{Book Review} 960 | \Hypersubject{Journal of Statistical Software -- Book Reviews} 961 | \Plaintitle{\@Booktitle} 962 | \Shorttitle{\@Booktitle} 963 | \Reviewauthor{\@Bookauthor\\ 964 | \@Publisher, \@Pubaddress, \@Pubyear.\\ 965 | ISBN~\@ISBN. \@Pages~pp. \@Price.\\ 966 | \url{\@URL}} 967 | \Plainkeywords{} 968 | \@reviewtrue 969 | \fi 970 | 971 | \if@softwarereview 972 | \Seriesname{Software Review} 973 | \Hypersubject{Journal of Statistical Software -- Software Reviews} 974 | \Plaintitle{\@Softwaretitle} 975 | \Shorttitle{\@Softwaretitle} 976 | \Booktitle{\@Softwaretitle} 977 | \Reviewauthor{\@Publisher, \@Pubaddress. \@Price.\\ 978 | \url{\@URL}} 979 | \Plainkeywords{} 980 | \@reviewtrue 981 | \fi 982 | 983 | \if@review 984 | \Hyperauthor{\@Plainreviewer} 985 | \Keywords{} 986 | \Footername{Reviewer} 987 | \Firstdate{\textit{Published:} \@Submitdate} 988 | \Seconddate{} 989 | \else 990 | \Hyperauthor{\@Plainauthor} 991 | \Keywords{---!!!---at least one keyword is required---!!!---} 992 | \Footername{Affiliation} 993 | \Firstdate{\textit{Submitted:} \@Submitdate} 994 | \Seconddate{\textit{Accepted:} \@Acceptdate} 995 | \fi 996 | % 997 | % \end{macrocode} 998 | % 999 | % For typesetting code, some basic infrastructure along 1000 | % the lines of Sweave is provided. First, the Sweave commands 1001 | % are provided explicitly, 1002 | % \begin{macrocode} 1003 | %<*class> 1004 | %% Sweave(-like) 1005 | \DefineVerbatimEnvironment{Sinput}{Verbatim}{fontshape=sl} 1006 | \DefineVerbatimEnvironment{Soutput}{Verbatim}{} 1007 | \DefineVerbatimEnvironment{Scode}{Verbatim}{fontshape=sl} 1008 | \newenvironment{Schunk}{}{} 1009 | % 1010 | % \end{macrocode} 1011 | % and analogous commands with more neutral names for general pieces of code.
1013 | % \begin{macrocode} 1014 | %<*class> 1015 | \DefineVerbatimEnvironment{Code}{Verbatim}{} 1016 | \DefineVerbatimEnvironment{CodeInput}{Verbatim}{fontshape=sl} 1017 | \DefineVerbatimEnvironment{CodeOutput}{Verbatim}{} 1018 | \newenvironment{CodeChunk}{}{} 1019 | \setkeys{Gin}{width=0.8\textwidth} 1020 | % 1021 | % \end{macrocode} 1022 | % 1023 | % The header and footer of JSS publications display the logo, 1024 | % the publication information and some further links. Here, 1025 | % we define the footer first (because it must be included 1026 | % before \texttt{hyperref} in {\TeX}live). It contains the somewhat extended 1027 | % publication information (from the header), preceded by the address of the 1028 | % author/reviewer. 1029 | % \begin{macrocode} 1030 | %<*class> 1031 | %% footer 1032 | \newlength{\footerskip} 1033 | \setlength{\footerskip}{2.5\baselineskip plus 2ex minus 0.5ex} 1034 | 1035 | \newcommand{\makefooter}{% 1036 | \vspace{\footerskip} 1037 | 1038 | \if@nojss 1039 | \begin{samepage} 1040 | \textbf{\large \@Footername: \nopagebreak}\\[.3\baselineskip] \nopagebreak 1041 | \@Address \nopagebreak 1042 | \end{samepage} 1043 | \else 1044 | \begin{samepage} 1045 | \textbf{\large \@Footername: \nopagebreak}\\[.3\baselineskip] \nopagebreak 1046 | \@Address \nopagebreak 1047 | \vfill 1048 | \hrule \nopagebreak 1049 | \vspace{.1\baselineskip} 1050 | {\fontfamily{pzc} \fontsize{13}{15} \selectfont Journal of Statistical Software} 1051 | \hfill 1052 | \url{http://www.jstatsoft.org/}\\ \nopagebreak 1053 | published by the American Statistical Association 1054 | \hfill 1055 | \url{http://www.amstat.org/}\\[.3\baselineskip] \nopagebreak 1056 | {Volume~\@Volume, \@Seriesname~\@Issue} 1057 | \hfill 1058 | \@Firstdate\\ \nopagebreak 1059 | {\@Month{} \@Year} 1060 | \hfill 1061 | \@Seconddate \nopagebreak 1062 | \vspace{.3\baselineskip} 1063 | \hrule 1064 | \end{samepage} 1065 | \fi 1066 | } 1067 | % 1068 | % \end{macrocode} 1069 | % 1070 | % We include the footer at the end of the 1071 | % document (for the title see below). 1072 | % \begin{macrocode} 1073 | %<*class> 1074 | \if@nofooter 1075 | %% \AtEndDocument{\makefooter} 1076 | \else 1077 | \AtEndDocument{\makefooter} 1078 | \fi 1079 | % 1080 | % \end{macrocode} 1081 | % 1082 | % After defining this, we can require the \texttt{hyperref} package 1083 | % \begin{macrocode} 1084 | %<*class> 1085 | %% required packages 1086 | \RequirePackage{hyperref} 1087 | % 1088 | % \end{macrocode} 1089 | % and proceed to define the header. 1090 | % 1091 | % The header for all JSS publications has the logo \file{jsslogo.jpg} 1092 | % along with the publication information.
1093 | % \begin{macrocode} 1094 | %<*class> 1095 | %% new \maketitle 1096 | \def\@myoddhead{ 1097 | {\color{white} JSS}\\[-1.42cm] 1098 | \hspace{-2em} \includegraphics[height=23mm,keepaspectratio]{jsslogo} \hfill 1099 | \parbox[b][23mm]{118mm}{\hrule height 3pt 1100 | \center{ 1101 | {\fontfamily{pzc} \fontsize{28}{32} \selectfont Journal of Statistical Software} 1102 | \vfill 1103 | {\it \small \@Month{} \@Year, Volume~\@Volume, \@Seriesname~\@Issue.% 1104 | \hfill \href{http://www.jstatsoft.org/}{http://www.jstatsoft.org/}}}\\[0.1cm] 1105 | \hrule height 3pt}} 1106 | % 1107 | % \end{macrocode} 1108 | % 1109 | % This header is then used in the re-defined \verb/\maketitle/: 1110 | % \begin{macrocode} 1111 | %<*class> 1112 | \if@review 1113 | \renewcommand{\maketitle}{ 1114 | \if@nojss 1115 | %% \@oddhead{\@myoddhead}\\[3\baselineskip] 1116 | \else 1117 | \@oddhead{\@myoddhead}\\[3\baselineskip] 1118 | \fi 1119 | {\large 1120 | \noindent 1121 | Reviewer: \@Reviewer 1122 | \vspace{\baselineskip} 1123 | \hrule 1124 | \vspace{\baselineskip} 1125 | \textbf{\@Booktitle} 1126 | \begin{quotation} \noindent 1127 | \@Reviewauthor 1128 | \end{quotation} 1129 | \vspace{0.7\baselineskip} 1130 | \hrule 1131 | \vspace{1.3\baselineskip} 1132 | } 1133 | 1134 | \thispagestyle{empty} 1135 | \if@nojss 1136 | \markboth{\centerline{\@Shorttitle}}{\centerline{\@Hyperauthor}} 1137 | \else 1138 | \markboth{\centerline{\@Shorttitle}}{\centerline{\@Hypersubject}} 1139 | \fi 1140 | \pagestyle{myheadings} 1141 | } 1142 | \else 1143 | \def\maketitle{ 1144 | \if@nojss 1145 | %% \@oddhead{\@myoddhead} \par 1146 | \else 1147 | \@oddhead{\@myoddhead} \par 1148 | \fi 1149 | \begingroup 1150 | \def\thefootnote{\fnsymbol{footnote}} 1151 | \def\@makefnmark{\hbox to 0pt{$^{\@thefnmark}$\hss}} 1152 | \long\def\@makefntext##1{\parindent 1em\noindent 1153 | \hbox to1.8em{\hss $\m@th ^{\@thefnmark}$}##1} 1154 | \@maketitle \@thanks 1155 | \endgroup 1156 | \setcounter{footnote}{0} 1157 | 1158 | \if@noheadings 1159 | %% \markboth{\centerline{\@Shorttitle}}{\centerline{\@Hypersubject}} 1160 | \else 1161 | \thispagestyle{empty} 1162 | \if@nojss 1163 | \markboth{\centerline{\@Shorttitle}}{\centerline{\@Hyperauthor}} 1164 | \else 1165 | \markboth{\centerline{\@Shorttitle}}{\centerline{\@Hypersubject}} 1166 | \fi 1167 | \pagestyle{myheadings} 1168 | \fi 1169 | 1170 | \let\maketitle\relax \let\@maketitle\relax 1171 | \gdef\@thanks{}\gdef\@author{}\gdef\@title{}\let\thanks\relax 1172 | } 1173 | 1174 | \def\@maketitle{\vbox{\hsize\textwidth \linewidth\hsize 1175 | \if@nojss 1176 | %% \vskip 1in 1177 | \else 1178 | \vskip 1in 1179 | \fi 1180 | {\centering 1181 | {\LARGE\bf \@title\par} 1182 | \vskip 0.2in plus 1fil minus 0.1in 1183 | { 1184 | \def\and{\unskip\enspace{\rm and}\enspace}% 1185 | \def\And{\end{tabular}\hss \egroup \hskip 1in plus 2fil 1186 | \hbox to 0pt\bgroup\hss \begin{tabular}[t]{c}\large\bf\rule{\z@}{24pt}\ignorespaces}% 1187 | \def\AND{\end{tabular}\hss\egroup \hfil\hfil\egroup 1188 | \vskip 0.1in plus 1fil minus 0.05in 1189 | \hbox to \linewidth\bgroup\rule{\z@}{10pt} \hfil\hfil 1190 | \hbox to 0pt\bgroup\hss \begin{tabular}[t]{c}\large\bf\rule{\z@}{24pt}\ignorespaces} 1191 | \hbox to \linewidth\bgroup\rule{\z@}{10pt} \hfil\hfil 1192 | \hbox to 0pt\bgroup\hss \begin{tabular}[t]{c}\large\bf\rule{\z@}{24pt}\@author 1193 | \end{tabular}\hss\egroup 1194 | \hfil\hfil\egroup} 1195 | \vskip 0.3in minus 0.1in 1196 | \hrule 1197 | \begin{abstract} 1198 | \@Abstract 1199 | \end{abstract}} 1200 | \textit{Keywords}:~\@Keywords. 
1201 | \vskip 0.1in minus 0.05in 1202 | \hrule 1203 | \vskip 0.2in minus 0.1in 1204 | }} 1205 | \fi 1206 | % 1207 | % \end{macrocode} 1208 | % 1209 | % The appearance of sections, subsections and subsubsections is 1210 | % controlled by 1211 | % \begin{macrocode} 1212 | %<*class> 1213 | %% sections, subsections, and subsubsections 1214 | \newlength{\preXLskip} 1215 | \newlength{\preLskip} 1216 | \newlength{\preMskip} 1217 | \newlength{\preSskip} 1218 | \newlength{\postMskip} 1219 | \newlength{\postSskip} 1220 | \setlength{\preXLskip}{1.8\baselineskip plus 0.5ex minus 0ex} 1221 | \setlength{\preLskip}{1.5\baselineskip plus 0.3ex minus 0ex} 1222 | \setlength{\preMskip}{1\baselineskip plus 0.2ex minus 0ex} 1223 | \setlength{\preSskip}{.8\baselineskip plus 0.2ex minus 0ex} 1224 | \setlength{\postMskip}{.5\baselineskip plus 0ex minus 0.1ex} 1225 | \setlength{\postSskip}{.3\baselineskip plus 0ex minus 0.1ex} 1226 | 1227 | 1228 | \newcommand{\jsssec}[2][default]{\vskip \preXLskip% 1229 | \pdfbookmark[1]{#1}{Section.\thesection.#1}% 1230 | \refstepcounter{section}% 1231 | \centerline{\textbf{\Large \thesection. #2}} \nopagebreak 1232 | \vskip \postMskip \nopagebreak} 1233 | \newcommand{\jsssecnn}[1]{\vskip \preXLskip% 1234 | \centerline{\textbf{\Large #1}} \nopagebreak 1235 | \vskip \postMskip \nopagebreak} 1236 | 1237 | \newcommand{\jsssubsec}[2][default]{\vskip \preMskip% 1238 | \pdfbookmark[2]{#1}{Subsection.\thesubsection.#1}% 1239 | \refstepcounter{subsection}% 1240 | \textbf{\large \thesubsection. #2} \nopagebreak 1241 | \vskip \postSskip \nopagebreak} 1242 | \newcommand{\jsssubsecnn}[1]{\vskip \preMskip% 1243 | \textbf{\large #1} \nopagebreak 1244 | \vskip \postSskip \nopagebreak} 1245 | 1246 | \newcommand{\jsssubsubsec}[2][default]{\vskip \preSskip% 1247 | \pdfbookmark[3]{#1}{Subsubsection.\thesubsubsection.#1}% 1248 | \refstepcounter{subsubsection}% 1249 | {\large \textit{#2}} \nopagebreak 1250 | \vskip \postSskip \nopagebreak} 1251 | \newcommand{\jsssubsubsecnn}[1]{\vskip \preSskip% 1252 | {\textit{\large #1}} \nopagebreak 1253 | \vskip \postSskip \nopagebreak} 1254 | 1255 | \newcommand{\jsssimplesec}[2][default]{\vskip \preLskip% 1256 | %% \pdfbookmark[1]{#1}{Section.\thesection.#1}% 1257 | \refstepcounter{section}% 1258 | \textbf{\large #1} \nopagebreak 1259 | \vskip \postSskip \nopagebreak} 1260 | \newcommand{\jsssimplesecnn}[1]{\vskip \preLskip% 1261 | \textbf{\large #1} \nopagebreak 1262 | \vskip \postSskip \nopagebreak} 1263 | 1264 | \if@review 1265 | \renewcommand{\section}{\secdef \jsssimplesec \jsssimplesecnn} 1266 | \renewcommand{\subsection}{\secdef \jsssimplesec \jsssimplesecnn} 1267 | \renewcommand{\subsubsection}{\secdef \jsssimplesec \jsssimplesecnn} 1268 | \else 1269 | \renewcommand{\section}{\secdef \jsssec \jsssecnn} 1270 | \renewcommand{\subsection}{\secdef \jsssubsec \jsssubsecnn} 1271 | \renewcommand{\subsubsection}{\secdef \jsssubsubsec \jsssubsubsecnn} 1272 | \fi 1273 | % 1274 | % \end{macrocode} 1275 | % 1276 | % The hypersetup uses some modified colors 1277 | % \begin{macrocode} 1278 | %<*class> 1279 | %% colors 1280 | \definecolor{Red}{rgb}{0.5,0,0} 1281 | \definecolor{Blue}{rgb}{0,0,0.5} 1282 | % 1283 | % \end{macrocode} 1284 | % and is then defined by 1285 | % \begin{macrocode} 1286 | %<*class> 1287 | \if@review 1288 | \hypersetup{% 1289 | hyperindex = {true}, 1290 | colorlinks = {true}, 1291 | linktocpage = {true}, 1292 | plainpages = {false}, 1293 | linkcolor = {Blue}, 1294 | citecolor = {Blue}, 1295 | urlcolor = {Red}, 1296 | pdfstartview = {Fit}, 1297 | 
pdfpagemode = {None}, 1298 | pdfview = {XYZ null null null} 1299 | } 1300 | \else 1301 | \hypersetup{% 1302 | hyperindex = {true}, 1303 | colorlinks = {true}, 1304 | linktocpage = {true}, 1305 | plainpages = {false}, 1306 | linkcolor = {Blue}, 1307 | citecolor = {Blue}, 1308 | urlcolor = {Red}, 1309 | pdfstartview = {Fit}, 1310 | pdfpagemode = {UseOutlines}, 1311 | pdfview = {XYZ null null null} 1312 | } 1313 | \fi 1314 | % 1315 | % \end{macrocode} 1316 | % 1317 | % The hyper summary requires 1318 | % some meta information that has not been processed 1319 | % before the beginning of the document. Therefore, 1320 | % we need a second \verb/\hypersetup/. 1321 | % \begin{macrocode} 1322 | %<*class> 1323 | \if@nojss 1324 | \AtBeginDocument{ 1325 | \hypersetup{% 1326 | pdfauthor = {\@Hyperauthor}, 1327 | pdftitle = {\@Plaintitle}, 1328 | pdfkeywords = {\@Plainkeywords} 1329 | } 1330 | } 1331 | \else 1332 | \AtBeginDocument{ 1333 | \hypersetup{% 1334 | pdfauthor = {\@Hyperauthor}, 1335 | pdftitle = {\@Plaintitle}, 1336 | pdfsubject = {\@Hypersubject}, 1337 | pdfkeywords = {\@Plainkeywords} 1338 | } 1339 | } 1340 | \fi 1341 | % 1342 | % \end{macrocode} 1343 | % 1344 | % We put the header at the beginning of the 1345 | % document (for the footer see above). 1346 | % \begin{macrocode} 1347 | %<*class> 1348 | \if@notitle 1349 | %% \AtBeginDocument{\maketitle} 1350 | \else 1351 | \AtBeginDocument{\maketitle} 1352 | \fi 1353 | % 1354 | % \end{macrocode} 1355 | % 1356 | % Finally, some additional commands are provided for writing about 1357 | % software (code, programming languages, packages), 1358 | % \begin{macrocode} 1359 | %<*class> 1360 | %% commands 1361 | \newcommand\code{\bgroup\@makeother\_\@makeother\~\@makeother\$\@codex} 1362 | \def\@codex#1{{\normalfont\ttfamily\hyphenchar\font=-1 #1}\egroup} 1363 | %%\let\code=\texttt 1364 | \let\proglang=\textsf 1365 | \newcommand{\pkg}[1]{{\fontseries{b}\selectfont #1}} 1366 | % 1367 | % \end{macrocode} 1368 | % for specifying e-mail addresses, 1369 | % \begin{macrocode} 1370 | %<*class> 1371 | \newcommand{\email}[1]{\href{mailto:#1}{\normalfont\texttt{#1}}} 1372 | % 1373 | % \end{macrocode} 1374 | % digital object identifiers (DOIs), 1375 | % \begin{macrocode} 1376 | %<*class> 1377 | \ifx\csname urlstyle\endcsname\relax 1378 | \newcommand\@doi[1]{doi:\discretionary{}{}{}#1}\else 1379 | \newcommand\@doi{doi:\discretionary{}{}{}\begingroup 1380 | \urlstyle{tt}\Url}\fi 1381 | \newcommand{\doi}[1]{\href{http://dx.doi.org/#1}{\normalfont\texttt{\@doi{#1}}}} 1382 | % 1383 | % \end{macrocode} 1384 | % and for mathematical notation.
1385 | % \begin{macrocode} 1386 | %<*class> 1387 | \newcommand{\E}{\mathsf{E}} 1388 | \newcommand{\VAR}{\mathsf{VAR}} 1389 | \newcommand{\COV}{\mathsf{COV}} 1390 | \newcommand{\Prob}{\mathsf{P}} 1391 | % 1392 | % \end{macrocode} 1393 | -------------------------------------------------------------------------------- /jsslogo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hadley/tidy-data/5623c7a302977e8c88f623a08556d5c08e940d8c/jsslogo.jpg -------------------------------------------------------------------------------- /model-1.tex: -------------------------------------------------------------------------------- 1 | % latex table generated in R 2.14.2 by xtable 1.6-0 package 2 | % Wed Mar 21 10:40:47 2012 3 | \begin{tabular}{rrr} 4 | \toprule 5 | id & x & y \\ 6 | \midrule 7 | 1 & 22.19 & 24.05 \\ 8 | 2 & 19.82 & 22.91 \\ 9 | 3 & 19.81 & 21.19 \\ 10 | 4 & 17.49 & 18.59 \\ 11 | 5 & 19.44 & 19.85 \\ 12 | \bottomrule 13 | \end{tabular} 14 | -------------------------------------------------------------------------------- /model-2.tex: -------------------------------------------------------------------------------- 1 | % latex table generated in R 2.14.2 by xtable 1.6-0 package 2 | % Wed Mar 21 10:40:47 2012 3 | \begin{tabular}{rlr} 4 | \toprule 5 | id & variable & value \\ 6 | \midrule 7 | 1 & x & 22.19 \\ 8 | 2 & x & 19.82 \\ 9 | 3 & x & 19.81 \\ 10 | 4 & x & 17.49 \\ 11 | 5 & x & 19.44 \\ 12 | 1 & y & 24.05 \\ 13 | 2 & y & 22.91 \\ 14 | 3 & y & 21.19 \\ 15 | 4 & y & 18.59 \\ 16 | 5 & y & 19.85 \\ 17 | \bottomrule 18 | \end{tabular} 19 | -------------------------------------------------------------------------------- /references.bib: -------------------------------------------------------------------------------- 1 | 2 | @Book{codd:1990, 3 | author = {Codd, E. F.}, 4 | title = {The Relational Model for Database Management: Version 2}, 5 | address = {Boston, MA, USA}, 6 | isbn = {0-201-14192-2}, 7 | publisher={Addison-Wesley Longman Publishing Co., Inc.}, 8 | year = {1990} 9 | } 10 | 11 | @Book{dasu:2003, 12 | author = {Dasu, T. and Johnson, T.}, 13 | title = {Exploratory Data Mining and Data Cleaning}, 14 | publisher={Wiley-IEEE}, 15 | year = {2003} 16 | } 17 | 18 | @Article{inselberg:1985, 19 | author = {Inselberg, A.}, 20 | title = {The Plane with Parallel Coordinates}, 21 | journal = {The Visual Computer}, 22 | keywords= {parallel coordinates}, 23 | pages = {69--91}, 24 | volume = 1, 25 | year = 1985 26 | } 27 | 28 | @InProceedings{kandel:2011, 29 | author = {Kandel, Sean and Paepcke, Andreas and Hellerstein, Joseph and 30 | Heer, Jeffrey}, 31 | title = {Wrangler: Interactive Visual Specification of Data 32 | Transformation Scripts}, 33 | booktitle={ACM Human Factors in Computing Systems (CHI)}, 34 | keywords= {reshape}, 35 | url = {http://vis.stanford.edu/papers/wrangler}, 36 | year = {2011} 37 | } 38 | 39 | @Conference{lakshmanan:1996, 40 | author = {Lakshmanan, L.V.S. and Sadri, F. 
and Subramanian, I.N.}, 41 | title = {{SchemaSQL} -- {A} Language for Interoperability in Relational 42 | Multi-database Systems}, 43 | booktitle={Proceedings of the International Conference on Very Large Data 44 | Bases}, 45 | issn = {1047-7349}, 46 | keywords= {reshape}, 47 | pages = {239--250}, 48 | year = {1996} 49 | } 50 | 51 | @InProceedings{mckinney:2010, 52 | author = {McKinney, Wes}, 53 | title = {Data Structures for Statistical Computing in \proglang{Python}}, 54 | booktitle={Proceedings of the 9th Python in Science Conference}, 55 | editor = {van der Walt, St\'efan and Millman, Jarrod}, 56 | pages = {51--56}, 57 | year = {2010} 58 | } 59 | 60 | @Book{me:ggplot2, 61 | author = {Wickham, Hadley}, 62 | title = {\pkg{ggplot2}: Elegant Graphics for Data Analysis}, 63 | month = {July}, 64 | publisher={Springer-Verlag}, 65 | series = {{useR}}, 66 | year = {2009} 67 | } 68 | 69 | @Article{me:plyr, 70 | author = {Wickham, Hadley}, 71 | title = {The Split-Apply-Combine Strategy for Data Analysis}, 72 | journal = {Journal of Statistical Software}, 73 | number = {1}, 74 | pages = {1--29}, 75 | url = {http://www.jstatsoft.org/v40/i01/}, 76 | volume = {40}, 77 | year = {2011} 78 | } 79 | 80 | @Manual{r, 81 | author = {{R Development Core Team}}, 82 | title = {\proglang{R}: A Language and Environment for Statistical Computing}, 83 | address = {Vienna, Austria}, 84 | note = {{ISBN} 3-900051-07-0}, 85 | organization={R Foundation for Statistical Computing}, 86 | url = {http://www.R-project.org/}, 87 | year = {2011} 88 | } 89 | 90 | @Conference{raman:2001, 91 | author = {Raman, V. and Hellerstein, J.M.}, 92 | title = {Potter's Wheel: An Interactive Data Cleaning System}, 93 | booktitle={Proceedings of the International Conference on Very Large Data 94 | Bases}, 95 | issn = {1047-7349}, 96 | keywords= {reshape}, 97 | pages = {381--390}, 98 | year = {2001} 99 | } 100 | 101 | @Book{sarkar:2008, 102 | author = {Sarkar, Deepayan}, 103 | title = {\pkg{Lattice}: Multivariate Data Visualization with \proglang{R}}, 104 | publisher={Springer-Verlag}, 105 | year = {2008} 106 | } 107 | 108 | @Book{wainer:2000, 109 | author = {Wainer, Howard}, 110 | title = {Visual Revelations: Graphical Tales of Fate and Deception from 111 | Napoleon Bonaparte to Ross Perot}, 112 | publisher={Lawrence Erlbaum}, 113 | year = {2000} 114 | } 115 | 116 | @Article{wegman:1990, 117 | author = {Wegman, Edward J.}, 118 | title = {Hyperdimensional Data Analysis Using Parallel Coordinates}, 119 | journal = {Journal of the American Statistical Association}, 120 | keywords= {parallel coordinates}, 121 | number = {411}, 122 | pages = {664--675}, 123 | volume = {85}, 124 | year = {1990} 125 | } 126 | 127 | @Article{wickham:2007b, 128 | author = {Wickham, Hadley}, 129 | title = {Reshaping Data with the \pkg{reshape} Package}, 130 | journal = {Journal of Statistical Software}, 131 | keywords= {reshape}, 132 | number = {12}, 133 | pages = {1--20}, 134 | url = {http://www.jstatsoft.org/v21/i12/paper}, 135 | volume = {21}, 136 | year = {2007} 137 | } 138 | 139 | @Article{wickham:2007d, 140 | author = {Wickham, Hadley}, 141 | title = {A Layered Grammar of Graphics}, 142 | journal = {Journal of Computational and Graphical Statistics}, 143 | number = {1}, 144 | pages = {3--28}, 145 | volume = {19}, 146 | year = {2010} 147 | } 148 | 149 | @Book{wilkinson:2006, 150 | author = {Wilkinson, Leland}, 151 | title = {The Grammar of Graphics}, 152 | edition = {2nd}, 153 | keywords= {graphics/theory}, 154 | publisher={Springer-Verlag}, 155 | series =
{Statistics and Computing}, 156 | year = {2005} 157 | } 158 | -------------------------------------------------------------------------------- /t-test.r: -------------------------------------------------------------------------------- 1 | # From Ben Bolker 2 | library(reshape2) 3 | library(lme4) 4 | set.seed(1001) 5 | source("data/xtable.r") 6 | 7 | x <- rnorm(5, 20, 1) 8 | y <- x + rnorm(5, 2, 1) 9 | df1 <- data.frame(id = seq_along(y), x = x, y = y) 10 | df2 <- melt(df1, id = "id") 11 | 12 | xtable(df1, file = "model-1.tex") 13 | xtable(df2, file = "model-2.tex") 14 | 15 | t1 <- t.test(df1$x, df1$y, paired = TRUE) 16 | m1 <- lmer(value ~ variable + (1 | id) , data = df2, REML = TRUE) 17 | 18 | all.equal(abs(t1$statistic), coef(summary(m1))["variabley","t value"]) 19 | 20 | # The t statistic is (almost) the same. (all.equal() reports a relative 21 | # difference of 4.618215e-07). REML=TRUE isn't necessary (it's the 22 | # default) but it emphasizes the fact that the paired t test is exactly 23 | # equivalent to REML. -------------------------------------------------------------------------------- /tidy-data.tex: -------------------------------------------------------------------------------- 1 | \documentclass[article]{jss} 2 | \usepackage[utf8]{inputenc} 3 | 4 | % jss.cls ---------------------------------------------------------------------- 5 | 6 | %% almost as usual 7 | \author{Hadley Wickham\\RStudio} 8 | \title{Tidy Data} 9 | 10 | \Plainauthor{Hadley Wickham} 11 | \Plaintitle{Tidy Data} 12 | 13 | %% an abstract and keywords 14 | \Abstract{ 15 | A huge amount of effort is spent cleaning data to get it ready for analysis, but there has been little research on how to make data cleaning as easy and effective as possible. This paper tackles a small, but important, component of data cleaning: data tidying. Tidy datasets are easy to manipulate, model and visualise, and have a specific structure: each variable is a column, each observation is a row, and each type of observational unit is a table. This framework makes it easy to tidy messy datasets because only a small set of tools are needed to deal with a wide range of un-tidy datasets. This structure also makes it easier to develop tidy tools for data analysis, tools that both input and output tidy datasets. The advantages of a consistent data structure and matching tools are demonstrated with a case study free from mundane data manipulation chores. 16 | } 17 | \Keywords{data cleaning, data tidying, relational databases, \proglang{R}} 18 | \Plainkeywords{data cleaning, data tidying, relational databases, R} 19 | 20 | %% \Volume{50} 21 | %% \Issue{9} 22 | %% \Month{June} 23 | %% \Year{2012} 24 | %% \Submitdate{2012-06-04} 25 | %% \Acceptdate{2012-06-04} 26 | 27 | \Address{ 28 | Hadley Wickham\\ 29 | Chief Scientist, RStudio\\ 30 | Adjunct Professor, Rice University\\ 31 | E-mail: \email{h.wickham@gmail.com}\\ 32 | URL: \url{http://had.co.nz} 33 | } 34 | 35 | % jss.cls ---------------------------------------------------------------------- 36 | 37 | \DeclareGraphicsExtensions{.png,.pdf,.jpg} 38 | \usepackage[format=plain,font=small]{caption} 39 | \usepackage{booktabs} 40 | \usepackage{fancyvrb} 41 | \renewcommand{\FancyVerbFormatLine}[1]{R> #1} 42 | \usepackage{epigraph} 43 | \usepackage[subrefformat=parens]{subfig} 44 | 45 | \begin{document} 46 | \maketitle 47 | 48 | \section{Introduction} 49 | 50 | It is often said that 80\% of data analysis is spent on the process of cleaning and preparing the data \citep{dasu:2003}. 
Data preparation is not just a first step, but must be repeated many times over the course of analysis as new problems come to light or new data is collected. Despite the amount of time it takes, there has been surprisingly little research on how to clean data well. Part of the challenge is the breadth of activities it encompasses: from outlier checking, to date parsing, to missing value imputation. To get a handle on the problem, this paper focusses on a small, but important, aspect of data cleaning that I call data \textbf{tidying}: structuring datasets to facilitate analysis. 51 | 52 | The principles of tidy data provide a standard way to organise data values within a dataset. A standard makes initial data cleaning easier because you don't need to start from scratch and reinvent the wheel every time. The tidy data standard has been designed to facilitate initial exploration and analysis of the data, and to simplify the development of data analysis tools that work well together. Current tools often require translation. You have to spend time munging the output from one tool so you can input it into another. Tidy datasets and tidy tools work hand in hand to make data analysis easier, allowing you to focus on the interesting domain problem, not on the uninteresting logistics of data. 53 | 54 | The principles of tidy data are closely tied to those of relational databases and Codd's relational algebra \citep{codd:1990}, but are framed in a language familiar to statisticians. Computer scientists have also contributed much to the study of data cleaning. For example, \citet{lakshmanan:1996} define an extension to SQL to allow it to operate on messy datasets, \citet{raman:2001} provide a framework for cleaning datasets, and \citet{kandel:2011} develop an interactive tool with a friendly user interface which automatically creates code to clean data. These tools are useful but they are presented in a language foreign to most statisticians, they fail to give much advice on how datasets should be structured, and they lack connections to the tools of data analysis. 55 | 56 | The development of tidy data has been driven by my experience working with real-world datasets. With few, if any, constraints on their organisation, such datasets are often constructed in bizarre ways. I have spent countless hours struggling to get such datasets organised in a way that makes data analysis possible, let alone easy. I have also struggled to impart these skills to my students so they could tackle real-world datasets on their own. In the course of these struggles I developed the \pkg{reshape} and \pkg{reshape2} \citep{wickham:2007b} packages. While I could intuitively use the tools and teach them through examples, I lacked the framework to make my intuition explicit. This paper provides that framework. It provides a comprehensive ``philosophy of data'': one that underlies my work in the \pkg{plyr} \citep{me:plyr} and \pkg{ggplot2} \citep{me:ggplot2} packages. 57 | 58 | The paper proceeds as follows. Section~\ref{sec:defining} begins by defining the three characteristics that make a dataset tidy. Since most real-world datasets are not tidy, Section~\ref{sec:tidying} describes the operations needed to make messy datasets tidy, and illustrates the techniques with a range of real examples. Section~\ref{sec:tidy-tools} defines tidy tools, tools that input and output tidy datasets, and discusses how tidy data and tidy tools together can make data analysis easier.
These principles are illustrated with a small case study in Section~\ref{sec:case-study}. Section~\ref{sec:discussion} concludes with a discussion of what this framework misses and what other approaches might be fruitful to pursue. 59 | 60 | \section{Defining tidy data} 61 | \label{sec:defining} 62 | 63 | \epigraph{Happy families are all alike; every unhappy family is unhappy in its own way}{Leo Tolstoy} 64 | 65 | Like families, tidy datasets are all alike but every messy dataset is messy in its own way. Tidy datasets provide a standardised way to link the structure of a dataset (its physical layout) with its semantics (its meaning). In this section, I'll provide some standard vocabulary for describing the structure and semantics of a dataset, and then use those definitions to define tidy data. 66 | 67 | \subsection{Data structure} 68 | 69 | Most statistical datasets are rectangular tables made up of \textbf{rows} and \textbf{columns}. The columns are almost always labelled and the rows are sometimes labelled. Table~\ref{tbl:preg-raw-1} provides some data about an imaginary experiment in a format commonly seen in the wild. The table has two columns and three rows, and both rows and columns are labelled. 70 | 71 | \begin{table}[htbp] 72 | \centering 73 | \input{data/preg-raw-1.tex} 74 | \caption{Typical presentation dataset.} 75 | \label{tbl:preg-raw-1} 76 | \end{table} 77 | 78 | There are many ways to structure the same underlying data. Table~\ref{tbl:preg-raw-2} shows the same data as Table~\ref{tbl:preg-raw-1}, but the rows and columns have been transposed. The data is the same, but the layout is different. Our vocabulary of rows and columns is simply not rich enough to describe why the two tables represent the same data. In addition to appearance, we need a way to describe the underlying semantics, or meaning, of the values displayed in the table. 79 | 80 | \begin{table}[htbp] 81 | \centering 82 | \input{data/preg-raw-2.tex} 83 | \caption{The same data as in Table~\ref{tbl:preg-raw-1} but structured differently.} 84 | \label{tbl:preg-raw-2} 85 | \end{table} 86 | 87 | \subsection{Data semantics} 88 | 89 | A dataset is a collection of \textbf{values}, usually either numbers (if quantitative) or strings (if qualitative). Values are organised in two ways. Every value belongs to a \textbf{variable} and an \textbf{observation}. A variable contains all values that measure the same underlying attribute (like height, temperature, duration) across units. An observation contains all values measured on the same unit (like a person, or a day, or a race) across attributes. 90 | 91 | Table~\ref{tbl:preg-tidy} reorganises Table~\ref{tbl:preg-raw-1} to make the values, variables and observations more clear. The dataset contains 18 values representing three variables and six observations. The variables are: 92 | 93 | \begin{enumerate} 94 | 95 | \item \code{person}, with three possible values (John, Mary, and Jane). 96 | 97 | \item \code{treatment}, with two possible values (a and b). 98 | 99 | \item \code{result}, with five or six values depending on how you think of the missing value (-, 16, 3, 2, 11, 1). 100 | 101 | \end{enumerate} 102 | 103 | The experimental design tells us more about the structure of the observations. In this experiment, every combination of \code{person} and \code{treatment} was measured, a completely crossed design. The experimental design also determines whether or not missing values can be safely dropped.
In this experiment, the missing value represents an observation that should have been made, but wasn't, so it's important to keep it. Structural missing values, which represent measurements that can't be made (e.g., the count of pregnant males), can be safely removed. 104 | 105 | \begin{table}[htbp] 106 | \centering 107 | \input{data/preg-tidy.tex} 108 | \caption{The same data as in Table~\ref{tbl:preg-raw-1} but with variables in columns and observations in rows.} 109 | \label{tbl:preg-tidy} 110 | \end{table} 111 | 112 | For a given dataset, it's usually easy to figure out what are observations and what are variables, but it is surprisingly difficult to precisely define variables and observations in general. For example, if the columns in Table~\ref{tbl:preg-raw-1} were \code{height} and \code{weight}, we would have been happy to call them variables. If the columns were \code{height} and \code{width}, it would be less clear cut, as we might think of height and width as values of a \code{dimension} variable. If the columns were \code{home phone} and \code{work phone}, we could treat these as two variables, but in a fraud detection environment we might want variables \code{phone number} and \code{number type} because the use of one phone number for multiple people might suggest fraud. A general rule of thumb is that it is easier to describe functional relationships between variables (e.g., \code{z} is a linear combination of \code{x} and \code{y}, \code{density} is the ratio of \code{weight} to \code{volume}) than between rows, and it is easier to make comparisons between groups of observations (e.g., average of group a vs.\ average of group b) than between groups of columns. 113 | 114 | In a given analysis, there may be multiple levels of observation. For example, in a trial of a new allergy medication we might have three observational types: demographic data collected from each person (\code{age}, \code{sex}, \code{race}), medical data collected from each person on each day (\code{number of sneezes}, \code{redness of eyes}), and meteorological data collected on each day (\code{temperature}, \code{pollen count}).
If you consider how many data analysis operations involve all of the values in a variable (every aggregation function), you can see how important it is to extract these values in a simple, standard way. Tidy data is particularly well suited for vectorised programming languages like \proglang{R}, because the layout ensures that values of different variables from the same observation are always paired. 131 | 132 | While the order of variables and observations does not affect analysis, a good ordering makes it easier to scan the raw values. One way of organising variables is by their role in the analysis: are values fixed by the design of the data collection, or are they measured during the course of the experiment? Fixed variables describe the experimental design and are known in advance. Computer scientists often call fixed variables dimensions, and statisticians usually denote them with subscripts on random variables. Measured variables are what we actually measure in the study. Fixed variables should come first, followed by measured variables, each ordered so that related variables are contiguous. Rows can then be ordered by the first variable, breaking ties with the second and subsequent (fixed) variables. This is the convention adopted by all tabular displays in this paper. 133 | 134 | \section{Tidying messy datasets} 135 | \label{sec:tidying} 136 | 137 | Real datasets can, and often do, violate the three precepts of tidy data in almost every way imaginable. While occasionally you do get a dataset that you can start analysing immediately, this is the exception, not the rule. This section describes the five most common problems with messy datasets, along with their remedies: 138 | 139 | \begin{itemize} 140 | \item Column headers are values, not variable names. 141 | \item Multiple variables are stored in one column. 142 | \item Variables are stored in both rows and columns. 143 | \item Multiple types of observational units are stored in the same table. 144 | \item A single observational unit is stored in multiple tables. 145 | \end{itemize} 146 | 147 | Surprisingly, most messy datasets, including types of messiness not explicitly described above, can be tidied with a small set of tools: melting, string splitting, and casting. The following sections illustrate each problem with a real dataset that I have encountered, and show how to tidy them. The complete datasets and the \proglang{R} code used to tidy them are available online at \url{https://github.com/hadley/tidy-data}, and in the online supplementary materials for this paper. 148 | 149 | \subsection{Column headers are values, not variable names} 150 | 151 | A common type of messy dataset is tabular data designed for presentation, where variables form both the rows and columns, and column headers are values, not variable names. While I would call this arrangement messy, in some cases it can be extremely useful. It provides efficient storage for completely crossed designs, and it can lead to extremely efficient computation if desired operations can be expressed as matrix operations. This issue is discussed in depth in Section~\ref{sec:discussion}. 152 | 153 | Table~\ref{tbl:pew-raw} shows a subset of a typical dataset of this form. This dataset explores the relationship between income and religion in the US. 
It comes from a report\footnote{\url{http://religions.pewforum.org/pdf/comparison-Income\%20Distribution\%20of\%20Religious\%20Traditions.pdf}} produced by the Pew Research Center, an American think-tank that collects data on attitudes to topics ranging from religion to the internet, and produces many reports that contain datasets in this format. 154 | 155 | \begin{table}[htbp] 156 | \centering 157 | \input{data/pew-raw.tex} 158 | \caption{The first ten rows of data on income and religion from the Pew Forum. Three columns, \code{\$75-100k}, \code{\$100-150k} and \code{>150k}, have been omitted.} 159 | \label{tbl:pew-raw} 160 | \end{table} 161 | 162 | This dataset has three variables, \code{religion}, \code{income} and \code{frequency}. To tidy it, we need to \textbf{melt}, or stack, it. In other words, we need to turn columns into rows. While this is often described as making wide datasets long or tall, I will avoid those terms because they are imprecise. Melting is parameterised by a list of columns that are already variables, or \textbf{colvar}s for short. The other columns are converted into two variables: a new variable called \code{column} that contains repeated column headings and a new variable called \code{value} that contains the concatenated data values from the previously separate columns. This is illustrated in Table~\ref{tbl:melt} with a toy dataset. The result of melting is a \textbf{molten} dataset. 163 | 164 | \begin{table} 165 | \centering 166 | \subfloat[Raw data]{\label{tbl:melt-raw} \input{data/melt-raw.tex}}% 167 | \hspace{2em}% 168 | \subfloat[Molten data]{\label{tbl:melt-molten}\input{data/melt-output.tex}} 169 | 170 | \caption{A simple example of melting. (a) is melted with one colvar, row, yielding the molten dataset (b). The information in each table is exactly the same, just stored in a different way.} 171 | \label{tbl:melt} 172 | \end{table} 173 | 174 | The Pew dataset has one colvar, \code{religion}, and melting yields Table~\ref{tbl:pew-clean}. To better reflect their roles in this dataset, the \code{column} variable has been renamed to \code{income}, and the \code{value} variable to \code{freq}. This form is tidy because each column represents a variable and each row represents an observation, in this case a demographic unit corresponding to a combination of \code{religion} and \code{income}. 175 | 176 | \begin{table}[htbp] 177 | \centering 178 | \input{data/pew-clean.tex} 179 | \caption{The first ten rows of the tidied Pew survey dataset on income and religion. The \code{column} variable has been renamed to \code{income}, and \code{value} to \code{freq}.} 180 | \label{tbl:pew-clean} 181 | \end{table} 182 | 183 | Another common use of this data format is to record regularly spaced observations over time. For example, the Billboard dataset shown in Table~\ref{tbl:billboard-raw} records the date a song first entered the Billboard Top 100. It has variables for \code{artist}, \code{track}, \code{date.entered}, \code{rank} and \code{week}. The rank in each week after it enters the Top 100 is recorded in 75 columns, \code{wk1} to \code{wk75}. If a song is in the Top 100 for fewer than 75 weeks, the remaining columns are filled with missing values. This form of storage is not tidy, but it is useful for data entry. It reduces duplication since otherwise each song in each week would need its own row, and song metadata like title and artist would need to be repeated. This issue will be discussed in more depth in Section~\ref{sub:multiple-types}.
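To make melting concrete, here is a minimal sketch in \pkg{reshape2}, the \proglang{R} implementation of this operation, applied to the Billboard data. It assumes the raw data of Table~\ref{tbl:billboard-raw} has been read into a data frame \code{raw} (the full script is in the supplemental \code{data/billboard.r}); note that \pkg{reshape2} names the two new variables \code{variable} and \code{value} rather than \code{column} and \code{value}.

\begin{Verbatim}
library(reshape2)
# Colvars stay as columns; wk1-wk75 collapse into (variable, value) rows.
# na.rm = TRUE drops the filler missing values in the unused wk columns.
molten <- melt(raw, id = c("year", "artist", "track", "time",
  "date.entered"), na.rm = TRUE)
\end{Verbatim}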
184 | 185 | \begin{table}[htbp] 186 | \centering 187 | \input{data/billboard-raw.tex} 188 | \caption{The first eight Billboard top hits for 2000. Other columns not shown are \code{wk4}, \code{wk5}, ..., \code{wk75}.} 189 | \label{tbl:billboard-raw} 190 | \end{table} 191 | 192 | This dataset has colvars \code{year}, \code{artist}, \code{track}, \code{time}, and \code{date.entered}. Melting yields Table~\ref{tbl:billboard-clean}. I have also done a little cleaning as well as tidying: \code{column} has been converted to \code{week} by extracting the number, and \code{date} has been computed from \code{date.entered} and \code{week}. 193 | 194 | \begin{table}[htbp] 195 | \centering 196 | \input{data/billboard-clean.tex} 197 | \caption{First fifteen rows of the tidied billboard dataset. The \code{date} column does not appear in the original table, but can be computed from \code{date.entered} and \code{week}.} 198 | \label{tbl:billboard-clean} 199 | \end{table} 200 | 201 | \subsection{Multiple variables stored in one column} 202 | 203 | After melting, the \code{column} variable names often become a combination of multiple underlying variable names. This is illustrated by the tuberculosis (TB) dataset, a sample of which is shown in Table~\ref{tbl:tb-raw}. This dataset comes from the World Health Organisation, and records the counts of confirmed tuberculosis cases by \code{country}, \code{year}, and demographic group. The demographic groups are broken down by \code{sex} (m, f) and \code{age} (0--14, 15--24, 25--34, 35--44, 45--54, 55--64, unknown). 204 | 205 | \begin{table}[htbp] 206 | \centering 207 | \input{data/tb-raw.tex} 208 | \caption{Original TB dataset. Corresponding to each `m' column for males, there is also an `f' column for females, \code{f1524}, \code{f2534} and so on. These are not shown to conserve space. Note the mixture of 0s and missing values (---). This is due to the data collection process and the distinction is important for this dataset.} 209 | \label{tbl:tb-raw} 210 | \end{table} 211 | 212 | In many datasets of this form, the parts of each column header are separated by some character (\code{.}, \code{-}, \code{\_}, \code{:}), and the string can be broken into pieces using that character as a divider. In other cases, such as for this dataset, more careful string processing is required. For example, the variable names can be matched to a lookup table that converts a single compound value into multiple component values. 213 | 214 | Table~\subref*{tbl:tb-molten} shows the results of melting the TB dataset, and Table~\subref*{tbl:tb-tidy} shows the results of splitting the single column \code{column} into two real variables: \code{age} and \code{sex}. 215 | 216 | \begin{table}[htbp] 217 | \centering 218 | \subfloat[Molten data]{\label{tbl:tb-molten}\input{data/tb-clean-1.tex}}% 219 | \hspace{2em}% 220 | \subfloat[Tidy data]{\label{tbl:tb-tidy}\input{data/tb-clean-2.tex}} 221 | 222 | \caption{Tidying the TB dataset requires first melting, and then splitting the \code{column} column into two variables: \code{sex} and \code{age}.} 223 | \label{tbl:tb-clean} 224 | \end{table} 225 | 226 | Storing the values in this form resolves another problem in the original data. We want to compare rates, not counts. But to compute rates, we need to know the population. In the original format, there is no easy way to add a population variable. It has to be stored in a separate table, which makes it hard to correctly match populations to counts. In tidy form, adding variables for population and rate is easy.
They are just additional columns. 227 | 228 | \subsection{Variables are stored in both rows and columns} 229 | 230 | The most complicated form of messy data occurs when variables are stored in both rows and columns. Table~\ref{tbl:weather-raw} shows daily weather data from the Global Historical Climatology Network for one weather station (MX17004) in Mexico for five months in 2010. It has variables in individual columns (\code{id}, \code{year}, \code{month}), spread across columns (\code{day}, \code{d1}--\code{d31}) and across rows (\code{tmin}, \code{tmax}: minimum and maximum temperature). Months with fewer than 31 days have structural missing values for the last day(s) of the month. The \code{element} column is not a variable; it stores the names of variables. 231 | 232 | To tidy this dataset, we first melt it with colvars \code{id}, \code{year}, \code{month} and the column that contains variable names, \code{element}. This yields Table~\subref*{tbl:weather-molten}. For presentation, we have dropped the missing values, making them implicit rather than explicit. This is permissible because we know how many days are in each month and can easily reconstruct the explicit missing values. 233 | 234 | This dataset is mostly tidy, but we have two variables stored in rows: \code{tmin} and \code{tmax}, the type of observation. Not shown in this example are the other meteorological variables \code{prcp} (precipitation) and \code{snow} (snowfall). Fixing this requires the cast, or unstack, operation. This performs the inverse of melting by rotating the \code{element} variable back out into the columns (Table~\subref*{tbl:weather-tidy}). This form is tidy. There is one variable in each column, and each row represents a day's observations. The cast operation is described in depth in \citet{wickham:2007b}. 235 | 236 | \begin{table}[htbp] 237 | \centering 238 | \input{data/weather-raw.tex} 239 | \caption{Original weather dataset. There is a column for each possible day in the month. Columns \code{d9} to \code{d31} have been omitted to conserve space.} 240 | \label{tbl:weather-raw} 241 | \end{table} 242 | 243 | \begin{table}[htbp] 244 | \centering 245 | \subfloat[Molten data]% 246 | {\label{tbl:weather-molten}\input{data/weather-clean-1.tex}}% 247 | \hspace{2em}% 248 | \subfloat[Tidy data]% 249 | {\label{tbl:weather-tidy}\input{data/weather-clean-2.tex}}% 250 | 251 | \caption{(a) Molten weather dataset. This is almost tidy, but instead of values, the \code{element} column contains names of variables. Missing values are dropped to conserve space. (b) Tidy weather dataset. Each row represents the meteorological measurements for a single day. There are two measured variables, minimum (\code{tmin}) and maximum (\code{tmax}) temperature; all other variables are fixed.} 252 | \label{tbl:weather-clean} 253 | \end{table} 254 | 255 | \subsection{Multiple types in one table} 256 | \label{sub:multiple-types} 257 | 258 | Datasets often involve values collected at multiple levels, on different types of observational units. During tidying, each type of observational unit should be stored in its own table. This is closely related to the idea of database normalisation, where each fact is expressed in only one place. If this is not done, it's possible for inconsistencies to occur. 259 | 260 | %If you're not familiar with normalisation, it can be worthwhile to learn a little about it. There are many good tutorials available online - I found \url{http://phlonx.com/resources/nf3/} after a few minutes of searching.
You certainly don't need to become an expert, as most statistical databases only need a small amount of normalisation, but it is extremely helpful for identifying inconsistencies in your data. 261 | 262 | The Billboard dataset described in Table~\ref{tbl:billboard-clean} actually contains observations on two types of observational units: the song and its rank in each week. This manifests itself through the duplication of facts about the song: \code{artist} and \code{time} are repeated for every song in each week. The Billboard dataset needs to be broken down into two datasets: a song dataset which stores \code{artist}, \code{song name} and \code{time}, and a ranking dataset which gives the \code{rank} of the \code{song} in each \code{week}. Table~\ref{tbl:billboard-normal} shows these two datasets. You could also imagine a week dataset which would record background information about the week, maybe the total number of songs sold or similar demographic information. 263 | 264 | \begin{table} 265 | \centering 266 | \input{data/billboard-song.tex}\hspace{1em}% 267 | \input{data/billboard-rank.tex} 268 | 269 | \caption{Normalised billboard dataset split up into song dataset (left) and rank dataset (right). First 15 rows of each dataset shown; \code{genre} omitted from song dataset, \code{week} omitted from rank dataset.} 270 | \label{tbl:billboard-normal} 271 | \end{table} 272 | 273 | Normalisation is useful for tidying and eliminating inconsistencies. However, there are few data analysis tools that work directly with relational data, so analysis usually also requires denormalisation, or merging the datasets back into one table. 274 | 275 | % Multiple-choice/check all that apply data 276 | 277 | \subsection{One type in multiple tables} 278 | 279 | It's also common to find data values about a single type of observational unit spread out over multiple tables or files. These tables and files are often split up by another variable, so that each represents a single year, person, or location. As long as the format for individual records is consistent, this is an easy problem to fix: 280 | 281 | \begin{enumerate} 282 | \item Read the files into a list of tables. 283 | 284 | \item For each table, add a new column that records the original file name (because the file name is often the value of an important variable). 285 | 286 | \item Combine all tables into a single table. 287 | \end{enumerate} 288 | 289 | The \pkg{plyr} package makes this a straightforward task in \proglang{R}. The following code generates a vector of file names in a directory (\code{data/}) which match a regular expression (ends in \code{.csv}). Next we name each element of the vector with the name of the file. We do this because \pkg{plyr} will preserve the names in the following step, ensuring that each row in the final data frame is labelled with its source. Finally, \code{ldply()} loops over each path, reading in the CSV file and combining the results into a single data frame. 290 | 291 | \begin{Verbatim} 292 | paths <- dir("data", pattern = "\\.csv$", full.names = TRUE) 293 | names(paths) <- basename(paths) 294 | ldply(paths, read.csv, stringsAsFactors = FALSE) 295 | \end{Verbatim} 296 | 297 | Once you have a single table, you can perform additional tidying as needed. An example of this type of cleaning can be found at \url{https://github.com/hadley/data-baby-names} which takes 129 yearly baby name tables provided by the US Social Security Administration and combines them into a single file.
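When the splitting variable is encoded in the file names themselves, one further step recovers it. As a small sketch, assuming (hypothetically) files named like \code{1998.csv}, the year can be extracted from the \code{.id} column that \code{ldply()} creates from the names of \code{paths}:

\begin{Verbatim}
combined <- ldply(paths, read.csv, stringsAsFactors = FALSE)
# .id holds the source file name; pull the year out of it
combined$year <- as.integer(substr(combined$.id, 1, 4))
\end{Verbatim}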
298 | 299 | A more complicated situation occurs when the dataset structure changes over time. For example, the datasets may contain different variables, the same variables with different names, different file formats, or different conventions for missing values. This may require you to tidy each file individually (or, if you're lucky, in small groups) and then combine them once tidied. An example of this type of tidying is illustrated in \url{https://github.com/hadley/data-fuel-economy}, which shows the tidying of {\sc epa} fuel economy data for over 50,000 cars from 1978 to 2008. The raw data is available online, but each year is stored in a separate file and there are four major formats with many minor variations, making tidying this dataset a considerable challenge. 300 | 301 | \section{Tidy tools} 302 | \label{sec:tidy-tools} 303 | 304 | Once you have a tidy dataset, what can you do with it? Tidy data is only worthwhile if it makes analysis easier. This section discusses tidy tools, tools that take tidy datasets as input and return tidy datasets as output. Tidy tools are useful because the output of one tool can be used as the input to another. This allows you to simply and easily compose multiple tools to solve a problem. Tidy data also ensures that variables are stored in a consistent, explicit manner. This makes each tool simpler, because it doesn't need a Swiss Army knife of parameters for dealing with different dataset structures. 305 | 306 | Tools can be messy for two reasons: either they take messy datasets as input (messy-input tools) or they produce messy datasets as output (messy-output tools). Messy-input tools are typically more complicated than tidy-input tools because they need to include some parts of the tidying process. This can be useful for common types of messy datasets, but it typically makes the function more complex, harder to use and harder to maintain. Messy-output tools are frustrating and slow down analysis because they cannot be easily composed and you must constantly think about how to convert from one format to another. We'll see examples of both in the following sections. 307 | 308 | Next, I give examples of tidy and messy tools for three important components of analysis: data manipulation, visualisation and modelling. I will focus particularly on tools provided by \proglang{R} \citep{r}, because it has many existing tidy tools, but I will also touch on other statistical programming environments. 309 | 310 | \subsection{Manipulation} 311 | 312 | Data manipulation includes variable-by-variable transformation (e.g., \code{log} or \code{sqrt}), as well as aggregation, filtering and reordering. In my experience, these are the four fundamental verbs of data manipulation: 313 | 314 | \begin{itemize} 315 | 316 | \item Filter: subsetting or removing observations based on some 317 | condition. 318 | 319 | \item Transform: adding or modifying variables. These modifications can 320 | involve either a single variable (e.g., log-transformation), or multiple 321 | variables (e.g., computing density from weight and volume). 322 | 323 | \item Aggregate: collapsing multiple values into a single value (e.g., by 324 | summing or taking means). 325 | 326 | \item Sort: changing the order of observations. 327 | 328 | \end{itemize} 329 | 330 | All these operations are made easier when there is a consistent way to refer to variables. Tidy data provides this because each variable resides in its own column.
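As a minimal sketch of the four verbs in base \proglang{R}, consider a small hypothetical tidy data frame with columns \code{group}, \code{weight} and \code{volume}:

\begin{Verbatim}
df <- data.frame(group = c("a", "a", "b"), weight = c(12, 15, 9),
  volume = c(4, 5, 2))
subset(df, weight > 10)                     # filter
transform(df, density = weight / volume)   # transform
aggregate(weight ~ group, data = df, mean) # aggregate
df[order(df$weight), ]                     # sort
\end{Verbatim}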
331 | 332 | In \proglang{R}, filtering and transforming are performed by the base \proglang{R} functions \code{subset()} and \code{transform()}. These are input- and output-tidy. The \code{aggregate()} function performs group-wise aggregation. It is input-tidy. Provided that a single aggregation method is used, it is also output-tidy. The \pkg{plyr} package provides tidy \code{summarise()} and \code{arrange()} functions for aggregation and sorting. 333 | 334 | The four verbs can be, and often are, modified by the ``by'' preposition. We often need group-wise aggregates, transformations and subsets, to pick the biggest in each group, to average over replicates and so on. Combining each of the four verbs with a ``by'' operator allows them to operate on subsets of a data frame at a time. Most \proglang{SAS} {\sc proc}s possess a {\sc by} statement which allows the operation to be performed by group, and are generally input-tidy. Base \proglang{R} possesses a \code{by()} function, which is input-tidy, but not output-tidy, because it produces a list. The \code{ddply()} function from the \pkg{plyr} package is a tidy alternative. 335 | 336 | % Some aggregations occur so frequently they deserve their own optimised implementations. One such operation is (weighted) counting. Base R provides the {\tt table} function for this, but it is not output-tidy: it returns a multidimensional array. A tidy alternative is the {\tt count} function from {\tt plyr}, which returns a tidy dataset with a column for each of the input variables plus a new variable {\tt freq}, which records the number of records in each category. 337 | 338 | Other tools are needed when we have multiple datasets. An advantage of tidy data is the ease with which it can be combined with other tidy datasets. All that is needed is a join operator that works by matching common variables and adding new columns. This is implemented in the \code{merge()} function in base \proglang{R}, or the \code{join()} function in \pkg{plyr}. Compare these operators with the difficulty of combining datasets stored in arrays. This task typically requires painstaking alignment before matrix operations can be used, which can make errors very hard to detect. 339 | 340 | \subsection{Visualisation} 341 | 342 | Tidy visualisation tools only need to be input-tidy as their output is visual. Domain-specific languages work particularly well for the visualisation of tidy datasets because they can describe a visualisation as a mapping between variables and aesthetic properties of the graph (e.g., position, size, shape and colour). This is the idea behind the grammar of graphics \citep{wilkinson:2006}, and the layered grammar of graphics \citep{wickham:2007d}, an extension tailored specifically for \proglang{R}. 343 | 344 | Most graphical tools in \proglang{R} are input-tidy, including the \pkg{base} \code{plot()} function, the \pkg{lattice} family of plots \citep{sarkar:2008} and \pkg{ggplot2} \citep{me:ggplot2}. Some specialised tools exist for visualising messy datasets: base \proglang{R} functions like \code{barplot()}, \code{matplot()}, \code{dotchart()}, and \code{mosaicplot()} work with messy datasets where variables are spread out over multiple columns. Similarly, the parallel coordinates plot \citep{wegman:1990,inselberg:1985} can be used to create time series plots for messy datasets where each time point is a column.
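As a small sketch of input-tidy plotting, assuming the tidy weather data of Table~\subref*{tbl:weather-tidy} is stored in a data frame \code{weather}, each variable maps directly to an aesthetic and no reshaping is needed first:

\begin{Verbatim}
qplot(date, tmax, data = weather, geom = "line")
\end{Verbatim}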
345 | 346 | \subsection{Modelling} 347 | \label{sub:modelling} 348 | 349 | Modelling is the driving inspiration of this work because most modelling tools work best with tidy datasets. Every statistical language has a way of describing a model as a connection among different variables, a domain-specific language that connects responses to predictors: 350 | 351 | \begin{itemize} 352 | 353 | \item \proglang{R} (\code{lm()}): \code{y ~ a + b + c * d}. 354 | 355 | \item \proglang{SAS} (\code{PROC GLM}): \code{y = a + b + c + d + c * d}. 356 | 357 | \item \proglang{SPSS} (\code{glm}): \code{y BY a b c d / DESIGN a b c d c * d}. 358 | 359 | \item \proglang{Stata} (\code{regress}): \code{y a b c\#d}. 360 | 361 | 362 | \end{itemize} 363 | 364 | This is not to say that tidy data is the format used internally to compute the regression. Significant transformations take place to produce a numeric matrix that can easily be fed to standard linear algebra routines. Common transformations include adding an intercept column (a column of ones), turning categorical variables into multiple binary dummy variables, and projecting data onto the appropriate basis of a spline function. 365 | 366 | % http://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/viewer.htm#statug_glm_sect022.htm 367 | 368 | There have been some attempts to adapt modelling functions for specific types of messy datasets. For example, in \proglang{SAS}'s \code{proc glm}, multiple variables on the response side of the equation will be interpreted as repeated measures if the {\sc repeated} keyword is present. For the raw Billboard data, we could construct a model of the form \code{wk1-wk75 = track} instead of \code{rank = week * track} on the tidy data. 369 | 370 | Another interesting example is the classic paired t-test, which can be computed in two ways depending on how the data is stored. If the data is stored as in Table~\subref*{tbl:paired}, then a paired t-test is just a t-test applied to the differences between \code{x} and \code{y}. If it is stored in the form of Table~\subref*{tbl:mixed}, then an equivalent result can be produced by fitting a mixed effects model, with a fixed dummy variable representing the \code{variable}, and a random intercept for each \code{id}. (In \proglang{R}'s \pkg{lme4} notation, this is expressed as \code{value ~ variable + (1 | id)}.) Either way of modelling the data yields the same result. Without more information we can't say which form of the data is tidy: if \code{x} and \code{y} represent the lengths of left and right arms, then Table~\subref*{tbl:paired} is tidy; if they represent measurements on day 1 and day 10, then Table~\subref*{tbl:mixed} is tidy. 371 | 372 | \begin{table} 373 | \centering 374 | \subfloat[Data for paired t-test]{ 375 | \label{tbl:paired} 376 | \input{model-1.tex} 377 | }% 378 | \hspace{2em}% 379 | \subfloat[Data for mixed effects model]{ 380 | \label{tbl:mixed} 381 | \input{model-2.tex} 382 | } 383 | 384 | \caption{Two datasets for performing the same test.} 385 | \label{tbl:ttest} 386 | \end{table} 387 | 388 | While models usually require tidy inputs, such attention to detail doesn't carry over to model outputs. Outputs such as predictions and estimated coefficients aren't always tidy. This makes it more difficult to combine results from multiple models. For example, in \proglang{R}, the default representation of model coefficients is not tidy because it does not have an explicit variable that records the variable name for each estimate; instead, the names are recorded as row names.
In \proglang{R}, row names must be unique, so combining coefficients from many models (e.g., from bootstrap resamples, or subgroups) requires workarounds to avoid losing important information. This knocks you out of the flow of analysis and makes it harder to combine the results from multiple models. I'm not currently aware of any packages that resolve this problem. 389 | 390 | \section{Case study} 391 | \label{sec:case-study} 392 | 393 | The following case study illustrates how tidy data and tidy tools make data analysis easier by easing the transitions between manipulation, visualisation and modelling. You will not see any code that exists solely to get the output of one function into the right format to input to another. I'll show the \proglang{R} code that performs the analysis, but even if you're not familiar with \proglang{R} or the exact idioms I use, I've tried to make it easy to understand by tightly interleaving code, results and explanation. 394 | 395 | The case study uses individual-level mortality data from Mexico. The goal is to find causes of death with unusual temporal patterns within a day. Figure~\ref{fig:overall} shows the temporal pattern, the number of deaths per hour, for all causes of death. My goal is to find the diseases that differ most from this pattern. 396 | 397 | \begin{figure}[htbp] 398 | \centering 399 | \includegraphics[width=0.65\linewidth]{case-study/overall} 400 | \caption{Temporal pattern of all causes of death.} 401 | \label{fig:overall} 402 | \end{figure} 403 | 404 | The full dataset has information on 539,530 deaths in Mexico in 2008 and 55 variables, including the location and time of death, the cause of death, and demographics of the deceased. Table~\ref{fig:raw} shows a small sample of the dataset, focussing on variables related to time of death (\code{year}, \code{month}, \code{day} and \code{hour}), and cause of death (\code{cod}). 405 | 406 | \begin{table} 407 | \centering 408 | \input{case-study/raw.tex} 409 | \caption{A sample of 16 rows and 5 columns from the original dataset of 539,530 rows and 55 columns.} 410 | \label{fig:raw} 411 | \end{table} 412 | 413 | To achieve our goal of finding unusual temporal patterns, we do the following. First, we count the number of deaths in each hour (\code{hod}) for each cause (\code{cod}) with the tidy \code{count()} function. 414 | 415 | \begin{Verbatim} 416 | hod2 <- count(deaths, c("hod", "cod")) 417 | \end{Verbatim} 418 | 419 | Then we remove missing (and hence uninformative for our purpose) values with \code{subset()}. 420 | 421 | \begin{Verbatim} 422 | hod2 <- subset(hod2, !is.na(hod)) 423 | \end{Verbatim} 424 | 425 | This gives Table~\subref*{tbl:counts:1}. To provide informative labels for disease, we next join the dataset to the \code{codes} dataset, connected by the \code{cod} variable. This adds a new variable, \code{disease}, shown in Table~\subref*{tbl:counts:2}. 426 | 427 | \begin{Verbatim} 428 | hod2 <- join(hod2, codes, by = "cod") 429 | \end{Verbatim} 430 | 431 | The total number of deaths per cause varies over several orders of magnitude: there are 46,794 deaths from heart attack but only 10 from avalanche. This means that rather than the total number, it makes more sense to compare the proportion of deaths in each hour. We compute this by breaking the dataset down by \code{cod}, and then \code{transform()}ing to add a new \code{prop} column, the hourly frequency divided by the total number of deaths from that cause. This new column is shown in Table~\subref*{tbl:counts:3}.
432 | 433 | \code{ddply()} breaks down the first argument (\code{hod2}) by its second (the \code{cod} variable), and then applies the third argument (\code{transform}) to each resulting piece. The fourth argument (\code{prop = freq / sum(freq)}) is then passed on to \code{transform()}. 434 | 435 | \begin{Verbatim} 436 | hod2 <- ddply(hod2, "cod", transform, prop = freq / sum(freq)) 437 | \end{Verbatim} 438 | 439 | We then compute the overall average death rate for each hour, and merge that back into the original dataset. This yields Table~\subref*{tbl:counts:4}. By comparing \code{prop} to \code{prop\_all}, we can easily compare each disease with the overall incidence pattern. 440 | 441 | First, we work out the number of people dying each hour. We break down \code{hod2} by \code{hod}, and sum the frequencies across all causes of death. 442 | 443 | \begin{Verbatim} 444 | overall <- ddply(hod2, "hod", summarise, freq_all = sum(freq)) 445 | \end{Verbatim} 446 | 447 | Next, we work out the overall proportion of people dying each hour: 448 | 449 | \begin{Verbatim} 450 | overall <- transform(overall, prop_all = freq_all / sum(freq_all)) 451 | \end{Verbatim} 452 | 453 | Finally, we join the overall dataset with the individual dataset to make it easier to compare the two: 454 | 455 | \begin{Verbatim} 456 | hod2 <- join(hod2, overall, by = "hod") 457 | \end{Verbatim} 458 | 459 | \begin{table}[htbp] 460 | \centering 461 | \subfloat[]{ 462 | \label{tbl:counts:1}\input{case-study/counts.tex} 463 | }% 464 | \subfloat[]{ 465 | \label{tbl:counts:2}\input{case-study/counts-disease.tex} 466 | }% 467 | \subfloat[]{ 468 | \label{tbl:counts:3}\input{case-study/counts-prop.tex} 469 | }% 470 | \subfloat[]{ 471 | \label{tbl:counts:4}\input{case-study/counts-all.tex} 472 | } 473 | 474 | \caption{A sample of four diseases and four hours from the \code{hod2} data frame.} 475 | \label{tbl:counts} 476 | \end{table} 477 | 478 | Next, we compute a distance between the temporal pattern of each cause of death and the overall temporal pattern. There are many ways to measure this distance, but I found a simple mean squared deviation to be revealing. We also record the sample size, the total number of deaths from that cause. To ensure that the diseases we consider are sufficiently representative, we'll only work with diseases with more than 50 total deaths ($\sim$2/hour). 479 | 480 | \begin{Verbatim} 481 | devi <- ddply(hod2, "cod", summarise, 482 | n = sum(freq), 483 | dist = mean((prop - prop_all)^2)) 484 | 485 | devi <- subset(devi, n > 50) 486 | \end{Verbatim} 487 | 488 | We don't know the variance characteristics of this estimator, but we can explore it visually by plotting \code{n} vs.\ \code{dist}, Figure~\subref*{fig:deviation-raw}. On a linear scale, the plot shows little, except that variability decreases with sample size. But on the log-log scale, Figure~\subref*{fig:deviation-log}, there is a clear pattern. The pattern is particularly easy to see once we add the line of best fit from a robust linear model.
489 | 490 | \begin{Verbatim} 491 | ggplot(data = devi, aes(x = n, y = dist)) + geom_point() 492 | 493 | last_plot() + 494 | scale_x_log10() + 495 | scale_y_log10() + 496 | geom_smooth(method = "rlm", se = F) 497 | \end{Verbatim} 498 | 499 | \begin{figure}[htbp] 500 | \centering 501 | \subfloat[Linear scales]{ 502 | \label{fig:deviation-raw} 503 | \includegraphics[width=0.5\linewidth]{case-study/n-dist-raw.pdf} 504 | }% 505 | \subfloat[Log scales]{ 506 | \label{fig:deviation-log} 507 | \includegraphics[width=0.5\linewidth]{case-study/n-dist-log.pdf} 508 | } 509 | 510 | \caption{(a) Plot of n vs deviation. Variability of deviation is dominated by sample size: small samples have large variability. (b) Log-log plot makes it easy to see the pattern of variation as well as unusually high values. The blue line is a robust line of best fit.} 511 | \label{fig:deviation} 512 | \end{figure} 513 | 514 | \begin{figure}[htbp] 515 | \centering 516 | \includegraphics[width=0.5\linewidth]{case-study/n-dist-resid} 517 | \caption{Residuals from a robust linear model predicting $\log(dist)$ by $\log(n)$. Horizontal line at 1.5 shows threshold for further exploration.} 518 | \label{fig:devi-resid} 519 | \end{figure} 520 | 521 | We are interested in points that have high $y$-values relative to their $x$-neighbours. Controlling for the number of deaths, these points represent the diseases which depart the most from the overall pattern. 522 | 523 | To find these unusual points, we fit a robust linear model and plot the residuals, Figure~\ref{fig:devi-resid}. The plot shows an empty region around a residual of 1.5. So, somewhat arbitrarily, we'll select those diseases with a residual greater than 1.5. We do this in two steps: first, we select the appropriate rows from \code{devi} (one row per disease), and then we find the matching temporal course information from the original summary dataset (24 rows per disease). 524 | 525 | \begin{Verbatim} 526 | devi$resid <- resid(rlm(log(dist) ~ log(n), data = devi)) 527 | unusual <- subset(devi, resid > 1.5) 528 | hod_unusual <- match_df(hod2, unusual) 529 | \end{Verbatim} 530 | 531 | Finally, we plot the temporal course for each unusual cause, Figure~\ref{fig:disease}. We split the diseases into two plots because of differences in variability. The top plot shows diseases with over 350 deaths and the bottom with under 350. The causes of death fall into three main groups: murder, drowning, and transportation-related deaths. Murder is more common at night, drowning in the afternoon, and transportation-related deaths during commute times. The pale grey line in the background shows the temporal course across all diseases. 532 | 533 | \begin{Verbatim} 534 | ggplot(data = subset(hod_unusual, n > 350), aes(x = hod, y = prop)) + 535 | geom_line(aes(y = prop_all), data = overall, colour = "grey50") + 536 | geom_line() + 537 | facet_wrap(~ disease, ncol = 3) 538 | \end{Verbatim} 539 | 540 | \begin{figure}[htbp] 541 | \centering 542 | \includegraphics[width=0.9\textwidth]{case-study/unusual-big} 543 | \includegraphics[width=0.9\textwidth]{case-study/unusual-sml} 544 | \caption{Causes of death with unusual temporal courses. Overall hourly death rate shown in grey. (Top) Causes of death with more than 350 deaths over a year. (Bottom) Causes of death with fewer than 350 deaths.
Note that the y-axes are on different scales.} 545 | \label{fig:disease} 546 | \end{figure} 547 | 548 | \section{Discussion} 549 | \label{sec:discussion} 550 | 551 | Data cleaning is an important problem, but it is an uncommon subject of study in statistics. This paper carves out a small but important subset of data cleaning that I've called data tidying: structuring datasets to facilitate manipulation, visualisation and modelling. There is still much work to be done. Incremental improvements will happen as our understanding of tidy data and tidy tools improves, and as we improve our ability to reduce the friction of getting data into a tidy form. 552 | 553 | Bigger improvements may be possible by exploring alternative formulations of tidiness. There is a chicken-and-egg problem with tidy data: if tidy data is only as useful as the tools that work with it, then tidy tools will be inextricably linked to tidy data. This makes it easy to get stuck in a local maximum where independently changing data structures or data tools will not improve workflow. Breaking out of this local maximum is hard. It requires long-term concerted effort with the prospect of many false starts. While I hope that the tidy data framework is not one of those false starts, I also don't see it as the final solution. I hope others will build on this framework to develop even better data storage strategies and better tools. 554 | 555 | Surprisingly, I have found few principles to guide the design of tidy data that acknowledge both statistical and cognitive factors. To date, my work has been driven by my experience doing data analysis, my knowledge of relational database design, and my own rumination on the tools of data analysis. The human factors, user-centered design, and human-computer interaction communities may be able to add to this conversation, but the design of data and tools to work with it has not been an active research topic in those fields. In the future, I hope to use methodologies from these fields (user-testing, ethnography, talk-aloud protocols) to improve our understanding of the cognitive side of data analysis, and to further improve our ability to design appropriate tools. 556 | 557 | Other formulations of tidy data are possible. For example, it would be possible to construct a set of tools for dealing with values stored in multidimensional arrays. This is a common storage format for large biomedical datasets generated by microarrays or fMRI. It's also necessary for many multivariate methods based on matrix manipulation. Fortunately, because there are many efficient tools for working with high-dimensional arrays, even sparse ones, such an array-tidy format is not only likely to be quite compact and efficient, it should also be able to easily connect with the mathematical basis of statistics. This, in fact, is the approach taken by the \pkg{pandas} \proglang{Python} data analysis library \citep{mckinney:2010}. Even more interestingly, we could consider tidy tools that can ignore the underlying data representation and automatically choose between array-tidy and dataframe-tidy formats to optimise memory usage and performance.
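\pkg{reshape2} already gestures in this direction: its \code{acast()} function returns array output from the same molten form used throughout this paper. As a sketch, assuming the molten weather layout of Table~\subref*{tbl:weather-molten} is stored in a data frame \code{molten}:

\begin{Verbatim}
# dataframe-tidy to array-tidy: one row per date, one column per element
weather_array <- acast(molten, date ~ element, value.var = "value")
\end{Verbatim}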
558 | 559 | Apart from tidying, there are many other tasks involved in cleaning data: parsing dates and numbers, identifying missing values, correcting character encodings (for international data), matching similar but not identical values (created by typos), verifying experimental design, and filling in structural missing values, not to mention model-based data cleaning that identifies suspicious values. Can we develop other frameworks to make these tasks easier? 560 | 561 | % While the tools that power this work grew out of my personal struggle to work with data, the framework that hooks them all together did not develop until I had to teach data cleaning. I could look at a dataset and intuit what needed to be done to it, but I couldn't explain what I was doing, and I found it very difficult to teach. The description of tidy data in this paper is easier to teach because students are pretty good at identifying variables and values, and then there is a straightforward path to follow to get data in the right format. 562 | 563 | \section{Acknowledgements} 564 | \label{sec:acknowledgements} 565 | 566 | This work wouldn't be possible without the many conversations I've had about data and how to deal with them statistically. I'd particularly like to thank Phil Dixon, Di Cook, and Heike Hofmann, who have put up with numerous questions over the years. I'd also like to thank the users of the \pkg{reshape} package who have provided many challenging problems, and my students who continue to challenge me to explain what I know in a way that they can understand. I'd also like to thank Bob Muenchen, Burt Gunter, Nick Horton and Garrett Grolemund, who gave detailed comments on earlier drafts; Ross Gayler, who provided the nice example of the challenges of defining a variable; and Ben Bolker, who showed me the natural equivalence between a paired t-test and a mixed effects model. 567 | 568 | % bibtool -x tidy-data.aux -c > references.bib 569 | \bibliography{references} 570 | 571 | \end{document} 572 | --------------------------------------------------------------------------------