├── .Rprofile ├── .gitignore ├── LICENSE ├── Makefile ├── README.Rmd ├── README.md ├── apt.txt ├── input ├── communicate-plots.Rmd ├── footnotes.awk ├── images │ ├── brewer-1.png │ ├── cover.png │ ├── visualization-grammar-1.png │ ├── visualization-grammar-2.png │ ├── visualization-grammar-3.png │ ├── visualization-stat-bar.png │ └── visualization-themes.png ├── r4ds-python-plotnine.ipynb.Rmd ├── translate.sed └── visualize.Rmd ├── output ├── r4ds-python-plotnine.Rmd └── r4ds-python-plotnine.ipynb ├── r4ds-python-plotnine.Rproj ├── renv.lock ├── renv ├── .gitignore ├── activate.R └── settings.dcf └── requirements.txt /.Rprofile: -------------------------------------------------------------------------------- 1 | source("renv/activate.R") 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # R 2 | renv/library 3 | .Rproj.user 4 | .Rhistory 5 | .RData 6 | .Ruserdata 7 | README.html 8 | 9 | # Python 10 | venv/ 11 | .ipynb_checkpoints 12 | 13 | # Input files 14 | input/figure 15 | input/plotnine.html 16 | 17 | # Output files 18 | output/figure 19 | output/r4ds-python-plotnine.blog.md 20 | output/r4ds-python-plotnine.div.Rmd 21 | 22 | # macOS 23 | .DS_Store 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Data Science Workshops 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following 
conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: clean rmd ipynb blogpost lab anchors 2 | .SUFFIXES: 3 | 4 | .ONESHELL: 5 | SHELL = /usr/bin/env bash -o pipefail 6 | .SHELLFLAGS = -e 7 | 8 | NAME = r4ds-python-plotnine 9 | 10 | # SLUG, SITE, and POST are only used to produce the blog post at datascienceworkshops.com 11 | SLUG = plotnine-grammar-of-graphics-for-python 12 | SITE = ~/repos/mine/dsw-com 13 | POST = $(SITE)/content/_posts/2019-12-11-$(SLUG).md 14 | 15 | output: 16 | mkdir -p $@ 17 | 18 | venv: requirements.txt 19 | rm -rf venv 20 | virtualenv --python=python3.7 venv 21 | . venv/bin/activate; \ 22 | pip install -U pip; \ 23 | pip install -Ur requirements.txt; \ 24 | python -m ipykernel install --user --name=$(NAME) # install kernelspec 25 | 26 | renv/library: renv.lock 27 | Rscript --vanilla -e 'if (!requireNamespace("renv")) install.packages("renv"); renv::restore()' 28 | touch $@ 29 | 30 | clean: 31 | rm -rf output 32 | 33 | output/$(NAME).ipynb: input/$(NAME).ipynb.Rmd output venv # compile to a Jupyter notebook 34 | . 
venv/bin/activate; \ 35 | < $< sed -e '/^---$$/,/^---$$/d;/```{r/,/```/d;/TODO/d' | \ 36 | sed -e '//,//d' | \ 37 | sed -e '/_HIDE_MD/d' | \ 38 | sed -e '/START_COMMENT/,/END_COMMENT/d' | \ 39 | awk -f input/footnotes.awk | \ 40 | cat -s | \ 41 | sed -re 's/\[\^([0-9]+)\]: (.*)$$/\1\. \2<\/span>\n/' | \ 42 | sed -re 's/\[\^([0-9]+)\]/[\1<\/sup>](#fn:\1)/g' | \ 43 | jupytext --from rmarkdown --to notebook --set-kernel $(NAME) --execute > $@ 44 | 45 | output/$(NAME).div.Rmd: input/$(NAME).ipynb.Rmd output # remove lines not meant for R markdown 46 | < $< sed -e '//,//d' | \ 47 | sed -e '/_HIDE_IPYNB/d' | \ 48 | sed -e '/START_COMMENT/,/END_COMMENT/d' | \ 49 | awk -f input/footnotes.awk > $@ 50 | 51 | output/$(NAME).Rmd: output/$(NAME).div.Rmd # remove divs and extra yaml as they're only used by the blog post 52 | < $< sed -re '/^tagline:/i ---' | \ 53 | sed -re '/^tagline:/,/^---/d;/<\/?div/d;s/ class="[^"]"//g' > $@ 54 | 55 | output/$(NAME).blog.md: output/$(NAME).div.Rmd venv renv/library 56 | Rscript --vanilla -e 'source("renv/activate.R"); knitr::knit("$<", "$@")' 57 | 58 | $(POST): output/$(NAME).blog.md $(SITE)/content/_posts $(SITE)/assets/img/blog 59 | rm -f $(SITE)/content/_posts/*-$(SLUG).md 60 | mkdir -p $(SITE)/assets/img/blog/$(SLUG)/ 61 | rm -rf $(SITE)/assets/img/blog/$(SLUG)/* 62 | cp output/figure/* $(SITE)/assets/img/blog/$(SLUG)/ 63 | cp input/images/* $(SITE)/assets/img/blog/$(SLUG)/ 64 | cat $< | \ 65 | sed 's/ alt="[^"]*"//g' | \ 66 | sed 's/ title="[^"]*"//g' | \ 67 | sed '/ $@ 71 | 72 | README.md: README.Rmd venv renv/library 73 | Rscript --vanilla -e 'source("renv/activate.R"); rmarkdown::render("$<")' 74 | 75 | rmd: output/$(NAME).Rmd 76 | 77 | ipynb: output/$(NAME).ipynb 78 | 79 | blogpost: $(POST) # not meant to be run by mere mortals 80 | 81 | anchors: 82 | curl -sL "https://r4ds.had.co.nz" | \ 83 | grep 'data-level' | \ 84 | awk -F\" '$$4 ~ /^(3|28)/ {print "["$$4"](https://r4ds.had.co.nz/"$$8")   "}' 85 | 86 | lab: venv 87 | . 
venv/bin/activate 88 | jupyter lab 89 | 90 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r setup, echo=FALSE} 8 | library(knitr) 9 | library(reticulate) 10 | 11 | use_virtualenv("./venv", required = TRUE) 12 | opts_chunk$set(comment = "") 13 | ``` 14 | 15 | # Plotnine: Grammar of Graphics for Python 16 | 17 | ##### A translation of the visualisation chapters from "R for Data Science" to Python using Plotnine and Pandas. 18 | 19 | This repository contains all the code and text to generate [this tutorial](https://www.datascienceworkshops.com/blog/plotnine-grammar-of-graphics-for-python). The tutorial is also available as a Jupyter notebook and an R notebook; these are located in the *output* directory. 20 | 21 | 22 | ## Run notebooks 23 | 24 | To run these notebooks, make sure you have the necessary dependencies installed. After cloning this repository you can: 25 | 26 | * Run `make venv` to create a virtualenv with Python 3.7, install the packages listed in *requirements.txt*, and create a Jupyter kernel specification. 27 | * Run `make lab` to start Jupyter Lab using the virtualenv. 28 | * Run `make renv/library` to install all the required R packages. 29 | 30 | 31 | ## Re-create notebooks from source 32 | 33 | If you change the input source, i.e., *input/r4ds-python-plotnine.ipynb.Rmd*, you can: 34 | 35 | * Run `make ipynb` to re-create the Jupyter notebook *output/r4ds-python-plotnine.ipynb*. 36 | * Run `make rmd` to re-create the R notebook *output/r4ds-python-plotnine.Rmd*. 37 | 38 | This tutorial and the accompanying code has been written and tested on macOS. I suspect that it also works on other Unix and Linux distributions. I doubt that it works out of the box on Windows. 
The *Makefile* and *requirements.txt* files may provide useful hints on how to install the dependencies. 39 | 40 | 41 | ## R packages used 42 | 43 | ```{r} 44 | sessioninfo::session_info() 45 | ``` 46 | 47 | 48 | ## Python packages used 49 | 50 | ```{bash} 51 | . venv/bin/activate && pip freeze 52 | ``` 53 | 54 | 55 | ## License 56 | 57 | The tutorial, the Jupyter notebook and the R notebook are licensed, just like the original text, under the [Creative Commons Attribution-NonCommercial-NoDerivs 3.0](https://creativecommons.org/licenses/by-nc-nd/3.0/us/) License while everything else is licensed under the [MIT License](https://raw.githubusercontent.com/datascienceworkshops/r4ds-python-plotnine/master/LICENSE). 58 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Plotnine: Grammar of Graphics for Python 5 | 6 | ##### A translation of the visualisation chapters from “R for Data Science” to Python using Plotnine and Pandas. 7 | 8 | This repository contains all the code and text to generate [this 9 | tutorial](https://www.datascienceworkshops.com/blog/plotnine-grammar-of-graphics-for-python). 10 | The tutorial is also available as a Jupyter notebook and an R notebook; 11 | these are located in the *output* directory. 12 | 13 | ## Run notebooks 14 | 15 | To run these notebooks, make sure you have the necessary dependencies 16 | installed. After cloning this repository you can: 17 | 18 | - Run `make venv` to create a virtualenv with Python 3.7, install the 19 | packages listed in *requirements.txt*, and create a Jupyter kernel 20 | specification. 21 | - Run `make lab` to start Jupyter Lab using the virtualenv. 22 | - Run `make renv/library` to install all the required R 23 | packages. 
24 | 25 | ## Re-create notebooks from source 26 | 27 | If you change the input source, i.e., 28 | *input/r4ds-python-plotnine.ipynb.Rmd*, you can: 29 | 30 | - Run `make ipynb` to re-create the Jupyter notebook 31 | *output/r4ds-python-plotnine.ipynb*. 32 | - Run `make rmd` to re-create the R notebook 33 | *output/r4ds-python-plotnine.Rmd*. 34 | 35 | This tutorial and the accompanying code has been written and tested on 36 | macOS. I suspect that it also works on other Unix and Linux 37 | distributions. I doubt that it works out of the box on Windows. The 38 | *Makefile* and *requirements.txt* files may provide useful hints on how 39 | to install the dependencies. 40 | 41 | ## R packages used 42 | 43 | ``` r 44 | sessioninfo::session_info() 45 | ``` 46 | 47 | ─ Session info ─────────────────────────────────────────────────────────────── 48 | setting value 49 | version R version 3.6.1 (2019-07-05) 50 | os macOS Mojave 10.14.6 51 | system x86_64, darwin15.6.0 52 | ui X11 53 | language en_US.UTF-8 54 | collate en_US.UTF-8 55 | ctype en_US.UTF-8 56 | tz Europe/Amsterdam 57 | date 2019-12-10 58 | 59 | ─ Packages ─────────────────────────────────────────────────────────────────── 60 | ! package * version date lib source 61 | P assertthat 0.2.1 2019-03-21 [?] CRAN (R 3.6.0) 62 | P cli 2.0.0 2019-12-09 [?] CRAN (R 3.6.1) 63 | P crayon 1.3.4 2017-09-16 [?] CRAN (R 3.6.0) 64 | P digest 0.6.23 2019-11-23 [?] CRAN (R 3.6.0) 65 | P evaluate 0.14 2019-05-28 [?] CRAN (R 3.6.0) 66 | P fansi 0.4.0 2018-10-05 [?] CRAN (R 3.6.0) 67 | P glue 1.3.1 2019-03-12 [?] CRAN (R 3.6.0) 68 | P htmltools 0.4.0 2019-10-04 [?] CRAN (R 3.6.0) 69 | P jsonlite 1.6 2018-12-07 [?] CRAN (R 3.6.0) 70 | P knitr * 1.26 2019-11-12 [?] CRAN (R 3.6.0) 71 | P lattice 0.20-38 2018-11-04 [?] CRAN (R 3.6.1) 72 | P magrittr 1.5 2014-11-22 [?] CRAN (R 3.6.0) 73 | P Matrix 1.2-18 2019-11-27 [?] CRAN (R 3.6.0) 74 | P Rcpp 1.0.3 2019-11-08 [?] CRAN (R 3.6.0) 75 | P renv 0.9.2 2019-12-09 [?] 
CRAN (R 3.6.1) 76 | P reticulate * 1.13 2019-07-24 [?] CRAN (R 3.6.0) 77 | P rlang 0.4.2 2019-11-23 [?] CRAN (R 3.6.0) 78 | P rmarkdown 1.18 2019-11-27 [?] CRAN (R 3.6.0) 79 | P sessioninfo 1.1.1 2018-11-05 [?] CRAN (R 3.6.0) 80 | P stringi 1.4.3 2019-03-12 [?] CRAN (R 3.6.0) 81 | P stringr 1.4.0 2019-02-10 [?] CRAN (R 3.6.0) 82 | P withr 2.1.2 2018-03-15 [?] CRAN (R 3.6.0) 83 | P xfun 0.11 2019-11-12 [?] CRAN (R 3.6.0) 84 | P yaml 2.2.0 2018-07-25 [?] CRAN (R 3.6.0) 85 | 86 | [1] /Users/jeroen/repos/datascienceworkshops/r4ds-python-plotnine/renv/library/R-3.6/x86_64-apple-darwin15.6.0 87 | [2] /private/var/folders/8h/88ch3k996hb0t11db8bj9pzh0000gn/T/RtmpF8ov4T/renv-system-library 88 | 89 | P ── Loaded and on-disk path mismatch. 90 | 91 | ## Python packages used 92 | 93 | ``` bash 94 | . venv/bin/activate && pip freeze 95 | ``` 96 | 97 | adjustText==0.7.3 98 | appnope==0.1.0 99 | attrs==19.3.0 100 | backcall==0.1.0 101 | bleach==3.1.0 102 | cycler==0.10.0 103 | decorator==4.4.1 104 | defusedxml==0.6.0 105 | descartes==1.1.0 106 | entrypoints==0.3 107 | importlib-metadata==1.2.0 108 | ipykernel==5.1.3 109 | ipython==7.10.1 110 | ipython-genutils==0.2.0 111 | ipywidgets==7.5.1 112 | jedi==0.15.1 113 | Jinja2==2.10.3 114 | joblib==0.14.0 115 | jsonschema==3.2.0 116 | jupyter==1.0.0 117 | jupyter-client==5.3.4 118 | jupyter-console==6.0.0 119 | jupyter-core==4.6.1 120 | jupytext==1.3.0 121 | kiwisolver==1.1.0 122 | MarkupSafe==1.1.1 123 | matplotlib==3.1.2 124 | mistune==0.8.4 125 | mizani==0.6.0 126 | more-itertools==8.0.2 127 | nbconvert==5.6.1 128 | nbformat==4.4.0 129 | notebook==6.0.2 130 | numpy==1.17.4 131 | palettable==3.3.0 132 | pandas==0.25.3 133 | pandocfilters==1.4.2 134 | parso==0.5.1 135 | patsy==0.5.1 136 | pexpect==4.7.0 137 | pickleshare==0.7.5 138 | plotnine==0.6.0+23.g3eb58cd 139 | prometheus-client==0.7.1 140 | prompt-toolkit==2.0.10 141 | ptyprocess==0.6.0 142 | Pygments==2.5.2 143 | pyparsing==2.4.5 144 | pyrsistent==0.15.6 145 | 
python-dateutil==2.8.1 146 | pytz==2019.3 147 | PyYAML==5.2 148 | pyzmq==18.1.1 149 | qtconsole==4.6.0 150 | scikit-learn==0.22 151 | scikit-misc==0.1.1 152 | scipy==1.3.3 153 | Send2Trash==1.5.0 154 | six==1.13.0 155 | statsmodels==0.10.2 156 | terminado==0.8.3 157 | testpath==0.4.4 158 | tornado==6.0.3 159 | traitlets==4.3.3 160 | wcwidth==0.1.7 161 | webencodings==0.5.1 162 | widgetsnbextension==3.5.1 163 | zipp==0.6.0 164 | 165 | ## License 166 | 167 | The tutorial, the Jupyter notebook and the R notebook are licensed, just 168 | like the original text, under the [Creative Commons 169 | Attribution-NonCommercial-NoDerivs 3.0](https://creativecommons.org/licenses/by-nc-nd/3.0/us/) 170 | License while everything else is licensed under the [MIT 171 | License](https://raw.githubusercontent.com/datascienceworkshops/r4ds-python-plotnine/master/LICENSE). 172 | -------------------------------------------------------------------------------- /apt.txt: -------------------------------------------------------------------------------- 1 | texlive-latex-base 2 | texlive-latex-recommended 3 | texlive-science 4 | texlive-latex-extra 5 | texlive-fonts-recommended 6 | dvipng 7 | ghostscript 8 | -------------------------------------------------------------------------------- /input/communicate-plots.Rmd: -------------------------------------------------------------------------------- 1 | # Graphics for communication 2 | 3 | ## Introduction 4 | 5 | In [exploratory data analysis], you learned how to use plots as tools for _exploration_. When you make exploratory plots, you know---even before looking---which variables the plot will display. You made each plot for a purpose, could quickly look at it, and then move on to the next plot. In the course of most analyses, you'll produce tens or hundreds of plots, most of which are immediately thrown away. 6 | 7 | Now that you understand your data, you need to _communicate_ your understanding to others. 
Your audience will likely not share your background knowledge and will not be deeply invested in the data. To help others quickly build up a good mental model of the data, you will need to invest considerable effort in making your plots as self-explanatory as possible. In this chapter, you'll learn some of the tools that ggplot2 provides to do so. 8 | 9 | This chapter focuses on the tools you need to create good graphics. I assume that you know what you want, and just need to know how to do it. For that reason, I highly recommend pairing this chapter with a good general visualisation book. I particularly like [_The Truthful Art_](https://amzn.com/0321934075), by Alberto Cairo. It doesn't teach the mechanics of creating visualisations, but instead focuses on what you need to think about in order to create effective graphics. 10 | 11 | ### Prerequisites 12 | 13 | In this chapter, we'll focus once again on ggplot2. We'll also use a little dplyr for data manipulation, and a few ggplot2 extension packages, including __ggrepel__ and __viridis__. Rather than loading those extensions here, we'll refer to their functions explicitly, using the `::` notation. This will help make it clear which functions are built into ggplot2, and which come from other packages. Don't forget you'll need to install those packages with `install.packages()` if you don't already have them. 14 | 15 | ```{r, message = FALSE} 16 | library(tidyverse) 17 | ``` 18 | 19 | ## Label 20 | 21 | The easiest place to start when turning an exploratory graphic into an expository graphic is with good labels. You add labels with the `labs()` function. This example adds a plot title: 22 | 23 | ```{r, message = FALSE} 24 | ggplot(mpg, aes(displ, hwy)) + 25 | geom_point(aes(color = class)) + 26 | geom_smooth(se = FALSE) + 27 | labs(title = "Fuel efficiency generally decreases with engine size") 28 | ``` 29 | 30 | The purpose of a plot title is to summarise the main finding. 
Avoid titles that just describe what the plot is, e.g. "A scatterplot of engine displacement vs. fuel economy". 31 | 32 | If you need to add more text, there are two other useful labels that you can use in ggplot2 2.2.0 and above (which should be available by the time you're reading this book): 33 | 34 | * `subtitle` adds additional detail in a smaller font beneath the title. 35 | 36 | * `caption` adds text at the bottom right of the plot, often used to describe 37 | the source of the data. 38 | 39 | ```{r, message = FALSE} 40 | ggplot(mpg, aes(displ, hwy)) + 41 | geom_point(aes(color = class)) + 42 | geom_smooth(se = FALSE) + 43 | labs( 44 | title = "Fuel efficiency generally decreases with engine size", 45 | subtitle = "Two seaters (sports cars) are an exception because of their light weight", 46 | caption = "Data from fueleconomy.gov" 47 | ) 48 | ``` 49 | 50 | You can also use `labs()` to replace the axis and legend titles. It's usually a good idea to replace short variable names with more detailed descriptions, and to include the units. 51 | 52 | ```{r, message = FALSE} 53 | ggplot(mpg, aes(displ, hwy)) + 54 | geom_point(aes(colour = class)) + 55 | geom_smooth(se = FALSE) + 56 | labs( 57 | x = "Engine displacement (L)", 58 | y = "Highway fuel economy (mpg)", 59 | colour = "Car type" 60 | ) 61 | ``` 62 | 63 | It's possible to use mathematical equations instead of text strings. Just switch `""` out for `quote()` and read about the available options in `?plotmath`: 64 | 65 | ```{r, fig.asp = 1, out.width = "50%", fig.width = 3} 66 | df <- tibble( 67 | x = runif(10), 68 | y = runif(10) 69 | ) 70 | ggplot(df, aes(x, y)) + 71 | geom_point() + 72 | labs( 73 | x = quote(sum(x[i] ^ 2, i == 1, n)), 74 | y = quote(alpha + beta + frac(delta, theta)) 75 | ) 76 | ``` 77 | 78 | ### Exercises 79 | 80 | 1. Create one plot on the fuel economy data with customised `title`, 81 | `subtitle`, `caption`, `x`, `y`, and `colour` labels. 82 | 83 | 1. 
The `geom_smooth()` is somewhat misleading because the `hwy` for 84 | large engines is skewed upwards due to the inclusion of lightweight 85 | sports cars with big engines. Use your modelling tools to fit and display 86 | a better model. 87 | 88 | 1. Take an exploratory graphic that you've created in the last month, and add 89 | informative titles to make it easier for others to understand. 90 | 91 | ## Annotations 92 | 93 | In addition to labelling major components of your plot, it's often useful to label individual observations or groups of observations. The first tool you have at your disposal is `geom_text()`. `geom_text()` is similar to `geom_point()`, but it has an additional aesthetic: `label`. This makes it possible to add textual labels to your plots. 94 | 95 | There are two possible sources of labels. First, you might have a tibble that provides labels. The plot below isn't terribly useful, but it illustrates a useful approach: pull out the most efficient car in each class with dplyr, and then label it on the plot: 96 | 97 | ```{r} 98 | best_in_class <- mpg %>% 99 | group_by(class) %>% 100 | filter(row_number(desc(hwy)) == 1) 101 | 102 | ggplot(mpg, aes(displ, hwy)) + 103 | geom_point(aes(colour = class)) + 104 | geom_text(aes(label = model), data = best_in_class) 105 | ``` 106 | 107 | This is hard to read because the labels overlap with each other, and with the points. We can make things a little better by switching to `geom_label()` which draws a rectangle behind the text. We also use the `nudge_y` parameter to move the labels slightly above the corresponding points: 108 | 109 | ```{r} 110 | ggplot(mpg, aes(displ, hwy)) + 111 | geom_point(aes(colour = class)) + 112 | geom_label(aes(label = model), data = best_in_class, nudge_y = 2, alpha = 0.5) 113 | ``` 114 | 115 | That helps a bit, but if you look closely in the top-left hand corner, you'll notice that there are two labels practically on top of each other. 
This happens because the highway mileage and displacement for the best cars in the compact and subcompact categories are exactly the same. There's no way that we can fix these by applying the same transformation for every label. Instead, we can use the __ggrepel__ package by Kamil Slowikowski. This useful package will automatically adjust labels so that they don't overlap: 116 | 117 | ```{r} 118 | ggplot(mpg, aes(displ, hwy)) + 119 | geom_point(aes(colour = class)) + 120 | geom_point(size = 3, shape = 1, data = best_in_class) + 121 | ggrepel::geom_label_repel(aes(label = model), data = best_in_class) 122 | ``` 123 | 124 | Note another handy technique used here: I added a second layer of large, hollow points to highlight the points that I've labelled. 125 | 126 | You can sometimes use the same idea to replace the legend with labels placed directly on the plot. It's not wonderful for this plot, but it isn't too bad. (`theme(legend.position = "none")` turns the legend off --- we'll talk about it more shortly.) 127 | 128 | ```{r} 129 | class_avg <- mpg %>% 130 | group_by(class) %>% 131 | summarise( 132 | displ = median(displ), 133 | hwy = median(hwy) 134 | ) 135 | 136 | ggplot(mpg, aes(displ, hwy, colour = class)) + 137 | ggrepel::geom_label_repel(aes(label = class), 138 | data = class_avg, 139 | size = 6, 140 | label.size = 0, 141 | segment.color = NA 142 | ) + 143 | geom_point() + 144 | theme(legend.position = "none") 145 | ``` 146 | 147 | Alternatively, you might just want to add a single label to the plot, but you'll still need to create a data frame. Often, you want the label in the corner of the plot, so it's convenient to create a new data frame using `summarise()` to compute the maximum values of x and y. 148 | 149 | ```{r} 150 | label <- mpg %>% 151 | summarise( 152 | displ = max(displ), 153 | hwy = max(hwy), 154 | label = "Increasing engine size is \nrelated to decreasing fuel economy." 
155 | ) 156 | 157 | ggplot(mpg, aes(displ, hwy)) + 158 | geom_point() + 159 | geom_text(aes(label = label), data = label, vjust = "top", hjust = "right") 160 | ``` 161 | 162 | If you want to place the text exactly on the borders of the plot, you can use `+Inf` and `-Inf`. Since we're no longer computing the positions from `mpg`, we can use `tibble()` to create the data frame: 163 | 164 | ```{r} 165 | label <- tibble( 166 | displ = Inf, 167 | hwy = Inf, 168 | label = "Increasing engine size is \nrelated to decreasing fuel economy." 169 | ) 170 | 171 | ggplot(mpg, aes(displ, hwy)) + 172 | geom_point() + 173 | geom_text(aes(label = label), data = label, vjust = "top", hjust = "right") 174 | ``` 175 | 176 | In these examples, I manually broke the label up into lines using `"\n"`. Another approach is to use `stringr::str_wrap()` to automatically add line breaks, given the number of characters you want per line: 177 | 178 | ```{r} 179 | "Increasing engine size is related to decreasing fuel economy." %>% 180 | stringr::str_wrap(width = 40) %>% 181 | writeLines() 182 | ``` 183 | 184 | Note the use of `hjust` and `vjust` to control the alignment of the label. Figure \@ref(fig:just) shows all nine possible combinations. 
185 | 186 | ```{r just, echo = FALSE, fig.cap = "All nine combinations of `hjust` and `vjust`.", fig.asp = 0.5, fig.width = 4.5, out.width = "60%"} 187 | vjust <- c(bottom = 0, center = 0.5, top = 1) 188 | hjust <- c(left = 0, center = 0.5, right = 1) 189 | 190 | df <- tidyr::crossing(hj = names(hjust), vj = names(vjust)) %>% 191 | mutate( 192 | y = vjust[vj], 193 | x = hjust[hj], 194 | label = paste0("hjust = '", hj, "'\n", "vjust = '", vj, "'") 195 | ) 196 | 197 | ggplot(df, aes(x, y)) + 198 | geom_point(colour = "grey70", size = 5) + 199 | geom_point(size = 0.5, colour = "red") + 200 | geom_text(aes(label = label, hjust = hj, vjust = vj), size = 4) + 201 | labs(x = NULL, y = NULL) 202 | ``` 203 | 204 | Remember, in addition to `geom_text()`, you have many other geoms in ggplot2 available to help annotate your plot. A few ideas: 205 | 206 | * Use `geom_hline()` and `geom_vline()` to add reference lines. I often make 207 | them thick (`size = 2`) and white (`colour = "white"`), and draw them 208 | underneath the primary data layer. That makes them easy to see, without 209 | drawing attention away from the data. 210 | 211 | * Use `geom_rect()` to draw a rectangle around points of interest. The 212 | boundaries of the rectangle are defined by aesthetics `xmin`, `xmax`, 213 | `ymin`, `ymax`. 214 | 215 | * Use `geom_segment()` with the `arrow` argument to draw attention 216 | to a point with an arrow. Use aesthetics `x` and `y` to define the 217 | starting location, and `xend` and `yend` to define the end location. 218 | 219 | The only limit is your imagination (and your patience with positioning annotations to be aesthetically pleasing)! 220 | 221 | ### Exercises 222 | 223 | 1. Use `geom_text()` with infinite positions to place text at the 224 | four corners of the plot. 225 | 226 | 1. Read the documentation for `annotate()`. How can you use it to add a text 227 | label to a plot without having to create a tibble? 228 | 229 | 1. 
How do labels with `geom_text()` interact with faceting? How can you 230 | add a label to a single facet? How can you put a different label in 231 | each facet? (Hint: think about the underlying data.) 232 | 233 | 1. What arguments to `geom_label()` control the appearance of the background 234 | box? 235 | 236 | 1. What are the four arguments to `arrow()`? How do they work? Create a series 237 | of plots that demonstrate the most important options. 238 | 239 | ## Scales 240 | 241 | The third way you can make your plot better for communication is to adjust the scales. Scales control the mapping from data values to things that you can perceive. Normally, ggplot2 automatically adds scales for you. For example, when you type: 242 | 243 | ```{r default-scales, fig.show = "hide"} 244 | ggplot(mpg, aes(displ, hwy)) + 245 | geom_point(aes(colour = class)) 246 | ``` 247 | 248 | ggplot2 automatically adds default scales behind the scenes: 249 | 250 | ```{r, fig.show = "hide"} 251 | ggplot(mpg, aes(displ, hwy)) + 252 | geom_point(aes(colour = class)) + 253 | scale_x_continuous() + 254 | scale_y_continuous() + 255 | scale_colour_discrete() 256 | ``` 257 | 258 | Note the naming scheme for scales: `scale_` followed by the name of the aesthetic, then `_`, then the name of the scale. The default scales are named according to the type of variable they align with: continuous, discrete, datetime, or date. There are lots of non-default scales which you'll learn about below. 259 | 260 | The default scales have been carefully chosen to do a good job for a wide range of inputs. Nevertheless, you might want to override the defaults for two reasons: 261 | 262 | * You might want to tweak some of the parameters of the default scale. 263 | This allows you to do things like change the breaks on the axes, or the 264 | key labels on the legend. 265 | 266 | * You might want to replace the scale altogether, and use a completely 267 | different algorithm. 
Often you can do better than the default because 268 | you know more about the data. 269 | 270 | ### Axis ticks and legend keys 271 | 272 | There are two primary arguments that affect the appearance of the ticks on the axes and the keys on the legend: `breaks` and `labels`. Breaks controls the position of the ticks, or the values associated with the keys. Labels controls the text label associated with each tick/key. The most common use of `breaks` is to override the default choice: 273 | 274 | ```{r} 275 | ggplot(mpg, aes(displ, hwy)) + 276 | geom_point() + 277 | scale_y_continuous(breaks = seq(15, 40, by = 5)) 278 | ``` 279 | 280 | You can use `labels` in the same way (a character vector the same length as `breaks`), but you can also set it to `NULL` to suppress the labels altogether. This is useful for maps, or for publishing plots where you can't share the absolute numbers. 281 | 282 | ```{r} 283 | ggplot(mpg, aes(displ, hwy)) + 284 | geom_point() + 285 | scale_x_continuous(labels = NULL) + 286 | scale_y_continuous(labels = NULL) 287 | ``` 288 | 289 | You can also use `breaks` and `labels` to control the appearance of legends. Collectively axes and legends are called __guides__. Axes are used for x and y aesthetics; legends are used for everything else. 290 | 291 | Another use of `breaks` is when you have relatively few data points and want to highlight exactly where the observations occur. For example, take this plot that shows when each US president started and ended their term. 292 | 293 | ```{r} 294 | presidential %>% 295 | mutate(id = 33 + row_number()) %>% 296 | ggplot(aes(start, id)) + 297 | geom_point() + 298 | geom_segment(aes(xend = end, yend = id)) + 299 | scale_x_date(NULL, breaks = presidential$start, date_labels = "'%y") 300 | ``` 301 | 302 | Note that the specification of breaks and labels for date and datetime scales is a little different: 303 | 304 | * `date_labels` takes a format specification, in the same form as 305 | `parse_datetime()`. 
306 | 307 | * `date_breaks` (not shown here) takes a string like "2 days" or "1 month". 308 | 309 | ### Legend layout 310 | 311 | You will most often use `breaks` and `labels` to tweak the axes. While they both also work for legends, there are a few other techniques you are more likely to use. 312 | 313 | To control the overall position of the legend, you need to use a `theme()` setting. We'll come back to themes at the end of the chapter, but in brief, they control the non-data parts of the plot. The theme setting `legend.position` controls where the legend is drawn: 314 | 315 | ```{r fig.asp = 1, fig.align = "default", out.width = "50%", fig.width = 4} 316 | base <- ggplot(mpg, aes(displ, hwy)) + 317 | geom_point(aes(colour = class)) 318 | 319 | base + theme(legend.position = "left") 320 | base + theme(legend.position = "top") 321 | base + theme(legend.position = "bottom") 322 | base + theme(legend.position = "right") # the default 323 | ``` 324 | 325 | You can also use `legend.position = "none"` to suppress the display of the legend altogether. 326 | 327 | To control the display of individual legends, use `guides()` along with `guide_legend()` or `guide_colourbar()`. The following example shows two important settings: controlling the number of rows the legend uses with `nrow`, and overriding one of the aesthetics to make the points bigger. This is particularly useful if you have used a low `alpha` to display many points on a plot. 328 | 329 | ```{r} 330 | ggplot(mpg, aes(displ, hwy)) + 331 | geom_point(aes(colour = class)) + 332 | geom_smooth(se = FALSE) + 333 | theme(legend.position = "bottom") + 334 | guides(colour = guide_legend(nrow = 1, override.aes = list(size = 4))) 335 | ``` 336 | 337 | ### Replacing a scale 338 | 339 | Instead of just tweaking the details a little, you can instead replace the scale altogether. There are two types of scales you're most likely to want to switch out: continuous position scales and colour scales. 
Fortunately, the same principles apply to all the other aesthetics, so once you've mastered position and colour, you'll be able to quickly pick up other scale replacements. 340 | 341 | It's very useful to plot transformations of your variable. For example, as we've seen in [diamond prices](diamond-prices) it's easier to see the precise relationship between `carat` and `price` if we log transform them: 342 | 343 | ```{r, fig.align = "default", out.width = "50%"} 344 | ggplot(diamonds, aes(carat, price)) + 345 | geom_bin2d() 346 | 347 | ggplot(diamonds, aes(log10(carat), log10(price))) + 348 | geom_bin2d() 349 | ``` 350 | 351 | However, the disadvantage of this transformation is that the axes are now labelled with the transformed values, making it hard to interpret the plot. Instead of doing the transformation in the aesthetic mapping, we can instead do it with the scale. This is visually identical, except the axes are labelled on the original data scale. 352 | 353 | ```{r} 354 | ggplot(diamonds, aes(carat, price)) + 355 | geom_bin2d() + 356 | scale_x_log10() + 357 | scale_y_log10() 358 | ``` 359 | 360 | Another scale that is frequently customised is colour. The default categorical scale picks colours that are evenly spaced around the colour wheel. Useful alternatives are the ColorBrewer scales which have been hand tuned to work better for people with common types of colour blindness. The two plots below look similar, but there is enough difference in the shades of red and green that the dots on the right can be distinguished even by people with red-green colour blindness. 361 | 362 | ```{r, fig.align = "default", out.width = "50%"} 363 | ggplot(mpg, aes(displ, hwy)) + 364 | geom_point(aes(color = drv)) 365 | 366 | ggplot(mpg, aes(displ, hwy)) + 367 | geom_point(aes(color = drv)) + 368 | scale_colour_brewer(palette = "Set1") 369 | ``` 370 | 371 | Don't forget simpler techniques. If there are just a few colours, you can add a redundant shape mapping. 
This will also help ensure your plot is interpretable in black and white. 372 | 373 | ```{r} 374 | ggplot(mpg, aes(displ, hwy)) + 375 | geom_point(aes(color = drv, shape = drv)) + 376 | scale_colour_brewer(palette = "Set1") 377 | ``` 378 | 379 | The ColorBrewer scales are documented online at <http://colorbrewer2.org/> and made available in R via the __RColorBrewer__ package, by Erich Neuwirth. Figure \@ref(fig:brewer) shows the complete list of all palettes. The sequential (top) and diverging (bottom) palettes are particularly useful if your categorical values are ordered, or have a "middle". This often arises if you've used `cut()` to make a continuous variable into a categorical variable. 380 | 381 | ```{r brewer, fig.asp = 2.5, echo = FALSE, fig.cap = "All ColourBrewer scales."} 382 | par(mar = c(0, 3, 0, 0)) 383 | RColorBrewer::display.brewer.all() 384 | ``` 385 | 386 | When you have a predefined mapping between values and colours, use `scale_colour_manual()`. For example, if we map presidential party to colour, we want to use the standard mapping of red for Republicans and blue for Democrats: 387 | 388 | ```{r} 389 | presidential %>% 390 | mutate(id = 33 + row_number()) %>% 391 | ggplot(aes(start, id, colour = party)) + 392 | geom_point() + 393 | geom_segment(aes(xend = end, yend = id)) + 394 | scale_colour_manual(values = c(Republican = "red", Democratic = "blue")) 395 | ``` 396 | 397 | For continuous colour, you can use the built-in `scale_colour_gradient()` or `scale_fill_gradient()`. If you have a diverging scale, you can use `scale_colour_gradient2()`. That allows you to give, for example, positive and negative values different colours. That's sometimes also useful if you want to distinguish points above or below the mean. 398 | 399 | Another option is `scale_colour_viridis()` provided by the __viridis__ package. It's a continuous analog of the categorical ColorBrewer scales.
The designers, Nathaniel Smith and Stéfan van der Walt, carefully tailored a continuous colour scheme that has good perceptual properties. Here's an example from the viridis vignette. 400 | 401 | ```{r, fig.align = "default", fig.asp = 1, out.width = "50%", fig.width = 4} 402 | df <- tibble( 403 | x = rnorm(10000), 404 | y = rnorm(10000) 405 | ) 406 | ggplot(df, aes(x, y)) + 407 | geom_hex() + 408 | coord_fixed() 409 | 410 | ggplot(df, aes(x, y)) + 411 | geom_hex() + 412 | viridis::scale_fill_viridis() + 413 | coord_fixed() 414 | ``` 415 | 416 | Note that all colour scales come in two varieties: `scale_colour_x()` and `scale_fill_x()` for the `colour` and `fill` aesthetics respectively (the colour scales are available in both UK and US spellings). 417 | 418 | ### Exercises 419 | 420 | 1. Why doesn't the following code override the default scale? 421 | 422 | ```{r fig.show = "hide"} 423 | ggplot(df, aes(x, y)) + 424 | geom_hex() + 425 | scale_colour_gradient(low = "white", high = "red") + 426 | coord_fixed() 427 | ``` 428 | 429 | 1. What is the first argument to every scale? How does it compare to `labs()`? 430 | 431 | 1. Change the display of the presidential terms by: 432 | 433 | 1. Combining the two variants shown above. 434 | 1. Improving the display of the y axis. 435 | 1. Labelling each term with the name of the president. 436 | 1. Adding informative plot labels. 437 | 1. Placing breaks every 4 years (this is trickier than it seems!). 438 | 439 | 1. Use `override.aes` to make the legend on the following plot easier to see. 440 | 441 | ```{r, dev = "png", out.width = "50%"} 442 | ggplot(diamonds, aes(carat, price)) + 443 | geom_point(aes(colour = cut), alpha = 1/20) 444 | ``` 445 | 446 | ## Zooming 447 | 448 | There are three ways to control the plot limits: 449 | 450 | 1. Adjusting what data are plotted 451 | 1. Setting the limits in each scale 452 | 1.
Setting `xlim` and `ylim` in `coord_cartesian()` 453 | 454 | To zoom in on a region of the plot, it's generally best to use `coord_cartesian()`. Compare the following two plots: 455 | 456 | ```{r out.width = "50%", fig.align = "default", message = FALSE} 457 | ggplot(mpg, mapping = aes(displ, hwy)) + 458 | geom_point(aes(color = class)) + 459 | geom_smooth() + 460 | coord_cartesian(xlim = c(5, 7), ylim = c(10, 30)) 461 | 462 | mpg %>% 463 | filter(displ >= 5, displ <= 7, hwy >= 10, hwy <= 30) %>% 464 | ggplot(aes(displ, hwy)) + 465 | geom_point(aes(color = class)) + 466 | geom_smooth() 467 | ``` 468 | 469 | You can also set the `limits` on individual scales. Reducing the limits is basically equivalent to subsetting the data. It is generally more useful if you want to _expand_ the limits, for example, to match scales across different plots. For example, if we extract two classes of cars and plot them separately, it's difficult to compare the plots because all three scales (the x-axis, the y-axis, and the colour aesthetic) have different ranges. 470 | 471 | ```{r out.width = "50%", fig.align = "default", fig.width = 4} 472 | suv <- mpg %>% filter(class == "suv") 473 | compact <- mpg %>% filter(class == "compact") 474 | 475 | ggplot(suv, aes(displ, hwy, colour = drv)) + 476 | geom_point() 477 | 478 | ggplot(compact, aes(displ, hwy, colour = drv)) + 479 | geom_point() 480 | ``` 481 | 482 | One way to overcome this problem is to share scales across multiple plots, training the scales with the `limits` of the full data.
483 | 484 | ```{r out.width = "50%", fig.align = "default", fig.width = 4} 485 | x_scale <- scale_x_continuous(limits = range(mpg$displ)) 486 | y_scale <- scale_y_continuous(limits = range(mpg$hwy)) 487 | col_scale <- scale_colour_discrete(limits = unique(mpg$drv)) 488 | 489 | ggplot(suv, aes(displ, hwy, colour = drv)) + 490 | geom_point() + 491 | x_scale + 492 | y_scale + 493 | col_scale 494 | 495 | ggplot(compact, aes(displ, hwy, colour = drv)) + 496 | geom_point() + 497 | x_scale + 498 | y_scale + 499 | col_scale 500 | ``` 501 | 502 | In this particular case, you could have simply used faceting, but this technique is useful more generally, if for instance, you want to spread plots over multiple pages of a report. 503 | 504 | ## Themes 505 | 506 | Finally, you can customise the non-data elements of your plot with a theme: 507 | 508 | ```{r, message = FALSE} 509 | ggplot(mpg, aes(displ, hwy)) + 510 | geom_point(aes(color = class)) + 511 | geom_smooth(se = FALSE) + 512 | theme_bw() 513 | ``` 514 | 515 | ggplot2 includes eight themes by default, as shown in Figure \@ref(fig:themes). Many more are included in add-on packages like __ggthemes__ (<https://github.com/jrnold/ggthemes>), by Jeffrey Arnold. 516 | 517 | ```{r themes, echo = FALSE, fig.cap = "The eight themes built-in to ggplot2."} 518 | knitr::include_graphics("images/visualization-themes.png") 519 | ``` 520 | 521 | Many people wonder why the default theme has a grey background. This was a deliberate choice because it puts the data forward while still making the grid lines visible. The white grid lines are visible (which is important because they significantly aid position judgements), but they have little visual impact and we can easily tune them out. The grey background gives the plot a similar typographic colour to the text, ensuring that the graphics fit in with the flow of a document without jumping out with a bright white background.
Finally, the grey background creates a continuous field of colour which ensures that the plot is perceived as a single visual entity. 522 | 523 | It's also possible to control individual components of each theme, like the size and colour of the font used for the y axis. Unfortunately, this level of detail is outside the scope of this book, so you'll need to read the [ggplot2 book](https://amzn.com/331924275X) for the full details. You can also create your own themes, if you are trying to match a particular corporate or journal style. 524 | 525 | ## Saving your plots 526 | 527 | There are two main ways to get your plots out of R and into your final write-up: `ggsave()` and knitr. `ggsave()` will save the most recent plot to disk: 528 | 529 | ```{r, fig.show = "none"} 530 | ggplot(mpg, aes(displ, hwy)) + geom_point() 531 | ggsave("my-plot.pdf") 532 | ``` 533 | ```{r, include = FALSE} 534 | file.remove("my-plot.pdf") 535 | ``` 536 | 537 | If you don't specify the `width` and `height` they will be taken from the dimensions of the current plotting device. For reproducible code, you'll want to specify them. 538 | 539 | Generally, however, I think you should be assembling your final reports using R Markdown, so I want to focus on the important code chunk options that you should know about for graphics. You can learn more about `ggsave()` in the documentation. 540 | 541 | ### Figure sizing 542 | 543 | The biggest challenge of graphics in R Markdown is getting your figures the right size and shape. There are five main options that control figure sizing: `fig.width`, `fig.height`, `fig.asp`, `out.width` and `out.height`. Image sizing is challenging because there are two sizes (the size of the figure created by R and the size at which it is inserted in the output document), and multiple ways of specifying the size (i.e., height, width, and aspect ratio: pick two of three). 
544 | 545 | I only ever use three of the five options: 546 | 547 | * I find it most aesthetically pleasing for plots to have a consistent 548 | width. To enforce this, I set `fig.width = 6` (6") and `fig.asp = 0.618` 549 | (the golden ratio) in the defaults. Then in individual chunks, I only 550 | adjust `fig.asp`. 551 | 552 | * I control the output size with `out.width` and set it to a percentage 553 | of the line width. I default to `out.width = "70%"` 554 | and `fig.align = "center"`. That gives plots room to breathe, without taking 555 | up too much space. 556 | 557 | * To put multiple plots in a single row I set the `out.width` to 558 | `50%` for two plots, `33%` for 3 plots, or `25%` for 4 plots, and set 559 | `fig.align = "default"`. Depending on what I'm trying to illustrate (e.g. 560 | show data or show plot variations), I'll also tweak `fig.width`, as 561 | discussed below. 562 | 563 | If you find that you're having to squint to read the text in your plot, you need to tweak `fig.width`. If `fig.width` is larger than the size the figure is rendered in the final doc, the text will be too small; if `fig.width` is smaller, the text will be too big. You'll often need to do a little experimentation to figure out the right ratio between the `fig.width` and the eventual width in your document. To illustrate the principle, the following three plots have `fig.width` of 4, 6, and 8 respectively: 564 | 565 | ```{r, include = FALSE} 566 | plot <- ggplot(mpg, aes(displ, hwy)) + geom_point() 567 | ``` 568 | ```{r, fig.width = 4, echo = FALSE} 569 | plot 570 | ``` 571 | ```{r, fig.width = 6, echo = FALSE} 572 | plot 573 | ``` 574 | ```{r, fig.width = 8, echo = FALSE} 575 | plot 576 | ``` 577 | 578 | If you want to make sure the font size is consistent across all your figures, whenever you set `out.width`, you'll also need to adjust `fig.width` to maintain the same ratio with your default `out.width`.
For example, if your default `fig.width` is 6 and `out.width` is 0.7, when you set `out.width = "50%"` you'll need to set `fig.width` to 4.3 (6 * 0.5 / 0.7). 579 | 580 | ### Other important options 581 | 582 | When mingling code and text, like I do in this book, I recommend setting `fig.show = "hold"` so that plots are shown after the code. This has the pleasant side effect of forcing you to break up large blocks of code with their explanations. 583 | 584 | To add a caption to the plot, use `fig.cap`. In R Markdown this will change the figure from inline to "floating". 585 | 586 | If you're producing PDF output, the default graphics type is PDF. This is a good default because PDFs are high quality vector graphics. However, they can produce very large and slow plots if you are displaying thousands of points. In that case, set `dev = "png"` to force the use of PNGs. They are slightly lower quality, but will be much more compact. 587 | 588 | It's a good idea to name code chunks that produce figures, even if you don't routinely label other chunks. The chunk label is used to generate the file name of the graphic on disk, so naming your chunks makes it much easier to pick out plots and reuse in other circumstances (i.e. if you want to quickly drop a single plot into an email or a tweet). 589 | 590 | ## Learning more 591 | 592 | The absolute best place to learn more is the ggplot2 book: [_ggplot2: Elegant graphics for data analysis_](https://amzn.com/331924275X). It goes into much more depth about the underlying theory, and has many more examples of how to combine the individual pieces to solve practical problems. Unfortunately, the book is not available online for free, although you can find the source code at <https://github.com/hadley/ggplot2-book>. 593 | 594 | Another great resource is the ggplot2 extensions guide <http://www.ggplot2-exts.org/>. This site lists many of the packages that extend ggplot2 with new geoms and scales. It's a great place to start if you're trying to do something that seems hard with ggplot2.
595 | -------------------------------------------------------------------------------- /input/footnotes.awk: -------------------------------------------------------------------------------- 1 | /\[\^\]/ { 2 | split($0, parts, /\[\^\]\(|\.\)/) 3 | for (p = 1; p < length(parts); p++) { 4 | if (p % 2) { 5 | printf "%s[^%d]", parts[p], ++i 6 | } else { 7 | notes[i] = parts[p] 8 | } 9 | } 10 | print parts[p] 11 | next 12 | } 13 | 14 | { 15 | print 16 | } 17 | 18 | END { 19 | print "\n## Footnotes\n" 20 | 21 | for (i = 1; i <= length(notes); i++) { 22 | print "[^" i "]: " notes[i] "." 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /input/images/brewer-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jeroenjanssens/r4ds-python-plotnine/913898724fcc7b61acdd8af98caa4258d54e6f69/input/images/brewer-1.png -------------------------------------------------------------------------------- /input/images/cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jeroenjanssens/r4ds-python-plotnine/913898724fcc7b61acdd8af98caa4258d54e6f69/input/images/cover.png -------------------------------------------------------------------------------- /input/images/visualization-grammar-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jeroenjanssens/r4ds-python-plotnine/913898724fcc7b61acdd8af98caa4258d54e6f69/input/images/visualization-grammar-1.png -------------------------------------------------------------------------------- /input/images/visualization-grammar-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jeroenjanssens/r4ds-python-plotnine/913898724fcc7b61acdd8af98caa4258d54e6f69/input/images/visualization-grammar-2.png 
-------------------------------------------------------------------------------- /input/images/visualization-grammar-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jeroenjanssens/r4ds-python-plotnine/913898724fcc7b61acdd8af98caa4258d54e6f69/input/images/visualization-grammar-3.png -------------------------------------------------------------------------------- /input/images/visualization-stat-bar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jeroenjanssens/r4ds-python-plotnine/913898724fcc7b61acdd8af98caa4258d54e6f69/input/images/visualization-stat-bar.png -------------------------------------------------------------------------------- /input/images/visualization-themes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jeroenjanssens/r4ds-python-plotnine/913898724fcc7b61acdd8af98caa4258d54e6f69/input/images/visualization-themes.png -------------------------------------------------------------------------------- /input/translate.sed: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env -S sed -rf 2 | 3 | # Basic replacements 4 | /```/!{ 5 | s/FALSE/False/g 6 | s/TRUE/True/g 7 | s/NULL/None/g 8 | s/\bR\b/Python/g 9 | s/ggplot2/plotnine/g 10 | s/data frame/DataFrame/g 11 | } 12 | 13 | # Apply the following only inside code chunks 14 | /```\{/,/```/{ 15 | # Change language to Python 16 | s/```\{r/```\{python/ 17 | 18 | # Add slashes after plusses 19 | s/\+\s*$/+\\/ 20 | 21 | # Quote column names 22 | s/([\ \(])(x|y|cty|hwy|displ|class|drv|cut|clarity|depth|carat|price)([\),])/\1"\2"\3/g 23 | 24 | # Replace dots with underscores in function names 25 | /```/!s/([a-z]{2,})\.([a-z]+)/\1_\2/g 26 | 27 | # Remove indentation 28 | /(```|ggplot)/!{ 29 | s/^(( )*) /\1/ 30 | } 31 | } 32 | 33 | # Remove whitespace 
around equal signs 34 | s/ = /=/g 35 | 36 | # Change assignment operator 37 | s/<-/=/g 38 | 39 | # No such thing as Python Markdown 40 | s/Python Markdown/R Markdown/g 41 | -------------------------------------------------------------------------------- /input/visualize.Rmd: -------------------------------------------------------------------------------- 1 | # Data visualisation 2 | 3 | ## Introduction 4 | 5 | > "The simple graph has brought more information to the data analyst’s mind 6 | > than any other device." --- John Tukey 7 | 8 | This chapter will teach you how to visualise your data using ggplot2. R has several systems for making graphs, but ggplot2 is one of the most elegant and most versatile. ggplot2 implements the __grammar of graphics__, a coherent system for describing and building graphs. With ggplot2, you can do more faster by learning one system and applying it in many places. 9 | 10 | If you'd like to learn more about the theoretical underpinnings of ggplot2 before you start, I'd recommend reading "The Layered Grammar of Graphics", <http://vita.had.co.nz/papers/layered-grammar.pdf>. 11 | 12 | ### Prerequisites 13 | 14 | This chapter focusses on ggplot2, one of the core members of the tidyverse. To access the datasets, help pages, and functions that we will use in this chapter, load the tidyverse by running this code: 15 | 16 | ```{r setup} 17 | library(tidyverse) 18 | ``` 19 | 20 | That one line of code loads the core tidyverse; packages which you will use in almost every data analysis. It also tells you which functions from the tidyverse conflict with functions in base R (or from other packages you might have loaded). 21 | 22 | If you run this code and get the error message "there is no package called ‘tidyverse’", you'll need to first install it, then run `library()` once again. 23 | 24 | ```{r eval = FALSE} 25 | install.packages("tidyverse") 26 | library(tidyverse) 27 | ``` 28 | 29 | You only need to install a package once, but you need to reload it every time you start a new session.
30 | 31 | If we need to be explicit about where a function (or dataset) comes from, we'll use the special form `package::function()`. For example, `ggplot2::ggplot()` tells you explicitly that we're using the `ggplot()` function from the ggplot2 package. 32 | 33 | ## First steps 34 | 35 | Let's use our first graph to answer a question: Do cars with big engines use more fuel than cars with small engines? You probably already have an answer, but try to make your answer precise. What does the relationship between engine size and fuel efficiency look like? Is it positive? Negative? Linear? Nonlinear? 36 | 37 | ### The `mpg` data frame 38 | 39 | You can test your answer with the `mpg` __data frame__ found in ggplot2 (aka `ggplot2::mpg`). A data frame is a rectangular collection of variables (in the columns) and observations (in the rows). `mpg` contains observations collected by the US Environmental Protection Agency on 38 models of car. 40 | 41 | ```{r} 42 | mpg 43 | ``` 44 | 45 | Among the variables in `mpg` are: 46 | 47 | 1. `displ`, a car's engine size, in litres. 48 | 49 | 1. `hwy`, a car's fuel efficiency on the highway, in miles per gallon (mpg). 50 | A car with a low fuel efficiency consumes more fuel than a car with a high 51 | fuel efficiency when they travel the same distance. 52 | 53 | To learn more about `mpg`, open its help page by running `?mpg`. 54 | 55 | ### Creating a ggplot 56 | 57 | To plot `mpg`, run this code to put `displ` on the x-axis and `hwy` on the y-axis: 58 | 59 | ```{r} 60 | ggplot(data = mpg) + 61 | geom_point(mapping = aes(x = displ, y = hwy)) 62 | ``` 63 | 64 | The plot shows a negative relationship between engine size (`displ`) and fuel efficiency (`hwy`). In other words, cars with big engines use more fuel. Does this confirm or refute your hypothesis about fuel efficiency and engine size? 65 | 66 | With ggplot2, you begin a plot with the function `ggplot()`. `ggplot()` creates a coordinate system that you can add layers to. 
The first argument of `ggplot()` is the dataset to use in the graph. So `ggplot(data = mpg)` creates an empty graph, but it's not very interesting so I'm not going to show it here. 67 | 68 | You complete your graph by adding one or more layers to `ggplot()`. The function `geom_point()` adds a layer of points to your plot, which creates a scatterplot. ggplot2 comes with many geom functions that each add a different type of layer to a plot. You'll learn a whole bunch of them throughout this chapter. 69 | 70 | Each geom function in ggplot2 takes a `mapping` argument. This defines how variables in your dataset are mapped to visual properties. The `mapping` argument is always paired with `aes()`, and the `x` and `y` arguments of `aes()` specify which variables to map to the x and y axes. ggplot2 looks for the mapped variables in the `data` argument, in this case, `mpg`. 71 | 72 | ### A graphing template 73 | 74 | Let's turn this code into a reusable template for making graphs with ggplot2. To make a graph, replace the bracketed sections in the code below with a dataset, a geom function, or a collection of mappings. 75 | 76 | ```{r eval = FALSE} 77 | ggplot(data = <DATA>) + 78 | <GEOM_FUNCTION>(mapping = aes(<MAPPINGS>)) 79 | ``` 80 | 81 | The rest of this chapter will show you how to complete and extend this template to make different types of graphs. We will begin with the `<MAPPINGS>` component. 82 | 83 | ### Exercises 84 | 85 | 1. Run `ggplot(data = mpg)`. What do you see? 86 | 87 | 1. How many rows are in `mpg`? How many columns? 88 | 89 | 1. What does the `drv` variable describe? Read the help for `?mpg` to find 90 | out. 91 | 92 | 1. Make a scatterplot of `hwy` vs `cyl`. 93 | 94 | 1. What happens if you make a scatterplot of `class` vs `drv`? Why is 95 | the plot not useful? 96 | 97 | ## Aesthetic mappings 98 | 99 | > "The greatest value of a picture is when it forces us to notice what we 100 | > never expected to see."
--- John Tukey 101 | 102 | In the plot below, one group of points (highlighted in red) seems to fall outside of the linear trend. These cars have a higher mileage than you might expect. How can you explain these cars? 103 | 104 | ```{r, echo = FALSE} 105 | ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + 106 | geom_point() + 107 | geom_point(data = dplyr::filter(mpg, displ > 5, hwy > 20), colour = "red", size = 2.2) 108 | ``` 109 | 110 | Let's hypothesize that the cars are hybrids. One way to test this hypothesis is to look at the `class` value for each car. The `class` variable of the `mpg` dataset classifies cars into groups such as compact, midsize, and SUV. If the outlying points are hybrids, they should be classified as compact cars or, perhaps, subcompact cars (keep in mind that this data was collected before hybrid trucks and SUVs became popular). 111 | 112 | You can add a third variable, like `class`, to a two dimensional scatterplot by mapping it to an __aesthetic__. An aesthetic is a visual property of the objects in your plot. Aesthetics include things like the size, the shape, or the color of your points. You can display a point (like the one below) in different ways by changing the values of its aesthetic properties. Since we already use the word "value" to describe data, let's use the word "level" to describe aesthetic properties. 
Here we change the levels of a point's size, shape, and color to make the point small, triangular, or blue: 113 | 114 | ```{r, echo = FALSE, asp = 1/4} 115 | ggplot() + 116 | geom_point(aes(1, 1), size = 20) + 117 | geom_point(aes(2, 1), size = 10) + 118 | geom_point(aes(3, 1), size = 20, shape = 17) + 119 | geom_point(aes(4, 1), size = 20, colour = "blue") + 120 | scale_x_continuous(NULL, limits = c(0.5, 4.5), labels = NULL) + 121 | scale_y_continuous(NULL, limits = c(0.9, 1.1), labels = NULL) + 122 | theme(aspect.ratio = 1/3) 123 | ``` 124 | 125 | You can convey information about your data by mapping the aesthetics in your plot to the variables in your dataset. For example, you can map the colors of your points to the `class` variable to reveal the class of each car. 126 | 127 | ```{r} 128 | ggplot(data = mpg) + 129 | geom_point(mapping = aes(x = displ, y = hwy, color = class)) 130 | ``` 131 | 132 | (If you prefer British English, like Hadley, you can use `colour` instead of `color`.) 133 | 134 | To map an aesthetic to a variable, associate the name of the aesthetic to the name of the variable inside `aes()`. ggplot2 will automatically assign a unique level of the aesthetic (here a unique color) to each unique value of the variable, a process known as __scaling__. ggplot2 will also add a legend that explains which levels correspond to which values. 135 | 136 | The colors reveal that many of the unusual points are two-seater cars. These cars don't seem like hybrids, and are, in fact, sports cars! Sports cars have large engines like SUVs and pickup trucks, but small bodies like midsize and compact cars, which improves their gas mileage. In hindsight, these cars were unlikely to be hybrids since they have large engines. 137 | 138 | In the above example, we mapped `class` to the color aesthetic, but we could have mapped `class` to the size aesthetic in the same way. In this case, the exact size of each point would reveal its class affiliation. 
We get a _warning_ here, because mapping an unordered variable (`class`) to an ordered aesthetic (`size`) is not a good idea. 139 | 140 | ```{r} 141 | ggplot(data = mpg) + 142 | geom_point(mapping = aes(x = displ, y = hwy, size = class)) 143 | ``` 144 | 145 | Or we could have mapped `class` to the _alpha_ aesthetic, which controls the transparency of the points, or to the shape aesthetic, which controls the shape of the points. 146 | 147 | ```{r out.width = "50%", fig.align = 'default', warning = FALSE, fig.asp = 1/2, fig.cap =""} 148 | # Left 149 | ggplot(data = mpg) + 150 | geom_point(mapping = aes(x = displ, y = hwy, alpha = class)) 151 | 152 | # Right 153 | ggplot(data = mpg) + 154 | geom_point(mapping = aes(x = displ, y = hwy, shape = class)) 155 | ``` 156 | 157 | What happened to the SUVs? ggplot2 will only use six shapes at a time. By default, additional groups will go unplotted when you use the shape aesthetic. 158 | 159 | For each aesthetic, you use `aes()` to associate the name of the aesthetic with a variable to display. The `aes()` function gathers together each of the aesthetic mappings used by a layer and passes them to the layer's mapping argument. The syntax highlights a useful insight about `x` and `y`: the x and y locations of a point are themselves aesthetics, visual properties that you can map to variables to display information about the data. 160 | 161 | Once you map an aesthetic, ggplot2 takes care of the rest. It selects a reasonable scale to use with the aesthetic, and it constructs a legend that explains the mapping between levels and values. For x and y aesthetics, ggplot2 does not create a legend, but it creates an axis line with tick marks and a label. The axis line acts as a legend; it explains the mapping between locations and values. 162 | 163 | You can also _set_ the aesthetic properties of your geom manually. 
For example, we can make all of the points in our plot blue: 164 | 165 | ```{r} 166 | ggplot(data = mpg) + 167 | geom_point(mapping = aes(x = displ, y = hwy), color = "blue") 168 | ``` 169 | 170 | Here, the color doesn't convey information about a variable, but only changes the appearance of the plot. To set an aesthetic manually, set the aesthetic by name as an argument of your geom function; i.e. it goes _outside_ of `aes()`. You'll need to pick a level that makes sense for that aesthetic: 171 | 172 | * The name of a color as a character string. 173 | 174 | * The size of a point in mm. 175 | 176 | * The shape of a point as a number, as shown in Figure \@ref(fig:shapes). 177 | 178 | ```{r shapes, echo = FALSE, out.width = "75%", fig.asp = 1/3, fig.cap="R has 25 built in shapes that are identified by numbers. There are some seeming duplicates: for example, 0, 15, and 22 are all squares. The difference comes from the interaction of the `colour` and `fill` aesthetics. The hollow shapes (0--14) have a border determined by `colour`; the solid shapes (15--18) are filled with `colour`; the filled shapes (21--24) have a border of `colour` and are filled with `fill`.", warning = FALSE} 179 | shapes <- tibble( 180 | shape = c(0, 1, 2, 5, 3, 4, 6:19, 22, 21, 24, 23, 20), 181 | x = (0:24 %/% 5) / 2, 182 | y = (-(0:24 %% 5)) / 4 183 | ) 184 | ggplot(shapes, aes(x, y)) + 185 | geom_point(aes(shape = shape), size = 5, fill = "red") + 186 | geom_text(aes(label = shape), hjust = 0, nudge_x = 0.15) + 187 | scale_shape_identity() + 188 | expand_limits(x = 4.1) + 189 | scale_x_continuous(NULL, breaks = NULL) + 190 | scale_y_continuous(NULL, breaks = NULL, limits = c(-1.2, 0.2)) + 191 | theme_minimal() + 192 | theme(aspect.ratio = 1/2.75) 193 | ``` 194 | 195 | ### Exercises 196 | 197 | 1. What's gone wrong with this code? Why are the points not blue? 
198 | 199 | ```{r} 200 | ggplot(data = mpg) + 201 | geom_point(mapping = aes(x = displ, y = hwy, color = "blue")) 202 | ``` 203 | 204 | 1. Which variables in `mpg` are categorical? Which variables are continuous? 205 | (Hint: type `?mpg` to read the documentation for the dataset). How 206 | can you see this information when you run `mpg`? 207 | 208 | 1. Map a continuous variable to `color`, `size`, and `shape`. How do 209 | these aesthetics behave differently for categorical vs. continuous 210 | variables? 211 | 212 | 1. What happens if you map the same variable to multiple aesthetics? 213 | 214 | 1. What does the `stroke` aesthetic do? What shapes does it work with? 215 | (Hint: use `?geom_point`) 216 | 217 | 1. What happens if you map an aesthetic to something other than a variable 218 | name, like `aes(colour = displ < 5)`? Note, you'll also need to specify x and y. 219 | 220 | ## Common problems 221 | 222 | As you start to run R code, you're likely to run into problems. Don't worry --- it happens to everyone. I have been writing R code for years, and every day I still write code that doesn't work! 223 | 224 | Start by carefully comparing the code that you're running to the code in the book. R is extremely picky, and a misplaced character can make all the difference. Make sure that every `(` is matched with a `)` and every `"` is paired with another `"`. Sometimes you'll run the code and nothing happens. Check the left-hand side of your console: if it's a `+`, it means that R doesn't think you've typed a complete expression and it's waiting for you to finish it. In this case, it's usually easy to start from scratch again by pressing ESCAPE to abort processing the current command. 225 | 226 | One common problem when creating ggplot2 graphics is to put the `+` in the wrong place: it has to come at the end of the line, not the start.
In other words, make sure you haven't accidentally written code like this: 227 | 228 | ```R 229 | ggplot(data = mpg) 230 | + geom_point(mapping = aes(x = displ, y = hwy)) 231 | ``` 232 | 233 | If you're still stuck, try the help. You can get help about any R function by running `?function_name` in the console, or selecting the function name and pressing F1 in RStudio. Don't worry if the help doesn't seem that helpful - instead skip down to the examples and look for code that matches what you're trying to do. 234 | 235 | If that doesn't help, carefully read the error message. Sometimes the answer will be buried there! But when you're new to R, the answer might be in the error message but you don't yet know how to understand it. Another great tool is Google: try googling the error message, as it's likely someone else has had the same problem, and has gotten help online. 236 | 237 | ## Facets 238 | 239 | One way to add additional variables is with aesthetics. Another way, particularly useful for categorical variables, is to split your plot into __facets__, subplots that each display one subset of the data. 240 | 241 | To facet your plot by a single variable, use `facet_wrap()`. The first argument of `facet_wrap()` should be a formula, which you create with `~` followed by a variable name (here "formula" is the name of a data structure in R, not a synonym for "equation"). The variable that you pass to `facet_wrap()` should be discrete. 242 | 243 | ```{r} 244 | ggplot(data = mpg) + 245 | geom_point(mapping = aes(x = displ, y = hwy)) + 246 | facet_wrap(~ class, nrow = 2) 247 | ``` 248 | 249 | To facet your plot on the combination of two variables, add `facet_grid()` to your plot call. The first argument of `facet_grid()` is also a formula. This time the formula should contain two variable names separated by a `~`. 
250 | 251 | ```{r} 252 | ggplot(data = mpg) + 253 | geom_point(mapping = aes(x = displ, y = hwy)) + 254 | facet_grid(drv ~ cyl) 255 | ``` 256 | 257 | If you prefer to not facet in the rows or columns dimension, use a `.` instead of a variable name, e.g. `+ facet_grid(. ~ cyl)`. 258 | 259 | ### Exercises 260 | 261 | 1. What happens if you facet on a continuous variable? 262 | 263 | 1. What do the empty cells in plot with `facet_grid(drv ~ cyl)` mean? 264 | How do they relate to this plot? 265 | 266 | ```{r, eval = FALSE} 267 | ggplot(data = mpg) + 268 | geom_point(mapping = aes(x = drv, y = cyl)) 269 | ``` 270 | 271 | 1. What plots does the following code make? What does `.` do? 272 | 273 | ```{r eval = FALSE} 274 | ggplot(data = mpg) + 275 | geom_point(mapping = aes(x = displ, y = hwy)) + 276 | facet_grid(drv ~ .) 277 | 278 | ggplot(data = mpg) + 279 | geom_point(mapping = aes(x = displ, y = hwy)) + 280 | facet_grid(. ~ cyl) 281 | ``` 282 | 283 | 1. Take the first faceted plot in this section: 284 | 285 | ```{r, eval = FALSE} 286 | ggplot(data = mpg) + 287 | geom_point(mapping = aes(x = displ, y = hwy)) + 288 | facet_wrap(~ class, nrow = 2) 289 | ``` 290 | 291 | What are the advantages to using faceting instead of the colour aesthetic? 292 | What are the disadvantages? How might the balance change if you had a 293 | larger dataset? 294 | 295 | 1. Read `?facet_wrap`. What does `nrow` do? What does `ncol` do? What other 296 | options control the layout of the individual panels? Why doesn't 297 | `facet_grid()` have `nrow` and `ncol` arguments? 298 | 299 | 1. When using `facet_grid()` you should usually put the variable with more 300 | unique levels in the columns. Why? 301 | 302 | ## Geometric objects 303 | 304 | How are these two plots similar? 
305 | 306 | ```{r echo = FALSE, out.width = "50%", fig.align="default", message = FALSE} 307 | ggplot(data = mpg) + 308 | geom_point(mapping = aes(x = displ, y = hwy)) 309 | 310 | ggplot(data = mpg) + 311 | geom_smooth(mapping = aes(x = displ, y = hwy)) 312 | ``` 313 | 314 | Both plots contain the same x variable, the same y variable, and both describe the same data. But the plots are not identical. Each plot uses a different visual object to represent the data. In ggplot2 syntax, we say that they use different __geoms__. 315 | 316 | A __geom__ is the geometrical object that a plot uses to represent data. People often describe plots by the type of geom that the plot uses. For example, bar charts use bar geoms, line charts use line geoms, boxplots use boxplot geoms, and so on. Scatterplots break the trend; they use the point geom. As we see above, you can use different geoms to plot the same data. The plot on the left uses the point geom, and the plot on the right uses the smooth geom, a smooth line fitted to the data. 317 | 318 | To change the geom in your plot, change the geom function that you add to `ggplot()`. For instance, to make the plots above, you can use this code: 319 | 320 | ```{r eval = FALSE} 321 | # left 322 | ggplot(data = mpg) + 323 | geom_point(mapping = aes(x = displ, y = hwy)) 324 | 325 | # right 326 | ggplot(data = mpg) + 327 | geom_smooth(mapping = aes(x = displ, y = hwy)) 328 | ``` 329 | 330 | Every geom function in ggplot2 takes a `mapping` argument. However, not every aesthetic works with every geom. You could set the shape of a point, but you couldn't set the "shape" of a line. On the other hand, you _could_ set the linetype of a line. `geom_smooth()` will draw a different line, with a different linetype, for each unique value of the variable that you map to linetype. 
331 | 332 | ```{r message = FALSE} 333 | ggplot(data = mpg) + 334 | geom_smooth(mapping = aes(x = displ, y = hwy, linetype = drv)) 335 | ``` 336 | 337 | Here `geom_smooth()` separates the cars into three lines based on their `drv` value, which describes a car's drivetrain. One line describes all of the points with a `4` value, one line describes all of the points with an `f` value, and one line describes all of the points with an `r` value. Here, `4` stands for four-wheel drive, `f` for front-wheel drive, and `r` for rear-wheel drive. 338 | 339 | If this sounds strange, we can make it more clear by overlaying the lines on top of the raw data and then coloring everything according to `drv`. 340 | 341 | ```{r echo = FALSE, message = FALSE} 342 | ggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = drv)) + 343 | geom_point() + 344 | geom_smooth(mapping = aes(linetype = drv)) 345 | ``` 346 | 347 | Notice that this plot contains two geoms in the same graph! If this makes you excited, buckle up. We will learn how to place multiple geoms in the same plot very soon. 348 | 349 | ggplot2 provides over 30 geoms, and extension packages provide even more (see <https://exts.ggplot2.tidyverse.org/gallery/> for a sampling). The best way to get a comprehensive overview is the ggplot2 cheatsheet, which you can find at <https://posit.co/resources/cheatsheets/>. To learn more about any single geom, use help: `?geom_smooth`. 350 | 351 | Many geoms, like `geom_smooth()`, use a single geometric object to display multiple rows of data. For these geoms, you can set the `group` aesthetic to a categorical variable to draw multiple objects. ggplot2 will draw a separate object for each unique value of the grouping variable. In practice, ggplot2 will automatically group the data for these geoms whenever you map an aesthetic to a discrete variable (as in the `linetype` example). It is convenient to rely on this feature because the group aesthetic by itself does not add a legend or distinguishing features to the geoms.
352 | 353 | ```{r, fig.width = 3, fig.align = 'default', out.width = "33%", message = FALSE} 354 | ggplot(data = mpg) + 355 | geom_smooth(mapping = aes(x = displ, y = hwy)) 356 | 357 | ggplot(data = mpg) + 358 | geom_smooth(mapping = aes(x = displ, y = hwy, group = drv)) 359 | 360 | ggplot(data = mpg) + 361 | geom_smooth( 362 | mapping = aes(x = displ, y = hwy, color = drv), 363 | show.legend = FALSE 364 | ) 365 | ``` 366 | 367 | To display multiple geoms in the same plot, add multiple geom functions to `ggplot()`: 368 | 369 | ```{r, message = FALSE} 370 | ggplot(data = mpg) + 371 | geom_point(mapping = aes(x = displ, y = hwy)) + 372 | geom_smooth(mapping = aes(x = displ, y = hwy)) 373 | ``` 374 | 375 | This, however, introduces some duplication in our code. Imagine if you wanted to change the y-axis to display `cty` instead of `hwy`. You'd need to change the variable in two places, and you might forget to update one. You can avoid this type of repetition by passing a set of mappings to `ggplot()`. ggplot2 will treat these mappings as global mappings that apply to each geom in the graph. In other words, this code will produce the same plot as the previous code: 376 | 377 | ```{r, eval = FALSE} 378 | ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + 379 | geom_point() + 380 | geom_smooth() 381 | ``` 382 | 383 | If you place mappings in a geom function, ggplot2 will treat them as local mappings for the layer. It will use these mappings to extend or overwrite the global mappings _for that layer only_. This makes it possible to display different aesthetics in different layers. 384 | 385 | ```{r, message = FALSE} 386 | ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + 387 | geom_point(mapping = aes(color = class)) + 388 | geom_smooth() 389 | ``` 390 | 391 | You can use the same idea to specify different `data` for each layer. Here, our smooth line displays just a subset of the `mpg` dataset, the subcompact cars. 
The local data argument in `geom_smooth()` overrides the global data argument in `ggplot()` for that layer only. 392 | 393 | ```{r, message = FALSE} 394 | ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + 395 | geom_point(mapping = aes(color = class)) + 396 | geom_smooth(data = filter(mpg, class == "subcompact"), se = FALSE) 397 | ``` 398 | 399 | (You'll learn how `filter()` works in the chapter on data transformations: for now, just know that this command selects only the subcompact cars.) 400 | 401 | ### Exercises 402 | 403 | 1. What geom would you use to draw a line chart? A boxplot? 404 | A histogram? An area chart? 405 | 406 | 1. Run this code in your head and predict what the output will look like. 407 | Then, run the code in R and check your predictions. 408 | 409 | ```{r, eval = FALSE} 410 | ggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = drv)) + 411 | geom_point() + 412 | geom_smooth(se = FALSE) 413 | ``` 414 | 415 | 1. What does `show.legend = FALSE` do? What happens if you remove it? 416 | Why do you think I used it earlier in the chapter? 417 | 418 | 1. What does the `se` argument to `geom_smooth()` do? 419 | 420 | 421 | 1. Will these two graphs look different? Why/why not? 422 | 423 | ```{r, eval = FALSE} 424 | ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + 425 | geom_point() + 426 | geom_smooth() 427 | 428 | ggplot() + 429 | geom_point(data = mpg, mapping = aes(x = displ, y = hwy)) + 430 | geom_smooth(data = mpg, mapping = aes(x = displ, y = hwy)) 431 | ``` 432 | 433 | 1. Recreate the R code necessary to generate the following graphs. 
434 | 435 | ```{r echo = FALSE, fig.width = 3, out.width = "50%", fig.align = "default", message = FALSE} 436 | ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + 437 | geom_point() + 438 | geom_smooth(se = FALSE) 439 | ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + 440 | geom_smooth(aes(group = drv), se = FALSE) + 441 | geom_point() 442 | ggplot(data = mpg, mapping = aes(x = displ, y = hwy, color = drv)) + 443 | geom_point() + 444 | geom_smooth(se = FALSE) 445 | ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + 446 | geom_point(aes(color = drv)) + 447 | geom_smooth(se = FALSE) 448 | ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + 449 | geom_point(aes(color = drv)) + 450 | geom_smooth(aes(linetype = drv), se = FALSE) 451 | ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + 452 | geom_point(size = 4, colour = "white") + 453 | geom_point(aes(colour = drv)) 454 | ``` 455 | 456 | ## Statistical transformations 457 | 458 | Next, let's take a look at a bar chart. Bar charts seem simple, but they are interesting because they reveal something subtle about plots. Consider a basic bar chart, as drawn with `geom_bar()`. The following chart displays the total number of diamonds in the `diamonds` dataset, grouped by `cut`. The `diamonds` dataset comes in ggplot2 and contains information about ~54,000 diamonds, including the `price`, `carat`, `color`, `clarity`, and `cut` of each diamond. The chart shows that more diamonds are available with high quality cuts than with low quality cuts. 459 | 460 | ```{r} 461 | ggplot(data = diamonds) + 462 | geom_bar(mapping = aes(x = cut)) 463 | ``` 464 | 465 | On the x-axis, the chart displays `cut`, a variable from `diamonds`. On the y-axis, it displays count, but count is not a variable in `diamonds`! Where does count come from? Many graphs, like scatterplots, plot the raw values of your dataset. 
Other graphs, like bar charts, calculate new values to plot: 466 | 467 | * bar charts, histograms, and frequency polygons bin your data 468 | and then plot bin counts, the number of points that fall in each bin. 469 | 470 | * smoothers fit a model to your data and then plot predictions from the 471 | model. 472 | 473 | * boxplots compute a robust summary of the distribution and then display a 474 | specially formatted box. 475 | 476 | The algorithm used to calculate new values for a graph is called a __stat__, short for statistical transformation. The figure below describes how this process works with `geom_bar()`. 477 | 478 | ```{r, echo = FALSE, out.width = "100%"} 479 | knitr::include_graphics("images/visualization-stat-bar.png") 480 | ``` 481 | 482 | You can learn which stat a geom uses by inspecting the default value for the `stat` argument. For example, `?geom_bar` shows that the default value for `stat` is "count", which means that `geom_bar()` uses `stat_count()`. `stat_count()` is documented on the same page as `geom_bar()`, and if you scroll down you can find a section called "Computed variables". That describes how it computes two new variables: `count` and `prop`. 483 | 484 | You can generally use geoms and stats interchangeably. For example, you can recreate the previous plot using `stat_count()` instead of `geom_bar()`: 485 | 486 | ```{r} 487 | ggplot(data = diamonds) + 488 | stat_count(mapping = aes(x = cut)) 489 | ``` 490 | 491 | This works because every geom has a default stat; and every stat has a default geom. This means that you can typically use geoms without worrying about the underlying statistical transformation. There are three reasons you might need to use a stat explicitly: 492 | 493 | 1. You might want to override the default stat. In the code below, I change 494 | the stat of `geom_bar()` from count (the default) to identity. This lets 495 | me map the height of the bars to the raw values of a $y$ variable. 
496 | Unfortunately when people talk about bar charts casually, they might be 497 | referring to this type of bar chart, where the height of the bar is already 498 | present in the data, or the previous bar chart where the height of the bar 499 | is generated by counting rows. 500 | 501 | ```{r, warning = FALSE} 502 | demo <- tribble( 503 | ~cut, ~freq, 504 | "Fair", 1610, 505 | "Good", 4906, 506 | "Very Good", 12082, 507 | "Premium", 13791, 508 | "Ideal", 21551 509 | ) 510 | 511 | ggplot(data = demo) + 512 | geom_bar(mapping = aes(x = cut, y = freq), stat = "identity") 513 | ``` 514 | 515 | (Don't worry that you haven't seen `<-` or `tribble()` before. You might be 516 | able to guess at their meaning from the context, and you'll learn exactly 517 | what they do soon!) 518 | 519 | 1. You might want to override the default mapping from transformed variables 520 | to aesthetics. For example, you might want to display a bar chart of 521 | proportion, rather than count: 522 | 523 | ```{r} 524 | ggplot(data = diamonds) + 525 | geom_bar(mapping = aes(x = cut, y = ..prop.., group = 1)) 526 | ``` 527 | 528 | To find the variables computed by the stat, look for the help section 529 | titled "computed variables". 530 | 531 | 1. You might want to draw greater attention to the statistical transformation 532 | in your code. For example, you might use `stat_summary()`, which 533 | summarises the y values for each unique x value, to draw 534 | attention to the summary that you're computing: 535 | 536 | ```{r} 537 | ggplot(data = diamonds) + 538 | stat_summary( 539 | mapping = aes(x = cut, y = depth), 540 | fun.ymin = min, 541 | fun.ymax = max, 542 | fun.y = median 543 | ) 544 | ``` 545 | 546 | ggplot2 provides over 20 stats for you to use. Each stat is a function, so you can get help in the usual way, e.g. `?stat_bin`. To see a complete list of stats, try the ggplot2 cheatsheet. 547 | 548 | ### Exercises 549 | 550 | 1. What is the default geom associated with `stat_summary()`? 
How could 551 | you rewrite the previous plot to use that geom function instead of the 552 | stat function? 553 | 554 | 1. What does `geom_col()` do? How is it different to `geom_bar()`? 555 | 556 | 1. Most geoms and stats come in pairs that are almost always used in 557 | concert. Read through the documentation and make a list of all the 558 | pairs. What do they have in common? 559 | 560 | 1. What variables does `stat_smooth()` compute? What parameters control 561 | its behaviour? 562 | 563 | 1. In our proportion bar chart, we need to set `group = 1`. Why? In other 564 | words what is the problem with these two graphs? 565 | 566 | ```{r, eval = FALSE} 567 | ggplot(data = diamonds) + 568 | geom_bar(mapping = aes(x = cut, y = ..prop..)) 569 | ggplot(data = diamonds) + 570 | geom_bar(mapping = aes(x = cut, fill = color, y = ..prop..)) 571 | ``` 572 | 573 | 574 | ## Position adjustments 575 | 576 | There's one more piece of magic associated with bar charts. You can colour a bar chart using either the `colour` aesthetic, or, more usefully, `fill`: 577 | 578 | ```{r out.width = "50%", fig.align = "default"} 579 | ggplot(data = diamonds) + 580 | geom_bar(mapping = aes(x = cut, colour = cut)) 581 | ggplot(data = diamonds) + 582 | geom_bar(mapping = aes(x = cut, fill = cut)) 583 | ``` 584 | 585 | Note what happens if you map the fill aesthetic to another variable, like `clarity`: the bars are automatically stacked. Each colored rectangle represents a combination of `cut` and `clarity`. 586 | 587 | ```{r} 588 | ggplot(data = diamonds) + 589 | geom_bar(mapping = aes(x = cut, fill = clarity)) 590 | ``` 591 | 592 | The stacking is performed automatically by the __position adjustment__ specified by the `position` argument. If you don't want a stacked bar chart, you can use one of three other options: `"identity"`, `"dodge"` or `"fill"`. 593 | 594 | * `position = "identity"` will place each object exactly where it falls in 595 | the context of the graph. 
This is not very useful for bars, because it 596 | overlaps them. To see that overlapping we either need to make the bars 597 | slightly transparent by setting `alpha` to a small value, or completely 598 | transparent by setting `fill = NA`. 599 | 600 | ```{r out.width = "50%", fig.align = "default"} 601 | ggplot(data = diamonds, mapping = aes(x = cut, fill = clarity)) + 602 | geom_bar(alpha = 1/5, position = "identity") 603 | ggplot(data = diamonds, mapping = aes(x = cut, colour = clarity)) + 604 | geom_bar(fill = NA, position = "identity") 605 | ``` 606 | 607 | The identity position adjustment is more useful for 2d geoms, like points, 608 | where it is the default. 609 | 610 | * `position = "fill"` works like stacking, but makes each set of stacked bars 611 | the same height. This makes it easier to compare proportions across 612 | groups. 613 | 614 | ```{r} 615 | ggplot(data = diamonds) + 616 | geom_bar(mapping = aes(x = cut, fill = clarity), position = "fill") 617 | ``` 618 | 619 | * `position = "dodge"` places overlapping objects directly _beside_ one 620 | another. This makes it easier to compare individual values. 621 | 622 | ```{r} 623 | ggplot(data = diamonds) + 624 | geom_bar(mapping = aes(x = cut, fill = clarity), position = "dodge") 625 | ``` 626 | 627 | There's one other type of adjustment that's not useful for bar charts, but it can be very useful for scatterplots. Recall our first scatterplot. Did you notice that the plot displays only 126 points, even though there are 234 observations in the dataset? 628 | 629 | ```{r echo = FALSE} 630 | ggplot(data = mpg) + 631 | geom_point(mapping = aes(x = displ, y = hwy)) 632 | ``` 633 | 634 | The values of `hwy` and `displ` are rounded so the points appear on a grid and many points overlap each other. This problem is known as __overplotting__. This arrangement makes it hard to see where the mass of the data is. 
Are the data points spread equally throughout the graph, or is there one special combination of `hwy` and `displ` that contains 109 values? 635 | 636 | You can avoid this gridding by setting the position adjustment to "jitter". `position = "jitter"` adds a small amount of random noise to each point. This spreads the points out because no two points are likely to receive the same amount of random noise. 637 | 638 | ```{r} 639 | ggplot(data = mpg) + 640 | geom_point(mapping = aes(x = displ, y = hwy), position = "jitter") 641 | ``` 642 | 643 | Adding randomness seems like a strange way to improve your plot, but while it makes your graph less accurate at small scales, it makes your graph _more_ revealing at large scales. Because this is such a useful operation, ggplot2 comes with a shorthand for `geom_point(position = "jitter")`: `geom_jitter()`. 644 | 645 | To learn more about a position adjustment, look up the help page associated with each adjustment: `?position_dodge`, `?position_fill`, `?position_identity`, `?position_jitter`, and `?position_stack`. 646 | 647 | ### Exercises 648 | 649 | 1. What is the problem with this plot? How could you improve it? 650 | 651 | ```{r} 652 | ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) + 653 | geom_point() 654 | ``` 655 | 656 | 1. What parameters to `geom_jitter()` control the amount of jittering? 657 | 658 | 1. Compare and contrast `geom_jitter()` with `geom_count()`. 659 | 660 | 1. What's the default position adjustment for `geom_boxplot()`? Create 661 | a visualisation of the `mpg` dataset that demonstrates it. 662 | 663 | ## Coordinate systems 664 | 665 | Coordinate systems are probably the most complicated part of ggplot2. The default coordinate system is the Cartesian coordinate system where the x and y positions act independently to determine the location of each point. There are a number of other coordinate systems that are occasionally helpful. 666 | 667 | * `coord_flip()` switches the x and y axes. 
This is useful (for example), 668 | if you want horizontal boxplots. It's also useful for long labels: it's 669 | hard to get them to fit without overlapping on the x-axis. 670 | 671 | ```{r fig.width = 3, out.width = "50%", fig.align = "default"} 672 | ggplot(data = mpg, mapping = aes(x = class, y = hwy)) + 673 | geom_boxplot() 674 | ggplot(data = mpg, mapping = aes(x = class, y = hwy)) + 675 | geom_boxplot() + 676 | coord_flip() 677 | ``` 678 | 679 | * `coord_quickmap()` sets the aspect ratio correctly for maps. This is very 680 | important if you're plotting spatial data with ggplot2 (which unfortunately 681 | we don't have the space to cover in this book). 682 | 683 | ```{r fig.width = 3, out.width = "50%", fig.align = "default", message = FALSE} 684 | nz <- map_data("nz") 685 | 686 | ggplot(nz, aes(long, lat, group = group)) + 687 | geom_polygon(fill = "white", colour = "black") 688 | 689 | ggplot(nz, aes(long, lat, group = group)) + 690 | geom_polygon(fill = "white", colour = "black") + 691 | coord_quickmap() 692 | ``` 693 | 694 | * `coord_polar()` uses polar coordinates. Polar coordinates reveal an 695 | interesting connection between a bar chart and a Coxcomb chart. 696 | 697 | ```{r fig.width = 3, out.width = "50%", fig.align = "default", fig.asp = 1} 698 | bar <- ggplot(data = diamonds) + 699 | geom_bar( 700 | mapping = aes(x = cut, fill = cut), 701 | show.legend = FALSE, 702 | width = 1 703 | ) + 704 | theme(aspect.ratio = 1) + 705 | labs(x = NULL, y = NULL) 706 | 707 | bar + coord_flip() 708 | bar + coord_polar() 709 | ``` 710 | 711 | ### Exercises 712 | 713 | 1. Turn a stacked bar chart into a pie chart using `coord_polar()`. 714 | 715 | 1. What does `labs()` do? Read the documentation. 716 | 717 | 1. What's the difference between `coord_quickmap()` and `coord_map()`? 718 | 719 | 1. What does the plot below tell you about the relationship between city 720 | and highway mpg? Why is `coord_fixed()` important? What does 721 | `geom_abline()` do? 
722 | 723 | ```{r, fig.asp = 1, out.width = "50%"} 724 | ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) + 725 | geom_point() + 726 | geom_abline() + 727 | coord_fixed() 728 | ``` 729 | 730 | ## The layered grammar of graphics 731 | 732 | In the previous sections, you learned much more than how to make scatterplots, bar charts, and boxplots. You learned a foundation that you can use to make _any_ type of plot with ggplot2. To see this, let's add position adjustments, stats, coordinate systems, and faceting to our code template: 733 | 734 | ``` 735 | ggplot(data = <DATA>) + 736 | <GEOM_FUNCTION>( 737 | mapping = aes(<MAPPINGS>), 738 | stat = <STAT>, 739 | position = <POSITION> 740 | ) + 741 | <COORDINATE_FUNCTION> + 742 | <FACET_FUNCTION> 743 | ``` 744 | 745 | Our new template takes seven parameters, the bracketed words that appear in the template. In practice, you rarely need to supply all seven parameters to make a graph because ggplot2 will provide useful defaults for everything except the data, the mappings, and the geom function. 746 | 747 | The seven parameters in the template compose the grammar of graphics, a formal system for building plots. The grammar of graphics is based on the insight that you can uniquely describe _any_ plot as a combination of a dataset, a geom, a set of mappings, a stat, a position adjustment, a coordinate system, and a faceting scheme. 748 | 749 | To see how this works, consider how you could build a basic plot from scratch: you could start with a dataset and then transform it into the information that you want to display (with a stat). 750 | 751 | ```{r, echo = FALSE, out.width = "100%"} 752 | knitr::include_graphics("images/visualization-grammar-1.png") 753 | ``` 754 | 755 | Next, you could choose a geometric object to represent each observation in the transformed data. You could then use the aesthetic properties of the geoms to represent variables in the data. You would map the values of each variable to the levels of an aesthetic.
756 | 757 | ```{r, echo = FALSE, out.width = "100%"} 758 | knitr::include_graphics("images/visualization-grammar-2.png") 759 | ``` 760 | 761 | You'd then select a coordinate system to place the geoms into. You'd use the location of the objects (which is itself an aesthetic property) to display the values of the x and y variables. At that point, you would have a complete graph, but you could further adjust the positions of the geoms within the coordinate system (a position adjustment) or split the graph into subplots (faceting). You could also extend the plot by adding one or more additional layers, where each additional layer uses a dataset, a geom, a set of mappings, a stat, and a position adjustment. 762 | 763 | ```{r, echo = FALSE, out.width = "100%"} 764 | knitr::include_graphics("images/visualization-grammar-3.png") 765 | ``` 766 | 767 | You could use this method to build _any_ plot that you imagine. In other words, you can use the code template that you've learned in this chapter to build hundreds of thousands of unique plots. 768 | -------------------------------------------------------------------------------- /output/r4ds-python-plotnine.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Plotnine: Grammar of Graphics for Python" 3 | output: 4 | html_notebook: 5 | fig_retina: 2 6 | md_document: 7 | variant: markdown_github 8 | preserve_yaml: true 9 | fig_retina: 2 10 | --- 11 | 12 | 13 | ```{r setup, echo=FALSE} 14 | library(reticulate) 15 | use_virtualenv("../venv", required = TRUE) 16 | knitr::opts_chunk$set(fig.show = "hold", 17 | fig.align = "center", 18 | out.extra = "class=\"md:w-3/4 lg:w-2/3\"", 19 | dpi = 300, 20 | comment = "") 21 | ``` 22 | 23 | ## Preface 24 | 25 | [plotnine](https://github.com/has2k1/plotnine) is a data visualisation package for Python based on the grammar of graphics, created by Hassan Kibirige. 
Its API is similar to [ggplot2](https://ggplot2.tidyverse.org/), a widely successful R package by [Hadley Wickham and others](https://ggplot2.tidyverse.org/authors.html).[^1] 26 | 27 | I'm a staunch proponent of ggplot2. The underlying grammar of graphics is accompanied by a consistent API that allows you to quickly and iteratively create different types of beautiful data visualisations while rarely having to consult the documentation. A welcoming set of properties when doing exploratory data analysis. 28 | 29 | I must admit that I haven't tried every data visualisation package there is for Python, but when it comes to the most popular ones, I personally find them either 30 | convenient but limited ([pandas](https://pandas.pydata.org/pandas-docs/stable/user_guide/visualization.html)), 31 | flexible but complicated ([matplotlib](https://matplotlib.org/)), or 32 | beautiful but inconsistent ([seaborn](https://seaborn.pydata.org/)). Your mileage may vary. plotnine, on the other hand, shows a lot of promise. 33 | I estimate it currently has a 95% coverage of ggplot2's functionality, and it's still actively being developed. 34 | All in all, as someone who uses both R and Python, I'm very pleased to be able to transfer my ggplot2 knowledge to the Python ecosystem. 35 | 36 | I figured that plotnine could use a good tutorial so that perhaps more Pythonistas would give this package a shot. 37 | Instead of writing one from scratch, I turned to the, in my opinion, best free tutorial for ggplot2: [R for Data Science](https://r4ds.had.co.nz/) by Hadley Wickham and Garrett Grolemund, published by O'Reilly Media in 2016. 38 | 39 | ```{r, echo=FALSE, dev="png", out.extra="class=\"p-4 w-1/2 md:w-1/3 lg:w-1/4\""} 40 | knitr::include_graphics("images/cover.png") 41 | ``` 42 | 43 | All I had to do was translate[^2] the visualization chapters (chapter 3 and 28) from R and ggplot2 to Python and plotnine. 
44 | I would like to thank Hadley, Garrett, and O'Reilly Media, for granting me permission to do so. 45 | Translating an existing text is quicker than writing a new one, and has the benefit that it becomes possible to compare both the syntax and coverage of plotnine to ggplot2. 46 | 47 | However, while quicker, translating is not always straightforward. 48 | I have tried to change as little as possible to the original text while making sure that the text and the code are still in sync. 49 | In case any errors or falsehoods have been introduced due to translation, then I'm the one to blame. 50 | For example, to the best of my knowledge, neither author has made any claims about plotnine. 51 | If you find such an error and think it is fixable, then it would be greatly appreciated if you'd let me know by [creating an issue on Github](https://github.com/datascienceworkshops/r4ds-python-plotnine/issues). Thank you. 52 | The section numbers in this tutorial link back to the corresponding section of the original text, in case you want to compare them.[^3] Only this preface and the few footnotes scattered among the text are entirely mine. 53 | 54 | This tutorial is also available as a [Jupyter notebook](https://github.com/datascienceworkshops/r4ds-python-plotnine/blob/master/output/r4ds-python-plotnine.ipynb) and an [R notebook](https://github.com/datascienceworkshops/r4ds-python-plotnine/blob/master/output/r4ds-python-plotnine.Rmd) in case you want to follow along. 55 | If you clone the [Github repository](https://github.com/datascienceworkshops/r4ds-python-plotnine) then you can find the notebooks in the `output` directory. 56 | The [README](https://github.com/datascienceworkshops/r4ds-python-plotnine/blob/master/README.md) contains instructions on how to run the notebooks.
57 | The Jupyter notebook is also available on [Binder](https://mybinder.org/v2/gh/datascienceworkshops/r4ds-python-plotnine/master?filepath=output%2Fr4ds-python-plotnine.ipynb), but keep in mind that the interactive version may take a while to launch. 58 | 59 | Without further ado, let's start learning about plotnine! 60 | 61 | Cheers, 62 | 63 | Jeroen 64 | 65 | 66 | # [3](https://r4ds.had.co.nz/data-visualisation.html)   Data visualisation 67 | 68 | ## [3.1](https://r4ds.had.co.nz/data-visualisation.html#introduction-1)   Introduction 69 | 70 | > "The simple graph has brought more information to the data analyst’s mind 71 | > than any other device." --- John Tukey 72 | 73 | This tutorial will teach you how to visualise your data using plotnine. Python has many packages for making graphs, but plotnine is one of the most elegant and most versatile. plotnine implements the __grammar of graphics__, a coherent system for describing and building graphs. With plotnine, you can do more faster by learning one system and applying it in many places. 74 | 75 | If you'd like to learn more about the theoretical underpinnings of plotnine before you start, I'd recommend reading [The Layered Grammar of Graphics](http://vita.had.co.nz/papers/layered-grammar.pdf). 76 | 77 | ### [3.1.1](https://r4ds.had.co.nz/data-visualisation.html#prerequisites-1)   Prerequisites 78 | 79 | This tutorial focusses on plotnine. We'll also use a little numpy and pandas for data manipulation. 
To access the datasets, help pages, and functions that we will use in this tutorial, import[^4] the necessary packages by running this code: 80 | 81 | ```{python import} 82 | from plotnine import * 83 | from plotnine.data import * 84 | 85 | import numpy as np 86 | import pandas as pd 87 | ``` 88 | 89 | ```{python settings, echo=FALSE} 90 | import warnings 91 | pd.set_option("display.max_rows", 10) 92 | 93 | from matplotlib import rcParams 94 | rcParams.update({"figure.max_open_warning": 0, 95 | "savefig.bbox": "tight"}) 96 | 97 | theme_set(theme_gray(base_size=12)) 98 | ``` 99 | 100 | 101 | If you run this code and get the error message `ModuleNotFoundError: No module named 'plotnine'`, you'll need to first install it[^5], then run the code once again. 102 | 103 | ``` 104 | ! pip install plotnine[all] 105 | ``` 106 | 107 | You only need to install a package once, but you need to import it every time you run your script or (re)start the kernel. 108 | 109 | 110 | ## [3.2](https://r4ds.had.co.nz/data-visualisation.html#first-steps)   First steps 111 | 112 | Let's use our first graph to answer a question: Do cars with big engines use more fuel than cars with small engines? You probably already have an answer, but try to make your answer precise. What does the relationship between engine size and fuel efficiency look like? Is it positive? Negative? Linear? Nonlinear? 113 | 114 | ### [3.2.1](https://r4ds.had.co.nz/data-visualisation.html#the-mpg-data-frame)   The `mpg` DataFrame 115 | 116 | You can test your answer with the `mpg` DataFrame found in `plotnine.data`. A DataFrame is a rectangular collection of variables (in the columns) and observations (in the rows). `mpg` contains observations collected by the US Environmental Protection Agency on 38 models of car. 117 | 118 | ```{python} 119 | mpg 120 | ``` 121 | 122 | Among the variables in `mpg` are: 123 | 124 | 1. `displ`, a car's engine size, in litres. 125 | 126 | 1. 
`hwy`, a car's fuel efficiency on the highway, in miles per gallon (mpg). 127 | A car with a low fuel efficiency consumes more fuel than a car with a high 128 | fuel efficiency when they travel the same distance. 129 | 130 | To learn more about `mpg`, open its help page by running `?mpg`. 131 | 132 | ### [3.2.2](https://r4ds.had.co.nz/data-visualisation.html#creating-a-ggplot)   Creating a ggplot 133 | 134 | To plot `mpg`, run this code[^6] to put `displ` on the x-axis and `hwy` on the y-axis: 135 | 136 | ```{python} 137 | ggplot(data=mpg) +\ 138 | geom_point(mapping=aes(x="displ", y="hwy")) 139 | ``` 140 | 141 | The plot shows a negative relationship between engine size (`displ`) and fuel efficiency (`hwy`). In other words, cars with big engines use more fuel. Does this confirm or refute your hypothesis about fuel efficiency and engine size? 142 | 143 | With plotnine, you begin a plot with the function `ggplot()`. `ggplot()` creates a coordinate system that you can add layers to. The first argument of `ggplot()` is the dataset to use in the graph. So `ggplot(data=mpg)` creates an empty graph, but it's not very interesting so I'm not going to show it here. 144 | 145 | You complete your graph by adding one or more layers to `ggplot()`. The function `geom_point()` adds a layer of points to your plot, which creates a scatterplot. plotnine comes with many geom functions that each add a different type of layer to a plot. You'll learn a whole bunch of them throughout this tutorial. 146 | 147 | Each geom function in plotnine takes a `mapping` argument. This defines how variables in your dataset are mapped to visual properties. The `mapping` argument is always paired with `aes()`, and the `x` and `y` arguments of `aes()` specify which variables to map to the x and y axes. plotnine looks for the mapped variables in the `data` argument, in this case, `mpg`. 
148 | 149 | ### [3.2.3](https://r4ds.had.co.nz/data-visualisation.html#a-graphing-template)   A graphing template 150 | 151 | Let's turn this code into a reusable template for making graphs with plotnine. To make a graph, replace the bracketed sections in the code below with a dataset, a geom function, or a collection of mappings. 152 | 153 | ``` 154 | ggplot(data=<DATA>) +\ 155 | <GEOM_FUNCTION>(mapping=aes(<MAPPINGS>)) 156 | ``` 157 | 158 | The rest of this tutorial will show you how to complete and extend this template to make different types of graphs. We will begin with the `<MAPPINGS>` component. 159 | 160 | ### [3.2.4](https://r4ds.had.co.nz/data-visualisation.html#exercises)   Exercises 161 | 162 | 1. Run `ggplot(data=mpg)`. What do you see? 163 | 164 | 1. How many rows are in `mpg`? How many columns? 165 | 166 | 1. What does the `drv` variable describe? Read the help for `?mpg` to find 167 | out. 168 | 169 | 1. Make a scatterplot of `hwy` vs `cyl`. 170 | 171 | 1. What happens if you make a scatterplot of `class` vs `drv`? Why is 172 | the plot not useful? 173 | 174 | ## [3.3](https://r4ds.had.co.nz/data-visualisation.html#aesthetic-mappings)   Aesthetic mappings 175 | 176 | > "The greatest value of a picture is when it forces us to notice what we 177 | > never expected to see." --- John Tukey 178 | 179 | In the plot below, one group of points (highlighted in red) seems to fall outside of the linear trend. These cars have a higher mileage than you might expect. How can you explain these cars? 180 | 181 | 182 | ```{python, echo=FALSE} 183 | ggplot(data=mpg, mapping=aes(x="displ", y="hwy")) +\ 184 | geom_point() +\ 185 | geom_point(data=mpg.query("displ > 5 & hwy > 20"), colour="red", size=2.2) 186 | ``` 187 | 188 | Let's hypothesize that the cars are hybrids. One way to test this hypothesis is to look at the `class` value for each car. The `class` variable of the `mpg` dataset classifies cars into groups such as compact, midsize, and SUV. 
If the outlying points are hybrids, they should be classified as compact cars or, perhaps, subcompact cars (keep in mind that this data was collected before hybrid trucks and SUVs became popular). 189 | 190 | You can add a third variable, like `class`, to a two dimensional scatterplot by mapping it to an __aesthetic__. An aesthetic is a visual property of the objects in your plot. Aesthetics include things like the size, the shape, or the color of your points. You can display a point (like the one below) in different ways by changing the values of its aesthetic properties. Since we already use the word "value" to describe data, let's use the word "level" to describe aesthetic properties. Here we change the levels of a point's size, shape, and color to make the point small, triangular, or blue: 191 | 192 | 193 | ```{python, echo=FALSE} 194 | def no_labels(x) : 195 | return [""] * len(x) 196 | 197 | df = pd.DataFrame({"x": [1, 2, 3, 4], 198 | "y": [1, 1, 1, 1], 199 | "size": [20, 10, 20, 20], 200 | "shape": ["o", "o", "^", "o"], 201 | "color": ["black", "black", "black", "blue"]}) 202 | ggplot(df, aes("x", "y", size="size", shape="shape", color="color")) +\ 203 | geom_point() +\ 204 | scale_x_continuous(limits=(0.5, 4.5), labels=no_labels) +\ 205 | scale_y_continuous(limits=(0.9, 1.1), labels=no_labels) +\ 206 | scale_size_identity() +\ 207 | scale_shape_identity() +\ 208 | scale_color_identity() +\ 209 | labs(x=None, y=None) +\ 210 | theme(aspect_ratio=1/3) 211 | ``` 212 | 213 | 214 | You can convey information about your data by mapping the aesthetics in your plot to the variables in your dataset. For example, you can map the colors of your points to the `class` variable to reveal the class of each car. 215 | 216 | ```{python} 217 | ggplot(data=mpg) +\ 218 | geom_point(mapping=aes(x="displ", y="hwy", color="class")) 219 | ``` 220 | 221 | (If you prefer British English, like Hadley, you can use `colour` instead of `color`.) 
222 | 223 | To map an aesthetic to a variable, associate the name of the aesthetic to the name of the variable inside `aes()`. plotnine will automatically assign a unique level of the aesthetic (here a unique color) to each unique value of the variable, a process known as __scaling__. plotnine will also add a legend that explains which levels correspond to which values. 224 | 225 | The colors reveal that many of the unusual points are two-seater cars. These cars don't seem like hybrids, and are, in fact, sports cars! Sports cars have large engines like SUVs and pickup trucks, but small bodies like midsize and compact cars, which improves their gas mileage. In hindsight, these cars were unlikely to be hybrids since they have large engines. 226 | 227 | In the above example, we mapped `class` to the color aesthetic, but we could have mapped `class` to the size aesthetic in the same way. In this case, the exact size of each point would reveal its class affiliation. We get a _warning_ here, because mapping an unordered variable (`class`) to an ordered aesthetic (`size`) is not a good idea. 
228 | 229 | ```{python} 230 | ggplot(data=mpg) +\ 231 | geom_point(mapping=aes(x="displ", y="hwy", size="class")) 232 | ``` 233 | 234 | Similarly, we could have mapped `manufacturer` to the _alpha_ aesthetic, which controls the transparency of the points, or to the _shape_ aesthetic, which controls the shape of the points.[^7] 235 | 236 | ```{python echo=FALSE} 237 | warnings.filterwarnings("ignore") 238 | ``` 239 | 240 | 241 | 242 | ```{python, eval=FALSE} 243 | # Left 244 | ggplot(data=mpg) +\ 245 | geom_point(mapping=aes(x="displ", y="hwy", alpha="manufacturer")) 246 | 247 | # Right 248 | ggplot(data=mpg) +\ 249 | geom_point(mapping=aes(x="displ", y="hwy", shape="manufacturer")) 250 | ``` 251 | 252 | ```{python echo=FALSE, out.extra = ""} 253 | ggplot(data=mpg) +\ 254 | geom_point(mapping=aes(x="displ", y="hwy", alpha="manufacturer")) 255 | ``` 256 | ```{python echo=FALSE, out.extra = ""} 257 | ggplot(data=mpg) +\ 258 | geom_point(mapping=aes(x="displ", y="hwy", shape="manufacturer")) 259 | ``` 260 | 261 | What happened to Toyota and Volkswagen? plotnine will only use 13 shapes at a time. By default, additional groups will go unplotted when you use the shape aesthetic. 262 | 263 | For each aesthetic, you use `aes()` to associate the name of the aesthetic with a variable to display. The `aes()` function gathers together each of the aesthetic mappings used by a layer and passes them to the layer's mapping argument. The syntax highlights a useful insight about `x` and `y`: the x and y locations of a point are themselves aesthetics, visual properties that you can map to variables to display information about the data. 264 | 265 | Once you map an aesthetic, plotnine takes care of the rest. It selects a reasonable scale to use with the aesthetic, and it constructs a legend that explains the mapping between levels and values. For x and y aesthetics, plotnine does not create a legend, but it creates an axis line with tick marks and a label. 
The axis line acts as a legend; it explains the mapping between locations and values. 266 | 267 | You can also _set_ the aesthetic properties of your geom manually. For example, we can make all of the points in our plot blue: 268 | 269 | ```{python} 270 | ggplot(data=mpg) +\ 271 | geom_point(mapping=aes(x="displ", y="hwy"), color="blue") 272 | ``` 273 | 274 | Here, the color doesn't convey information about a variable, but only changes the appearance of the plot. To set an aesthetic manually, set the aesthetic by name as an argument of your geom function; i.e. it goes _outside_ of `aes()`. You'll need to pick a level that makes sense for that aesthetic: 275 | 276 | * The name of a color as a string. 277 | * The size of a point in mm. 278 | * The shape of a point as a character or number, as shown below. 279 | 280 | ```{python shapes, echo=FALSE} 281 | shapes = [".",",","o","v","^","<",">","1","2","3","4","8","s","p","P","*","h","H","+","x","X","D","d","|","_"] + list(range(12)) 282 | labels = ["'" + s + "'" for s in shapes[:25]] + [str(s) for s in range(12)] 283 | 284 | df_shapes = pd.DataFrame({"x": [x // 6 for x in range(len(shapes))], 285 | "y": [-(y % 6) for y in range(len(shapes))], 286 | "shape": shapes, 287 | "label": labels}) 288 | 289 | ggplot(df_shapes, aes("x", "y", shape="shape")) +\ 290 | geom_point(size=5) +\ 291 | geom_text(aes(label="label"), nudge_x=0.4, size=10, color='grey') +\ 292 | scale_shape_identity() +\ 293 | theme_void() +\ 294 | theme(aspect_ratio=1/2.75) 295 | ``` 296 | 297 | ### [3.3.1](https://r4ds.had.co.nz/data-visualisation.html#exercises-1)   Exercises[^8] 298 | 299 | 1. Which variables in `mpg` are categorical? Which variables are continuous? 300 | (Hint: type `?mpg` to read the documentation for the dataset). How 301 | can you see this information when you run `mpg`? 302 | 303 | 1. Map a continuous variable to `color`, `size`, and `shape`. How do 304 | these aesthetics behave differently for categorical vs. 
continuous 305 | variables? 306 | 307 | 1. What happens if you map the same variable to multiple aesthetics? 308 | 309 | 1. What does the `stroke` aesthetic do? What shapes does it work with? 310 | (Hint: use `?geom_point`) 311 | 312 | 1. What happens if you map an aesthetic to something other than a variable 313 | name, like `aes(colour="displ < 5")`? Note, you'll also need to specify x and y. 314 | 315 | 316 | ## [3.4](https://r4ds.had.co.nz/data-visualisation.html#common-problems)   Common problems 317 | 318 | As you start to run Python code, you're likely to run into problems. Don't worry --- it happens to everyone. I have been writing Python code for years, and every day I still write code that doesn't work! 319 | 320 | Start by carefully comparing the code that you're running to the code in the book. Python is extremely picky, and a misplaced character can make all the difference. Make sure that every `(` is matched with a `)` and every `"` is paired with another `"`. 321 | 322 | One common problem when creating plotnine graphics is to forget the `\`: it has to come at the end of the line. In other words, make sure you haven't accidentally written code like this: 323 | 324 | ``` 325 | ggplot(data=mpg) + 326 | geom_point(mapping=aes(x="displ", y="hwy")) 327 | ``` 328 | 329 | Alternatively, if you wrap the entire expression in parentheses then you can leave out the `\`: 330 | 331 | ``` 332 | (ggplot(data=mpg) + 333 | geom_point(mapping=aes(x="displ", y="hwy"))) 334 | ``` 335 | 336 | If you're still stuck, try the help. You can get help about any Python function by running `?function_name`. Don't worry if the help doesn't seem that helpful - instead skip down to the examples and look for code that matches what you're trying to do. 337 | 338 | If that doesn't help, carefully read the error message. Sometimes the answer will be buried there! But when you're new to Python, the answer might be in the error message but you don't yet know how to understand it. 
Another great tool is Google: try googling the error message, as it's likely someone else has had the same problem, and has gotten help online. 339 | 340 | ## [3.5](https://r4ds.had.co.nz/data-visualisation.html#facets)   Facets 341 | 342 | One way to add additional variables is with aesthetics. Another way, particularly useful for categorical variables, is to split your plot into __facets__, subplots that each display one subset of the data. 343 | 344 | To facet your plot by a single variable, use `facet_wrap()`. The first argument of `facet_wrap()` should be a formula, which you create with `~` followed by a variable name (here "formula" is the name of a data structure in Python, not a synonym for "equation"). The variable that you pass to `facet_wrap()` should be discrete. 345 | 346 | ```{python} 347 | ggplot(data=mpg) +\ 348 | geom_point(mapping=aes(x="displ", y="hwy")) +\ 349 | facet_wrap("class", nrow=2) 350 | ``` 351 | 352 | To facet your plot on the combination of two variables, add `facet_grid()` to your plot call. The first argument of `facet_grid()` is also a formula. This time the formula should contain two variable names separated by a `~`. 353 | 354 | ```{python} 355 | ggplot(data=mpg) +\ 356 | geom_point(mapping=aes(x="displ", y="hwy")) +\ 357 | facet_grid("drv ~ cyl") 358 | ``` 359 | 360 | If you prefer to not facet in the rows or columns dimension, use a `.` instead of a variable name, e.g. `+ facet_grid(". ~ cyl")`. 361 | 362 | 363 | ### [3.5.1](https://r4ds.had.co.nz/data-visualisation.html#exercises-2)   Exercises 364 | 365 | 1. What happens if you facet on a continuous variable? 366 | 367 | 1. What do the empty cells in plot with `facet_grid("drv ~ cyl")` mean? 368 | How do they relate to this plot? 369 | 370 | ```{python, eval=FALSE} 371 | ggplot(data=mpg) +\ 372 | geom_point(mapping=aes(x="drv", y="cyl")) 373 | ``` 374 | 375 | 1. What plots does the following code make? What does `.` do? 
376 | 377 | ```{python eval=FALSE} 378 | ggplot(data=mpg) +\ 379 | geom_point(mapping=aes(x="displ", y="hwy")) +\ 380 | facet_grid("drv ~ .") 381 | 382 | ggplot(data=mpg) +\ 383 | geom_point(mapping=aes(x="displ", y="hwy")) +\ 384 | facet_grid(". ~ cyl") 385 | ``` 386 | 387 | 1. Take the first faceted plot in this section: 388 | 389 | ```{python, eval=FALSE} 390 | ggplot(data=mpg) +\ 391 | geom_point(mapping=aes(x="displ", y="hwy")) +\ 392 | facet_wrap("class", nrow=2) 393 | ``` 394 | 395 | What are the advantages to using faceting instead of the colour aesthetic? 396 | What are the disadvantages? How might the balance change if you had a 397 | larger dataset? 398 | 399 | 1. Read `?facet_wrap`. What does `nrow` do? What does `ncol` do? What other 400 | options control the layout of the individual panels? Why doesn't `facet_grid()` have `nrow` and `ncol` arguments? 401 | 402 | 1. When using `facet_grid()` you should usually put the variable with more 403 | unique levels in the columns. Why? 404 | 405 | 406 | 407 | ## [3.6](https://r4ds.had.co.nz/data-visualisation.html#geometric-objects)   Geometric objects 408 | 409 | How are these two plots similar? 410 | 411 | 412 | 413 | ```{python echo=FALSE, out.extra = ""} 414 | ggplot(data=mpg) +\ 415 | geom_point(mapping=aes(x="displ", y="hwy")) 416 | ``` 417 | ```{python echo=FALSE, out.extra = ""} 418 | ggplot(data=mpg) +\ 419 | geom_smooth(mapping=aes(x="displ", y="hwy")) 420 | ``` 421 | 422 | Both plots contain the same x variable, the same y variable, and both describe the same data. But the plots are not identical. Each plot uses a different visual object to represent the data. In plotnine syntax, we say that they use different __geoms__. 423 | 424 | A __geom__ is the geometrical object that a plot uses to represent data. People often describe plots by the type of geom that the plot uses. For example, bar charts use bar geoms, line charts use line geoms, boxplots use boxplot geoms, and so on. 
Scatterplots break the trend; they use the point geom. As we see above, you can use different geoms to plot the same data. The plot on the left uses the point geom, and the plot on the right uses the smooth geom, a smooth line fitted to the data. 425 | 426 | To change the geom in your plot, change the geom function that you add to `ggplot()`. 427 | For instance, to make the plots above, you can use this code: 428 | 429 | ```{python eval=FALSE} 430 | # Left 431 | ggplot(data=mpg) +\ 432 | geom_point(mapping=aes(x="displ", y="hwy")) 433 | 434 | # Right 435 | ggplot(data=mpg) +\ 436 | geom_smooth(mapping=aes(x="displ", y="hwy")) 437 | ``` 438 | 439 | Every geom function in plotnine takes a `mapping` argument. However, not every aesthetic works with every geom. You could set the shape of a point, but you couldn't set the "shape" of a line. On the other hand, you _could_ set the linetype of a line. `geom_smooth()` will draw a different line, with a different linetype, for each unique value of the variable that you map to linetype. 440 | 441 | ```{python} 442 | ggplot(data=mpg) +\ 443 | geom_smooth(mapping=aes(x="displ", y="hwy", linetype="drv")) 444 | ``` 445 | 446 | Here `geom_smooth()` separates the cars into three lines based on their `drv` value, which describes a car's drivetrain. One line describes all of the points with a `4` value, one line describes all of the points with an `f` value, and one line describes all of the points with an `r` value. Here, `4` stands for four-wheel drive, `f` for front-wheel drive, and `r` for rear-wheel drive. 447 | 448 | If this sounds strange, we can make it more clear by overlaying the lines on top of the raw data and then coloring everything according to `drv`. 449 | 450 | ```{python echo=FALSE, message=FALSE} 451 | ggplot(data=mpg, mapping=aes(x="displ", y="hwy", color="drv")) +\ 452 | geom_point() +\ 453 | geom_smooth(mapping=aes(linetype="drv")) 454 | ``` 455 | 456 | Notice that this plot contains two geoms in the same graph! 
If this makes you excited, buckle up. We will learn how to place multiple geoms in the same plot very soon. 457 | 458 | plotnine provides over 30 geoms. The best way to get a comprehensive overview is the ggplot2 cheatsheet, which you can find at <http://rstudio.com/cheatsheets>. To learn more about any single geom, use help: `?geom_smooth`. 459 | 460 | Many geoms, like `geom_smooth()`, use a single geometric object to display multiple rows of data. For these geoms, you can set the `group` aesthetic to a categorical variable to draw multiple objects. plotnine will draw a separate object for each unique value of the grouping variable. In practice, plotnine will automatically group the data for these geoms whenever you map an aesthetic to a discrete variable (as in the `linetype` example). It is convenient to rely on this feature because the group aesthetic by itself does not add a legend or distinguishing features to the geoms. 461 | 462 | 463 | ```{python, eval=FALSE} 464 | ggplot(data=mpg) +\ 465 | geom_smooth(mapping=aes(x="displ", y="hwy")) 466 | 467 | ggplot(data=mpg) +\ 468 | geom_smooth(mapping=aes(x="displ", y="hwy", group="drv")) 469 | 470 | ggplot(data=mpg) +\ 471 | geom_smooth(mapping=aes(x="displ", y="hwy", color="drv"), show_legend=False) 472 | ``` 473 | 474 | ```{python, echo=FALSE, out.extra = ""} 475 | ggplot(data=mpg) +\ 476 | geom_smooth(mapping=aes(x="displ", y="hwy")) 477 | ``` 478 | ```{python, echo=FALSE, out.extra = ""} 479 | ggplot(data=mpg) +\ 480 | geom_smooth(mapping=aes(x="displ", y="hwy", group="drv")) 481 | ``` 482 | ```{python, echo=FALSE, out.extra = ""} 483 | ggplot(data=mpg) +\ 484 | geom_smooth(mapping=aes(x="displ", y="hwy", color="drv"), show_legend=False) 485 | ``` 486 | 487 | 488 | 489 | 490 | To display multiple geoms in the same plot, add multiple geom functions to `ggplot()`: 491 | 492 | ```{python, message=FALSE} 493 | ggplot(data=mpg) +\ 494 | geom_point(mapping=aes(x="displ", y="hwy")) +\ 495 | geom_smooth(mapping=aes(x="displ", y="hwy")) 496 | ``` 497
| 498 | This, however, introduces some duplication in our code. Imagine if you wanted to change the y-axis to display `cty` instead of `hwy`. You'd need to change the variable in two places, and you might forget to update one. You can avoid this type of repetition by passing a set of mappings to `ggplot()`. plotnine will treat these mappings as global mappings that apply to each geom in the graph. In other words, this code will produce the same plot as the previous code: 499 | 500 | ```{python, eval=FALSE} 501 | ggplot(data=mpg, mapping=aes(x="displ", y="hwy")) +\ 502 | geom_point() +\ 503 | geom_smooth() 504 | ``` 505 | 506 | If you place mappings in a geom function, plotnine will treat them as local mappings for the layer. It will use these mappings to extend or overwrite the global mappings _for that layer only_. This makes it possible to display different aesthetics in different layers. 507 | 508 | ```{python, message=FALSE} 509 | ggplot(data=mpg, mapping=aes(x="displ", y="hwy")) +\ 510 | geom_point(mapping=aes(color="class")) +\ 511 | geom_smooth() 512 | ``` 513 | 514 | You can use the same idea to specify different `data` for each layer. Here, our smooth line displays just a subset of the `mpg` dataset, the subcompact cars. The local data argument in `geom_smooth()` overrides the global data argument in `ggplot()` for that layer only. 515 | 516 | ```{python, message=FALSE} 517 | ggplot(data=mpg, mapping=aes(x="displ", y="hwy")) +\ 518 | geom_point(mapping=aes(color="class")) +\ 519 | geom_smooth(data=mpg.loc[mpg["class"] == "subcompact"], se=False) 520 | ``` 521 | 522 | ### [3.6.1](https://r4ds.had.co.nz/data-visualisation.html#exercises-3)   Exercises 523 | 524 | 1. What geom would you use to draw a line chart? A boxplot? A histogram? An area chart? 525 | 526 | 1. Run this code in your head and predict what the output will look like. 527 | Then, run the code in Python and check your predictions. 
528 | 529 | ```{python, eval=FALSE} 530 | ggplot(data=mpg, mapping=aes(x="displ", y="hwy", color="drv")) +\ 531 | geom_point() +\ 532 | geom_smooth(se=False) 533 | ``` 534 | 535 | 1. What does `show_legend=False` do? What happens if you remove it? Why do you think I used it earlier in the chapter? 536 | 537 | 1. What does the `se` argument to `geom_smooth()` do? 538 | 539 | 1. Will these two graphs look different? Why/why not? 540 | 541 | ```{python, eval=FALSE} 542 | ggplot(data=mpg, mapping=aes(x="displ", y="hwy")) +\ 543 | geom_point() +\ 544 | geom_smooth() 545 | 546 | ggplot() +\ 547 | geom_point(data=mpg, mapping=aes(x="displ", y="hwy")) +\ 548 | geom_smooth(data=mpg, mapping=aes(x="displ", y="hwy")) 549 | ``` 550 | 551 | 1. Recreate the Python code necessary to generate the following graphs. 552 | 553 | ```{python, echo=FALSE, out.extra = ""} 554 | ggplot(data=mpg, mapping=aes(x="displ", y="hwy")) +\ 555 | geom_point() +\ 556 | geom_smooth(se=False) 557 | ``` 558 | ```{python, echo=FALSE, out.extra = ""} 559 | ggplot(data=mpg, mapping=aes(x="displ", y="hwy")) +\ 560 | geom_smooth(aes(group="drv"), se=False) +\ 561 | geom_point() 562 | ``` 563 | ```{python, echo=FALSE, out.extra = ""} 564 | ggplot(data=mpg, mapping=aes(x="displ", y="hwy", color="drv")) +\ 565 | geom_point() +\ 566 | geom_smooth(se=False) 567 | ``` 568 | ```{python, echo=FALSE, out.extra = ""} 569 | ggplot(data=mpg, mapping=aes(x="displ", y="hwy")) +\ 570 | geom_point(aes(color="drv")) +\ 571 | geom_smooth(se=False) 572 | ``` 573 | ```{python, echo=FALSE, out.extra = ""} 574 | ggplot(data=mpg, mapping=aes(x="displ", y="hwy")) +\ 575 | geom_point(aes(color="drv")) +\ 576 | geom_smooth(aes(linetype="drv"), se=False) 577 | ``` 578 | ```{python, echo=FALSE, out.extra = ""} 579 | ggplot(data=mpg, mapping=aes(x="displ", y="hwy")) +\ 580 | geom_point(size=4, colour="white") +\ 581 | geom_point(aes(colour="drv")) 582 | ``` 583 | 584 | 585 | You can learn which stat a geom uses by inspecting the 
default value for the `stat` argument. For example, `?geom_bar` shows that the default value for `stat` is "count", which means that `geom_bar()` uses `stat_count()`. `stat_count()` is documented on the same page as `geom_bar()`, and if you scroll down you can find a section called "Computed variables". That describes how it computes two new variables: `count` and `prop`. 586 | 587 | You can generally use geoms and stats interchangeably. For example, you can recreate the previous plot using `stat_count()` instead of `geom_bar()`: 588 | 589 | ```{python} 590 | ggplot(data=diamonds) +\ 591 | stat_count(mapping=aes(x="cut")) 592 | ``` 593 | 594 | This works because every geom has a default stat; and every stat has a default geom. This means that you can typically use geoms without worrying about the underlying statistical transformation. There are three reasons you might need to use a stat explicitly: 595 | 596 | 1. You might want to override the default stat. In the code below, I change 597 | the stat of `geom_bar()` from count (the default) to identity. This lets 598 | me map the height of the bars to the raw values of a "y" variable. 599 | Unfortunately when people talk about bar charts casually, they might be 600 | referring to this type of bar chart, where the height of the bar is already 601 | present in the data, or the previous bar chart where the height of the bar 602 | is generated by counting rows. 603 | ```{python} 604 | demo = pd.DataFrame({"cut": ["Fair", "Good", "Very Good", "Premium", "Ideal"], 605 | "freq": [1610, 4906, 12082, 13791, 21551]}) 606 | 607 | ggplot(data=demo) +\ 608 | geom_bar(mapping=aes(x="cut", y="freq"), stat="identity") 609 | ``` 610 | 611 | 1. You might want to override the default mapping from transformed variables to aesthetics. 
For example, you might want to display a bar chart of proportion, rather than count: 612 | ```{python} 613 | ggplot(data=diamonds) +\ 614 | geom_bar(mapping=aes(x="cut", y="..prop..", group=1)) 615 | ``` 616 | To find the variables computed by the stat, look for the help section 617 | titled "computed variables". 618 | 619 | 1. You might want to draw greater attention to the statistical transformation 620 | in your code. For example, you might use `stat_summary()`, which 621 | summarises the y values for each unique x value, to draw 622 | attention to the summary that you're computing: 623 | 624 | ```{python} 625 | ggplot(data=diamonds) +\ 626 | stat_summary( 627 | mapping=aes(x="cut", y="depth"), 628 | fun_ymin=np.min, 629 | fun_ymax=np.max, 630 | fun_y=np.median 631 | ) 632 | ``` 633 | 634 | plotnine provides over 20 stats for you to use. Each stat is a function, so you can get help in the usual way, e.g. `?stat_bin`. To see a complete list of stats, try the ggplot2 cheatsheet. 635 | 636 | ### [3.7.1](https://r4ds.had.co.nz/data-visualisation.html#exercises-4)   Exercises 637 | 638 | 1. What is the default geom associated with `stat_summary()`? How could 639 | you rewrite the previous plot to use that geom function instead of the 640 | stat function? 641 | 642 | 1. What does `geom_col()` do? How is it different to `geom_bar()`? 643 | 644 | 1. Most geoms and stats come in pairs that are almost always used in 645 | concert. Read through the documentation and make a list of all the 646 | pairs. What do they have in common? 647 | 648 | 1. What variables does `stat_smooth()` compute? What parameters control 649 | its behaviour? 650 | 651 | 1. In our proportion bar chart, we need to set `group=1`. Why? In other 652 | words what is the problem with these two graphs? 
653 | 654 | ```{python, eval=FALSE} 655 | ggplot(data=diamonds) +\ 656 | geom_bar(mapping=aes(x="cut", y="..prop..")) 657 | 658 | ggplot(data=diamonds) +\ 659 | geom_bar(mapping=aes(x="cut", fill="color", y="..prop..")) 660 | ``` 661 | 662 | ## [3.8](https://r4ds.had.co.nz/data-visualisation.html#position-adjustments)   Position adjustments 663 | 664 | There's one more piece of magic associated with bar charts. You can colour a bar chart using either the `colour` aesthetic, or, more usefully, `fill`: 665 | 666 | ```{python, eval=FALSE} 667 | # Left 668 | ggplot(data=diamonds) +\ 669 | geom_bar(mapping=aes(x="cut", colour="cut")) 670 | 671 | # Right 672 | ggplot(data=diamonds) +\ 673 | geom_bar(mapping=aes(x="cut", fill="cut")) 674 | ``` 675 | 676 | ```{python, echo=FALSE, out.extra = ""} 677 | ggplot(data=diamonds) +\ 678 | geom_bar(mapping=aes(x="cut", colour="cut")) 679 | ``` 680 | ```{python, echo=FALSE, out.extra = ""} 681 | ggplot(data=diamonds) +\ 682 | geom_bar(mapping=aes(x="cut", fill="cut")) 683 | ``` 684 | 685 | Note what happens if you map the fill aesthetic to another variable, like `clarity`: the bars are automatically stacked. Each colored rectangle represents a combination of `cut` and `clarity`. 686 | 687 | ```{python} 688 | ggplot(data=diamonds) +\ 689 | geom_bar(mapping=aes(x="cut", fill="clarity")) 690 | ``` 691 | 692 | 693 | The stacking is performed automatically by the __position adjustment__ specified by the `position` argument. If you don't want a stacked bar chart, you can use one of three other options: `"identity"`, `"dodge"` or `"fill"`. 694 | 695 | * `position="identity"` will place each object exactly where it falls in 696 | the context of the graph. This is not very useful for bars, because it 697 | overlaps them. To see that overlapping we either need to make the bars 698 | slightly transparent by setting `alpha` to a small value, or completely 699 | transparent by setting `fill=None`. 
700 | ```{python, eval=FALSE} 701 | # Left 702 | ggplot(data=diamonds, mapping=aes(x="cut", fill="clarity")) +\ 703 | geom_bar(alpha=1/5, position="identity") 704 | 705 | # Right 706 | ggplot(data=diamonds, mapping=aes(x="cut", colour="clarity")) +\ 707 | geom_bar(fill=None, position="identity") 708 | ``` 709 | 710 | ```{python, echo=FALSE, out.extra = ""} 711 | ggplot(data=diamonds, mapping=aes(x="cut", fill="clarity")) +\ 712 | geom_bar(alpha=1/5, position="identity") 713 | ``` 714 | ```{python, echo=FALSE, out.extra = ""} 715 | ggplot(data=diamonds, mapping=aes(x="cut", colour="clarity")) +\ 716 | geom_bar(fill=None, position="identity") 717 | ``` 718 | The identity position adjustment is more useful for 2d geoms, like points, 719 | where it is the default. 720 | 721 | * `position="fill"` works like stacking, but makes each set of stacked bars 722 | the same height. This makes it easier to compare proportions across 723 | groups. 724 | 725 | ```{python} 726 | ggplot(data=diamonds) +\ 727 | geom_bar(mapping=aes(x="cut", fill="clarity"), position="fill") 728 | ``` 729 | 730 | * `position="dodge"` places overlapping objects directly _beside_ one 731 | another. This makes it easier to compare individual values. 732 | 733 | ```{python} 734 | ggplot(data=diamonds) +\ 735 | geom_bar(mapping=aes(x="cut", fill="clarity"), position="dodge") 736 | ``` 737 | 738 | There's one other type of adjustment that's not useful for bar charts, but it can be very useful for scatterplots. Recall our first scatterplot. Did you notice that the plot displays only 126 points, even though there are 234 observations in the dataset? 739 | 740 | ```{python echo=FALSE} 741 | ggplot(data=mpg) +\ 742 | geom_point(mapping=aes(x="displ", y="hwy")) 743 | ``` 744 | 745 | The values of `hwy` and `displ` are rounded so the points appear on a grid and many points overlap each other. This problem is known as __overplotting__. This arrangement makes it hard to see where the mass of the data is. 
Are the data points spread equally throughout the graph, or is there one special combination of `hwy` and `displ` that contains 109 values? 746 | 747 | You can avoid this gridding by setting the position adjustment to "jitter". `position="jitter"` adds a small amount of random noise to each point. This spreads the points out because no two points are likely to receive the same amount of random noise. 748 | 749 | ```{python} 750 | ggplot(data=mpg) +\ 751 | geom_point(mapping=aes(x="displ", y="hwy"), position="jitter") 752 | ``` 753 | 754 | Adding randomness seems like a strange way to improve your plot, but while it makes your graph less accurate at small scales, it makes your graph _more_ revealing at large scales. Because this is such a useful operation, plotnine comes with a shorthand for `geom_point(position="jitter")`: `geom_jitter()`. 755 | 756 | To learn more about a position adjustment, look up the help page associated with each adjustment: `?position_dodge`, `?position_fill`, `?position_identity`, `?position_jitter`, and `?position_stack`. 757 | 758 | 759 | ### [3.8.1](https://r4ds.had.co.nz/data-visualisation.html#exercises-5)   Exercises 760 | 761 | 1. What is the problem with this plot? How could you improve it? 762 | ```{python} 763 | ggplot(data=mpg, mapping=aes(x="cty", y="hwy")) +\ 764 | geom_point() 765 | ``` 766 | 767 | 1. What parameters to `geom_jitter()` control the amount of jittering? 768 | 769 | 1. Compare and contrast `geom_jitter()` with `geom_count()`. 770 | 771 | 1. What's the default position adjustment for `geom_boxplot()`? Create a visualisation of the `mpg` dataset that demonstrates it. 772 | 773 | 774 | ## [3.9](https://r4ds.had.co.nz/data-visualisation.html#coordinate-systems)   Coordinate systems 775 | 776 | Coordinate systems are probably the most complicated part of plotnine. The default coordinate system is the Cartesian coordinate system where the x and y positions act independently to determine the location of each point. 
There is one other coordinate system that is occasionally helpful.[^9] 777 | 778 | * `coord_flip()` switches the x and y axes. This is useful (for example), if you want horizontal boxplots. It's also useful for long labels: it's hard to get them to fit without overlapping on the x-axis. 779 | ```{python, eval=FALSE} 780 | # Left 781 | ggplot(data=mpg, mapping=aes(x="class", y="hwy")) +\ 782 | geom_boxplot() 783 | 784 | # Right 785 | ggplot(data=mpg, mapping=aes(x="class", y="hwy")) +\ 786 | geom_boxplot() +\ 787 | coord_flip() 788 | ``` 789 | 790 | ```{python, echo=FALSE, out.extra = ""} 791 | ggplot(data=mpg, mapping=aes(x="class", y="hwy")) +\ 792 | geom_boxplot() 793 | ``` 794 | ```{python, echo=FALSE, out.extra = ""} 795 | ggplot(data=mpg, mapping=aes(x="class", y="hwy")) +\ 796 | geom_boxplot() +\ 797 | coord_flip() 798 | ``` 799 | 800 | 801 | 802 | 803 | 804 | ### [3.9.1](https://r4ds.had.co.nz/data-visualisation.html#exercises-6)   Exercises 805 | 806 | 1. What does `labs()` do? Read the documentation. 807 | 808 | 1. What does the plot below tell you about the relationship between city and highway mpg? Why is `coord_fixed()` important? What does `geom_abline()` do? 809 | ```{python, fig.asp=1, out.extra = "class=\"w-2/3 md:w-1/2\""} 810 | ggplot(data=mpg, mapping=aes(x="cty", y="hwy")) +\ 811 | geom_point() +\ 812 | geom_abline() +\ 813 | coord_fixed() 814 | ``` 815 | 816 | 817 | ## [3.10](https://r4ds.had.co.nz/data-visualisation.html#the-layered-grammar-of-graphics)   The layered grammar of graphics 818 | 819 | In the previous sections, you learned much more than how to make scatterplots, bar charts, and boxplots. You learned a foundation that you can use to make _any_ type of plot with plotnine. 
To see this, let's add position adjustments, stats, coordinate systems, and faceting to our code template: 820 | 821 | ``` 822 | ggplot(data=<DATA>) +\ 823 | <GEOM_FUNCTION>( 824 | mapping=aes(<MAPPINGS>), 825 | stat=<STAT>, 826 | position=<POSITION> 827 | ) +\ 828 | <COORDINATE_FUNCTION> +\ 829 | <FACET_FUNCTION> 830 | ``` 831 | 832 | Our new template takes seven parameters, the bracketed words that appear in the template. In practice, you rarely need to supply all seven parameters to make a graph because plotnine will provide useful defaults for everything except the data, the mappings, and the geom function. 833 | 834 | The seven parameters in the template compose the grammar of graphics, a formal system for building plots. The grammar of graphics is based on the insight that you can uniquely describe _any_ plot as a combination of a dataset, a geom, a set of mappings, a stat, a position adjustment, a coordinate system, and a faceting scheme. 835 | 836 | To see how this works, consider how you could build a basic plot from scratch: you could start with a dataset and then transform it into the information that you want to display (with a stat). 837 | 838 | ```{r, echo=FALSE, dev="png", out.extra="class=\"w-full\""} 839 | knitr::include_graphics("images/visualization-grammar-1.png") 840 | ``` 841 | 842 | 843 | Next, you could choose a geometric object to represent each observation in the transformed data. You could then use the aesthetic properties of the geoms to represent variables in the data. You would map the values of each variable to the levels of an aesthetic. 844 | 845 | ```{r, echo=FALSE, dev="png", out.extra="class=\"w-full\""} 846 | knitr::include_graphics("images/visualization-grammar-2.png") 847 | ``` 848 | 849 | 850 | You'd then select a coordinate system to place the geoms into. You'd use the location of the objects (which is itself an aesthetic property) to display the values of the x and y variables.
At that point, you would have a complete graph, but you could further adjust the positions of the geoms within the coordinate system (a position adjustment) or split the graph into subplots (faceting). You could also extend the plot by adding one or more additional layers, where each additional layer uses a dataset, a geom, a set of mappings, a stat, and a position adjustment. 851 | 852 | ```{r, echo=FALSE, dev="png", out.extra="class=\"w-full\""} 853 | knitr::include_graphics("images/visualization-grammar-3.png") 854 | ``` 855 | 856 | 857 | You could use this method to build _any_ plot that you imagine. In other words, you can use the code template that you've learned in this chapter to build hundreds of thousands of unique plots. 858 | 859 | 860 | 861 | 862 | # [28](https://r4ds.had.co.nz/graphics-for-communication.html)   Graphics for communication 863 | 864 | ## [28.1](https://r4ds.had.co.nz/graphics-for-communication.html#introduction-19)   Introduction 865 | 866 | Now that you understand your data, you need to _communicate_ your understanding to others. Your audience will likely not share your background knowledge and will not be deeply invested in the data. To help others quickly build up a good mental model of the data, you will need to invest considerable effort in making your plots as self-explanatory as possible. In this chapter, you'll learn some of the tools that plotnine provides to do so. 867 | 868 | The rest of this tutorial focuses on the tools you need to create good graphics. I assume that you know what you want, and just need to know how to do it. For that reason, I highly recommend pairing this chapter with a good general visualisation book. I particularly like [_The Truthful Art_](https://amzn.com/0321934075), by Alberto Cairo. It doesn't teach the mechanics of creating visualisations, but instead focuses on what you need to think about in order to create effective graphics.
869 | 870 | ## [28.2](https://r4ds.had.co.nz/graphics-for-communication.html#label)   Label 871 | 872 | The easiest place to start when turning an exploratory graphic into an expository graphic is with good labels. You add labels with the `labs()` function. This example adds a plot title: 873 | 874 | ```{python, message=FALSE} 875 | ggplot(mpg, aes("displ", "hwy")) +\ 876 | geom_point(aes(color="class")) +\ 877 | geom_smooth(se=False) +\ 878 | labs(title="Fuel efficiency generally decreases with engine size") 879 | ``` 880 | 881 | The purpose of a plot title is to summarise the main finding. Avoid titles that just describe what the plot is, e.g. "A scatterplot of engine displacement vs. fuel economy". 882 | 883 | You can also use `labs()` to replace the axis and legend titles.[^10] It's usually a good idea to replace short variable names with more detailed descriptions, and to include the units. 884 | 885 | ```{python, message=FALSE} 886 | ggplot(mpg, aes("displ", "hwy")) +\ 887 | geom_point(aes(colour="class")) +\ 888 | geom_smooth(se=False) +\ 889 | labs(x="Engine displacement (L)", 890 | y="Highway fuel economy (mpg)", 891 | colour="Car type") 892 | ``` 893 | 894 | It's possible to use mathematical equations instead of text strings. You have to tell matplotlib, which is used by plotnine to do the actual plotting, to use LaTeX for rendering text: 895 | 896 | ```{python} 897 | from matplotlib import rc 898 | rc('text', usetex=True) 899 | 900 | df = pd.DataFrame({"x": np.random.uniform(size=10), 901 | "y": np.random.uniform(size=10)}) 902 | 903 | ggplot(df, aes("x", "y")) +\ 904 | geom_point() +\ 905 | labs(x="$\\sum_{i = 1}^n{x_i^2}$", 906 | y="$\\alpha + \\beta + \\frac{\\delta}{\\theta}$") 907 | ``` 908 | 909 | ```{python, echo=FALSE} 910 | rc('text', usetex=False) 911 | ``` 912 | 913 | See [the matplotlib documentation](https://matplotlib.org/3.1.1/tutorials/text/mathtext.html) for more information about how to write mathematical equations using LaTeX.
914 | 915 | ### [28.2.1](https://r4ds.had.co.nz/graphics-for-communication.html#exercises-71)   Exercises 916 | 917 | 1. Create one plot on the fuel economy data with customised `title`, `x`, `y`, and `colour` labels. 918 | 919 | 1. The `geom_smooth()` is somewhat misleading because the `hwy` for large engines is skewed upwards due to the inclusion of lightweight sports cars with big engines. Use your modelling tools to fit and display a better model. 920 | 921 | 1. Take an exploratory graphic that you've created in the last month, and add an informative title to make it easier for others to understand. 922 | 923 | 924 | ## [28.3](https://r4ds.had.co.nz/graphics-for-communication.html#annotations)   Annotations 925 | 926 | In addition to labelling major components of your plot, it's often useful to label individual observations or groups of observations. The first tool you have at your disposal is `geom_text()`. `geom_text()` is similar to `geom_point()`, but it has an additional aesthetic: `label`. This makes it possible to add textual labels to your plots. 927 | 928 | There are two possible sources of labels. First, you might have a DataFrame that provides labels. The plot below isn't terribly useful, but it illustrates a useful approach: pull out the most efficient car in each class with pandas, and then label it on the plot: 929 | 930 | ```{python} 931 | best_in_class = mpg\ 932 | .sort_values(by="hwy", ascending=False)\ 933 | .groupby("class")\ 934 | .first() 935 | 936 | ggplot(mpg, aes("displ", "hwy")) +\ 937 | geom_point(aes(colour="class")) +\ 938 | geom_text(aes(label="model"), data=best_in_class) 939 | ``` 940 | 941 | 942 | This is hard to read because the labels overlap with each other, and with the points. We can make things a little better by switching to `geom_label()` which draws a rectangle behind the text. 
We also use the `nudge_y` parameter to move the labels slightly above the corresponding points: 943 | 944 | ```{python} 945 | ggplot(mpg, aes("displ", "hwy")) +\ 946 | geom_point(aes(colour="class")) +\ 947 | geom_label(aes(label="model"), data=best_in_class, nudge_y=2, alpha=0.5) 948 | ``` 949 | 950 | That helps a bit, but if you look closely in the top-left hand corner, you'll notice that there are two labels practically on top of each other. This happens because the highway mileage and displacement for the best cars in the compact and subcompact categories are exactly the same. There's no way that we can fix these by applying the same transformation for every label. Instead, we can use the `adjust_text` argument. This useful argument, which employs the adjustText package under the hood, will automatically adjust labels so that they don't overlap: 951 | 952 | ```{python} 953 | ggplot(mpg, aes("displ", "hwy")) +\ 954 | geom_point(aes(colour="class")) +\ 955 | geom_point(data=best_in_class, fill='none') +\ 956 | geom_label(aes(label="model"), data=best_in_class, adjust_text={ 957 | 'expand_points': (1.5, 1.5), 958 | 'arrowprops': { 959 | 'arrowstyle': '-' 960 | }}) 961 | ``` 962 | 963 | Note another handy technique used here: I added a second layer of large, hollow points to highlight the points that I've labelled. 964 | 965 | You can sometimes use the same idea to replace the legend with labels placed directly on the plot. It's not wonderful for this plot, but it isn't too bad.[^11] 966 | (`theme(legend_position="none")` turns the legend off --- we'll talk about it more shortly.)
967 | 968 | ```{python} 969 | class_avg = mpg\ 970 | .groupby("class")["displ","hwy"].median()\ 971 | .reset_index() 972 | 973 | ggplot(mpg, aes("displ", "hwy", colour="class")) +\ 974 | geom_point() +\ 975 | geom_label(aes(label="class"), data=class_avg, size=16, label_size=0, adjust_text={'expand_points': (0, 0)}) +\ 976 | geom_point() +\ 977 | theme(legend_position="none") 978 | ``` 979 | 980 | 981 | Alternatively, you might just want to add a single label to the plot, but you'll still need to create a DataFrame. Often, you want the label in the corner of the plot, so it's convenient to create a new DataFrame using `pd.DataFrame()` and `max()` to compute the maximum values of x and y. 982 | 983 | ```{python} 984 | label = pd.DataFrame({"displ": [mpg.displ.max()], 985 | "hwy": [mpg.hwy.max()], 986 | "label": "Increasing engine size is \nrelated to decreasing fuel economy."}) 987 | 988 | ggplot(mpg, aes("displ", "hwy")) +\ 989 | geom_point() +\ 990 | geom_text(aes(label="label"), data=label, va="top", ha="right") 991 | ``` 992 | 993 | If you want to place the text exactly on the borders of the plot, you can use `+np.Inf` and `-np.Inf`: 994 | 995 | ```{python} 996 | label = pd.DataFrame({"displ": [np.Inf], 997 | "hwy": [np.Inf], 998 | "label": "Increasing engine size is \nrelated to decreasing fuel economy."}) 999 | 1000 | ggplot(mpg, aes("displ", "hwy")) +\ 1001 | geom_point() +\ 1002 | geom_text(aes(label="label"), data=label, va="top", ha="right") 1003 | ``` 1004 | 1005 | In these examples, I manually broke the label up into lines using `"\n"`. Another approach is to use the `fill` function from the `textwrap` module to automatically add line breaks, given the number of characters you want per line: 1006 | 1007 | ```{python} 1008 | from textwrap import fill 1009 | 1010 | print(fill("Increasing engine size is related to decreasing fuel economy.", width=40)) 1011 | ``` 1012 | 1013 | Note the use of `ha` and `va` to control the alignment of the label. 
The figure below shows all nine possible combinations. 1014 | 1015 | ```{python, echo=FALSE} 1016 | from itertools import product 1017 | 1018 | has = ["left", "center", "right"] 1019 | vas = ["top", "center", "bottom"] 1020 | xs = [0, 0.5, 1] 1021 | ys = [1, 0.5, 0] 1022 | 1023 | df = pd.DataFrame([{"x": xs[x], 1024 | "y": ys[y], 1025 | "ha": has[x], 1026 | "va": vas[y], 1027 | "label": f"ha=\"{has[x]}\"\nva=\"{vas[y]}\""} 1028 | for x, y in product(range(3), repeat=2)]) 1029 | 1030 | ggplot(df, aes("x", "y")) +\ 1031 | geom_point(colour="grey", size=5) +\ 1032 | geom_point(size=0.5, colour="red") +\ 1033 | geom_text(aes(label="label", ha="ha", va="va")) +\ 1034 | labs(x=None, y=None) 1035 | ``` 1036 | 1037 | Remember, in addition to `geom_text()`, you have many other geoms in plotnine available to help annotate your plot. A few ideas: 1038 | 1039 | * Use `geom_hline()` and `geom_vline()` to add reference lines. I often make 1040 | them thick (`size=2`) and white (`colour="white"`), and draw them 1041 | underneath the primary data layer. That makes them easy to see, without 1042 | drawing attention away from the data. 1043 | 1044 | * Use `geom_rect()` to draw a rectangle around points of interest. The 1045 | boundaries of the rectangle are defined by aesthetics `xmin`, `xmax`, 1046 | `ymin`, `ymax`. 1047 | 1048 | * Use `geom_segment()` with the `arrow` argument to draw attention 1049 | to a point with an arrow. Use aesthetics `x` and `y` to define the 1050 | starting location, and `xend` and `yend` to define the end location. 1051 | 1052 | The only limit is your imagination (and your patience with positioning annotations to be aesthetically pleasing)! 1053 | 1054 | 1055 | ### [28.3.1](https://r4ds.had.co.nz/graphics-for-communication.html#exercises-72)   Exercises 1056 | 1057 | 1. Use `geom_text()` with infinite positions to place text at the 1058 | four corners of the plot. 1059 | 1060 | 1. Read the documentation for `annotate()`. 
How can you use it to add a text 1061 | label to a plot without having to create a DataFrame? 1062 | 1063 | 1. How do labels with `geom_text()` interact with faceting? How can you 1064 | add a label to a single facet? How can you put a different label in 1065 | each facet? (Hint: think about the underlying data.) 1066 | 1067 | 1. What arguments to `geom_label()` control the appearance of the background 1068 | box? 1069 | 1070 | 1. What are the four arguments to `arrow()`? How do they work? Create a series 1071 | of plots that demonstrate the most important options. 1072 | 1073 | 1074 | 1075 | 1076 | 1077 | 1078 | ## [28.4](https://r4ds.had.co.nz/graphics-for-communication.html#scales)   Scales 1079 | 1080 | The third way you can make your plot better for communication is to adjust the scales. Scales control the mapping from data values to things that you can perceive. Normally, plotnine automatically adds scales for you. For example, when you type: 1081 | 1082 | ```python 1083 | ggplot(mpg, aes("displ", "hwy")) +\ 1084 | geom_point(aes(colour="class")) 1085 | ``` 1086 | 1087 | plotnine automatically adds default scales behind the scenes: 1088 | 1089 | ```python 1090 | ggplot(mpg, aes("displ", "hwy")) +\ 1091 | geom_point(aes(colour="class")) +\ 1092 | scale_x_continuous() +\ 1093 | scale_y_continuous() +\ 1094 | scale_colour_discrete() 1095 | ``` 1096 | 1097 | Note the naming scheme for scales: `scale_` followed by the name of the aesthetic, then `_`, then the name of the scale. The default scales are named according to the type of variable they align with: continuous, discrete, datetime, or date. There are lots of non-default scales which you'll learn about below. 1098 | 1099 | The default scales have been carefully chosen to do a good job for a wide range of inputs. Nevertheless, you might want to override the defaults for two reasons: 1100 | 1101 | * You might want to tweak some of the parameters of the default scale. 
1102 | This allows you to do things like change the breaks on the axes, or the 1103 | key labels on the legend. 1104 | 1105 | * You might want to replace the scale altogether, and use a completely 1106 | different algorithm. Often you can do better than the default because 1107 | you know more about the data. 1108 | 1109 | 1110 | ### [28.4.1](https://r4ds.had.co.nz/graphics-for-communication.html#axis-ticks-and-legend-keys)   Axis ticks and legend keys 1111 | 1112 | There are two primary arguments that affect the appearance of the ticks on the axes and the keys on the legend: `breaks` and `labels`. Breaks controls the position of the ticks, or the values associated with the keys. Labels controls the text label associated with each tick/key. The most common use of `breaks` is to override the default choice: 1113 | 1114 | ```{python} 1115 | ggplot(mpg, aes("displ", "hwy")) +\ 1116 | geom_point() +\ 1117 | scale_y_continuous(breaks=range(15, 45, 5)) 1118 | ``` 1119 | 1120 | You can use `labels` in the same way (a list of strings the same length as `breaks`), but you can also suppress the labels altogether by passing a list of empty strings. This is useful for maps, or for publishing plots where you can't share the absolute numbers. Note that the list of labels needs to be of the same length as the list of values, so a helper function like `no_labels` is convenient[^12]: 1121 | 1122 | ```{python} 1123 | def no_labels(values): 1124 | return [""] * len(values) 1125 | 1126 | ggplot(mpg, aes("displ", "hwy")) +\ 1127 | geom_point() +\ 1128 | scale_x_continuous(labels=no_labels) +\ 1129 | scale_y_continuous(labels=no_labels) 1130 | ``` 1131 | 1132 | You can also use `breaks` and `labels` to control the appearance of legends. Collectively axes and legends are called __guides__. Axes are used for x and y aesthetics; legends are used for everything else. 
1133 | 1134 | Another use of `breaks` is when you have relatively few data points and want to highlight exactly where the observations occur. For example, take this plot that shows when each US president started and ended their term. 1135 | 1136 | 1137 | ```{python} 1138 | presidential["id"] = 34 + presidential.index 1139 | 1140 | ggplot(presidential, aes("start", "id")) +\ 1141 | geom_point() +\ 1142 | geom_segment(aes(xend="end", yend="id")) +\ 1143 | scale_x_date(name="", breaks=presidential.start, date_labels="'%y") 1144 | ``` 1145 | 1146 | Note that the specification of breaks and labels for date and datetime scales is a little different: 1147 | 1148 | * `date_labels` takes a format specification, in the same form as `time.strptime()`. 1149 | 1150 | * `date_breaks` (not shown here), takes a string like "2 days" or "1 month". 1151 | 1152 | 1153 | ### [28.4.2](https://r4ds.had.co.nz/graphics-for-communication.html#legend-layout)   Legend layout 1154 | 1155 | You will most often use `breaks` and `labels` to tweak the axes. While they both also work for legends, there are a few other techniques you are more likely to use. 1156 | 1157 | To control the overall position of the legend, you need to use a `theme()` setting. We'll come back to themes at the end of the chapter, but in brief, they control the non-data parts of the plot. The theme setting `legend_position` controls where the legend is drawn. Unfortunately, in order to position the legend correctly on the left or the bottom, we have to be a bit more explicit. Just using "left" and "bottom" may cause the legend to overlap the axis labels. Your mileage may vary.
1158 | 1159 | ```{python, eval=FALSE} 1160 | base = ggplot(mpg, aes("displ", "hwy")) +\ 1161 | geom_point(aes(colour="class")) 1162 | 1163 | base + theme(legend_position="right") # the default 1164 | base + theme(subplots_adjust={'left': 0.3}) + theme(legend_position=(0, 0.5)) 1165 | base + theme(legend_position="top") 1166 | base + theme(subplots_adjust={'bottom': 0.3}, legend_position=(.5, 0), legend_direction='horizontal') 1167 | ``` 1168 | 1169 | ```{python, echo=FALSE, out.extra = ""} 1170 | base = ggplot(mpg, aes("displ", "hwy")) +\ 1171 | geom_point(aes(colour="class")) 1172 | 1173 | base + theme(legend_position="right") 1174 | ``` 1175 | ```{python, echo=FALSE, out.extra = ""} 1176 | base + theme(subplots_adjust={'left': 0.3}) + theme(legend_position=(0, 0.5)) 1177 | ``` 1178 | ```{python, echo=FALSE, out.extra = ""} 1179 | base + theme(legend_position="top") 1180 | ``` 1181 | ```{python, echo=FALSE, out.extra = ""} 1182 | base + theme(subplots_adjust={'bottom': 0.3}, legend_position=(.5, 0), legend_direction='horizontal') 1183 | ``` 1184 | 1185 | 1186 | You can also use `legend_position="none"` to suppress the display of the legend altogether. 1187 | 1188 | To control the display of individual legends, use `guides()` along with `guide_legend()` or `guide_colourbar()`. The following example shows two important settings: controlling the number of rows the legend uses with `nrow`, and overriding one of the aesthetics to make the points bigger. This is particularly useful if you have used a low `alpha` to display many points on a plot. 
1189 | 1190 | ```{python} 1191 | ggplot(mpg, aes("displ", "hwy")) +\ 1192 | geom_point(aes(colour="class")) +\ 1193 | geom_smooth(se=False) +\ 1194 | theme(legend_position="bottom") +\ 1195 | guides(colour=guide_legend(nrow=1, override_aes={"size": 4})) 1196 | ``` 1197 | 1198 | 1199 | ### [28.4.3](https://r4ds.had.co.nz/graphics-for-communication.html#replacing-a-scale)   Replacing a scale 1200 | 1201 | Instead of just tweaking the details a little, you can instead replace the scale altogether. There are two types of scales you're most likely to want to switch out: continuous position scales and colour scales. Fortunately, the same principles apply to all the other aesthetics, so once you've mastered position and colour, you'll be able to quickly pick up other scale replacements. 1202 | 1203 | It's very useful to plot transformations of your variable. For example, with the `diamonds` DataFrame, it's easier to see the precise relationship between `carat` and `price` if we log transform them: 1204 | 1205 | 1206 | ```{python, eval=FALSE} 1207 | ggplot(diamonds, aes("carat", "price")) +\ 1208 | geom_bin2d() 1209 | 1210 | ggplot(diamonds, aes("np.log10(carat)", "np.log10(price)")) +\ 1211 | geom_bin2d() 1212 | ``` 1213 | 1214 | ```{python, echo=FALSE, out.extra = ""} 1215 | ggplot(diamonds, aes("carat", "price")) +\ 1216 | geom_bin2d() 1217 | ``` 1218 | ```{python, echo=FALSE, out.extra = ""} 1219 | ggplot(diamonds, aes("np.log10(carat)", "np.log10(price)")) +\ 1220 | geom_bin2d() 1221 | ``` 1222 | 1223 | However, the disadvantage of this transformation is that the axes are now labelled with the transformed values, making it hard to interpret the plot. Instead of doing the transformation in the aesthetic mapping, we can instead do it with the scale. This is visually identical, except the axes are labelled on the original data scale.
1224 | 1225 | ```{python} 1226 | ggplot(diamonds, aes("carat", "price")) +\ 1227 | geom_bin2d() +\ 1228 | scale_x_log10() +\ 1229 | scale_y_log10() 1230 | ``` 1231 | 1232 | Another scale that is frequently customised is colour. The default categorical scale picks colours that are evenly spaced around the colour wheel. Useful alternatives are the ColorBrewer scales which have been hand tuned to work better for people with common types of colour blindness. The two plots below look similar, but there is enough difference in the shades of red and green that the dots on the right can be distinguished even by people with red-green colour blindness. 1233 | 1234 | 1235 | 1236 | 1237 | ```{python, eval=FALSE} 1238 | ggplot(mpg, aes("displ", "hwy")) +\ 1239 | geom_point(aes(color="drv")) 1240 | 1241 | ggplot(mpg, aes("displ", "hwy")) +\ 1242 | geom_point(aes(color="drv")) +\ 1243 | scale_colour_brewer(type="qual", palette="Set1") 1244 | ``` 1245 | 1246 | ```{python, echo=FALSE, out.extra = ""} 1247 | ggplot(mpg, aes("displ", "hwy")) +\ 1248 | geom_point(aes(color="drv")) 1249 | ``` 1250 | ```{python, echo=FALSE, out.extra = ""} 1251 | ggplot(mpg, aes("displ", "hwy")) +\ 1252 | geom_point(aes(color="drv")) +\ 1253 | scale_colour_brewer(type="qual", palette="Set1") 1254 | ``` 1255 | 1256 | Don't forget simpler techniques. If there are just a few colours, you can add a redundant shape mapping. This will also help ensure your plot is interpretable in black and white. 1257 | 1258 | ```{python} 1259 | ggplot(mpg, aes("displ", "hwy")) +\ 1260 | geom_point(aes(color="drv", shape="drv")) +\ 1261 | scale_colour_brewer(type="qual", palette="Set1") 1262 | ``` 1263 | 1264 | The ColorBrewer scales are documented online at <https://colorbrewer2.org/> and made available in Python via the __mizani__ package, by Hassan Kibirige. The figure below shows the complete list of all palettes. The sequential (top) and diverging (bottom) palettes are particularly useful if your categorical values are ordered, or have a "middle".
This often arises if you've used `pd.cut()` to make a continuous variable into a categorical variable. 1265 | 1266 | 1267 | ```{r, echo=FALSE, dev="png", out.extra="class=\"md:w-1/2\""} 1268 | knitr::include_graphics("images/brewer-1.png") 1269 | ``` 1270 | 1271 | 1272 | When you have a predefined mapping between values and colours, use `scale_colour_manual()`. For example, if we map presidential party to colour, we want to use the standard mapping of red for Republicans and blue for Democrats: 1273 | 1274 | 1275 | ```{python} 1276 | presidential["id"] = 34 + presidential.index 1277 | 1278 | ggplot(presidential, aes("start", "id", colour="party")) +\ 1279 | geom_point() +\ 1280 | geom_segment(aes(xend="end", yend="id")) +\ 1281 | scale_colour_manual(values=["red", "blue"], limits=["Republican", "Democratic"]) 1282 | ``` 1283 | 1284 | For continuous colour, you can use the built-in `scale_colour_gradient()` or `scale_fill_gradient()`. If you have a diverging scale, you can use `scale_colour_gradient2()`. That allows you to give, for example, positive and negative values different colours. That's sometimes also useful if you want to distinguish points above or below the mean. 1285 | 1286 | Note that all colour scales come in two varieties: `scale_colour_x()` and `scale_fill_x()` for the `colour` and `fill` aesthetics respectively (the colour scales are available in both UK and US spellings). 1287 | 1288 | 1289 | ### [28.4.4](https://r4ds.had.co.nz/graphics-for-communication.html#exercises-73)   Exercises 1290 | 1291 | 1. Why doesn't the following code override the default scale? 1292 | ```python 1293 | ggplot(df, aes("x", "y")) +\ 1294 | geom_hex() +\ 1295 | scale_colour_gradient(low="white", high="red") +\ 1296 | coord_fixed() 1297 | ``` 1298 | 1299 | 1. What is the first argument to every scale? How does it compare to `labs()`? 1300 | 1301 | 1. Change the display of the presidential terms by: 1302 | 1303 | 1. Combining the two variants shown above. 1304 | 1.
Improving the display of the y axis. 1305 | 1. Labelling each term with the name of the president. 1306 | 1. Adding informative plot labels. 1307 | 1. Placing breaks every 4 years (this is trickier than it seems!). 1308 | 1309 | 1. Use `override_aes` to make the legend on the following plot easier to see. 1310 | 1311 | ```{python} 1312 | ggplot(diamonds, aes("carat", "price")) +\ 1313 | geom_point(aes(colour="cut"), alpha=1/20) 1314 | ``` 1315 | 1316 | ## [28.5](https://r4ds.had.co.nz/graphics-for-communication.html#zooming)   Zooming 1317 | 1318 | There are three ways to control the plot limits: 1319 | 1320 | 1. Adjusting what data are plotted 1321 | 1. Setting the limits in each scale 1322 | 1. Setting `xlim` and `ylim` in `coord_cartesian()` 1323 | 1324 | To zoom in on a region of the plot, it's generally best to use `coord_cartesian()`. Compare the following two plots: 1325 | 1326 | ```{python, eval=FALSE} 1327 | ggplot(mpg, aes("displ", "hwy")) +\ 1328 | geom_point(aes(color="class")) +\ 1329 | geom_smooth() +\ 1330 | coord_cartesian(xlim=(5, 7), ylim=(10, 30)) 1331 | 1332 | ggplot(mpg.query("5 <= displ <= 7 and 10 <= hwy <= 30"), aes("displ", "hwy")) +\ 1333 | geom_point(aes(color="class")) +\ 1334 | geom_smooth() 1335 | ``` 1336 | 1337 | ```{python, echo=FALSE, out.extra = ""} 1338 | ggplot(mpg, aes("displ", "hwy")) +\ 1339 | geom_point(aes(color="class")) +\ 1340 | geom_smooth() +\ 1341 | coord_cartesian(xlim=(5, 7), ylim=(10, 30)) 1342 | ``` 1343 | ```{python, echo=FALSE, out.extra = ""} 1344 | ggplot(mpg.query("5 <= displ <= 7 and 10 <= hwy <= 30"), aes("displ", "hwy")) +\ 1345 | geom_point(aes(color="class")) +\ 1346 | geom_smooth() 1347 | ``` 1348 | 1349 | You can also set the `limits` on individual scales. Reducing the limits is basically equivalent to subsetting the data. It is generally more useful if you want to _expand_ the limits, for example, to match scales across different plots.
For example, if we extract two classes of cars and plot them separately, it's difficult to compare the plots because all three scales (the x-axis, the y-axis, and the colour aesthetic) have different ranges. 1350 | 1351 | 1352 | ```{python, eval=FALSE} 1353 | mpg["drv"] = mpg["drv"].astype(str) 1354 | suv = mpg[mpg["class"] == "suv"] 1355 | compact = mpg[mpg["class"] == "compact"] 1356 | 1357 | ggplot(suv, aes("displ", "hwy", colour="drv")) +\ 1358 | geom_point() 1359 | 1360 | ggplot(compact, aes("displ", "hwy", colour="drv")) +\ 1361 | geom_point() 1362 | ``` 1363 | 1364 | 1365 | ```{python, echo=FALSE, out.extra = ""} 1366 | mpg["drv"] = mpg["drv"].astype(str) 1367 | suv = mpg[mpg["class"] == "suv"] 1368 | compact = mpg[mpg["class"] == "compact"] 1369 | ggplot(suv, aes("displ", "hwy", colour="drv")) +\ 1370 | geom_point() 1371 | ``` 1372 | ```{python, echo=FALSE, out.extra = ""} 1373 | ggplot(compact, aes("displ", "hwy", colour="drv")) +\ 1374 | geom_point() 1375 | ``` 1376 | 1377 | 1378 | One way to overcome this problem is to share scales across multiple plots, training the scales with the `limits` of the full data. 
1379 | 1380 | 1381 | 1382 | ```{python, eval=FALSE} 1383 | x_scale = scale_x_continuous(limits=(mpg.displ.min(), mpg.displ.max())) 1384 | y_scale = scale_y_continuous(limits=(mpg.hwy.min(), mpg.hwy.max())) 1385 | col_scale = scale_colour_discrete(limits=mpg.drv.unique()) 1386 | 1387 | ggplot(suv, aes("displ", "hwy", colour="drv")) +\ 1388 | geom_point() +\ 1389 | x_scale +\ 1390 | y_scale +\ 1391 | col_scale 1392 | 1393 | ggplot(compact, aes("displ", "hwy", colour="drv")) +\ 1394 | geom_point() +\ 1395 | x_scale +\ 1396 | y_scale +\ 1397 | col_scale 1398 | ``` 1399 | 1400 | ```{python, echo=FALSE, out.extra = ""} 1401 | x_scale = scale_x_continuous(limits=(mpg.displ.min(), mpg.displ.max())) 1402 | y_scale = scale_y_continuous(limits=(mpg.hwy.min(), mpg.hwy.max())) 1403 | col_scale = scale_colour_discrete(limits=mpg.drv.unique()) 1404 | 1405 | ggplot(suv, aes("displ", "hwy", colour="drv")) +\ 1406 | geom_point() +\ 1407 | x_scale +\ 1408 | y_scale +\ 1409 | col_scale 1410 | ``` 1411 | ```{python, echo=FALSE, out.extra = ""} 1412 | ggplot(compact, aes("displ", "hwy", colour="drv")) +\ 1413 | geom_point() +\ 1414 | x_scale +\ 1415 | y_scale +\ 1416 | col_scale 1417 | ``` 1418 | 1419 | In this particular case, you could have simply used faceting, but this technique is useful more generally, if, for instance, you want to spread plots over multiple pages of a report. 1420 | 1421 | 1422 | ## [28.6](https://r4ds.had.co.nz/graphics-for-communication.html#themes)   Themes 1423 | 1424 | 1425 | Finally, you can customise the non-data elements of your plot with a theme: 1426 | 1427 | ```{python} 1428 | ggplot(mpg, aes("displ", "hwy")) +\ 1429 | geom_point(aes(color="class")) +\ 1430 | geom_smooth(se=False) +\ 1431 | theme_xkcd() 1432 | ``` 1433 | 1434 | plotnine includes twelve themes by default. The figure below shows eight of those. The [documentation](https://plotnine.readthedocs.io/en/stable/api.html#themes) lists all available themes. 
1435 | 1436 | ```{r, echo=FALSE, dev="png", out.extra="class=\"md:w-2/3\""} 1437 | knitr::include_graphics("images/visualization-themes.png") 1438 | ``` 1439 | 1440 | 1441 | Many people wonder why the default theme has a grey background. This was a deliberate choice because it puts the data forward while still making the grid lines visible. The white grid lines are visible (which is important because they significantly aid position judgements), but they have little visual impact and we can easily tune them out. The grey background gives the plot a similar typographic colour to the text, ensuring that the graphics fit in with the flow of a document without jumping out with a bright white background. Finally, the grey background creates a continuous field of colour which ensures that the plot is perceived as a single visual entity. 1442 | 1443 | It's also possible to control individual components of each theme, like the size and colour of the font used for the y axis. Unfortunately, this level of detail is outside the scope of this book, so you'll need to read the [ggplot2 book](https://amzn.com/331924275X) for the full details. You can also create your own themes, if you are trying to match a particular corporate or journal style. 1444 | 1445 | 1446 | ## [28.7](https://r4ds.had.co.nz/graphics-for-communication.html#saving-your-plots)   Saving your plots 1447 | 1448 | The best way to get your plots out of Python and into your final write-up[^13] 1449 | is with the `.save()` method. There's also the `ggsave()` function, but the plotnine documentation doesn't recommend using this. The `.save()` method will save the plot to disk. In a Jupyter Notebook you can refer to the last returned value using `_`. Alternatively, you can first assign your plot to a variable. 
1450 | 1451 | ```{python} 1452 | ggplot(mpg, aes("displ", "hwy")) + geom_point() 1453 | ``` 1454 | 1455 | ```python 1456 | _.save("my-plot.pdf") 1457 | ``` 1458 | 1459 | If you don't specify the `width` and `height` they will be set to 6.4 and 4.8 inches, respectively. If you don't specify `filename`, plotnine will generate one for you, e.g., "plotnine-save-297120101.pdf". For reproducible code, you'll want to specify them. You can learn more about the `.save()` method in the documentation. 1460 | 1461 | 1462 | ### [28.7.1](https://r4ds.had.co.nz/graphics-for-communication.html#figure-sizing)   Figure sizing 1463 | 1464 | It can be a challenge to get your figure in the right size and shape. There are four options that control figure sizing: `width`, `height`, `units`, and `dpi`. 1465 | 1466 | If you find that you're having to squint to read the text in your plot, you need to tweak `width` and `height`. If the `width` is larger than the size the figure is rendered in the final doc, the text will be too small; if `width` is smaller, the text will be too big. You'll often need to do a little experimentation to figure out the right ratio between the `width` and the eventual width in your document. 
To illustrate the principle, the following three plots have `width` of 4, 6, and 8 respectively (and a height which is 0.618 times the width, i.e., the golden ratio): 1467 | 1468 | ```{python, echo=FALSE} 1469 | plot = ggplot(mpg, aes("displ", "hwy")) + geom_point() 1470 | for width in [4, 6, 8]: 1471 | plot.save(f"figure/save-width-{width}.png", width=width, height=width*0.618, dpi=300, verbose=False) 1472 | ``` 1473 | 1474 | ```{r, echo=FALSE, dev="png", out.extra="class=\"md:w-2/3 lg:w-1/2\""} 1475 | knitr::include_graphics("figure/save-width-4.png") 1476 | ``` 1477 | 1478 | ```{r, echo=FALSE, dev="png", out.extra="class=\"md:w-2/3 lg:w-1/2\""} 1479 | knitr::include_graphics("figure/save-width-6.png") 1480 | ``` 1481 | 1482 | ```{r, echo=FALSE, dev="png", out.extra="class=\"md:w-2/3 lg:w-1/2\""} 1483 | knitr::include_graphics("figure/save-width-8.png") 1484 | ``` 1485 | 1486 | 1487 | 1488 | ## [28.8](https://r4ds.had.co.nz/graphics-for-communication.html#learning-more-4)   Learning more 1489 | 1490 | The absolute best place to learn more is the ggplot2 book: [_ggplot2: Elegant graphics for data analysis_](https://amzn.com/331924275X). It goes into much more depth about the underlying theory, and has many more examples of how to combine the individual pieces to solve practical problems. 1491 | Unfortunately, the book is not available online for free, although you can find the source code at [github.com/hadley/ggplot2-book](https://github.com/hadley/ggplot2-book). 1492 | 1493 | 1494 | ## Footnotes 1495 | 1496 | [^1]: There have been other attempts at porting ggplot2 to Python, such as [ggpy](https://github.com/yhat/ggpy), but as far as I know, these are no longer maintained. 1497 | [^2]: If you ever need to translate ggplot2 to plotnine yourself, check out my [follow-up post containing heuristics](/blog/heuristics-for-translating-ggplot2-to-plotnine) for doing so. 1498 | [^3]: It's important to note that this tutorial is not meant to compare Python and R. 
The never-ending flame wars between these two languages are boring and unproductive. 1499 | [^4]: While it's generally considered to be bad practice to import everything into the global namespace, I think it's fine to do this in an ad-hoc environment such as a notebook as it makes using the many functions plotnine provides more convenient. An additional advantage is that the resulting code more closely resembles the original ggplot2 code. Alternatively, it's quite common to `import plotnine as p9` and prefix every function with `p9.`. 1500 | [^5]: This tutorial was compiled with a [fork from version 0.6.0](https://github.com/jeroenjanssens/plotnine) that fixes an [issue](https://github.com/has2k1/plotnine/pull/325) related to using `ha` and `va` in `aes()`. 1501 | [^6]: If you dislike the continuation character `\` then an alternative syntax is to wrap the entire expression in parentheses so that it's not needed. 1502 | [^7]: The original text uses the `class` variable, but to demonstrate the same effect we need to use a variable with more distinct values because plotnine supports more shapes than ggplot2. 1503 | [^8]: The original text has an additional exercise that contains code which is semantically wrong on purpose, but in plotnine, the corresponding code is also syntactically wrong. The reason is that in plotnine, you can only use column names in the aesthetic mapping and not literal values, e.g., `aes(color="blue")`. 1504 | [^9]: ggplot2 also has `coord_quickmap()` for producing maps with the correct aspect ratio and `coord_polar()` for using polar coordinates. plotnine doesn't yet have these two functions. 1505 | [^10]: In ggplot2, you can also use `labs()` to add a subtitle and a caption. 1506 | [^11]: We have to use `geom_point()` twice here because of an [issue](https://github.com/has2k1/plotnine/issues/324) with the adjustText package. 1507 | [^12]: In ggplot2 you can write `labels = NULL` so you don't need a helper function. 
1508 | [^13]: The original text discusses how to include your plot in R Markdown. While it's possible to include Python code and graphics in an R Markdown document through the [`reticulate` package](https://rstudio.github.io/reticulate/), like this tutorial demonstrates, it's beyond the scope of this text. If you're interested, you can have a look at the [Github repository](https://github.com/datascienceworkshops/r4ds-python-plotnine) related to this tutorial, which includes the .Rmd source. 1509 | -------------------------------------------------------------------------------- /r4ds-python-plotnine.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Makefile 19 | -------------------------------------------------------------------------------- /renv.lock: -------------------------------------------------------------------------------- 1 | { 2 | "R": { 3 | "Version": "3.6.1", 4 | "Repositories": [ 5 | { 6 | "Name": "CRAN", 7 | "URL": "https://cran.rstudio.com" 8 | } 9 | ] 10 | }, 11 | "Packages": { 12 | "MASS": { 13 | "Package": "MASS", 14 | "Version": "7.3-51.4", 15 | "Source": "Repository", 16 | "Repository": "CRAN", 17 | "Hash": "a94714e63996bc284b8795ec50defc07" 18 | }, 19 | "Matrix": { 20 | "Package": "Matrix", 21 | "Version": "1.2-18", 22 | "Source": "Repository", 23 | "Repository": "CRAN", 24 | "Hash": "08588806cba69f04797dab50627428ed" 25 | }, 26 | "Rcpp": { 27 | "Package": "Rcpp", 28 | "Version": "1.0.3", 29 | "Source": "Repository", 30 | "Repository": "CRAN", 31 | "Hash": "f3ca785924863b0e4c8cb23b6a5c75a1" 32 | }, 33 | "assertthat": { 34 | "Package": "assertthat", 35 | "Version": 
"0.2.1", 36 | "Source": "Repository", 37 | "Repository": "CRAN", 38 | "Hash": "50c838a310445e954bc13f26f26a6ecf" 39 | }, 40 | "base64enc": { 41 | "Package": "base64enc", 42 | "Version": "0.1-3", 43 | "Source": "Repository", 44 | "Repository": "CRAN", 45 | "Hash": "543776ae6848fde2f48ff3816d0628bc" 46 | }, 47 | "cli": { 48 | "Package": "cli", 49 | "Version": "2.0.0", 50 | "Source": "Repository", 51 | "Repository": "CRAN", 52 | "Hash": "f41a3aa27b911750a6526b450d0b8ea4" 53 | }, 54 | "crayon": { 55 | "Package": "crayon", 56 | "Version": "1.3.4", 57 | "Source": "Repository", 58 | "Repository": "CRAN", 59 | "Hash": "0d57bc8e27b7ba9e45dba825ebc0de6b" 60 | }, 61 | "digest": { 62 | "Package": "digest", 63 | "Version": "0.6.23", 64 | "Source": "Repository", 65 | "Repository": "CRAN", 66 | "Hash": "931fd68809dab4609b4d4b5702206066" 67 | }, 68 | "evaluate": { 69 | "Package": "evaluate", 70 | "Version": "0.14", 71 | "Source": "Repository", 72 | "Repository": "CRAN", 73 | "Hash": "ec8ca05cffcc70569eaaad8469d2a3a7" 74 | }, 75 | "fansi": { 76 | "Package": "fansi", 77 | "Version": "0.4.0", 78 | "Source": "Repository", 79 | "Repository": "CRAN", 80 | "Hash": "b31d9e5d051553d1177083aeba04b5b9" 81 | }, 82 | "glue": { 83 | "Package": "glue", 84 | "Version": "1.3.1", 85 | "Source": "Repository", 86 | "Repository": "CRAN", 87 | "Hash": "d4e25697c450c01b202c79ef35694a83" 88 | }, 89 | "highr": { 90 | "Package": "highr", 91 | "Version": "0.8", 92 | "Source": "Repository", 93 | "Repository": "CRAN", 94 | "Hash": "4dc5bb88961e347a0f4d8aad597cbfac" 95 | }, 96 | "htmltools": { 97 | "Package": "htmltools", 98 | "Version": "0.4.0", 99 | "Source": "Repository", 100 | "Repository": "CRAN", 101 | "Hash": "2d7691222f82f41e93f6d30f169bd5e1" 102 | }, 103 | "jsonlite": { 104 | "Package": "jsonlite", 105 | "Version": "1.6", 106 | "Source": "Repository", 107 | "Repository": "CRAN", 108 | "Hash": "bc5739654d032acf531356e32e0d0f54" 109 | }, 110 | "knitr": { 111 | "Package": "knitr", 112 | "Version": 
"1.26", 113 | "Source": "Repository", 114 | "Repository": "CRAN", 115 | "Hash": "a1f86bbc39ae43c6ae183223a52527a1" 116 | }, 117 | "lattice": { 118 | "Package": "lattice", 119 | "Version": "0.20-38", 120 | "Source": "Repository", 121 | "Repository": "CRAN", 122 | "Hash": "848f8c593fd1050371042d18d152e3d7" 123 | }, 124 | "magrittr": { 125 | "Package": "magrittr", 126 | "Version": "1.5", 127 | "Source": "Repository", 128 | "Repository": "CRAN", 129 | "Hash": "1bb58822a20301cee84a41678e25d9b7" 130 | }, 131 | "markdown": { 132 | "Package": "markdown", 133 | "Version": "1.1", 134 | "Source": "Repository", 135 | "Repository": "CRAN", 136 | "Hash": "61e4a10781dd00d7d81dd06ca9b94e95" 137 | }, 138 | "mgcv": { 139 | "Package": "mgcv", 140 | "Version": "1.8-31", 141 | "Source": "Repository", 142 | "Repository": "CRAN", 143 | "Hash": "4bb7e0c4f3557583e1e8d3c9ffb8ba5c" 144 | }, 145 | "mime": { 146 | "Package": "mime", 147 | "Version": "0.7", 148 | "Source": "Repository", 149 | "Repository": "CRAN", 150 | "Hash": "f085cb5d1548336cafa5ee7ec56d7e34" 151 | }, 152 | "nlme": { 153 | "Package": "nlme", 154 | "Version": "3.1-142", 155 | "Source": "Repository", 156 | "Repository": "CRAN", 157 | "Hash": "557d78d7eac2c1090ee58647a6274142" 158 | }, 159 | "renv": { 160 | "Package": "renv", 161 | "Version": "0.9.2", 162 | "Source": "Repository", 163 | "Repository": "CRAN", 164 | "Hash": "5181d5f316c7a6589219866d640e004c" 165 | }, 166 | "reticulate": { 167 | "Package": "reticulate", 168 | "Version": "1.13", 169 | "Source": "Repository", 170 | "Repository": "CRAN", 171 | "Hash": "b79cab863b9d9c302467f6cf3ab00f6d" 172 | }, 173 | "rlang": { 174 | "Package": "rlang", 175 | "Version": "0.4.2", 176 | "Source": "Repository", 177 | "Repository": "CRAN", 178 | "Hash": "ff31d958a041593f58ba04fc637d90dd" 179 | }, 180 | "rmarkdown": { 181 | "Package": "rmarkdown", 182 | "Version": "1.18", 183 | "Source": "Repository", 184 | "Repository": "CRAN", 185 | "Hash": "2a1b6baa83112f3c8e1909f3989ea9d9" 186 | }, 
187 | "sessioninfo": { 188 | "Package": "sessioninfo", 189 | "Version": "1.1.1", 190 | "Source": "Repository", 191 | "Repository": "CRAN", 192 | "Hash": "308013098befe37484df72c39cf90d6e" 193 | }, 194 | "stringi": { 195 | "Package": "stringi", 196 | "Version": "1.4.3", 197 | "Source": "Repository", 198 | "Repository": "CRAN", 199 | "Hash": "74a50760af835563fb2c124e66aa134e" 200 | }, 201 | "stringr": { 202 | "Package": "stringr", 203 | "Version": "1.4.0", 204 | "Source": "Repository", 205 | "Repository": "CRAN", 206 | "Hash": "0759e6b6c0957edb1311028a49a35e76" 207 | }, 208 | "tinytex": { 209 | "Package": "tinytex", 210 | "Version": "0.17", 211 | "Source": "Repository", 212 | "Repository": "CRAN", 213 | "Hash": "9f259855f66a72efd6a49f1689af935b" 214 | }, 215 | "withr": { 216 | "Package": "withr", 217 | "Version": "2.1.2", 218 | "Source": "Repository", 219 | "Repository": "CRAN", 220 | "Hash": "aa57ed55ff2df4bea697a07df528993d" 221 | }, 222 | "xfun": { 223 | "Package": "xfun", 224 | "Version": "0.11", 225 | "Source": "Repository", 226 | "Repository": "CRAN", 227 | "Hash": "9ec720c772e46177f8a78792939f4bef" 228 | }, 229 | "yaml": { 230 | "Package": "yaml", 231 | "Version": "2.2.0", 232 | "Source": "Repository", 233 | "Repository": "CRAN", 234 | "Hash": "c78bdf1d16bd4ec7ecc86c6986d53309" 235 | } 236 | } 237 | } 238 | -------------------------------------------------------------------------------- /renv/.gitignore: -------------------------------------------------------------------------------- 1 | library/ 2 | python/ 3 | staging/ 4 | -------------------------------------------------------------------------------- /renv/activate.R: -------------------------------------------------------------------------------- 1 | 2 | local({ 3 | 4 | # the requested version of renv 5 | version <- "0.9.2" 6 | 7 | # avoid recursion 8 | if (!is.na(Sys.getenv("RENV_R_INITIALIZING", unset = NA))) 9 | return(invisible(TRUE)) 10 | 11 | # signal that we're loading renv during R startup 12 | 
Sys.setenv("RENV_R_INITIALIZING" = "true") 13 | on.exit(Sys.unsetenv("RENV_R_INITIALIZING"), add = TRUE) 14 | 15 | # signal that we've consented to use renv 16 | options(renv.consent = TRUE) 17 | 18 | # load the 'utils' package eagerly -- this ensures that renv shims, which 19 | # mask 'utils' packages, will come first on the search path 20 | library(utils, lib.loc = .Library) 21 | 22 | # check to see if renv has already been loaded 23 | if ("renv" %in% loadedNamespaces()) { 24 | 25 | # if renv has already been loaded, and it's the requested version of renv, 26 | # nothing to do 27 | spec <- .getNamespaceInfo(.getNamespace("renv"), "spec") 28 | if (identical(spec[["version"]], version)) 29 | return(invisible(TRUE)) 30 | 31 | # otherwise, unload and attempt to load the correct version of renv 32 | unloadNamespace("renv") 33 | 34 | } 35 | 36 | # construct path to renv in library 37 | libpath <- local({ 38 | 39 | root <- Sys.getenv("RENV_PATHS_LIBRARY", unset = "renv/library") 40 | prefix <- paste("R", getRversion()[1, 1:2], sep = "-") 41 | 42 | # include SVN revision for development versions of R 43 | # (to avoid sharing platform-specific artefacts with released versions of R) 44 | devel <- 45 | identical(R.version[["status"]], "Under development (unstable)") || 46 | identical(R.version[["nickname"]], "Unsuffered Consequences") 47 | 48 | if (devel) 49 | prefix <- paste(prefix, R.version[["svn rev"]], sep = "-r") 50 | 51 | file.path(root, prefix, R.version$platform) 52 | 53 | }) 54 | 55 | # try to load renv from the project library 56 | if (requireNamespace("renv", lib.loc = libpath, quietly = TRUE)) { 57 | 58 | # warn if the version of renv loaded does not match 59 | loadedversion <- utils::packageDescription("renv", fields = "Version") 60 | if (version != loadedversion) { 61 | 62 | # assume four-component versions are from GitHub; three-component 63 | # versions are from CRAN 64 | components <- strsplit(loadedversion, "[.-]")[[1]] 65 | remote <- if 
(length(components) == 4L) 66 | paste("rstudio/renv", loadedversion, sep = "@") 67 | else 68 | paste("renv", loadedversion, sep = "@") 69 | 70 | fmt <- paste( 71 | "renv %1$s was loaded from project library, but renv %2$s is recorded in lockfile.", 72 | "Use `renv::record(\"%3$s\")` to record this version in the lockfile.", 73 | "Use `renv::restore(packages = \"renv\")` to install renv %2$s into the project library.", 74 | sep = "\n" 75 | ) 76 | 77 | msg <- sprintf(fmt, loadedversion, version, remote) 78 | warning(msg, call. = FALSE) 79 | 80 | } 81 | 82 | # load the project 83 | return(renv::load()) 84 | 85 | } 86 | 87 | # failed to find renv locally; we'll try to install from GitHub. 88 | # first, set up download options as appropriate (try to use GITHUB_PAT) 89 | install_renv <- function() { 90 | 91 | message("Failed to find installation of renv -- attempting to bootstrap...") 92 | 93 | # ensure .Rprofile doesn't get executed 94 | rpu <- Sys.getenv("R_PROFILE_USER", unset = NA) 95 | Sys.setenv(R_PROFILE_USER = "") 96 | on.exit({ 97 | if (is.na(rpu)) 98 | Sys.unsetenv("R_PROFILE_USER") 99 | else 100 | Sys.setenv(R_PROFILE_USER = rpu) 101 | }, add = TRUE) 102 | 103 | # prepare download options 104 | pat <- Sys.getenv("GITHUB_PAT") 105 | if (nzchar(Sys.which("curl")) && nzchar(pat)) { 106 | fmt <- "--location --fail --header \"Authorization: token %s\"" 107 | extra <- sprintf(fmt, pat) 108 | saved <- options("download.file.method", "download.file.extra") 109 | options(download.file.method = "curl", download.file.extra = extra) 110 | on.exit(do.call(base::options, saved), add = TRUE) 111 | } else if (nzchar(Sys.which("wget")) && nzchar(pat)) { 112 | fmt <- "--header=\"Authorization: token %s\"" 113 | extra <- sprintf(fmt, pat) 114 | saved <- options("download.file.method", "download.file.extra") 115 | options(download.file.method = "wget", download.file.extra = extra) 116 | on.exit(do.call(base::options, saved), add = TRUE) 117 | } 118 | 119 | # fix up repos 120 | 
repos <- getOption("repos") 121 | on.exit(options(repos = repos), add = TRUE) 122 | repos[repos == "@CRAN@"] <- "https://cloud.r-project.org" 123 | options(repos = repos) 124 | 125 | # check for renv on CRAN matching this version 126 | db <- as.data.frame(available.packages(), stringsAsFactors = FALSE) 127 | if ("renv" %in% rownames(db)) { 128 | entry <- db["renv", ] 129 | if (identical(entry$Version, version)) { 130 | message("* Installing renv ", version, " ... ", appendLF = FALSE) 131 | dir.create(libpath, showWarnings = FALSE, recursive = TRUE) 132 | utils::install.packages("renv", lib = libpath, quiet = TRUE) 133 | message("Done!") 134 | return(TRUE) 135 | } 136 | } 137 | 138 | # try to download renv 139 | message("* Downloading renv ", version, " ... ", appendLF = FALSE) 140 | prefix <- "https://api.github.com" 141 | url <- file.path(prefix, "repos/rstudio/renv/tarball", version) 142 | destfile <- tempfile("renv-", fileext = ".tar.gz") 143 | on.exit(unlink(destfile), add = TRUE) 144 | utils::download.file(url, destfile = destfile, mode = "wb", quiet = TRUE) 145 | message("Done!") 146 | 147 | # attempt to install it into project library 148 | message("* Installing renv ", version, " ... 
", appendLF = FALSE) 149 | dir.create(libpath, showWarnings = FALSE, recursive = TRUE) 150 | 151 | # invoke using system2 so we can capture and report output 152 | bin <- R.home("bin") 153 | exe <- if (Sys.info()[["sysname"]] == "Windows") "R.exe" else "R" 154 | r <- file.path(bin, exe) 155 | args <- c("--vanilla", "CMD", "INSTALL", "-l", shQuote(libpath), shQuote(destfile)) 156 | output <- system2(r, args, stdout = TRUE, stderr = TRUE) 157 | message("Done!") 158 | 159 | # check for successful install 160 | status <- attr(output, "status") 161 | if (is.numeric(status) && !identical(status, 0L)) { 162 | text <- c("Error installing renv", "=====================", output) 163 | writeLines(text, con = stderr()) 164 | } 165 | 166 | 167 | } 168 | 169 | try(install_renv()) 170 | 171 | # try again to load 172 | if (requireNamespace("renv", lib.loc = libpath, quietly = TRUE)) { 173 | message("Successfully installed and loaded renv ", version, ".") 174 | return(renv::load()) 175 | } 176 | 177 | # failed to download or load renv; warn the user 178 | msg <- c( 179 | "Failed to find an renv installation: the project will not be loaded.", 180 | "Use `renv::activate()` to re-initialize the project." 181 | ) 182 | 183 | warning(paste(msg, collapse = "\n"), call. 
= FALSE) 184 | 185 | }) 186 | -------------------------------------------------------------------------------- /renv/settings.dcf: -------------------------------------------------------------------------------- 1 | external.libraries: 2 | ignored.packages: 3 | snapshot.type: packrat 4 | use.cache: TRUE 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | adjustText==0.7.3 2 | cycler==0.10.0 3 | descartes==1.1.0 4 | joblib==0.14.0 5 | jupyter==1.0.0 6 | jupytext==1.3.0 7 | kiwisolver==1.1.0 8 | matplotlib==3.1.2 9 | mizani==0.6.0 10 | numpy==1.17.4 11 | palettable==3.3.0 12 | pandas==0.25.3 13 | patsy==0.5.1 14 | git+https://github.com/jeroenjanssens/plotnine.git 15 | pyparsing==2.4.5 16 | python-dateutil==2.8.1 17 | pytz==2019.3 18 | scikit-learn==0.22 19 | scikit-misc==0.1.1 20 | scipy==1.3.3 21 | statsmodels==0.10.2 22 | --------------------------------------------------------------------------------