├── .dockerignore
├── .gitignore
├── Dockerfile
├── README.md
├── googleAnalytics.html
├── index.Rmd
├── install_packages.R
├── main.py
├── report.sh
├── requirements.txt
└── site
    └── index.html


/.dockerignore:
--------------------------------------------------------------------------------
 1 | # Virtual environment
 2 | venv
 3 | .env
 4 | 
 5 | # Git
 6 | .git
 7 | .gitignore
 8 | 
 9 | # Other
10 | site
11 | .DS_Store
12 | .Rhistory
13 | README.md
14 | requirements.txt
15 | site/index.html
16 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # special to this repo...
 2 | *.csv
 3 | *.json
 4 | data
 5 | 
 6 | # Mac
 7 | .DS_Store
 8 | 
 9 | # Excel
10 | *.xlsx
11 | 
12 | # R
13 | .Rhistory
14 | .Rproj.user
15 | *.Rproj
16 | 
17 | # Byte-compiled / optimized / DLL files
18 | __pycache__/
19 | *.py[cod]
20 | *$py.class
21 | 
22 | # Distribution / packaging
23 | .Python
24 | build/
25 | develop-eggs/
26 | dist/
27 | downloads/
28 | eggs/
29 | .eggs/
30 | lib/
31 | lib64/
32 | parts/
33 | sdist/
34 | var/
35 | wheels/
36 | pip-wheel-metadata/
37 | share/python-wheels/
38 | *.egg-info/
39 | .installed.cfg
40 | *.egg
41 | MANIFEST
42 | 
43 | # Jupyter Notebook
44 | .ipynb_checkpoints
45 | 
46 | # IPython
47 | profile_default/
48 | ipython_config.py
49 | 
50 | # pyenv
51 | .python-version
52 | 
53 | # Environments
54 | .env
55 | .venv
56 | env/
57 | venv/
58 | ENV/
59 | env.bak/
60 | venv.bak/
61 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Base R with RMarkdown support
 2 | FROM rocker/verse
 3 | LABEL maintainer="aboghoss@broadinstitute.org"
 4 | 
 5 | ARG BUILD_DATE
 6 | 
 7 | # Image metadata; every key shares the org.label-schema.* namespace
 8 | LABEL org.label-schema.name="bargraph"
 9 | LABEL org.label-schema.build-date=$BUILD_DATE
10 | LABEL org.label-schema.url="https://a-few-beers-later.surge.sh"
11 | LABEL org.label-schema.vcs-url="https://github.com/alexdanilowicz/BarGraph"
12 | LABEL org.label-schema.schema-version="0.0.1"
13 | 
14 | # Set working directory
15 | WORKDIR /
16 | 
17 | # Install R packages first so that editing the report or the wrapper
18 | # script does not invalidate the (slow) package-install layer
19 | COPY ./install_packages.R /install_packages.R
20 | RUN Rscript /install_packages.R
21 | 
22 | # Copy the report template and its wrapper script into the image
23 | COPY ./index.Rmd /index.Rmd
24 | COPY ./report.sh /bargraph/bin/report.sh
25 | 
26 | # Make report.sh executable and reachable by name
27 | ENV PATH="/bargraph/bin:$PATH"
28 | RUN ["chmod", "-R", "+x", "/bargraph/bin"]
29 | 
30 | # Entrypoint to run container
31 | ENTRYPOINT ["report.sh"]
32 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # BarGraph 🍻
  2 | 
  3 | > Get your user data with the Untappd API and run fancy stats on it.
  4 | 
  5 | ## Local Development 📊
  6 | 
  7 | We use a simple Python script to hit the Untappd API. We use R to make graphs and run stats.
  8 | 
  9 | ### Getting Untappd API Secrets
 10 | 
 11 | To run the Python script, you'll need an Untappd `CLIENT_ID` and `CLIENT_SECRET`. Put those in a `.env` file in the root of the project, like so:
 12 | 
 13 | ```
 14 | echo "CLIENT_ID=123" >> .env
 15 | echo "CLIENT_SECRET=456" >> .env
 16 | ```
 17 | 
 18 | You'll need to apply for an [Untappd API key](https://untappd.com/api/docs).
 19 | 
 20 | You'll also need to put USERNAMES in the `.env` file as a list, like so:
 21 | 
 22 | ```
 23 | echo "USERNAMES=alexdannylow,andrewbogo" >> .env
 24 | ```
 25 | 
 26 | The users cannot be private on Untappd.
 27 | 
 28 | For a single username, you can exclude the comma:
 29 | 
 30 | ```
 31 | "USERNAMES=alexdannylow"
 32 | ```
 33 | 
 34 | ### Running the Python Script
 35 | 
 36 | To create the outfile.csv, you'll need to run the Python script to hit the Untappd API.
 37 | 
 38 | #### First time?
 39 | 
 40 | Create virtual env:
 41 | 
 42 | `python3 -m venv venv`
 43 | 
 44 | In the future, we would like to dockerize this.
 45 | 
 46 | #### Then:
 47 | 
 48 | `source venv/bin/activate`
 49 | 
 50 | `pip3 install -r requirements.txt`
 51 | 
 52 | Once everything is installed, you can run the script.
 53 | 
 54 | ```
 55 | python3 main.py --help
 56 | usage: main.py [-h] [--force] [--outfile OUTFILE]
 57 |                [--number-of-unique-beers NUMBER_OF_UNIQUE_BEERS]
 58 | 
 59 | Hit the Untappd API for user data 🍻.
 60 | 
 61 | optional arguments:
 62 |   -h, --help            show this help message and exit
 63 |   --force               Actually make a request. Used so you don't blow
 64 |                         through your Untappd API limit. (default: False)
 65 |   --outfile OUTFILE     Name of outfile. Should match filename in R script.
 66 |                         (default: data.csv)
 67 |   --number-of-unique-beers NUMBER_OF_UNIQUE_BEERS
 68 |                         How many unique beers for each user? (default: 49)
 69 | ```
 70 | 
 71 | Example usage:
 72 | 
 73 | ```
 74 | python3 main.py --force --outfile "allBeers.csv" --number-of-unique-beers 49
 75 | ```
 76 | 
 77 | Note that we ignore `.csv` via our `.gitignore`.
 78 | 
 79 | ### Generating a report with R
 80 | 
 81 | #### First time?
 82 | 
 83 | Install [Docker Desktop](https://docs.docker.com/desktop/) for your system.
 84 | 
 85 | Then, pull the Docker image from Docker Hub:
 86 | 
 87 | `docker pull aboghoss/bargraph:v0.0.1`
 88 | 
 89 | #### Then
 90 | 
 91 | ```
 92 | docker run -it \
 93 |   -v <DATA_DIRECTORY>:/data \
 94 |   -v <OUTPUT_DIRECTORY>:/out_dir \
 95 |   aboghoss/bargraph:v0.0.1 \
 96 |   -d /data/<FILENAME> \
 97 |   -o /out_dir \
 98 |   -n <OUTFILE_NAME> \
 99 |   -a <ANONYMIZE>
100 | ```
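101 | Replace `<DATA_DIRECTORY>` with the path to the local directory containing your Untappd data (e.g. `~/Desktop` if your file is `~/Desktop/beer_data.csv`), `<FILENAME>` with the name of the data file inside that directory, `<OUTPUT_DIRECTORY>` with the path to the directory you would like the report written to, and `<OUTFILE_NAME>` with the name of the report file to create.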
102 | 
103 | The `-v` arguments mount directories into the container, allowing it to read and write in those directories. If the directory you want to write to is the same as the one containing your data, remove line 94 and edit line 97 to be `-o /data`.
104 | 
105 | ##### Arguments
106 | 
107 | `-d`: string name of your data file
108 | 
109 | `-o`: string name of directory to write to
110 | 
111 | `-n`: string name of the report that will be created
112 | 
113 | `-a`: `0` to keep usernames, `1` to anonymize data
114 | 
115 | ## Deploying 🚀
116 | 
117 | In our case, the output index.html file in the `site` directory is deployed via [surge](https://surge.sh/) to [http://a-few-beers-later.surge.sh/](http://a-few-beers-later.surge.sh/)
118 | 
119 | ```
120 | surge ./site a-few-beers-later.surge.sh
121 | ```
122 | 


--------------------------------------------------------------------------------
/googleAnalytics.html:
--------------------------------------------------------------------------------
 1 | <!-- Yes, we use Google Analytics. -->
 2 | <script
 3 |   async
 4 |   src="https://www.googletagmanager.com/gtag/js?id=G-N5LLMRDZPC"
 5 | ></script>
 6 | <script>
 7 |   window.dataLayer = window.dataLayer || [];
 8 |   function gtag() {
 9 |     dataLayer.push(arguments);
10 |   }
11 |   gtag("js", new Date());
12 | 
13 |   gtag("config", "G-N5LLMRDZPC");
14 | </script>
15 | 


--------------------------------------------------------------------------------
/index.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "A few beers later... `r emo::ji('clinking_beer_mugs')`"
  3 | params:
  4 |   data_file: ./data/allBeers2.csv
  5 |   anonymize: 1
  6 | output:
  7 |   html_document:
  8 |     theme: paper
  9 |     highlight: kate
 10 |     toc: false
 11 | ---
 12 | 
 13 | ```{r setup, include=FALSE}
 14 | # Include and silence all chunks
 15 | knitr::opts_chunk$set(include = T, echo = F, warning = F, message = F, fig.width = 12)
 16 | 
 17 | # Load libraries
 18 | library(tidyverse)
 19 | library(caret)  # model setup
 20 | library(ranger)  # random forest
 21 | library(RColorBrewer)  # colors
 22 | library(ggthemes)  # colors
 23 | library(cowplot)  # themes
 24 | library(GGally)  # just for pairs plot (manually do?)
 25 | library(DT)  # javascript tables
 26 | library(emo)  # emojis (has to be installed with devtools)
 27 | library(randomNames)  # for making fake names
 28 | 
 29 | # Load data
 30 | all_beers <- data.table::fread(params$data_file) %>%
 31 |   dplyr::select(-V1) %>%
 32 |   dplyr::group_by(beer.beer_name, brewery.brewery_name, beer.bid) %>%
 33 |   dplyr::mutate(beer.rating_score = max(beer.rating_score)) %>%
 34 |   dplyr::ungroup() %>%
 35 |   dplyr::mutate(beer.rating_score = ifelse(beer.rating_score == 0, NA, beer.rating_score),
 36 |                 beer.beer_ibu = ifelse(beer.beer_ibu == 0, NA, beer.beer_ibu),
 37 |                 beer.beer_abv = ifelse(beer.beer_abv == 0, NA, beer.beer_abv))
 38 | 
 39 | n_users <- length(unique(all_beers$user))
 40 | n_beers <- nrow(all_beers)
 41 | 
 42 | # Anonymizes data if specified
 43 | if (as.numeric(params$anonymize) == 1) {
 44 | 
 45 |   # Maintains 3 original names for consistency otherwise makes random
 46 |   if (n_users == 3) {
 47 |     fake_users <- c("Alice", "Bob", "Carl")
 48 |   } else {
 49 |     fake_users <- randomNames::randomNames(n = n_users, which.names = "first")
 50 |   }
 51 | 
 52 |   # Scrubs usernames from report
 53 |   all_beers <- all_beers %>%
 54 |     dplyr::right_join(dplyr::distinct(., user) %>%
 55 |                         dplyr::mutate(id = rank(user))) %>%
 56 |     dplyr::mutate(user = fake_users[id]) %>%
 57 |     dplyr::select(-id)
 58 | }
 59 | 
 60 | # List of users
 61 | users <- distinct(all_beers, user)$user
 62 | 
 63 | # Set plot theme to "cowplot"
 64 | theme_set(theme_cowplot())
 65 | ```
 66 | 
 67 | # How do our distributions stack up? `r emo::ji('bar_chart')` {.tabset .tabset-pills}
 68 | 
 69 | ## Distributions
 70 | 
 71 | ```{r distributions}
 72 | # Violin plot with boxplot inside
 73 | all_beers %>%
 74 |   ggplot(aes(x = user, y = rating_score, fill = user)) +
 75 |   geom_violin() +
 76 |   geom_boxplot(width = 0.2, outlier.alpha = 0) +
 77 |   scale_fill_tableau() + ylim(c(0, 5)) +
 78 |   labs(x = "User", y = "Rating", fill = "User", title = paste("Total beers:", n_beers))
 79 | ```
 80 | 
 81 | ## Do we agree on ratings?
 82 | 
 83 | ```{r correlations, fig.height=8}
 84 | # Pairs plot
 85 | all_beers %>%
 86 |   dplyr::select(user, beer.beer_name, brewery.brewery_name, rating_score, beer.rating_score) %>%
 87 |   tidyr::pivot_wider(names_from = "user", values_from = "rating_score") %>%  # make wide table for ggpairs
 88 |   dplyr::rename(global = beer.rating_score) %>%
 89 |   GGally::ggpairs(columns = c(3:ncol(.)))
 90 | ```
 91 | 
 92 | ## Controversial beers
 93 | 
 94 | ```{r controversial}
 95 | # Find beers with largest difference between our ratings
 96 | controversial <- all_beers %>%
 97 |   dplyr::select(beer.beer_name, brewery.brewery_name, user, rating_score, beer.rating_score) %>%
 98 |   dplyr::group_by(beer.beer_name, brewery.brewery_name) %>%
 99 |   dplyr::mutate(max_diff = max(rating_score) - min(rating_score)) %>%
100 |   dplyr::ungroup() %>%
101 |   tidyr::pivot_wider(names_from = "user", values_from = "rating_score") %>%
102 |   dplyr::filter(max_diff >= 1)
103 | 
104 | # Make into interactive table
105 | DT::datatable(controversial %>% dplyr::arrange(desc(max_diff)), style="bootstrap", width="100%",
106 |               options = list(lengthChange = FALSE, scrollY = "300px", paging = FALSE),
107 |               colnames = c("Beer", "Brewery", "Rating", "Maximum difference", colnames(controversial)[5:ncol(controversial)]),
108 |               filter = "top") %>%
109 |   DT::formatRound(columns = c("beer.rating_score", "max_diff"), digits = 3)
110 | ```
111 | 
112 | ## Differences from global
113 | 
114 | ```{r global diffs}
115 | # Difference from global average (boxplot)
116 | all_beers %>%
117 |   dplyr::filter(beer.rating_score != 0) %>%
118 |   dplyr::select(user, beer.beer_name, brewery.brewery_name, rating_score, beer.rating_score) %>%
119 |   dplyr::mutate(glob_diff = rating_score - beer.rating_score) %>%  # calculate difference
120 |   ggplot(aes(x = user, y = glob_diff)) +
121 |   geom_boxplot(outlier.alpha = 0) + geom_jitter(aes(color = user)) +
122 |   scale_color_tableau() +
123 |   labs(x = "User", y = "Rating difference", color = "User")
124 | ```
125 | 
126 | # Breweries `r emo::ji('factory')` {.tabset .tabset-pills}
127 | 
128 | Ratings by brewery for each user. User must have had 5 or more unique beers from the brewery.
129 | 
130 | ## Top rated
131 | 
132 | ```{r breweries, fig.height=12}
133 | # Our ratings higher than 4.5
134 | p1 <- all_beers %>%
135 |   dplyr::filter(rating_score >= 4.5) %>%
136 |   dplyr::group_by(brewery.brewery_name) %>%
137 |   dplyr::mutate(n = n()) %>%
138 |   dplyr::ungroup() %>%
139 |   ggplot(aes(y = reorder(brewery.brewery_name, n), fill = user)) +
140 |   geom_bar() + scale_fill_tableau() +
141 |   labs(x = "Number rated higher than 4.5", y = "Brewery", fill = "User")
142 | # Global ratings higher than 4.25
143 | p2 <- all_beers %>%
144 |   dplyr::filter(beer.rating_score >= 4.25) %>%
145 |   dplyr::distinct(beer.rating_score, brewery.brewery_name) %>%
146 |   dplyr::group_by(brewery.brewery_name) %>%
147 |   dplyr::mutate(n = n()) %>%
148 |   dplyr::ungroup() %>%
149 |   dplyr::mutate(rating_bin = fct_rev(cut(beer.rating_score, seq(4, 5, 0.1)))) %>%
150 |   ggplot(aes(y = reorder(brewery.brewery_name, n), fill = rating_bin)) +
151 |   geom_bar() + scale_fill_brewer(palette = "Blues", direction = -1) +
152 |   labs(x = "Number with average global higher than 4.25", y = "Brewery", fill = "Binned rating")
153 | cowplot::plot_grid(p2, p1, ncol = 1, rel_heights = c(6, 10))
154 | ```
155 | 
156 | ## Average ratings {.tabset .tabset-dropdown}
157 | 
158 | ```{r brewery averages}
159 | # Ratings by brewery
160 | brewery_ratings <- all_beers %>%
161 |   dplyr::group_by(user, brewery.brewery_name) %>%
162 |   dplyr::mutate(n = n()) %>%
163 |   dplyr::ungroup() %>%
164 |   dplyr::filter(n >= 5)  # user has had >= 5
165 | 
166 | # Loop through each  user and make plot
167 | brewery_plots <- list()  # store plots
168 | for (i in 1:length(users)) {
169 |   p <- brewery_ratings %>%
170 |     dplyr::filter(user == users[i]) %>%  # filter to current user
171 |     ggplot(aes(x = rating_score, y = fct_reorder(brewery.brewery_name, rating_score, median),
172 |                color = brewery.location.brewery_state)) +
173 |     geom_boxplot(outlier.alpha = 0) + geom_jitter() +
174 |     scale_color_tableau(palette = "Tableau 20") +
175 |     xlim(c(0, 5)) +
176 |     labs(x = "Rating", y = "Brewery", color = "State")
177 |   brewery_plots[[i]] <- p
178 | }
179 | ```
180 | 
181 | ### Everyone
182 | 
183 | ```{r brewery averages everyone, fig.height=18}
184 | # To find our favorites as a group
185 | everyone <- all_beers %>%
186 |   dplyr::mutate(brewery.location.brewery_state =
187 |                   ifelse(nchar(brewery.location.brewery_state) == 2,
188 |                          brewery.location.brewery_state, "Other")) %>%
189 |   dplyr::group_by(brewery.brewery_name) %>%
190 |   dplyr::mutate(n = n()) %>%
191 |   dplyr::ungroup() %>%
192 |   dplyr::filter(n >= 5)
193 | 
194 | # Make new palette in tableau scheme of correct size
195 | n_pal <- colorRampPalette(tableau_color_pal(palette = "Tableau 20")(20))
196 | pal <- n_pal(length(unique(everyone$brewery.location.brewery_state)))
197 | 
198 | everyone %>%
199 |   ggplot(aes(x = rating_score, y = fct_reorder(brewery.brewery_name, rating_score, median),
200 |              color = brewery.location.brewery_state)) +
201 |   geom_boxplot(outlier.alpha = 0) + geom_jitter() +
202 |   scale_color_manual(values = pal) +
203 |   xlim(c(0, 5)) +
204 |   labs(x = "Rating", y = "Brewery", color = "State")
205 | ```
206 | 
207 | ```{r brewery averages plot, results='asis', fig.height=8}
208 | # Emit one markdown sub-heading per user, followed by that user's brewery plot
209 | for (i in seq_along(users)) {
210 |   cat(sprintf('\n\n### %s {.tabset .tabset-pills}\n\n', users[i]))
211 |   print(brewery_plots[[i]])
212 |   cat("\n\n")
213 | }
214 | ```
215 | 
216 | ## Brewery rating table
217 | 
218 | ```{r brewery table}
219 | # Table of brewery ratings
220 | brew_tab <- all_beers %>%
221 |   dplyr::group_by(user, brewery.brewery_name) %>%
222 |   dplyr::summarise(med_rating = median(rating_score), mean_rating = mean(rating_score), n = n(), .groups = "drop") %>%
223 |   dplyr::filter(n >= 5) %>%  # user has  had >= 5
224 |   dplyr::group_by(user) %>%
225 |   dplyr::mutate(rank_med = rank(desc(med_rating), ties.method = "min"),
226 |                 rank_mean = rank(desc(mean_rating), ties.method = "min")) %>%  # rank breweries (ties get same value)
227 |   dplyr::ungroup()
228 | 
229 | # Make into interactive table
230 | DT::datatable(brew_tab %>% dplyr::arrange(rank_med), style="bootstrap", width="100%",
231 |               options = list(lengthChange = FALSE, scrollY = "300px", paging = FALSE),
232 |               colnames = c("User", "Brewery", "Median", "Mean", "Beers had", "Rank (median)", "Rank (mean)"),
233 |               filter = "top") %>%
234 |   DT::formatRound(columns = c("mean_rating", "med_rating"), digits = 3)  #round values
235 | ```
236 | 
237 | # Styles `r emo::ji('woman_dancing')` {.tabset .tabset-pills}
238 | 
239 | Ratings by style for each user. **Note:** "meta" style refers to the  first part of the style while substyle refers to the entire style (e.g. _IPA - Imperial/Double_ would have a meta style of _IPA_ and a "substyle" style of _IPA - Imperial/Double_).
240 | 
241 | ## Meta styles plots {.tabset .tabset-dropdown}
242 | 
243 | Styles broken down by first part of style annotation
244 | 
245 | ```{r meta style, fig.height=8}
246 | # Styles by first word ("meta")
247 | meta_style_tab <- all_beers %>%
248 |   dplyr::mutate(meta_style = str_trim(word(beer.beer_style, 1, sep = fixed("-")))) %>%
249 |   dplyr::group_by(user, meta_style) %>%
250 |   dplyr::mutate(n = n()) %>%
251 |   dplyr::ungroup()
252 | 
253 | # Loop through users and make plots
254 | style_plots <- list()
255 | for (i in 1:length(users)) {
256 |   p <- meta_style_tab %>%
257 |     dplyr::filter(user == users[i]) %>%
258 |     ggplot(aes(x = rating_score, y = fct_reorder(meta_style, rating_score, median),
259 |                color = beer.rating_score)) +
260 |     geom_boxplot(outlier.alpha = 0) +
261 |     geom_jitter() + scale_color_distiller(direction = 1, palette = "Blues", guide = "colourbar") +
262 |     xlim(c(0, 5)) +
263 |     labs(x = "Rating", y = "Style", color = "Global rating")
264 |   style_plots[[i]] <- p
265 | }
266 | ```
267 | 
268 | ### Everyone
269 | 
270 | ```{r meta style all, fig.height=10}
271 | meta_style_tab %>%
272 |   ggplot(aes(x = rating_score, y = fct_reorder(meta_style, rating_score, median),
273 |              color = beer.rating_score)) +
274 |   geom_boxplot(outlier.alpha = 0) +
275 |   geom_jitter() + scale_color_distiller(direction = 1, palette = "Blues", guide = "colourbar") +
276 |   xlim(c(0, 5)) +
277 |   labs(x = "Rating", y = "Style", color = "Global rating")
278 | ```
279 | 
280 | ```{r meta style plot, results='asis', fig.height=8}
281 | # Emit one markdown sub-heading per user, followed by that user's style plot
282 | for (i in seq_along(users)) {
283 |   cat(sprintf('\n\n### %s {.tabset .tabset-pills}\n\n', users[i]))
284 |   print(style_plots[[i]])
285 |   cat("\n\n")
286 | }
287 | ```
288 | 
289 | ## Meta styles table
290 | 
291 | ```{r meta style table}
292 | # Ratings by meta style
293 | meta_table <- meta_style_tab %>%
294 |   dplyr::group_by(user, meta_style) %>%
295 |   dplyr::summarise(med_rating = median(rating_score), mean_rating = mean(rating_score), n = n(), .groups = "drop") %>%
296 |   dplyr::group_by(user) %>%
297 |   dplyr::mutate(rank_med = rank(desc(med_rating), ties.method = "min"),
298 |                 rank_mean = rank(desc(mean_rating), ties.method = "min")) %>%
299 |   dplyr::ungroup()
300 | 
301 | # Make into interactive table
302 | DT::datatable(meta_table %>% dplyr::arrange(rank_med), style="bootstrap", width="100%",
303 |               options = list(lengthChange = FALSE, scrollY = "300px", paging = FALSE),
304 |               colnames = c("User", "Style", "Median", "Mean", "Beers had", "Rank (median)", "Rank (mean)"),
305 |               filter = "top") %>%
306 |   DT::formatRound(columns = c("mean_rating", "med_rating"), digits = 3)
307 | ```
308 | 
309 | ## Substyles table
310 | 
311 | More detailed style information
312 | 
313 | ```{r granular style table}
314 | # Ratings by granular style
315 | style_tab <- all_beers %>%
316 |   dplyr::group_by(user, beer.beer_style) %>%
317 |   dplyr::summarise(med_rating = median(rating_score), mean_rating = mean(rating_score), n = n(), .groups = "drop") %>%
318 |   dplyr::group_by(user) %>%
319 |   dplyr::mutate(rank_med = rank(desc(med_rating), ties.method = "min"),
320 |                 rank_mean = rank(desc(mean_rating), ties.method = "min")) %>%
321 |   dplyr::ungroup()
322 | 
323 | # Make into interactive table
324 | DT::datatable(style_tab %>% dplyr::arrange(rank_med), style="bootstrap", width="100%",
325 |               options = list(lengthChange = FALSE, scrollY = "300px", paging = FALSE),
326 |               colnames = c("User", "Substyle", "Median", "Mean", "Beers had", "Rank (median)", "Rank (mean)"),
327 |               filter = "top") %>%
328 |   DT::formatRound(columns = c("mean_rating", "med_rating"), digits = 3)
329 | ```
330 | 
331 | # ABV and IBU `r emo::ji('nauseated_face')` {.tabset .tabset-pills}
332 | 
333 | ```{r abv ibu summary, fig.height=10}
334 | # Pairs plot relating our ratings to ABV and IBU
335 | all_beers %>%
336 |   dplyr::select(user, beer.beer_name, brewery.brewery_name, rating_score,
337 |                 beer.beer_abv, beer.beer_ibu, beer.rating_score) %>%
338 |   tidyr::pivot_wider(names_from = "user", values_from = "rating_score") %>%
339 |   dplyr::rename(global = beer.rating_score, ABV = beer.beer_abv, IBU = beer.beer_ibu) %>%
340 |   GGally::ggpairs(columns = c(3:ncol(.)))
341 | ```
342 | 
343 | # Prediction `r emo::ji('monocle')`
344 | 
345 | I fit a 10-fold cross-validated random forest to predict each user's ratings. Reported here are the Pearson score of each model as well as estimated importance of each feature in prediction (and the associated standard deviation).
346 | 
347 | ```{r prediction}
348 | predictions <- list()  # store actual predictions
349 | results <- list()  # store model stats
350 | 
351 | # Fit a 10-fold CV random forest for each user
352 | for (i in 1:length(users)) {
353 | 
354 |   # Filter to user and relevant columns:
355 |   # beer id, rating, global, num ratings abv, ibu, style, brewery type, brewery state
356 |   user <- all_beers %>%
357 |     dplyr::filter(user == users[[i]]) %>%
358 |     dplyr::select(beer.bid, rating_score, beer.beer_abv, beer.beer_ibu,
359 |                   beer.beer_style, beer.rating_score, brewery.brewery_type,
360 |                   brewery.location.brewery_state, beer.rating_count)
361 | 
362 |   # Remove NAs
363 |   user <- user[apply(user, 1, function(x) !any(is.na(x))), ]
364 | 
365 |   # Make CV folds
366 |   folds <- caret::createFolds(user$rating_score, k = 10)
367 | 
368 |   preds <- user$rating_score
369 |   importances <- list()
370 | 
371 |   # Loop through folds for CV
372 |   for (k in 1:length(folds)) {
373 | 
374 |     # Test and training sets
375 |     fold <- folds[[k]]
376 |     test <- user[fold, ]
377 |     train <- dplyr::anti_join(user, test)
378 | 
379 |     # Fit RF
380 |     res <- ranger::ranger(rating_score ~ ., data = train %>%
381 |                             dplyr::select(-beer.bid),
382 |                           importance = "impurity", respect.unordered.factors = T)
383 | 
384 |     # Store model statistics and predictions
385 |     preds[fold] <- predict(res, test %>% dplyr::select(-rating_score, -beer.bid))$predictions
386 |     imps <- tibble::tibble(feature = names(res$variable.importance),
387 |                            RF.imp = res$variable.importance / sum(res$variable.importance),
388 |                            fold = k)
389 |     importances[[k]] <- imps
390 |   }
391 | 
392 |   # Combine across folds
393 |   importances <- dplyr::bind_rows(importances) %>%
394 |     dplyr::distinct(feature, RF.imp, fold) %>%
395 |     reshape2::acast(feature ~ fold, value.var = "RF.imp")
396 | 
397 |   # Feature importances
398 |   imp_table <- tibble::tibble(feature = rownames(importances),
399 |                               imp.mean = importances %>%
400 |                                 apply(1, function(x) mean(x, na.rm = T)),
401 |                               imp.sd = importances %>%
402 |                                 apply(1, function(x) sd(x, na.rm = T)),
403 |                               imp.stability = importances %>%
404 |                                 apply(1, function(x) mean(!is.na(x)))) %>%
405 |     dplyr::filter(feature != "(Intercept)") %>%
406 |     dplyr::arrange(desc(imp.mean)) %>%
407 |     dplyr::mutate(rank = 1:n()) %>%
408 |     dplyr::mutate(pearson = cor(preds, user$rating_score), user = users[[i]])
409 | 
410 |   # Output to final list
411 |   results[[i]] <- imp_table
412 |   predictions[[i]] <- tibble(predicted = preds, actual = user$rating_score,
413 |                              bid = user$beer.bid, user = users[[i]])
414 | }
415 | 
416 | # Combine across users
417 | results <- dplyr::bind_rows(results); predictions <- dplyr::bind_rows(predictions)
418 | 
419 | # Pearson scores plot
420 | p1 <- results %>%
421 |   dplyr::distinct(user, pearson) %>%
422 |   ggplot(aes(x = user, y = pearson, fill = user)) +
423 |   geom_bar(stat = "identity") +
424 |   scale_fill_tableau() + theme(legend.position = "none") +
425 |   labs(x = "User", y = "Pearson score")
426 | 
427 | # Feature importance plot
428 | p2 <- results %>%
429 |   dplyr::distinct(user, feature, imp.mean, rank) %>%
430 |   ggplot(aes(x = imp.mean, y = fct_reorder(feature, rank, "median", .desc = T), fill = user)) +
431 |   geom_bar(stat = "identity", position = "dodge") +
432 |   scale_fill_tableau() +
433 |   labs(y = "Feature", x = "Mean feature importance", fill = "User")
434 | cowplot::plot_grid(p1, p2, rel_widths = c(1, 3))
435 | ```
436 | 
437 | ```{r prediction table}
438 | # Table of random forest results
439 | DT::datatable(results %>%
440 |                 dplyr::arrange(rank) %>%
441 |                 dplyr::select(c(7, 6, 1, 2, 3)),
442 |               style="bootstrap", width="100%",
443 |               options = list(lengthChange = FALSE, scrollY = "300px", paging = FALSE),
444 |               colnames = c("User", "Pearson Score", "Feature", "Importance", "Importance deviation"),
445 |               filter = "top") %>%
446 |   DT::formatRound(columns = c("pearson", "imp.mean", "imp.sd"), digits = 3)
447 | ```
448 | 


--------------------------------------------------------------------------------
/install_packages.R:
--------------------------------------------------------------------------------
 1 | # Point to CRAN for package repos (HTTPS, so downloads are not fetched in clear text)
 2 | options(repos = c(CRAN = "https://cran.r-project.org"))
 3 | 
 4 | # CRAN packages
 5 | cran_packages <- c(
 6 |   "tidyverse",
 7 |   "RColorBrewer",
 8 |   "ggthemes",
 9 |   "cowplot",
10 |   "GGally",
11 |   "DT",
12 |   "randomNames",
13 |   "devtools",
14 |   "caret",
15 |   "ranger"
16 | )
17 | install.packages(cran_packages)
18 | 
19 | # install.packages() only warns when a package fails to install, so check
20 | # explicitly and stop with an error instead of finishing "successfully"
21 | not_installed <- setdiff(cran_packages, rownames(installed.packages()))
22 | if (length(not_installed) > 0) {
23 |   stop("Failed to install: ", paste(not_installed, collapse = ", "))
24 | }
25 | 
26 | # emo is installed from GitHub via devtools
27 | devtools::install_github("hadley/emo")
28 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
 1 | import csv 
 2 | import requests
 3 | from dotenv import load_dotenv
 4 | import os
 5 | import argparse
 6 | import pandas as pd
 7 | import json 
 8 | 
 9 | load_dotenv()  # pull settings from the .env file into the environment
10 | CLIENT_ID = os.getenv("CLIENT_ID")
11 | CLIENT_SECRET = os.getenv("CLIENT_SECRET")
12 | USERNAMES = (os.getenv("USERNAMES")).split(",")  # NOTE(review): raises AttributeError when USERNAMES is unset
13 | 
14 | URL_ENDPOINT = "https://api.untappd.com/v4/user/beers/"
15 | UNTAPPD_MAX_LIMT = 50  # sent as "limit" below; also the offset step in request_user_beers
16 | DEFAULT_PARAMS = {
17 |   "limit": str(UNTAPPD_MAX_LIMT),
18 |   "sort": "highest_rated_you",
19 |   "client_id": CLIENT_ID,
20 |   "client_secret": CLIENT_SECRET,
21 | }
22 | 
23 | def main():
24 |   blob_by_user = {} 
25 |   for user in USERNAMES:
26 |     blob = {}
27 |     blob['root'] = []
28 |     request_user_beers(user, blob)
29 |     blob_by_user[user] = blob
30 |   
31 |   write_blob_by_user_to_one_csv(blob_by_user)
32 |   print("Done 🍻")
33 | 
34 | def get_arguments():
35 |   """Define and parse the command-line flags: --force, --outfile, --number-of-unique-beers."""
36 |   parser = argparse.ArgumentParser(description='Hit the Untappd API for user data 🍻.',
37 |                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
38 |   parser.add_argument("--force", action='store_true', default=False,
39 |                       help="Actually make a request. Used so you don't blow through your Untappd API limit.")
40 | 
41 |   parser.add_argument("--outfile", type=str, default="data.csv",
42 |                       help="Name of outfile. Should match filename in R script.")
43 | 
44 |   # NOTE: this wrongly assumes all users have checked in
45 |   # roughly the same number of beers. Perhaps it's more appropriate
46 |   # to pass a list of key value pairs
47 |   parser.add_argument("--number-of-unique-beers", type=int, default=49,
48 |                       help="How many unique beers for each user?")
49 | 
50 |   return parser.parse_args()
51 | 
52 | def request_user_beers(user, blob):
53 |     url = URL_ENDPOINT + user
54 | 
55 |     for offset in range(0, args.number_of_unique_beers, UNTAPPD_MAX_LIMT):
56 |       params = DEFAULT_PARAMS
57 |       if offset > 1:
58 |         params = {**DEFAULT_PARAMS, 'offset': str(offset)}
59 |       
60 |       if args.force: 
61 |         r = requests.get(url, params = params)
62 |         response = r.json()
63 |       else: 
64 |         response = { 
65 |           'response': 
66 |             {'beers': 
67 |               {'count': 1, 
68 |               'items': [
69 |                 {
70 |                   user: "test-response"
71 |                 }
72 |               ]
73 |             }
74 |           }
75 |         }
76 | 
77 |       print("Fetched response with offset: " + str(offset) + " for " + user + " 🍺...")
78 |       construct_user_json(blob, response)
79 | 
80 | def construct_user_json(blob, response):
81 |   for beer in response['response']['beers']['items']:
82 |     blob['root'].append(beer)
83 | 
84 | def write_blob_by_user_to_one_csv(blob_by_user):
85 |   data = pd.concat([read_blob_into_df(blob_by_user[user], user) for user in USERNAMES])
86 |   data.to_csv(args.outfile)
87 | 
88 | def read_blob_into_df(blob, user):
89 |   # flatten data and read into frame
90 |   df = pd.json_normalize(blob, record_path =["root"])
91 |   # track which user it came from
92 |   df.insert(0, "user", user)
93 |   return df
94 | 
95 | if __name__ == "__main__":
96 |   args = get_arguments()  # module-level on purpose: read by request_user_beers and write_blob_by_user_to_one_csv
97 |   main()
98 | 


--------------------------------------------------------------------------------
/report.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Read in arguments
 4 | while getopts ":d:o:n:a:" arg; do
 5 |   case $arg in
 6 |     d) # specify beer csv location
 7 |       data=${OPTARG};;
 8 |     o) # output directory
 9 |       out_dir=${OPTARG};;
10 |     n) # name of file
11 |       name=${OPTARG};;
12 |     a) # anonymize or not
13 |       anon=${OPTARG};;
14 |   esac
15 | done
16 | 
17 | echo "$data" "$out_dir" "$name" "$anon"
18 | 
19 | Rscript -e \
20 |   "rmarkdown::render('index.Rmd', output_dir='$out_dir', output_file='$name', \
21 |   params = list(data_file='$data', anonymize='$anon'))"
22 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | certifi==2020.12.5
 2 | chardet==4.0.0
 3 | idna==2.10
 4 | numpy==1.20.1
 5 | pandas==1.2.3
 6 | python-dateutil==2.8.1
 7 | python-dotenv==0.15.0
 8 | pytz==2021.1
 9 | requests==2.25.1
10 | six==1.15.0
11 | urllib3==1.26.4
12 | 


--------------------------------------------------------------------------------