├── .dockerignore ├── .gitignore ├── Dockerfile ├── README.md ├── googleAnalytics.html ├── index.Rmd ├── install_packages.R ├── main.py ├── report.sh ├── requirements.txt └── site └── index.html /.dockerignore: -------------------------------------------------------------------------------- 1 | # Virtual environment 2 | venv 3 | .env 4 | 5 | # Git 6 | .git 7 | .gitignore 8 | 9 | # Other 10 | site 11 | .DS_Store 12 | .Rhistory 13 | README.md 14 | requirements.txt 15 | site/index.html 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # special to this repo... 2 | *.csv 3 | *.json 4 | data 5 | 6 | # Mac 7 | .DS_Store 8 | 9 | # Excel 10 | *.xlsx 11 | 12 | # R 13 | .Rhistory 14 | .Rproj.user 15 | *.Rproj 16 | 17 | # Byte-compiled / optimized / DLL files 18 | __pycache__/ 19 | *.py[cod] 20 | *$py.class 21 | 22 | # Distribution / packaging 23 | .Python 24 | build/ 25 | develop-eggs/ 26 | dist/ 27 | downloads/ 28 | eggs/ 29 | .eggs/ 30 | lib/ 31 | lib64/ 32 | parts/ 33 | sdist/ 34 | var/ 35 | wheels/ 36 | pip-wheel-metadata/ 37 | share/python-wheels/ 38 | *.egg-info/ 39 | .installed.cfg 40 | *.egg 41 | MANIFEST 42 | 43 | # Jupyter Notebook 44 | .ipynb_checkpoints 45 | 46 | # IPython 47 | profile_default/ 48 | ipython_config.py 49 | 50 | # pyenv 51 | .python-version 52 | 53 | # Environments 54 | .env 55 | .venv 56 | env/ 57 | venv/ 58 | ENV/ 59 | env.bak/ 60 | venv.bak/ 61 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Base R with RMarkdown support 2 | FROM rocker/verse 3 | LABEL maintainer="aboghoss@broadinstitute.org" 4 | 5 | ARG BUILD_DATE 6 | 7 | LABEL org.label.schema.name="bargraph" 8 | LABEL org.label-schema.build-date=$BUILD_DATE 9 | LABEL org.label-schema.url="https://a-few-beers-later.surge.sh" 10 | 
LABEL org.label-schema.vcs-url="https://github.com/alexdanilowicz/BarGraph" 11 | LABEL org.label-schema.schema-version="0.0.1" 12 | 13 | # Update some basic packages for R compatibility 14 | 15 | # Set working directory 16 | WORKDIR / 17 | 18 | # Copy scripts into image 19 | COPY ./install_packages.R /install_packages.R 20 | COPY ./index.Rmd /index.Rmd 21 | COPY ./report.sh /bargraph/bin/report.sh 22 | 23 | # Install R packages 24 | RUN Rscript /install_packages.R 25 | 26 | # Make report.sh executable 27 | ENV PATH /bargraph/bin:$PATH 28 | RUN ["chmod", "-R", "+x", "/bargraph/bin"] 29 | 30 | # Entrypoint to run container 31 | ENTRYPOINT ["report.sh"] 32 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BarGraph 🍻 2 | 3 | > Get your user data with the Untappd API and run fancy stats on it. 4 | 5 | ## Local Development 📊 6 | 7 | We use a simple Python script to hit the Untappd API. We use R to make graphs and run stats. 8 | 9 | ### Getting Untappd API Secrets 10 | 11 | To run the Python script, you'll need an Untappd `CLIENT_ID` and `CLIENT_SECRET`. Put those in the `.env` file in the root of project, like so: 12 | 13 | ``` 14 | echo "CLIENT_ID=123" >> .env 15 | echo "CLIENT_SECRET=456" >> .env 16 | ``` 17 | 18 | You'll need to apply for an [Untappd API key](https://untappd.com/api/docs). 19 | 20 | You'll also need to put USERNAMES in the `.env` file as a list, like so: 21 | 22 | ``` 23 | echo "USERNAMES=alexdannylow,andrewbogo" >> .env 24 | ``` 25 | 26 | The users cannot be private on Untappd. 27 | 28 | For a single username, you can exclude the comma: 29 | 30 | ``` 31 | "USERNAMES=alexdannylow" 32 | ``` 33 | 34 | ### Running the Python Script 35 | 36 | To create the outfile.csv, you'll need to run the Python script to hit the Untappd API. 37 | 38 | #### First time? 
39 | 40 | Create virtual env: 41 | 42 | `python3 -m venv venv` 43 | 44 | In the future, we would like to dockerize this. 45 | 46 | #### Then: 47 | 48 | `source venv/bin/activate` 49 | 50 | `pip3 install -r requirements.txt` 51 | 52 | Once everything is installed, you can run the script. 53 | 54 | ``` 55 | python3 main.py --help 56 | usage: main.py [-h] [--force] [--outfile OUTFILE] 57 | [--number-of-unique-beers NUMBER_OF_UNIQUE_BEERS] 58 | 59 | Hit the Untappd API for user data 🍻. 60 | 61 | optional arguments: 62 | -h, --help show this help message and exit 63 | --force Actually make a request. Used so you don't blow 64 | through your Untappd API limit. (default: False) 65 | --outfile OUTFILE Name of outfile. Should match filename in R script. 66 | (default: data.csv) 67 | --number-of-unique-beers NUMBER_OF_UNIQUE_BEERS 68 | How many unique beers for each user? (default: 49) 69 | ``` 70 | 71 | Example usage: 72 | 73 | ``` 74 | python3 main.py --force --outfile "allBeers.csv" --number-of-unique-beers 49 75 | ``` 76 | 77 | Note that we ignore `.csv` files via our `.gitignore`. 78 | 79 | ### Generating a report with R 80 | 81 | #### First time? 82 | 83 | Install [Docker Desktop](https://docs.docker.com/desktop/) for your system. 84 | 85 | Then, pull the Docker image from Docker Hub: 86 | 87 | `docker pull aboghoss/bargraph:v0.0.1` 88 | 89 | #### Then 90 | 91 | ``` 92 | docker run -it \ 93 | -v <data_dir>:/data \ 94 | -v <out_dir>:/out_dir \ 95 | aboghoss/bargraph:v0.0.1 \ 96 | -d /data/<data_file> \ 97 | -o /out_dir \ 98 | -n <report_name> \ 99 | -a <0_or_1> 100 | ``` 101 | Replacing `<data_dir>` with the path to the local directory containing your Untappd data (e.g. `~/Desktop/beer_data.csv`) and `<out_dir>` with a path to the directory you would like the report written to. 102 | 103 | The `-v` arguments mount directories into the container, allowing it to read and write in those directories. If the directory you want to write to is the same as the one containing your data, remove line 94 and edit line 97 to be `-o /data`. 
104 | 105 | ##### Arguments 106 | 107 | `-d`: string name of your data file 108 | 109 | `-o`: string name of directory to write to 110 | 111 | `-n`: string name of the report that will be created 112 | 113 | `-a`: `0` to keep usernames, `1` to anonymize data 114 | 115 | ## Deploying 🚀 116 | 117 | In our case, the output index.html file in the `site` directory is deployed via [surge](https://surge.sh/) to [http://a-few-beers-later.surge.sh/](http://a-few-beers-later.surge.sh/) 118 | 119 | ``` 120 | surge ./site a-few-beers-later.surge.sh 121 | ``` 122 | -------------------------------------------------------------------------------- /googleAnalytics.html: -------------------------------------------------------------------------------- 1 | 2 | 6 | 15 | -------------------------------------------------------------------------------- /index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "A few beers later... `r emo::ji('clinking_beer_mugs')`" 3 | params: 4 | data_file: ./data/allBeers2.csv 5 | anonymize: 1 6 | output: 7 | html_document: 8 | theme: paper 9 | highlight: kate 10 | toc: false 11 | --- 12 | 13 | ```{r setup, include=FALSE} 14 | # Include and silence all chunks 15 | knitr::opts_chunk$set(include = T, echo = F, warning = F, message = F, fig.width = 12) 16 | 17 | # Load libraries 18 | library(tidyverse) 19 | library(caret) # model setup 20 | library(ranger) # random forest 21 | library(RColorBrewer) # colors 22 | library(ggthemes) # colors 23 | library(cowplot) # themes 24 | library(GGally) # just for pairs plot (manually do?) 
25 | library(DT) # javascript tables 26 | library(emo) # emojis (has to be installed with devtools) 27 | library(randomNames) # for making fake names 28 | 29 | # Load data 30 | all_beers <- data.table::fread(params$data_file) %>% 31 | dplyr::select(-V1) %>% 32 | dplyr::group_by(beer.beer_name, brewery.brewery_name, beer.bid) %>% 33 | dplyr::mutate(beer.rating_score = max(beer.rating_score)) %>% 34 | dplyr::ungroup() %>% 35 | dplyr::mutate(beer.rating_score = ifelse(beer.rating_score == 0, NA, beer.rating_score), 36 | beer.beer_ibu = ifelse(beer.beer_ibu == 0, NA, beer.beer_ibu), 37 | beer.beer_abv = ifelse(beer.beer_abv == 0, NA, beer.beer_abv)) 38 | 39 | n_users <- length(unique(all_beers$user)) 40 | n_beers <- nrow(all_beers) 41 | 42 | # Anonymizes data if specified 43 | if (as.numeric(params$anonymize) == 1) { 44 | 45 | # Maintains 3 original names for consistency otherwise makes random 46 | if (n_users == 3) { 47 | fake_users <- c("Alice", "Bob", "Carl") 48 | } else { 49 | fake_users <- randomNames::randomNames(n = n_users, which.names = "first") 50 | } 51 | 52 | # Scrubs usernames from report 53 | all_beers <- all_beers %>% 54 | dplyr::right_join(dplyr::distinct(., user) %>% 55 | dplyr::mutate(id = rank(user))) %>% 56 | dplyr::mutate(user = fake_users[id]) %>% 57 | dplyr::select(-id) 58 | } 59 | 60 | # List of users 61 | users <- distinct(all_beers, user)$user 62 | 63 | # Set plot theme to "cowplot" 64 | theme_set(theme_cowplot()) 65 | ``` 66 | 67 | # How do our distributions stack up? `r emo::ji('bar_chart')` {.tabset .tabset-pills} 68 | 69 | ## Distributions 70 | 71 | ```{r distributions} 72 | # Violin plot with boxplot inside 73 | all_beers %>% 74 | ggplot(aes(x = user, y = rating_score, fill = user)) + 75 | geom_violin() + 76 | geom_boxplot(width = 0.2, outlier.alpha = 0) + 77 | scale_fill_tableau() + ylim(c(0, 5)) + 78 | labs(x = "User", y = "Rating", fill = "User", title = paste("Total beers:", n_beers)) 79 | ``` 80 | 81 | ## Do we agree on ratings? 
82 | 83 | ```{r correlations, fig.height=8} 84 | # Pairs plot 85 | all_beers %>% 86 | dplyr::select(user, beer.beer_name, brewery.brewery_name, rating_score, beer.rating_score) %>% 87 | tidyr::pivot_wider(names_from = "user", values_from = "rating_score") %>% # make wide table for ggpairs 88 | dplyr::rename(global = beer.rating_score) %>% 89 | GGally::ggpairs(columns = c(3:ncol(.))) 90 | ``` 91 | 92 | ## Controversial beers 93 | 94 | ```{r controversial} 95 | # Find beers with largest difference between our ratings 96 | controversial <- all_beers %>% 97 | dplyr::select(beer.beer_name, brewery.brewery_name, user, rating_score, beer.rating_score) %>% 98 | dplyr::group_by(beer.beer_name, brewery.brewery_name) %>% 99 | dplyr::mutate(max_diff = max(rating_score) - min(rating_score)) %>% 100 | dplyr::ungroup() %>% 101 | tidyr::pivot_wider(names_from = "user", values_from = "rating_score") %>% 102 | dplyr::filter(max_diff >= 1) 103 | 104 | # Make into interactive table 105 | DT::datatable(controversial %>% dplyr::arrange(desc(max_diff)), style="bootstrap", width="100%", 106 | options = list(lengthChange = FALSE, scrollY = "300px", paging = FALSE), 107 | colnames = c("Beer", "Brewery", "Rating", "Maximum difference", colnames(controversial)[5:ncol(controversial)]), 108 | filter = "top") %>% 109 | DT::formatRound(columns = c("beer.rating_score", "max_diff"), digits = 3) 110 | ``` 111 | 112 | ## Differences from global 113 | 114 | ```{r global diffs} 115 | # Difference from global average (boxplot) 116 | all_beers %>% 117 | dplyr::filter(beer.rating_score != 0) %>% 118 | dplyr::select(user, beer.beer_name, brewery.brewery_name, rating_score, beer.rating_score) %>% 119 | dplyr::mutate(glob_diff = rating_score - beer.rating_score) %>% # calculate difference 120 | ggplot(aes(x = user, y = glob_diff)) + 121 | geom_boxplot(outlier.alpha = 0) + geom_jitter(aes(color = user)) + 122 | scale_color_tableau() + 123 | labs(x = "User", y = "Rating difference", color = "User") 124 | ``` 
125 | 126 | # Breweries `r emo::ji('factory')` {.tabset .tabset-pills} 127 | 128 | Ratings by brewery for each user. User must have had 5 or more unique beers from the brewery. 129 | 130 | ## Top rated 131 | 132 | ```{r breweries, fig.height=12} 133 | # Our ratings higher than 4.5 134 | p1 <- all_beers %>% 135 | dplyr::filter(rating_score >= 4.5) %>% 136 | dplyr::group_by(brewery.brewery_name) %>% 137 | dplyr::mutate(n = n()) %>% 138 | dplyr::ungroup() %>% 139 | ggplot(aes(y = reorder(brewery.brewery_name, n), fill = user)) + 140 | geom_bar() + scale_fill_tableau() + 141 | labs(x = "Number rated higher than 4.5", y = "Brewery", fill = "User") 142 | # Global ratings higher than 4.25 143 | p2 <- all_beers %>% 144 | dplyr::filter(beer.rating_score >= 4.25) %>% 145 | dplyr::distinct(beer.rating_score, brewery.brewery_name) %>% 146 | dplyr::group_by(brewery.brewery_name) %>% 147 | dplyr::mutate(n = n()) %>% 148 | dplyr::ungroup() %>% 149 | dplyr::mutate(rating_bin = fct_rev(cut(beer.rating_score, seq(4, 5, 0.1)))) %>% 150 | ggplot(aes(y = reorder(brewery.brewery_name, n), fill = rating_bin)) + 151 | geom_bar() + scale_fill_brewer(palette = "Blues", direction = -1) + 152 | labs(x = "Number with average global higher than 4.25", y = "Brewery", fill = "Binned rating") 153 | cowplot::plot_grid(p2, p1, ncol = 1, rel_heights = c(6, 10)) 154 | ``` 155 | 156 | ## Average ratings {.tabset .tabset-dropdown} 157 | 158 | ```{r brewery averages} 159 | # Ratings by brewery 160 | brewery_ratings <- all_beers %>% 161 | dplyr::group_by(user, brewery.brewery_name) %>% 162 | dplyr::mutate(n = n()) %>% 163 | dplyr::ungroup() %>% 164 | dplyr::filter(n >= 5) # user has had >= 5 165 | 166 | # Loop through each user and make plot 167 | brewery_plots <- list() # store plots 168 | for (i in 1:length(users)) { 169 | p <- brewery_ratings %>% 170 | dplyr::filter(user == users[i]) %>% # filter to current user 171 | ggplot(aes(x = rating_score, y = fct_reorder(brewery.brewery_name, rating_score, 
median), 172 | color = brewery.location.brewery_state)) + 173 | geom_boxplot(outlier.alpha = 0) + geom_jitter() + 174 | scale_color_tableau(palette = "Tableau 20") + 175 | xlim(c(0, 5)) + 176 | labs(x = "Rating", y = "Brewery", color = "State") 177 | brewery_plots[[i]] <- p 178 | } 179 | ``` 180 | 181 | ### Everyone 182 | 183 | ```{r brewery averages everyone, fig.height=18} 184 | # To find our favorites as a group 185 | everyone <- all_beers %>% 186 | dplyr::mutate(brewery.location.brewery_state = 187 | ifelse(nchar(brewery.location.brewery_state) == 2, 188 | brewery.location.brewery_state, "Other")) %>% 189 | dplyr::group_by(brewery.brewery_name) %>% 190 | dplyr::mutate(n = n()) %>% 191 | dplyr::ungroup() %>% 192 | dplyr::filter(n >= 5) 193 | 194 | # Make new palette in tableau scheme of correct size 195 | n_pal <- colorRampPalette(tableau_color_pal(palette = "Tableau 20")(20)) 196 | pal <- n_pal(length(unique(everyone$brewery.location.brewery_state))) 197 | 198 | everyone %>% 199 | ggplot(aes(x = rating_score, y = fct_reorder(brewery.brewery_name, rating_score, median), 200 | color = brewery.location.brewery_state)) + 201 | geom_boxplot(outlier.alpha = 0) + geom_jitter() + 202 | scale_color_manual(values = pal) + 203 | xlim(c(0, 5)) + 204 | labs(x = "Rating", y = "Brewery", color = "State") 205 | ``` 206 | 207 | ```{r brewery averages plot, results='asis', fig.height=8} 208 | # Print all plots in their own subheading 209 | for (i in 1:length(users)) { 210 | cat(sprintf('\n\n### %s {.tabset .tabset-pills}\n\n', users[i], '\n\n')) 211 | print(brewery_plots[[i]]) 212 | cat("\n\n") 213 | } 214 | ``` 215 | 216 | ## Brewery rating table 217 | 218 | ```{r brewery table} 219 | # Table of brewery ratings 220 | brew_tab <- all_beers %>% 221 | dplyr::group_by(user, brewery.brewery_name) %>% 222 | dplyr::summarise(med_rating = median(rating_score), mean_rating = mean(rating_score), n = n(), .groups = "drop") %>% 223 | dplyr::filter(n >= 5) %>% # user has had >= 5 224 | 
dplyr::group_by(user) %>% 225 | dplyr::mutate(rank_med = rank(desc(med_rating), ties.method = "min"), 226 | rank_mean = rank(desc(mean_rating), ties.method = "min")) %>% # rank breweries (ties get same value) 227 | dplyr::ungroup() 228 | 229 | # Make into interactive table 230 | DT::datatable(brew_tab %>% dplyr::arrange(rank_med), style="bootstrap", width="100%", 231 | options = list(lengthChange = FALSE, scrollY = "300px", paging = FALSE), 232 | colnames = c("User", "Brewery", "Median", "Mean", "Beers had", "Rank (median)", "Rank (mean)"), 233 | filter = "top") %>% 234 | DT::formatRound(columns = c("mean_rating", "med_rating"), digits = 3) #round values 235 | ``` 236 | 237 | # Styles `r emo::ji('woman_dancing')` {.tabset .tabset-pills} 238 | 239 | Ratings by style for each user. **Note:** "meta" style refers to the first part of the style while substyle refers to the entire style (e.g. _IPA - Imperial/Double_ would have a meta style of _IPA_ and a "substyle" style of _IPA - Imperial/Double_). 
240 | 241 | ## Meta styles plots {.tabset .tabset-dropdown} 242 | 243 | Styles broken down by first part of style annotation 244 | 245 | ```{r meta style, fig.height=8} 246 | # Styles by first word ("meta") 247 | meta_style_tab <- all_beers %>% 248 | dplyr::mutate(meta_style = str_trim(word(beer.beer_style, 1, sep = fixed("-")))) %>% 249 | dplyr::group_by(user, meta_style) %>% 250 | dplyr::mutate(n = n()) %>% 251 | dplyr::ungroup() 252 | 253 | # Loop through users and make plots 254 | style_plots <- list() 255 | for (i in 1:length(users)) { 256 | p <- meta_style_tab %>% 257 | dplyr::filter(user == users[i]) %>% 258 | ggplot(aes(x = rating_score, y = fct_reorder(meta_style, rating_score, median), 259 | color = beer.rating_score)) + 260 | geom_boxplot(outlier.alpha = 0) + 261 | geom_jitter() + scale_color_distiller(direction = 1, palette = "Blues", guide = "colourbar") + 262 | xlim(c(0, 5)) + 263 | labs(x = "Rating", y = "Style", color = "Global rating") 264 | style_plots[[i]] <- p 265 | } 266 | ``` 267 | 268 | ### Everyone 269 | 270 | ```{r meta style all, fig.height=10} 271 | meta_style_tab %>% 272 | ggplot(aes(x = rating_score, y = fct_reorder(meta_style, rating_score, median), 273 | color = beer.rating_score)) + 274 | geom_boxplot(outlier.alpha = 0) + 275 | geom_jitter() + scale_color_distiller(direction = 1, palette = "Blues", guide = "colourbar") + 276 | xlim(c(0, 5)) + 277 | labs(x = "Rating", y = "Style", color = "Global rating") 278 | ``` 279 | 280 | ```{r meta style plot, results='asis', fig.height=8} 281 | # Print each plot in own subheading 282 | for (i in 1:length(users)) { 283 | cat(sprintf('\n\n### %s {.tabset .tabset-pills}\n\n', users[i], '\n\n')) 284 | print(style_plots[[i]]) 285 | cat("\n\n") 286 | } 287 | ``` 288 | 289 | ## Meta styles table 290 | 291 | ```{r meta style table} 292 | # Ratings by meta style 293 | meta_table <- meta_style_tab %>% 294 | dplyr::group_by(user, meta_style) %>% 295 | dplyr::summarise(med_rating = median(rating_score), 
mean_rating = mean(rating_score), n = n(), .groups = "drop") %>% 296 | dplyr::group_by(user) %>% 297 | dplyr::mutate(rank_med = rank(desc(med_rating), ties.method = "min"), 298 | rank_mean = rank(desc(mean_rating), ties.method = "min")) %>% 299 | dplyr::ungroup() 300 | 301 | # Make into interactive table 302 | DT::datatable(meta_table %>% dplyr::arrange(rank_med), style="bootstrap", width="100%", 303 | options = list(lengthChange = FALSE, scrollY = "300px", paging = FALSE), 304 | colnames = c("User", "Style", "Median", "Mean", "Beers had", "Rank (median)", "Rank (mean)"), 305 | filter = "top") %>% 306 | DT::formatRound(columns = c("mean_rating", "med_rating"), digits = 3) 307 | ``` 308 | 309 | ## Substyles table 310 | 311 | More detailed style information 312 | 313 | ```{r granular style table} 314 | # Ratings by granular style 315 | style_tab <- all_beers %>% 316 | dplyr::group_by(user, beer.beer_style) %>% 317 | dplyr::summarise(med_rating = median(rating_score), mean_rating = mean(rating_score), n = n(), .groups = "drop") %>% 318 | dplyr::group_by(user) %>% 319 | dplyr::mutate(rank_med = rank(desc(med_rating), ties.method = "min"), 320 | rank_mean = rank(desc(mean_rating), ties.method = "min")) %>% 321 | dplyr::ungroup() 322 | 323 | # Make into interactive table 324 | DT::datatable(style_tab %>% dplyr::arrange(rank_med), style="bootstrap", width="100%", 325 | options = list(lengthChange = FALSE, scrollY = "300px", paging = FALSE), 326 | colnames = c("User", "Substyle", "Median", "Mean", "Beers had", "Rank (median)", "Rank (mean)"), 327 | filter = "top") %>% 328 | DT::formatRound(columns = c("mean_rating", "med_rating"), digits = 3) 329 | ``` 330 | 331 | # ABV and IBU `r emo::ji('nauseated_face')` {.tabset .tabset-pills} 332 | 333 | ```{r abv ibu summary, fig.height=10} 334 | # Pairs plot relating our ratings to ABV and IBU 335 | all_beers %>% 336 | dplyr::select(user, beer.beer_name, brewery.brewery_name, rating_score, 337 | beer.beer_abv, beer.beer_ibu, 
beer.rating_score) %>% 338 | tidyr::pivot_wider(names_from = "user", values_from = "rating_score") %>% 339 | dplyr::rename(global = beer.rating_score, ABV = beer.beer_abv, IBU = beer.beer_ibu) %>% 340 | GGally::ggpairs(columns = c(3:ncol(.))) 341 | ``` 342 | 343 | # Prediction `r emo::ji('monocle')` 344 | 345 | I fit a 10-fold cross-validated random forest to predict each user's ratings. Reported here are the Pearson score of each model as well as estimated importance of each feature in prediction (and the associated standard deviation). 346 | 347 | ```{r prediction} 348 | predictions <- list() # store actual predictions 349 | results <- list() # store model stats 350 | 351 | # Fit a 10-fold CV random forest for each user 352 | for (i in 1:length(users)) { 353 | 354 | # Filter to user and relevant columns: 355 | # beer id, rating, global, num ratings abv, ibu, style, brewery type, brewery state 356 | user <- all_beers %>% 357 | dplyr::filter(user == users[[i]]) %>% 358 | dplyr::select(beer.bid, rating_score, beer.beer_abv, beer.beer_ibu, 359 | beer.beer_style, beer.rating_score, brewery.brewery_type, 360 | brewery.location.brewery_state, beer.rating_count) 361 | 362 | # Remove NAs 363 | user <- user[apply(user, 1, function(x) !any(is.na(x))), ] 364 | 365 | # Make CV folds 366 | folds <- caret::createFolds(user$rating_score, k = 10) 367 | 368 | preds <- user$rating_score 369 | importances <- list() 370 | 371 | # Loop through folds for CV 372 | for (k in 1:length(folds)) { 373 | 374 | # Test and training sets 375 | fold <- folds[[k]] 376 | test <- user[fold, ] 377 | train <- dplyr::anti_join(user, test) 378 | 379 | # Fit RF 380 | res <- ranger::ranger(rating_score ~ ., data = train %>% 381 | dplyr::select(-beer.bid), 382 | importance = "impurity", respect.unordered.factors = T) 383 | 384 | # Store model statistics and predictions 385 | preds[fold] <- predict(res, test %>% dplyr::select(-rating_score, -beer.bid))$predictions 386 | imps <- tibble::tibble(feature = 
names(res$variable.importance), 387 | RF.imp = res$variable.importance / sum(res$variable.importance), 388 | fold = k) 389 | importances[[k]] <- imps 390 | } 391 | 392 | # Combine across folds 393 | importances <- dplyr::bind_rows(importances) %>% 394 | dplyr::distinct(feature, RF.imp, fold) %>% 395 | reshape2::acast(feature ~ fold, value.var = "RF.imp") 396 | 397 | # Feature importances 398 | imp_table <- tibble::tibble(feature = rownames(importances), 399 | imp.mean = importances %>% 400 | apply(1, function(x) mean(x, na.rm = T)), 401 | imp.sd = importances %>% 402 | apply(1, function(x) sd(x, na.rm = T)), 403 | imp.stability = importances %>% 404 | apply(1, function(x) mean(!is.na(x)))) %>% 405 | dplyr::filter(feature != "(Intercept)") %>% 406 | dplyr::arrange(desc(imp.mean)) %>% 407 | dplyr::mutate(rank = 1:n()) %>% 408 | dplyr::mutate(pearson = cor(preds, user$rating_score), user = users[[i]]) 409 | 410 | # Output to final list 411 | results[[i]] <- imp_table 412 | predictions[[i]] <- tibble(predicted = preds, actual = user$rating_score, 413 | bid = user$beer.bid, user = users[[i]]) 414 | } 415 | 416 | # Combine across users 417 | results <- dplyr::bind_rows(results); predictions <- dplyr::bind_rows(predictions) 418 | 419 | # Pearson scores plot 420 | p1 <- results %>% 421 | dplyr::distinct(user, pearson) %>% 422 | ggplot(aes(x = user, y = pearson, fill = user)) + 423 | geom_bar(stat = "identity") + 424 | scale_fill_tableau() + theme(legend.position = "none") + 425 | labs(x = "User", y = "Pearson score") 426 | 427 | # Feature importance plot 428 | p2 <- results %>% 429 | dplyr::distinct(user, feature, imp.mean, rank) %>% 430 | ggplot(aes(x = imp.mean, y = fct_reorder(feature, rank, "median", .desc = T), fill = user)) + 431 | geom_bar(stat = "identity", position = "dodge") + 432 | scale_fill_tableau() + 433 | labs(y = "Feature", x = "Mean feature importance", fill = "User") 434 | cowplot::plot_grid(p1, p2, rel_widths = c(1, 3)) 435 | ``` 436 | 437 | ```{r 
prediction table} 438 | # Table of random forest results 439 | DT::datatable(results %>% 440 | dplyr::arrange(rank) %>% 441 | dplyr::select(c(7, 6, 1, 2, 3)), 442 | style="bootstrap", width="100%", 443 | options = list(lengthChange = FALSE, scrollY = "300px", paging = FALSE), 444 | colnames = c("User", "Pearson Score", "Feature", "Importance", "Importance deviation"), 445 | filter = "top") %>% 446 | DT::formatRound(columns = c("pearson", "imp.mean", "imp.sd"), digits = 3) 447 | ``` 448 | -------------------------------------------------------------------------------- /install_packages.R: -------------------------------------------------------------------------------- 1 | # Point to CRAN for package repos 2 | options(repos=structure(c(CRAN="http://cran.r-project.org"))) 3 | 4 | # CRAN packages 5 | install.packages(c( 6 | "tidyverse", 7 | "RColorBrewer", 8 | "ggthemes", 9 | "cowplot", 10 | "GGally", 11 | "DT", 12 | "randomNames", 13 | "devtools", 14 | "caret", 15 | "ranger" 16 | )) 17 | 18 | devtools::install_github("hadley/emo") 19 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import requests 3 | from dotenv import load_dotenv 4 | import os 5 | import argparse 6 | import pandas as pd 7 | import json 8 | 9 | load_dotenv() 10 | CLIENT_ID = os.getenv("CLIENT_ID") 11 | CLIENT_SECRET = os.getenv("CLIENT_SECRET") 12 | USERNAMES = (os.getenv("USERNAMES")).split(",") 13 | 14 | URL_ENDPOINT = "https://api.untappd.com/v4/user/beers/" 15 | UNTAPPD_MAX_LIMT = 50 16 | DEFAULT_PARAMS = { 17 | "limit": str(UNTAPPD_MAX_LIMT), 18 | "sort": "highest_rated_you", 19 | "client_id": CLIENT_ID, 20 | "client_secret": CLIENT_SECRET, 21 | } 22 | 23 | def main(): 24 | blob_by_user = {} 25 | for user in USERNAMES: 26 | blob = {} 27 | blob['root'] = [] 28 | request_user_beers(user, blob) 29 | blob_by_user[user] = blob 30 | 31 | 
write_blob_by_user_to_one_csv(blob_by_user) 32 | print("Done 🍻") 33 | 34 | def get_arguments(): 35 | parser = argparse.ArgumentParser(description='Hit the Untappd API for user data 🍻.', 36 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 37 | 38 | parser.add_argument("--force", action='store_true', default=False, 39 | help="Actually make a request. Used so you don't blow through your Untappd API limit.") 40 | 41 | parser.add_argument("--outfile", type=str, default="data.csv", 42 | help="Name of outfile. Should match filename in R script.") 43 | 44 | # NOTE: this wrongly assumes all users have checked in 45 | # roughly the same number of beers. Perhaps it's more approriate 46 | # to pass a list of key value pairs 47 | parser.add_argument("--number-of-unique-beers", type=int, default=49, 48 | help="How many unique beers for each user?") 49 | 50 | return parser.parse_args() 51 | 52 | def request_user_beers(user, blob): 53 | url = URL_ENDPOINT + user 54 | 55 | for offset in range(0, args.number_of_unique_beers, UNTAPPD_MAX_LIMT): 56 | params = DEFAULT_PARAMS 57 | if offset > 1: 58 | params = {**DEFAULT_PARAMS, 'offset': str(offset)} 59 | 60 | if args.force: 61 | r = requests.get(url, params = params) 62 | response = r.json() 63 | else: 64 | response = { 65 | 'response': 66 | {'beers': 67 | {'count': 1, 68 | 'items': [ 69 | { 70 | user: "test-response" 71 | } 72 | ] 73 | } 74 | } 75 | } 76 | 77 | print("Fetched response with offset: " + str(offset) + " for " + user + " 🍺...") 78 | construct_user_json(blob, response) 79 | 80 | def construct_user_json(blob, response): 81 | for beer in response['response']['beers']['items']: 82 | blob['root'].append(beer) 83 | 84 | def write_blob_by_user_to_one_csv(blob_by_user): 85 | data = pd.concat([read_blob_into_df(blob_by_user[user], user) for user in USERNAMES]) 86 | data.to_csv(args.outfile) 87 | 88 | def read_blob_into_df(blob, user): 89 | # flatten data and read into frame 90 | df = pd.json_normalize(blob, record_path 
=["root"]) 91 | # track which user it came from 92 | df.insert(0, "user", user) 93 | return df 94 | 95 | if __name__ == "__main__": 96 | args = get_arguments() 97 | main() 98 | -------------------------------------------------------------------------------- /report.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Read in arguments 4 | while getopts ":d:o:n:a:" arg; do 5 | case $arg in 6 | d) # specify beer csv location 7 | data=${OPTARG};; 8 | o) # output directory 9 | out_dir=${OPTARG};; 10 | n) # name of file 11 | name=${OPTARG};; 12 | a) # anonymize or not 13 | anon=${OPTARG};; 14 | esac 15 | done 16 | 17 | echo "$data" "$out_dir" "$name" "$anon" 18 | 19 | Rscript -e \ 20 | "rmarkdown::render('index.Rmd', output_dir='$out_dir', output_file='$name', \ 21 | params = list(data_file='$data', anonymize='$anon'))" 22 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | certifi==2020.12.5 2 | chardet==4.0.0 3 | idna==2.10 4 | numpy==1.20.1 5 | pandas==1.2.3 6 | python-dateutil==2.8.1 7 | python-dotenv==0.15.0 8 | pytz==2021.1 9 | requests==2.25.1 10 | six==1.15.0 11 | urllib3==1.26.4 12 | --------------------------------------------------------------------------------