├── .devcontainer ├── Dockerfile ├── devcontainer.json └── library-scripts │ └── common-debian.sh ├── .github └── workflows │ ├── metrics.yml │ └── metrics_keepalive.yml ├── .gitignore ├── .stats_timestamp ├── EasyRTutorialsUseR2022.pdf ├── LICENSE ├── README.md ├── intro-regression-R-tidymodels ├── README.md ├── images │ └── promo.png ├── slides.pptx ├── solution │ ├── Challenge-regression.ipynb │ ├── all-systems-check │ │ ├── keybindings.json │ │ ├── test.R │ │ ├── test.Rmd │ │ └── test.ipynb │ └── tests │ │ ├── Question 1.R │ │ ├── Question 2.R │ │ ├── Question 3.R │ │ ├── Question 4.R │ │ ├── Question 5.R │ │ ├── Question 6.R │ │ ├── Question 7.R │ │ └── Question 8.R └── workshop-designer.md └── requirements.txt /.devcontainer/Dockerfile: -------------------------------------------------------------------------------- 1 | # R version: 4, 4.1, 4.0 2 | ARG VARIANT="4" 3 | FROM rocker/r-ver:${VARIANT} 4 | 5 | # Use the [Option] comment to specify true/false arguments that should appear in VS Code UX 6 | # 7 | # [Option] Install zsh 8 | ARG INSTALL_ZSH="true" 9 | # [Option] Upgrade OS packages to their latest versions 10 | ARG UPGRADE_PACKAGES="false" 11 | 12 | # Install needed packages and setup non-root user. Use a separate RUN statement to add your own dependencies. 13 | ARG USERNAME=vscode 14 | ARG USER_UID=1000 15 | ARG USER_GID=$USER_UID 16 | COPY library-scripts/*.sh /tmp/library-scripts/ 17 | RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \ 18 | && /bin/bash /tmp/library-scripts/common-debian.sh "${INSTALL_ZSH}" "${USERNAME}" "${USER_UID}" "${USER_GID}" "${UPGRADE_PACKAGES}" "true" "true" \ 19 | && usermod -a -G staff ${USERNAME} \ 20 | && apt-get -y install \ 21 | python3-pip \ 22 | libgit2-dev \ 23 | libcurl4-openssl-dev \ 24 | libssl-dev \ 25 | libxml2-dev \ 26 | libxt-dev \ 27 | && apt-get autoremove -y && apt-get clean -y && rm -rf /var/lib/apt/lists/* /tmp/library-scripts \ 28 | && python3 -m pip --no-cache-dir install radian \ 29 | && pip --disable-pip-version-check --no-cache-dir install pybryt \ 30 | && pip --disable-pip-version-check --no-cache-dir install pylint \ 31 | && pip --disable-pip-version-check --no-cache-dir install jupyter \ 32 | && pip --disable-pip-version-check --no-cache-dir install datascience \ 33 | && pip --disable-pip-version-check --no-cache-dir install otter-grader \ 34 | && pip --disable-pip-version-check --no-cache-dir install numpy \ 35 | && pip --disable-pip-version-check --no-cache-dir install pandas \ 36 | && pip --disable-pip-version-check --no-cache-dir install scipy \ 37 | && pip --disable-pip-version-check --no-cache-dir install folium>=0.9.1 \ 38 | && pip --disable-pip-version-check --no-cache-dir install matplotlib \ 39 | && pip --disable-pip-version-check --no-cache-dir install ipywidgets>=7.0.0 \ 40 | && pip --disable-pip-version-check --no-cache-dir install bqplot \ 41 | && pip --disable-pip-version-check --no-cache-dir install nbinteract>=0.0.12 \ 42 | && pip --disable-pip-version-check --no-cache-dir install otter-grader \ 43 | && pip --disable-pip-version-check --no-cache-dir install okpy \ 44 | && pip --disable-pip-version-check --no-cache-dir install scikit-learn \ 45 | && install2.r --error --skipinstalled --ncpus -1 \ 46 | devtools \ 47 | languageserver \ 48 | httpgd \ 49 | tidyverse \ 50 | tidymodels \ 51 | statip \ 52 | patchwork \ 53 | paletteer \ 54 | glmnet \ 55 | randomForest \ 56 | xgboost \ 57 | here \ 58 | doParallel \ 59 | janitor \ 60 | vip \ 61 | ranger \ 62 | palmerpenguins \ 63 | skimr \ 64 | nnet \ 65 | kernlab \ 66 | plotly \ 67 | factoextra \ 68 | cluster \ 69 | ottr \ 70 | && rm -rf /tmp/downloaded_packages 71 | 72 | # Install summarytools and load some R package off the bat 73 | RUN R -e "devtools::install_github('https://github.com/dcomtois/summarytools/tree/0-8-9')" 74 | RUN R -e "library(ottr)" 75 | RUN R -e "library(here)" 76 | RUN R -e "library(languageserver)" 77 | # RUN installGithub.r ucbds-infra/ottr@stable 78 | 79 | 80 | # VSCode R Debugger dependency. Install the latest release version from GitHub without using GitHub API. 81 | # See https://github.com/microsoft/vscode-dev-containers/issues/1032 82 | RUN export TAG=$(git ls-remote --tags --refs --sort='version:refname' https://github.com/ManuelHentschel/vscDebugger v\* | tail -n 1 | cut --delimiter='/' --fields=3) \ 83 | && Rscript -e "remotes::install_git('https://github.com/ManuelHentschel/vscDebugger.git', ref = '"${TAG}"', dependencies = FALSE)" 84 | 85 | # R Session watcher settings. 86 | # See more details: https://github.com/REditorSupport/vscode-R/wiki/R-Session-watcher 87 | RUN echo 'source(file.path(Sys.getenv("HOME"), ".vscode-R", "init.R"))' >> ${R_HOME}/etc/Rprofile.site 88 | 89 | # [Optional] Uncomment this section to install additional OS packages. 90 | # RUN apt-get update \ 91 | # && export DEBIAN_FRONTEND=noninteractive \ 92 | # && apt-get -y install --no-install-recommends 93 | 94 | # [Optional] Uncomment this section to install additional R packages. 95 | # RUN install2.r --error --skipinstalled --ncpus -1 96 | 97 | 98 | # [Optional] Uncomment this section to install vscode-jupyter dependencies. 99 | RUN apt-get update \ 100 | && export DEBIAN_FRONTEND=noninteractive \ 101 | && apt-get -y install --no-install-recommends libzmq3-dev \ 102 | && install2.r --error --skipinstalled --ncpus -1 IRkernel \ 103 | && python3 -m pip --no-cache-dir install jupyter \ 104 | && R --vanilla -s -e 'IRkernel::installspec(user = FALSE)' 105 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "R Data Science Environment", 3 | "build": { 4 | "dockerfile": "Dockerfile", 5 | // Update VARIANT to pick a specific R version: 4, 4.1, 4.0 6 | "args": { "VARIANT": "4" } 7 | }, 8 | 9 | // Set *default* container specific settings.json values on container create. 10 | "settings": { 11 | "r.rterm.linux": "/usr/local/bin/radian", 12 | "r.bracketedPaste": true, 13 | "r.plot.useHttpgd": true, 14 | "[r]": { 15 | "editor.wordSeparators": "`~!@#%$^&*()-=+[{]}\\|;:'\",<>/?" 16 | } 17 | }, 18 | 19 | // Add the IDs of extensions you want installed when the container is created. 20 | "extensions": [ 21 | // Add Jupyter, R and Python vscode extensions 22 | "REditorSupport.r", 23 | "rdebugger.r-debugger", 24 | "ms-toolsai.jupyter", 25 | "ms-toolsai.jupyter-renderers", 26 | "ms-python.python", 27 | "ms-python.vscode-pylance", 28 | "vsls-contrib.codetour", 29 | "GitHub.copilot" 30 | ], 31 | 32 | // Use 'forwardPorts' to make a list of ports inside the container available locally. 33 | // "forwardPorts": [], 34 | 35 | // Use 'postCreateCommand' to run commands after the container is created. 36 | "postCreateCommand": "pip3 install -r requirements.txt", 37 | 38 | // Comment out to connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root. 39 | "remoteUser": "vscode" 40 | } 41 | -------------------------------------------------------------------------------- /.devcontainer/library-scripts/common-debian.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #------------------------------------------------------------------------------------------------------------- 3 | # Copyright (c) Microsoft Corporation. All rights reserved. 4 | # Licensed under the MIT License. See https://go.microsoft.com/fwlink/?linkid=2090316 for license information. 5 | #------------------------------------------------------------------------------------------------------------- 6 | # 7 | # Docs: https://github.com/microsoft/vscode-dev-containers/blob/main/script-library/docs/common.md 8 | # Maintainer: The VS Code and Codespaces Teams 9 | # 10 | # Syntax: ./common-debian.sh [install zsh flag] [username] [user UID] [user GID] [upgrade packages flag] [install Oh My Zsh! flag] [Add non-free packages] 11 | 12 | set -e 13 | 14 | INSTALL_ZSH=${1:-"true"} 15 | USERNAME=${2:-"automatic"} 16 | USER_UID=${3:-"automatic"} 17 | USER_GID=${4:-"automatic"} 18 | UPGRADE_PACKAGES=${5:-"true"} 19 | INSTALL_OH_MYS=${6:-"true"} 20 | ADD_NON_FREE_PACKAGES=${7:-"false"} 21 | SCRIPT_DIR="$(cd $(dirname "${BASH_SOURCE[0]}") && pwd)" 22 | MARKER_FILE="/usr/local/etc/vscode-dev-containers/common" 23 | 24 | if [ "$(id -u)" -ne 0 ]; then 25 | echo -e 'Script must be run as root. Use sudo, su, or add "USER root" to your Dockerfile before running this script.' 26 | exit 1 27 | fi 28 | 29 | # Ensure that login shells get the correct path if the user updated the PATH using ENV. 30 | rm -f /etc/profile.d/00-restore-env.sh 31 | echo "export PATH=${PATH//$(sh -lc 'echo $PATH')/\$PATH}" > /etc/profile.d/00-restore-env.sh 32 | chmod +x /etc/profile.d/00-restore-env.sh 33 | 34 | # If in automatic mode, determine if a user already exists, if not use vscode 35 | if [ "${USERNAME}" = "auto" ] || [ "${USERNAME}" = "automatic" ]; then 36 | USERNAME="" 37 | POSSIBLE_USERS=("vscode" "node" "codespace" "$(awk -v val=1000 -F ":" '$3==val{print $1}' /etc/passwd)") 38 | for CURRENT_USER in ${POSSIBLE_USERS[@]}; do 39 | if id -u ${CURRENT_USER} > /dev/null 2>&1; then 40 | USERNAME=${CURRENT_USER} 41 | break 42 | fi 43 | done 44 | if [ "${USERNAME}" = "" ]; then 45 | USERNAME=vscode 46 | fi 47 | elif [ "${USERNAME}" = "none" ]; then 48 | USERNAME=root 49 | USER_UID=0 50 | USER_GID=0 51 | fi 52 | 53 | # Load markers to see which steps have already run 54 | if [ -f "${MARKER_FILE}" ]; then 55 | echo "Marker file found:" 56 | cat "${MARKER_FILE}" 57 | source "${MARKER_FILE}" 58 | fi 59 | 60 | # Ensure apt is in non-interactive to avoid prompts 61 | export DEBIAN_FRONTEND=noninteractive 62 | 63 | # Function to call apt-get if needed 64 | apt_get_update_if_needed() 65 | { 66 | if [ ! -d "/var/lib/apt/lists" ] || [ "$(ls /var/lib/apt/lists/ | wc -l)" = "0" ]; then 67 | echo "Running apt-get update..." 68 | apt-get update 69 | else 70 | echo "Skipping apt-get update." 71 | fi 72 | } 73 | 74 | # Run install apt-utils to avoid debconf warning then verify presence of other common developer tools and dependencies 75 | if [ "${PACKAGES_ALREADY_INSTALLED}" != "true" ]; then 76 | 77 | package_list="apt-utils \ 78 | openssh-client \ 79 | gnupg2 \ 80 | dirmngr \ 81 | iproute2 \ 82 | procps \ 83 | lsof \ 84 | htop \ 85 | net-tools \ 86 | psmisc \ 87 | curl \ 88 | wget \ 89 | rsync \ 90 | ca-certificates \ 91 | unzip \ 92 | zip \ 93 | nano \ 94 | vim-tiny \ 95 | less \ 96 | jq \ 97 | lsb-release \ 98 | apt-transport-https \ 99 | dialog \ 100 | libc6 \ 101 | libgcc1 \ 102 | libkrb5-3 \ 103 | libgssapi-krb5-2 \ 104 | libicu[0-9][0-9] \ 105 | liblttng-ust0 \ 106 | libstdc++6 \ 107 | zlib1g \ 108 | locales \ 109 | sudo \ 110 | ncdu \ 111 | man-db \ 112 | strace \ 113 | manpages \ 114 | manpages-dev \ 115 | init-system-helpers" 116 | 117 | # Needed for adding manpages-posix and manpages-posix-dev which are non-free packages in Debian 118 | if [ "${ADD_NON_FREE_PACKAGES}" = "true" ]; then 119 | # Bring in variables from /etc/os-release like VERSION_CODENAME 120 | . /etc/os-release 121 | sed -i -E "s/deb http:\/\/(deb|httpredir)\.debian\.org\/debian ${VERSION_CODENAME} main/deb http:\/\/\1\.debian\.org\/debian ${VERSION_CODENAME} main contrib non-free/" /etc/apt/sources.list 122 | sed -i -E "s/deb-src http:\/\/(deb|httredir)\.debian\.org\/debian ${VERSION_CODENAME} main/deb http:\/\/\1\.debian\.org\/debian ${VERSION_CODENAME} main contrib non-free/" /etc/apt/sources.list 123 | sed -i -E "s/deb http:\/\/(deb|httpredir)\.debian\.org\/debian ${VERSION_CODENAME}-updates main/deb http:\/\/\1\.debian\.org\/debian ${VERSION_CODENAME}-updates main contrib non-free/" /etc/apt/sources.list 124 | sed -i -E "s/deb-src http:\/\/(deb|httpredir)\.debian\.org\/debian ${VERSION_CODENAME}-updates main/deb http:\/\/\1\.debian\.org\/debian ${VERSION_CODENAME}-updates main contrib non-free/" /etc/apt/sources.list 125 | sed -i "s/deb http:\/\/security\.debian\.org\/debian-security ${VERSION_CODENAME}\/updates main/deb http:\/\/security\.debian\.org\/debian-security ${VERSION_CODENAME}\/updates main contrib non-free/" /etc/apt/sources.list 126 | sed -i "s/deb-src http:\/\/security\.debian\.org\/debian-security ${VERSION_CODENAME}\/updates main/deb http:\/\/security\.debian\.org\/debian-security ${VERSION_CODENAME}\/updates main contrib non-free/" /etc/apt/sources.list 127 | sed -i "s/deb http:\/\/deb\.debian\.org\/debian ${VERSION_CODENAME}-backports main/deb http:\/\/deb\.debian\.org\/debian ${VERSION_CODENAME}-backports main contrib non-free/" /etc/apt/sources.list 128 | sed -i "s/deb-src http:\/\/deb\.debian\.org\/debian ${VERSION_CODENAME}-backports main/deb http:\/\/deb\.debian\.org\/debian ${VERSION_CODENAME}-backports main contrib non-free/" /etc/apt/sources.list 129 | # Handle bullseye location for security https://www.debian.org/releases/bullseye/amd64/release-notes/ch-information.en.html 130 | sed -i "s/deb http:\/\/security\.debian\.org\/debian-security ${VERSION_CODENAME}-security main/deb http:\/\/security\.debian\.org\/debian-security ${VERSION_CODENAME}-security main contrib non-free/" /etc/apt/sources.list 131 | sed -i "s/deb-src http:\/\/security\.debian\.org\/debian-security ${VERSION_CODENAME}-security main/deb http:\/\/security\.debian\.org\/debian-security ${VERSION_CODENAME}-security main contrib non-free/" /etc/apt/sources.list 132 | echo "Running apt-get update..." 133 | apt-get update 134 | package_list="${package_list} manpages-posix manpages-posix-dev" 135 | else 136 | apt_get_update_if_needed 137 | fi 138 | 139 | # Install libssl1.1 if available 140 | if [[ ! -z $(apt-cache --names-only search ^libssl1.1$) ]]; then 141 | package_list="${package_list} libssl1.1" 142 | fi 143 | 144 | # Install appropriate version of libssl1.0.x if available 145 | libssl_package=$(dpkg-query -f '${db:Status-Abbrev}\t${binary:Package}\n' -W 'libssl1\.0\.?' 2>&1 || echo '') 146 | if [ "$(echo "$LIlibssl_packageBSSL" | grep -o 'libssl1\.0\.[0-9]:' | uniq | sort | wc -l)" -eq 0 ]; then 147 | if [[ ! -z $(apt-cache --names-only search ^libssl1.0.2$) ]]; then 148 | # Debian 9 149 | package_list="${package_list} libssl1.0.2" 150 | elif [[ ! -z $(apt-cache --names-only search ^libssl1.0.0$) ]]; then 151 | # Ubuntu 18.04, 16.04, earlier 152 | package_list="${package_list} libssl1.0.0" 153 | fi 154 | fi 155 | 156 | echo "Packages to verify are installed: ${package_list}" 157 | apt-get -y install --no-install-recommends ${package_list} 2> >( grep -v 'debconf: delaying package configuration, since apt-utils is not installed' >&2 ) 158 | 159 | # Install git if not already installed (may be more recent than distro version) 160 | if ! type git > /dev/null 2>&1; then 161 | apt-get -y install --no-install-recommends git 162 | fi 163 | 164 | PACKAGES_ALREADY_INSTALLED="true" 165 | fi 166 | 167 | # Get to latest versions of all packages 168 | if [ "${UPGRADE_PACKAGES}" = "true" ]; then 169 | apt_get_update_if_needed 170 | apt-get -y upgrade --no-install-recommends 171 | apt-get autoremove -y 172 | fi 173 | 174 | # Ensure at least the en_US.UTF-8 UTF-8 locale is available. 175 | # Common need for both applications and things like the agnoster ZSH theme. 176 | if [ "${LOCALE_ALREADY_SET}" != "true" ] && ! grep -o -E '^\s*en_US.UTF-8\s+UTF-8' /etc/locale.gen > /dev/null; then 177 | echo "en_US.UTF-8 UTF-8" >> /etc/locale.gen 178 | locale-gen 179 | LOCALE_ALREADY_SET="true" 180 | fi 181 | 182 | # Create or update a non-root user to match UID/GID. 183 | group_name="${USERNAME}" 184 | if id -u ${USERNAME} > /dev/null 2>&1; then 185 | # User exists, update if needed 186 | if [ "${USER_GID}" != "automatic" ] && [ "$USER_GID" != "$(id -g $USERNAME)" ]; then 187 | group_name="$(id -gn $USERNAME)" 188 | groupmod --gid $USER_GID ${group_name} 189 | usermod --gid $USER_GID $USERNAME 190 | fi 191 | if [ "${USER_UID}" != "automatic" ] && [ "$USER_UID" != "$(id -u $USERNAME)" ]; then 192 | usermod --uid $USER_UID $USERNAME 193 | fi 194 | else 195 | # Create user 196 | if [ "${USER_GID}" = "automatic" ]; then 197 | groupadd $USERNAME 198 | else 199 | groupadd --gid $USER_GID $USERNAME 200 | fi 201 | if [ "${USER_UID}" = "automatic" ]; then 202 | useradd -s /bin/bash --gid $USERNAME -m $USERNAME 203 | else 204 | useradd -s /bin/bash --uid $USER_UID --gid $USERNAME -m $USERNAME 205 | fi 206 | fi 207 | 208 | # Add add sudo support for non-root user 209 | if [ "${USERNAME}" != "root" ] && [ "${EXISTING_NON_ROOT_USER}" != "${USERNAME}" ]; then 210 | echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME 211 | chmod 0440 /etc/sudoers.d/$USERNAME 212 | EXISTING_NON_ROOT_USER="${USERNAME}" 213 | fi 214 | 215 | # ** Shell customization section ** 216 | if [ "${USERNAME}" = "root" ]; then 217 | user_rc_path="/root" 218 | else 219 | user_rc_path="/home/${USERNAME}" 220 | fi 221 | 222 | # Restore user .bashrc defaults from skeleton file if it doesn't exist or is empty 223 | if [ ! -f "${user_rc_path}/.bashrc" ] || [ ! -s "${user_rc_path}/.bashrc" ] ; then 224 | cp /etc/skel/.bashrc "${user_rc_path}/.bashrc" 225 | fi 226 | 227 | # Restore user .profile defaults from skeleton file if it doesn't exist or is empty 228 | if [ ! -f "${user_rc_path}/.profile" ] || [ ! -s "${user_rc_path}/.profile" ] ; then 229 | cp /etc/skel/.profile "${user_rc_path}/.profile" 230 | fi 231 | 232 | # .bashrc/.zshrc snippet 233 | rc_snippet="$(cat << 'EOF' 234 | 235 | if [ -z "${USER}" ]; then export USER=$(whoami); fi 236 | if [[ "${PATH}" != *"$HOME/.local/bin"* ]]; then export PATH="${PATH}:$HOME/.local/bin"; fi 237 | 238 | # Display optional first run image specific notice if configured and terminal is interactive 239 | if [ -t 1 ] && [[ "${TERM_PROGRAM}" = "vscode" || "${TERM_PROGRAM}" = "codespaces" ]] && [ ! -f "$HOME/.config/vscode-dev-containers/first-run-notice-already-displayed" ]; then 240 | if [ -f "/usr/local/etc/vscode-dev-containers/first-run-notice.txt" ]; then 241 | cat "/usr/local/etc/vscode-dev-containers/first-run-notice.txt" 242 | elif [ -f "/workspaces/.codespaces/shared/first-run-notice.txt" ]; then 243 | cat "/workspaces/.codespaces/shared/first-run-notice.txt" 244 | fi 245 | mkdir -p "$HOME/.config/vscode-dev-containers" 246 | # Mark first run notice as displayed after 10s to avoid problems with fast terminal refreshes hiding it 247 | ((sleep 10s; touch "$HOME/.config/vscode-dev-containers/first-run-notice-already-displayed") &) 248 | fi 249 | 250 | # Set the default git editor if not already set 251 | if [ -z "$(git config --get core.editor)" ] && [ -z "${GIT_EDITOR}" ]; then 252 | if [ "${TERM_PROGRAM}" = "vscode" ]; then 253 | if [[ -n $(command -v code-insiders) && -z $(command -v code) ]]; then 254 | export GIT_EDITOR="code-insiders --wait" 255 | else 256 | export GIT_EDITOR="code --wait" 257 | fi 258 | fi 259 | fi 260 | 261 | EOF 262 | )" 263 | 264 | # code shim, it fallbacks to code-insiders if code is not available 265 | cat << 'EOF' > /usr/local/bin/code 266 | #!/bin/sh 267 | 268 | get_in_path_except_current() { 269 | which -a "$1" | grep -A1 "$0" | grep -v "$0" 270 | } 271 | 272 | code="$(get_in_path_except_current code)" 273 | 274 | if [ -n "$code" ]; then 275 | exec "$code" "$@" 276 | elif [ "$(command -v code-insiders)" ]; then 277 | exec code-insiders "$@" 278 | else 279 | echo "code or code-insiders is not installed" >&2 280 | exit 127 281 | fi 282 | EOF 283 | chmod +x /usr/local/bin/code 284 | 285 | # systemctl shim - tells people to use 'service' if systemd is not running 286 | cat << 'EOF' > /usr/local/bin/systemctl 287 | #!/bin/sh 288 | set -e 289 | if [ -d "/run/systemd/system" ]; then 290 | exec /bin/systemctl/systemctl "$@" 291 | else 292 | echo '\n"systemd" is not running in this container due to its overhead.\nUse the "service" command to start services intead. e.g.: \n\nservice --status-all' 293 | fi 294 | EOF 295 | chmod +x /usr/local/bin/systemctl 296 | 297 | # Codespaces bash and OMZ themes - partly inspired by https://github.com/ohmyzsh/ohmyzsh/blob/master/themes/robbyrussell.zsh-theme 298 | codespaces_bash="$(cat \ 299 | <<'EOF' 300 | 301 | # Codespaces bash prompt theme 302 | __bash_prompt() { 303 | local userpart='`export XIT=$? \ 304 | && [ ! -z "${GITHUB_USER}" ] && echo -n "\[\033[0;32m\]@${GITHUB_USER} " || echo -n "\[\033[0;32m\]\u " \ 305 | && [ "$XIT" -ne "0" ] && echo -n "\[\033[1;31m\]➜" || echo -n "\[\033[0m\]➜"`' 306 | local gitbranch='`\ 307 | if [ "$(git config --get codespaces-theme.hide-status 2>/dev/null)" != 1 ]; then \ 308 | export BRANCH=$(git symbolic-ref --short HEAD 2>/dev/null || git rev-parse --short HEAD 2>/dev/null); \ 309 | if [ "${BRANCH}" != "" ]; then \ 310 | echo -n "\[\033[0;36m\](\[\033[1;31m\]${BRANCH}" \ 311 | && if git ls-files --error-unmatch -m --directory --no-empty-directory -o --exclude-standard ":/*" > /dev/null 2>&1; then \ 312 | echo -n " \[\033[1;33m\]✗"; \ 313 | fi \ 314 | && echo -n "\[\033[0;36m\]) "; \ 315 | fi; \ 316 | fi`' 317 | local lightblue='\[\033[1;34m\]' 318 | local removecolor='\[\033[0m\]' 319 | PS1="${userpart} ${lightblue}\w ${gitbranch}${removecolor}\$ " 320 | unset -f __bash_prompt 321 | } 322 | __bash_prompt 323 | 324 | EOF 325 | )" 326 | 327 | codespaces_zsh="$(cat \ 328 | <<'EOF' 329 | # Codespaces zsh prompt theme 330 | __zsh_prompt() { 331 | local prompt_username 332 | if [ ! -z "${GITHUB_USER}" ]; then 333 | prompt_username="@${GITHUB_USER}" 334 | else 335 | prompt_username="%n" 336 | fi 337 | PROMPT="%{$fg[green]%}${prompt_username} %(?:%{$reset_color%}➜ :%{$fg_bold[red]%}➜ )" # User/exit code arrow 338 | PROMPT+='%{$fg_bold[blue]%}%(5~|%-1~/…/%3~|%4~)%{$reset_color%} ' # cwd 339 | PROMPT+='$([ "$(git config --get codespaces-theme.hide-status 2>/dev/null)" != 1 ] && git_prompt_info)' # Git status 340 | PROMPT+='%{$fg[white]%}$ %{$reset_color%}' 341 | unset -f __zsh_prompt 342 | } 343 | ZSH_THEME_GIT_PROMPT_PREFIX="%{$fg_bold[cyan]%}(%{$fg_bold[red]%}" 344 | ZSH_THEME_GIT_PROMPT_SUFFIX="%{$reset_color%} " 345 | ZSH_THEME_GIT_PROMPT_DIRTY=" %{$fg_bold[yellow]%}✗%{$fg_bold[cyan]%})" 346 | ZSH_THEME_GIT_PROMPT_CLEAN="%{$fg_bold[cyan]%})" 347 | __zsh_prompt 348 | 349 | EOF 350 | )" 351 | 352 | # Add RC snippet and custom bash prompt 353 | if [ "${RC_SNIPPET_ALREADY_ADDED}" != "true" ]; then 354 | echo "${rc_snippet}" >> /etc/bash.bashrc 355 | echo "${codespaces_bash}" >> "${user_rc_path}/.bashrc" 356 | echo 'export PROMPT_DIRTRIM=4' >> "${user_rc_path}/.bashrc" 357 | if [ "${USERNAME}" != "root" ]; then 358 | echo "${codespaces_bash}" >> "/root/.bashrc" 359 | echo 'export PROMPT_DIRTRIM=4' >> "/root/.bashrc" 360 | fi 361 | chown ${USERNAME}:${group_name} "${user_rc_path}/.bashrc" 362 | RC_SNIPPET_ALREADY_ADDED="true" 363 | fi 364 | 365 | # Optionally install and configure zsh and Oh My Zsh! 366 | if [ "${INSTALL_ZSH}" = "true" ]; then 367 | if ! type zsh > /dev/null 2>&1; then 368 | apt_get_update_if_needed 369 | apt-get install -y zsh 370 | fi 371 | if [ "${ZSH_ALREADY_INSTALLED}" != "true" ]; then 372 | echo "${rc_snippet}" >> /etc/zsh/zshrc 373 | ZSH_ALREADY_INSTALLED="true" 374 | fi 375 | 376 | # Adapted, simplified inline Oh My Zsh! install steps that adds, defaults to a codespaces theme. 377 | # See https://github.com/ohmyzsh/ohmyzsh/blob/master/tools/install.sh for official script. 378 | oh_my_install_dir="${user_rc_path}/.oh-my-zsh" 379 | if [ ! -d "${oh_my_install_dir}" ] && [ "${INSTALL_OH_MYS}" = "true" ]; then 380 | template_path="${oh_my_install_dir}/templates/zshrc.zsh-template" 381 | user_rc_file="${user_rc_path}/.zshrc" 382 | umask g-w,o-w 383 | mkdir -p ${oh_my_install_dir} 384 | git clone --depth=1 \ 385 | -c core.eol=lf \ 386 | -c core.autocrlf=false \ 387 | -c fsck.zeroPaddedFilemode=ignore \ 388 | -c fetch.fsck.zeroPaddedFilemode=ignore \ 389 | -c receive.fsck.zeroPaddedFilemode=ignore \ 390 | "https://github.com/ohmyzsh/ohmyzsh" "${oh_my_install_dir}" 2>&1 391 | echo -e "$(cat "${template_path}")\nDISABLE_AUTO_UPDATE=true\nDISABLE_UPDATE_PROMPT=true" > ${user_rc_file} 392 | sed -i -e 's/ZSH_THEME=.*/ZSH_THEME="codespaces"/g' ${user_rc_file} 393 | 394 | mkdir -p ${oh_my_install_dir}/custom/themes 395 | echo "${codespaces_zsh}" > "${oh_my_install_dir}/custom/themes/codespaces.zsh-theme" 396 | # Shrink git while still enabling updates 397 | cd "${oh_my_install_dir}" 398 | git repack -a -d -f --depth=1 --window=1 399 | # Copy to non-root user if one is specified 400 | if [ "${USERNAME}" != "root" ]; then 401 | cp -rf "${user_rc_file}" "${oh_my_install_dir}" /root 402 | chown -R ${USERNAME}:${group_name} "${user_rc_path}" 403 | fi 404 | fi 405 | fi 406 | 407 | # Persist image metadata info, script if meta.env found in same directory 408 | meta_info_script="$(cat << 'EOF' 409 | #!/bin/sh 410 | . /usr/local/etc/vscode-dev-containers/meta.env 411 | 412 | # Minimal output 413 | if [ "$1" = "version" ] || [ "$1" = "image-version" ]; then 414 | echo "${VERSION}" 415 | exit 0 416 | elif [ "$1" = "release" ]; then 417 | echo "${GIT_REPOSITORY_RELEASE}" 418 | exit 0 419 | elif [ "$1" = "content" ] || [ "$1" = "content-url" ] || [ "$1" = "contents" ] || [ "$1" = "contents-url" ]; then 420 | echo "${CONTENTS_URL}" 421 | exit 0 422 | fi 423 | 424 | #Full output 425 | echo 426 | echo "Development container image information" 427 | echo 428 | if [ ! -z "${VERSION}" ]; then echo "- Image version: ${VERSION}"; fi 429 | if [ ! -z "${DEFINITION_ID}" ]; then echo "- Definition ID: ${DEFINITION_ID}"; fi 430 | if [ ! -z "${VARIANT}" ]; then echo "- Variant: ${VARIANT}"; fi 431 | if [ ! -z "${GIT_REPOSITORY}" ]; then echo "- Source code repository: ${GIT_REPOSITORY}"; fi 432 | if [ ! -z "${GIT_REPOSITORY_RELEASE}" ]; then echo "- Source code release/branch: ${GIT_REPOSITORY_RELEASE}"; fi 433 | if [ ! -z "${BUILD_TIMESTAMP}" ]; then echo "- Timestamp: ${BUILD_TIMESTAMP}"; fi 434 | if [ ! -z "${CONTENTS_URL}" ]; then echo && echo "More info: ${CONTENTS_URL}"; fi 435 | echo 436 | EOF 437 | )" 438 | if [ -f "${SCRIPT_DIR}/meta.env" ]; then 439 | mkdir -p /usr/local/etc/vscode-dev-containers/ 440 | cp -f "${SCRIPT_DIR}/meta.env" /usr/local/etc/vscode-dev-containers/meta.env 441 | echo "${meta_info_script}" > /usr/local/bin/devcontainer-info 442 | chmod +x /usr/local/bin/devcontainer-info 443 | fi 444 | 445 | # Write marker file 446 | mkdir -p "$(dirname "${MARKER_FILE}")" 447 | echo -e "\ 448 | PACKAGES_ALREADY_INSTALLED=${PACKAGES_ALREADY_INSTALLED}\n\ 449 | LOCALE_ALREADY_SET=${LOCALE_ALREADY_SET}\n\ 450 | EXISTING_NON_ROOT_USER=${EXISTING_NON_ROOT_USER}\n\ 451 | RC_SNIPPET_ALREADY_ADDED=${RC_SNIPPET_ALREADY_ADDED}\n\ 452 | ZSH_ALREADY_INSTALLED=${ZSH_ALREADY_INSTALLED}" > "${MARKER_FILE}" 453 | 454 | echo "Done!" 455 | -------------------------------------------------------------------------------- /.github/workflows/metrics.yml: -------------------------------------------------------------------------------- 1 | # GitHub Action to post GitHub metrics to an Azure Function App webhook 2 | # Required secrets 3 | # 1. A PAT with repo rights: PAT_REPO_REPORT 4 | # 2. The webhook endpoint url: REPORTING_ENDPOINT_URL 5 | # 3. The webhook endpoint key: REPORTING_ENDPOINT_KEY 6 | # 4. Reporting group/team: REPORTING_GROUP 7 | 8 | name: "GitHub repo metrics report" 9 | 10 | on: 11 | schedule: 12 | # Run this once per day, towards the end of the day for keeping the most 13 | # recent data point most meaningful (hours are interpreted in UTC). 14 | - cron: "0 23 * * *" 15 | workflow_dispatch: # Allow for running this manually. 16 | 17 | jobs: 18 | report_metrics_job: 19 | runs-on: ubuntu-latest 20 | name: GitHub repo metrics report 21 | steps: 22 | - name: run github metrics image 23 | id: github_metrics 24 | uses: gloveboxes/GitHubMetricsAction@v1 25 | with: 26 | github_repo: ${{ github.repository }} 27 | github_personal_access_token: ${{ secrets.REPORTING_PAT }} 28 | reporting_endpoint_url: ${{ secrets.REPORTING_ENDPOINT_URL }} 29 | reporting_endpoint_key: ${{ secrets.REPORTING_ENDPOINT_KEY }} 30 | reporting_group: $${{ secrets.REPORTING_GROUP }} 31 | -------------------------------------------------------------------------------- /.github/workflows/metrics_keepalive.yml: -------------------------------------------------------------------------------- 1 | # GitHub Actions go to sleep after 60 days of no activity 2 | # This action runs every month and updates the .stats_timestamp with the current UTC 3 | # This will generate activity on the repo 4 | # This activity will keep the git hub repo metrics action alive 5 | 6 | name: "GitHub repo metrics report keep alive" 7 | 8 | on: 9 | schedule: 10 | # Run this once per day, towards the end of the day for keeping the most 11 | # recent data point most meaningful (hours are interpreted in UTC). 12 | # https://cron.help/every-month 13 | # Action run midnight UTC 1st of each month 14 | - cron: "0 0 1 * *" 15 | workflow_dispatch: # Allow for running this manually. 16 | 17 | jobs: 18 | resources: 19 | name: Update the repo metrics timestamp 20 | runs-on: ubuntu-latest 21 | steps: 22 | - uses: actions/checkout@v3 23 | 24 | - name: timestamp 25 | run: date -Iseconds -u > .stats_timestamp 26 | 27 | - name: Update stats timestamp 28 | run: | 29 | git add .stats_timestamp 30 | git config user.name "github-actions[bot]" 31 | git config user.email "41898282+github-actions[bot]@users.noreply.github.com" 32 | git commit -am "Repo Stats Timestamp" 33 | git push 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | 5 | # Session Data files 6 | .RData 7 | 8 | # User-specific files 9 | .Ruserdata 10 | 11 | # Example code in package build process 12 | *-Ex.R 13 | 14 | # Output files from R CMD build 15 | /*.tar.gz 16 | 17 | # Output files from R CMD check 18 | /*.Rcheck/ 19 | 20 | # RStudio files 21 | .Rproj.user/ 22 | 23 | # produced vignettes 24 | vignettes/*.html 25 | vignettes/*.pdf 26 | 27 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 28 | .httr-oauth 29 | 30 | # knitr and R markdown default cache directories 31 | *_cache/ 32 | /cache/ 33 | 34 | # Temporary files created by R markdown 35 | *.utf8.md 36 | *.knit.md 37 | 38 | # R Environment Variables 39 | .Renviron 40 | -------------------------------------------------------------------------------- /.stats_timestamp: -------------------------------------------------------------------------------- 1 | 2024-05-01T01:08:13+00:00 2 | -------------------------------------------------------------------------------- /EasyRTutorialsUseR2022.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/revodavid/devcontainers-r/ba8b859d19fb84750d162becdb2f890258697ccc/EasyRTutorialsUseR2022.pdf -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 David Smith 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Easy R Tutorials with Dev Containers 2 | 3 | This is the repository supporting the presentation "Easy R Tutorials with Dev Containers". 4 | 5 | * Presenter: [David Smith](https://www.linkedin.com/in/dmsmith/), Cloud Advocate at Microsoft 6 | * Presented at: [UseR!2022, June 21, 2022](https://user2022.r-project.org/program/talks/#session-10-building-the-r-community-1) 7 | * Presentation slides: [PDF](./EasyRTutorialsUseR2022.pdf) 8 | * Presentation recording: [UseR!2022](https://www.accelevents.com/e/user2022/portal/schedule/260730) (Starting at 47:45) 9 | 10 | You can recreate the demos in the talk using the steps outlined below. 11 | 12 | ## Dev Containers in GitHub Codepaces 13 | 14 | If you have access to GitHub CodeSpaces, click the green "Code <>" button at the top right on this repository page, and then select "Create codespace on main". (GitHub CodeSpaces is available with [GitHub Enterprise](https://github.com/enterprise) and [GitHub Education](https://education.github.com/).) 15 | 16 | Once the Dev Container has started, browse to the file [intro-regression-R-tidymodels/solution/Challenge-regression.ipynb](intro-regression-R-tidymodels/solution/Challenge-regression.ipynb). This will launch a Jupyter Notebook. 17 | 18 | ## Dev Containers on a local machine 19 | 20 | You can use Linux, Mac or Windows (including Windows Subsystem for Linux). Just make sure your machine has the following necessary software installed: 21 | - [Visual Studio Code](https://code.visualstudio.com?WT.mc_id=academic-55190-ornella), and the [Remote-Containers extension](https://code.visualstudio.com/docs/remote/containers) 22 | - [Docker Desktop](https://www.docker.com/products/docker-desktop) 23 | - [Git](https://git-scm.com/downloads) 24 | 25 | **Note**: you do not need to install R, Python, or anything like that. These will all be provided by the Dev Container. 26 | 27 | Copy the contents of this repository to your machine. An easy way to do this is with the command: 28 | ``` 29 | git clone https://github.com/revodavid/devcontainers-r 30 | ``` 31 | 32 | Launch Visual Studio Code, and open the directory containing this downloaded repository. An easy way to do this is: 33 | ``` 34 | cd devcontainers-r 35 | code . 36 | ``` 37 | 38 | Open the VS Code command palette (Control-Shift-P) and run the command **Remote-Containers: Reopen in Container**. (You can also use the pop-up dialog that automatically prompts you do this.) The first time you try this, you will need to wait a few minutes for the container to build. After this first time, startup will be near-instantaneous. 39 | 40 | Now, browse to the file [intro-regression-R-tidymodels/solution/Challenge-regression.ipynb](intro-regression-R-tidymodels/solution/Challenge-regression.ipynb). Work through the Jupyter Notebook. 41 | 42 | # Resources and Links 43 | 44 | * [Dev Containers](https://containers.dev/) - Overview and specification 45 | * [Dev Containers in Visual Studio Code](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) - Remote-Containers extension 46 | * [Visual Studio Code](https://code.visualstudio.com/) - Free editor available for Windows, Mac and Linux 47 | * [GitHub Codespaces](https://github.com/features/codespaces) - Available with GitHub Enterprise and GitHub Education 48 | * [Microsoft Workshop Library](https://github.com/microsoft/workshop-library 49 | ) - The source of the workshop "Introduction to regression models by using R and Tidymodels" included in this presentation 50 | 51 | # Image Credits 52 | 53 | Images used in presentation slides: 54 | * [File:A frustrated and depressed man holds his head in his hand.jpg - Wikimedia Commons](https://commons.wikimedia.org/wiki/File:A_frustrated_and_depressed_man_holds_his_head_in_his_hand.jpg) 55 | * [File:Confused Felipe.jpg - Wikimedia Commons](https://commons.wikimedia.org/wiki/File:Confused_Felipe.jpg) 56 | * [File:Woman looking depressed.jpg - Wikimedia Commons](https://commons.wikimedia.org/wiki/File:Woman_looking_depressed.jpg) 57 | * [File:Angry woman.jpg - Wikimedia Commons](https://commons.wikimedia.org/wiki/File:Angry_woman.jpg) 58 | * "Bit" artwork by Ashley Willis 59 | 60 | # Feedback 61 | 62 | If you have any comments or suggestions about this presentation, please leave an issue in this repository. -------------------------------------------------------------------------------- /intro-regression-R-tidymodels/README.md: -------------------------------------------------------------------------------- 1 | # Introduction to regression models by using R and Tidymodels 2 | 3 | ## Module Source 4 | [Introduction to regression models by using R and tidymodels](https://docs.microsoft.com/en-us/learn/modules/introduction-regression-models/?WT.mc_id=academic-59300-cacaste) 5 | 6 | ## Goals 7 | 8 | Hello and welcome to this learning adventure! In this folder, you will find a Regression Challenge Notebook. This is basically an autograding guided assessment notebook that will help you test your understanding in using R to create models that can predict a numeric, quantifiable value, such as a price, amount, size, or other scalar numbers. 9 | 10 | | **Goal** | Description | 11 | | ----------------------------- | -----------------------------------------------| 12 | | **What will you learn** | How to create regression models in R | 13 | | **What you'll need** | [Visual Studio Code](https://code.visualstudio.com?WT.mc_id=academic-59300-cacaste), [Docker Desktop](https://www.docker.com/products/docker-desktop), [Remote Developer Extension](https://aka.ms/vscode-remote/download/extension) and [Git](https://git-scm.com/downloads) | 14 | | **Duration** | 1.5 to 2 Hours | 15 | | **Slides** | [Powerpoint](./slides.pptx) | 16 | 17 | ## Video 18 | 19 | [![workshop walk-through](./images/promo.png)](https://youtu.be/ckqijBKO-Es "workshop walk-through") 20 | > 🎥 Click this image to watch Carlotta walk you through the workshop material and to gain some tips about delivering this workshop. 21 | 22 | ## Pre-Learning 23 | 24 | This workshop allows learners to use the skills learnt in the module [Introduction to regression models by using R and tidymodels](https://docs.microsoft.com/en-us/learn/modules/introduction-regression-models/?WT.mc_id=academic-59300-cacaste) to create their own regression models. As such, learners are encouraged to go through the module beforehand so as to be conversant with some of the concepts covered in this workshop. 25 | This workshop is the second of a series designed to get started with data science in R. So you may also want to have a look to the [first](../explore-analyze-data-with-R) workshop of the series, dealing with exploratory data analysis. 26 | 27 | ## Prerequisites 28 | 29 | To get you up and running and writing R code in no time, we have containerized this workshop such that you have a ready out of the box R coding environment. 30 | 31 | ### Setting up the development container 32 | 33 | A **development container** is a running [Docker](https://www.docker.com) container with a well-defined tool/runtime stack and its prerequisites. You can try out development containers with **[GitHub Codespaces](https://github.com/features/codespaces)**, **[Binder](https://mybinder.org/)** or **[Visual Studio Code Remote - Containers](https://aka.ms/vscode-remote/containers)**. 34 | 35 | #### GitHub Codespaces 36 | Follow these steps to open this workshop in a Codespace: 37 | 1. Click the Code drop-down menu and select the **Open with Codespaces** option. 38 | 2. Select **+ New codespace** at the bottom on the pane. 39 | 40 | For more info, check out the [GitHub documentation](https://docs.github.com/en/free-pro-team@latest/github/developing-online-with-codespaces/creating-a-codespace#creating-a-codespace). 41 | 42 | #### Binder 43 | This workshop is also available on Binder. To open the notebook in a Binder environment, just click the button below. 44 | 45 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/carlotta94c/workshop-library/introToRegressionR%26Tidymodels?labpath=full%2Fintro-regression-R-tidymodels%2Fsolution%2FChallenge-regression.ipynb) 46 | 47 | #### Learn Sandbox 48 | You can go through this challenge also leveraging on the Learn Sandbox environment, provided by [Unit 9](https://docs.microsoft.com/en-us/learn/modules/introduction-regression-models/9-challenge-regression) of the MS Learn module - Introduction to regression models by using R and tidymodels. Just sign in with your Microsoft or GitHub account and click on **Activate sandbox** to start. 49 | 50 | #### VS Code Remote - Containers 51 | Follow these steps to open this workshop in a container using the VS Code Remote - Containers extension: 52 | 53 | 1. If this is your first time using a development container, please ensure your system meets the pre-reqs (i.e. have Docker installed) in the [getting started steps](https://aka.ms/vscode-remote/containers/getting-started). 54 | 55 | 2. Press F1 select and **Add Development Container Configuration Files...** command for **Remote-Containers** or **Codespaces**. 56 | 57 | > **Note:** If needed, you can drag-and-drop the `.devcontainer` folder from this sub-folder in a locally cloned copy of this repository into the VS Code file explorer instead of using the command. 58 | 59 | 3. Select this definition. You may also need to select **Show All Definitions...** for it to appear. 60 | 61 | 4. Finally, press F1 and run **Remote-Containers: Reopen Folder in Container** to start using the definition. 62 | 63 | This definition includes some test code that will help you verify it is working as expected on your system. Open the `all-systems-check` folder where you can choose to run the `.R`, `.Rmd` or `.ipynb` scripts. You should see "Hello, remote world!" in an R terminal window (for `.R` and `.Rmd`) or within a Jupyter Notebook (for `.ipynb`) after the respective script executes. 64 | 65 | At some point, you may want to make changes to your container, such as installing a new package. You'll need to rebuild your container for your changes to take effect. 66 | 67 | ## What you will learn 68 | 69 | Let's say we are a real estate agent and we've just been handed a couple of new houses at different locations of a city. We don't know the selling price, and we want to get an estimate of it by comparing it with that of other houses in the location. 70 | 71 | In this challenge, you will use a dataset of real estate sales transactions to predict the price-per-unit of a property based on features such as the property age, availability of local amenities, and location. 72 | 73 | ## Milestone 1: Explore the Data 🕵️‍️ 74 | 75 | The first step in any machine learning project is typically to explore the data that you will use to train a model. The goal of this exploration is to try to understand the relationships between its attributes; in particular, any apparent correlation between the features and the label your model will try to predict. 76 | 77 | In this section you will: 78 | 79 | - Import the data and identify the `features` (predictors) and `label` (outcome) variables that you'll be working with. 80 | 81 | - Examine the summary statistics and plot the distribution of the outcome variable, in our case: price of a house. 82 | 83 | - Deal with outliers. 84 | 85 | - Examine the apparent relationship between numeric features and the price of a house using the correlation statistic and scatter plots. 86 | 87 | - Examine the distribution of the house price for each categorical feature using boxplots. 88 | 89 | Now that you've explored the data, it's time to use it to train a regression model that uses the features we've identified as potentially predictive to predict the rentals label. 90 | 91 | ## Milestone 2: Data budgeting 92 | 93 | It’s common practice in supervised learning to split the data into two subsets; a (typically larger) set with which to train the model, and a smaller “hold-back” set with which to validate the trained model. This enables us to evaluate how well the model performs in order to get a better estimate of how your models will perform on new data. 94 | 95 | In this section you will: 96 | 97 | - Create a data splitting specification i.e what proportion goes to training and what goes to testing. 98 | 99 | - Extract the training and testing sets. 100 | 101 | ## Milestone 3: Create a Random Forest model specification 102 | 103 | In this section, you will create a model specification with the following information: 104 | 105 | - the `type` of model is `random forest` 106 | 107 | - the `mode` of the model is `regression` (as opposed to classification, etc) 108 | 109 | - the computational `engine` is the name of the R package, which in our case will be the `randomForest` package. 110 | 111 | ## Milestone 4: Preprocess data using recipes 112 | 113 | A recipe is an object that defines a series of steps for data processing. In practice, it's common to perform some preprocessing of the data to make it easier for an algorithm to fit a model to it. 114 | 115 | In this section, you will specify a recipe that will: 116 | 117 | - Remove the transaction_date feature. 118 | 119 | - Transform local_convenience_stores feature into categorical (factor). 120 | 121 | - Center and scale all numeric predictors. 122 | 123 | ## Milestone 5: Create a modeling workflow and train a model 124 | 125 | A model workflow allows the user to bind modeling and preprocessing objects together. You can then fit the entire workflow to the data, so that the model encapsulates all of the preprocessing steps as well as the algorithm. 126 | 127 | In this section you will: 128 | 129 | - Create a model workflow. 130 | 131 | - Train the random forest model. 132 | 133 | ## Milestone 6: Evaluate model performance 134 | 135 | Once you have a trained model using the training set, you will want to evaluate how well (or not) it will perform on new data. 136 | 137 | In this section, you will: 138 | 139 | - Use the trained model to make predictions on the `test set`. 140 | 141 | - Evaluate the model predictions using metrics such as `rmse` and `R^2`. 142 | 143 | ## Milestone 7: Use the trained model 144 | 145 | If you are satisfied with the model performance, you can save it to be used later. You can then load it whenever you need it, and use it to predict labels for new data. This is often called scoring or inferencing. 146 | 147 | In this section you will: 148 | 149 | - Save your trained model, and then use it to predict the price-per-unit for the following real estate transactions: 150 | 151 | | **transaction_date** | **house_age** | **transit_distance** | **local_convenience_stores** | **latitude** | **longitude** | 152 | |---------------------|----------------|--------------|--------|-------|------| 153 | | 2013.167 | 16.2 | 289.3248 | 5 | 24.98203 | 121.54348 | 154 | | 2013.000 | 13.6 | 4082.015 | 0 | 24.94155 | 121.50381 | 155 | 156 | ## Quiz 157 | 158 | Test your knowledge with [a short quiz](https://docs.microsoft.com/en-us/learn/modules/introduction-regression-models/8-knowledge-check)! 159 | 160 | ## Next steps 161 | 162 | Congratulations on finishing this regression challenge 🏅! 163 | 164 | There are other workshops around using R for Data Science. In this workshop, we learnt how regression can be used to create a machine learning model that predicts numeric values. The next set of workshops show you how to [create classification models](../intro-classification-R-tidymodels) and create clustering models (coming soon!). Be sure to check them out! 165 | 166 | ## Practice 167 | 168 | In this workshop, you trained a single model (random forest) to predict house prices based on their features. Sometimes, a data practitioner may need to try out a couple of models. Try using other models discussed in this workshop. Try tuning some model hyperparameters while at it too. Do you obtain better evaluation metrics? 169 | 170 | 171 | ## Feedback 172 | 173 | Be sure to give [feedback about this workshop](https://forms.office.com/r/MdhJWMZthR)! Happy Learning! 174 | 175 | [Code of Conduct](../../CODE_OF_CONDUCT.md) 176 | -------------------------------------------------------------------------------- /intro-regression-R-tidymodels/images/promo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/revodavid/devcontainers-r/ba8b859d19fb84750d162becdb2f890258697ccc/intro-regression-R-tidymodels/images/promo.png -------------------------------------------------------------------------------- /intro-regression-R-tidymodels/slides.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/revodavid/devcontainers-r/ba8b859d19fb84750d162becdb2f890258697ccc/intro-regression-R-tidymodels/slides.pptx -------------------------------------------------------------------------------- /intro-regression-R-tidymodels/solution/Challenge-regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Regression Challenge\n", 8 | "\n", 9 | "Predicting the selling price of a residential property depends on a number of factors, including the property age, availability of local amenities, and location.\n", 10 | "\n", 11 | "In this challenge, you will use a dataset of real estate sales transactions to predict the price-per-unit of a property based on its features. The price-per-unit in this data is based on a unit measurement of 3.3 square meters.\n", 12 | "\n", 13 | "> **Citation**: The data used in this exercise originates from the following study:\n", 14 | ">\n", 15 | "> *Yeh, I. C., & Hsu, T. K. (2018). Building real estate valuation models with comparative approach through case-based reasoning. Applied Soft Computing, 65, 260-271.*\n", 16 | ">\n", 17 | "> It was obtained from the UCI dataset repository (Dua, D. and Graff, C. (2019). [UCI Machine Learning Repository](http://archive.ics.uci.edu/ml). Irvine, CA: University of California, School of Information and Computer Science).\n", 18 | "\n", 19 | "## Review the data\n", 20 | "\n", 21 | "Let's hit the ground running by importing the data and viewing the first few rows.\n" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "# Load the core tidyverse and tidymodels in your current R session\n", 31 | "suppressPackageStartupMessages({\n", 32 | " library(tidyverse)\n", 33 | " library(tidymodels)\n", 34 | "})\n", 35 | "\n", 36 | "# Read the csv file into a tibble\n", 37 | "estate_data <- read_csv(file = \"https://raw.githubusercontent.com/MicrosoftDocs/ml-basics/master/challenges/data/real_estate.csv\", \n", 38 | "show_col_types = FALSE)\n", 39 | "\n", 40 | "# Print the first 10 rows of the data\n", 41 | "estate_data %>%\n", 42 | " slice_head(n = 10)\n" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "The data consists of the following variables:\n", 50 | "\n", 51 | "- **transaction_date** - the transaction date (for example, 2013.250=2013 March, 2013.500=2013 June, etc.)\n", 52 | "\n", 53 | "- **house_age** - the house age (in years)\n", 54 | "\n", 55 | "- **transit_distance** - the distance to the nearest light rail station (in meters)\n", 56 | "\n", 57 | "- **local_convenience_stores** - the number of convenience stores within walking distance\n", 58 | "\n", 59 | "- **latitude** - the geographic coordinate, latitude\n", 60 | "\n", 61 | "- **longitude** - the geographic coordinate, longitude\n", 62 | "\n", 63 | "- **price_per_unit** house price of unit area (3.3 square meters)\n", 64 | "\n", 65 | "Your challenge is to explore and prepare the data, identify predictive features that will help predict the `price_per_unit` label, and train a regression model that achieves the lowest *Root Mean Square Error* (RMSE) you can achieve (which must be less than *7*) when evaluated against a test subset of data.\n", 66 | "\n", 67 | "### View the label distribution\n", 68 | "\n", 69 | "Let's start our analysis of the data by examining a few key descriptive statistics. We can use the `summarytools::descr()` function to neatly and quickly summarize the numeric features as well as the *rentals* label column.\n" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "# Load summary tools library\n", 79 | "library(summarytools)\n", 80 | "\n", 81 | "# Obtain summary stats for feature and label columns\n", 82 | "estate_data %>%\n", 83 | " # Summary stats\n", 84 | " descr(order = \"preserve\",\n", 85 | " stats = c(\"mean\", \"sd\", \"min\", \"q1\", \"med\", \"q3\", \"max\"),\n", 86 | " round.digits = 6)\n" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "The statistics reveal some information about the distribution of the data in each of the numeric fields, including the number of observations (there are 414 records), the mean, standard deviation, minimum and maximum values, and the quantile values (the threshold values for 25%, 50% - which is also the median, and 75% of the data).\n", 94 | "\n", 95 | "From this, we can see that the mean number of price per unit is around 38. There's a comparatively *small standard deviation*, indicating *not much variance* in the prices per unit.\n", 96 | "\n", 97 | "We might get a clearer idea of the distribution of price values by visualizing the data.\n" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "library(patchwork)\n", 107 | "\n", 108 | "# Plot a histogram\n", 109 | "theme_set(theme_light())\n", 110 | "\n", 111 | "hist_plt <- estate_data %>%\n", 112 | " ggplot(mapping = aes(x = price_per_unit)) +\n", 113 | " geom_histogram(bins = 100, fill = \"midnightblue\", alpha = 0.7) +\n", 114 | "\n", 115 | " # Add lines for mean and median\n", 116 | " geom_vline(aes(xintercept = mean(price_per_unit), color = \"Mean\"),\n", 117 | " linetype = \"dashed\", size = 1.3) +\n", 118 | " geom_vline(aes(xintercept = median(price_per_unit), color = \"Median\"),\n", 119 | " linetype = \"dashed\", size = 1.3) +\n", 120 | " xlab(\"\") +\n", 121 | " ylab(\"Frequency\") +\n", 122 | " scale_color_manual(name = \"\", values = c(Mean = \"red\", Median = \"yellow\")) +\n", 123 | " theme(legend.position = c(0.9, 0.9), legend.background = element_blank())\n", 124 | "\n", 125 | "# Plot a box plot\n", 126 | "box_plt <- estate_data %>%\n", 127 | " ggplot(aes(x = price_per_unit, y = 1)) +\n", 128 | " geom_boxplot(fill = \"#E69F00\", color = \"gray23\", alpha = 0.7) +\n", 129 | " # Add titles and labels\n", 130 | " xlab(\"Price_per_unit\") +\n", 131 | " ylab(\"\")\n", 132 | "\n", 133 | "\n", 134 | "# Combine plots using patchwork syntax\n", 135 | "(hist_plt / box_plt) +\n", 136 | " plot_annotation(title = \"Price Distribution\",\n", 137 | " theme = theme(\n", 138 | " plot.title = element_text(hjust = 0.5)))\n" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "What can we observe from the boxplot? Yes, outliers.\n", 146 | "\n", 147 | "### Remove outliers\n", 148 | "\n", 149 | "We are now set to begin writing some code ourselves 🙂. Let's begin by dealing with outliers. An outlier is a data point that differs significantly from other observations.\n", 150 | "\n", 151 | "**Question 1.**\n", 152 | "\n", 153 | "Starting with the `estate_data` dataset, `filter` to create a subset that contains observations where `price_per_unit` is less than *70*.\n", 154 | "\n", 155 | "Fill in the placeholder `....` with the right code." 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "# Narrow down to observations whose price_per_unit is less than 70\n", 165 | "estate_data <- estate_data %>%\n", 166 | " ....\n" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "Test your answer:\n" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | ". <- ottr::check(\"tests/Question 1.R\")\n" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "Now let's take a look at the distribution without the outliers.\n", 190 | "\n" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "# Plot a histogram\n", 200 | "theme_set(theme_light())\n", 201 | "hist_plt <- estate_data %>%\n", 202 | " ggplot(mapping = aes(x = price_per_unit)) + \n", 203 | " geom_histogram(bins = 100, fill = \"midnightblue\", alpha = 0.7) +\n", 204 | "\n", 205 | " # Add lines for mean and median\n", 206 | " geom_vline(aes(xintercept = mean(price_per_unit), color = \"Mean\"),\n", 207 | " linetype = \"dashed\", size = 1.3) +\n", 208 | " geom_vline(aes(xintercept = median(price_per_unit), color = \"Median\"),\n", 209 | " linetype = \"dashed\", size = 1.3) +\n", 210 | " xlab(\"\") +\n", 211 | " ylab(\"Frequency\") +\n", 212 | " scale_color_manual(name = \"\", values = c(Mean = \"red\", Median = \"yellow\")) +\n", 213 | " theme(legend.position = c(0.9, 0.9), legend.background = element_blank())\n", 214 | "\n", 215 | "# Plot a box plot\n", 216 | "box_plt <- estate_data %>%\n", 217 | " ggplot(aes(x = price_per_unit, y = 1)) +\n", 218 | " geom_boxplot(fill = \"#E69F00\", color = \"gray23\", alpha = 0.7) +\n", 219 | " # Add titles and labels\n", 220 | " xlab(\"Price_per_unit\") +\n", 221 | " ylab(\"\")\n", 222 | "\n", 223 | "\n", 224 | "# Combine plots using patchwork syntax\n", 225 | "(hist_plt / box_plt) +\n", 226 | " plot_annotation(title = \"Price Distribution\",\n", 227 | " theme = theme(\n", 228 | " plot.title = element_text(hjust = 0.5)))\n" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "Much better 🤩! What can we say about the distribution of the price?\n", 236 | "\n", 237 | "### View numeric correlations\n", 238 | "\n", 239 | "We can now start to look for relationships between the *features* and the *label* we want to be able to predict.\n", 240 | "\n", 241 | "The *correlation* statistic, *r*, is a value between -1 and 1 that indicates the strength of a linear relationship.\n", 242 | "\n", 243 | "For numeric feature and label columns, we can create scatter plots that show the intersection of the feature and label values.\n", 244 | "\n", 245 | "**Question 2.**\n", 246 | "\n", 247 | "Starting with the `estate_data` dataset, in a piped sequence:\n", 248 | "\n", 249 | "- `pivot_longer` the data (increase the number of rows and decrease the number of columns) such that all the existing column names except price_per_unit, now fall under a new column name called `features` and their corresponding values under a new column name `values`\n", 250 | "\n", 251 | "- group the data by `features`\n", 252 | "\n", 253 | "- add a new column `corr_coef` which calculates the correlation between `values` and `price_per_unit` (hint: the function used for calculating correlation in R is `cor()`)\n", 254 | "\n", 255 | "Fill in the placeholder `....` with the right code." 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "# Pivot numeric features to a long format\n", 265 | "numeric_features_long <- estate_data %>%\n", 266 | " pivot_....(!price_per_unit, names_to = \"....\", values_to = \"....\") %>%\n", 267 | " # Group by features\n", 268 | " ....(features) %>%\n", 269 | " # Calculate correlation coefficient between values and price_per_unit\n", 270 | " mutate(corr_coef = ....) %>%\n", 271 | "\n", 272 | " # Modifies the feature column to also include corr_coef\n", 273 | " mutate(features = paste(features, \"vs price, r = \",\n", 274 | " round(corr_coef, 2), sep = \"\")) %>%\n", 275 | " ungroup()\n", 276 | "\n", 277 | "# Print the first few rows of the data\n", 278 | "numeric_features_long %>%\n", 279 | " slice_head(n = 10)\n" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "metadata": {}, 285 | "source": [ 286 | "Test your answer:\n" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | ". <- ottr::check(\"tests/Question 2.R\")\n" 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "metadata": {}, 301 | "source": [ 302 | "Fantastic! Now let's use a scatter plot to investigate whether there is any linear relationship between our predictors and outcome variables.\n", 303 | "\n" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "# Plot a scatter plot for each feature\n", 313 | "numeric_features_long %>%\n", 314 | " ggplot(aes(x = values, y = price_per_unit, color = features)) +\n", 315 | " geom_point(alpha = 0.7, show.legend = F) +\n", 316 | " facet_wrap(~ features, scales = \"free\") +\n", 317 | " paletteer::scale_color_paletteer_d(\"ggthemes::excel_Parallax\")\n" 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "metadata": {}, 323 | "source": [ 324 | "Take a moment and go through the scatter plot. How does the correlation between these features and the price vary?\n", 325 | "\n", 326 | "### View categorical features\n", 327 | "\n", 328 | "Now let's compare the categorical features to the label. We'll do this by creating box plots that show the distribution of rental counts for each category.\n", 329 | "\n", 330 | "`Transaction_date` and `local_convenience_stores` seem to be discrete values - so might work better if treated as categorical features. Let' get right into it.\n", 331 | "\n", 332 | "**Question 3.**\n", 333 | "\n", 334 | "Starting with the `estate_data` dataset, in a piped sequence:\n", 335 | "\n", 336 | "- only keep columns `transaction_date`, `local_convenience_stores` and `price_per_unit`\n", 337 | "\n", 338 | "- encode columns `transaction_date` and `local_convenience_stores` as categorical (factor)\n", 339 | "\n", 340 | "- `pivot_longer` the data (increase the number of rows and decrease the number of columns) such that all the existing column names except price_per_unit now fall under a new column name called `features` and their corresponding values under a new column name `values`\n", 341 | "\n", 342 | "Fill in the placeholder `....` with the right code." 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [ 351 | "# Pivot categorical features to a long format\n", 352 | "cat_features_long <- estate_data %>%\n", 353 | " ....(transaction_date, ...., ....) %>%\n", 354 | " # Encode transaction_date & local_convenience_stores features\n", 355 | " # from numeric to categorical\n", 356 | " mutate(....) %>%\n", 357 | " pivot_longer(....)\n", 358 | "\n", 359 | "# Print some observations\n", 360 | "cat_features_long %>%\n", 361 | " slice_head(n = 10)\n" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": {}, 367 | "source": [ 368 | "Test your answer:\n" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "metadata": {}, 375 | "outputs": [], 376 | "source": [ 377 | ". <- ottr::check(\"tests/Question 3.R\")\n" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": {}, 383 | "source": [ 384 | "Perfect! Now, for our categorical features, boxplots can be a great way of visualising how the price per unit varies within the levels of the categorical feature.\n", 385 | "\n" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": null, 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [ 394 | "# Plot a box plot for each feature\n", 395 | "cat_features_long %>%\n", 396 | " ggplot() +\n", 397 | " geom_boxplot(aes(x = values, y = price_per_unit, fill = features),\n", 398 | " alpha = 0.7, show.legend = F) +\n", 399 | " facet_wrap(~ features, scales = \"free\") +\n", 400 | " scale_fill_viridis_d() +\n", 401 | " theme(panel.grid = element_blank(),\n", 402 | " axis.text.x = element_text(angle = 90))\n" 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": {}, 408 | "source": [ 409 | "Take a moment and interpret the graphics. How does the price vary with these features?\n", 410 | "\n", 411 | "## Split the data into training and test sets.\n", 412 | "\n", 413 | "Now that we've explored the data, it's time to use it to train a regression model that uses the features we've identified as *potentially predictive* to predict the `price_per_unit` label.\n", 414 | "\n", 415 | "`Transaction_date` doesn't seem to be very predictive, so we'll omit it.\n", 416 | "\n", 417 | "Let's begin by splitting the data set such that some goes to training and some goes for validation. This enables us to evaluate how well the model performs in order to get a better estimate of how your models will perform on new data.\n", 418 | "\n", 419 | "**Question 4.**\n", 420 | "\n", 421 | "In this section:\n", 422 | "\n", 423 | "- Make a split specification of `estate_data` such that *70%* goes to training and the rest goes to testing. Save this to a variable name `estate_split`\n", 424 | "\n", 425 | "- Extract the training and testing sets from `estate_split` and save them in `estate_train` and `estate_test` variable names respectively.\n", 426 | "\n", 427 | "Fill in the placeholder `....` with the right code." 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": null, 433 | "metadata": {}, 434 | "outputs": [], 435 | "source": [ 436 | "# Set seed to ensure reproducibility and consistency of outputs\n", 437 | "set.seed(2056)\n", 438 | "\n", 439 | "# Load the tidymodels package\n", 440 | "library(tidymodels)\n", 441 | "\n", 442 | "# Split 70% of the data for training and the rest for tesing\n", 443 | "estate_split <- estate_data %>%\n", 444 | " initial_split(....)\n", 445 | "\n", 446 | "# Extract the train and test data in each split\n", 447 | "estate_train <- ....(estate_split)\n", 448 | "estate_test <- ....(estate_split)\n", 449 | "\n", 450 | "# Print the number of observations in each split\n", 451 | "cat(\"Training Set\", nrow(estate_train), \"rows\",\n", 452 | " \"\\nTest Set\", nrow(estate_test), \"rows\")\n" 453 | ] 454 | }, 455 | { 456 | "cell_type": "markdown", 457 | "metadata": {}, 458 | "source": [ 459 | "Test your answer:\n" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": null, 465 | "metadata": {}, 466 | "outputs": [], 467 | "source": [ 468 | ". <- ottr::check(\"tests/Question 4.R\")\n" 469 | ] 470 | }, 471 | { 472 | "cell_type": "markdown", 473 | "metadata": {}, 474 | "source": [ 475 | "Great progress 💪! Now let's train some models.\n", 476 | "\n", 477 | "## Train a regression model\n", 478 | "\n", 479 | "### Preprocess data using recipes\n", 480 | "\n", 481 | "Often before fitting a model, we may want to reformat the predictor values to make them easier for a model to use effectively. This includes transformations and encodings of the data to best represent their important characteristics. In R,this is done using a `recipe`.\n", 482 | "\n", 483 | "A recipe is an object that defines a series of steps for data processing.\n", 484 | "\n", 485 | "**Question 5.**\n", 486 | "\n", 487 | "In this section, specify a recipe, `estate_recipe`, that will:\n", 488 | "\n", 489 | "- Remove the `transaction_date` feature\n", 490 | "\n", 491 | "- Transform `local_convenience_stores` feature into categorical (factor)\n", 492 | "\n", 493 | "- Center and scale all numeric predictors\n", 494 | "\n", 495 | "Fill in the placeholder `....` with the right code." 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": null, 501 | "metadata": {}, 502 | "outputs": [], 503 | "source": [ 504 | "# Create a preprocessing recipe\n", 505 | "estate_recipe <- ....(price_per_unit ~ ., data = estate_train) %>%\n", 506 | " # Specify the removal of transaction_date\n", 507 | " step_rm(....) %>%\n", 508 | " # Specify the encoding of local_convenience_stores as categorical\n", 509 | " step_mutate(\n", 510 | " local_convenience_stores = ....) %>%\n", 511 | " # Specify the normalization of numeric features\n", 512 | " ....(all_numeric_predictors())\n", 513 | " \n", 514 | "# Print recipe\n", 515 | "estate_recipe\n" 516 | ] 517 | }, 518 | { 519 | "cell_type": "markdown", 520 | "metadata": {}, 521 | "source": [ 522 | "Test your answer:\n" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": null, 528 | "metadata": {}, 529 | "outputs": [], 530 | "source": [ 531 | ". <- ottr::check(\"tests/Question 5.R\")\n" 532 | ] 533 | }, 534 | { 535 | "cell_type": "markdown", 536 | "metadata": {}, 537 | "source": [ 538 | "Fantastic! We have the data processing in order. Now, let's make a model specification. In this solution, we'll try out a random forest model which applies an averaging function to multiple decision tree models for a better overall model.\n", 539 | "\n", 540 | "**Question 6.**\n", 541 | "\n", 542 | "Create a random forest model specification, `rf_spec`, which uses the `randomForest` package as its engine and then set the mode to `regression`.\n", 543 | "\n", 544 | "Fill in the placeholder `....` with the right code." 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": null, 550 | "metadata": {}, 551 | "outputs": [], 552 | "source": [ 553 | "# Build a random forest model specification\n", 554 | "rf_spec <- rand_forest() %>%\n", 555 | " # Specify engine\n", 556 | " .... %>%\n", 557 | " # Specify mode\n", 558 | " set_mode(\"....\")\n" 559 | ] 560 | }, 561 | { 562 | "cell_type": "markdown", 563 | "metadata": {}, 564 | "source": [ 565 | "Test your answer:\n" 566 | ] 567 | }, 568 | { 569 | "cell_type": "code", 570 | "execution_count": null, 571 | "metadata": {}, 572 | "outputs": [], 573 | "source": [ 574 | ". <- ottr::check(\"tests/Question 6.R\")\n" 575 | ] 576 | }, 577 | { 578 | "cell_type": "markdown", 579 | "metadata": {}, 580 | "source": [ 581 | "### Create a modeling workflow\n", 582 | "\n", 583 | "The *workflows* package allows the user to bind modeling and preprocessing objects together. You can then fit the entire workflow to the data, so that the model encapsulates all of the preprocessing steps as well as the algorithm.\n", 584 | "\n", 585 | "**Question 7.**\n", 586 | "\n", 587 | "Components of a `workflow()` go together like LEGO blocks. In this section, create a workflow container and then add the preprocessing information from our recipe and then add the model specification to be trained.\n", 588 | "\n", 589 | "Fill in the placeholder `....` with the right code." 590 | ] 591 | }, 592 | { 593 | "cell_type": "code", 594 | "execution_count": null, 595 | "metadata": {}, 596 | "outputs": [], 597 | "source": [ 598 | "# Create a workflow that bundles a recipe and model specification\n", 599 | "rf_workflow <- workflow() %>%\n", 600 | " # Add a recipe\n", 601 | " add_recipe(....) %>%\n", 602 | " # Add a model specification\n", 603 | " ....\n", 604 | "# Print workflow\n", 605 | "rf_workflow\n" 606 | ] 607 | }, 608 | { 609 | "cell_type": "markdown", 610 | "metadata": {}, 611 | "source": [ 612 | "Test your answer:\n" 613 | ] 614 | }, 615 | { 616 | "cell_type": "code", 617 | "execution_count": null, 618 | "metadata": {}, 619 | "outputs": [], 620 | "source": [ 621 | ". <- ottr::check(\"tests/Question 7.R\")\n" 622 | ] 623 | }, 624 | { 625 | "cell_type": "markdown", 626 | "metadata": {}, 627 | "source": [ 628 | "Now that we have everything (recipe + model specification) wrapped together nicely in a workflow, we are ready to train a model. Workflows have a `fit()` method that can be used to train a model.\n", 629 | "\n" 630 | ] 631 | }, 632 | { 633 | "cell_type": "code", 634 | "execution_count": null, 635 | "metadata": {}, 636 | "outputs": [], 637 | "source": [ 638 | "# For reproducibility\n", 639 | "set.seed(2056)\n", 640 | "\n", 641 | "# Train a random forest model\n", 642 | "rf_workflow_fit <- rf_workflow %>%\n", 643 | " fit(data = estate_train)\n", 644 | "\n", 645 | "# Print out the fitted workflow\n", 646 | "rf_workflow_fit\n" 647 | ] 648 | }, 649 | { 650 | "cell_type": "markdown", 651 | "metadata": {}, 652 | "source": [ 653 | "Excellent! So we now have a trained random forest model; but is it any good? Let's evaluate its performance! We'll do this by making predictions on the `test data` and then evaluate some performance metrics based on the actual outcomes.\n", 654 | "\n", 655 | "**Question 8.**\n", 656 | "\n", 657 | "- We'll evaluate the model performance based on the *rmse* and *rsq* metrics. Use the `metric_set()` function to combine these metric functions together into a new function, `eval_metrics`, that calculates all of them at once.\n", 658 | "\n", 659 | "- Generate predictions for the test data and then bind them to the test set. Rename the column containing predictions from `.pred` to `predictions`.\n", 660 | "\n", 661 | "Fill in the placeholder `....` with the right code." 662 | ] 663 | }, 664 | { 665 | "cell_type": "code", 666 | "execution_count": null, 667 | "metadata": {}, 668 | "outputs": [], 669 | "source": [ 670 | "# Create a metric set\n", 671 | "eval_metrics <- ....(rmse, ....)\n", 672 | "\n", 673 | "\n", 674 | "# Make and bind predictions to test data\n", 675 | "results <- rf_workflow_fit %>%\n", 676 | " ....\n" 677 | ] 678 | }, 679 | { 680 | "cell_type": "markdown", 681 | "metadata": {}, 682 | "source": [ 683 | "Test your answer:\n" 684 | ] 685 | }, 686 | { 687 | "cell_type": "code", 688 | "execution_count": null, 689 | "metadata": {}, 690 | "outputs": [], 691 | "source": [ 692 | ". <- ottr::check(\"tests/Question 8.R\")\n" 693 | ] 694 | }, 695 | { 696 | "cell_type": "markdown", 697 | "metadata": {}, 698 | "source": [ 699 | "Awesome work! You have just used your trained model to make predictions on the test set.\n", 700 | "\n", 701 | "How well did the model predict the prices per unit? Let's find out by looking at the metrics.\n" 702 | ] 703 | }, 704 | { 705 | "cell_type": "code", 706 | "execution_count": null, 707 | "metadata": {}, 708 | "outputs": [], 709 | "source": [ 710 | "# Evaluate the model\n", 711 | "rf_metrics <- eval_metrics(data = results,\n", 712 | " truth = price_per_unit,\n", 713 | " estimate = predictions)\n", 714 | "\n", 715 | "\n", 716 | "# Plot predicted vs actual\n", 717 | "rf_plt <- results %>%\n", 718 | " ggplot(mapping = aes(x = price_per_unit, y = predictions)) +\n", 719 | " geom_point(color = \"darkorchid\", size = 1.6) +\n", 720 | " # overlay regression line\n", 721 | " geom_smooth(method = \"lm\", color = \"black\", se = F) +\n", 722 | " ggtitle(\"Price per unit predictions\") +\n", 723 | " xlab(\"Actual Labels\") +\n", 724 | " ylab(\"Predicted Labels\") +\n", 725 | " theme(plot.title = element_text(hjust = 0.5))\n", 726 | "\n", 727 | "# Return evaluations\n", 728 | "list(metrics = rf_metrics, evaluation_plot = rf_plt)\n" 729 | ] 730 | }, 731 | { 732 | "cell_type": "markdown", 733 | "metadata": {}, 734 | "source": [ 735 | "How do you think the model performed? What do the values for `rsq` and `rmse` tell you? Please refer to the corresponding module for this notebook if you may need help answering these questions.\n", 736 | "\n", 737 | "## Use the Trained Model\n", 738 | "\n", 739 | "Save your trained model, and then use it to predict the price-per-unit for the following real estate transactions:\n", 740 | "\n", 741 | "| **transaction_date** | **house_age** | **transit_distance** | **local_convenience_stores** | **latitude** | **longitude** |\n", 742 | "|----------------------|---------------|----------------------|------------------------------|--------------|---------------|\n", 743 | "| 2013.167 | 16.2 | 289.3248 | 5 | 24.98203 | 121.54348 |\n", 744 | "| 2013.000 | 13.6 | 4082.015 | 0 | 24.94155 | 121.50381 |\n", 745 | "\n" 746 | ] 747 | }, 748 | { 749 | "cell_type": "code", 750 | "execution_count": null, 751 | "metadata": {}, 752 | "outputs": [], 753 | "source": [ 754 | "library(here)\n", 755 | "# Save trained workflow\n", 756 | "saveRDS(rf_workflow_fit, \"rf_price_model.rds\")\n" 757 | ] 758 | }, 759 | { 760 | "cell_type": "markdown", 761 | "metadata": {}, 762 | "source": [ 763 | "In this way, we can load it whenever we need it, and use it to predict labels for new data. This is often called *scoring* or *inferencing*.\n", 764 | "\n" 765 | ] 766 | }, 767 | { 768 | "cell_type": "code", 769 | "execution_count": null, 770 | "metadata": {}, 771 | "outputs": [], 772 | "source": [ 773 | "# Create a tibble for the new real estate samples\n", 774 | "new_data <- tibble(\n", 775 | " transaction_date = c(2013.167, 2013.000),\n", 776 | " house_age = c(16.2, 13.6),\n", 777 | " transit_distance = c(289.3248, 4082.015),\n", 778 | " local_convenience_stores = c(5, 0),\n", 779 | " latitude = c(24.98203, 24.94155),\n", 780 | " longitude = c(121.54348, 121.50381))\n", 781 | "\n", 782 | "# Print out new data\n", 783 | "new_data\n" 784 | ] 785 | }, 786 | { 787 | "cell_type": "markdown", 788 | "metadata": {}, 789 | "source": [ 790 | "Now that we have our data, let's load the saved model and make predictions.\n", 791 | "\n" 792 | ] 793 | }, 794 | { 795 | "cell_type": "code", 796 | "execution_count": null, 797 | "metadata": {}, 798 | "outputs": [], 799 | "source": [ 800 | "# Load the model into the current R session\n", 801 | "loaded_model <- readRDS(\"rf_price_model.rds\")\n", 802 | "\n", 803 | "# Make predictions\n", 804 | "predictions <- loaded_model %>%\n", 805 | " augment(new_data = new_data)\n", 806 | "\n", 807 | "predictions\n" 808 | ] 809 | }, 810 | { 811 | "cell_type": "markdown", 812 | "metadata": {}, 813 | "source": [ 814 | "Congratulations for completing this challenge! In this notebook, you:\n", 815 | "\n", 816 | "- Explored the data set to understand the relationships between the predictors and outcomes\n", 817 | "- Preprocessed the data using recipes to make them easier for a model to use effectively.\n", 818 | "- Made a random forest model specification.\n", 819 | "- Bundles a recipe and model specification into a workflow.\n", 820 | "- Trained a model.\n", 821 | "- Made predictions on test set and evaluated the model performance.\n", 822 | "- Saved the model, loaded it and then used it to predict labels for new data.\n", 823 | "\n", 824 | "Fantastic job for coming this far 👏! Feeling adventurous? Then, be sure to try out other regression models and tune some hyperparameters while at it.\n", 825 | "\n", 826 | "See you in our next module as we explore the realm of *classification* models!\n", 827 | "\n", 828 | "Happy Learning,\n", 829 | "\n", 830 | "[Eric](https://twitter.com/ericntay), Gold Microsoft Learn Student Ambassador.\n" 831 | ] 832 | } 833 | ], 834 | "metadata": { 835 | "anaconda-cloud": "", 836 | "kernelspec": { 837 | "display_name": "R", 838 | "language": "R", 839 | "name": "ir" 840 | }, 841 | "language_info": { 842 | "codemirror_mode": "r", 843 | "file_extension": ".r", 844 | "mimetype": "text/x-r-source", 845 | "name": "R", 846 | "pygments_lexer": "r", 847 | "version": "3.4.1" 848 | } 849 | }, 850 | "nbformat": 4, 851 | "nbformat_minor": 1 852 | } 853 | -------------------------------------------------------------------------------- /intro-regression-R-tidymodels/solution/all-systems-check/keybindings.json: -------------------------------------------------------------------------------- 1 | // Place your key bindings in this file to override the defaults 2 | // Keyboard shortcuts for common R operators 3 | [ 4 | // Add assignment operator 5 | { 6 | "key": "Alt+-", 7 | "command": "type", 8 | "args": { "text": " <- " } 9 | //"when": "editorTextFocus && editorLangId == r" 10 | }, 11 | 12 | // Add pipe 13 | 14 | { 15 | "key": "Ctrl+Shift+m", 16 | "command": "type", 17 | "args": { "text": " %>% " } 18 | //"when": "editorTextFocus && editorLangId == r" 19 | } 20 | -------------------------------------------------------------------------------- /intro-regression-R-tidymodels/solution/all-systems-check/test.R: -------------------------------------------------------------------------------- 1 | #------------------------------------------------------------------------------------------------------------- 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License. See https://go.microsoft.com/fwlink/?linkid=2090316 for license information. 4 | #------------------------------------------------------------------------------------------------------------- 5 | 6 | say_hello <- function(name) { 7 | message(paste0("Hello, ", name, "!")) 8 | } 9 | 10 | say_hello("remote world") 11 | -------------------------------------------------------------------------------- /intro-regression-R-tidymodels/solution/all-systems-check/test.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R Notebook" 3 | output: html_notebook 4 | --- 5 | 6 | This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. 7 | 8 | Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Ctrl+Shift+Enter*. 9 | 10 | ```{r} 11 | plot(cars) 12 | ``` 13 | 14 | Try a function too? 15 | ```{r} 16 | # Function that returns an awesome message 17 | say_hello <- function(name) { 18 | message(paste0("Hello, ", name, ":) In this module, we learn how to Explore 19 | and Analyze Data with R.")) 20 | } 21 | 22 | say_hello("remote world") 23 | 24 | ``` 25 | 26 | -------------------------------------------------------------------------------- /intro-regression-R-tidymodels/solution/all-systems-check/test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "[1] \"Hello from R and Python\"\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "# Both Python and R have the print() function.\n", 18 | "# This hsould work with both kernels\n", 19 | "print(\"Hello from R and Python\")" 20 | ] 21 | } 22 | ], 23 | "metadata": { 24 | "interpreter": { 25 | "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" 26 | }, 27 | "kernelspec": { 28 | "display_name": "R", 29 | "language": "R", 30 | "name": "ir" 31 | }, 32 | "language_info": { 33 | "codemirror_mode": "r", 34 | "file_extension": ".r", 35 | "mimetype": "text/x-r-source", 36 | "name": "R", 37 | "pygments_lexer": "r", 38 | "version": "4.1.2" 39 | }, 40 | "orig_nbformat": 4 41 | }, 42 | "nbformat": 4, 43 | "nbformat_minor": 2 44 | } 45 | -------------------------------------------------------------------------------- /intro-regression-R-tidymodels/solution/tests/Question 1.R: -------------------------------------------------------------------------------- 1 | test = list( 2 | name = "Question 1", 3 | cases = list( 4 | ottr::TestCase$new( 5 | hidden = FALSE, 6 | name = NA, 7 | points = 0.5, 8 | success_message = "Great start! Your tibble dimensions are correct.", 9 | failure_message = "Almost there! Ensure you have filtered correctly to obtain a subset whose observations of `price_per_unit` is less than `70`. Expected dimensions [408 7]", 10 | code = { 11 | suppressPackageStartupMessages({ 12 | library(testthat) 13 | library(ottr) 14 | }) 15 | 16 | ## Test ## 17 | test_that('data dimensions correct', { 18 | expect_equal(dim(estate_data), c(408, 7)) 19 | 20 | }) 21 | } 22 | ), 23 | ottr::TestCase$new( 24 | hidden = FALSE, 25 | name = NA, 26 | points = 0.5, 27 | success_message = "Excellent. You have successfully created a subset whose observations of price_per_unit is less than 70.", 28 | failure_message = "Let's give this another try. Ensure your subset contains observations where **price_per_unit** is less than 70.", 29 | code = { 30 | 31 | 32 | ## Test ## 33 | test_that('the range of values for price per unit is within 7.6 and 69.7', { 34 | expect_equal(range(estate_data$price_per_unit), c(7.6, 69.7)) 35 | }) 36 | } 37 | ) 38 | ) 39 | ) -------------------------------------------------------------------------------- /intro-regression-R-tidymodels/solution/tests/Question 2.R: -------------------------------------------------------------------------------- 1 | test = list( 2 | name = "Question 2", 3 | cases = list( 4 | ottr::TestCase$new( 5 | hidden = FALSE, 6 | name = NA, 7 | points = 0.5, 8 | success_message = "Great start! Your tibble dimensions and corresponding columns are correct.", 9 | failure_message = "Almost there! Let's give this another shot.", 10 | code = { 11 | ## Test ## 12 | test_that('data dimensions correct', { 13 | expect_equal(dim(numeric_features_long), c(2448, 4)) 14 | expect_equal(sort(colnames(numeric_features_long)), c("corr_coef", "features", "price_per_unit", "values")) 15 | 16 | }) 17 | } 18 | ), 19 | ottr::TestCase$new( 20 | hidden = FALSE, 21 | name = NA, 22 | points = 0.5, 23 | success_message = "Excellent! You have successfully pivoted the tibble and found the correlation between the existing numeric column values and the price per unit.", 24 | failure_message = "Let's give this another try. Ensure you have correctly pivoted the data to obtain two new columns **features** and **values** and then grouped by **features** and then added a new column **corr_coef** which is the correlation between **values** and **price_per_unit**. Lastly don't forget to ungroup :).", 25 | code = { 26 | ## Test ## 27 | test_that('the correlation coefficients are correct', { 28 | expect_equal(round(range(numeric_features_long$corr_coef), 7), c(-0.7087782, 0.6101017)) 29 | }) 30 | } 31 | ) 32 | ) 33 | ) -------------------------------------------------------------------------------- /intro-regression-R-tidymodels/solution/tests/Question 3.R: -------------------------------------------------------------------------------- 1 | test = list( 2 | name = "Question 3", 3 | cases = list( 4 | ottr::TestCase$new( 5 | hidden = FALSE, 6 | name = NA, 7 | points = 0.5, 8 | success_message = "success_message: Fantastic! Your tibble dimensions and corresponding columns are correct.", 9 | failure_message = "Almost there! Ensure you have selected columns transaction_date, local_convenience_stores and price_per_unit, and then pivoted the exsiting columns except price_per_unit to obtain two new columns **features** and **values**", 10 | code = { 11 | ## Test ## 12 | test_that('data dimensions correct', { 13 | expect_equal(dim(cat_features_long), c(816, 3)) 14 | expect_equal(sort(colnames(cat_features_long)), c("features", "price_per_unit", "values")) 15 | 16 | }) 17 | } 18 | ), 19 | ottr::TestCase$new( 20 | hidden = FALSE, 21 | name = NA, 22 | points = 0.5, 23 | success_message = "Congratulations! You have successfully selected the desired columns, encoded some of them as categorical and restructured the data to a longer format.", 24 | failure_message = "Almost there! Ensure you have selected columns transaction_date, local_convenience_stores and price_per_unit, and then encoded transaction_date & local_convenience_stores as categorical, and then pivoted the data correctly.", 25 | code = { 26 | ## Test ## 27 | test_that('data contains the correct observations', { 28 | expect_equal(sort(unique(cat_features_long$features)), c("local_convenience_stores", "transaction_date")) 29 | expect_equal(class(cat_features_long$values), "factor") 30 | 31 | }) 32 | } 33 | ) 34 | ) 35 | ) -------------------------------------------------------------------------------- /intro-regression-R-tidymodels/solution/tests/Question 4.R: -------------------------------------------------------------------------------- 1 | test = list( 2 | name = "Question 4", 3 | cases = list( 4 | ottr::TestCase$new( 5 | hidden = FALSE, 6 | name = NA, 7 | points = 0.5, 8 | success_message = "Fantastic! You have successfully split the data and extracted the training (70%) and testing sets (30%).", 9 | failure_message = "Almost there. Let's have a look at this again.Ensure that the splitting specification dictates that 70% of the data should go to training and the rest to testing.", 10 | code = { 11 | ## Test ## 12 | test_that('data dimensions correct', { 13 | expect_equal(dim(estate_train), c(285, 7)) 14 | expect_equal(dim(estate_test), c(123, 7)) 15 | 16 | }) 17 | } 18 | ) 19 | 20 | ) 21 | ) -------------------------------------------------------------------------------- /intro-regression-R-tidymodels/solution/tests/Question 5.R: -------------------------------------------------------------------------------- 1 | test = list( 2 | name = "Question 5", 3 | cases = list( 4 | ottr::TestCase$new( 5 | hidden = TRUE, 6 | name = NA, 7 | points = 1.0, 8 | success_message = "Good job. You have correctly specified a recipe that will remove the `transaction_date` feature, transform `local_convenience_stores` feature into categorical (factor) and then center and scale all numeric predictors.", 9 | failure_message = "Almost there. Ensure your recipe specification will remove the `transaction_date` feature, transform `local_convenience_stores` feature into categorical (factor) and then center and scale all numeric predictors.", 10 | code = { 11 | ## Test ## 12 | test_that('recipe specification is correct', { 13 | 14 | # Test for step_rm 15 | expect_equal(attr(estate_recipe[["steps"]][[1]], "class"), c("step_rm","step")) 16 | expect_equal(as_label(estate_recipe[["steps"]][[1]][["terms"]][[1]]), "transaction_date") 17 | 18 | # Test for step_mutate 19 | expect_equal(attr(estate_recipe[["steps"]][[2]], "class"), c("step_mutate","step")) 20 | expect_equal(as_label(estate_recipe[["steps"]][[2]][["inputs"]][["local_convenience_stores"]]), "factor(local_convenience_stores)") 21 | 22 | # Test for step_normalize 23 | expect_equal(attr(estate_recipe[["steps"]][[3]], "class"), c("step_normalize","step")) 24 | expect_equal(as_label(estate_recipe[["steps"]][[3]][["terms"]][[1]]), "all_numeric_predictors()") 25 | 26 | 27 | }) 28 | } 29 | ) 30 | ) 31 | ) -------------------------------------------------------------------------------- /intro-regression-R-tidymodels/solution/tests/Question 6.R: -------------------------------------------------------------------------------- 1 | test = list( 2 | name = "Question 6", 3 | cases = list( 4 | ottr::TestCase$new( 5 | hidden = FALSE, 6 | name = NA, 7 | points = 1.0, 8 | success_message = "Excellent! Your model specification is looking great!", 9 | failure_message = "Let's have a look at this again. Ensure you have set your engine to **randomForest** and the mode to **regression**.", 10 | code = { 11 | ## Test ## 12 | test_that('the model specification is correct', { 13 | expect_equal(rf_spec$mode, "regression") 14 | expect_equal(rf_spec$engine, "randomForest") 15 | 16 | 17 | }) 18 | } 19 | ) 20 | ) 21 | ) -------------------------------------------------------------------------------- /intro-regression-R-tidymodels/solution/tests/Question 7.R: -------------------------------------------------------------------------------- 1 | test = list( 2 | name = "Question 7", 3 | cases = list( 4 | ottr::TestCase$new( 5 | hidden = FALSE, 6 | name = NA, 7 | points = 1.0, 8 | success_message = "", 9 | failure_message = "", 10 | code = { 11 | ## Test ## 12 | test_that('workflow specification is correct', { 13 | 14 | # Test for step_rm 15 | expect_equal(attr(rf_workflow[["pre"]][["actions"]][["recipe"]][["recipe"]][["steps"]][[1]], "class"), c("step_rm","step")) 16 | expect_equal(as_label(rf_workflow[["pre"]][["actions"]][["recipe"]][["recipe"]][["steps"]][[1]][["terms"]][[1]]), "transaction_date") 17 | 18 | # Test for step_mutate 19 | expect_equal(attr(rf_workflow[["pre"]][["actions"]][["recipe"]][["recipe"]][["steps"]][[2]], "class"), c("step_mutate","step")) 20 | expect_equal(as_label(rf_workflow[["pre"]][["actions"]][["recipe"]][["recipe"]][["steps"]][[2]][["inputs"]][["local_convenience_stores"]]), "factor(local_convenience_stores)") 21 | 22 | # Test for step_normalize 23 | expect_equal(attr(rf_workflow[["pre"]][["actions"]][["recipe"]][["recipe"]][["steps"]][[3]], "class"), c("step_normalize","step")) 24 | expect_equal(as_label(rf_workflow[["pre"]][["actions"]][["recipe"]][["recipe"]][["steps"]][[3]][["terms"]][[1]]), "all_numeric_predictors()") 25 | 26 | 27 | 28 | }) 29 | } 30 | ) 31 | ) 32 | ) -------------------------------------------------------------------------------- /intro-regression-R-tidymodels/solution/tests/Question 8.R: -------------------------------------------------------------------------------- 1 | test = list( 2 | name = "Question 8", 3 | cases = list( 4 | ottr::TestCase$new( 5 | hidden = FALSE, 6 | name = NA, 7 | points = 1.0, 8 | success_message = "Fantastic! You have successfully used the trained model to make predictions for the test set and then binded the predictions to the test set.", 9 | failure_message = "Almost there! Generate predictions for the test data and then bind them to the test set. Hints: augment or predict + bind_cols functions. Also don't forget to rename your .pred column.", 10 | code = { 11 | ## Test ## 12 | test_that('the model specification is correct', { 13 | expect_equal(dim(results), c(123, 8)) 14 | expect_equal(sort(colnames(results)), c("house_age", "latitude", "local_convenience_stores", "longitude", "predictions", "price_per_unit", "transaction_date", "transit_distance")) 15 | 16 | 17 | }) 18 | } 19 | ) 20 | ) 21 | ) -------------------------------------------------------------------------------- /intro-regression-R-tidymodels/workshop-designer.md: -------------------------------------------------------------------------------- 1 | # Introduction to regression models by using R and Tidymodels 2 | 3 | ## Workshop Source 4 | [Introduction to regression models by using R and tidymodels](https://docs.microsoft.com/en-us/learn/modules/introduction-regression-models/?WT.mc_id=academic-59300-cacaste) 5 | 6 | ## Stage 1: Desired Results 7 | 8 | 1. Students will be skilled at: 9 | - Understanding what is regression and when to use a regression model 10 | - Training and evaluating regression models using the Tidymodels framework 11 | 1. Students will be able to independently use their learning to: 12 | - Experimenting with different regression models 13 | - Tuning models hyperparameters 14 | 15 | ## Stage 2: Evidence 16 | 17 | - Hands-on challenge on real estate dataset, following [Challenge-regression.ipynb](./solution/Challenge-regression.ipynb) notebook 18 | - Knowledge check quiz 19 | 20 | ## Stage 3: Learning Plan 21 | 22 | - Introduction 23 | - Train and evaluate a regression model 24 | - Experimenting with models 25 | - Challenge - Create a regression model using Tidymodels 26 | - Knowledge check -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | numpy 3 | flask 4 | jupyter 5 | datascience 6 | pybryt 7 | scipy 8 | folium 9 | matplotlib 10 | ipywidgets>=7.0.0 11 | bqplot 12 | nbinteract==0.0.12 13 | otter-grader 14 | okpy 15 | scikit-learn --------------------------------------------------------------------------------