├── .devcontainer
    ├── Dockerfile
    ├── devcontainer.json
    └── library-scripts
    │   └── common-debian.sh
├── .github
    └── workflows
    │   ├── metrics.yml
    │   └── metrics_keepalive.yml
├── .gitignore
├── .stats_timestamp
├── EasyRTutorialsUseR2022.pdf
├── LICENSE
├── README.md
├── intro-regression-R-tidymodels
    ├── README.md
    ├── images
    │   └── promo.png
    ├── slides.pptx
    ├── solution
    │   ├── Challenge-regression.ipynb
    │   ├── all-systems-check
    │   │   ├── keybindings.json
    │   │   ├── test.R
    │   │   ├── test.Rmd
    │   │   └── test.ipynb
    │   └── tests
    │   │   ├── Question 1.R
    │   │   ├── Question 2.R
    │   │   ├── Question 3.R
    │   │   ├── Question 4.R
    │   │   ├── Question 5.R
    │   │   ├── Question 6.R
    │   │   ├── Question 7.R
    │   │   └── Question 8.R
    └── workshop-designer.md
└── requirements.txt


/.devcontainer/Dockerfile:
--------------------------------------------------------------------------------
  1 | # R version: 4, 4.1, 4.0
  2 | ARG VARIANT="4"
  3 | FROM rocker/r-ver:${VARIANT}
  4 | 
  5 | # Use the [Option] comment to specify true/false arguments that should appear in VS Code UX
  6 | #
  7 | # [Option] Install zsh
  8 | ARG INSTALL_ZSH="true"
  9 | # [Option] Upgrade OS packages to their latest versions
 10 | ARG UPGRADE_PACKAGES="false"
 11 | 
 12 | # Install needed packages and setup non-root user. Use a separate RUN statement to add your own dependencies.
 13 | ARG USERNAME=vscode
 14 | ARG USER_UID=1000
 15 | ARG USER_GID=$USER_UID
 16 | COPY library-scripts/*.sh /tmp/library-scripts/
 17 | RUN apt-get update && export DEBIAN_FRONTEND=noninteractive \
 18 |     && /bin/bash /tmp/library-scripts/common-debian.sh "${INSTALL_ZSH}" "${USERNAME}" "${USER_UID}" "${USER_GID}" "${UPGRADE_PACKAGES}" "true" "true" \
 19 |     && usermod -a -G staff ${USERNAME} \
 20 |     && apt-get -y install \
 21 |         python3-pip \
 22 |         libgit2-dev \
 23 |         libcurl4-openssl-dev \
 24 |         libssl-dev \
 25 |         libxml2-dev \
 26 |         libxt-dev \
 27 |     && apt-get autoremove -y && apt-get clean -y && rm -rf /var/lib/apt/lists/* /tmp/library-scripts \
 28 |     && python3 -m pip --no-cache-dir install radian \
 29 |     && pip --disable-pip-version-check --no-cache-dir install pybryt \
 30 |     && pip --disable-pip-version-check --no-cache-dir install pylint \
 31 |     && pip --disable-pip-version-check --no-cache-dir install jupyter \
 32 |     && pip --disable-pip-version-check --no-cache-dir install datascience \
 33 |     && pip --disable-pip-version-check --no-cache-dir install otter-grader \
 34 |     && pip --disable-pip-version-check --no-cache-dir install numpy \
 35 |     && pip --disable-pip-version-check --no-cache-dir install pandas \
 36 |     && pip --disable-pip-version-check --no-cache-dir install scipy \
 37 |     && pip --disable-pip-version-check --no-cache-dir install folium>=0.9.1 \
 38 |     && pip --disable-pip-version-check --no-cache-dir install matplotlib \
 39 |     && pip --disable-pip-version-check --no-cache-dir install ipywidgets>=7.0.0 \
 40 |     && pip --disable-pip-version-check --no-cache-dir install bqplot \
 41 |     && pip --disable-pip-version-check --no-cache-dir install nbinteract>=0.0.12 \
 42 |     && pip --disable-pip-version-check --no-cache-dir install otter-grader \
 43 |     && pip --disable-pip-version-check --no-cache-dir install okpy \
 44 |     && pip --disable-pip-version-check --no-cache-dir install scikit-learn \
 45 |     && install2.r --error --skipinstalled --ncpus -1 \
 46 |         devtools \
 47 |         languageserver \
 48 |         httpgd \
 49 |         tidyverse \
 50 |         tidymodels \
 51 |         statip \
 52 |         patchwork \
 53 |         paletteer \
 54 |         glmnet \
 55 |         randomForest \
 56 |         xgboost \
 57 |         here \
 58 |         doParallel \
 59 |         janitor \
 60 |         vip \
 61 |         ranger \
 62 |         palmerpenguins \
 63 |         skimr \
 64 |         nnet \
 65 |         kernlab \
 66 |         plotly \
 67 |         factoextra \
 68 |         cluster \
 69 |         ottr \
 70 |     && rm -rf /tmp/downloaded_packages
 71 | 
 72 | # Install summarytools and load some R package off the bat
 73 | RUN R -e "devtools::install_github('https://github.com/dcomtois/summarytools/tree/0-8-9')"
 74 | RUN R -e "library(ottr)"
 75 | RUN R -e "library(here)"
 76 | RUN R -e "library(languageserver)"
 77 | # RUN installGithub.r ucbds-infra/ottr@stable
 78 | 
 79 | 
 80 | # VSCode R Debugger dependency. Install the latest release version from GitHub without using GitHub API.
 81 | # See https://github.com/microsoft/vscode-dev-containers/issues/1032
 82 | RUN export TAG=$(git ls-remote --tags --refs --sort='version:refname' https://github.com/ManuelHentschel/vscDebugger v\* | tail -n 1 | cut --delimiter='/' --fields=3) \
 83 |     && Rscript -e "remotes::install_git('https://github.com/ManuelHentschel/vscDebugger.git', ref = '"${TAG}"', dependencies = FALSE)"
 84 | 
 85 | # R Session watcher settings.
 86 | # See more details: https://github.com/REditorSupport/vscode-R/wiki/R-Session-watcher
 87 | RUN echo 'source(file.path(Sys.getenv("HOME"), ".vscode-R", "init.R"))' >> ${R_HOME}/etc/Rprofile.site
 88 | 
 89 | # [Optional] Uncomment this section to install additional OS packages.
 90 | # RUN apt-get update \
 91 | #     && export DEBIAN_FRONTEND=noninteractive \
 92 | #     && apt-get -y install --no-install-recommends <your-package-list-here>
 93 | 
 94 | # [Optional] Uncomment this section to install additional R packages.
 95 | # RUN install2.r --error --skipinstalled --ncpus -1 <your-package-list-here>
 96 | 
 97 | 
 98 | # [Optional] Uncomment this section to install vscode-jupyter dependencies.
 99 | RUN apt-get update \
100 |     && export DEBIAN_FRONTEND=noninteractive \
101 |     && apt-get -y install --no-install-recommends libzmq3-dev \
102 |     && install2.r --error --skipinstalled --ncpus -1 IRkernel \
103 |     && python3 -m pip --no-cache-dir install jupyter \
104 |     && R --vanilla -s -e 'IRkernel::installspec(user = FALSE)'
105 | 


--------------------------------------------------------------------------------
/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
 1 | {
 2 | 	"name": "R Data Science Environment",
 3 | 	"build": {
 4 | 		"dockerfile": "Dockerfile",
 5 | 		// Update VARIANT to pick a specific R version: 4, 4.1, 4.0
 6 | 		"args": { "VARIANT": "4" }
 7 | 	},
 8 | 
 9 | 	// Set *default* container specific settings.json values on container create.
10 | 	"settings": {
11 | 		"r.rterm.linux": "/usr/local/bin/radian",
12 | 		"r.bracketedPaste": true,
13 | 		"r.plot.useHttpgd": true,
14 | 		"[r]": {
15 | 			"editor.wordSeparators": "`~!@#%$^&*()-=+[{]}\\|;:'\",<>/?"
16 | 		}
17 | 	},
18 | 
19 | 	// Add the IDs of extensions you want installed when the container is created.
20 | 	"extensions": [
21 | 		// Add Jupyter, R and Python vscode extensions
22 | 		"REditorSupport.r",
23 | 		"rdebugger.r-debugger",
24 | 		"ms-toolsai.jupyter",
25 | 		"ms-toolsai.jupyter-renderers",
26 | 		"ms-python.python",
27 | 		"ms-python.vscode-pylance",
28 | 		"vsls-contrib.codetour",
29 | 		"GitHub.copilot"
30 | 	],
31 | 
32 | 	// Use 'forwardPorts' to make a list of ports inside the container available locally.
33 | 	// "forwardPorts": [],
34 | 
35 | 	// Use 'postCreateCommand' to run commands after the container is created.
36 | 	"postCreateCommand": "pip3 install -r requirements.txt",
37 | 
38 | 	// Comment out to connect as root instead. More info: https://aka.ms/vscode-remote/containers/non-root.
39 | 	"remoteUser": "vscode"
40 | }
41 | 


--------------------------------------------------------------------------------
/.devcontainer/library-scripts/common-debian.sh:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env bash
  2 | #-------------------------------------------------------------------------------------------------------------
  3 | # Copyright (c) Microsoft Corporation. All rights reserved.
  4 | # Licensed under the MIT License. See https://go.microsoft.com/fwlink/?linkid=2090316 for license information.
  5 | #-------------------------------------------------------------------------------------------------------------
  6 | #
  7 | # Docs: https://github.com/microsoft/vscode-dev-containers/blob/main/script-library/docs/common.md
  8 | # Maintainer: The VS Code and Codespaces Teams
  9 | #
 10 | # Syntax: ./common-debian.sh [install zsh flag] [username] [user UID] [user GID] [upgrade packages flag] [install Oh My Zsh! flag] [Add non-free packages]
 11 | 
 12 | set -e
 13 | 
 14 | INSTALL_ZSH=${1:-"true"}
 15 | USERNAME=${2:-"automatic"}
 16 | USER_UID=${3:-"automatic"}
 17 | USER_GID=${4:-"automatic"}
 18 | UPGRADE_PACKAGES=${5:-"true"}
 19 | INSTALL_OH_MYS=${6:-"true"}
 20 | ADD_NON_FREE_PACKAGES=${7:-"false"}
 21 | SCRIPT_DIR="$(cd $(dirname "${BASH_SOURCE[0]}") && pwd)"
 22 | MARKER_FILE="/usr/local/etc/vscode-dev-containers/common"
 23 | 
 24 | if [ "$(id -u)" -ne 0 ]; then
 25 |     echo -e 'Script must be run as root. Use sudo, su, or add "USER root" to your Dockerfile before running this script.'
 26 |     exit 1
 27 | fi
 28 | 
 29 | # Ensure that login shells get the correct path if the user updated the PATH using ENV.
 30 | rm -f /etc/profile.d/00-restore-env.sh
 31 | echo "export PATH=${PATH//$(sh -lc 'echo $PATH')/\$PATH}" > /etc/profile.d/00-restore-env.sh
 32 | chmod +x /etc/profile.d/00-restore-env.sh
 33 | 
 34 | # If in automatic mode, determine if a user already exists, if not use vscode
 35 | if [ "${USERNAME}" = "auto" ] || [ "${USERNAME}" = "automatic" ]; then
 36 |     USERNAME=""
 37 |     POSSIBLE_USERS=("vscode" "node" "codespace" "$(awk -v val=1000 -F ":" '$3==val{print $1}' /etc/passwd)")
 38 |     for CURRENT_USER in ${POSSIBLE_USERS[@]}; do
 39 |         if id -u ${CURRENT_USER} > /dev/null 2>&1; then
 40 |             USERNAME=${CURRENT_USER}
 41 |             break
 42 |         fi
 43 |     done
 44 |     if [ "${USERNAME}" = "" ]; then
 45 |         USERNAME=vscode
 46 |     fi
 47 | elif [ "${USERNAME}" = "none" ]; then
 48 |     USERNAME=root
 49 |     USER_UID=0
 50 |     USER_GID=0
 51 | fi
 52 | 
 53 | # Load markers to see which steps have already run
 54 | if [ -f "${MARKER_FILE}" ]; then
 55 |     echo "Marker file found:"
 56 |     cat "${MARKER_FILE}"
 57 |     source "${MARKER_FILE}"
 58 | fi
 59 | 
 60 | # Ensure apt is in non-interactive to avoid prompts
 61 | export DEBIAN_FRONTEND=noninteractive
 62 | 
 63 | # Function to call apt-get if needed
 64 | apt_get_update_if_needed()
 65 | {
 66 |     if [ ! -d "/var/lib/apt/lists" ] || [ "$(ls /var/lib/apt/lists/ | wc -l)" = "0" ]; then
 67 |         echo "Running apt-get update..."
 68 |         apt-get update
 69 |     else
 70 |         echo "Skipping apt-get update."
 71 |     fi
 72 | }
 73 | 
 74 | # Run install apt-utils to avoid debconf warning then verify presence of other common developer tools and dependencies
 75 | if [ "${PACKAGES_ALREADY_INSTALLED}" != "true" ]; then
 76 | 
 77 |     package_list="apt-utils \
 78 |         openssh-client \
 79 |         gnupg2 \
 80 |         dirmngr \
 81 |         iproute2 \
 82 |         procps \
 83 |         lsof \
 84 |         htop \
 85 |         net-tools \
 86 |         psmisc \
 87 |         curl \
 88 |         wget \
 89 |         rsync \
 90 |         ca-certificates \
 91 |         unzip \
 92 |         zip \
 93 |         nano \
 94 |         vim-tiny \
 95 |         less \
 96 |         jq \
 97 |         lsb-release \
 98 |         apt-transport-https \
 99 |         dialog \
100 |         libc6 \
101 |         libgcc1 \
102 |         libkrb5-3 \
103 |         libgssapi-krb5-2 \
104 |         libicu[0-9][0-9] \
105 |         liblttng-ust0 \
106 |         libstdc++6 \
107 |         zlib1g \
108 |         locales \
109 |         sudo \
110 |         ncdu \
111 |         man-db \
112 |         strace \
113 |         manpages \
114 |         manpages-dev \
115 |         init-system-helpers"
116 |         
117 |     # Needed for adding manpages-posix and manpages-posix-dev which are non-free packages in Debian
118 |     if [ "${ADD_NON_FREE_PACKAGES}" = "true" ]; then
119 |         # Bring in variables from /etc/os-release like VERSION_CODENAME
120 |         . /etc/os-release
121 |         sed -i -E "s/deb http:\/\/(deb|httpredir)\.debian\.org\/debian ${VERSION_CODENAME} main/deb http:\/\/\1\.debian\.org\/debian ${VERSION_CODENAME} main contrib non-free/" /etc/apt/sources.list
122 |         sed -i -E "s/deb-src http:\/\/(deb|httredir)\.debian\.org\/debian ${VERSION_CODENAME} main/deb http:\/\/\1\.debian\.org\/debian ${VERSION_CODENAME} main contrib non-free/" /etc/apt/sources.list
123 |         sed -i -E "s/deb http:\/\/(deb|httpredir)\.debian\.org\/debian ${VERSION_CODENAME}-updates main/deb http:\/\/\1\.debian\.org\/debian ${VERSION_CODENAME}-updates main contrib non-free/" /etc/apt/sources.list
124 |         sed -i -E "s/deb-src http:\/\/(deb|httpredir)\.debian\.org\/debian ${VERSION_CODENAME}-updates main/deb http:\/\/\1\.debian\.org\/debian ${VERSION_CODENAME}-updates main contrib non-free/" /etc/apt/sources.list
125 |         sed -i "s/deb http:\/\/security\.debian\.org\/debian-security ${VERSION_CODENAME}\/updates main/deb http:\/\/security\.debian\.org\/debian-security ${VERSION_CODENAME}\/updates main contrib non-free/" /etc/apt/sources.list
126 |         sed -i "s/deb-src http:\/\/security\.debian\.org\/debian-security ${VERSION_CODENAME}\/updates main/deb http:\/\/security\.debian\.org\/debian-security ${VERSION_CODENAME}\/updates main contrib non-free/" /etc/apt/sources.list
127 |         sed -i "s/deb http:\/\/deb\.debian\.org\/debian ${VERSION_CODENAME}-backports main/deb http:\/\/deb\.debian\.org\/debian ${VERSION_CODENAME}-backports main contrib non-free/" /etc/apt/sources.list 
128 |         sed -i "s/deb-src http:\/\/deb\.debian\.org\/debian ${VERSION_CODENAME}-backports main/deb http:\/\/deb\.debian\.org\/debian ${VERSION_CODENAME}-backports main contrib non-free/" /etc/apt/sources.list
129 |         # Handle bullseye location for security https://www.debian.org/releases/bullseye/amd64/release-notes/ch-information.en.html
130 |         sed -i "s/deb http:\/\/security\.debian\.org\/debian-security ${VERSION_CODENAME}-security main/deb http:\/\/security\.debian\.org\/debian-security ${VERSION_CODENAME}-security main contrib non-free/" /etc/apt/sources.list
131 |         sed -i "s/deb-src http:\/\/security\.debian\.org\/debian-security ${VERSION_CODENAME}-security main/deb http:\/\/security\.debian\.org\/debian-security ${VERSION_CODENAME}-security main contrib non-free/" /etc/apt/sources.list
132 |         echo "Running apt-get update..."
133 |         apt-get update
134 |         package_list="${package_list} manpages-posix manpages-posix-dev"
135 |     else
136 |         apt_get_update_if_needed
137 |     fi
138 | 
139 |     # Install libssl1.1 if available
140 |     if [[ ! -z $(apt-cache --names-only search ^libssl1.1$) ]]; then
141 |         package_list="${package_list}       libssl1.1"
142 |     fi
143 |     
144 |     # Install appropriate version of libssl1.0.x if available
145 |     libssl_package=$(dpkg-query -f '${db:Status-Abbrev}\t${binary:Package}\n' -W 'libssl1\.0\.?' 2>&1 || echo '')
146 |     if [ "$(echo "$LIlibssl_packageBSSL" | grep -o 'libssl1\.0\.[0-9]:' | uniq | sort | wc -l)" -eq 0 ]; then
147 |         if [[ ! -z $(apt-cache --names-only search ^libssl1.0.2$) ]]; then
148 |             # Debian 9
149 |             package_list="${package_list}       libssl1.0.2"
150 |         elif [[ ! -z $(apt-cache --names-only search ^libssl1.0.0$) ]]; then
151 |             # Ubuntu 18.04, 16.04, earlier
152 |             package_list="${package_list}       libssl1.0.0"
153 |         fi
154 |     fi
155 | 
156 |     echo "Packages to verify are installed: ${package_list}"
157 |     apt-get -y install --no-install-recommends ${package_list} 2> >( grep -v 'debconf: delaying package configuration, since apt-utils is not installed' >&2 )
158 |         
159 |     # Install git if not already installed (may be more recent than distro version)
160 |     if ! type git > /dev/null 2>&1; then
161 |         apt-get -y install --no-install-recommends git
162 |     fi
163 | 
164 |     PACKAGES_ALREADY_INSTALLED="true"
165 | fi
166 | 
167 | # Get to latest versions of all packages
168 | if [ "${UPGRADE_PACKAGES}" = "true" ]; then
169 |     apt_get_update_if_needed
170 |     apt-get -y upgrade --no-install-recommends
171 |     apt-get autoremove -y
172 | fi
173 | 
174 | # Ensure at least the en_US.UTF-8 UTF-8 locale is available.
175 | # Common need for both applications and things like the agnoster ZSH theme.
176 | if [ "${LOCALE_ALREADY_SET}" != "true" ] && ! grep -o -E '^\s*en_US.UTF-8\s+UTF-8' /etc/locale.gen > /dev/null; then
177 |     echo "en_US.UTF-8 UTF-8" >> /etc/locale.gen 
178 |     locale-gen
179 |     LOCALE_ALREADY_SET="true"
180 | fi
181 | 
182 | # Create or update a non-root user to match UID/GID.
183 | group_name="${USERNAME}"
184 | if id -u ${USERNAME} > /dev/null 2>&1; then
185 |     # User exists, update if needed
186 |     if [ "${USER_GID}" != "automatic" ] && [ "$USER_GID" != "$(id -g $USERNAME)" ]; then 
187 |         group_name="$(id -gn $USERNAME)"
188 |         groupmod --gid $USER_GID ${group_name}
189 |         usermod --gid $USER_GID $USERNAME
190 |     fi
191 |     if [ "${USER_UID}" != "automatic" ] && [ "$USER_UID" != "$(id -u $USERNAME)" ]; then 
192 |         usermod --uid $USER_UID $USERNAME
193 |     fi
194 | else
195 |     # Create user
196 |     if [ "${USER_GID}" = "automatic" ]; then
197 |         groupadd $USERNAME
198 |     else
199 |         groupadd --gid $USER_GID $USERNAME
200 |     fi
201 |     if [ "${USER_UID}" = "automatic" ]; then 
202 |         useradd -s /bin/bash --gid $USERNAME -m $USERNAME
203 |     else
204 |         useradd -s /bin/bash --uid $USER_UID --gid $USERNAME -m $USERNAME
205 |     fi
206 | fi
207 | 
208 | # Add add sudo support for non-root user
209 | if [ "${USERNAME}" != "root" ] && [ "${EXISTING_NON_ROOT_USER}" != "${USERNAME}" ]; then
210 |     echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME
211 |     chmod 0440 /etc/sudoers.d/$USERNAME
212 |     EXISTING_NON_ROOT_USER="${USERNAME}"
213 | fi
214 | 
215 | # ** Shell customization section **
216 | if [ "${USERNAME}" = "root" ]; then 
217 |     user_rc_path="/root"
218 | else
219 |     user_rc_path="/home/${USERNAME}"
220 | fi
221 | 
222 | # Restore user .bashrc defaults from skeleton file if it doesn't exist or is empty
223 | if [ ! -f "${user_rc_path}/.bashrc" ] || [ ! -s "${user_rc_path}/.bashrc" ] ; then
224 |     cp  /etc/skel/.bashrc "${user_rc_path}/.bashrc"
225 | fi
226 | 
227 | # Restore user .profile defaults from skeleton file if it doesn't exist or is empty
228 | if  [ ! -f "${user_rc_path}/.profile" ] || [ ! -s "${user_rc_path}/.profile" ] ; then
229 |     cp  /etc/skel/.profile "${user_rc_path}/.profile"
230 | fi
231 | 
232 | # .bashrc/.zshrc snippet
233 | rc_snippet="$(cat << 'EOF'
234 | 
235 | if [ -z "${USER}" ]; then export USER=$(whoami); fi
236 | if [[ "${PATH}" != *"$HOME/.local/bin"* ]]; then export PATH="${PATH}:$HOME/.local/bin"; fi
237 | 
238 | # Display optional first run image specific notice if configured and terminal is interactive
239 | if [ -t 1 ] && [[ "${TERM_PROGRAM}" = "vscode" || "${TERM_PROGRAM}" = "codespaces" ]] && [ ! -f "$HOME/.config/vscode-dev-containers/first-run-notice-already-displayed" ]; then
240 |     if [ -f "/usr/local/etc/vscode-dev-containers/first-run-notice.txt" ]; then
241 |         cat "/usr/local/etc/vscode-dev-containers/first-run-notice.txt"
242 |     elif [ -f "/workspaces/.codespaces/shared/first-run-notice.txt" ]; then
243 |         cat "/workspaces/.codespaces/shared/first-run-notice.txt"
244 |     fi
245 |     mkdir -p "$HOME/.config/vscode-dev-containers"
246 |     # Mark first run notice as displayed after 10s to avoid problems with fast terminal refreshes hiding it
247 |     ((sleep 10s; touch "$HOME/.config/vscode-dev-containers/first-run-notice-already-displayed") &)
248 | fi
249 | 
250 | # Set the default git editor if not already set
251 | if [ -z "$(git config --get core.editor)" ] && [ -z "${GIT_EDITOR}" ]; then
252 |     if  [ "${TERM_PROGRAM}" = "vscode" ]; then
253 |         if [[ -n $(command -v code-insiders) &&  -z $(command -v code) ]]; then 
254 |             export GIT_EDITOR="code-insiders --wait"
255 |         else 
256 |             export GIT_EDITOR="code --wait"
257 |         fi
258 |     fi
259 | fi
260 | 
261 | EOF
262 | )"
263 | 
264 | # code shim, it fallbacks to code-insiders if code is not available
265 | cat << 'EOF' > /usr/local/bin/code
266 | #!/bin/sh
267 | 
268 | get_in_path_except_current() {
269 |     which -a "$1" | grep -A1 "$0" | grep -v "$0"
270 | }
271 | 
272 | code="$(get_in_path_except_current code)"
273 | 
274 | if [ -n "$code" ]; then
275 |     exec "$code" "$@"
276 | elif [ "$(command -v code-insiders)" ]; then
277 |     exec code-insiders "$@"
278 | else
279 |     echo "code or code-insiders is not installed" >&2
280 |     exit 127
281 | fi
282 | EOF
283 | chmod +x /usr/local/bin/code
284 | 
285 | # systemctl shim - tells people to use 'service' if systemd is not running
286 | cat << 'EOF' > /usr/local/bin/systemctl
287 | #!/bin/sh
288 | set -e
289 | if [ -d "/run/systemd/system" ]; then
290 |     exec /bin/systemctl/systemctl "$@"
291 | else
292 |     echo '\n"systemd" is not running in this container due to its overhead.\nUse the "service" command to start services intead. e.g.: \n\nservice --status-all'
293 | fi
294 | EOF
295 | chmod +x /usr/local/bin/systemctl
296 | 
297 | # Codespaces bash and OMZ themes - partly inspired by https://github.com/ohmyzsh/ohmyzsh/blob/master/themes/robbyrussell.zsh-theme
298 | codespaces_bash="$(cat \
299 | <<'EOF'
300 | 
301 | # Codespaces bash prompt theme
302 | __bash_prompt() {
303 |     local userpart='`export XIT=$? \
304 |         && [ ! -z "${GITHUB_USER}" ] && echo -n "\[\033[0;32m\]@${GITHUB_USER} " || echo -n "\[\033[0;32m\]\u " \
305 |         && [ "$XIT" -ne "0" ] && echo -n "\[\033[1;31m\]➜" || echo -n "\[\033[0m\]➜"`'
306 |     local gitbranch='`\
307 |         if [ "$(git config --get codespaces-theme.hide-status 2>/dev/null)" != 1 ]; then \
308 |             export BRANCH=$(git symbolic-ref --short HEAD 2>/dev/null || git rev-parse --short HEAD 2>/dev/null); \
309 |             if [ "${BRANCH}" != "" ]; then \
310 |                 echo -n "\[\033[0;36m\](\[\033[1;31m\]${BRANCH}" \
311 |                 && if git ls-files --error-unmatch -m --directory --no-empty-directory -o --exclude-standard ":/*" > /dev/null 2>&1; then \
312 |                         echo -n " \[\033[1;33m\]✗"; \
313 |                 fi \
314 |                 && echo -n "\[\033[0;36m\]) "; \
315 |             fi; \
316 |         fi`'
317 |     local lightblue='\[\033[1;34m\]'
318 |     local removecolor='\[\033[0m\]'
319 |     PS1="${userpart} ${lightblue}\w ${gitbranch}${removecolor}\$ "
320 |     unset -f __bash_prompt
321 | }
322 | __bash_prompt
323 | 
324 | EOF
325 | )"
326 | 
327 | codespaces_zsh="$(cat \
328 | <<'EOF'
329 | # Codespaces zsh prompt theme
330 | __zsh_prompt() {
331 |     local prompt_username
332 |     if [ ! -z "${GITHUB_USER}" ]; then 
333 |         prompt_username="@${GITHUB_USER}"
334 |     else
335 |         prompt_username="%n"
336 |     fi
337 |     PROMPT="%{$fg[green]%}${prompt_username} %(?:%{$reset_color%}➜ :%{$fg_bold[red]%}➜ )" # User/exit code arrow
338 |     PROMPT+='%{$fg_bold[blue]%}%(5~|%-1~/…/%3~|%4~)%{$reset_color%} ' # cwd
339 |     PROMPT+='$([ "$(git config --get codespaces-theme.hide-status 2>/dev/null)" != 1 ] && git_prompt_info)' # Git status
340 |     PROMPT+='%{$fg[white]%}$ %{$reset_color%}'
341 |     unset -f __zsh_prompt
342 | }
343 | ZSH_THEME_GIT_PROMPT_PREFIX="%{$fg_bold[cyan]%}(%{$fg_bold[red]%}"
344 | ZSH_THEME_GIT_PROMPT_SUFFIX="%{$reset_color%} "
345 | ZSH_THEME_GIT_PROMPT_DIRTY=" %{$fg_bold[yellow]%}✗%{$fg_bold[cyan]%})"
346 | ZSH_THEME_GIT_PROMPT_CLEAN="%{$fg_bold[cyan]%})"
347 | __zsh_prompt
348 | 
349 | EOF
350 | )"
351 | 
352 | # Add RC snippet and custom bash prompt
353 | if [ "${RC_SNIPPET_ALREADY_ADDED}" != "true" ]; then
354 |     echo "${rc_snippet}" >> /etc/bash.bashrc
355 |     echo "${codespaces_bash}" >> "${user_rc_path}/.bashrc"
356 |     echo 'export PROMPT_DIRTRIM=4' >> "${user_rc_path}/.bashrc"
357 |     if [ "${USERNAME}" != "root" ]; then
358 |         echo "${codespaces_bash}" >> "/root/.bashrc"
359 |         echo 'export PROMPT_DIRTRIM=4' >> "/root/.bashrc"
360 |     fi
361 |     chown ${USERNAME}:${group_name} "${user_rc_path}/.bashrc"
362 |     RC_SNIPPET_ALREADY_ADDED="true"
363 | fi
364 | 
365 | # Optionally install and configure zsh and Oh My Zsh!
366 | if [ "${INSTALL_ZSH}" = "true" ]; then
367 |     if ! type zsh > /dev/null 2>&1; then
368 |         apt_get_update_if_needed
369 |         apt-get install -y zsh
370 |     fi
371 |     if [ "${ZSH_ALREADY_INSTALLED}" != "true" ]; then
372 |         echo "${rc_snippet}" >> /etc/zsh/zshrc
373 |         ZSH_ALREADY_INSTALLED="true"
374 |     fi
375 | 
376 |     # Adapted, simplified inline Oh My Zsh! install steps that adds, defaults to a codespaces theme.
377 |     # See https://github.com/ohmyzsh/ohmyzsh/blob/master/tools/install.sh for official script.
378 |     oh_my_install_dir="${user_rc_path}/.oh-my-zsh"
379 |     if [ ! -d "${oh_my_install_dir}" ] && [ "${INSTALL_OH_MYS}" = "true" ]; then
380 |         template_path="${oh_my_install_dir}/templates/zshrc.zsh-template"
381 |         user_rc_file="${user_rc_path}/.zshrc"
382 |         umask g-w,o-w
383 |         mkdir -p ${oh_my_install_dir}
384 |         git clone --depth=1 \
385 |             -c core.eol=lf \
386 |             -c core.autocrlf=false \
387 |             -c fsck.zeroPaddedFilemode=ignore \
388 |             -c fetch.fsck.zeroPaddedFilemode=ignore \
389 |             -c receive.fsck.zeroPaddedFilemode=ignore \
390 |             "https://github.com/ohmyzsh/ohmyzsh" "${oh_my_install_dir}" 2>&1
391 |         echo -e "$(cat "${template_path}")\nDISABLE_AUTO_UPDATE=true\nDISABLE_UPDATE_PROMPT=true" > ${user_rc_file}
392 |         sed -i -e 's/ZSH_THEME=.*/ZSH_THEME="codespaces"/g' ${user_rc_file}
393 | 
394 |         mkdir -p ${oh_my_install_dir}/custom/themes
395 |         echo "${codespaces_zsh}" > "${oh_my_install_dir}/custom/themes/codespaces.zsh-theme"
396 |         # Shrink git while still enabling updates
397 |         cd "${oh_my_install_dir}"
398 |         git repack -a -d -f --depth=1 --window=1
399 |         # Copy to non-root user if one is specified
400 |         if [ "${USERNAME}" != "root" ]; then
401 |             cp -rf "${user_rc_file}" "${oh_my_install_dir}" /root
402 |             chown -R ${USERNAME}:${group_name} "${user_rc_path}"
403 |         fi
404 |     fi
405 | fi
406 | 
407 | # Persist image metadata info, script if meta.env found in same directory
408 | meta_info_script="$(cat << 'EOF'
409 | #!/bin/sh
410 | . /usr/local/etc/vscode-dev-containers/meta.env
411 | 
412 | # Minimal output
413 | if [ "$1" = "version" ] || [ "$1" = "image-version" ]; then
414 |     echo "${VERSION}"
415 |     exit 0
416 | elif [ "$1" = "release" ]; then
417 |     echo "${GIT_REPOSITORY_RELEASE}"
418 |     exit 0
419 | elif [ "$1" = "content" ] || [ "$1" = "content-url" ] || [ "$1" = "contents" ] || [ "$1" = "contents-url" ]; then
420 |     echo "${CONTENTS_URL}"
421 |     exit 0
422 | fi
423 | 
424 | #Full output
425 | echo
426 | echo "Development container image information"
427 | echo
428 | if [ ! -z "${VERSION}" ]; then echo "- Image version: ${VERSION}"; fi
429 | if [ ! -z "${DEFINITION_ID}" ]; then echo "- Definition ID: ${DEFINITION_ID}"; fi
430 | if [ ! -z "${VARIANT}" ]; then echo "- Variant: ${VARIANT}"; fi
431 | if [ ! -z "${GIT_REPOSITORY}" ]; then echo "- Source code repository: ${GIT_REPOSITORY}"; fi
432 | if [ ! -z "${GIT_REPOSITORY_RELEASE}" ]; then echo "- Source code release/branch: ${GIT_REPOSITORY_RELEASE}"; fi
433 | if [ ! -z "${BUILD_TIMESTAMP}" ]; then echo "- Timestamp: ${BUILD_TIMESTAMP}"; fi
434 | if [ ! -z "${CONTENTS_URL}" ]; then echo && echo "More info: ${CONTENTS_URL}"; fi
435 | echo
436 | EOF
437 | )"
438 | if [ -f "${SCRIPT_DIR}/meta.env" ]; then
439 |     mkdir -p /usr/local/etc/vscode-dev-containers/
440 |     cp -f "${SCRIPT_DIR}/meta.env" /usr/local/etc/vscode-dev-containers/meta.env
441 |     echo "${meta_info_script}" > /usr/local/bin/devcontainer-info
442 |     chmod +x /usr/local/bin/devcontainer-info
443 | fi
444 | 
445 | # Write marker file
446 | mkdir -p "$(dirname "${MARKER_FILE}")"
447 | echo -e "\
448 |     PACKAGES_ALREADY_INSTALLED=${PACKAGES_ALREADY_INSTALLED}\n\
449 |     LOCALE_ALREADY_SET=${LOCALE_ALREADY_SET}\n\
450 |     EXISTING_NON_ROOT_USER=${EXISTING_NON_ROOT_USER}\n\
451 |     RC_SNIPPET_ALREADY_ADDED=${RC_SNIPPET_ALREADY_ADDED}\n\
452 |     ZSH_ALREADY_INSTALLED=${ZSH_ALREADY_INSTALLED}" > "${MARKER_FILE}"
453 | 
454 | echo "Done!"
455 | 


--------------------------------------------------------------------------------
/.github/workflows/metrics.yml:
--------------------------------------------------------------------------------
 1 | # GitHub Action to post GitHub metrics to an Azure Function App webhook
 2 | # Required secrets
 3 | #   1. A PAT with repo rights:    PAT_REPO_REPORT
 4 | #   2. The webhook endpoint url:  REPORTING_ENDPOINT_URL
 5 | #   3. The webhook endpoint key:  REPORTING_ENDPOINT_KEY
 6 | #   4. Reporting group/team:      REPORTING_GROUP
 7 | 
 8 | name: "GitHub repo metrics report"
 9 | 
10 | on:
11 |   schedule:
12 |     # Run this once per day, towards the end of the day for keeping the most
13 |     # recent data point most meaningful (hours are interpreted in UTC).
14 |     - cron: "0 23 * * *"
15 |   workflow_dispatch: # Allow for running this manually.
16 | 
17 | jobs:
18 |   report_metrics_job:
19 |     runs-on: ubuntu-latest
20 |     name: GitHub repo metrics report
21 |     steps:
22 |       - name: run github metrics image
23 |         id: github_metrics
24 |         uses: gloveboxes/GitHubMetricsAction@v1
25 |         with:
26 |           github_repo: ${{ github.repository }}
27 |           github_personal_access_token: ${{ secrets.REPORTING_PAT }}
28 |           reporting_endpoint_url: ${{ secrets.REPORTING_ENDPOINT_URL }}
29 |           reporting_endpoint_key: ${{ secrets.REPORTING_ENDPOINT_KEY }}
30 |           reporting_group: $${{ secrets.REPORTING_GROUP }}
31 | 


--------------------------------------------------------------------------------
/.github/workflows/metrics_keepalive.yml:
--------------------------------------------------------------------------------
 1 | # GitHub Actions go to sleep after 60 days of no activity
 2 | # This action runs every month and updates the .stats_timestamp with the current UTC
 3 | # This will generate activity on the repo
 4 | # This activity will keep the git hub repo metrics action alive
 5 | 
 6 | name: "GitHub repo metrics report keep alive"
 7 | 
 8 | on:
 9 |   schedule:
10 |     # Run this once per day, towards the end of the day for keeping the most
11 |     # recent data point most meaningful (hours are interpreted in UTC).
12 |     # https://cron.help/every-month
13 |     # Action run midnight UTC 1st of each month
14 |     - cron: "0 0 1 * *"
15 |   workflow_dispatch: # Allow for running this manually.
16 | 
17 | jobs:
18 |     resources:
19 |         name: Update the repo metrics timestamp
20 |         runs-on: ubuntu-latest
21 |         steps:
22 |             - uses: actions/checkout@v3
23 | 
24 |             - name: timestamp
25 |               run: date -Iseconds -u > .stats_timestamp
26 | 
27 |             - name: Update stats timestamp
28 |               run: |
29 |                 git add .stats_timestamp
30 |                 git config user.name "github-actions[bot]"
31 |                 git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
32 |                 git commit -am "Repo Stats Timestamp"
33 |                 git push
34 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # History files
 2 | .Rhistory
 3 | .Rapp.history
 4 | 
 5 | # Session Data files
 6 | .RData
 7 | 
 8 | # User-specific files
 9 | .Ruserdata
10 | 
11 | # Example code in package build process
12 | *-Ex.R
13 | 
14 | # Output files from R CMD build
15 | /*.tar.gz
16 | 
17 | # Output files from R CMD check
18 | /*.Rcheck/
19 | 
20 | # RStudio files
21 | .Rproj.user/
22 | 
23 | # produced vignettes
24 | vignettes/*.html
25 | vignettes/*.pdf
26 | 
27 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
28 | .httr-oauth
29 | 
30 | # knitr and R markdown default cache directories
31 | *_cache/
32 | /cache/
33 | 
34 | # Temporary files created by R markdown
35 | *.utf8.md
36 | *.knit.md
37 | 
38 | # R Environment Variables
39 | .Renviron
40 | 


--------------------------------------------------------------------------------
/.stats_timestamp:
--------------------------------------------------------------------------------
1 | 2024-05-01T01:08:13+00:00
2 | 


--------------------------------------------------------------------------------
/EasyRTutorialsUseR2022.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/revodavid/devcontainers-r/ba8b859d19fb84750d162becdb2f890258697ccc/EasyRTutorialsUseR2022.pdf


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 David Smith
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Easy R Tutorials with Dev Containers
 2 | 
 3 | This is the repository supporting the presentation "Easy R Tutorials with Dev Containers".
 4 | 
 5 | * Presenter: [David Smith](https://www.linkedin.com/in/dmsmith/), Cloud Advocate at Microsoft
 6 | * Presented at: [UseR!2022, June 21, 2022](https://user2022.r-project.org/program/talks/#session-10-building-the-r-community-1) 
 7 | * Presentation slides: [PDF](./EasyRTutorialsUseR2022.pdf)
 8 | * Presentation recording: [UseR!2022](https://www.accelevents.com/e/user2022/portal/schedule/260730) (Starting at 47:45)
 9 | 
10 | You can recreate the demos in the talk using the steps outlined below.
11 | 
12 | ## Dev Containers in GitHub Codepaces
13 | 
14 | If you have access to GitHub CodeSpaces, click the green "Code <>" button at the top right on this repository page, and then select "Create codespace on main". (GitHub CodeSpaces is available with [GitHub Enterprise](https://github.com/enterprise) and [GitHub Education](https://education.github.com/).)
15 | 
16 | Once the Dev Container has started, browse to the file [intro-regression-R-tidymodels/solution/Challenge-regression.ipynb](intro-regression-R-tidymodels/solution/Challenge-regression.ipynb). This will launch a Jupyter Notebook.
17 | 
18 | ## Dev Containers on a local machine
19 | 
20 | You can use Linux, Mac or Windows (including Windows Subsystem for Linux). Just make sure your machine has the following necessary software installed:
21 | - [Visual Studio Code](https://code.visualstudio.com?WT.mc_id=academic-55190-ornella), and the [Remote-Containers extension](https://code.visualstudio.com/docs/remote/containers)
22 | - [Docker Desktop](https://www.docker.com/products/docker-desktop)
23 | - [Git](https://git-scm.com/downloads)
24 | 
25 | **Note**: you do not need to install R, Python, or anything like that. These will all be provided by the Dev Container. 
26 | 
27 | Copy the contents of this repository to your machine. An easy way to do this is with the command: 
28 | ```
29 | git clone https://github.com/revodavid/devcontainers-r
30 | ```
31 | 
32 | Launch Visual Studio Code, and open the directory containing this downloaded repository. An easy way to do this is:
33 | ```
34 | cd devcontainers-r
35 | code .
36 | ```
37 | 
38 | Open the VS Code command palette (Control-Shift-P) and run the command **Remote-Containers: Reopen in Container**. (You can also use the pop-up dialog that automatically prompts you do this.) The first time you try this, you will need to wait a few minutes for the container to build. After this first time, startup will be near-instantaneous.
39 | 
40 | Now, browse to the file [intro-regression-R-tidymodels/solution/Challenge-regression.ipynb](intro-regression-R-tidymodels/solution/Challenge-regression.ipynb). Work through the Jupyter Notebook.
41 | 
42 | # Resources and Links
43 | 
44 | * [Dev Containers](https://containers.dev/) - Overview and specification
45 | * [Dev Containers in Visual Studio Code](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) - Remote-Containers extension 
46 | * [Visual Studio Code](https://code.visualstudio.com/) - Free editor available for Windows, Mac and Linux
47 | * [GitHub Codespaces](https://github.com/features/codespaces) - Available with GitHub Enterprise and GitHub Education
48 | * [Microsoft Workshop Library](https://github.com/microsoft/workshop-library
49 | ) - The source of the workshop "Introduction to regression models by using R and Tidymodels" included in this presentation
50 | 
51 | # Image Credits
52 | 
53 | Images used in presentation slides:
54 | * [File:A frustrated and depressed man holds his head in his hand.jpg - Wikimedia Commons](https://commons.wikimedia.org/wiki/File:A_frustrated_and_depressed_man_holds_his_head_in_his_hand.jpg)
55 | * [File:Confused Felipe.jpg - Wikimedia Commons](https://commons.wikimedia.org/wiki/File:Confused_Felipe.jpg)
56 | * [File:Woman looking depressed.jpg - Wikimedia Commons](https://commons.wikimedia.org/wiki/File:Woman_looking_depressed.jpg)
57 | * [File:Angry woman.jpg - Wikimedia Commons](https://commons.wikimedia.org/wiki/File:Angry_woman.jpg)
58 | * "Bit" artwork by Ashley Willis
59 | 
60 | # Feedback
61 | 
62 | If you have any comments or suggestions about this presentation, please leave an issue in this repository.


--------------------------------------------------------------------------------
/intro-regression-R-tidymodels/README.md:
--------------------------------------------------------------------------------
  1 | # Introduction to regression models by using R and Tidymodels
  2 | 
  3 | ## Module Source
  4 | [Introduction to regression models by using R and tidymodels](https://docs.microsoft.com/en-us/learn/modules/introduction-regression-models/?WT.mc_id=academic-59300-cacaste)
  5 | 
  6 | ## Goals
  7 | 
  8 | Hello and welcome to this learning adventure! In this folder, you will find a Regression Challenge Notebook. This is basically an autograding guided assessment notebook that will help you test your understanding in using R to create models that can predict a numeric, quantifiable value, such as a price, amount, size, or other scalar numbers.
  9 | 
 10 | | **Goal**                      | Description                                    |
 11 | | ----------------------------- | -----------------------------------------------|
 12 | | **What will you learn**       | How to create regression models in R           |
 13 | | **What you'll need**          | [Visual Studio Code](https://code.visualstudio.com?WT.mc_id=academic-59300-cacaste), [Docker Desktop](https://www.docker.com/products/docker-desktop), [Remote Developer Extension](https://aka.ms/vscode-remote/download/extension) and [Git](https://git-scm.com/downloads) |
 14 | | **Duration**                  | 1.5 to 2 Hours                                         |
 15 | | **Slides**                    | [Powerpoint](./slides.pptx)                               |
 16 | 
 17 | ## Video
 18 | 
 19 | [![workshop walk-through](./images/promo.png)](https://youtu.be/ckqijBKO-Es "workshop walk-through")
 20 | > 🎥 Click this image to watch Carlotta walk you through the workshop material and to gain some tips about delivering this workshop.
 21 | 
 22 | ## Pre-Learning
 23 | 
 24 | This workshop allows learners to use the skills learnt in the module [Introduction to regression models by using R and tidymodels](https://docs.microsoft.com/en-us/learn/modules/introduction-regression-models/?WT.mc_id=academic-59300-cacaste) to create their own regression models. As such, learners are encouraged to go through the module beforehand so as to be conversant with some of the concepts covered in this workshop.
 25 | This workshop is the second of a series designed to get started with data science in R. So you may also want to have a look to the [first](../explore-analyze-data-with-R) workshop of the series, dealing with exploratory data analysis.
 26 | 
 27 | ## Prerequisites
 28 | 
 29 | To get you up and running and writing R code in no time, we have containerized this workshop such that you have a ready out of the box R coding environment.
 30 | 
 31 | ### Setting up the development container
 32 | 
 33 | A **development container** is a running [Docker](https://www.docker.com) container with a well-defined tool/runtime stack and its prerequisites. You can try out development containers with **[GitHub Codespaces](https://github.com/features/codespaces)**, **[Binder](https://mybinder.org/)** or **[Visual Studio Code Remote - Containers](https://aka.ms/vscode-remote/containers)**.
 34 | 
 35 | #### GitHub Codespaces
 36 | Follow these steps to open this workshop in a Codespace:
 37 | 1. Click the Code drop-down menu and select the **Open with Codespaces** option.
 38 | 2. Select **+ New codespace** at the bottom on the pane.
 39 | 
 40 | For more info, check out the [GitHub documentation](https://docs.github.com/en/free-pro-team@latest/github/developing-online-with-codespaces/creating-a-codespace#creating-a-codespace).
 41 | 
 42 | #### Binder
 43 | This workshop is also available on Binder. To open the notebook in a Binder environment, just click the button below.
 44 | 
 45 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/carlotta94c/workshop-library/introToRegressionR%26Tidymodels?labpath=full%2Fintro-regression-R-tidymodels%2Fsolution%2FChallenge-regression.ipynb)
 46 | 
 47 | #### Learn Sandbox
 48 | You can go through this challenge also leveraging on the Learn Sandbox environment, provided by [Unit 9](https://docs.microsoft.com/en-us/learn/modules/introduction-regression-models/9-challenge-regression) of the MS Learn module - Introduction to regression models by using R and tidymodels. Just sign in with your Microsoft or GitHub account and click on **Activate sandbox** to start.
 49 | 
 50 | #### VS Code Remote - Containers
 51 | Follow these steps to open this workshop in a container using the VS Code Remote - Containers extension:
 52 | 
 53 | 1. If this is your first time using a development container, please ensure your system meets the pre-reqs (i.e. have Docker installed) in the [getting started steps](https://aka.ms/vscode-remote/containers/getting-started).
 54 | 
 55 | 2. Press <kbd>F1</kbd> select and **Add Development Container Configuration Files...** command for **Remote-Containers** or **Codespaces**.
 56 | 
 57 |    > **Note:** If needed, you can drag-and-drop the `.devcontainer` folder from this sub-folder in a locally cloned copy of this repository into the VS Code file explorer instead of using the command.
 58 | 
 59 | 3. Select this definition. You may also need to select **Show All Definitions...** for it to appear.
 60 | 
 61 | 4. Finally, press <kbd>F1</kbd> and run **Remote-Containers: Reopen Folder in Container** to start using the definition.
 62 | 
 63 | This definition includes some test code that will help you verify it is working as expected on your system. Open the `all-systems-check` folder where you can choose to run the `.R`, `.Rmd` or `.ipynb` scripts. You should see "Hello, remote world!" in an R terminal window (for `.R` and `.Rmd`) or within a Jupyter Notebook (for `.ipynb`) after the respective script executes.
 64 | 
 65 | At some point, you may want to make changes to your container, such as installing a new package. You'll need to rebuild your container for your changes to take effect.
 66 | 
 67 | ## What you will learn
 68 | 
 69 | Let's say we are a real estate agent and we've just been handed a couple of new houses at different locations of a city. We don't know the selling price, and we want to get an estimate of it by comparing it with that of other houses in the location.
 70 | 
 71 | In this challenge, you will use a dataset of real estate sales transactions to predict the price-per-unit of a property based on features such as the property age, availability of local amenities, and location.
 72 | 
 73 | ## Milestone 1: Explore the Data 🕵️‍️
 74 | 
 75 | The first step in any machine learning project is typically to explore the data that you will use to train a model. The goal of this exploration is to try to understand the relationships between its attributes; in particular, any apparent correlation between the features and the label your model will try to predict.
 76 | 
 77 | In this section you will:
 78 | 
 79 | - Import the data and identify the `features` (predictors) and `label` (outcome) variables that you'll be working with.
 80 | 
 81 | - Examine the summary statistics and plot the distribution of the outcome variable, in our case: price of a house.
 82 | 
 83 | - Deal with outliers.
 84 | 
 85 | - Examine the apparent relationship between numeric features and the price of a house using the correlation statistic and scatter plots.
 86 | 
 87 | - Examine the distribution of the house price for each categorical feature using boxplots.
 88 | 
 89 | Now that you've explored the data, it's time to use it to train a regression model that uses the features we've identified as potentially predictive to predict the rentals label.
 90 | 
 91 | ## Milestone 2: Data budgeting
 92 | 
 93 |  It’s common practice in supervised learning to split the data into two subsets; a (typically larger) set with which to train the model, and a smaller “hold-back” set with which to validate the trained model. This enables us to evaluate how well the model performs in order to get a better estimate of how your models will perform on new data.
 94 | 
 95 | In this section you will:
 96 | 
 97 | - Create a data splitting specification i.e what proportion goes to training and what goes to testing.
 98 | 
 99 | - Extract the training and testing sets.
100 | 
101 | ## Milestone 3: Create a Random Forest model specification
102 | 
103 | In this section, you will create a model specification with the following information:
104 | 
105 | - the `type` of model is `random forest`
106 | 
107 | - the `mode` of the model is `regression` (as opposed to classification, etc)
108 | 
109 | - the computational `engine` is the name of the R package, which in our case will be the `randomForest` package.
110 | 
111 | ## Milestone 4: Preprocess data using recipes
112 | 
113 | A recipe is an object that defines a series of steps for data processing. In practice, it's common to perform some preprocessing of the data to make it easier for an algorithm to fit a model to it.
114 | 
115 | In this section, you will specify a recipe that will:
116 | 
117 | - Remove the transaction_date feature.
118 | 
119 | - Transform local_convenience_stores feature into categorical (factor).
120 | 
121 | - Center and scale all numeric predictors.
122 | 
123 | ## Milestone 5: Create a modeling workflow and train a model
124 | 
125 | A model workflow allows the user to bind modeling and preprocessing objects together. You can then fit the entire workflow to the data, so that the model encapsulates all of the preprocessing steps as well as the algorithm.
126 | 
127 | In this section you will:
128 | 
129 | - Create a model workflow.
130 | 
131 | - Train the random forest model.
132 | 
133 | ## Milestone 6: Evaluate model performance
134 | 
135 | Once you have a trained model using the training set, you will want to evaluate how well (or not) it will perform on new data.
136 | 
137 | In this section, you will:
138 | 
139 | - Use the trained model to make predictions on the `test set`.
140 | 
141 | - Evaluate the model predictions using metrics such as `rmse` and `R^2`.
142 | 
143 | ## Milestone 7: Use the trained model
144 | 
145 | If you are satisfied with the model performance, you can save it to be used later. You can then load it whenever you need it, and use it to predict labels for new data. This is often called scoring or inferencing.
146 | 
147 | In this section you will:
148 | 
149 | - Save your trained model, and then use it to predict the price-per-unit for the following real estate transactions:
150 | 
151 |  | **transaction_date** | **house_age** | **transit_distance** | **local_convenience_stores** | **latitude** | **longitude** |
152 | |---------------------|----------------|--------------|--------|-------|------|
153 | | 2013.167             | 16.2          | 289.3248             | 5                            | 24.98203     | 121.54348     |
154 | | 2013.000             | 13.6          | 4082.015             | 0                            | 24.94155     | 121.50381     |
155 | 
156 | ## Quiz
157 | 
158 | Test your knowledge with [a short quiz](https://docs.microsoft.com/en-us/learn/modules/introduction-regression-models/8-knowledge-check)!
159 | 
160 | ## Next steps
161 | 
162 | Congratulations on finishing this regression challenge 🏅!
163 | 
164 | There are other workshops around using R for Data Science. In this workshop, we learnt how regression can be used to create a machine learning model that predicts numeric values. The next set of workshops show you how to [create classification models](../intro-classification-R-tidymodels) and create clustering models (coming soon!). Be sure to check them out!
165 | 
166 | ## Practice
167 | 
168 | In this workshop, you trained a single model (random forest) to predict house prices based on their features. Sometimes, a data practitioner may need to try out a couple of models. Try using other models discussed in this workshop. Try tuning some model hyperparameters while at it too. Do you obtain better evaluation metrics?
169 | 
170 | 
171 | ## Feedback
172 | 
173 | Be sure to give [feedback about this workshop](https://forms.office.com/r/MdhJWMZthR)! Happy Learning!
174 | 
175 | [Code of Conduct](../../CODE_OF_CONDUCT.md)
176 | 


--------------------------------------------------------------------------------
/intro-regression-R-tidymodels/images/promo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/revodavid/devcontainers-r/ba8b859d19fb84750d162becdb2f890258697ccc/intro-regression-R-tidymodels/images/promo.png


--------------------------------------------------------------------------------
/intro-regression-R-tidymodels/slides.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/revodavid/devcontainers-r/ba8b859d19fb84750d162becdb2f890258697ccc/intro-regression-R-tidymodels/slides.pptx


--------------------------------------------------------------------------------
/intro-regression-R-tidymodels/solution/Challenge-regression.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |     "cells": [
  3 |         {
  4 |             "cell_type": "markdown",
  5 |             "metadata": {},
  6 |             "source": [
  7 |                 "## Regression Challenge\n",
  8 |                 "\n",
  9 |                 "Predicting the selling price of a residential property depends on a number of factors, including the property age, availability of local amenities, and location.\n",
 10 |                 "\n",
 11 |                 "In this challenge, you will use a dataset of real estate sales transactions to predict the price-per-unit of a property based on its features. The price-per-unit in this data is based on a unit measurement of 3.3 square meters.\n",
 12 |                 "\n",
 13 |                 "> **Citation**: The data used in this exercise originates from the following study:\n",
 14 |                 ">\n",
 15 |                 "> *Yeh, I. C., & Hsu, T. K. (2018). Building real estate valuation models with comparative approach through case-based reasoning. Applied Soft Computing, 65, 260-271.*\n",
 16 |                 ">\n",
 17 |                 "> It was obtained from the UCI dataset repository (Dua, D. and Graff, C. (2019). [UCI Machine Learning Repository](http://archive.ics.uci.edu/ml). Irvine, CA: University of California, School of Information and Computer Science).\n",
 18 |                 "\n",
 19 |                 "## Review the data\n",
 20 |                 "\n",
 21 |                 "Let's hit the ground running by importing the data and viewing the first few rows.\n"
 22 |             ]
 23 |         },
 24 |         {
 25 |             "cell_type": "code",
 26 |             "execution_count": null,
 27 |             "metadata": {},
 28 |             "outputs": [],
 29 |             "source": [
 30 |                 "# Load the core tidyverse and tidymodels in your current R session\n",
 31 |                 "suppressPackageStartupMessages({\n",
 32 |                 " library(tidyverse)\n",
 33 |                 " library(tidymodels)\n",
 34 |                 "})\n",
 35 |                 "\n",
 36 |                 "# Read the csv file into a tibble\n",
 37 |                 "estate_data <- read_csv(file = \"https://raw.githubusercontent.com/MicrosoftDocs/ml-basics/master/challenges/data/real_estate.csv\", \n",
 38 |                 "show_col_types = FALSE)\n",
 39 |                 "\n",
 40 |                 "# Print the first 10 rows of the data\n",
 41 |                 "estate_data %>%\n",
 42 |                 " slice_head(n = 10)\n"
 43 |             ]
 44 |         },
 45 |         {
 46 |             "cell_type": "markdown",
 47 |             "metadata": {},
 48 |             "source": [
 49 |                 "The data consists of the following variables:\n",
 50 |                 "\n",
 51 |                 "-   **transaction_date** - the transaction date (for example, 2013.250=2013 March, 2013.500=2013 June, etc.)\n",
 52 |                 "\n",
 53 |                 "-   **house_age** - the house age (in years)\n",
 54 |                 "\n",
 55 |                 "-   **transit_distance** - the distance to the nearest light rail station (in meters)\n",
 56 |                 "\n",
 57 |                 "-   **local_convenience_stores** - the number of convenience stores within walking distance\n",
 58 |                 "\n",
 59 |                 "-   **latitude** - the geographic coordinate, latitude\n",
 60 |                 "\n",
 61 |                 "-   **longitude** - the geographic coordinate, longitude\n",
 62 |                 "\n",
 63 |                 "-   **price_per_unit** house price of unit area (3.3 square meters)\n",
 64 |                 "\n",
 65 |                 "Your challenge is to explore and prepare the data, identify predictive features that will help predict the `price_per_unit` label, and train a regression model that achieves the lowest *Root Mean Square Error* (RMSE) you can achieve (which must be less than *7*) when evaluated against a test subset of data.\n",
 66 |                 "\n",
 67 |                 "### View the label distribution\n",
 68 |                 "\n",
 69 |                 "Let's start our analysis of the data by examining a few key descriptive statistics. We can use the `summarytools::descr()` function to neatly and quickly summarize the numeric features as well as the *rentals* label column.\n"
 70 |             ]
 71 |         },
 72 |         {
 73 |             "cell_type": "code",
 74 |             "execution_count": null,
 75 |             "metadata": {},
 76 |             "outputs": [],
 77 |             "source": [
 78 |                 "# Load summary tools library\n",
 79 |                 "library(summarytools)\n",
 80 |                 "\n",
 81 |                 "# Obtain summary stats for feature and label columns\n",
 82 |                 "estate_data %>%\n",
 83 |                 "  # Summary stats\n",
 84 |                 "  descr(order = \"preserve\",\n",
 85 |                 "        stats = c(\"mean\", \"sd\", \"min\", \"q1\", \"med\", \"q3\", \"max\"),\n",
 86 |                 "        round.digits = 6)\n"
 87 |             ]
 88 |         },
 89 |         {
 90 |             "cell_type": "markdown",
 91 |             "metadata": {},
 92 |             "source": [
 93 |                 "The statistics reveal some information about the distribution of the data in each of the numeric fields, including the number of observations (there are 414 records), the mean, standard deviation, minimum and maximum values, and the quantile values (the threshold values for 25%, 50% - which is also the median, and 75% of the data).\n",
 94 |                 "\n",
 95 |                 "From this, we can see that the mean number of price per unit is around 38. There's a comparatively *small standard deviation*, indicating *not much variance* in the prices per unit.\n",
 96 |                 "\n",
 97 |                 "We might get a clearer idea of the distribution of price values by visualizing the data.\n"
 98 |             ]
 99 |         },
100 |         {
101 |             "cell_type": "code",
102 |             "execution_count": null,
103 |             "metadata": {},
104 |             "outputs": [],
105 |             "source": [
106 |                 "library(patchwork)\n",
107 |                 "\n",
108 |                 "# Plot a histogram\n",
109 |                 "theme_set(theme_light())\n",
110 |                 "\n",
111 |                 "hist_plt <- estate_data %>%\n",
112 |                 "  ggplot(mapping = aes(x = price_per_unit)) +\n",
113 |                 "  geom_histogram(bins = 100, fill = \"midnightblue\", alpha = 0.7) +\n",
114 |                 "\n",
115 |                 "  # Add lines for mean and median\n",
116 |                 "  geom_vline(aes(xintercept = mean(price_per_unit), color = \"Mean\"),\n",
117 |                 " linetype = \"dashed\", size = 1.3) +\n",
118 |                 "  geom_vline(aes(xintercept = median(price_per_unit), color = \"Median\"),\n",
119 |                 " linetype = \"dashed\", size = 1.3) +\n",
120 |                 "  xlab(\"\") +\n",
121 |                 "  ylab(\"Frequency\") +\n",
122 |                 "  scale_color_manual(name = \"\", values = c(Mean = \"red\", Median = \"yellow\")) +\n",
123 |                 "  theme(legend.position = c(0.9, 0.9), legend.background = element_blank())\n",
124 |                 "\n",
125 |                 "# Plot a box plot\n",
126 |                 "box_plt <- estate_data %>%\n",
127 |                 "  ggplot(aes(x = price_per_unit, y = 1)) +\n",
128 |                 "  geom_boxplot(fill = \"#E69F00\", color = \"gray23\", alpha = 0.7) +\n",
129 |                 "    # Add titles and labels\n",
130 |                 "  xlab(\"Price_per_unit\") +\n",
131 |                 "  ylab(\"\")\n",
132 |                 "\n",
133 |                 "\n",
134 |                 "# Combine plots using patchwork syntax\n",
135 |                 "(hist_plt / box_plt) +\n",
136 |                 "  plot_annotation(title = \"Price Distribution\",\n",
137 |                 "                  theme = theme(\n",
138 |                 "                    plot.title = element_text(hjust = 0.5)))\n"
139 |             ]
140 |         },
141 |         {
142 |             "cell_type": "markdown",
143 |             "metadata": {},
144 |             "source": [
145 |                 "What can we observe from the boxplot? Yes, outliers.\n",
146 |                 "\n",
147 |                 "### Remove outliers\n",
148 |                 "\n",
149 |                 "We are now set to begin writing some code ourselves 🙂. Let's begin by dealing with outliers. An outlier is a data point that differs significantly from other observations.\n",
150 |                 "\n",
151 |                 "**Question 1.**\n",
152 |                 "\n",
153 |                 "Starting with the `estate_data` dataset, `filter` to create a subset that contains observations where `price_per_unit` is less than *70*.\n",
154 |                 "\n",
155 |                 "Fill in the placeholder `....` with the right code."
156 |             ]
157 |         },
158 |         {
159 |             "cell_type": "code",
160 |             "execution_count": null,
161 |             "metadata": {},
162 |             "outputs": [],
163 |             "source": [
164 |                 "# Narrow down to observations whose price_per_unit is less than 70\n",
165 |                 "estate_data <- estate_data %>%\n",
166 |                 "  ....\n"
167 |             ]
168 |         },
169 |         {
170 |             "cell_type": "markdown",
171 |             "metadata": {},
172 |             "source": [
173 |                 "Test your answer:\n"
174 |             ]
175 |         },
176 |         {
177 |             "cell_type": "code",
178 |             "execution_count": null,
179 |             "metadata": {},
180 |             "outputs": [],
181 |             "source": [
182 |                 ". <- ottr::check(\"tests/Question 1.R\")\n"
183 |             ]
184 |         },
185 |         {
186 |             "cell_type": "markdown",
187 |             "metadata": {},
188 |             "source": [
189 |                 "Now let's take a look at the distribution without the outliers.\n",
190 |                 "\n"
191 |             ]
192 |         },
193 |         {
194 |             "cell_type": "code",
195 |             "execution_count": null,
196 |             "metadata": {},
197 |             "outputs": [],
198 |             "source": [
199 |                 "# Plot a histogram\n",
200 |                 "theme_set(theme_light())\n",
201 |                 "hist_plt <- estate_data %>%\n",
202 |                 "  ggplot(mapping = aes(x = price_per_unit)) + \n",
203 |                 "  geom_histogram(bins = 100, fill = \"midnightblue\", alpha = 0.7) +\n",
204 |                 "\n",
205 |                 "  # Add lines for mean and median\n",
206 |                 "  geom_vline(aes(xintercept = mean(price_per_unit), color = \"Mean\"),\n",
207 |                 " linetype = \"dashed\", size = 1.3) +\n",
208 |                 "  geom_vline(aes(xintercept = median(price_per_unit), color = \"Median\"),\n",
209 |                 " linetype = \"dashed\", size = 1.3) +\n",
210 |                 "  xlab(\"\") +\n",
211 |                 "  ylab(\"Frequency\") +\n",
212 |                 "  scale_color_manual(name = \"\", values = c(Mean = \"red\", Median = \"yellow\")) +\n",
213 |                 "  theme(legend.position = c(0.9, 0.9), legend.background = element_blank())\n",
214 |                 "\n",
215 |                 "# Plot a box plot\n",
216 |                 "box_plt <- estate_data %>%\n",
217 |                 "  ggplot(aes(x = price_per_unit, y = 1)) +\n",
218 |                 "  geom_boxplot(fill = \"#E69F00\", color = \"gray23\", alpha = 0.7) +\n",
219 |                 "    # Add titles and labels\n",
220 |                 "  xlab(\"Price_per_unit\") +\n",
221 |                 "  ylab(\"\")\n",
222 |                 "\n",
223 |                 "\n",
224 |                 "# Combine plots using patchwork syntax\n",
225 |                 "(hist_plt / box_plt) +\n",
226 |                 "  plot_annotation(title = \"Price Distribution\",\n",
227 |                 "                  theme = theme(\n",
228 |                 "                    plot.title = element_text(hjust = 0.5)))\n"
229 |             ]
230 |         },
231 |         {
232 |             "cell_type": "markdown",
233 |             "metadata": {},
234 |             "source": [
235 |                 "Much better 🤩! What can we say about the distribution of the price?\n",
236 |                 "\n",
237 |                 "### View numeric correlations\n",
238 |                 "\n",
239 |                 "We can now start to look for relationships between the *features* and the *label* we want to be able to predict.\n",
240 |                 "\n",
241 |                 "The *correlation* statistic, *r*, is a value between -1 and 1 that indicates the strength of a linear relationship.\n",
242 |                 "\n",
243 |                 "For numeric feature and label columns, we can create scatter plots that show the intersection of the feature and label values.\n",
244 |                 "\n",
245 |                 "**Question 2.**\n",
246 |                 "\n",
247 |                 "Starting with the `estate_data` dataset, in a piped sequence:\n",
248 |                 "\n",
249 |                 "- `pivot_longer` the data (increase the number of rows and decrease the number of columns) such that all the existing column names except price_per_unit, now fall under a new column name called `features` and their corresponding values under a new column name `values`\n",
250 |                 "\n",
251 |                 "-   group the data by `features`\n",
252 |                 "\n",
253 |                 "-   add a new column `corr_coef` which calculates the correlation between `values` and `price_per_unit` (hint: the function used for calculating correlation in R is `cor()`)\n",
254 |                 "\n",
255 |                 "Fill in the placeholder `....` with the right code."
256 |             ]
257 |         },
258 |         {
259 |             "cell_type": "code",
260 |             "execution_count": null,
261 |             "metadata": {},
262 |             "outputs": [],
263 |             "source": [
264 |                 "# Pivot numeric features to a long format\n",
265 |                 "numeric_features_long <- estate_data %>%\n",
266 |                 "  pivot_....(!price_per_unit, names_to = \"....\", values_to = \"....\") %>%\n",
267 |                 "  # Group by features\n",
268 |                 "  ....(features) %>%\n",
269 |                 "  # Calculate correlation coefficient between values and price_per_unit\n",
270 |                 "  mutate(corr_coef = ....) %>%\n",
271 |                 "\n",
272 |                 "  # Modifies the feature column to also include corr_coef\n",
273 |                 "  mutate(features = paste(features, \"vs price, r = \",\n",
274 |                 " round(corr_coef, 2), sep = \"\")) %>%\n",
275 |                 "  ungroup()\n",
276 |                 "\n",
277 |                 "# Print the first few rows of the data\n",
278 |                 "numeric_features_long %>%\n",
279 |                 "  slice_head(n = 10)\n"
280 |             ]
281 |         },
282 |         {
283 |             "cell_type": "markdown",
284 |             "metadata": {},
285 |             "source": [
286 |                 "Test your answer:\n"
287 |             ]
288 |         },
289 |         {
290 |             "cell_type": "code",
291 |             "execution_count": null,
292 |             "metadata": {},
293 |             "outputs": [],
294 |             "source": [
295 |                 ". <- ottr::check(\"tests/Question 2.R\")\n"
296 |             ]
297 |         },
298 |         {
299 |             "cell_type": "markdown",
300 |             "metadata": {},
301 |             "source": [
302 |                 "Fantastic! Now let's use a scatter plot to investigate whether there is any linear relationship between our predictors and outcome variables.\n",
303 |                 "\n"
304 |             ]
305 |         },
306 |         {
307 |             "cell_type": "code",
308 |             "execution_count": null,
309 |             "metadata": {},
310 |             "outputs": [],
311 |             "source": [
312 |                 "# Plot a scatter plot for each feature\n",
313 |                 "numeric_features_long %>%\n",
314 |                 "  ggplot(aes(x = values, y = price_per_unit, color = features)) +\n",
315 |                 "  geom_point(alpha = 0.7, show.legend = F) +\n",
316 |                 "  facet_wrap(~ features, scales = \"free\") +\n",
317 |                 "  paletteer::scale_color_paletteer_d(\"ggthemes::excel_Parallax\")\n"
318 |             ]
319 |         },
320 |         {
321 |             "cell_type": "markdown",
322 |             "metadata": {},
323 |             "source": [
324 |                 "Take a moment and go through the scatter plot. How does the correlation between these features and the price vary?\n",
325 |                 "\n",
326 |                 "### View categorical features\n",
327 |                 "\n",
328 |                 "Now let's compare the categorical features to the label. We'll do this by creating box plots that show the distribution of rental counts for each category.\n",
329 |                 "\n",
330 |                 "`Transaction_date` and `local_convenience_stores` seem to be discrete values - so might work better if treated as categorical features. Let' get right into it.\n",
331 |                 "\n",
332 |                 "**Question 3.**\n",
333 |                 "\n",
334 |                 "Starting with the `estate_data` dataset, in a piped sequence:\n",
335 |                 "\n",
336 |                 "-   only keep columns `transaction_date`, `local_convenience_stores` and `price_per_unit`\n",
337 |                 "\n",
338 |                 "-   encode columns `transaction_date` and `local_convenience_stores` as categorical (factor)\n",
339 |                 "\n",
340 |                 "-   `pivot_longer` the data (increase the number of rows and decrease the number of columns) such that all the existing column names except price_per_unit now fall under a new column name called `features` and their corresponding values under a new column name `values`\n",
341 |                 "\n",
342 |                 "Fill in the placeholder `....` with the right code."
343 |             ]
344 |         },
345 |         {
346 |             "cell_type": "code",
347 |             "execution_count": null,
348 |             "metadata": {},
349 |             "outputs": [],
350 |             "source": [
351 |                 "# Pivot categorical features to a long format\n",
352 |                 "cat_features_long <- estate_data %>%\n",
353 |                 "  ....(transaction_date, ...., ....) %>%\n",
354 |                 "  # Encode transaction_date & local_convenience_stores features\n",
355 |                 "  # from numeric to categorical\n",
356 |                 "  mutate(....) %>%\n",
357 |                 "  pivot_longer(....)\n",
358 |                 "\n",
359 |                 "# Print some observations\n",
360 |                 "cat_features_long %>%\n",
361 |                 "  slice_head(n = 10)\n"
362 |             ]
363 |         },
364 |         {
365 |             "cell_type": "markdown",
366 |             "metadata": {},
367 |             "source": [
368 |                 "Test your answer:\n"
369 |             ]
370 |         },
371 |         {
372 |             "cell_type": "code",
373 |             "execution_count": null,
374 |             "metadata": {},
375 |             "outputs": [],
376 |             "source": [
377 |                 ". <- ottr::check(\"tests/Question 3.R\")\n"
378 |             ]
379 |         },
380 |         {
381 |             "cell_type": "markdown",
382 |             "metadata": {},
383 |             "source": [
384 |                 "Perfect! Now, for our categorical features, boxplots can be a great way of visualising how the price per unit varies within the levels of the categorical feature.\n",
385 |                 "\n"
386 |             ]
387 |         },
388 |         {
389 |             "cell_type": "code",
390 |             "execution_count": null,
391 |             "metadata": {},
392 |             "outputs": [],
393 |             "source": [
394 |                 "# Plot a box plot for each feature\n",
395 |                 "cat_features_long %>%\n",
396 |                 "  ggplot() +\n",
397 |                 "  geom_boxplot(aes(x = values, y = price_per_unit, fill = features),\n",
398 |                 " alpha = 0.7, show.legend = F) +\n",
399 |                 "  facet_wrap(~ features, scales = \"free\") +\n",
400 |                 "  scale_fill_viridis_d() +\n",
401 |                 "  theme(panel.grid = element_blank(),\n",
402 |                 "        axis.text.x = element_text(angle = 90))\n"
403 |             ]
404 |         },
405 |         {
406 |             "cell_type": "markdown",
407 |             "metadata": {},
408 |             "source": [
409 |                 "Take a moment and interpret the graphics. How does the price vary with these features?\n",
410 |                 "\n",
411 |                 "## Split the data into training and test sets.\n",
412 |                 "\n",
413 |                 "Now that we've explored the data, it's time to use it to train a regression model that uses the features we've identified as *potentially predictive* to predict the `price_per_unit` label.\n",
414 |                 "\n",
415 |                 "`Transaction_date` doesn't seem to be very predictive, so we'll omit it.\n",
416 |                 "\n",
417 |                 "Let's begin by splitting the data set such that some goes to training and some goes for validation. This enables us to evaluate how well the model performs in order to get a better estimate of how your models will perform on new data.\n",
418 |                 "\n",
419 |                 "**Question 4.**\n",
420 |                 "\n",
421 |                 "In this section:\n",
422 |                 "\n",
423 |                 "-   Make a split specification of `estate_data` such that *70%* goes to training and the rest goes to testing. Save this to a variable name `estate_split`\n",
424 |                 "\n",
425 |                 "-   Extract the training and testing sets from `estate_split` and save them in `estate_train` and `estate_test` variable names respectively.\n",
426 |                 "\n",
427 |                 "Fill in the placeholder `....` with the right code."
428 |             ]
429 |         },
430 |         {
431 |             "cell_type": "code",
432 |             "execution_count": null,
433 |             "metadata": {},
434 |             "outputs": [],
435 |             "source": [
436 |                 "# Set seed to ensure reproducibility and consistency of outputs\n",
437 |                 "set.seed(2056)\n",
438 |                 "\n",
439 |                 "# Load the tidymodels package\n",
440 |                 "library(tidymodels)\n",
441 |                 "\n",
442 |                 "# Split 70% of the data for training and the rest for tesing\n",
443 |                 "estate_split <- estate_data %>%\n",
444 |                 "  initial_split(....)\n",
445 |                 "\n",
446 |                 "# Extract the train and test data in each split\n",
447 |                 "estate_train <- ....(estate_split)\n",
448 |                 "estate_test <- ....(estate_split)\n",
449 |                 "\n",
450 |                 "# Print the number of observations in each split\n",
451 |                 "cat(\"Training Set\", nrow(estate_train), \"rows\",\n",
452 |                 "    \"\\nTest Set\", nrow(estate_test), \"rows\")\n"
453 |             ]
454 |         },
455 |         {
456 |             "cell_type": "markdown",
457 |             "metadata": {},
458 |             "source": [
459 |                 "Test your answer:\n"
460 |             ]
461 |         },
462 |         {
463 |             "cell_type": "code",
464 |             "execution_count": null,
465 |             "metadata": {},
466 |             "outputs": [],
467 |             "source": [
468 |                 ". <- ottr::check(\"tests/Question 4.R\")\n"
469 |             ]
470 |         },
471 |         {
472 |             "cell_type": "markdown",
473 |             "metadata": {},
474 |             "source": [
475 |                 "Great progress 💪! Now let's train some models.\n",
476 |                 "\n",
477 |                 "## Train a regression model\n",
478 |                 "\n",
479 |                 "### Preprocess data using recipes\n",
480 |                 "\n",
481 |                 "Often before fitting a model, we may want to reformat the predictor values to make them easier for a model to use effectively. This includes transformations and encodings of the data to best represent their important characteristics. In R,this is done using a `recipe`.\n",
482 |                 "\n",
483 |                 "A recipe is an object that defines a series of steps for data processing.\n",
484 |                 "\n",
485 |                 "**Question 5.**\n",
486 |                 "\n",
487 |                 "In this section, specify a recipe, `estate_recipe`, that will:\n",
488 |                 "\n",
489 |                 "-   Remove the `transaction_date` feature\n",
490 |                 "\n",
491 |                 "-   Transform `local_convenience_stores` feature into categorical (factor)\n",
492 |                 "\n",
493 |                 "-   Center and scale all numeric predictors\n",
494 |                 "\n",
495 |                 "Fill in the placeholder `....` with the right code."
496 |             ]
497 |         },
498 |         {
499 |             "cell_type": "code",
500 |             "execution_count": null,
501 |             "metadata": {},
502 |             "outputs": [],
503 |             "source": [
504 |                 "# Create a preprocessing recipe\n",
505 |                 "estate_recipe <- ....(price_per_unit ~ ., data = estate_train) %>%\n",
506 |                 "  # Specify the removal of transaction_date\n",
507 |                 "  step_rm(....) %>%\n",
508 |                 "  # Specify the encoding of local_convenience_stores as categorical\n",
509 |                 "  step_mutate(\n",
510 |                 "    local_convenience_stores = ....) %>%\n",
511 |                 "  # Specify the normalization of numeric features\n",
512 |                 "  ....(all_numeric_predictors())\n",
513 |                 " \n",
514 |                 "# Print recipe\n",
515 |                 "estate_recipe\n"
516 |             ]
517 |         },
518 |         {
519 |             "cell_type": "markdown",
520 |             "metadata": {},
521 |             "source": [
522 |                 "Test your answer:\n"
523 |             ]
524 |         },
525 |         {
526 |             "cell_type": "code",
527 |             "execution_count": null,
528 |             "metadata": {},
529 |             "outputs": [],
530 |             "source": [
531 |                 ". <- ottr::check(\"tests/Question 5.R\")\n"
532 |             ]
533 |         },
534 |         {
535 |             "cell_type": "markdown",
536 |             "metadata": {},
537 |             "source": [
538 |                 "Fantastic! We have the data processing in order. Now, let's make a model specification. In this solution, we'll try out a random forest model which applies an averaging function to multiple decision tree models for a better overall model.\n",
539 |                 "\n",
540 |                 "**Question 6.**\n",
541 |                 "\n",
542 |                 "Create a random forest model specification, `rf_spec`, which uses the `randomForest` package as its engine and then set the mode to `regression`.\n",
543 |                 "\n",
544 |                 "Fill in the placeholder `....` with the right code."
545 |             ]
546 |         },
547 |         {
548 |             "cell_type": "code",
549 |             "execution_count": null,
550 |             "metadata": {},
551 |             "outputs": [],
552 |             "source": [
553 |                 "# Build a random forest model specification\n",
554 |                 "rf_spec <- rand_forest() %>%\n",
555 |                 "  # Specify engine\n",
556 |                 "  .... %>%\n",
557 |                 "  # Specify mode\n",
558 |                 "  set_mode(\"....\")\n"
559 |             ]
560 |         },
561 |         {
562 |             "cell_type": "markdown",
563 |             "metadata": {},
564 |             "source": [
565 |                 "Test your answer:\n"
566 |             ]
567 |         },
568 |         {
569 |             "cell_type": "code",
570 |             "execution_count": null,
571 |             "metadata": {},
572 |             "outputs": [],
573 |             "source": [
574 |                 ". <- ottr::check(\"tests/Question 6.R\")\n"
575 |             ]
576 |         },
577 |         {
578 |             "cell_type": "markdown",
579 |             "metadata": {},
580 |             "source": [
581 |                 "### Create a modeling workflow\n",
582 |                 "\n",
583 |                 "The *workflows* package allows the user to bind modeling and preprocessing objects together. You can then fit the entire workflow to the data, so that the model encapsulates all of the preprocessing steps as well as the algorithm.\n",
584 |                 "\n",
585 |                 "**Question 7.**\n",
586 |                 "\n",
587 |                 "Components of a `workflow()` go together like LEGO blocks. In this section, create a workflow container and then add the preprocessing information from our recipe and then add the model specification to be trained.\n",
588 |                 "\n",
589 |                 "Fill in the placeholder `....` with the right code."
590 |             ]
591 |         },
592 |         {
593 |             "cell_type": "code",
594 |             "execution_count": null,
595 |             "metadata": {},
596 |             "outputs": [],
597 |             "source": [
598 |                 "# Create a workflow that bundles a recipe and model specification\n",
599 |                 "rf_workflow <- workflow() %>%\n",
600 |                 "  # Add a recipe\n",
601 |                 "  add_recipe(....) %>%\n",
602 |                 "  # Add a model specification\n",
603 |                 "  ....\n",
604 |                 "# Print workflow\n",
605 |                 "rf_workflow\n"
606 |             ]
607 |         },
608 |         {
609 |             "cell_type": "markdown",
610 |             "metadata": {},
611 |             "source": [
612 |                 "Test your answer:\n"
613 |             ]
614 |         },
615 |         {
616 |             "cell_type": "code",
617 |             "execution_count": null,
618 |             "metadata": {},
619 |             "outputs": [],
620 |             "source": [
621 |                 ". <- ottr::check(\"tests/Question 7.R\")\n"
622 |             ]
623 |         },
624 |         {
625 |             "cell_type": "markdown",
626 |             "metadata": {},
627 |             "source": [
628 |                 "Now that we have everything (recipe + model specification) wrapped together nicely in a workflow, we are ready to train a model. Workflows have a `fit()` method that can be used to train a model.\n",
629 |                 "\n"
630 |             ]
631 |         },
632 |         {
633 |             "cell_type": "code",
634 |             "execution_count": null,
635 |             "metadata": {},
636 |             "outputs": [],
637 |             "source": [
638 |                 "# For reproducibility\n",
639 |                 "set.seed(2056)\n",
640 |                 "\n",
641 |                 "# Train a random forest model\n",
642 |                 "rf_workflow_fit <- rf_workflow %>%\n",
643 |                 "  fit(data = estate_train)\n",
644 |                 "\n",
645 |                 "# Print out the fitted workflow\n",
646 |                 "rf_workflow_fit\n"
647 |             ]
648 |         },
649 |         {
650 |             "cell_type": "markdown",
651 |             "metadata": {},
652 |             "source": [
653 |                 "Excellent! So we now have a trained random forest model; but is it any good? Let's evaluate its performance! We'll do this by making predictions on the `test data` and then evaluate some performance metrics based on the actual outcomes.\n",
654 |                 "\n",
655 |                 "**Question 8.**\n",
656 |                 "\n",
657 |                 "-   We'll evaluate the model performance based on the *rmse* and *rsq* metrics. Use the `metric_set()` function to combine these metric functions together into a new function, `eval_metrics`, that calculates all of them at once.\n",
658 |                 "\n",
659 |                 "-   Generate predictions for the test data and then bind them to the test set. Rename the column containing predictions from `.pred` to `predictions`.\n",
660 |                 "\n",
661 |                 "Fill in the placeholder `....` with the right code."
662 |             ]
663 |         },
664 |         {
665 |             "cell_type": "code",
666 |             "execution_count": null,
667 |             "metadata": {},
668 |             "outputs": [],
669 |             "source": [
670 |                 "# Create a metric set\n",
671 |                 "eval_metrics <- ....(rmse, ....)\n",
672 |                 "\n",
673 |                 "\n",
674 |                 "# Make and bind predictions to test data\n",
675 |                 "results <- rf_workflow_fit %>%\n",
676 |                 "  ....\n"
677 |             ]
678 |         },
679 |         {
680 |             "cell_type": "markdown",
681 |             "metadata": {},
682 |             "source": [
683 |                 "Test your answer:\n"
684 |             ]
685 |         },
686 |         {
687 |             "cell_type": "code",
688 |             "execution_count": null,
689 |             "metadata": {},
690 |             "outputs": [],
691 |             "source": [
692 |                 ". <- ottr::check(\"tests/Question 8.R\")\n"
693 |             ]
694 |         },
695 |         {
696 |             "cell_type": "markdown",
697 |             "metadata": {},
698 |             "source": [
699 |                 "Awesome work! You have just used your trained model to make predictions on the test set.\n",
700 |                 "\n",
701 |                 "How well did the model predict the prices per unit? Let's find out by looking at the metrics.\n"
702 |             ]
703 |         },
704 |         {
705 |             "cell_type": "code",
706 |             "execution_count": null,
707 |             "metadata": {},
708 |             "outputs": [],
709 |             "source": [
710 |                 "# Evaluate the model\n",
711 |                 "rf_metrics <- eval_metrics(data = results,\n",
712 |                 "                           truth = price_per_unit,\n",
713 |                 "                           estimate = predictions)\n",
714 |                 "\n",
715 |                 "\n",
716 |                 "# Plot predicted vs actual\n",
717 |                 "rf_plt <- results %>%\n",
718 |                 "  ggplot(mapping = aes(x = price_per_unit, y = predictions)) +\n",
719 |                 "  geom_point(color = \"darkorchid\", size = 1.6) +\n",
720 |                 "  # overlay regression line\n",
721 |                 "  geom_smooth(method = \"lm\", color = \"black\", se = F) +\n",
722 |                 "  ggtitle(\"Price per unit predictions\") +\n",
723 |                 "  xlab(\"Actual Labels\") +\n",
724 |                 "  ylab(\"Predicted Labels\") +\n",
725 |                 "  theme(plot.title = element_text(hjust = 0.5))\n",
726 |                 "\n",
727 |                 "# Return evaluations\n",
728 |                 "list(metrics = rf_metrics, evaluation_plot = rf_plt)\n"
729 |             ]
730 |         },
731 |         {
732 |             "cell_type": "markdown",
733 |             "metadata": {},
734 |             "source": [
735 |                 "How do you think the model performed? What do the values for `rsq` and `rmse` tell you? Please refer to the corresponding module for this notebook if you may need help answering these questions.\n",
736 |                 "\n",
737 |                 "## Use the Trained Model\n",
738 |                 "\n",
739 |                 "Save your trained model, and then use it to predict the price-per-unit for the following real estate transactions:\n",
740 |                 "\n",
741 |                 "| **transaction_date** | **house_age** | **transit_distance** | **local_convenience_stores** | **latitude** | **longitude** |\n",
742 |                 "|----------------------|---------------|----------------------|------------------------------|--------------|---------------|\n",
743 |                 "| 2013.167             | 16.2          | 289.3248             | 5                            | 24.98203     | 121.54348     |\n",
744 |                 "| 2013.000             | 13.6          | 4082.015             | 0                            | 24.94155     | 121.50381     |\n",
745 |                 "\n"
746 |             ]
747 |         },
748 |         {
749 |             "cell_type": "code",
750 |             "execution_count": null,
751 |             "metadata": {},
752 |             "outputs": [],
753 |             "source": [
754 |                 "library(here)\n",
755 |                 "# Save trained workflow\n",
756 |                 "saveRDS(rf_workflow_fit, \"rf_price_model.rds\")\n"
757 |             ]
758 |         },
759 |         {
760 |             "cell_type": "markdown",
761 |             "metadata": {},
762 |             "source": [
763 |                 "In this way, we can load it whenever we need it, and use it to predict labels for new data. This is often called *scoring* or *inferencing*.\n",
764 |                 "\n"
765 |             ]
766 |         },
767 |         {
768 |             "cell_type": "code",
769 |             "execution_count": null,
770 |             "metadata": {},
771 |             "outputs": [],
772 |             "source": [
773 |                 "# Create a tibble for the new real estate samples\n",
774 |                 "new_data <- tibble(\n",
775 |                 "  transaction_date = c(2013.167, 2013.000),\n",
776 |                 "  house_age = c(16.2, 13.6),\n",
777 |                 "  transit_distance = c(289.3248, 4082.015),\n",
778 |                 "  local_convenience_stores = c(5, 0),\n",
779 |                 "  latitude = c(24.98203, 24.94155),\n",
780 |                 "  longitude = c(121.54348, 121.50381))\n",
781 |                 "\n",
782 |                 "# Print out new data\n",
783 |                 "new_data\n"
784 |             ]
785 |         },
786 |         {
787 |             "cell_type": "markdown",
788 |             "metadata": {},
789 |             "source": [
790 |                 "Now that we have our data, let's load the saved model and make predictions.\n",
791 |                 "\n"
792 |             ]
793 |         },
794 |         {
795 |             "cell_type": "code",
796 |             "execution_count": null,
797 |             "metadata": {},
798 |             "outputs": [],
799 |             "source": [
800 |                 "# Load the model into the current R session\n",
801 |                 "loaded_model <- readRDS(\"rf_price_model.rds\")\n",
802 |                 "\n",
803 |                 "# Make predictions\n",
804 |                 "predictions <- loaded_model %>%\n",
805 |                 "  augment(new_data = new_data)\n",
806 |                 "\n",
807 |                 "predictions\n"
808 |             ]
809 |         },
810 |         {
811 |             "cell_type": "markdown",
812 |             "metadata": {},
813 |             "source": [
814 |                 "Congratulations for completing this challenge! In this notebook, you:\n",
815 |                 "\n",
816 |                 "-   Explored the data set to understand the relationships between the predictors and outcomes\n",
817 |                 "-   Preprocessed the data using recipes to make them easier for a model to use effectively.\n",
818 |                 "-   Made a random forest model specification.\n",
819 |                 "-   Bundles a recipe and model specification into a workflow.\n",
820 |                 "-   Trained a model.\n",
821 |                 "-   Made predictions on test set and evaluated the model performance.\n",
822 |                 "-   Saved the model, loaded it and then used it to predict labels for new data.\n",
823 |                 "\n",
824 |                 "Fantastic job for coming this far 👏! Feeling adventurous? Then, be sure to try out other regression models and tune some hyperparameters while at it.\n",
825 |                 "\n",
826 |                 "See you in our next module as we explore the realm of *classification* models!\n",
827 |                 "\n",
828 |                 "Happy Learning,\n",
829 |                 "\n",
830 |                 "[Eric](https://twitter.com/ericntay), Gold Microsoft Learn Student Ambassador.\n"
831 |             ]
832 |         }
833 |     ],
834 |     "metadata": {
835 |         "anaconda-cloud": "",
836 |         "kernelspec": {
837 |             "display_name": "R",
838 |             "language": "R",
839 |             "name": "ir"
840 |         },
841 |         "language_info": {
842 |             "codemirror_mode": "r",
843 |             "file_extension": ".r",
844 |             "mimetype": "text/x-r-source",
845 |             "name": "R",
846 |             "pygments_lexer": "r",
847 |             "version": "3.4.1"
848 |         }
849 |     },
850 |     "nbformat": 4,
851 |     "nbformat_minor": 1
852 | }
853 | 


--------------------------------------------------------------------------------
/intro-regression-R-tidymodels/solution/all-systems-check/keybindings.json:
--------------------------------------------------------------------------------
 1 | // Place your key bindings in this file to override the defaults
 2 | // Keyboard shortcuts for common R operators
 3 | [ 
 4 |   // Add assignment operator
 5 |   {
 6 |       "key": "Alt+-",
 7 |       "command": "type",
 8 |       "args": { "text": " <- " }
 9 |       //"when": "editorTextFocus && editorLangId == r"
10 |     },
11 | 
12 |   // Add pipe
13 | 
14 |     {
15 |       "key": "Ctrl+Shift+m",
16 |       "command": "type",
17 |       "args": { "text": " %>% " }
18 |       //"when": "editorTextFocus && editorLangId == r"
19 |     }
20 | 


--------------------------------------------------------------------------------
/intro-regression-R-tidymodels/solution/all-systems-check/test.R:
--------------------------------------------------------------------------------
 1 | #-------------------------------------------------------------------------------------------------------------
 2 | # Copyright (c) Microsoft Corporation. All rights reserved.
 3 | # Licensed under the MIT License. See https://go.microsoft.com/fwlink/?linkid=2090316 for license information.
 4 | #-------------------------------------------------------------------------------------------------------------
 5 | 
 6 | say_hello <- function(name) {
 7 |   message(paste0("Hello, ", name, "!"))
 8 | }
 9 | 
10 | say_hello("remote world")
11 | 


--------------------------------------------------------------------------------
/intro-regression-R-tidymodels/solution/all-systems-check/test.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "R Notebook"
 3 | output: html_notebook
 4 | ---
 5 | 
 6 | This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. 
 7 | 
 8 | Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Ctrl+Shift+Enter*. 
 9 | 
10 | ```{r}
11 | plot(cars)
12 | ```
13 | 
14 | Try a function too?
15 | ```{r}
16 | # Function that returns an awesome message
17 | say_hello <- function(name) {
18 |   message(paste0("Hello, ", name, ":) In this module, we learn how to Explore
19 |   and Analyze Data with R."))
20 | }
21 | 
22 | say_hello("remote world")
23 | 
24 | ```
25 | 
26 | 


--------------------------------------------------------------------------------
/intro-regression-R-tidymodels/solution/all-systems-check/test.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": 1,
 6 |    "metadata": {},
 7 |    "outputs": [
 8 |     {
 9 |      "name": "stdout",
10 |      "output_type": "stream",
11 |      "text": [
12 |       "[1] \"Hello from R and Python\"\n"
13 |      ]
14 |     }
15 |    ],
16 |    "source": [
17 |     "# Both Python and R have the print() function.\n",
18 |     "# This hsould work with both kernels\n",
19 |     "print(\"Hello from R and Python\")"
20 |    ]
21 |   }
22 |  ],
23 |  "metadata": {
24 |   "interpreter": {
25 |    "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
26 |   },
27 |   "kernelspec": {
28 |    "display_name": "R",
29 |    "language": "R",
30 |    "name": "ir"
31 |   },
32 |   "language_info": {
33 |    "codemirror_mode": "r",
34 |    "file_extension": ".r",
35 |    "mimetype": "text/x-r-source",
36 |    "name": "R",
37 |    "pygments_lexer": "r",
38 |    "version": "4.1.2"
39 |   },
40 |   "orig_nbformat": 4
41 |  },
42 |  "nbformat": 4,
43 |  "nbformat_minor": 2
44 | }
45 | 


--------------------------------------------------------------------------------
/intro-regression-R-tidymodels/solution/tests/Question 1.R:
--------------------------------------------------------------------------------
 1 | test = list(
 2 |   name = "Question 1",
 3 |   cases = list(
 4 |     ottr::TestCase$new(
 5 |       hidden = FALSE,
 6 |       name = NA,
 7 |       points = 0.5,
 8 |       success_message = "Great start! Your tibble dimensions are correct.",
 9 |       failure_message = "Almost there! Ensure you have filtered correctly to obtain a subset whose observations of `price_per_unit` is less than `70`. Expected dimensions [408 7]",
10 |       code = {
11 |         suppressPackageStartupMessages({
12 |           library(testthat)
13 |           library(ottr)
14 |         })
15 | 
16 |         ## Test ##
17 |         test_that('data dimensions correct', {
18 |           expect_equal(dim(estate_data), c(408, 7))
19 |           
20 |         })
21 |       }
22 |     ),
23 |     ottr::TestCase$new(
24 |       hidden = FALSE,
25 |       name = NA,
26 |       points = 0.5,
27 |       success_message = "Excellent. You have successfully created a subset whose observations of price_per_unit is less than 70.",
28 |       failure_message = "Let's give this another try. Ensure your subset contains observations where **price_per_unit** is less than 70.",
29 |       code = {
30 |         
31 | 
32 |         ## Test ##
33 |         test_that('the range of values for price per unit is within 7.6 and 69.7', {
34 |           expect_equal(range(estate_data$price_per_unit), c(7.6, 69.7))
35 |         })
36 |       }
37 |     )
38 |   )
39 | )


--------------------------------------------------------------------------------
/intro-regression-R-tidymodels/solution/tests/Question 2.R:
--------------------------------------------------------------------------------
 1 | test = list(
 2 |   name = "Question 2",
 3 |   cases = list(
 4 |     ottr::TestCase$new(
 5 |       hidden = FALSE,
 6 |       name = NA,
 7 |       points = 0.5,
 8 |       success_message = "Great start! Your tibble dimensions and corresponding columns are correct.",
 9 |       failure_message = "Almost there! Let's give this another shot.",
10 |       code = {
11 |         ## Test ##
12 |         test_that('data dimensions correct', {
13 |           expect_equal(dim(numeric_features_long), c(2448, 4))
14 |           expect_equal(sort(colnames(numeric_features_long)), c("corr_coef", "features", "price_per_unit", "values"))
15 |           
16 |         })
17 |       }
18 |     ),
19 |     ottr::TestCase$new(
20 |       hidden = FALSE,
21 |       name = NA,
22 |       points = 0.5,
23 |       success_message = "Excellent! You have successfully pivoted the tibble and found the correlation between the existing numeric column values and the price per unit.",
24 |       failure_message = "Let's give this another try. Ensure you have correctly pivoted the data to obtain two new columns **features** and **values** and then grouped by **features** and then added a new column **corr_coef** which is the correlation between **values** and **price_per_unit**. Lastly don't forget to ungroup :).",
25 |       code = {
26 |         ## Test ##
27 |         test_that('the correlation coefficients are correct', {
28 |           expect_equal(round(range(numeric_features_long$corr_coef), 7), c(-0.7087782, 0.6101017))
29 |         })
30 |       }
31 |     )
32 |   )
33 | )


--------------------------------------------------------------------------------
/intro-regression-R-tidymodels/solution/tests/Question 3.R:
--------------------------------------------------------------------------------
 1 | test = list(
 2 |   name = "Question 3",
 3 |   cases = list(
 4 |     ottr::TestCase$new(
 5 |       hidden = FALSE,
 6 |       name = NA,
 7 |       points = 0.5,
 8 |       success_message = "success_message: Fantastic! Your tibble dimensions and corresponding columns are correct.",
 9 |       failure_message = "Almost there! Ensure you have selected columns transaction_date, local_convenience_stores and price_per_unit, and then pivoted the exsiting columns except price_per_unit to obtain two new columns **features** and **values**",
10 |       code = {
11 |         ## Test ##
12 |         test_that('data dimensions correct', {
13 |           expect_equal(dim(cat_features_long), c(816, 3))
14 |           expect_equal(sort(colnames(cat_features_long)), c("features", "price_per_unit", "values"))
15 |           
16 |         })
17 |       }
18 |     ),
19 |     ottr::TestCase$new(
20 |       hidden = FALSE,
21 |       name = NA,
22 |       points = 0.5,
23 |       success_message = "Congratulations! You have successfully selected the desired columns, encoded some of them as categorical and restructured the data to a longer format.",
24 |       failure_message = "Almost there! Ensure you have selected columns transaction_date, local_convenience_stores and price_per_unit, and then encoded transaction_date & local_convenience_stores as categorical, and then pivoted the data correctly.",
25 |       code = {
26 |         ## Test ##
27 |         test_that('data contains the correct observations', {
28 |           expect_equal(sort(unique(cat_features_long$features)), c("local_convenience_stores", "transaction_date"))
29 |           expect_equal(class(cat_features_long$values), "factor")
30 |           
31 |         })
32 |       }
33 |     )
34 |   )
35 | )


--------------------------------------------------------------------------------
/intro-regression-R-tidymodels/solution/tests/Question 4.R:
--------------------------------------------------------------------------------
 1 | test = list(
 2 |   name = "Question 4",
 3 |   cases = list(
 4 |     ottr::TestCase$new(
 5 |       hidden = FALSE,
 6 |       name = NA,
 7 |       points = 0.5,
 8 |       success_message = "Fantastic! You have successfully split the data and extracted the training (70%) and testing sets (30%).",
 9 |       failure_message = "Almost there. Let's have a look at this again.Ensure that the splitting specification dictates that 70% of the data should go to training and the rest to testing.",
10 |       code = {
11 |         ## Test ##
12 |         test_that('data dimensions correct', {
13 |           expect_equal(dim(estate_train), c(285, 7))
14 |           expect_equal(dim(estate_test), c(123, 7))
15 |           
16 |         })
17 |       }
18 |     )
19 |     
20 |   )
21 | )


--------------------------------------------------------------------------------
/intro-regression-R-tidymodels/solution/tests/Question 5.R:
--------------------------------------------------------------------------------
 1 | test = list(
 2 |   name = "Question 5",
 3 |   cases = list(
 4 |     ottr::TestCase$new(
 5 |       hidden = TRUE,
 6 |       name = NA,
 7 |       points = 1.0,
 8 |       success_message = "Good job. You have correctly specified a recipe that will remove the `transaction_date` feature, transform `local_convenience_stores` feature into categorical (factor) and then center and scale all numeric predictors.",
 9 |       failure_message = "Almost there. Ensure your recipe specification will remove the `transaction_date` feature, transform `local_convenience_stores` feature into categorical (factor) and then center and scale all numeric predictors.",
10 |       code = {
11 |         ## Test ##
12 |         test_that('recipe specification is correct', {
13 |           
14 |           # Test for step_rm
15 |           expect_equal(attr(estate_recipe[["steps"]][[1]], "class"), c("step_rm","step"))
16 |           expect_equal(as_label(estate_recipe[["steps"]][[1]][["terms"]][[1]]), "transaction_date")
17 |           
18 |           # Test for step_mutate
19 |           expect_equal(attr(estate_recipe[["steps"]][[2]], "class"), c("step_mutate","step"))
20 |           expect_equal(as_label(estate_recipe[["steps"]][[2]][["inputs"]][["local_convenience_stores"]]), "factor(local_convenience_stores)")
21 |           
22 |           # Test for step_normalize
23 |           expect_equal(attr(estate_recipe[["steps"]][[3]], "class"), c("step_normalize","step"))
24 |           expect_equal(as_label(estate_recipe[["steps"]][[3]][["terms"]][[1]]), "all_numeric_predictors()")
25 |           
26 |           
27 |         })
28 |       }
29 |     )
30 |   )
31 | )


--------------------------------------------------------------------------------
/intro-regression-R-tidymodels/solution/tests/Question 6.R:
--------------------------------------------------------------------------------
 1 | test = list(
 2 |   name = "Question 6",
 3 |   cases = list(
 4 |     ottr::TestCase$new(
 5 |       hidden = FALSE,
 6 |       name = NA,
 7 |       points = 1.0,
 8 |       success_message = "Excellent! Your model specification is looking great!",
 9 |       failure_message = "Let's have a look at this again. Ensure you have set your engine to **randomForest** and the mode to **regression**.",
10 |       code = {
11 |         ## Test ##
12 |         test_that('the model specification is correct', {
13 |           expect_equal(rf_spec$mode, "regression")
14 |           expect_equal(rf_spec$engine, "randomForest")
15 |           
16 |           
17 |         })
18 |       }
19 |     )
20 |   )
21 | )


--------------------------------------------------------------------------------
/intro-regression-R-tidymodels/solution/tests/Question 7.R:
--------------------------------------------------------------------------------
 1 | test = list(
 2 |   name = "Question 7",
 3 |   cases = list(
 4 |     ottr::TestCase$new(
 5 |       hidden = FALSE,
 6 |       name = NA,
 7 |       points = 1.0,
 8 |       success_message = "",
 9 |       failure_message = "",
10 |       code = {
11 |         ## Test ##
12 |         test_that('workflow specification is correct', {
13 |           
14 |           # Test for step_rm
15 |           expect_equal(attr(rf_workflow[["pre"]][["actions"]][["recipe"]][["recipe"]][["steps"]][[1]], "class"), c("step_rm","step"))
16 |           expect_equal(as_label(rf_workflow[["pre"]][["actions"]][["recipe"]][["recipe"]][["steps"]][[1]][["terms"]][[1]]), "transaction_date")
17 |           
18 |           # Test for step_mutate
19 |           expect_equal(attr(rf_workflow[["pre"]][["actions"]][["recipe"]][["recipe"]][["steps"]][[2]], "class"), c("step_mutate","step"))
20 |           expect_equal(as_label(rf_workflow[["pre"]][["actions"]][["recipe"]][["recipe"]][["steps"]][[2]][["inputs"]][["local_convenience_stores"]]), "factor(local_convenience_stores)")
21 |           
22 |           # Test for step_normalize
23 |           expect_equal(attr(rf_workflow[["pre"]][["actions"]][["recipe"]][["recipe"]][["steps"]][[3]], "class"), c("step_normalize","step"))
24 |           expect_equal(as_label(rf_workflow[["pre"]][["actions"]][["recipe"]][["recipe"]][["steps"]][[3]][["terms"]][[1]]), "all_numeric_predictors()")
25 |           
26 |           
27 |           
28 |         })
29 |       }
30 |     )
31 |   )
32 | )


--------------------------------------------------------------------------------
/intro-regression-R-tidymodels/solution/tests/Question 8.R:
--------------------------------------------------------------------------------
 1 | test = list(
 2 |   name = "Question 8",
 3 |   cases = list(
 4 |     ottr::TestCase$new(
 5 |       hidden = FALSE,
 6 |       name = NA,
 7 |       points = 1.0,
 8 |       success_message = "Fantastic! You have successfully used the trained model to make predictions for the test set and then binded the predictions to the test set.",
 9 |       failure_message = "Almost there! Generate predictions for the test data and then bind them to the test set. Hints: augment or predict + bind_cols functions. Also don't forget to rename your .pred column.",
10 |       code = {
11 |         ## Test ##
12 |         test_that('the model specification is correct', {
13 |           expect_equal(dim(results), c(123, 8))
14 |           expect_equal(sort(colnames(results)), c("house_age", "latitude", "local_convenience_stores", "longitude", "predictions", "price_per_unit", "transaction_date", "transit_distance"))
15 |           
16 |           
17 |         })
18 |       }
19 |     )
20 |   )
21 | )


--------------------------------------------------------------------------------
/intro-regression-R-tidymodels/workshop-designer.md:
--------------------------------------------------------------------------------
 1 | # Introduction to regression models by using R and Tidymodels 
 2 | 
 3 | ## Workshop Source
 4 | [Introduction to regression models by using R and tidymodels](https://docs.microsoft.com/en-us/learn/modules/introduction-regression-models/?WT.mc_id=academic-59300-cacaste)
 5 | 
 6 | ## Stage 1: Desired Results 
 7 | 
 8 | 1. Students will be skilled at:
 9 | - Understanding what is regression and when to use a regression model
10 | - Training and evaluating regression models using the Tidymodels framework
11 | 1. Students will be able to independently use their learning to:
12 | - Experimenting with different regression models 
13 | - Tuning models hyperparameters
14 | 
15 | ## Stage 2: Evidence
16 |  
17 | - Hands-on challenge on real estate dataset, following [Challenge-regression.ipynb](./solution/Challenge-regression.ipynb) notebook
18 | - Knowledge check quiz 
19 | 
20 | ## Stage 3: Learning Plan
21 | 
22 | - Introduction
23 | - Train and evaluate a regression model
24 | - Experimenting with models
25 | - Challenge - Create a regression model using Tidymodels
26 | - Knowledge check


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | pandas
 2 | numpy 
 3 | flask
 4 | jupyter
 5 | datascience 	
 6 | pybryt
 7 | scipy 
 8 | folium
 9 | matplotlib 
10 | ipywidgets>=7.0.0 
11 | bqplot 
12 | nbinteract==0.0.12 
13 | otter-grader 
14 | okpy
15 | scikit-learn


--------------------------------------------------------------------------------