├── Data_Science_Cheatsheet.pdf
├── Data_Science_Cheatsheet.tex
├── README.md
└── images
├── CART.JPG
├── autoencodeer1.JPG
├── dendroedit3.JPG
├── factorNew1.JPG
├── hingeloss3.JPG
├── nn3.JPG
├── page1-1.png
├── page2-1.png
├── reinforcement4.JPG
├── relu.JPG
├── rnn1.JPG
├── sigmoid1.JPG
├── svmNew2.JPG
├── tanh.JPG
└── windowCNNNew.JPG
/Data_Science_Cheatsheet.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaronwangy/Data-Science-Cheatsheet/705760c47646633c145abb909c4e1f05812581ed/Data_Science_Cheatsheet.pdf
--------------------------------------------------------------------------------
/Data_Science_Cheatsheet.tex:
--------------------------------------------------------------------------------
1 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2 | \documentclass[10pt,landscape]{article}
3 | \usepackage{amssymb,amsmath,amsthm,amsfonts}
4 | \usepackage{multicol,multirow}
5 | \DeclareMathOperator*{\argmax}{arg\,max}
6 | \DeclareMathOperator*{\argmin}{arg\,min}
7 | \usepackage{calc}
8 | \usepackage{tikz}
9 | \usepackage{ifthen}
10 | \usepackage{textcomp}
11 | \usepackage{xcolor}
12 | \usepackage{graphicx}
13 | \graphicspath{ {./images/} }
14 | \usepackage{enumitem}
15 | \usepackage{bm}
16 | \usepackage{titlesec}
17 | \usepackage[landscape]{geometry}
18 | \usepackage{fancyhdr}
19 | \usepackage[colorlinks=true,citecolor=blue,linkcolor=blue]{hyperref}
20 | %------------------------------------
21 | \ifthenelse{\lengthtest { \paperwidth = 11in}}
22 | { \geometry{top=.4in,left=.5in,right=.5in,bottom=.4in} }
23 | {\ifthenelse{ \lengthtest{ \paperwidth = 297mm}}
24 | {\geometry{top=1cm,left=1cm,right=1cm,bottom=1cm} }
25 | {\geometry{top=1cm,left=1cm,right=1cm,bottom=1cm} }
26 | }
27 | \pagestyle{fancy}
28 | \fancyhf{}
29 | % Remove line
30 | \renewcommand{\headrulewidth}{0pt}
31 | \cfoot{\fontsize{9pt}{11pt}\selectfont Aaron Wang}
32 | \setlength{\footskip}{16pt} % amount to move footer by
33 | % Remember to call your parents and tell them you love them!
34 |
35 | % Define smaller plus sign
36 | \newcommand{\plus}{\raisebox{.3\height}{\scalebox{.7}{+}}}
37 |
38 | \makeatletter
39 | \renewcommand{\section}{\@startsection{section}{1}{0mm}%
40 | {-1ex plus -.5ex minus -.2ex}%
41 | {0.5ex plus .2ex}%x
42 | {\normalfont\large\bfseries}}
43 | \renewcommand{\subsection}{\@startsection{subsection}{2}{0mm}%
44 | {-1ex plus -.5ex minus -.2ex}%
45 | {0.5ex plus .2ex}%
46 | {\normalfont\normalsize\bfseries}}
47 | \renewcommand{\subsubsection}{\@startsection{subsubsection}{3}{0mm}%
48 | {-1ex plus -.5ex minus -.2ex}%
49 | {1ex plus .2ex}%
50 | {\normalfont\small\bfseries}}
51 | \makeatother
52 | \setcounter{secnumdepth}{0}
53 | \setlength{\parindent}{0pt}
54 | \setlength{\parskip}{0pt plus 0.5ex}
55 | % ----------------------------------------------------
56 |
57 | \title{Data Science Cheatsheet}
58 | \begin{document}
59 |
60 | \raggedright
61 | \footnotesize
62 |
63 | \begin{center}
64 | \vspace{-50mm}
65 | \Large{\vspace{-15mm}\textbf{Data Science Cheatsheet 2.0}} \\
66 | \footnotesize{Last Updated \today}
67 | \vspace{-.4mm}
68 | \end{center}
69 | \begin{multicols}{3}
70 | \setlength{\premulticols}{1pt}
71 | \setlength{\postmulticols}{1pt}
72 | \setlength{\multicolsep}{1pt}
73 | \setlength{\columnsep}{2pt}
96 | % --------------------------------------------------------------
97 | \section{Distributions}
98 | \subsection{Discrete}
99 | \textbf{Binomial} - $x$ successes in $n$ events, each with $p$ probability $\to \binom{n}x p^x q^{n-x}$, with $\mu = np$ and $\sigma^2 = npq$
100 | \begin{itemize}[label={--},leftmargin=4mm]
101 | \itemsep -.4mm
102 | \vspace{-.5mm}
103 | \item If n = 1, this is a Bernoulli distribution
104 | \end{itemize}
106 |
107 | \textbf{Geometric} - first success with $p$ probability on the $n^{th}$ trial $\to q^{n-1}p$, with $\mu = 1/p$ and $\sigma^2 = \frac{1-p}{p^2}$
108 |
109 | \textbf{Negative Binomial} - number of failures before $r$ successes
110 |
111 | \textbf{Hypergeometric} - $x$ successes in $n$ draws, no replacement, from a size $N$ population with $X$ items of that feature
112 | \vspace{.5mm}
113 |
114 | $\to \frac{\binom{X}{x} \binom{N-X}{n-x}}{\binom{N}{n}}$, with $\mu = \frac{nX}{N}$
115 |
116 | \textbf{Poisson} - number of successes $x$ in a fixed time interval, where success occurs at an average rate $\lambda$ $\to$ $\frac{\lambda^xe^{-\lambda}}{x!}$, with $\mu = \sigma^2 = \lambda$
117 | % ---------------------------------------------------------
118 | \subsection{Continuous}
119 | \textbf{Uniform} - all values between $a$ and $b$ are equally likely $\to \frac{1}{b-a}$ with $\mu = \frac{a+b}{2}$ and $\sigma^2 = \frac{(b-a)^2}{12}$ or $\frac{n^2 - 1}{12}$ if discrete
120 |
121 | \textbf{Normal/Gaussian} $N(\mu,\sigma$), Standard Normal $Z\sim N(0, 1)$
122 |
123 | \begin{itemize}[label={--},leftmargin=4mm]
124 | \itemsep -.4mm
125 | \vspace{-.5mm}
126 | \item Central Limit Theorem - sample mean of i.i.d. data approaches normal distribution
127 | \item Empirical Rule - 68\%, 95\%, and 99.7\% of values lie within one, two, and three standard deviations of the mean
128 | \item Normal Approximation - discrete distributions such as Binomial and Poisson can be approximated using z-scores when $np$, $nq$, and $\lambda$ are greater than 10
129 | \end{itemize}
130 |
131 | \textbf{Exponential} - memoryless time between independent events occurring at an average rate $\lambda$ $\to \lambda e^{-\lambda x}$, with $\mu$ = $\frac{1}{\lambda}$
132 |
133 | \textbf{Gamma} - time until $n$ independent events occurring at an average rate $\lambda$
134 |
135 | % ---------------------------------------------------------------
136 | \section{Concepts}
137 | Prediction Error = Bias$^2$ + Variance + Irreducible Noise
138 |
139 | \textbf{Bias} - wrong assumptions when training $\to$ can't capture underlying patterns $\to$ underfit
140 |
141 | \textbf{Variance} - sensitive to fluctuations when training$\to$ can't generalize on unseen data $\to$ overfit
142 |
143 | The bias-variance tradeoff attempts to minimize these two sources of error, through methods such as:
144 | \begin{itemize}[label={--},leftmargin=4mm]
145 | \itemsep -.4mm
146 | \vspace{-.5mm}
147 | \item Cross validation to generalize to unseen data
148 | \item Dimension reduction and feature selection
149 | \end{itemize}
150 | \vspace{-.5mm}
151 | In all cases, as variance decreases, bias increases.
152 |
153 | \vspace{.5mm}
154 | ML models can be divided into two types:
155 | \vspace{-.5mm}
156 | \begin{itemize}[label={--},leftmargin=4mm]
157 | \itemsep -.4mm
158 | \item Parametric - uses a fixed number of parameters with respect to sample size
159 | \item Non-Parametric - uses a flexible number of parameters and doesn't make particular assumptions on the data
160 | \end{itemize}
161 |
162 | \textbf{Cross Validation} - validates test error with a subset of training data, and selects parameters to maximize average performance
163 | \begin{itemize}[label={--},leftmargin=4mm]
164 | \itemsep -.4mm
165 | \vspace{-1mm}
166 | \item $k$-fold - divide data into $k$ groups, and use one to validate
167 | \item leave-$p$-out - use $p$ samples to validate and the rest to train
168 | \end{itemize}
169 |
170 | \columnbreak
171 | % -------------------------------------------------
172 | \section{Model Evaluation}
173 | % -----------------------------------------------
174 | \subsection{Regression}
175 | \textbf{Mean Squared Error} (MSE) = $\frac{1}{n}\sum (y_i -\hat{y})^2$
176 | \vspace{.1em}
177 |
178 | Sum of Squared Error (SSE) = $\sum (y_i - \hat{y})^2$
179 |
180 | Total Sum of Squares (SST) = $\sum (y_i - \bar{y})^2$
181 | \vspace{.1em}
182 |
183 | $\boldsymbol{R^2} = 1 - \frac{SSE}{SST}$, the proportion of explained $y$-variability
184 |
185 | Note, a negative $R^2$ means the model is worse than just predicting the mean. $R^2$ is not valid for nonlinear models, as $SS_{regression} \plus SS_{error} \neq SST$.
186 |
187 |
188 | \textbf{Adjusted} $\boldsymbol{R^2} = 1 - (1-R^2)\frac{N-1}{N-p-1}$, which only increases when a new predictor improves $R^2$ more than would be expected by chance\\
189 |
190 |
191 | % Adjust space between table columns
192 | \renewcommand{\tabcolsep}{5pt}
193 | \subsection{Classification}
194 | \begin{center}
195 | \footnotesize
196 | \begin{tabular}{ |c|c|c| }
197 | \hline
198 | & Predict Yes & Predict No \\
199 | \hline
200 | Actual Yes & True Positive ($1-\beta$) & False Negative ($\beta$) \\
201 | Actual No & False Positive ($\alpha$) & True Negative ($1-\alpha$) \\
202 | \hline
203 | \end{tabular}
204 | \end{center}
205 | \vspace{-1mm}
206 | \begin{itemize}[label={--},leftmargin=4mm]
207 | \vspace{-1mm}
208 | \itemsep -.4mm
209 | \item Precision = $\frac{TP}{TP + FP}$, percent correct when predict positive
210 | \item Recall, Sensitivity = $\frac{TP}{TP + FN}$, percent of actual positives identified correctly (True Positive Rate)
211 | \item Specificity = $\frac{TN}{TN + FP}$, percent of actual negatives identified correctly, also 1 - FPR (True Negative Rate)
212 | \item $F_1 = 2\frac{precision\cdot recall}{precision + recall}$, useful when classes are imbalanced
213 | \end{itemize}
214 |
215 |
216 | \textbf{ROC Curve} - plots TPR vs. FPR for every threshold $\alpha$. The Area Under the Curve measures how well the model distinguishes positives from negatives (perfect AUC = 1, random baseline = 0.5).
217 |
218 | \textbf{Precision-Recall Curve} - focuses on the correct prediction of the minority class, useful when data is imbalanced
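
As a quick illustration, these metrics can be computed with scikit-learn (assuming it is available; the labels and scores below are made up):
\begin{verbatim}
from sklearn.metrics import (precision_score,
    recall_score, f1_score, roc_auc_score)
y_true = [1, 0, 1, 1, 0, 0]
y_prob = [.9, .4, .3, .8, .2, .6]
y_pred = [int(p > 0.5) for p in y_prob]
print(precision_score(y_true, y_pred),
      recall_score(y_true, y_pred),
      f1_score(y_true, y_pred),
      roc_auc_score(y_true, y_prob))
\end{verbatim}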
219 |
220 | % ----------------------------------------------------------------
221 | \section{Linear Regression}
222 | Models linear relationships between a continuous response and explanatory variables
223 |
224 | \textbf{Ordinary Least Squares} - find $\hat{\beta}$ for $\hat{y} = \hat{\beta_{0}} + \hat{\beta}X + \epsilon$
225 | by solving $\hat{\beta}$ = $(X^{T}X)^{-1}X^{T}Y$ which minimizes the SSE
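
A rough NumPy sketch of the closed-form fit above (toy data; the design matrix includes an intercept column):
\begin{verbatim}
import numpy as np
# toy data: column of ones = intercept
X = np.array([[1, 0.], [1, 1.],
              [1, 2.], [1, 3.]])
y = np.array([1., 3., 5., 7.])
# beta_hat = (X^T X)^{-1} X^T y
beta = np.linalg.inv(X.T @ X) @ X.T @ y
# np.linalg.lstsq(X, y, rcond=None) is the
# numerically safer equivalent
\end{verbatim}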
226 |
227 | \textbf{Assumptions}
228 | \begin{itemize}[label={--},leftmargin=4mm]
229 | \vspace{-1mm}
230 | \itemsep -.4mm
231 | \item Linear relationship and independent observations
232 | \item Homoscedasticity - error terms have constant variance
233 | \item Errors are uncorrelated and normally distributed
234 | \item Low multicollinearity
235 | \end{itemize}
236 |
237 | \textbf{Variance Inflation Factor} - measures the severity of multicollinearity $\to$ $\frac{1}{1-{R_i}^2}$, where ${R_i}^2$ is found by regressing $X_i$ against all other variables (a common VIF cutoff is 10)
238 |
239 | \textbf{Regularization}
240 |
241 | Add a penalty $\lambda$ for large coefficients to the cost function, which reduces overfitting. Requires normalized data.
242 |
246 | \textbf{Subset} $(L_0)$: $\lambda ||\hat{\beta}||_0 = \lambda (number \;of\;non\hspace{-.7mm}-\hspace{-.7mm}zero\; variables)$
248 | \begin{itemize}[label={--},leftmargin=4mm]
249 | \vspace{-1mm}
250 | \itemsep -.4mm
251 | \item Computationally slow, need to fit $2^k$ models
252 | \item Alternatives: forward and backward stepwise selection
253 | \end{itemize}
257 | \textbf{LASSO} $(L_1)$: $\lambda ||\hat{\beta}||_1 = \lambda\sum | \hat{\beta} |$
259 | \begin{itemize}[label={--},leftmargin=4mm]
260 | \vspace{-1mm}
261 | \itemsep -.4mm
262 | \item Shrinks coefficients to zero, and is robust to outliers
263 | \end{itemize}
267 | \textbf{Ridge} $(L_2)$: $\lambda ||\hat{\beta}||_2 = \lambda\sum( \hat{\beta})^2$
269 | \begin{itemize}[label={--},leftmargin=4mm]
270 | \vspace{-1mm}
271 | \itemsep -.4mm
272 | \item Reduces effects of multicollinearity
273 | \end{itemize}
274 | Combining LASSO and Ridge gives Elastic Net\\
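
A minimal scikit-learn sketch of the three penalties (here \texttt{alpha} plays the role of $\lambda$, and the settings are illustrative, not tuned):
\begin{verbatim}
import numpy as np
from sklearn.linear_model import (Lasso,
    Ridge, ElasticNet)
X = np.random.randn(100, 5)
y = X[:, 0] + 0.1 * np.random.randn(100)
for m in (Lasso(alpha=0.1), Ridge(alpha=1.0),
          ElasticNet(alpha=0.1, l1_ratio=0.5)):
    print(m.fit(X, y).coef_)
\end{verbatim}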
275 |
276 | \columnbreak
277 | % ----------------------------------------------------------------
278 | \section{Logistic Regression}
279 | Predicts probability that $y$ belongs to a binary class. Estimates $\beta$ through maximum likelihood estimation (MLE) by fitting a logistic (sigmoid) function to the data. This is equivalent to minimizing the cross entropy loss. Regularization can be added in the exponent.
280 | \vspace{-3mm}
281 | \begin{center}
282 | $\displaystyle P(Y=1) = \frac{1}{1 + e^{-(\beta_0 + \beta x)}}$
283 |
284 | \end{center}
285 | \vspace{-2mm}
286 | The threshold $a$ classifies predictions as either 1 or 0
287 |
288 | \textbf{Assumptions}
289 | \begin{itemize}[label={--},leftmargin=4mm]
290 | \vspace{-1mm}
291 | \itemsep -.4mm
292 | \item Linear relationship between X and log-odds of Y
293 | \item Independent observations
294 | \item Low multicollinearity
295 | \end{itemize}
296 | \textbf{Odds} - output probability can be transformed using $Odds(Y = 1) = \frac{P(Y=1)}{1-P(Y=1)}$, where a probability of $\frac{1}{3}$ corresponds to 1:2 odds
297 | Coefficients are linearly related to the log-odds, such that a one unit increase in $x_1$ multiplies the odds by $e^{\beta_1}$
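
As a sketch (dataset and settings chosen only for illustration), scikit-learn's fitted coefficients can be exponentiated to recover the odds multipliers described above:
\begin{verbatim}
import numpy as np
from sklearn.linear_model import (
    LogisticRegression)
from sklearn.datasets import load_breast_cancer
X, y = load_breast_cancer(return_X_y=True)
clf = LogisticRegression(max_iter=5000)
clf.fit(X, y)
odds_ratios = np.exp(clf.coef_)  # e^beta_i
\end{verbatim}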
298 | % -------------------------------
299 | \vspace{-.5mm}
300 | \section{Decision Trees}
301 | \subsection{Classification and Regression Tree}
302 | CART for regression minimizes SSE by splitting data into sub-regions and predicting the average value at leaf nodes.
303 |
304 | The complexity parameter $cp$ only keeps splits that reduce loss by at least $cp$ (small $cp$ $\to$ deep tree)
305 | \smallskip
306 | \begin{center}
307 | \vspace{-1mm}
308 | \includegraphics[scale = .08]{images/CART.JPG}
309 | \end{center}
310 | \vspace{-2mm}
311 | CART for classification minimizes the sum of region impurity, \\
312 | where $\hat{p_i}$ is the probability of a sample being in category $i$.
313 | Possible measures, each maximized when classes are evenly split:
314 | \begin{itemize}[label={--},leftmargin=4mm]
315 | \vspace{-1mm}
321 | \itemsep -.4mm
322 | \item Gini Impurity = $1 - \sum (\hat{p_i})^2$
323 | \item Cross Entropy = $-\sum (\hat{p_i}) \log_2(\hat{p_i})$
325 | \end{itemize}
326 | At each leaf node, CART predicts the most frequent category, assuming false negative and false positive costs are the same. The splitting process handles multicollinearity and outliers. Trees are prone to high variance, so tune through CV.
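
A brief scikit-learn sketch (toy data; that library's \texttt{ccp\_alpha} is its cost-complexity analogue of $cp$):
\begin{verbatim}
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True)
tree = DecisionTreeClassifier(
    criterion="gini", ccp_alpha=0.01)
tree.fit(X, y)
print(tree.get_depth(), tree.score(X, y))
\end{verbatim}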
327 |
328 | \subsection{Random Forest}
329 | Trains an ensemble of trees that vote for the final prediction
330 |
331 | \textbf{Bootstrapping} - sampling with replacement (will contain duplicates), until the sample is as large as the training set
332 |
333 | \textbf{Bagging} - training independent models on different subsets of the data, which reduces variance. Each tree is trained on $\sim$63\% of the data, so the out-of-bag 37\% can estimate prediction error without resorting to CV.
334 |
335 | Deep trees may overfit, but adding more trees does not cause overfitting. The ensemble's bias stays the same as that of an individual tree.
336 |
337 | \textbf{Variable Importance} - ranks variables by their ability to minimize error when split upon, averaged across all trees
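
A short sketch of an out-of-bag estimate and impurity-based importances in scikit-learn (illustrative settings):
\begin{verbatim}
from sklearn.ensemble import (
    RandomForestClassifier)
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True)
rf = RandomForestClassifier(n_estimators=200,
    oob_score=True, random_state=0).fit(X, y)
print(rf.oob_score_)            # OOB accuracy
print(rf.feature_importances_)  # importance
\end{verbatim}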
338 | % ----------------------------------------------------------------
339 | \columnbreak
340 | % ----------------------------------------------------------------
341 | \\\textcolor{white}{.}\vspace{-5mm}\\ % Add space above column
342 |
343 | % ----------------------------------------------------------------
344 | \section{Support Vector Machines}
345 | Separates data between two classes by maximizing the margin between the hyperplane and the nearest data points of any class. Relies on the following:
346 | \vspace{-2.5mm}
347 | \begin{center}
348 | \includegraphics[scale = .23]{images/svmNew2.JPG}
349 | \end{center}
350 | \vspace{-2mm}
351 | \textbf{Support Vector Classifiers} - account for outliers through the regularization parameter $C$, which penalizes misclassifications in the margin by a factor of $C > 0$
352 |
353 | \textbf{Kernel Functions} - solve nonlinear problems by computing the similarity between points $a$, $b$ and mapping the data to a higher dimension. Common functions:
354 | \begin{itemize}[label={--},leftmargin=4mm]
355 | \vspace{-1mm}
356 | \itemsep -.4mm
357 | \item Polynomial $(ab + r)^d$
358 | \item Radial $e^{-\gamma(a-b)^2}$, where smaller $\gamma \to$ smoother boundaries
359 | \end{itemize}
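
A minimal scikit-learn sketch of these kernels ($C$, $\gamma$, and the data are arbitrary, not tuned):
\begin{verbatim}
from sklearn.svm import SVC
from sklearn.datasets import make_moons
X, y = make_moons(noise=0.2, random_state=0)
rbf = SVC(kernel="rbf", C=1.0, gamma=0.5)
poly = SVC(kernel="poly", degree=3, coef0=1)
print(rbf.fit(X, y).score(X, y),
      poly.fit(X, y).score(X, y))
\end{verbatim}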
360 |
361 | \textbf{Hinge Loss} - $\max(0,1-y_i(w^T x_i - b))$, where
362 | $w$ is the normal (weight) vector that sets the margin width, $b$ is the offset bias, and classes are labeled $\pm1$. Acts as the cost function for SVM. Note, even a correct prediction inside the margin gives loss $>$ 0.
363 | \vspace{-1mm}
364 | \begin{center}
365 | \includegraphics[scale = .105]{images/hingeloss3.JPG}
366 | \end{center}
367 | \vspace{-3.5mm}
368 | \subsection{Multiclass Prediction}
369 | To classify data with 3$\plus$ classes $C$, a common method is to binarize the problem through:
370 | \begin{itemize}[label={--},leftmargin=4mm]
371 | \vspace{-1mm}
372 | \itemsep -.4mm
373 | \item One vs. Rest - train a classifier for each class $c_i$ by setting $c_i$'s samples as 1 and all others as 0, and predict the class with the highest confidence score
374 | \item One vs. One - train $\frac{C (C-1)}{2}$ models for each pair of classes, and predict the class with the highest number of positive predictions
375 | \end{itemize}
376 |
377 | % ---------------------------------------------------
378 | \section{k-Nearest Neighbors}
379 | Non-parametric method that calculates $\hat{y}$ using the average value or most common class of its $k$-nearest points. For high-dimensional data, information is lost through equidistant vectors, so dimension reduction is often applied prior to $k$-NN.
380 |
381 | \textbf{Minkowski Distance} = $(\sum|a_i - b_i|^p)^{1/p}$
382 | \begin{itemize}[label={--},leftmargin=4mm]
383 | \itemsep -.4mm
384 | \item p = 1 gives Manhattan distance ${\sum|a_i - b_i|}$
385 | \item p = 2 gives Euclidean distance $\sqrt{\sum(a_i - b_i)^2}$
386 | \end{itemize}
387 |
388 | \textbf{Hamming Distance} - count of the differences between two vectors, often used to compare categorical variables \\
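
The distances above in plain NumPy (vectors chosen only for illustration):
\begin{verbatim}
import numpy as np
a = np.array([1., 2., 3.])
b = np.array([2., 4., 6.])
manhattan = np.abs(a - b).sum()          # p = 1
euclidean = np.sqrt(((a - b)**2).sum())  # p = 2
hamming = (a != b).sum()
print(manhattan, euclidean, hamming)
\end{verbatim}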
389 |
390 | \columnbreak
391 | % ----------------------------------------------------------------
392 | \textcolor{white}{.}\vspace{-5mm}\\ % Add space above column
393 | \section{Clustering}
394 | Unsupervised, non-parametric methods that group similar data points together based on distance
395 | \subsection{k-Means}
396 | Randomly place $k$ centroids across normalized data, and assign observations to the nearest centroid. Recalculate centroids as the mean of assignments and repeat until convergence. Using the median or medoid (actual data point) may be more robust to noise and outliers. $k$-modes is used for categorical data.
397 |
398 | \def\Plus{\texttt{+}}
399 | $\boldsymbol{k}$\textbf{-means}\Plus\Plus\hspace{1mm}- improves selection of initial clusters
400 | \begin{enumerate}[leftmargin=5mm]
401 | \itemsep -.4mm
402 | \item Pick the first center randomly
403 | \item Compute distance between points and the nearest center
404 | \item Choose new center using a weighted probability distribution proportional to distance
405 | \item Repeat until $k$ centers are chosen
406 | \end{enumerate}
407 |
408 | Evaluating the number of clusters and performance:
409 |
410 | \textbf{Silhouette Value} - measures how similar a data point is to its own cluster compared to other clusters, and ranges from 1 (best) to -1 (worst).
411 |
412 | \textbf{Davies-Bouldin Index} - ratio of within cluster scatter to between cluster separation, where lower values are better
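
A compact sketch using scikit-learn ($k$ and the blob data are illustrative):
\begin{verbatim}
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import (silhouette_score,
    davies_bouldin_score)
X, _ = make_blobs(n_samples=300, centers=4,
                  random_state=0)
km = KMeans(n_clusters=4, init="k-means++",
            n_init=10).fit(X)
print(silhouette_score(X, km.labels_),
      davies_bouldin_score(X, km.labels_))
\end{verbatim}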
413 | % ----------------------------------------------------------------
414 | \subsection{Hierarchical Clustering}
415 |
416 | Clusters data into groups using a predominant hierarchy
417 |
418 | \textbf{Agglomerative Approach}
419 | \begin{enumerate}[leftmargin=5mm]
420 | \itemsep -.4mm
421 | \item Each observation starts in its own cluster
422 | \item Iteratively combine the most similar cluster pairs
423 | \item Continue until all points are in the same cluster
424 | \end{enumerate}
425 |
426 | \textbf{Divisive Approach} - all points start in one cluster and splits are performed recursively down the hierarchy
427 |
428 | \textbf{Linkage Metrics} - measure dissimilarity between clusters; at each step the pair of clusters with the smallest linkage value is merged. Common linkages compare:
429 | \begin{itemize}[label={--},leftmargin=4mm]
430 | \itemsep -.4mm
431 | \item Single - the distance between the closest pair of points
432 | \item Complete - the distance between the farthest pair of points
433 | \item Ward's - the increase in within-cluster SSE if two clusters were to be combined
434 | \end{itemize}
435 | \textbf{Dendrogram} - plots the full hierarchy of clusters, where the height of a node indicates the dissimilarity between its children
436 | \begin{center}
437 | \includegraphics[scale = .1]{images/dendroedit3.JPG}
438 | \end{center}
439 |
440 | \columnbreak
441 | % ----------------------------------------------------------------% ----------------------------------------------------------------
442 | \textcolor{white}{.}\vspace{-5mm}\\ % Add space above column
443 | \section{Dimension Reduction}
444 | High-dimensional data can lead to the \emph{curse of dimensionality}, which increases the risk of overfitting and decreases the value added. The number of samples for each feature combination quickly becomes sparse, reducing model performance.
445 |
446 | \subsection{Principal Component Analysis}
447 | Projects data onto orthogonal vectors that maximize variance.
448 | Remember, given an $n\times n$ matrix $A$, a nonzero vector $\vec{x}$, and a scalar $\lambda$, if $A\vec{x} = \lambda \vec{x}$ then $\vec{x}$ and $\lambda$ are an eigenvector and eigenvalue of $A$. In PCA, the eigenvectors are uncorrelated and represent principal components.
449 | \begin{enumerate}[leftmargin=5mm]
450 | \itemsep -.4mm
451 | \item Start with the covariance matrix of standardized data
452 | \item Calculate eigenvalues and eigenvectors using SVD or eigendecomposition
453 | \item Rank the principal components by their proportion of variance explained = $\frac{\lambda_i}{\sum{\lambda}}$
454 | \end{enumerate}
455 |
456 | Data should be linearly related, and for a $p$-dimensional dataset, there will be $p$ principal components.
457 |
458 | Note, PCA explains the variance in X, not necessarily Y.
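
A small scikit-learn sketch of the steps above (standardize, fit, inspect explained variance):
\begin{verbatim}
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
X, _ = load_iris(return_X_y=True)
Z = StandardScaler().fit_transform(X)
pca = PCA(n_components=2).fit(Z)
print(pca.explained_variance_ratio_)
\end{verbatim}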
459 |
460 | \textbf{Sparse PCA} - constrains the number of non-zero values in each component, reducing susceptibility to noise and improving interpretability
461 | % ----------------------------------------------------------------
462 | \subsection{Linear Discriminant Analysis}
463 | Supervised method that maximizes separation between classes and minimizes variance within classes for a labeled dataset
464 | \begin{enumerate}[leftmargin=5mm]
465 | \itemsep -.4mm
466 | \item Compute the mean and variance of each independent variable for every class $C_i$
467 | \item Calculate the within-class ($\sigma_w^2$) and between-class ($\sigma_b^2$) variance
468 | \item Find the matrix $W = (\sigma_w^2)^{-1}(\sigma_b^2)$ that maximizes Fisher's signal-to-noise ratio
469 | \item Rank the discriminant components by their signal-to-noise ratio $\lambda$
470 | \end{enumerate}
471 | Note, the number of components is at most $C - 1$, where $C$ is the number of classes

472 | \textbf{Assumptions}
473 | \vspace{-.7mm}
474 | \begin{itemize}[label={--},leftmargin=4mm]
475 | \itemsep -.4mm
476 | \item Independent variables are normally distributed
477 | \item Homoscedasticity - constant variance of error
478 | \item Low multicollinearity
479 | \end{itemize}
480 |
481 |
482 | % ----------------------------------------------------------------
483 | \subsection{Factor Analysis}
484 | Describes data using a linear combination of $k$ latent factors.
485 | Given a normalized matrix $X$, it follows the form $X = Lf + \epsilon$, with factor loadings $L$ and hidden factors $f$.
486 | \vspace{-1mm}
487 | \begin{center}
488 | \includegraphics[scale = .1]{images/factorNew1.JPG}
489 | \end{center}
490 | \vspace{-2mm}
491 |
492 | \smallskip
493 | \textbf{Scree Plot} - graphs the eigenvalues of factors (or principal components) and is used to determine the number of factors to retain. The 'elbow' where values level off is often used as the cutoff.
494 | \columnbreak
495 | % ----------------------------------------------------------------
496 | % ----------------------------------------------------------------
497 | \\\textcolor{white}{.}\vspace{-5mm}\\ % Add space above column
498 | \section{Natural Language Processing}
499 | Transforms human language into machine-usable code

500 | \textbf{Processing Techniques}
501 | \begin{itemize}[label={--},leftmargin=4mm]
502 | \itemsep -.4mm
503 | \item Tokenization - splits text into individual words (tokens)
504 | \item Lemmatization - reduces words to their base form based on dictionary definition (\emph{am, are, is} $\to$ \emph{be})
505 | \item Stemming - reduces words to their base form without context (\emph{ended} $\to$ \emph{end})
506 | \item Stop words - removes common and irrelevant words (\emph{the, is})
507 | \end{itemize}
508 |
509 | \textbf{Markov Chain} - stochastic and memoryless process that predicts future events based only on the current state
510 |
511 | $\boldsymbol{n}$\textbf{-gram} - predicts the next term in a sequence of $n$ terms based on Markov chains
512 |
513 | \textbf{Bag-of-words} - represents text using word frequencies, without context or order
514 |
515 | \textbf{tf-idf} - measures word importance for a document in a collection (corpus), by multiplying the term frequency (occurrences of a term in a document) with the inverse document frequency (penalizes common terms across a corpus)

516 | \textbf{Cosine Similarity} - measures similarity between vectors, calculated as cos($\theta$) =
517 | $\frac{A\cdot B}{||A||\,||B||}$, which ranges from 0 to 1
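
A brief sketch of tf-idf vectors and their cosine similarity in scikit-learn (documents are made-up examples):
\begin{verbatim}
from sklearn.feature_extraction.text import (
    TfidfVectorizer)
from sklearn.metrics.pairwise import (
    cosine_similarity)
docs = ["the cat sat", "the cat ran",
        "dogs bark loudly"]
X = TfidfVectorizer().fit_transform(docs)
print(cosine_similarity(X[0], X[1]))
\end{verbatim}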
518 |
519 | \subsection{Word Embedding}
520 | Maps words and phrases to numerical vectors
521 |
522 | \textbf{word2vec} - trains iteratively over local word context windows, places similar words close together, and embeds sub-relationships directly into vectors, such that $king - man + woman \approx queen$
523 |
524 | Relies on one of the following:
525 | \begin{itemize}[label={--},leftmargin=4mm]
526 | \itemsep -.4mm
527 | \item Continuous bag-of-words (CBOW) - predicts the word given its context
528 | \item skip-gram - predicts the context given a word
529 | \end{itemize}
530 |
531 | \textbf{GloVe} - combines both global and local word co-occurrence data to learn word similarity
532 |
533 | \textbf{BERT} - accounts for word order and trains on subwords, and unlike word2vec and GloVe, BERT outputs different vectors for different uses of words ($cell$ phone vs. blood $cell$)
534 |
535 | \subsection{Sentiment Analysis}
536 | Extracts the attitudes and emotions from text
537 |
538 | \textbf{Polarity} - measures positive, negative, or neutral opinions
539 | \begin{itemize}[label={--},leftmargin=4mm]
540 | \itemsep -.4mm
541 | \item Valence shifters - capture amplifiers or negators such as '$really$ fun' or '$hardly$ fun'
542 | \end{itemize}
543 | \textbf{Sentiment} - measures emotional states such as happy or sad

544 | \textbf{Subject-Object Identification} - classifies sentences as either subjective or objective
545 | \subsection{Topic Modelling}
546 | Captures the underlying themes that appear in documents

547 | \textbf{Latent Dirichlet Allocation} (LDA) - generates $k$ topics by first assigning each word to a random topic, then iteratively updating assignments based on parameters $\alpha$, the mix of topics per document, and $\beta$, the distribution of words per topic
548 |
549 | \textbf{Latent Semantic Analysis} (LSA) - identifies patterns using tf-idf scores and reduces data to $k$ dimensions through SVD
550 | \columnbreak
551 |
552 | % -------------------------------------------------------------
553 | \textcolor{white}{.}\vspace{-5mm}\\ % Add space above column
554 | \section{Neural Network}
555 | Feeds inputs through different hidden layers and relies on weights and nonlinear functions to reach an output
556 | \vspace{-1mm}
557 | \begin{center}
558 | \includegraphics[scale = .11]{images/nn3.JPG}
559 | \end{center}
560 | \vspace{-1mm}
561 | \textbf{Perceptron} - the foundation of a neural network that multiplies inputs by weights, adds bias, and feeds the result $z$ to an activation function
562 |
563 | \textbf{Activation Function} - defines a node's output
564 |
565 | \vspace{-1mm}
566 | \begin{center}
567 | \begin{tabular}{c|c|c}
568 | Sigmoid & ReLU & Tanh\\
569 | \hline
570 | \rule{0pt}{3ex}
571 | $\frac{1}{1+e^{-z}} $ & max$(0,z)$ & $\frac{e^z - e^{-z}}{e^z + e^{-z}}$\\
572 | & & \vspace{-2mm}\\
573 |
574 | \hline
575 | \includegraphics[scale = .047]{images/sigmoid1.JPG} &
576 | \includegraphics[scale = .047]{images/relu.JPG} &
577 | \includegraphics[scale = .047]{images/tanh.JPG}
578 |
579 | \end{tabular}
580 | \end{center}
581 | \textbf{Softmax} - given final layer outputs, provides class probabilities that sum to 1 $\to \frac{e^{z_i}}{\sum e^{z}}$
582 |
583 | If there is more than one `correct' label, the sigmoid function provides probabilities for all, some, or none of the labels.
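
A tiny NumPy sketch of softmax with the usual max-shift for numerical stability (logits are arbitrary):
\begin{verbatim}
import numpy as np
def softmax(z):
    e = np.exp(z - np.max(z))  # stability shift
    return e / e.sum()
print(softmax(np.array([2.0, 1.0, 0.1])))
# output sums to 1
\end{verbatim}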
584 |
585 |
586 | \smallskip
587 | \textbf{Loss Function} - measures prediction error using functions such as MSE for regression and binary cross-entropy for probability-based classification
588 |
589 | \smallskip
590 | \textbf{Gradient Descent} - minimizes the average loss by moving iteratively in the direction of steepest descent, controlled by the learning rate $\gamma$ (step size). Note, $\gamma$ can be updated adaptively for better performance. For neural networks, finding the best set of weights involves:
591 | \begin{enumerate}[leftmargin=5mm]
592 | \itemsep -.4mm
593 | \item Initialize weights $W$ randomly with near-zero values
594 | \item Loop until convergence:
595 | \begin{itemize}[label={--},leftmargin=4mm]
596 | \itemsep -.4mm
597 | \item Calculate the average network loss $J(W)$
598 | \item \textbf{Backpropagation} - iterate backwards from the last layer, computing the gradient $\frac{\partial J(W)}{\partial W}$ and updating the weight $W \leftarrow W - \gamma \frac{\partial J(W)}{\partial W}$
599 | \end{itemize}
600 | \item Return the minimum loss weight matrix $W$
601 | \end{enumerate}
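
A toy gradient descent loop for a single weight under MSE loss (learning rate and data are illustrative):
\begin{verbatim}
import numpy as np
x = np.array([1., 2., 3., 4.])
y = 2.0 * x          # true weight is 2
w, lr = 0.0, 0.05
for _ in range(200):
    grad = np.mean(2 * (w * x - y) * x)
    w -= lr * grad   # step downhill
print(w)             # close to 2.0
\end{verbatim}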
602 |
603 | To prevent overfitting, regularization can be applied by:
604 | \begin{itemize}[label={--},leftmargin=4mm]
605 | \itemsep -.4mm
606 | \item Stopping training when validation performance drops
607 | \item Dropout - randomly drop some nodes during training to prevent over-reliance on a single node
608 | \item Embedding weight penalties into the objective function
609 | \item Batch Normalization - stabilizes learning by normalizing inputs to a layer
610 | \end{itemize}
611 | \textbf{Stochastic Gradient Descent} - only uses a single point to compute gradients, leading to noisier convergence but faster compute speeds. Alternatively, mini-batch gradient descent trains on small subsets of the data, striking a balance between the approaches.
612 | \\
613 |
614 | \columnbreak
615 | % -------------------------------------------------------------% -------------------------------------------------------------
616 | \textcolor{white}{.}\vspace{-5mm}\\ % Add space above column
617 | \section{Convolutional Neural Network}
618 | Analyzes structural or visual data by extracting local features
619 |
620 | \textbf{Convolutional Layers} - iterate over windows of the image, applying weights, bias, and an activation function to create feature maps. Different weights lead to different feature maps.
621 | \vspace{-4mm}
622 | \begin{center}
623 | \includegraphics[scale = .06]{images/windowCNNNew.JPG}
624 | \end{center}
625 | \vspace{-2mm}
626 | \textbf{Pooling} - downsamples convolution layers to reduce dimensionality and maintain spatial invariance, allowing detection of features even if they have shifted slightly. Common techniques return the max or average value in the pooling window.\\
627 | \smallskip
628 | The general CNN architecture is as follows:
629 | \begin{enumerate}[leftmargin=5mm]
630 | \itemsep -.4mm
631 | \item Perform a series of convolution, ReLU, and pooling operations, extracting important features from the data
632 | \item Feed output into a fully-connected layer for classification, object detection, or other structural analyses
633 | \end{enumerate}
634 |
635 | \section{Recurrent Neural Network}
636 | Predicts sequential data using a temporally connected system that captures both new inputs and previous outputs using hidden states
637 | \begin{center}
638 | \vspace{-2mm}
639 | \includegraphics[scale = .07]{images/rnn1.JPG}
640 | \end{center}
641 | \vspace{-2mm}
642 |
643 | RNNs can model various input-output scenarios, such as many-to-one, one-to-many, and many-to-many. They rely on parameter (weight) sharing for efficiency. To avoid redundant calculations during backpropagation, downstream gradients are found by chaining previous gradients. However, repeatedly multiplying gradients by values greater than or less than 1 leads to:
644 | \begin{itemize}[label={--},leftmargin=4mm]
645 | \itemsep -.4mm
646 | \item Exploding gradients - model instability and overflows
647 | \item Vanishing gradients - loss of learning ability
648 | \end{itemize}
649 | This can be solved using:
650 | \begin{itemize}[label={--},leftmargin=4mm]
651 | \itemsep -.4mm
652 | \item Gradient clipping - cap the maximum value of gradients
653 | \item ReLU - its derivative prevents gradient shrinkage for $x > 0$
654 | \item Gated cells - regulate the flow of information
655 | \end{itemize}
656 |
657 | \textbf{Long Short-Term Memory} - learns long-term dependencies using gated cells and maintains a separate cell state from what is outputted. Gates in LSTM perform the following:
658 |
659 | \begin{enumerate}[leftmargin=5mm]
660 | \itemsep -.4mm
661 | \item Forget and filter out irrelevant info from previous layers
662 | \item Store relevant info from current input
663 | \item Update the current cell state
664 | \item Output the hidden state, a filtered version of the cell state
665 | \end{enumerate}
666 | LSTMs can be stacked to improve performance.
667 |
668 | \columnbreak
669 | % -------------------------------------------------------------% -------------------------------------------------------------
670 | \textcolor{white}{.}\vspace{-3mm}\\ % Add space above column
671 | \section{Boosting}
672 | Sequentially fits many simple models that account for the previous model's errors. As opposed to bagging, boosting trains on all the data and combines models using the learning rate $\alpha$. \\
673 | \smallskip
674 | \textbf{AdaBoost} - uses sample weighting and decision 'stumps' (one-level decision trees) to classify samples
675 | \begin{enumerate}[leftmargin=5mm]
676 | \itemsep -.4mm
677 | \item Build decision stumps for every feature, choosing the one with the best classification accuracy
678 | \item Assign more weight to misclassified samples and reward trees that differentiate them, where $\alpha = \frac{1}{2}\ln\frac{1-\text{Total Error}}{\text{Total Error}}$
679 | \item Continue training and weighting decision stumps until convergence
680 | \end{enumerate}
681 |
682 | \textbf{Gradient Boost} - trains sequential models by minimizing a given loss function using gradient descent at each step
683 | \begin{enumerate}[leftmargin=5mm]
684 | \itemsep -.4mm
685 | \item Start by predicting the average value of the response
686 | \item Build a tree on the errors, constrained by depth or the number of leaf nodes
687 | \item Scale decision trees by a constant learning rate $\alpha$
688 | \item Continue training and weighting decision trees until convergence
689 | \end{enumerate}
690 |
691 | \textbf{XGBoost} - fast gradient boosting method that utilizes regularization and parallelization
692 | % -------------------------------------------------------------% -------------------------------------------------------------
693 | \section{Recommender Systems}
694 | Suggests relevant items to users by predicting ratings and preferences, and is divided into two main types:
695 | \begin{itemize}[label={--},leftmargin=4mm]
696 | \itemsep -.4mm
697 | \item Content Filtering - recommends similar items
698 | \item Collaborative Filtering - recommends what similar users like
699 | \end{itemize}
700 |
701 | The latter is more common, and includes methods such as:

702 | \textbf{Memory-based Approaches} - find neighborhoods by using rating data to compute user and item similarity, measured using correlation or cosine similarity
703 |
704 | \begin{itemize}[label={--},leftmargin=4mm]
705 | \itemsep -.4mm
706 | \item User-User - similar users also liked...
707 | \begin{itemize}[label={--},leftmargin=4mm]
708 | \vspace{-1mm}
709 | \itemsep -.4mm
710 | \item Leads to more diverse recommendations, as opposed to just recommending popular items
711 | \item Suffers from sparsity, as the number of users who rate items is often low
712 | \end{itemize}
713 | \vspace{-1mm}
714 | \item Item-Item - similar users who liked this item also liked...
715 | \begin{itemize}[label={--},leftmargin=4mm]
716 | \itemsep -.4mm
717 | \vspace{-1.5mm}
718 | \item Efficient when there are more users than items, since the item neighborhoods update less frequently than users
719 | \item Similarity between items is often more reliable than similarity between users
720 | \end{itemize}
721 | \end{itemize}
722 | \vspace{-1.5mm}
723 | \smallskip
724 | \textbf{Model-based Approaches} - predict ratings of unrated items, through methods such as Bayesian networks, SVD, and clustering. Handles sparse data better than memory-based approaches.\\
725 | \begin{itemize}[label={--},leftmargin=4mm]
726 | \itemsep -.4mm
727 | \vspace{-.5mm}
728 | \item Matrix Factorization - decomposes the user-item rating matrix into two lower-dimensional matrices representing the users and items, each with $k$ latent factors
729 | \end{itemize}
730 | \smallskip
731 | \vspace{-1mm}
732 | Recommender systems can also be combined through ensemble methods to improve performance.
733 | \columnbreak
734 | % -------------------------------------------------------------% -------------------------------------------------------------
735 | \\\textcolor{white}{.}\vspace{-3mm}\\ % Add space above column
736 | \section{Reinforcement Learning}
737 | Maximizes future rewards by learning through state-action pairs. That is, an $agent$ performs $actions$ in an $environment$, which updates the $state$ and provides a $reward$.
738 |
739 | \begin{center}
740 | \vspace{-2mm}
741 | \includegraphics[scale = .085]{images/reinforcement4.JPG}
742 | \end{center}
743 | \vspace{-2.5mm}
744 | \textbf{Multi-armed Bandit Problem} - a gambler plays slot machines with unknown probability distributions and must decide the best strategy to maximize reward. This exemplifies the exploration-exploitation tradeoff, as the best long-term strategy may involve short-term sacrifices.\\
745 | \smallskip
746 | RL is divided into two types, with the former being more common:
747 | \begin{itemize}[label={--},leftmargin=4mm]
748 | \itemsep -.4mm
749 | \item Model-free - learn through trial and error in the environment
750 | \item Model-based - access to the underlying (approximate) state-reward distribution
751 | \end{itemize}
752 |
753 | \textbf{Q-Value} $Q(s,a)$ - captures the expected discounted total future reward given a state and action
754 |
755 | \textbf{Policy} - chooses the best actions for an agent at various states \\
756 | $ \pi(s) = \argmax\limits_a Q(s,a)$\\
757 | \smallskip
758 |
759 | Deep RL algorithms can further be divided into two main types, depending on their learning objective:
760 |
761 | \textbf{Value Learning} - aims to approximate $Q(s,a)$ for all actions the agent can take, but is restricted to discrete action spaces. Can use the $\epsilon$-greedy method, where $\epsilon$ measures the probability of exploration. If chosen, the next action is selected uniformly at random.
762 | \begin{itemize}[label={--},leftmargin=4mm]
763 | \itemsep -.4mm
764 | \item Q-Learning - simple value iteration model that maximizes the Q-value using a table on states and actions
765 | \item Deep Q Network - finds the best action to take by minimizing the Q-loss, the squared error between the target Q-value and the prediction
766 | \end{itemize}
767 |
768 | \textbf{Policy Gradient Learning} - directly optimizes the policy $\pi(s)$ through a probability distribution of actions, without the need for a value function, allowing for continuous action spaces. \\
769 | \smallskip
770 | \textbf{Actor-Critic Model} - hybrid algorithm that relies on two neural networks, an actor $\pi(s,a,\theta$) which controls agent behavior and a critic $Q(s,a,w)$ that measures how good an action is. Both run in parallel to find the optimal weights $\theta, w$ to maximize expected reward. At each step:
771 | \begin{enumerate}[leftmargin=5mm]
772 | \itemsep -.4mm
773 | \item Pass the current state into the actor and critic
774 | \item The critic evaluates the action's Q-value, and the actor updates its weight $\theta$
775 | \item The actor takes the next action leading to a new state, and the critic updates its weight $w$
776 | \end{enumerate}
777 | \columnbreak
778 |
779 | % ----------------------------------------------------------------
780 | \textcolor{white}{.}\vspace{-3mm}\\ % Add space above column
781 | \section{Anomaly Detection}
782 | Identifies unusual patterns that differ from the majority of the data. Assumes that anomalies are:
783 | \begin{itemize}[label={--},leftmargin=4mm]
784 | \itemsep -.4mm
785 | \item Rare - the minority class that occurs rarely in the data
786 | \item Different - have feature values that are very different from normal observations
787 | \end{itemize}
788 | \smallskip
789 | Anomaly detection techniques span a wide range, including methods based on:
790 |
791 | \textbf{Statistics} - relies on various statistical methods to identify outliers, such as Z-tests, boxplots, interquartile ranges, and variance comparisons
792 |
793 | \textbf{Density} - useful when data is grouped around dense neighborhoods, measured by distance. Methods include $k$-nearest neighbors, local outlier factor, and isolation forest.
794 | \begin{itemize}[label={--},leftmargin=4mm]
795 | \itemsep -.4mm
796 | \item Isolation Forest - tree-based model that labels outliers based on an anomaly score\\
797 | \vspace{-1.5mm}
798 | \begin{enumerate}[leftmargin=4mm]
799 | \itemsep -.4mm
800 | \item Select a random feature and split value, dividing the dataset in two
801 | \item Continue splitting randomly until every point is isolated
802 | \item Calculate the anomaly score for each observation, based on how many iterations it took to isolate that point.
803 | \item If the anomaly score is greater than a threshold, mark it as an outlier
804 | \end{enumerate}
805 | \end{itemize}
806 | \vspace{-2.5mm}
807 | \hspace{4mm}Intuitively, outliers are easier to isolate and should have\\\hspace{4mm}shorter path lengths in the tree
808 |
809 | \vspace{1mm}
810 | \textbf{Clusters} - data points outside of clusters could potentially be marked as anomalies
811 |
812 | \vspace{1mm}
813 | \textbf{Autoencoders} - unsupervised neural networks that compress data through an encoder and reconstruct it using a decoder. Autoencoders do not reconstruct the data perfectly, but rather focus on capturing important features in the data.
814 | \begin{center}
815 | \vspace{-2mm}
816 | \includegraphics[scale = .19]{images/autoencodeer1.JPG}
817 | \vspace{-2mm}
818 | \end{center}
819 | The decoder struggles to capture anomalous patterns, and the reconstruction error acts as a score to detect anomalies.
820 | \\
821 | \smallskip
822 | Autoencoders can also be used for image processing, dimension reduction, and information retrieval.
823 | \smallskip
824 |
825 | \textbf{Hidden Markov Model} - uses observed events $O$ to model a set of $n$ underlying states $Q$ using $\lambda = (A,B,\pi)$
826 | \begin{itemize}[label={--},leftmargin=4mm]
827 | \itemsep -.4mm
828 | \item $A$ - $n\times n$ matrix of transition probabilities from state $i$ to $j$
\item $B$ - sequence of likelihoods of emitting $o_t$ in state $i$
829 | \item $\pi$ - initial probability distribution over states
830 | \end{itemize}
831 | HMMs can calculate $P(O|\lambda)$, find the best hidden state sequence Q, or learn the parameters $A$ and $B$.
832 | Anomalies are observations that are unlikely to occur across states.
833 |
834 | \smallskip
835 | HMMs can be applied to many problems such as signal processing and part of speech tagging.
836 |
837 | \newpage
838 | % ----------------------------------------------------------------
839 | \textcolor{white}{.}\vspace{-5mm}\\ % Add space above column
840 | \section{Time Series}
841 | Extracts characteristics from time-sequenced data, which may exhibit the following characteristics:
842 | \begin{itemize}[label={--},leftmargin=4mm]
843 | \itemsep -.4mm
844 | \item Stationarity - statistical properties such as mean, variance, and autocorrelation are constant over time
845 | \item Trend - long-term rise or fall in values
846 | \item Seasonality - variations associated with specific calendar times, occurring at regular intervals less than a year
847 | \item Cyclicality - variations without a fixed time length, occurring in periods of greater or less than one year
848 | \item Autocorrelation - degree of linear similarity between current and lagged values
849 | \end{itemize}
850 |
851 |
852 | CV must account for the time aspect, such as for each fold $F_x$:
853 | \begin{itemize}[label={--},leftmargin=4mm]
854 | \itemsep -.4mm
855 | \item Sliding Window - train $F_1$, test $F_2$, then train $F_2$, test $F_3$
856 | \item Forward Chain - train $F_1$, test $F_2$, then train $F_1, F_2$, test $F_3$
857 | \end{itemize}
858 |
859 | \textbf{Exponential Smoothing} - uses an exponentially decreasing weight to observations over time, and takes a moving average. The time $t$ output is $s_t = \alpha x_t + (1-\alpha)s_{t-1}$, where $0 < \alpha < 1$.
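
The recursion above in a few lines of Python (series and $\alpha$ are made up):
\begin{verbatim}
def exp_smooth(x, alpha=0.3):
    s = [x[0]]
    for t in range(1, len(x)):
        s.append(alpha * x[t]
                 + (1 - alpha) * s[-1])
    return s
print(exp_smooth([3., 5., 4., 6., 7.]))
\end{verbatim}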
860 | \smallskip
861 |
862 | \textbf{Double Exponential Smoothing} - applies a recursive exponential filter to capture trends within a time series
863 | \begin{center}
864 | \vspace{-2.5mm}
865 | $s_t = \alpha x_t + (1-\alpha)(s_{t-1} + b_{t-1})$\\
866 | $b_t = \beta (s_t - s_{t-1}) + (1-\beta)b_{t-1}$\\
867 | \vspace{-2mm}
868 | \end{center}
869 | Triple exponential smoothing adds a third variable
870 | $\gamma$ that accounts for seasonality.
871 | \smallskip
872 |
873 | \textbf{ARIMA} - models time series using three parameters $(p,d,q)$:
874 | \begin{itemize}[label={--},leftmargin=4mm]
875 | \itemsep -.4mm
876 | \item Autoregressive - the past $p$ values affect the next value
877 |
878 | \item Integrated - values are replaced with the difference between current and previous values, using the difference degree $d$ (0 for stationary data, and 1 for non-stationary)
879 |
880 | \item Moving Average - the number of lagged forecast errors and the size of the moving average window $q$
881 | \end{itemize}
882 |
883 | \textbf{SARIMA} - models seasonality through four additional seasonality-specific parameters: $P$, $D$, $Q$, and the season length $s$
884 | \smallskip
885 |
886 | \textbf{Prophet} - additive model that uses non-linear trends to account for multiple seasonalities such as yearly, weekly, and daily. Robust to missing data and handles outliers well.\\ Can be represented as: $y(t) = g(t) + s(t) + h(t) + \epsilon(t)$, with four distinct components for the growth over time, seasonality, holiday effects, and error. This specification is similar to a generalized additive model.
887 | \smallskip
888 |
889 | \textbf{Generalized Additive Model} - combines predictive methods while preserving additivity across variables, in a form such as
890 | $y = \beta_0 + f_1(x_1) + \cdots + f_m(x_m)$, where functions can be non-linear. GAMs also provide regularized and interpretable solutions for regression and classification problems.
891 |
892 | %%%%%%%%%%%%%%%%%%%%%%
893 | \section{Naive Bayes}
894 | Classifies data using the label with the highest conditional probability, given data $a$ and classes $c$. Naive because it assumes variables are independent.
895 |
896 | \textbf{Bayes' Theorem} $ P({c_i}|{a}) = \frac{P({a}|{c_i})P({c_i})}{P({a})}$
897 |
898 | \textbf{Gaussian Naive Bayes} - calculates conditional probability for continuous data by assuming a normal distribution
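
A short scikit-learn sketch (iris data used purely for illustration):
\begin{verbatim}
from sklearn.naive_bayes import GaussianNB
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True)
gnb = GaussianNB().fit(X, y)
print(gnb.predict_proba(X[:1]))  # P(c_i | a)
\end{verbatim}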
899 |
900 | \columnbreak
901 | %------------------------------------------------------
902 | \textcolor{white}{.}\vspace{-5mm}\\ % Add space above column
903 | \section{Statistics}
904 | $\boldsymbol p$\textbf{-value} - probability that an effect could have occurred by chance. If less than the significance level $\alpha$, or if the test statistic is greater than the critical value, then reject the null.
905 |
906 | \textbf{Type I Error} (False Positive $\alpha$) - rejecting a true null
907 |
908 | \textbf{Type II Error} (False Negative $\beta$) - not rejecting a false null
909 |
910 | Decreasing Type I Error causes an increase in Type II Error
911 |
912 | \textbf{Confidence Level} (1 - $\alpha$) - probability of not rejecting a true null, i.e. avoiding a Type I error
913 |
914 | \textbf{Power} (1 - $\beta$) - probability of picking up on an effect that is present and avoiding a Type II Error
915 |
916 | \textbf{Confidence Interval} - estimated interval that models the long-term frequency of capturing the true parameter value
917 |
918 | $\boldsymbol{z}$\textbf{-test} - tests whether normally distributed population means are different, used when $n$ is large and variances are known
919 | \vspace{-.5mm}
920 | \begin{itemize}[label={--},leftmargin=4mm]
921 | \itemsep -.4mm
922 | \item z-score - the number of standard deviations between a data point $x$ and the mean $\to \frac{x - \mu}{\sigma}$
923 | \end{itemize}
924 | \vspace{-.5mm}
925 | $\boldsymbol{t}$\textbf{-test} - used when population variances are unknown, and converges to the $z$-test when $n$ is large
926 | \vspace{-.5mm}
927 | \begin{itemize}[label={--},leftmargin=4mm]
928 | \itemsep -.4mm
929 | \item t-score - uses the standard error as an estimate for population variance $\to \frac{x - \mu}{s/\sqrt{n}}$
930 | \end{itemize}
931 | \vspace{-1mm}
932 | \textbf{Degrees of Freedom} - the number of independent (free) dimensions needed before the parameter estimate can be determined
933 |
934 | \textbf{Chi-Square Tests} - measure differences between categorical variables, using $\chi^2 = \sum \frac{(\text{observed} - \text{expected})^2}{\text{expected}}$ to test:
935 | \begin{itemize}[label={--},leftmargin=4mm]
936 | \itemsep -.4mm
937 | \item Goodness of fit - if samples of one categorical variable match the population category expectations
938 | \item Independence - if being in one category is independent of another, based on two categories
939 | \item Homogeneity - if different subgroups come from the same population, based on a single category
940 | \end{itemize}
941 |
942 | \textbf{ANOVA} - analysis of variance, used to compare 3$\plus$ samples
943 | \begin{itemize}[label={--},leftmargin=4mm]
944 | \itemsep -.4mm
945 | \item F-score - compares the ratio of explained and unexplained variance $\to \frac{\text{between group variance}}{\text{within group variance}}$
946 | \end{itemize}
947 |
948 | \smallskip
949 |
950 | \textbf{Conditional Probability} $P(A \mid B) = \frac{P(A \cap B)}{P(B)}$
951 |
952 | If $A$ and $B$ are independent, then $P(A \cap B) = P(A) P(B)$.\\
953 | Note, events that are independent of themselves must have
954 | probability either 1 or 0.
955 |
956 |
957 | \textbf{Union} $P(A \cup B) = P(A) + P(B) - P(A \cap B) $
958 |
959 | \textbf{Mutually Exclusive} - events cannot happen simultaneously
960 | \smallskip
961 |
962 | \textbf{Expected Value} $E[X] = \sum x_i p_i$, with properties
963 | \begin{itemize}[label={--},leftmargin=4mm]
964 | \itemsep -.4mm
965 | \item $E[X + Y] = E[X] + E[Y]$
966 | \item $E[XY] = E[X]E[Y]$ if $X$ and $Y$ are independent
967 | \end{itemize}
968 |
969 | \textbf{Variance} Var$(X) = E[X^2] - E[X]^2$, with properties
970 | \begin{itemize}[label={--},leftmargin=4mm]
971 | \itemsep -.4mm
972 | \item Var$(X\pm Y) =$ Var$(X) +$Var$(Y) \pm 2$Cov$(X,Y)$
973 | \item Var$(aX \pm b) = a^2$Var$(X)$
974 | \end{itemize}
975 |
976 | \textbf{Covariance} - measures the direction of the joint linear relationship of two variables
977 | $\to \frac{\sum (x_i - \bar{x}) (y_i - \bar{y})}{n-1} $
978 |
979 | \textbf{Correlation} - normalizes covariance to provide both strength and direction of linear relationships $\to r = \frac{Cov(x,y)}{\sigma_x \sigma_y}$

980 | Independent variables are uncorrelated, though the converse is not necessarily true
981 |
982 | \newcommand*{\Perm}[2]{{}^{#1}\!P_{#2}}%
983 | \newcommand*{\Comb}[2]{{}^{#1}C_{#2}}%
984 | \smallskip
985 |
986 | \section{A/B Testing}
987 | Examines user experience through randomized tests with two variants.
988 | The typical steps are:
989 | \begin{enumerate}[leftmargin=5mm]
990 | \itemsep -.4mm
991 | \item Determine the evaluation metric and experiment goals
992 | \item Select a significance level $\alpha$ and power threshold 1 - $\beta$
993 | \item Calculate the required sample size per variation
994 | \item Randomly assign users into control and treatment groups
995 | \item Measure and analyze results using the appropriate test
996 | \end{enumerate}
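
Step 5 for a conversion-rate metric might use a two-proportion $z$-test, e.g. with statsmodels (counts below are made up):
\begin{verbatim}
from statsmodels.stats.proportion import (
    proportions_ztest)
conversions = [120, 150]   # control, treatment
samples     = [2400, 2500]
stat, p = proportions_ztest(conversions,
                            samples)
print(p)   # compare against alpha
\end{verbatim}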
997 |
998 | The required sample size depends on $\alpha$, $\beta$, and the MDE

999 | \textbf{Minimum Detectable Effect} - the target relative minimum increase over the baseline that should be observed from a test
1000 | \vspace{.5mm}
1001 |
1002 | \textbf{Overall Evaluation Criterion} - quantitative measure of the test's objective, commonly used when short and long-term metrics have inverse relationships
1003 | \vspace{.7mm}
1004 |
1005 | \textbf{Multivariate Testing} - compares 3$\plus$ variants or combinations, but requires larger sample sizes
1006 |
1007 | \textbf{Bonferroni Correction} - when conducting $n$ tests,
1008 | run each test at the $\frac{\alpha}{n}$ significance level, which lowers the false positive rate of finding effects by chance
1009 | \vspace{.7mm}
1010 |
1011 | \textbf{Network Effects} - changes that occur due to effect spillover from other groups. To detect group interference:
1012 | \begin{enumerate}[leftmargin=5mm]
1013 | \itemsep -.4mm
1014 | \item Split the population into distinct clusters
1015 | \item Randomly assign half the clusters to the control and treatment groups $A_1$ and $B_1$
1016 | \item Randomize the other half at the user-level and assign to control and treatment groups
1017 | $A_2$ and $B_2$
1018 | \item Intuitively, if there are network effects, then the tests will have different results
1019 | \end{enumerate}
1020 | To account for network effects, randomize users based on time, cluster, or location
1021 | \smallskip
1022 |
1023 | \textbf{Sequential Testing} - allows for early experiment stopping by drawing statistical borders based on the Type I Error rate. If the effect reaches a border, the test can be stopped. Used to combat \emph{peeking} (preliminarily checking results of a test), which can inflate $p$-values and lead to incorrect conclusions.
1024 |
1025 | \textbf{Cohort Analysis} - examines specific groups of users based on behavior or time and can help identify whether novelty or primacy effects are present
1026 |
1027 | \section{Miscellaneous}
1028 | \textbf{Shapley Values} - measures the average marginal contribution of each variable to the output of a model, where the sum of all Shapley values equals the total value (prediction $-$ mean prediction)
1029 |
1030 | \textbf{SHAP} - interpretable Shapley-based method that provides both global and local variable importance for model explainability
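
A minimal from-scratch sketch of exact Shapley values for a toy model, where the value of a feature subset is the prediction with the remaining features set to their means (all data and the \texttt{model} function are made up; the \texttt{shap} library is the usual tool in practice):
\begin{verbatim}
from itertools import combinations
from math import factorial as fact
import numpy as np

def shapley(model, x, X_bg):
    # value(S): features in S take x's values,
    # all others are set to background means
    n, base = len(x), X_bg.mean(axis=0)
    def value(S):
        z = base.copy()
        z[list(S)] = x[list(S)]
        return model(z)
    phi = np.zeros(n)
    for i in range(n):
        rest = [j for j in range(n) if j != i]
        for k in range(n):
            for S in combinations(rest, k):
                w = fact(k)*fact(n-k-1)/fact(n)
                phi[i] += w*(value(S+(i,)) - value(S))
    return phi

X = np.random.default_rng(0).normal(size=(50, 3))
model = lambda z: 2*z[0] - z[1] + 0.5*z[2]
phi = shapley(model, X[0], X)
# shapley values sum to pred(x) - pred(mean features)
print(phi.sum(), model(X[0]) - model(X.mean(axis=0)))
\end{verbatim}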
1031 |
1032 | \textbf{Permutation} - order matters $\to \frac{n!}{(n-k)!} = \Perm{n}{k}$
1033 |
1034 | \textbf{Combination} - order doesn't matter $\to \frac{n!}{k!(n-k)!}= \Comb{n}{k}= \binom nk$
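
For example (purely illustrative), picking 2 of 5 items gives $\Perm{5}{2} = \frac{5!}{3!} = 20$ ordered arrangements but only $\Comb{5}{2} = \binom{5}{2} = 10$ unordered selections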
1035 |
1036 | \textbf{Left Skew} - Mean $<$ Median $\leq$ Mode
1037 |
1038 | \textbf{Right Skew} - Mean $>$ Median $\geq$ Mode
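
As a quick illustrative check, the sample $\{1, 2, 2, 3, 10\}$ has mode 2, median 2, and mean 3.6, so it is right skewed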
1039 |
1040 | \textbf{Probability vs Likelihood} - given a situation $\theta$ and observed outcomes $O$, probability is calculated as $P(O|\theta)$. However, when true values for $\theta$ are unknown, $O$ is used to estimate the $\theta$ that maximizes the likelihood function. That is, $L(\theta|O) = P(O|\theta)$.
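
For example (a hypothetical coin), if $O$ is 7 heads in 10 flips, then $L(\theta|O) = \binom{10}{7}\theta^7(1-\theta)^3$, which is maximized at $\hat{\theta} = 0.7$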
1041 | \newpage
1042 | \end{multicols}
1043 |
1044 | \end{document}
1045 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Data Science Cheatsheet 2.0
2 |
3 | A helpful 5-page data science cheatsheet to assist with exam reviews, interview prep, and anything in between. It covers over a semester of introductory machine learning, and is based on MIT's Machine Learning courses 6.867 and 15.072. The reader should have at least a basic understanding of statistics and linear algebra, though beginners may find this resource helpful as well.
4 |
5 | Inspired by Maverick's *Data Science Cheatsheet* (hence the 2.0 in the name), located [here](https://github.com/ml874/Data-Science-Cheatsheet).
6 |
7 | Topics covered:
8 | - Linear and Logistic Regression
9 | - Decision Trees and Random Forest
10 | - SVM
11 | - K-Nearest Neighbors
12 | - Clustering
13 | - Boosting
14 | - Dimension Reduction (PCA, LDA, Factor Analysis)
15 | - Natural Language Processing
16 | - Neural Networks
17 | - Recommender Systems
18 | - Reinforcement Learning
19 | - Anomaly Detection
20 | - Time Series
21 | - A/B Testing
22 |
23 | This cheatsheet will be occasionally updated with new/improved info, so consider a follow or star to stay up to date.
24 |
25 | Future additions (ideas welcome):
26 | - ~~Time Series~~ Added!
27 | - ~~Statistics and Probability~~ Added!
28 | - Data Imputation
29 | - Generative Adversarial Networks
30 | - Graph Neural Networks
31 |
32 | ## Links
33 | * [Data Science Cheatsheet 2.0 PDF](https://github.com/aaronwangy/Data-Science-Cheatsheet/blob/main/Data_Science_Cheatsheet.pdf)
34 |
35 | ## Screenshots
36 |
37 | Here are screenshots of a couple pages - the link to the full cheatsheet is above!
38 |
39 | 
40 | 
41 |
42 | ### Why is Python/SQL not covered in this cheatsheet?
43 | I planned for this resource to cover mainly algorithms, models, and concepts, as these rarely change and are common throughout industries. Technical languages and data structures often vary by job function, and refreshing these skills may make more sense at a keyboard than on paper.
44 |
45 |
46 | ## License
47 |
48 | Feel free to share this resource in classes, review sessions, or to anyone who might find it helpful :)
49 |
50 | This work is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License
51 |
52 | 
53 |
54 | Images are used for educational purposes, created by me, or borrowed from my colleagues [here](https://stanford.edu/~shervine/teaching/cs-229/)
55 |
56 | ## Contact
57 | Feel free to send over comments, updates, and potential improvements!
58 |
59 | Author - [Aaron Wang](https://www.linkedin.com/in/axw/)
60 |
61 | If you'd like to support this cheatsheet, you can buy me a coffee [here](https://www.paypal.me/aaxw). I also do resume, application, and tech consulting - send me a message if interested.
62 |
--------------------------------------------------------------------------------
/images/CART.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaronwangy/Data-Science-Cheatsheet/705760c47646633c145abb909c4e1f05812581ed/images/CART.JPG
--------------------------------------------------------------------------------
/images/autoencodeer1.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaronwangy/Data-Science-Cheatsheet/705760c47646633c145abb909c4e1f05812581ed/images/autoencodeer1.JPG
--------------------------------------------------------------------------------
/images/dendroedit3.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaronwangy/Data-Science-Cheatsheet/705760c47646633c145abb909c4e1f05812581ed/images/dendroedit3.JPG
--------------------------------------------------------------------------------
/images/factorNew1.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaronwangy/Data-Science-Cheatsheet/705760c47646633c145abb909c4e1f05812581ed/images/factorNew1.JPG
--------------------------------------------------------------------------------
/images/hingeloss3.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaronwangy/Data-Science-Cheatsheet/705760c47646633c145abb909c4e1f05812581ed/images/hingeloss3.JPG
--------------------------------------------------------------------------------
/images/nn3.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaronwangy/Data-Science-Cheatsheet/705760c47646633c145abb909c4e1f05812581ed/images/nn3.JPG
--------------------------------------------------------------------------------
/images/page1-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaronwangy/Data-Science-Cheatsheet/705760c47646633c145abb909c4e1f05812581ed/images/page1-1.png
--------------------------------------------------------------------------------
/images/page2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaronwangy/Data-Science-Cheatsheet/705760c47646633c145abb909c4e1f05812581ed/images/page2-1.png
--------------------------------------------------------------------------------
/images/reinforcement4.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaronwangy/Data-Science-Cheatsheet/705760c47646633c145abb909c4e1f05812581ed/images/reinforcement4.JPG
--------------------------------------------------------------------------------
/images/relu.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaronwangy/Data-Science-Cheatsheet/705760c47646633c145abb909c4e1f05812581ed/images/relu.JPG
--------------------------------------------------------------------------------
/images/rnn1.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaronwangy/Data-Science-Cheatsheet/705760c47646633c145abb909c4e1f05812581ed/images/rnn1.JPG
--------------------------------------------------------------------------------
/images/sigmoid1.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaronwangy/Data-Science-Cheatsheet/705760c47646633c145abb909c4e1f05812581ed/images/sigmoid1.JPG
--------------------------------------------------------------------------------
/images/svmNew2.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaronwangy/Data-Science-Cheatsheet/705760c47646633c145abb909c4e1f05812581ed/images/svmNew2.JPG
--------------------------------------------------------------------------------
/images/tanh.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaronwangy/Data-Science-Cheatsheet/705760c47646633c145abb909c4e1f05812581ed/images/tanh.JPG
--------------------------------------------------------------------------------
/images/windowCNNNew.JPG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aaronwangy/Data-Science-Cheatsheet/705760c47646633c145abb909c4e1f05812581ed/images/windowCNNNew.JPG
--------------------------------------------------------------------------------