├── .gitignore ├── ATG.md ├── LICENSE ├── README.md ├── equations.tex ├── house-prices-decision-tree-and-random-forest.ipynb ├── house-prices-lasso-and-ridge.ipynb ├── house-prices-lasso-ridge-log-target.ipynb ├── house-prices-lr-from-scratch.ipynb ├── house-prices-mlpregressor.ipynb ├── house-prices-polynomial.ipynb ├── house-prices-quickstart.ipynb ├── house-prices-rf-gridsearchcv.ipynb ├── house-prices-robust-regression.ipynb ├── house-prices-sgd.ipynb ├── house-prices-simple-imputer.ipynb ├── house-prices-support-vector-regression.ipynb ├── house-prices-target-feature-distributions.ipynb ├── house-prices-tensorflow.ipynb ├── house-prices-xgboost.ipynb └── tex └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /ATG.md: -------------------------------------------------------------------------------- 1 | # Machine Learning Advanced Topics Group 2 | 3 | ## Generalized Linear Models - GLM 4 | 5 | - DSML Section 5.7 p204 6 | - ISLR Section 4.6 p164 7 | - Murphy Book 1 - Chapter 12 8 | - Regression and Other Stories - Part 3 - Chapter 15 9 | 10 | 11 | ## PCA 12 | 13 | - MML Chapter 10 14 | - DSML Section 4.8 p153 15 | - ESL 14.5 Principal Components, Curves and Surfaces p534 16 | - Murphy Book 1 Chapter 20: Dimensionality Reduction 17 | 18 | ## Expectation Maximization - EM 19 | 20 | 21 | ## Maximum Likelihood Estimation - MLE 22 | 23 | ## Gaussian Processes - GP 24 | 25 | ## Survival Analysis 26 | 27 | ## Anomaly Detection 28 | 29 | ## Convex Optimization 30 | 31 | ## Gaussian Mixture Models - GMM 32 | 33 | 34 | ## Bookshelf 35 | 36 | - [An Introduction to Statistical Learning](https://www.statlearning.com) - ISLR 37 | - [Mathematics for Machine Learning](https://mml-book.github.io) - MML 38 | - [Elements of Statistical Learning](https://hastie.su.domains/ElemStatLearn/) - ESL 39 | - [Data Science and Machine Learning](https://github.com/DSML-book/) - DSML 40 | - [Probabilistic Machine Learning: An Introduction](https://probml.github.io/pml-book/book1.html) - Murphy Book 1 41 | - [Probabilistic Machine Learning: 
Advanced Topics](https://probml.github.io/pml-book/book2.html) - Murphy Advanced Topics 42 | - [Pattern Recognition and Machine Learning](https://www.microsoft.com/en-us/research/uploads/prod/2006/01/Bishop-Pattern-Recognition-and-Machine-Learning-2006.pdf) - Bishop PRML 43 | - [Regression and Other Stories - Gelman](https://avehtari.github.io/ROS-Examples/) 44 | - 45 | 46 | ## GLM Bookshelf 47 | 48 | - [An Introduction to Generalized Linear Models](https://www.amazon.com/Introduction-Generalized-Chapman-Statistical-Science-dp-1138741515/dp/1138741515/ref=dp_ob_title_bk) 49 | - [Applied Regression Analysis and Generalized Linear Models - John Fox](https://www.amazon.com/Applied-Regression-Analysis-Generalized-Linear-dp-1452205663/dp/1452205663/ref=dp_ob_title_bk) 50 | - [Foundations of Linear and Generalized Linear Models - Alan Agresti](https://www.amazon.com/Foundations-Linear-Generalized-Probability-Statistics/dp/1118730038) 51 | - [Generalized Linear Models and Extensions - 4th ed James Hardin and Joseph Hilbe](https://www.amazon.com/Generalized-Linear-Models-Extensions-Fourth/dp/1597182257) 52 | - [Survival Analysis with Interval-Censored Data_ A Practical Approach with Examples in R, SAS, and BUGS - Kris Bogaerts,Arnost Komarek,Emmanuel Lesaffre](https://www.amazon.com/Survival-Analysis-Interval-Censored-Data-Interdisciplinary/dp/1420077473) 53 | - 54 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 
7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | 24 | For more information, please refer to <https://unlicense.org> 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This is a collection of repos devoted to learning machine learning with Kaggle. 2 | 3 | - [Regression with Housing Prices](https://github.com/melling/ml-regression) 4 | - [Classification with Titanic](https://github.com/melling/ml-kaggle-titanic) 5 | - [MNIST Solutions](https://github.com/melling/ml-mnist-kaggle-digit-recognizer) 6 | - [Classification with Spaceship Titanic](https://github.com/melling/ml-kaggle-spaceship-titanic) 7 | - [NLP with Disaster Tweets](https://github.com/melling/ml-nlp-kaggle-disaster-tweets) 8 | - 9 | 10 | Follow me on [Kaggle](https://www.kaggle.com/mmellinger66/) 11 | 12 | # Machine Learning Regression with Kaggle House Prices 13 | 14 | This is a deep dive into learning to solve machine learning regression problems. Supervised learning with a continuous target value. 
15 | 16 | The data set used is from the Kaggle Competition [House Prices - Advanced Regression Techniques](https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques) 17 | 18 | Given several dozen predictors/features, we want to accurately predict the sale price of a house. 19 | 20 | ## Notebooks 21 | 22 | - [Quickstart](house-prices-quickstart.ipynb) 23 | - [Lasso, Ridge, and ElasticNet Regression](house-prices-lasso-and-ridge.ipynb) 24 | - [Polynomial Features](house-prices-polynomial.ipynb) 25 | - [Target and Feature Distributions](house-prices-target-feature-distributions.ipynb) 26 | - [Simple Imputer and Label Encoding](house-prices-simple-imputer.ipynb) 27 | - [Robust Regression - RANSAC](house-prices-robust-regression.ipynb) 28 | - [SelectK Best Features] 29 | - Variance Inflation Factor (VIF) 30 | - Recursive Feature Elimination (RFE) 31 | - Mutual Information Gain 32 | - [Forward Feature Selection] 33 | - [Stochastic Gradient Descent](house-prices-sgd.ipynb) 34 | - [Lasso, Ridge, and ElasticNet with log(target)](house-prices-lasso-ridge-log-target.ipynb) 35 | - [Outliers] 36 | - [Decision Tree and Random Forests](house-prices-decision-tree-and-random-forest.ipynb) 37 | - [GridSearchCV](house-prices-rf-gridsearchcv.ipynb) 38 | - [MLPRegressor](house-prices-mlpregressor.ipynb) 39 | - [Gradient Boosted Trees - XGBoost/Catboost/LightGBM](house-prices-xgboost.ipynb) 40 | - [GBDT Feature Importance] 41 | - [SHAP Values] 42 | - [XGBoost + CV with OOF Results] 43 | - [XGBoost + Optuna] 44 | - [Data Transformation] 45 | - [Support Vector Machines](house-prices-support-vector-regression.ipynb) 46 | - [Tensorflow](house-prices-tensorflow.ipynb) 47 | - [KerasTuner] 48 | - *** 49 | - [Target Encoding] 50 | - [Ensemble Learning - Blending] 51 | - [Ensemble Learning - Stacking] 52 | - [Robust Regression - RANSAC] 53 | - [Nonlinear Regression] 54 | - [PyTorch](house-prices-pytorch.ipynb) 55 | - [Basic EDA](house-prices-eda.ipynb) 56 | - [Enhanced 
EDA] 57 | - Feature Engineering 58 | - [Linear Regression from Scratch](house-prices-lr-from-scratch.ipynb) 59 | 60 | ## Misc Notebooks 61 | 62 | - [DSML Feature Selection] 63 | 64 | 65 | ## Machine Learning Models Covered 66 | 67 | - Linear Regression 68 | - Lasso - L1 69 | - Ridge - L2 70 | - Polynomial 71 | - Residuals 72 | - Collinearity 73 | - Interactions 74 | - Mathematics 75 | - Solving Ax=b using numpy 76 | - Normal Equations 77 | - Decision Trees 78 | - Gradient Boosted Decision Trees (GBDT) 79 | - Support Vector Machines 80 | - [Principal Component Analysis](pca.md) (PCA) 81 | - Stochastic Gradient Descent 82 | - Deep Neural Networks (DNN) 83 | - Activation Functions 84 | 85 | In addition, we will cover other topics important to machine learning: 86 | 87 | - Feature Engineering 88 | - Data Transformation 89 | - Scaling 90 | - Gaussian Normal 91 | - log transform 92 | - skew, kurtosis 93 | - Missing Values 94 | - Outliers 95 | - Z-score 96 | - IQR Method 97 | - https://www.kaggle.com/code/nareshbhat/outlier-the-silent-killer 98 | - Hypothesis Testing 99 | - DBSCAN Clustering 100 | - Loss Functions 101 | - MAE 102 | - RMSE 103 | - Huber 104 | - Feature Selection 105 | - Forward Selection 106 | - Reverse Selection 107 | - SHAP 108 | - https://h2o.ai/blog/shapley-values-a-gentle-introduction/ 109 | - Permutation Importance 110 | - Mutual Information 111 | - Hyperparameter Optimization 112 | 113 | ## MAE 114 | 115 | 116 | [Mean Absolute Error](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_absolute_error.html) 117 | 118 | $$MAE = \frac{\sum_{i=1}^n |y_i - \hat{y}_i|}{n}$$ 119 | 120 | ## RMSE 121 | 122 | [RMSE](https://en.wikipedia.org/wiki/Root-mean-square_deviation) 123 | 124 | $$RMSE = \sqrt{\frac{1}{n}\sum_{i=1}^{n}{\big(\hat{y}_i - y_i\big)^2}}$$ 125 | 126 | -------------------------------------------------------------------------------- /equations.tex: 
-------------------------------------------------------------------------------- 1 | \documentclass[fleqn,9pt]{article} 2 | 3 | %\documentclass[a4paper]{article} 4 | % 5 | % https://blmoistawinde.github.io/ml_equations_latex/ 6 | 7 | \usepackage{amsmath,amssymb} 8 | 9 | %\usepackage{multicol} 10 | 11 | \usepackage{titling} 12 | \date{} % no date 13 | 14 | % https://kb.mit.edu/confluence/pages/viewpage.action?pageId=3907057 15 | \usepackage[margin=0.1in]{geometry} 16 | 17 | %\geometry{ 18 | % margin=0.55in, 19 | % top=0.35in, 20 | % bottom=0.35in, 21 | %} 22 | 23 | \usepackage{tikz} 24 | \usetikzlibrary{ 25 | arrows, 26 | arrows.meta, 27 | positioning, 28 | } 29 | 30 | \setlength{\parindent}{0pt} 31 | \setlength{\parskip}{1\baselineskip} 32 | \setlength{\mathindent}{0pt} % fleqn option needed 33 | 34 | % https://tex.stackexchange.com/questions/24561/setting-the-column-gap-in-a-twocolumn-or-multicol-document 35 | %\setlength{\columnsep}{3cm} 36 | 37 | \setlength{\droptitle}{-5em} % This is your set screw 38 | 39 | \newcommand{\indep}{\perp \!\!\! 
\perp} 40 | 41 | \title{Machine Learning Advanced Topics Group Formulas \vspace{-6em}} 42 | 43 | %\pagestyle{empty} 44 | 45 | \begin{document} 46 | \maketitle 47 | 48 | 49 | % Basic use of minipage environment 50 | 51 | %\noindent 52 | \begin{minipage}[t]{0.33\textwidth} 53 | 54 | \textbf{Linear Algebra} 55 | 56 | \begin{equation*} 57 | \begin{split} 58 | A(B+C) = AB + AC\\ 59 | A(BC) = (AB)C \\ 60 | (AB)^T =B^TA^T\\ 61 | A^TA = AA^T = I \implies A^T = A^{-1}\\ 62 | x^Ty = y^Tx\\ 63 | x^Tx - inner\\ 64 | xx^T - outer\\ 65 | \end{split} 66 | \end{equation*} 67 | 68 | \begin{multline*} 69 | \Phi: V \to W \\ 70 | \Phi(x+y) = \Phi(x) + \Phi(y)\; (2.85) \\ 71 | \Phi(\lambda x) = \lambda \Phi(x)\\ 72 | \forall x,y\in V, \forall \lambda, \psi \in \mathbb{R} : \Phi(\lambda x + \psi y) 73 | \end{multline*} 74 | 75 | \begin{equation*} 76 | P_\pi=\frac{bb^T}{\|b\|^2} (3.46) \\ 77 | \end{equation*} 78 | 79 | \begin{multline*} 80 | ker(\Phi) = \Phi^{-1}(0_W) 81 | = \{v\in V: \Phi(v) =0\} (2.122)\\ 82 | Im(\Phi) = \Phi(V) 83 | = \{w \in W | \exists v\in V: \Phi(v) = w \}\\ 84 | \end{multline*} 85 | 86 | \begin{equation*} 87 | \begin{split} 88 | &\|x\|_1 := \sum_i^n |x_i| =\ell_1 \;(3.3)\\ 89 | &\|x\|_2 := \sqrt{\sum_i^n x_i^2} = \sqrt{\textbf{x}^T \textbf{x}} =\ell_2 \;(3.4)\\ 90 | &\forall x \in V\setminus\{0\}: x^TAx > 0\;(3.11)\\ 91 | &\|x\| := \sqrt{\langle x,x\rangle}\;(3.16)\\ 92 | &A = P D P^{-1}\;(4.55)\\ 93 | &A^k =(P D P^{-1})^k = P D^k P^{-1}\;(4.62)\\ 94 | &A = U\Sigma V^T\\ 95 | \end{split} 96 | \end{equation*} 97 | 98 | \textbf{Probability} 99 | %\begin{center} 100 | \begin{equation*} 101 | \begin{split} 102 | p(x,y) = p(x)p(y) - x \indep y\\ 103 | p(x|y) = p(x) - x \indep y\\ 104 | \mathbb{V}_{X,Y} = \mathbb{V}_X[x] + \mathbb{V}_Y[y] - x \indep y\\ 105 | Cov(x,y) = 0 - x \indep y\\ 106 | % Product Rule (6.22) p184 107 | p(x,y) = p(y|x)p(x) - (6.22)\\ 108 | p(x|y) = \frac{p(y|x)p(x)}{p(y)} - (6.23)\\ 109 | Cov[x,y] = 
\mathbb{E}[xy]-\mathbb{E}[x]\mathbb{E}[y]\;(6.36)\\ 110 | corr[x,y] = \frac{Cov[x,y]}{\sqrt{\mathbb{V}[x] \mathbb{V}[y]}}\;(6.40)\\ 111 | X\sim \mathcal{N}(\mu,\sigma^2)\\ 112 | X\sim \mathcal{N}(\boldsymbol{\mu}, \boldsymbol{\Sigma})\\ 113 | X \sim U(0, 1)\\ 114 | \end{split} 115 | \end{equation*} 116 | %\end{center} 117 | 118 | \end{minipage}% <---------------- Note the use of "%" 119 | \begin{minipage}[t]{0.33\textwidth} 120 | 121 | 122 | 123 | \textbf{Optimization} 124 | % Jensen's Inequality (7.30) p236 125 | \begin{equation*} 126 | f(\theta x + (1-\theta)y) \le \theta f(x) + (1-\theta)f(y) 127 | \end{equation*} 128 | 129 | \textbf{Linear Regression} 130 | 131 | \begin{equation*} 132 | \begin{split} 133 | \hat{y} = \beta_0 + \beta_1 x_1 \cdots + \beta_n x_n \\ 134 | \boldsymbol{Y} = \boldsymbol{x}^T \boldsymbol{\beta} \\ 135 | \ell_{L1} = Error(Y - \widehat{Y}) + \lambda \sum_1^n |w_i|\\ 136 | \ell_{L2} = Error(Y - \widehat{Y}) + \lambda \sum_1^n w_i^{2}\\ 137 | \end{split} 138 | \end{equation*} 139 | 140 | \textbf{Logistic Regression} 141 | 142 | \begin{equation*} 143 | odds = \frac{p(X)}{1-p(X)} 144 | \end{equation*} 145 | 146 | % ISLR (4.2)) 147 | 148 | \begin{equation*} 149 | \phi(x) = \frac{1}{1+e^{-x}} 150 | \end{equation*} 151 | 152 | 153 | \begin{equation*} 154 | p(x) = \frac{e^{\beta_0 + \beta_1 x_1 \cdots + \beta_p x_p}}{1+e^{\beta_0 + \beta_1 x_1 \cdots + \beta_p x_p}} 155 | \end{equation*} 156 | 157 | \begin{equation*} 158 | \begin{split} 159 | \text{BCE} =& -(y\,log(p)+(1-y)log(1-p))\\ 160 | \text{MLE} =& \max_{\theta} \prod_y p(y;\theta) 161 | \end{split} 162 | \end{equation*} 163 | 164 | 165 | \begin{equation*} 166 | \begin{split} 167 | &\text{Accuracy} = \frac{TP+TN}{TP+TN+FP+FN}\\ 168 | &\text{Precision} = \frac{TP}{TP+FP}\\ 169 | &\text{Recall} = \frac{TP}{TP+FN}\\ 170 | &\text{Sensitivity} = Recall = \frac{TP}{TP+FN}\\ 171 | &\text{Specificity} = \frac{TN}{FP+TN}\\ 172 | &\text{F1} = \frac{2*Precision*Recall}{Precision+Recall} \\ 173 | &= 
\frac{2*TP}{2*TP+FP+FN} 174 | \end{split} 175 | \end{equation*} 176 | 177 | \end{minipage}% <---------------- Note the use of "%" 178 | \begin{minipage}[t]{.33333\textwidth} 179 | 180 | 181 | \textbf{Principal Components} 182 | \\\\ 183 | 184 | \begin{equation*} 185 | V_M = \sum_{m=1}^{M}\lambda_m \; (10.24) 186 | \end{equation*} 187 | 188 | \textbf{GMM} 189 | \begin{equation*} 190 | \begin{split} 191 | p(x|\theta) = \sum_{k=1}^K \pi_k \mathcal{N}(x|\mu_k,\Sigma_k)\; (11.3)\\ 192 | 0 \le \pi_k \le 1, \sum_{k=1}^K \pi_k = 1\;(11.4) 193 | \end{split} 194 | \end{equation*} 195 | 196 | \textbf{Support Vector Kernels} 197 | \\\\ 198 | Kernel trick 199 | \\\\ 200 | 201 | \begin{equation*} 202 | \text{hinge} = max(0, 1 - y \cdot \hat{y}) 203 | \end{equation*} 204 | 205 | \textbf{Information Theory} 206 | 207 | % Negative entropy - Example 7.3 p237 208 | 209 | \begin{equation*} 210 | f(x) = x\,log_2\,x 211 | \end{equation*} 212 | 213 | \textbf{Misc} 214 | 215 | \begin{equation*} 216 | t = \frac{\bar{X} - \mu}{\frac{\hat{\sigma}}{\sqrt{n}}} 217 | \end{equation*} 218 | 219 | 220 | \begin{equation*} 221 | \text{Gini} = \sum_{k=1}^{K}\hat{p}_{mk}(1-\hat{p}_{mk}) 222 | \end{equation*} 223 | 224 | \begin{equation*} 225 | \text{CE} = -\sum_{k=1}^{K}\hat{p}_{mk}\log _2 \hat{p}_{mk} ?? 
226 | \end{equation*} 227 | 228 | % 229 | 230 | \begin{equation*} 231 | KL(\hat{y} || y) = \sum_{c=1}^{M}\hat{y}_c \log{\frac{\hat{y}_c}{y_c}} 232 | \end{equation*} 233 | 234 | \begin{equation*} 235 | cos(x,y) = \frac{x \cdot y}{\|x\|\|y\|} 236 | \end{equation*} 237 | 238 | \textbf{Distributions} 239 | 240 | \begin{equation*} 241 | \begin{split} 242 | f(x) &= \frac{1}{\sigma\sqrt{2\pi}}\exp(-\frac{1}{2\sigma^2}(x-\mu)^2) \\ 243 | \\ 244 | f(\boldsymbol{x}) &= \frac{1}{\sqrt{(2\pi)^n det(\Sigma)}} \exp(-\frac{1}{2}(\boldsymbol{x}-\boldsymbol{\mu})^T \Sigma^{-1} (\boldsymbol{x}-\boldsymbol{\mu})) \\ 245 | \end{split} 246 | \end{equation*} 247 | \end{minipage} 248 | 249 | 250 | \end{document} 251 | -------------------------------------------------------------------------------- /house-prices-polynomial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "fab01917", 6 | "metadata": { 7 | "papermill": { 8 | "duration": 0.010612, 9 | "end_time": "2022-06-27T14:33:20.046427", 10 | "exception": false, 11 | "start_time": "2022-06-27T14:33:20.035815", 12 | "status": "completed" 13 | }, 14 | "tags": [] 15 | }, 16 | "source": [ 17 | "

House Prices: Linear Regression with Polynomials

\n", 18 | "
\n", 19 | "\n", 20 | "Lesson: Linear/Lasso/Ridge Regression with Polynomials\n", 21 | "\n", 22 | "\n", 23 | "|Notebook| MAE | LeaderBoard|\n", 24 | "| --- | --- | --- |\n", 25 | "|QuickStart|38341.2045|0.29234|\n", 26 | "|Extra Features|32285.7959|0.24425|\n", 27 | "|Features + Lasso|31349.8387|0.24425|\n", 28 | "|Features + Ridge|31348.1429|0.24422|\n", 29 | "\n" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "id": "0396fe2f", 35 | "metadata": { 36 | "papermill": { 37 | "duration": 0.008801, 38 | "end_time": "2022-06-27T14:33:20.064455", 39 | "exception": false, 40 | "start_time": "2022-06-27T14:33:20.055654", 41 | "status": "completed" 42 | }, 43 | "tags": [] 44 | }, 45 | "source": [ 46 | "

Import Libraries

\n", 47 | "
\n", 48 | "\n", 49 | "A best practice is to include all libraries here. However, I will put a few imports farther down where they are first used so beginners can learn with an \"as needed\" approach." 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 1, 55 | "id": "f4eef50c", 56 | "metadata": { 57 | "execution": { 58 | "iopub.execute_input": "2022-06-27T14:33:20.085582Z", 59 | "iopub.status.busy": "2022-06-27T14:33:20.085136Z", 60 | "iopub.status.idle": "2022-06-27T14:33:20.098792Z", 61 | "shell.execute_reply": "2022-06-27T14:33:20.097844Z" 62 | }, 63 | "papermill": { 64 | "duration": 0.027294, 65 | "end_time": "2022-06-27T14:33:20.101609", 66 | "exception": false, 67 | "start_time": "2022-06-27T14:33:20.074315", 68 | "status": "completed" 69 | }, 70 | "tags": [] 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "import numpy as np # linear algebra\n", 75 | "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", 76 | "\n", 77 | "from pathlib import Path\n", 78 | "\n", 79 | "pd.options.display.max_columns = 100 # Want to view all the columns" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "id": "cb998522", 85 | "metadata": { 86 | "papermill": { 87 | "duration": 0.008833, 88 | "end_time": "2022-06-27T14:33:20.119705", 89 | "exception": false, 90 | "start_time": "2022-06-27T14:33:20.110872", 91 | "status": "completed" 92 | }, 93 | "tags": [] 94 | }, 95 | "source": [ 96 | "

Library

\n", 97 | "
\n", 98 | "\n", 99 | "Creating a few functions that we will reuse in each project." 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 2, 105 | "id": "8808e38c", 106 | "metadata": { 107 | "execution": { 108 | "iopub.execute_input": "2022-06-27T14:33:20.141011Z", 109 | "iopub.status.busy": "2022-06-27T14:33:20.139692Z", 110 | "iopub.status.idle": "2022-06-27T14:33:20.147103Z", 111 | "shell.execute_reply": "2022-06-27T14:33:20.146204Z" 112 | }, 113 | "papermill": { 114 | "duration": 0.020854, 115 | "end_time": "2022-06-27T14:33:20.149577", 116 | "exception": false, 117 | "start_time": "2022-06-27T14:33:20.128723", 118 | "status": "completed" 119 | }, 120 | "tags": [] 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "def read_data(path):\n", 125 | " data_dir = Path(path)\n", 126 | "\n", 127 | " train = pd.read_csv(data_dir / \"train.csv\")\n", 128 | " test = pd.read_csv(data_dir / \"test.csv\")\n", 129 | " submission_df = pd.read_csv(data_dir / \"sample_submission.csv\")\n", 130 | "\n", 131 | " print(f\"train data: Rows={train.shape[0]}, Columns={train.shape[1]}\")\n", 132 | " print(f\"test data : Rows={test.shape[0]}, Columns={test.shape[1]}\")\n", 133 | " return train, test, submission_df" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 3, 139 | "id": "7b472a33", 140 | "metadata": { 141 | "execution": { 142 | "iopub.execute_input": "2022-06-27T14:33:20.174270Z", 143 | "iopub.status.busy": "2022-06-27T14:33:20.173005Z", 144 | "iopub.status.idle": "2022-06-27T14:33:20.181376Z", 145 | "shell.execute_reply": "2022-06-27T14:33:20.179855Z" 146 | }, 147 | "papermill": { 148 | "duration": 0.023282, 149 | "end_time": "2022-06-27T14:33:20.184192", 150 | "exception": false, 151 | "start_time": "2022-06-27T14:33:20.160910", 152 | "status": "completed" 153 | }, 154 | "tags": [] 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "def create_submission(model_name, target, preds):\n", 159 | " sample_submission[target] = 
preds\n", 160 | " if len(model_name) > 0:\n", 161 | " sample_submission.to_csv(f\"submission_{model_name}.csv\", index=False)\n", 162 | " else:\n", 163 | " sample_submission.to_csv(f\"submission.csv\", index=False)\n", 164 | "\n", 165 | " return sample_submission[:5]" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 4, 171 | "id": "571bf3eb", 172 | "metadata": { 173 | "execution": { 174 | "iopub.execute_input": "2022-06-27T14:33:20.204805Z", 175 | "iopub.status.busy": "2022-06-27T14:33:20.203868Z", 176 | "iopub.status.idle": "2022-06-27T14:33:21.385845Z", 177 | "shell.execute_reply": "2022-06-27T14:33:21.384595Z" 178 | }, 179 | "papermill": { 180 | "duration": 1.19507, 181 | "end_time": "2022-06-27T14:33:21.388644", 182 | "exception": false, 183 | "start_time": "2022-06-27T14:33:20.193574", 184 | "status": "completed" 185 | }, 186 | "tags": [] 187 | }, 188 | "outputs": [], 189 | "source": [ 190 | "from sklearn.metrics import mean_squared_error\n", 191 | "from sklearn.metrics import mean_absolute_error\n", 192 | "\n", 193 | "def show_scores(gt, yhat):\n", 194 | " rmse = mean_squared_error(gt, yhat)\n", 195 | " mae = mean_absolute_error(gt, yhat)\n", 196 | "\n", 197 | " print(f\"MAE: {mae:.4f}\")\n", 198 | " print(f\"RMSE: {rmse:.4f}\")" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "id": "0b2defc5", 204 | "metadata": { 205 | "papermill": { 206 | "duration": 0.00892, 207 | "end_time": "2022-06-27T14:33:21.407232", 208 | "exception": false, 209 | "start_time": "2022-06-27T14:33:21.398312", 210 | "status": "completed" 211 | }, 212 | "tags": [] 213 | }, 214 | "source": [ 215 | "

Load Train/Test Data

\n", 216 | "
\n", 217 | "\n", 218 | "- train.csv - Data used to build our machine learning model\n", 219 | "- test.csv - Data used to generate the predictions we submit. Does not contain the target variable\n", 220 | "- sample_submission.csv - A file in the proper format to submit test predictions" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 5, 226 | "id": "17e4733d", 227 | "metadata": { 228 | "execution": { 229 | "iopub.execute_input": "2022-06-27T14:33:21.427215Z", 230 | "iopub.status.busy": "2022-06-27T14:33:21.426760Z", 231 | "iopub.status.idle": "2022-06-27T14:33:21.516146Z", 232 | "shell.execute_reply": "2022-06-27T14:33:21.514835Z" 233 | }, 234 | "papermill": { 235 | "duration": 0.103035, 236 | "end_time": "2022-06-27T14:33:21.519274", 237 | "exception": false, 238 | "start_time": "2022-06-27T14:33:21.416239", 239 | "status": "completed" 240 | }, 241 | "tags": [] 242 | }, 243 | "outputs": [ 244 | { 245 | "name": "stdout", 246 | "output_type": "stream", 247 | "text": [ 248 | "train data: Rows=1460, Columns=81\n", 249 | "test data : Rows=1459, Columns=80\n" 250 | ] 251 | } 252 | ], 253 | "source": [ 254 | "train, test, sample_submission = read_data(\"../input/house-prices-advanced-regression-techniques\")" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 6, 260 | "id": "df1c32e3", 261 | "metadata": { 262 | "execution": { 263 | "iopub.execute_input": "2022-06-27T14:33:21.540241Z", 264 | "iopub.status.busy": "2022-06-27T14:33:21.539429Z", 265 | "iopub.status.idle": "2022-06-27T14:33:21.604080Z", 266 | "shell.execute_reply": "2022-06-27T14:33:21.602612Z" 267 | }, 268 | "papermill": { 269 | "duration": 0.078633, 270 | "end_time": "2022-06-27T14:33:21.607331", 271 | "exception": false, 272 | "start_time": "2022-06-27T14:33:21.528698", 273 | "status": "completed" 274 | }, 275 | "tags": [] 276 | }, 277 | "outputs": [ 278 | { 279 | "data": { 280 | "text/html": [ 281 | "
\n", 282 | "\n", 295 | "\n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " 
\n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | 
" \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 
| " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | "
IdMSSubClassMSZoningLotFrontageLotAreaStreetAlleyLotShapeLandContourUtilitiesLotConfigLandSlopeNeighborhoodCondition1Condition2BldgTypeHouseStyleOverallQualOverallCondYearBuiltYearRemodAddRoofStyleRoofMatlExterior1stExterior2ndMasVnrTypeMasVnrAreaExterQualExterCondFoundationBsmtQualBsmtCondBsmtExposureBsmtFinType1BsmtFinSF1BsmtFinType2BsmtFinSF2BsmtUnfSFTotalBsmtSFHeatingHeatingQCCentralAirElectrical1stFlrSF2ndFlrSFLowQualFinSFGrLivAreaBsmtFullBathBsmtHalfBathFullBathHalfBathBedroomAbvGrKitchenAbvGrKitchenQualTotRmsAbvGrdFunctionalFireplacesFireplaceQuGarageTypeGarageYrBltGarageFinishGarageCarsGarageAreaGarageQualGarageCondPavedDriveWoodDeckSFOpenPorchSFEnclosedPorch3SsnPorchScreenPorchPoolAreaPoolQCFenceMiscFeatureMiscValMoSoldYrSoldSaleTypeSaleConditionSalePrice
0160RL65.08450PaveNaNRegLvlAllPubInsideGtlCollgCrNormNorm1Fam2Story7520032003GableCompShgVinylSdVinylSdBrkFace196.0GdTAPConcGdTANoGLQ706Unf0150856GasAExYSBrkr85685401710102131Gd8Typ0NaNAttchd2003.0RFn2548TATAY0610000NaNNaNNaN022008WDNormal208500
1220RL80.09600PaveNaNRegLvlAllPubFR2GtlVeenkerFeedrNorm1Fam1Story6819761976GableCompShgMetalSdMetalSdNone0.0TATACBlockGdTAGdALQ978Unf02841262GasAExYSBrkr1262001262012031TA6Typ1TAAttchd1976.0RFn2460TATAY29800000NaNNaNNaN052007WDNormal181500
2360RL68.011250PaveNaNIR1LvlAllPubInsideGtlCollgCrNormNorm1Fam2Story7520012002GableCompShgVinylSdVinylSdBrkFace162.0GdTAPConcGdTAMnGLQ486Unf0434920GasAExYSBrkr92086601786102131Gd6Typ1TAAttchd2001.0RFn2608TATAY0420000NaNNaNNaN092008WDNormal223500
3470RL60.09550PaveNaNIR1LvlAllPubCornerGtlCrawforNormNorm1Fam2Story7519151970GableCompShgWd SdngWd ShngNone0.0TATABrkTilTAGdNoALQ216Unf0540756GasAGdYSBrkr96175601717101031Gd7Typ1GdDetchd1998.0Unf3642TATAY035272000NaNNaNNaN022006WDAbnorml140000
4560RL84.014260PaveNaNIR1LvlAllPubFR2GtlNoRidgeNormNorm1Fam2Story8520002000GableCompShgVinylSdVinylSdBrkFace350.0GdTAPConcGdTAAvGLQ655Unf04901145GasAExYSBrkr1145105302198102141Gd9Typ1TAAttchd2000.0RFn3836TATAY192840000NaNNaNNaN0122008WDNormal250000
\n", 805 | "
" 806 | ], 807 | "text/plain": [ 808 | " Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n", 809 | "0 1 60 RL 65.0 8450 Pave NaN Reg \n", 810 | "1 2 20 RL 80.0 9600 Pave NaN Reg \n", 811 | "2 3 60 RL 68.0 11250 Pave NaN IR1 \n", 812 | "3 4 70 RL 60.0 9550 Pave NaN IR1 \n", 813 | "4 5 60 RL 84.0 14260 Pave NaN IR1 \n", 814 | "\n", 815 | " LandContour Utilities LotConfig LandSlope Neighborhood Condition1 \\\n", 816 | "0 Lvl AllPub Inside Gtl CollgCr Norm \n", 817 | "1 Lvl AllPub FR2 Gtl Veenker Feedr \n", 818 | "2 Lvl AllPub Inside Gtl CollgCr Norm \n", 819 | "3 Lvl AllPub Corner Gtl Crawfor Norm \n", 820 | "4 Lvl AllPub FR2 Gtl NoRidge Norm \n", 821 | "\n", 822 | " Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt \\\n", 823 | "0 Norm 1Fam 2Story 7 5 2003 \n", 824 | "1 Norm 1Fam 1Story 6 8 1976 \n", 825 | "2 Norm 1Fam 2Story 7 5 2001 \n", 826 | "3 Norm 1Fam 2Story 7 5 1915 \n", 827 | "4 Norm 1Fam 2Story 8 5 2000 \n", 828 | "\n", 829 | " YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType \\\n", 830 | "0 2003 Gable CompShg VinylSd VinylSd BrkFace \n", 831 | "1 1976 Gable CompShg MetalSd MetalSd None \n", 832 | "2 2002 Gable CompShg VinylSd VinylSd BrkFace \n", 833 | "3 1970 Gable CompShg Wd Sdng Wd Shng None \n", 834 | "4 2000 Gable CompShg VinylSd VinylSd BrkFace \n", 835 | "\n", 836 | " MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure \\\n", 837 | "0 196.0 Gd TA PConc Gd TA No \n", 838 | "1 0.0 TA TA CBlock Gd TA Gd \n", 839 | "2 162.0 Gd TA PConc Gd TA Mn \n", 840 | "3 0.0 TA TA BrkTil TA Gd No \n", 841 | "4 350.0 Gd TA PConc Gd TA Av \n", 842 | "\n", 843 | " BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF \\\n", 844 | "0 GLQ 706 Unf 0 150 856 \n", 845 | "1 ALQ 978 Unf 0 284 1262 \n", 846 | "2 GLQ 486 Unf 0 434 920 \n", 847 | "3 ALQ 216 Unf 0 540 756 \n", 848 | "4 GLQ 655 Unf 0 490 1145 \n", 849 | "\n", 850 | " Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF 
LowQualFinSF \\\n", 851 | "0 GasA Ex Y SBrkr 856 854 0 \n", 852 | "1 GasA Ex Y SBrkr 1262 0 0 \n", 853 | "2 GasA Ex Y SBrkr 920 866 0 \n", 854 | "3 GasA Gd Y SBrkr 961 756 0 \n", 855 | "4 GasA Ex Y SBrkr 1145 1053 0 \n", 856 | "\n", 857 | " GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr \\\n", 858 | "0 1710 1 0 2 1 3 \n", 859 | "1 1262 0 1 2 0 3 \n", 860 | "2 1786 1 0 2 1 3 \n", 861 | "3 1717 1 0 1 0 3 \n", 862 | "4 2198 1 0 2 1 4 \n", 863 | "\n", 864 | " KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu \\\n", 865 | "0 1 Gd 8 Typ 0 NaN \n", 866 | "1 1 TA 6 Typ 1 TA \n", 867 | "2 1 Gd 6 Typ 1 TA \n", 868 | "3 1 Gd 7 Typ 1 Gd \n", 869 | "4 1 Gd 9 Typ 1 TA \n", 870 | "\n", 871 | " GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual \\\n", 872 | "0 Attchd 2003.0 RFn 2 548 TA \n", 873 | "1 Attchd 1976.0 RFn 2 460 TA \n", 874 | "2 Attchd 2001.0 RFn 2 608 TA \n", 875 | "3 Detchd 1998.0 Unf 3 642 TA \n", 876 | "4 Attchd 2000.0 RFn 3 836 TA \n", 877 | "\n", 878 | " GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch \\\n", 879 | "0 TA Y 0 61 0 0 \n", 880 | "1 TA Y 298 0 0 0 \n", 881 | "2 TA Y 0 42 0 0 \n", 882 | "3 TA Y 0 35 272 0 \n", 883 | "4 TA Y 192 84 0 0 \n", 884 | "\n", 885 | " ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold \\\n", 886 | "0 0 0 NaN NaN NaN 0 2 2008 \n", 887 | "1 0 0 NaN NaN NaN 0 5 2007 \n", 888 | "2 0 0 NaN NaN NaN 0 9 2008 \n", 889 | "3 0 0 NaN NaN NaN 0 2 2006 \n", 890 | "4 0 0 NaN NaN NaN 0 12 2008 \n", 891 | "\n", 892 | " SaleType SaleCondition SalePrice \n", 893 | "0 WD Normal 208500 \n", 894 | "1 WD Normal 181500 \n", 895 | "2 WD Normal 223500 \n", 896 | "3 WD Abnorml 140000 \n", 897 | "4 WD Normal 250000 " 898 | ] 899 | }, 900 | "execution_count": 6, 901 | "metadata": {}, 902 | "output_type": "execute_result" 903 | } 904 | ], 905 | "source": [ 906 | "train.head()" 907 | ] 908 | }, 909 | { 910 | "cell_type": "markdown", 911 | "id": "9ac02e56", 912 
| "metadata": { 913 | "papermill": { 914 | "duration": 0.011106, 915 | "end_time": "2022-06-27T14:33:21.628823", 916 | "exception": false, 917 | "start_time": "2022-06-27T14:33:21.617717", 918 | "status": "completed" 919 | }, 920 | "tags": [] 921 | }, 922 | "source": [ 923 | "In supervised learning problems, we have a label or target." 924 | ] 925 | }, 926 | { 927 | "cell_type": "code", 928 | "execution_count": 7, 929 | "id": "916cb0e6", 930 | "metadata": { 931 | "execution": { 932 | "iopub.execute_input": "2022-06-27T14:33:21.651599Z", 933 | "iopub.status.busy": "2022-06-27T14:33:21.651167Z", 934 | "iopub.status.idle": "2022-06-27T14:33:21.656718Z", 935 | "shell.execute_reply": "2022-06-27T14:33:21.655446Z" 936 | }, 937 | "papermill": { 938 | "duration": 0.01971, 939 | "end_time": "2022-06-27T14:33:21.659150", 940 | "exception": false, 941 | "start_time": "2022-06-27T14:33:21.639440", 942 | "status": "completed" 943 | }, 944 | "tags": [] 945 | }, 946 | "outputs": [], 947 | "source": [ 948 | "TARGET = \"SalePrice\"" 949 | ] 950 | }, 951 | { 952 | "cell_type": "markdown", 953 | "id": "4929e49e", 954 | "metadata": { 955 | "papermill": { 956 | "duration": 0.010903, 957 | "end_time": "2022-06-27T14:33:21.680370", 958 | "exception": false, 959 | "start_time": "2022-06-27T14:33:21.669467", 960 | "status": "completed" 961 | }, 962 | "tags": [] 963 | }, 964 | "source": [ 965 | "There are 79 features, but to keep it simple we are only going to start with four." 
966 | ] 967 | }, 968 | { 969 | "cell_type": "code", 970 | "execution_count": 8, 971 | "id": "f05f52d4", 972 | "metadata": { 973 | "execution": { 974 | "iopub.execute_input": "2022-06-27T14:33:21.703880Z", 975 | "iopub.status.busy": "2022-06-27T14:33:21.701881Z", 976 | "iopub.status.idle": "2022-06-27T14:33:21.707951Z", 977 | "shell.execute_reply": "2022-06-27T14:33:21.707041Z" 978 | }, 979 | "papermill": { 980 | "duration": 0.02067, 981 | "end_time": "2022-06-27T14:33:21.711365", 982 | "exception": false, 983 | "start_time": "2022-06-27T14:33:21.690695", 984 | "status": "completed" 985 | }, 986 | "tags": [] 987 | }, 988 | "outputs": [], 989 | "source": [ 990 | "FEATURES = [\"GrLivArea\", \"LotArea\", \"TotalBsmtSF\", \"FullBath\"]" 991 | ] 992 | }, 993 | { 994 | "cell_type": "markdown", 995 | "id": "bdb0436d", 996 | "metadata": { 997 | "papermill": { 998 | "duration": 0.010627, 999 | "end_time": "2022-06-27T14:33:21.732547", 1000 | "exception": false, 1001 | "start_time": "2022-06-27T14:33:21.721920", 1002 | "status": "completed" 1003 | }, 1004 | "tags": [] 1005 | }, 1006 | "source": [ 1007 | "

Missing Data

\n", 1008 | "
" 1009 | ] 1010 | }, 1011 | { 1012 | "cell_type": "code", 1013 | "execution_count": 9, 1014 | "id": "bdedcfbb", 1015 | "metadata": { 1016 | "execution": { 1017 | "iopub.execute_input": "2022-06-27T14:33:21.755744Z", 1018 | "iopub.status.busy": "2022-06-27T14:33:21.755371Z", 1019 | "iopub.status.idle": "2022-06-27T14:33:21.772964Z", 1020 | "shell.execute_reply": "2022-06-27T14:33:21.771995Z" 1021 | }, 1022 | "papermill": { 1023 | "duration": 0.032753, 1024 | "end_time": "2022-06-27T14:33:21.776302", 1025 | "exception": false, 1026 | "start_time": "2022-06-27T14:33:21.743549", 1027 | "status": "completed" 1028 | }, 1029 | "tags": [] 1030 | }, 1031 | "outputs": [ 1032 | { 1033 | "name": "stdout", 1034 | "output_type": "stream", 1035 | "text": [ 1036 | "===== Train =====\n", 1037 | "GrLivArea 0\n", 1038 | "LotArea 0\n", 1039 | "TotalBsmtSF 0\n", 1040 | "FullBath 0\n", 1041 | "dtype: int64\n", 1042 | "===== Test =====\n", 1043 | "GrLivArea 0\n", 1044 | "LotArea 0\n", 1045 | "TotalBsmtSF 1\n", 1046 | "FullBath 0\n", 1047 | "dtype: int64\n" 1048 | ] 1049 | } 1050 | ], 1051 | "source": [ 1052 | "print(5*\"=\",\"Train\", 5*\"=\")\n", 1053 | "print(train[FEATURES].isnull().sum())\n", 1054 | "print(5*\"=\",\"Test\", 5*\"=\")\n", 1055 | "print(test[FEATURES].isnull().sum())" 1056 | ] 1057 | }, 1058 | { 1059 | "cell_type": "code", 1060 | "execution_count": 10, 1061 | "id": "fc228691", 1062 | "metadata": { 1063 | "execution": { 1064 | "iopub.execute_input": "2022-06-27T14:33:21.799903Z", 1065 | "iopub.status.busy": "2022-06-27T14:33:21.799521Z", 1066 | "iopub.status.idle": "2022-06-27T14:33:21.806964Z", 1067 | "shell.execute_reply": "2022-06-27T14:33:21.805943Z" 1068 | }, 1069 | "papermill": { 1070 | "duration": 0.021904, 1071 | "end_time": "2022-06-27T14:33:21.809334", 1072 | "exception": false, 1073 | "start_time": "2022-06-27T14:33:21.787430", 1074 | "status": "completed" 1075 | }, 1076 | "tags": [] 1077 | }, 1078 | "outputs": [], 1079 | "source": [ 1080 | 
"test[\"TotalBsmtSF\"] = test[\"TotalBsmtSF\"].fillna(0)" 1081 | ] 1082 | }, 1083 | { 1084 | "cell_type": "markdown", 1085 | "id": "860d1708", 1086 | "metadata": { 1087 | "papermill": { 1088 | "duration": 0.010104, 1089 | "end_time": "2022-06-27T14:33:21.829528", 1090 | "exception": false, 1091 | "start_time": "2022-06-27T14:33:21.819424", 1092 | "status": "completed" 1093 | }, 1094 | "tags": [] 1095 | }, 1096 | "source": [ 1097 | "## Verify No Missing Data" 1098 | ] 1099 | }, 1100 | { 1101 | "cell_type": "code", 1102 | "execution_count": 11, 1103 | "id": "62f6bac1", 1104 | "metadata": { 1105 | "execution": { 1106 | "iopub.execute_input": "2022-06-27T14:33:21.853514Z", 1107 | "iopub.status.busy": "2022-06-27T14:33:21.852283Z", 1108 | "iopub.status.idle": "2022-06-27T14:33:21.862129Z", 1109 | "shell.execute_reply": "2022-06-27T14:33:21.860798Z" 1110 | }, 1111 | "papermill": { 1112 | "duration": 0.025448, 1113 | "end_time": "2022-06-27T14:33:21.865762", 1114 | "exception": false, 1115 | "start_time": "2022-06-27T14:33:21.840314", 1116 | "status": "completed" 1117 | }, 1118 | "tags": [] 1119 | }, 1120 | "outputs": [ 1121 | { 1122 | "name": "stdout", 1123 | "output_type": "stream", 1124 | "text": [ 1125 | "GrLivArea 0\n", 1126 | "LotArea 0\n", 1127 | "TotalBsmtSF 0\n", 1128 | "FullBath 0\n", 1129 | "dtype: int64\n" 1130 | ] 1131 | } 1132 | ], 1133 | "source": [ 1134 | "print(test[FEATURES].isnull().sum())" 1135 | ] 1136 | }, 1137 | { 1138 | "cell_type": "code", 1139 | "execution_count": 12, 1140 | "id": "9d40cd69", 1141 | "metadata": { 1142 | "execution": { 1143 | "iopub.execute_input": "2022-06-27T14:33:21.889625Z", 1144 | "iopub.status.busy": "2022-06-27T14:33:21.888754Z", 1145 | "iopub.status.idle": "2022-06-27T14:33:21.895593Z", 1146 | "shell.execute_reply": "2022-06-27T14:33:21.894499Z" 1147 | }, 1148 | "papermill": { 1149 | "duration": 0.021672, 1150 | "end_time": "2022-06-27T14:33:21.898252", 1151 | "exception": false, 1152 | "start_time": 
"2022-06-27T14:33:21.876580", 1153 | "status": "completed" 1154 | }, 1155 | "tags": [] 1156 | }, 1157 | "outputs": [], 1158 | "source": [ 1159 | "y = train[TARGET]\n", 1160 | "X = train[FEATURES].copy()\n", 1161 | "\n", 1162 | "X_test = test[FEATURES].copy()" 1163 | ] 1164 | }, 1165 | { 1166 | "cell_type": "code", 1167 | "execution_count": 13, 1168 | "id": "af1277ce", 1169 | "metadata": { 1170 | "execution": { 1171 | "iopub.execute_input": "2022-06-27T14:33:21.920841Z", 1172 | "iopub.status.busy": "2022-06-27T14:33:21.920234Z", 1173 | "iopub.status.idle": "2022-06-27T14:33:21.929466Z", 1174 | "shell.execute_reply": "2022-06-27T14:33:21.928656Z" 1175 | }, 1176 | "papermill": { 1177 | "duration": 0.022972, 1178 | "end_time": "2022-06-27T14:33:21.931706", 1179 | "exception": false, 1180 | "start_time": "2022-06-27T14:33:21.908734", 1181 | "status": "completed" 1182 | }, 1183 | "tags": [] 1184 | }, 1185 | "outputs": [ 1186 | { 1187 | "data": { 1188 | "text/html": [ 1189 | "
\n", 1190 | "\n", 1203 | "\n", 1204 | " \n", 1205 | " \n", 1206 | " \n", 1207 | " \n", 1208 | " \n", 1209 | " \n", 1210 | " \n", 1211 | " \n", 1212 | " \n", 1213 | " \n", 1214 | " \n", 1215 | " \n", 1216 | " \n", 1217 | " \n", 1218 | " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | "
GrLivAreaLotAreaTotalBsmtSFFullBath
0171084508562
11262960012622
21786112509202
3171795507561
421981426011452
\n", 1251 | "
" 1252 | ], 1253 | "text/plain": [ 1254 | " GrLivArea LotArea TotalBsmtSF FullBath\n", 1255 | "0 1710 8450 856 2\n", 1256 | "1 1262 9600 1262 2\n", 1257 | "2 1786 11250 920 2\n", 1258 | "3 1717 9550 756 1\n", 1259 | "4 2198 14260 1145 2" 1260 | ] 1261 | }, 1262 | "execution_count": 13, 1263 | "metadata": {}, 1264 | "output_type": "execute_result" 1265 | } 1266 | ], 1267 | "source": [ 1268 | "X.head()" 1269 | ] 1270 | }, 1271 | { 1272 | "cell_type": "markdown", 1273 | "id": "89669055", 1274 | "metadata": { 1275 | "papermill": { 1276 | "duration": 0.010483, 1277 | "end_time": "2022-06-27T14:33:21.952393", 1278 | "exception": false, 1279 | "start_time": "2022-06-27T14:33:21.941910", 1280 | "status": "completed" 1281 | }, 1282 | "tags": [] 1283 | }, 1284 | "source": [ 1285 | "## Scale the Data\n", 1286 | "\n", 1287 | "Scaling makes little difference for plain linear regression, but the scaler is applied below (it is not commented out)." 1288 | ] 1289 | }, 1290 | { 1291 | "cell_type": "code", 1292 | "execution_count": 14, 1293 | "id": "6b64725f", 1294 | "metadata": { 1295 | "execution": { 1296 | "iopub.execute_input": "2022-06-27T14:33:21.975274Z", 1297 | "iopub.status.busy": "2022-06-27T14:33:21.974658Z", 1298 | "iopub.status.idle": "2022-06-27T14:33:21.987274Z", 1299 | "shell.execute_reply": "2022-06-27T14:33:21.986191Z" 1300 | }, 1301 | "papermill": { 1302 | "duration": 0.027457, 1303 | "end_time": "2022-06-27T14:33:21.990138", 1304 | "exception": false, 1305 | "start_time": "2022-06-27T14:33:21.962681", 1306 | "status": "completed" 1307 | }, 1308 | "tags": [] 1309 | }, 1310 | "outputs": [], 1311 | "source": [ 1312 | "from sklearn.preprocessing import StandardScaler, RobustScaler\n", 1313 | "\n", 1314 | "scaler = StandardScaler()\n", 1315 | "\n", 1316 | "X = scaler.fit(X).transform(X)\n", 1317 | "X_test = scaler.transform(X_test)" 1318 | ] 1319 | }, 1320 | { 1321 | "cell_type": "code", 1322 | "execution_count": 15, 1323 | "id": "f1449c33", 1324 | "metadata": { 1325 | "execution": { 1326 | "iopub.execute_input": 
"2022-06-27T14:33:22.012237Z", 1327 | "iopub.status.busy": "2022-06-27T14:33:22.011829Z", 1328 | "iopub.status.idle": "2022-06-27T14:33:22.018681Z", 1329 | "shell.execute_reply": "2022-06-27T14:33:22.017754Z" 1330 | }, 1331 | "papermill": { 1332 | "duration": 0.020573, 1333 | "end_time": "2022-06-27T14:33:22.020947", 1334 | "exception": false, 1335 | "start_time": "2022-06-27T14:33:22.000374", 1336 | "status": "completed" 1337 | }, 1338 | "tags": [] 1339 | }, 1340 | "outputs": [ 1341 | { 1342 | "data": { 1343 | "text/plain": [ 1344 | "array([[ 0.37033344, -0.20714171, -0.45930254, 0.78974052],\n", 1345 | " [-0.48251191, -0.09188637, 0.46646492, 0.78974052],\n", 1346 | " [ 0.51501256, 0.07347998, -0.31336875, 0.78974052],\n", 1347 | " [ 0.38365915, -0.09689747, -0.68732408, -1.02604084],\n", 1348 | " [ 1.2993257 , 0.37514829, 0.19967971, 0.78974052]])" 1349 | ] 1350 | }, 1351 | "execution_count": 15, 1352 | "metadata": {}, 1353 | "output_type": "execute_result" 1354 | } 1355 | ], 1356 | "source": [ 1357 | "X[:5]" 1358 | ] 1359 | }, 1360 | { 1361 | "cell_type": "markdown", 1362 | "id": "b80150bc", 1363 | "metadata": { 1364 | "papermill": { 1365 | "duration": 0.01002, 1366 | "end_time": "2022-06-27T14:33:22.041670", 1367 | "exception": false, 1368 | "start_time": "2022-06-27T14:33:22.031650", 1369 | "status": "completed" 1370 | }, 1371 | "tags": [] 1372 | }, 1373 | "source": [ 1374 | "

Train Model with Train/Test Split

\n", 1375 | "
\n", 1376 | "\n", 1377 | "We split the training data so we can evaluate how well each model performs. We are saving 20% of the training data to validate the model(s)." 1378 | ] 1379 | }, 1380 | { 1381 | "cell_type": "code", 1382 | "execution_count": 16, 1383 | "id": "2b604476", 1384 | "metadata": { 1385 | "execution": { 1386 | "iopub.execute_input": "2022-06-27T14:33:22.064910Z", 1387 | "iopub.status.busy": "2022-06-27T14:33:22.064295Z", 1388 | "iopub.status.idle": "2022-06-27T14:33:22.088034Z", 1389 | "shell.execute_reply": "2022-06-27T14:33:22.087197Z" 1390 | }, 1391 | "papermill": { 1392 | "duration": 0.037943, 1393 | "end_time": "2022-06-27T14:33:22.090320", 1394 | "exception": false, 1395 | "start_time": "2022-06-27T14:33:22.052377", 1396 | "status": "completed" 1397 | }, 1398 | "tags": [] 1399 | }, 1400 | "outputs": [ 1401 | { 1402 | "data": { 1403 | "text/plain": [ 1404 | "((1168, 4), (1168,), (292, 4), (292,))" 1405 | ] 1406 | }, 1407 | "execution_count": 16, 1408 | "metadata": {}, 1409 | "output_type": "execute_result" 1410 | } 1411 | ], 1412 | "source": [ 1413 | "from sklearn.model_selection import train_test_split\n", 1414 | "\n", 1415 | "X_train, X_valid, y_train, y_valid = train_test_split(\n", 1416 | " X,\n", 1417 | " y,\n", 1418 | " test_size=0.2, # Save 20% for validation\n", 1419 | " random_state=42, # Make the split deterministic\n", 1420 | ")\n", 1421 | "X_train.shape, y_train.shape, X_valid.shape, y_valid.shape" 1422 | ] 1423 | }, 1424 | { 1425 | "cell_type": "markdown", 1426 | "id": "660c8b65", 1427 | "metadata": { 1428 | "papermill": { 1429 | "duration": 0.010528, 1430 | "end_time": "2022-06-27T14:33:22.111692", 1431 | "exception": false, 1432 | "start_time": "2022-06-27T14:33:22.101164", 1433 | "status": "completed" 1434 | }, 1435 | "tags": [] 1436 | }, 1437 | "source": [ 1438 | "# Create a Model" 1439 | ] 1440 | }, 1441 | { 1442 | "cell_type": "code", 1443 | "execution_count": 17, 1444 | "id": "8eaee441", 1445 | "metadata": { 1446 | 
"execution": { 1447 | "iopub.execute_input": "2022-06-27T14:33:22.137956Z", 1448 | "iopub.status.busy": "2022-06-27T14:33:22.137032Z", 1449 | "iopub.status.idle": "2022-06-27T14:33:22.235337Z", 1450 | "shell.execute_reply": "2022-06-27T14:33:22.233806Z" 1451 | }, 1452 | "papermill": { 1453 | "duration": 0.115007, 1454 | "end_time": "2022-06-27T14:33:22.238276", 1455 | "exception": false, 1456 | "start_time": "2022-06-27T14:33:22.123269", 1457 | "status": "completed" 1458 | }, 1459 | "tags": [] 1460 | }, 1461 | "outputs": [], 1462 | "source": [ 1463 | "from sklearn.linear_model import LinearRegression, Lasso, Ridge\n", 1464 | " \n", 1465 | "def run_regression_model(model):\n", 1466 | "# model = Lasso(alpha=0.5)\n", 1467 | " poly_features = PolynomialFeatures(degree = 3, include_bias=False)\n", 1468 | " X_poly = poly_features.fit_transform(X_train)\n", 1469 | "\n", 1470 | " model.fit(X_poly,y_train)\n", 1471 | "\n", 1472 | " valid_preds = model.predict(poly_features.transform(X_valid))\n", 1473 | " test_preds = model.predict(poly_features.transform(X_test))\n", 1474 | "\n", 1475 | " return valid_preds, test_preds" 1476 | ] 1477 | }, 1478 | { 1479 | "cell_type": "code", 1480 | "execution_count": 18, 1481 | "id": "4911d2ef", 1482 | "metadata": { 1483 | "execution": { 1484 | "iopub.execute_input": "2022-06-27T14:33:22.261958Z", 1485 | "iopub.status.busy": "2022-06-27T14:33:22.261137Z", 1486 | "iopub.status.idle": "2022-06-27T14:33:22.300237Z", 1487 | "shell.execute_reply": "2022-06-27T14:33:22.298743Z" 1488 | }, 1489 | "papermill": { 1490 | "duration": 0.054965, 1491 | "end_time": "2022-06-27T14:33:22.304165", 1492 | "exception": false, 1493 | "start_time": "2022-06-27T14:33:22.249200", 1494 | "status": "completed" 1495 | }, 1496 | "tags": [] 1497 | }, 1498 | "outputs": [ 1499 | { 1500 | "data": { 1501 | "text/plain": [ 1502 | "LinearRegression()" 1503 | ] 1504 | }, 1505 | "execution_count": 18, 1506 | "metadata": {}, 1507 | "output_type": "execute_result" 1508 | } 1509 
| ], 1510 | "source": [ 1511 | "from sklearn.preprocessing import PolynomialFeatures\n", 1512 | "\n", 1513 | "poly_features = PolynomialFeatures(degree = 2, include_bias=False)\n", 1514 | "X_poly = poly_features.fit_transform(X_train)\n", 1515 | "model = LinearRegression()\n", 1516 | "model.fit(X_poly, y_train)" 1517 | ] 1518 | }, 1519 | { 1520 | "cell_type": "code", 1521 | "execution_count": 19, 1522 | "id": "ed9930c8", 1523 | "metadata": { 1524 | "execution": { 1525 | "iopub.execute_input": "2022-06-27T14:33:22.348334Z", 1526 | "iopub.status.busy": "2022-06-27T14:33:22.347703Z", 1527 | "iopub.status.idle": "2022-06-27T14:33:22.359369Z", 1528 | "shell.execute_reply": "2022-06-27T14:33:22.357691Z" 1529 | }, 1530 | "papermill": { 1531 | "duration": 0.038677, 1532 | "end_time": "2022-06-27T14:33:22.364129", 1533 | "exception": false, 1534 | "start_time": "2022-06-27T14:33:22.325452", 1535 | "status": "completed" 1536 | }, 1537 | "tags": [] 1538 | }, 1539 | "outputs": [ 1540 | { 1541 | "name": "stdout", 1542 | "output_type": "stream", 1543 | "text": [ 1544 | "MAE: 31493.3462\n", 1545 | "RMSE: 2296540215.3963\n" 1546 | ] 1547 | } 1548 | ], 1549 | "source": [ 1550 | "valid_preds = model.predict(poly_features.transform(X_valid))\n", 1551 | "show_scores(y_valid, valid_preds)\n" 1552 | ] 1553 | }, 1554 | { 1555 | "cell_type": "code", 1556 | "execution_count": 20, 1557 | "id": "5a1d1f3b", 1558 | "metadata": { 1559 | "execution": { 1560 | "iopub.execute_input": "2022-06-27T14:33:22.408160Z", 1561 | "iopub.status.busy": "2022-06-27T14:33:22.407775Z", 1562 | "iopub.status.idle": "2022-06-27T14:33:22.447234Z", 1563 | "shell.execute_reply": "2022-06-27T14:33:22.445287Z" 1564 | }, 1565 | "papermill": { 1566 | "duration": 0.06631, 1567 | "end_time": "2022-06-27T14:33:22.452552", 1568 | "exception": false, 1569 | "start_time": "2022-06-27T14:33:22.386242", 1570 | "status": "completed" 1571 | }, 1572 | "tags": [] 1573 | }, 1574 | "outputs": [ 1575 | { 1576 | "data": { 1577 | 
"text/html": [ 1578 | "
\n", 1579 | "\n", 1592 | "\n", 1593 | " \n", 1594 | " \n", 1595 | " \n", 1596 | " \n", 1597 | " \n", 1598 | " \n", 1599 | " \n", 1600 | " \n", 1601 | " \n", 1602 | " \n", 1603 | " \n", 1604 | " \n", 1605 | " \n", 1606 | " \n", 1607 | " \n", 1608 | " \n", 1609 | " \n", 1610 | " \n", 1611 | " \n", 1612 | " \n", 1613 | " \n", 1614 | " \n", 1615 | " \n", 1616 | " \n", 1617 | " \n", 1618 | " \n", 1619 | " \n", 1620 | " \n", 1621 | " \n", 1622 | " \n", 1623 | " \n", 1624 | " \n", 1625 | " \n", 1626 | " \n", 1627 | "
IdSalePrice
01461122888.699954
11462181942.783849
21463188807.493137
31464184008.030413
41465183214.790072
\n", 1628 | "
" 1629 | ], 1630 | "text/plain": [ 1631 | " Id SalePrice\n", 1632 | "0 1461 122888.699954\n", 1633 | "1 1462 181942.783849\n", 1634 | "2 1463 188807.493137\n", 1635 | "3 1464 184008.030413\n", 1636 | "4 1465 183214.790072" 1637 | ] 1638 | }, 1639 | "execution_count": 20, 1640 | "metadata": {}, 1641 | "output_type": "execute_result" 1642 | } 1643 | ], 1644 | "source": [ 1645 | "test_preds = model.predict(poly_features.transform(X_test))\n", 1646 | "create_submission(\"poly2\", TARGET, test_preds)" 1647 | ] 1648 | }, 1649 | { 1650 | "cell_type": "markdown", 1651 | "id": "7b323bcf", 1652 | "metadata": { 1653 | "papermill": { 1654 | "duration": 0.02136, 1655 | "end_time": "2022-06-27T14:33:22.499456", 1656 | "exception": false, 1657 | "start_time": "2022-06-27T14:33:22.478096", 1658 | "status": "completed" 1659 | }, 1660 | "tags": [] 1661 | }, 1662 | "source": [ 1663 | "## Fit/Train the model" 1664 | ] 1665 | }, 1666 | { 1667 | "cell_type": "code", 1668 | "execution_count": 21, 1669 | "id": "6a9651e1", 1670 | "metadata": { 1671 | "execution": { 1672 | "iopub.execute_input": "2022-06-27T14:33:22.538986Z", 1673 | "iopub.status.busy": "2022-06-27T14:33:22.538552Z", 1674 | "iopub.status.idle": "2022-06-27T14:33:22.603492Z", 1675 | "shell.execute_reply": "2022-06-27T14:33:22.601835Z" 1676 | }, 1677 | "papermill": { 1678 | "duration": 0.087085, 1679 | "end_time": "2022-06-27T14:33:22.607043", 1680 | "exception": false, 1681 | "start_time": "2022-06-27T14:33:22.519958", 1682 | "status": "completed" 1683 | }, 1684 | "tags": [] 1685 | }, 1686 | "outputs": [ 1687 | { 1688 | "name": "stdout", 1689 | "output_type": "stream", 1690 | "text": [ 1691 | "MAE: 30806.0773\n", 1692 | "RMSE: 2326803199.3127\n" 1693 | ] 1694 | }, 1695 | { 1696 | "data": { 1697 | "text/html": [ 1698 | "
\n", 1699 | "\n", 1712 | "\n", 1713 | " \n", 1714 | " \n", 1715 | " \n", 1716 | " \n", 1717 | " \n", 1718 | " \n", 1719 | " \n", 1720 | " \n", 1721 | " \n", 1722 | " \n", 1723 | " \n", 1724 | " \n", 1725 | " \n", 1726 | " \n", 1727 | " \n", 1728 | " \n", 1729 | " \n", 1730 | " \n", 1731 | " \n", 1732 | " \n", 1733 | " \n", 1734 | " \n", 1735 | " \n", 1736 | " \n", 1737 | " \n", 1738 | " \n", 1739 | " \n", 1740 | " \n", 1741 | " \n", 1742 | " \n", 1743 | " \n", 1744 | " \n", 1745 | " \n", 1746 | " \n", 1747 | "
IdSalePrice
01461113975.794861
11462182646.170136
21463187077.189972
31464182186.048862
41465185989.648806
\n", 1748 | "
" 1749 | ], 1750 | "text/plain": [ 1751 | " Id SalePrice\n", 1752 | "0 1461 113975.794861\n", 1753 | "1 1462 182646.170136\n", 1754 | "2 1463 187077.189972\n", 1755 | "3 1464 182186.048862\n", 1756 | "4 1465 185989.648806" 1757 | ] 1758 | }, 1759 | "execution_count": 21, 1760 | "metadata": {}, 1761 | "output_type": "execute_result" 1762 | } 1763 | ], 1764 | "source": [ 1765 | "valid_preds, test_preds = run_regression_model(LinearRegression())\n", 1766 | "show_scores(y_valid, valid_preds)\n", 1767 | "create_submission(\"\", TARGET, test_preds)" 1768 | ] 1769 | }, 1770 | { 1771 | "cell_type": "markdown", 1772 | "id": "ebd3b0a9", 1773 | "metadata": { 1774 | "papermill": { 1775 | "duration": 0.033999, 1776 | "end_time": "2022-06-27T14:33:22.676561", 1777 | "exception": false, 1778 | "start_time": "2022-06-27T14:33:22.642562", 1779 | "status": "completed" 1780 | }, 1781 | "tags": [] 1782 | }, 1783 | "source": [ 1784 | "\n", 1785 | "- https://stackoverflow.com/questions/20681864/lasso-on-sklearn-does-not-converge\n", 1786 | "- https://stats.stackexchange.com/questions/445831/how-is-tol-used-in-scikit-learns-lasso-and-elasticnet\n" 1787 | ] 1788 | }, 1789 | { 1790 | "cell_type": "code", 1791 | "execution_count": 22, 1792 | "id": "3ef6eb2d", 1793 | "metadata": { 1794 | "execution": { 1795 | "iopub.execute_input": "2022-06-27T14:33:22.702749Z", 1796 | "iopub.status.busy": "2022-06-27T14:33:22.702329Z", 1797 | "iopub.status.idle": "2022-06-27T14:33:23.077128Z", 1798 | "shell.execute_reply": "2022-06-27T14:33:23.074976Z" 1799 | }, 1800 | "papermill": { 1801 | "duration": 0.391008, 1802 | "end_time": "2022-06-27T14:33:23.080682", 1803 | "exception": false, 1804 | "start_time": "2022-06-27T14:33:22.689674", 1805 | "status": "completed" 1806 | }, 1807 | "tags": [] 1808 | }, 1809 | "outputs": [ 1810 | { 1811 | "name": "stdout", 1812 | "output_type": "stream", 1813 | "text": [ 1814 | "MAE: 30691.9468\n", 1815 | "RMSE: 2256165402.5832\n" 1816 | ] 1817 | }, 1818 | { 1819 | "name": 
"stderr", 1820 | "output_type": "stream", 1821 | "text": [ 1822 | "/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_coordinate_descent.py:648: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 9.573e+11, tolerance: 6.967e+06\n", 1823 | " coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive\n" 1824 | ] 1825 | }, 1826 | { 1827 | "data": { 1828 | "text/html": [ 1829 | "
\n", 1830 | "\n", 1843 | "\n", 1844 | " \n", 1845 | " \n", 1846 | " \n", 1847 | " \n", 1848 | " \n", 1849 | " \n", 1850 | " \n", 1851 | " \n", 1852 | " \n", 1853 | " \n", 1854 | " \n", 1855 | " \n", 1856 | " \n", 1857 | " \n", 1858 | " \n", 1859 | " \n", 1860 | " \n", 1861 | " \n", 1862 | " \n", 1863 | " \n", 1864 | " \n", 1865 | " \n", 1866 | " \n", 1867 | " \n", 1868 | " \n", 1869 | " \n", 1870 | " \n", 1871 | " \n", 1872 | " \n", 1873 | " \n", 1874 | " \n", 1875 | " \n", 1876 | " \n", 1877 | " \n", 1878 | "
IdSalePrice
01461114014.327233
11462182808.417126
21463187253.624146
31464182564.341035
41465185412.534119
\n", 1879 | "
" 1880 | ], 1881 | "text/plain": [ 1882 | " Id SalePrice\n", 1883 | "0 1461 114014.327233\n", 1884 | "1 1462 182808.417126\n", 1885 | "2 1463 187253.624146\n", 1886 | "3 1464 182564.341035\n", 1887 | "4 1465 185412.534119" 1888 | ] 1889 | }, 1890 | "execution_count": 22, 1891 | "metadata": {}, 1892 | "output_type": "execute_result" 1893 | } 1894 | ], 1895 | "source": [ 1896 | "valid_preds, test_preds = run_regression_model(Lasso(alpha=1.0, max_iter=10_000, tol= 0.000001))\n", 1897 | "show_scores(y_valid, valid_preds)\n", 1898 | "create_submission(\"lasso\", TARGET, test_preds)" 1899 | ] 1900 | }, 1901 | { 1902 | "cell_type": "code", 1903 | "execution_count": 23, 1904 | "id": "a31b7774", 1905 | "metadata": { 1906 | "execution": { 1907 | "iopub.execute_input": "2022-06-27T14:33:23.155309Z", 1908 | "iopub.status.busy": "2022-06-27T14:33:23.154054Z", 1909 | "iopub.status.idle": "2022-06-27T14:33:23.199986Z", 1910 | "shell.execute_reply": "2022-06-27T14:33:23.197275Z" 1911 | }, 1912 | "papermill": { 1913 | "duration": 0.07136, 1914 | "end_time": "2022-06-27T14:33:23.204056", 1915 | "exception": false, 1916 | "start_time": "2022-06-27T14:33:23.132696", 1917 | "status": "completed" 1918 | }, 1919 | "tags": [] 1920 | }, 1921 | "outputs": [ 1922 | { 1923 | "name": "stdout", 1924 | "output_type": "stream", 1925 | "text": [ 1926 | "MAE: 30798.8145\n", 1927 | "RMSE: 2327135205.2686\n" 1928 | ] 1929 | }, 1930 | { 1931 | "data": { 1932 | "text/html": [ 1933 | "
\n", 1934 | "\n", 1947 | "\n", 1948 | " \n", 1949 | " \n", 1950 | " \n", 1951 | " \n", 1952 | " \n", 1953 | " \n", 1954 | " \n", 1955 | " \n", 1956 | " \n", 1957 | " \n", 1958 | " \n", 1959 | " \n", 1960 | " \n", 1961 | " \n", 1962 | " \n", 1963 | " \n", 1964 | " \n", 1965 | " \n", 1966 | " \n", 1967 | " \n", 1968 | " \n", 1969 | " \n", 1970 | " \n", 1971 | " \n", 1972 | " \n", 1973 | " \n", 1974 | " \n", 1975 | " \n", 1976 | " \n", 1977 | " \n", 1978 | " \n", 1979 | " \n", 1980 | " \n", 1981 | " \n", 1982 | "
IdSalePrice
01461114030.583492
11462182578.537196
21463187091.311323
31464182193.038395
41465185889.742427
\n", 1983 | "
" 1984 | ], 1985 | "text/plain": [ 1986 | " Id SalePrice\n", 1987 | "0 1461 114030.583492\n", 1988 | "1 1462 182578.537196\n", 1989 | "2 1463 187091.311323\n", 1990 | "3 1464 182193.038395\n", 1991 | "4 1465 185889.742427" 1992 | ] 1993 | }, 1994 | "execution_count": 23, 1995 | "metadata": {}, 1996 | "output_type": "execute_result" 1997 | } 1998 | ], 1999 | "source": [ 2000 | "valid_preds, test_preds = run_regression_model(Ridge(alpha=0.5, max_iter=1000, tol= 0.0001))\n", 2001 | "show_scores(y_valid, valid_preds)\n", 2002 | "create_submission(\"ridge\", TARGET, test_preds)" 2003 | ] 2004 | } 2005 | ], 2006 | "metadata": { 2007 | "kernelspec": { 2008 | "display_name": "Python 3", 2009 | "language": "python", 2010 | "name": "python3" 2011 | }, 2012 | "language_info": { 2013 | "codemirror_mode": { 2014 | "name": "ipython", 2015 | "version": 3 2016 | }, 2017 | "file_extension": ".py", 2018 | "mimetype": "text/x-python", 2019 | "name": "python", 2020 | "nbconvert_exporter": "python", 2021 | "pygments_lexer": "ipython3", 2022 | "version": "3.7.12" 2023 | }, 2024 | "papermill": { 2025 | "default_parameters": {}, 2026 | "duration": 14.366576, 2027 | "end_time": "2022-06-27T14:33:23.956582", 2028 | "environment_variables": {}, 2029 | "exception": null, 2030 | "input_path": "__notebook__.ipynb", 2031 | "output_path": "__notebook__.ipynb", 2032 | "parameters": {}, 2033 | "start_time": "2022-06-27T14:33:09.590006", 2034 | "version": "2.3.4" 2035 | } 2036 | }, 2037 | "nbformat": 4, 2038 | "nbformat_minor": 5 2039 | } 2040 | -------------------------------------------------------------------------------- /house-prices-quickstart.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "3742faf5", 6 | "metadata": { 7 | "papermill": { 8 | "duration": 0.006142, 9 | "end_time": "2022-06-23T18:06:05.259277", 10 | "exception": false, 11 | "start_time": "2022-06-23T18:06:05.253135", 12 | 
"status": "completed" 13 | }, 14 | "tags": [] 15 | }, 16 | "source": [ 17 | "

House Prices: Learn Regression

\n", 18 | "
\n", 19 | "\n", 20 | "This is a small tutorial targeted at the complete beginner. It's no substitute for a good book on Machine Learning. In fact, I highly recommend this book: \n", 21 | "\n", 22 | "[Hands on Machine Learning](https://www.oreilly.com/library/view/hands-on-machine-learning/9781492032632/) (HOML)\n", 23 | "\n", 24 | "Chapter 2 of HOML introduces regression with a different House Price dataset.\n", 25 | "\n", 26 | "My main goal here is to get the beginner started on Kaggle, where there's no limit to learning ML. " 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "id": "3b83541d", 32 | "metadata": { 33 | "papermill": { 34 | "duration": 0.005153, 35 | "end_time": "2022-06-23T18:06:05.270624", 36 | "exception": false, 37 | "start_time": "2022-06-23T18:06:05.265471", 38 | "status": "completed" 39 | }, 40 | "tags": [] 41 | }, 42 | "source": [ 43 | "

Import Libraries

\n", 44 | "
\n", 45 | "\n", 46 | "A best practice is to include all libraries here. However, I will put a few imports farther down where they are first used so beginners can learn with an \"as needed\" approach." 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 1, 52 | "id": "dbdcd1f6", 53 | "metadata": { 54 | "execution": { 55 | "iopub.execute_input": "2022-06-23T18:06:05.283217Z", 56 | "iopub.status.busy": "2022-06-23T18:06:05.282584Z", 57 | "iopub.status.idle": "2022-06-23T18:06:05.293400Z", 58 | "shell.execute_reply": "2022-06-23T18:06:05.292161Z" 59 | }, 60 | "papermill": { 61 | "duration": 0.019894, 62 | "end_time": "2022-06-23T18:06:05.295805", 63 | "exception": false, 64 | "start_time": "2022-06-23T18:06:05.275911", 65 | "status": "completed" 66 | }, 67 | "tags": [] 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "import numpy as np # linear algebra\n", 72 | "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", 73 | "\n", 74 | "from pathlib import Path" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "id": "051ce93e", 80 | "metadata": { 81 | "papermill": { 82 | "duration": 0.004826, 83 | "end_time": "2022-06-23T18:06:05.306144", 84 | "exception": false, 85 | "start_time": "2022-06-23T18:06:05.301318", 86 | "status": "completed" 87 | }, 88 | "tags": [] 89 | }, 90 | "source": [ 91 | "

Load Train/Test Data

\n", 92 | "
\n", 93 | "\n", 94 | "- train.csv - Data used to build our machine learning model\n", 95 | "- test.csv - Data used to make predictions with our trained model. Does not contain the target variable\n", 96 | "- sample_submission.csv - A file in the proper format to submit test predictions" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 2, 102 | "id": "98dbf7f7", 103 | "metadata": { 104 | "execution": { 105 | "iopub.execute_input": "2022-06-23T18:06:05.317896Z", 106 | "iopub.status.busy": "2022-06-23T18:06:05.317503Z", 107 | "iopub.status.idle": "2022-06-23T18:06:05.392513Z", 108 | "shell.execute_reply": "2022-06-23T18:06:05.391709Z" 109 | }, 110 | "papermill": { 111 | "duration": 0.083482, 112 | "end_time": "2022-06-23T18:06:05.394585", 113 | "exception": false, 114 | "start_time": "2022-06-23T18:06:05.311103", 115 | "status": "completed" 116 | }, 117 | "tags": [] 118 | }, 119 | "outputs": [ 120 | { 121 | "name": "stdout", 122 | "output_type": "stream", 123 | "text": [ 124 | "train data: Rows=1460, Columns=81\n", 125 | "test data : Rows=1459, Columns=80\n" 126 | ] 127 | } 128 | ], 129 | "source": [ 130 | "data_dir = Path(\"../input/house-prices-advanced-regression-techniques\")\n", 131 | "\n", 132 | "train = pd.read_csv(data_dir / \"train.csv\")\n", 133 | "test = pd.read_csv(data_dir / \"test.csv\")\n", 134 | "sample_submission = pd.read_csv(data_dir / \"sample_submission.csv\")\n", 135 | "\n", 136 | "print(f\"train data: Rows={train.shape[0]}, Columns={train.shape[1]}\")\n", 137 | "print(f\"test data : Rows={test.shape[0]}, Columns={test.shape[1]}\")" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 3, 143 | "id": "56fe0550", 144 | "metadata": { 145 | "execution": { 146 | "iopub.execute_input": "2022-06-23T18:06:05.407107Z", 147 | "iopub.status.busy": "2022-06-23T18:06:05.406355Z", 148 | "iopub.status.idle": "2022-06-23T18:06:05.460929Z", 149 | "shell.execute_reply": "2022-06-23T18:06:05.459575Z" 150 | }, 151 | 
"papermill": { 152 | "duration": 0.063537, 153 | "end_time": "2022-06-23T18:06:05.463327", 154 | "exception": false, 155 | "start_time": "2022-06-23T18:06:05.399790", 156 | "status": "completed" 157 | }, 158 | "tags": [] 159 | }, 160 | "outputs": [ 161 | { 162 | "data": { 163 | "text/html": [ 164 | "
\n", 165 | "\n", 178 | "\n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " 
\n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | 
" \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 
| " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | "
IdMSSubClassMSZoningLotFrontageLotAreaStreetAlleyLotShapeLandContourUtilitiesLotConfigLandSlopeNeighborhoodCondition1Condition2BldgTypeHouseStyleOverallQualOverallCondYearBuiltYearRemodAddRoofStyleRoofMatlExterior1stExterior2ndMasVnrTypeMasVnrAreaExterQualExterCondFoundationBsmtQualBsmtCondBsmtExposureBsmtFinType1BsmtFinSF1BsmtFinType2BsmtFinSF2BsmtUnfSFTotalBsmtSFHeatingHeatingQCCentralAirElectrical1stFlrSF2ndFlrSFLowQualFinSFGrLivAreaBsmtFullBathBsmtHalfBathFullBathHalfBathBedroomAbvGrKitchenAbvGrKitchenQualTotRmsAbvGrdFunctionalFireplacesFireplaceQuGarageTypeGarageYrBltGarageFinishGarageCarsGarageAreaGarageQualGarageCondPavedDriveWoodDeckSFOpenPorchSFEnclosedPorch3SsnPorchScreenPorchPoolAreaPoolQCFenceMiscFeatureMiscValMoSoldYrSoldSaleTypeSaleConditionSalePrice
0160RL65.08450PaveNaNRegLvlAllPubInsideGtlCollgCrNormNorm1Fam2Story7520032003GableCompShgVinylSdVinylSdBrkFace196.0GdTAPConcGdTANoGLQ706Unf0150856GasAExYSBrkr85685401710102131Gd8Typ0NaNAttchd2003.0RFn2548TATAY0610000NaNNaNNaN022008WDNormal208500
1220RL80.09600PaveNaNRegLvlAllPubFR2GtlVeenkerFeedrNorm1Fam1Story6819761976GableCompShgMetalSdMetalSdNone0.0TATACBlockGdTAGdALQ978Unf02841262GasAExYSBrkr1262001262012031TA6Typ1TAAttchd1976.0RFn2460TATAY29800000NaNNaNNaN052007WDNormal181500
2360RL68.011250PaveNaNIR1LvlAllPubInsideGtlCollgCrNormNorm1Fam2Story7520012002GableCompShgVinylSdVinylSdBrkFace162.0GdTAPConcGdTAMnGLQ486Unf0434920GasAExYSBrkr92086601786102131Gd6Typ1TAAttchd2001.0RFn2608TATAY0420000NaNNaNNaN092008WDNormal223500
3470RL60.09550PaveNaNIR1LvlAllPubCornerGtlCrawforNormNorm1Fam2Story7519151970GableCompShgWd SdngWd ShngNone0.0TATABrkTilTAGdNoALQ216Unf0540756GasAGdYSBrkr96175601717101031Gd7Typ1GdDetchd1998.0Unf3642TATAY035272000NaNNaNNaN022006WDAbnorml140000
4560RL84.014260PaveNaNIR1LvlAllPubFR2GtlNoRidgeNormNorm1Fam2Story8520002000GableCompShgVinylSdVinylSdBrkFace350.0GdTAPConcGdTAAvGLQ655Unf04901145GasAExYSBrkr1145105302198102141Gd9Typ1TAAttchd2000.0RFn3836TATAY192840000NaNNaNNaN0122008WDNormal250000
\n", 688 | "
" 689 | ], 690 | "text/plain": [ 691 | " Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n", 692 | "0 1 60 RL 65.0 8450 Pave NaN Reg \n", 693 | "1 2 20 RL 80.0 9600 Pave NaN Reg \n", 694 | "2 3 60 RL 68.0 11250 Pave NaN IR1 \n", 695 | "3 4 70 RL 60.0 9550 Pave NaN IR1 \n", 696 | "4 5 60 RL 84.0 14260 Pave NaN IR1 \n", 697 | "\n", 698 | " LandContour Utilities LotConfig LandSlope Neighborhood Condition1 \\\n", 699 | "0 Lvl AllPub Inside Gtl CollgCr Norm \n", 700 | "1 Lvl AllPub FR2 Gtl Veenker Feedr \n", 701 | "2 Lvl AllPub Inside Gtl CollgCr Norm \n", 702 | "3 Lvl AllPub Corner Gtl Crawfor Norm \n", 703 | "4 Lvl AllPub FR2 Gtl NoRidge Norm \n", 704 | "\n", 705 | " Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt \\\n", 706 | "0 Norm 1Fam 2Story 7 5 2003 \n", 707 | "1 Norm 1Fam 1Story 6 8 1976 \n", 708 | "2 Norm 1Fam 2Story 7 5 2001 \n", 709 | "3 Norm 1Fam 2Story 7 5 1915 \n", 710 | "4 Norm 1Fam 2Story 8 5 2000 \n", 711 | "\n", 712 | " YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType \\\n", 713 | "0 2003 Gable CompShg VinylSd VinylSd BrkFace \n", 714 | "1 1976 Gable CompShg MetalSd MetalSd None \n", 715 | "2 2002 Gable CompShg VinylSd VinylSd BrkFace \n", 716 | "3 1970 Gable CompShg Wd Sdng Wd Shng None \n", 717 | "4 2000 Gable CompShg VinylSd VinylSd BrkFace \n", 718 | "\n", 719 | " MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure \\\n", 720 | "0 196.0 Gd TA PConc Gd TA No \n", 721 | "1 0.0 TA TA CBlock Gd TA Gd \n", 722 | "2 162.0 Gd TA PConc Gd TA Mn \n", 723 | "3 0.0 TA TA BrkTil TA Gd No \n", 724 | "4 350.0 Gd TA PConc Gd TA Av \n", 725 | "\n", 726 | " BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF \\\n", 727 | "0 GLQ 706 Unf 0 150 856 \n", 728 | "1 ALQ 978 Unf 0 284 1262 \n", 729 | "2 GLQ 486 Unf 0 434 920 \n", 730 | "3 ALQ 216 Unf 0 540 756 \n", 731 | "4 GLQ 655 Unf 0 490 1145 \n", 732 | "\n", 733 | " Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF 
LowQualFinSF \\\n", 734 | "0 GasA Ex Y SBrkr 856 854 0 \n", 735 | "1 GasA Ex Y SBrkr 1262 0 0 \n", 736 | "2 GasA Ex Y SBrkr 920 866 0 \n", 737 | "3 GasA Gd Y SBrkr 961 756 0 \n", 738 | "4 GasA Ex Y SBrkr 1145 1053 0 \n", 739 | "\n", 740 | " GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr \\\n", 741 | "0 1710 1 0 2 1 3 \n", 742 | "1 1262 0 1 2 0 3 \n", 743 | "2 1786 1 0 2 1 3 \n", 744 | "3 1717 1 0 1 0 3 \n", 745 | "4 2198 1 0 2 1 4 \n", 746 | "\n", 747 | " KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu \\\n", 748 | "0 1 Gd 8 Typ 0 NaN \n", 749 | "1 1 TA 6 Typ 1 TA \n", 750 | "2 1 Gd 6 Typ 1 TA \n", 751 | "3 1 Gd 7 Typ 1 Gd \n", 752 | "4 1 Gd 9 Typ 1 TA \n", 753 | "\n", 754 | " GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual \\\n", 755 | "0 Attchd 2003.0 RFn 2 548 TA \n", 756 | "1 Attchd 1976.0 RFn 2 460 TA \n", 757 | "2 Attchd 2001.0 RFn 2 608 TA \n", 758 | "3 Detchd 1998.0 Unf 3 642 TA \n", 759 | "4 Attchd 2000.0 RFn 3 836 TA \n", 760 | "\n", 761 | " GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch \\\n", 762 | "0 TA Y 0 61 0 0 \n", 763 | "1 TA Y 298 0 0 0 \n", 764 | "2 TA Y 0 42 0 0 \n", 765 | "3 TA Y 0 35 272 0 \n", 766 | "4 TA Y 192 84 0 0 \n", 767 | "\n", 768 | " ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold \\\n", 769 | "0 0 0 NaN NaN NaN 0 2 2008 \n", 770 | "1 0 0 NaN NaN NaN 0 5 2007 \n", 771 | "2 0 0 NaN NaN NaN 0 9 2008 \n", 772 | "3 0 0 NaN NaN NaN 0 2 2006 \n", 773 | "4 0 0 NaN NaN NaN 0 12 2008 \n", 774 | "\n", 775 | " SaleType SaleCondition SalePrice \n", 776 | "0 WD Normal 208500 \n", 777 | "1 WD Normal 181500 \n", 778 | "2 WD Normal 223500 \n", 779 | "3 WD Abnorml 140000 \n", 780 | "4 WD Normal 250000 " 781 | ] 782 | }, 783 | "execution_count": 3, 784 | "metadata": {}, 785 | "output_type": "execute_result" 786 | } 787 | ], 788 | "source": [ 789 | "pd.options.display.max_columns = 100 # Want to view all the columns\n", 790 | "\n", 791 | 
"train.head()" 792 | ] 793 | }, 794 | { 795 | "cell_type": "markdown", 796 | "id": "903f70c1", 797 | "metadata": { 798 | "papermill": { 799 | "duration": 0.005975, 800 | "end_time": "2022-06-23T18:06:05.475306", 801 | "exception": false, 802 | "start_time": "2022-06-23T18:06:05.469331", 803 | "status": "completed" 804 | }, 805 | "tags": [] 806 | }, 807 | "source": [ 808 | "In supervised learning problems, we have a label or target." 809 | ] 810 | }, 811 | { 812 | "cell_type": "code", 813 | "execution_count": 4, 814 | "id": "ba4fa5bf", 815 | "metadata": { 816 | "execution": { 817 | "iopub.execute_input": "2022-06-23T18:06:05.489256Z", 818 | "iopub.status.busy": "2022-06-23T18:06:05.488867Z", 819 | "iopub.status.idle": "2022-06-23T18:06:05.494142Z", 820 | "shell.execute_reply": "2022-06-23T18:06:05.493129Z" 821 | }, 822 | "papermill": { 823 | "duration": 0.014935, 824 | "end_time": "2022-06-23T18:06:05.496406", 825 | "exception": false, 826 | "start_time": "2022-06-23T18:06:05.481471", 827 | "status": "completed" 828 | }, 829 | "tags": [] 830 | }, 831 | "outputs": [], 832 | "source": [ 833 | "TARGET = \"SalePrice\"" 834 | ] 835 | }, 836 | { 837 | "cell_type": "markdown", 838 | "id": "d57d86c2", 839 | "metadata": { 840 | "papermill": { 841 | "duration": 0.005805, 842 | "end_time": "2022-06-23T18:06:05.508243", 843 | "exception": false, 844 | "start_time": "2022-06-23T18:06:05.502438", 845 | "status": "completed" 846 | }, 847 | "tags": [] 848 | }, 849 | "source": [ 850 | "There are 79 features but to keep it simple we are only going to start with one." 
851 | ] 852 | }, 853 | { 854 | "cell_type": "code", 855 | "execution_count": 5, 856 | "id": "922235c1", 857 | "metadata": { 858 | "execution": { 859 | "iopub.execute_input": "2022-06-23T18:06:05.522291Z", 860 | "iopub.status.busy": "2022-06-23T18:06:05.521319Z", 861 | "iopub.status.idle": "2022-06-23T18:06:05.525828Z", 862 | "shell.execute_reply": "2022-06-23T18:06:05.524922Z" 863 | }, 864 | "papermill": { 865 | "duration": 0.013509, 866 | "end_time": "2022-06-23T18:06:05.527790", 867 | "exception": false, 868 | "start_time": "2022-06-23T18:06:05.514281", 869 | "status": "completed" 870 | }, 871 | "tags": [] 872 | }, 873 | "outputs": [], 874 | "source": [ 875 | "FEATURES = [\"GrLivArea\"] # A not so random feature to start with" 876 | ] 877 | }, 878 | { 879 | "cell_type": "code", 880 | "execution_count": 6, 881 | "id": "9d034fee", 882 | "metadata": { 883 | "execution": { 884 | "iopub.execute_input": "2022-06-23T18:06:05.541525Z", 885 | "iopub.status.busy": "2022-06-23T18:06:05.540800Z", 886 | "iopub.status.idle": "2022-06-23T18:06:05.556563Z", 887 | "shell.execute_reply": "2022-06-23T18:06:05.555705Z" 888 | }, 889 | "papermill": { 890 | "duration": 0.025199, 891 | "end_time": "2022-06-23T18:06:05.558906", 892 | "exception": false, 893 | "start_time": "2022-06-23T18:06:05.533707", 894 | "status": "completed" 895 | }, 896 | "tags": [] 897 | }, 898 | "outputs": [], 899 | "source": [ 900 | "y = train[TARGET]\n", 901 | "X = train[FEATURES].copy()\n", 902 | "\n", 903 | "X_test = test[FEATURES].copy()" 904 | ] 905 | }, 906 | { 907 | "cell_type": "code", 908 | "execution_count": 7, 909 | "id": "cec30148", 910 | "metadata": { 911 | "execution": { 912 | "iopub.execute_input": "2022-06-23T18:06:05.573796Z", 913 | "iopub.status.busy": "2022-06-23T18:06:05.573121Z", 914 | "iopub.status.idle": "2022-06-23T18:06:05.581271Z", 915 | "shell.execute_reply": "2022-06-23T18:06:05.580425Z" 916 | }, 917 | "papermill": { 918 | "duration": 0.017747, 919 | "end_time": 
"2022-06-23T18:06:05.583266", 920 | "exception": false, 921 | "start_time": "2022-06-23T18:06:05.565519", 922 | "status": "completed" 923 | }, 924 | "tags": [] 925 | }, 926 | "outputs": [ 927 | { 928 | "data": { 929 | "text/html": [ 930 | "
\n", 931 | "\n", 944 | "\n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | "
GrLivArea
01710
11262
21786
31717
42198
\n", 974 | "
" 975 | ], 976 | "text/plain": [ 977 | " GrLivArea\n", 978 | "0 1710\n", 979 | "1 1262\n", 980 | "2 1786\n", 981 | "3 1717\n", 982 | "4 2198" 983 | ] 984 | }, 985 | "execution_count": 7, 986 | "metadata": {}, 987 | "output_type": "execute_result" 988 | } 989 | ], 990 | "source": [ 991 | "X.head()" 992 | ] 993 | }, 994 | { 995 | "cell_type": "markdown", 996 | "id": "9272ad38", 997 | "metadata": { 998 | "papermill": { 999 | "duration": 0.005954, 1000 | "end_time": "2022-06-23T18:06:05.595231", 1001 | "exception": false, 1002 | "start_time": "2022-06-23T18:06:05.589277", 1003 | "status": "completed" 1004 | }, 1005 | "tags": [] 1006 | }, 1007 | "source": [ 1008 | "

Train Model with Train/Test Split

\n", 1009 | "
\n", 1010 | "\n", 1011 | "We split the training data so we can evaluate how well each model performs. We are saving 20% of the training data to validate the model(s)." 1012 | ] 1013 | }, 1014 | { 1015 | "cell_type": "code", 1016 | "execution_count": 8, 1017 | "id": "6568727f", 1018 | "metadata": { 1019 | "execution": { 1020 | "iopub.execute_input": "2022-06-23T18:06:05.610570Z", 1021 | "iopub.status.busy": "2022-06-23T18:06:05.609926Z", 1022 | "iopub.status.idle": "2022-06-23T18:06:06.886284Z", 1023 | "shell.execute_reply": "2022-06-23T18:06:06.884967Z" 1024 | }, 1025 | "papermill": { 1026 | "duration": 1.28676, 1027 | "end_time": "2022-06-23T18:06:06.888756", 1028 | "exception": false, 1029 | "start_time": "2022-06-23T18:06:05.601996", 1030 | "status": "completed" 1031 | }, 1032 | "tags": [] 1033 | }, 1034 | "outputs": [ 1035 | { 1036 | "data": { 1037 | "text/plain": [ 1038 | "((1168, 1), (1168,), (292, 1), (292,))" 1039 | ] 1040 | }, 1041 | "execution_count": 8, 1042 | "metadata": {}, 1043 | "output_type": "execute_result" 1044 | } 1045 | ], 1046 | "source": [ 1047 | "from sklearn.model_selection import train_test_split\n", 1048 | "\n", 1049 | "X_train, X_valid, y_train, y_valid = train_test_split(\n", 1050 | " X,\n", 1051 | " y,\n", 1052 | " test_size=0.2, # Save 20% for validation\n", 1053 | " random_state=42, # Make the split deterministic\n", 1054 | ")\n", 1055 | "X_train.shape, y_train.shape, X_valid.shape, y_valid.shape" 1056 | ] 1057 | }, 1058 | { 1059 | "cell_type": "markdown", 1060 | "id": "8003dff2", 1061 | "metadata": { 1062 | "papermill": { 1063 | "duration": 0.006326, 1064 | "end_time": "2022-06-23T18:06:06.901575", 1065 | "exception": false, 1066 | "start_time": "2022-06-23T18:06:06.895249", 1067 | "status": "completed" 1068 | }, 1069 | "tags": [] 1070 | }, 1071 | "source": [ 1072 | "# Create a Model" 1073 | ] 1074 | }, 1075 | { 1076 | "cell_type": "code", 1077 | "execution_count": 9, 1078 | "id": "fa38802f", 1079 | "metadata": { 1080 | "execution": 
{ 1081 | "iopub.execute_input": "2022-06-23T18:06:06.916670Z", 1082 | "iopub.status.busy": "2022-06-23T18:06:06.916088Z", 1083 | "iopub.status.idle": "2022-06-23T18:06:07.010840Z", 1084 | "shell.execute_reply": "2022-06-23T18:06:07.010006Z" 1085 | }, 1086 | "papermill": { 1087 | "duration": 0.104619, 1088 | "end_time": "2022-06-23T18:06:07.013059", 1089 | "exception": false, 1090 | "start_time": "2022-06-23T18:06:06.908440", 1091 | "status": "completed" 1092 | }, 1093 | "tags": [] 1094 | }, 1095 | "outputs": [], 1096 | "source": [ 1097 | "from sklearn.linear_model import LinearRegression, Lasso, Ridge\n", 1098 | "\n", 1099 | "model = LinearRegression()" 1100 | ] 1101 | }, 1102 | { 1103 | "cell_type": "markdown", 1104 | "id": "1503dd70", 1105 | "metadata": { 1106 | "papermill": { 1107 | "duration": 0.006069, 1108 | "end_time": "2022-06-23T18:06:07.025198", 1109 | "exception": false, 1110 | "start_time": "2022-06-23T18:06:07.019129", 1111 | "status": "completed" 1112 | }, 1113 | "tags": [] 1114 | }, 1115 | "source": [ 1116 | "## Fit/Train the model" 1117 | ] 1118 | }, 1119 | { 1120 | "cell_type": "code", 1121 | "execution_count": 10, 1122 | "id": "4cea7e0b", 1123 | "metadata": { 1124 | "execution": { 1125 | "iopub.execute_input": "2022-06-23T18:06:07.040205Z", 1126 | "iopub.status.busy": "2022-06-23T18:06:07.039577Z", 1127 | "iopub.status.idle": "2022-06-23T18:06:07.064471Z", 1128 | "shell.execute_reply": "2022-06-23T18:06:07.063264Z" 1129 | }, 1130 | "papermill": { 1131 | "duration": 0.035449, 1132 | "end_time": "2022-06-23T18:06:07.067055", 1133 | "exception": false, 1134 | "start_time": "2022-06-23T18:06:07.031606", 1135 | "status": "completed" 1136 | }, 1137 | "tags": [] 1138 | }, 1139 | "outputs": [ 1140 | { 1141 | "data": { 1142 | "text/plain": [ 1143 | "LinearRegression()" 1144 | ] 1145 | }, 1146 | "execution_count": 10, 1147 | "metadata": {}, 1148 | "output_type": "execute_result" 1149 | } 1150 | ], 1151 | "source": [ 1152 | "model.fit(X_train,y_train)" 1153 
| ] 1154 | }, 1155 | { 1156 | "cell_type": "markdown", 1157 | "id": "35aac55d", 1158 | "metadata": { 1159 | "papermill": { 1160 | "duration": 0.00588, 1161 | "end_time": "2022-06-23T18:06:07.079228", 1162 | "exception": false, 1163 | "start_time": "2022-06-23T18:06:07.073348", 1164 | "status": "completed" 1165 | }, 1166 | "tags": [] 1167 | }, 1168 | "source": [ 1169 | "## Use the Trained Model to Predict the Validation Data" 1170 | ] 1171 | }, 1172 | { 1173 | "cell_type": "code", 1174 | "execution_count": 11, 1175 | "id": "85a98c8c", 1176 | "metadata": { 1177 | "execution": { 1178 | "iopub.execute_input": "2022-06-23T18:06:07.093402Z", 1179 | "iopub.status.busy": "2022-06-23T18:06:07.093009Z", 1180 | "iopub.status.idle": "2022-06-23T18:06:07.100962Z", 1181 | "shell.execute_reply": "2022-06-23T18:06:07.099706Z" 1182 | }, 1183 | "papermill": { 1184 | "duration": 0.017916, 1185 | "end_time": "2022-06-23T18:06:07.103338", 1186 | "exception": false, 1187 | "start_time": "2022-06-23T18:06:07.085422", 1188 | "status": "completed" 1189 | }, 1190 | "tags": [] 1191 | }, 1192 | "outputs": [], 1193 | "source": [ 1194 | "yhat = model.predict(X_valid)" 1195 | ] 1196 | }, 1197 | { 1198 | "cell_type": "markdown", 1199 | "id": "1f0c17f6", 1200 | "metadata": { 1201 | "papermill": { 1202 | "duration": 0.005918, 1203 | "end_time": "2022-06-23T18:06:07.115530", 1204 | "exception": false, 1205 | "start_time": "2022-06-23T18:06:07.109612", 1206 | "status": "completed" 1207 | }, 1208 | "tags": [] 1209 | }, 1210 | "source": [ 1211 | "

Score the Model

\n", 1212 | "
\n", 1213 | "\n", 1214 | "We get a score by evaluating our model on the validation data.\n", 1215 | "\n", 1216 | "First, we use RMSE then MAE." 1217 | ] 1218 | }, 1219 | { 1220 | "cell_type": "code", 1221 | "execution_count": 12, 1222 | "id": "ffa1440c", 1223 | "metadata": { 1224 | "execution": { 1225 | "iopub.execute_input": "2022-06-23T18:06:07.130268Z", 1226 | "iopub.status.busy": "2022-06-23T18:06:07.129610Z", 1227 | "iopub.status.idle": "2022-06-23T18:06:07.136308Z", 1228 | "shell.execute_reply": "2022-06-23T18:06:07.135040Z" 1229 | }, 1230 | "papermill": { 1231 | "duration": 0.016843, 1232 | "end_time": "2022-06-23T18:06:07.138686", 1233 | "exception": false, 1234 | "start_time": "2022-06-23T18:06:07.121843", 1235 | "status": "completed" 1236 | }, 1237 | "tags": [] 1238 | }, 1239 | "outputs": [ 1240 | { 1241 | "name": "stdout", 1242 | "output_type": "stream", 1243 | "text": [ 1244 | "RMSE: 58471.7565\n" 1245 | ] 1246 | } 1247 | ], 1248 | "source": [ 1249 | "from sklearn.metrics import mean_squared_error\n", 1250 | "\n", 1251 | "rmse = np.sqrt(mean_squared_error(y_valid, yhat)) # mean_squared_error returns the MSE; RMSE is its square root\n", 1252 | "print(f\"RMSE: {rmse:.4f}\")" 1253 | ] 1254 | }, 1255 | { 1256 | "cell_type": "code", 1257 | "execution_count": 13, 1258 | "id": "f258e2f2", 1259 | "metadata": { 1260 | "execution": { 1261 | "iopub.execute_input": "2022-06-23T18:06:07.153262Z", 1262 | "iopub.status.busy": "2022-06-23T18:06:07.152885Z", 1263 | "iopub.status.idle": "2022-06-23T18:06:07.159649Z", 1264 | "shell.execute_reply": "2022-06-23T18:06:07.158328Z" 1265 | }, 1266 | "papermill": { 1267 | "duration": 0.016976, 1268 | "end_time": "2022-06-23T18:06:07.162001", 1269 | "exception": false, 1270 | "start_time": "2022-06-23T18:06:07.145025", 1271 | "status": "completed" 1272 | }, 1273 | "tags": [] 1274 | }, 1275 | "outputs": [ 1276 | { 1277 | "name": "stdout", 1278 | "output_type": "stream", 1279 | "text": [ 1280 | "MAE: 38341.2045\n" 1281 | ] 
1282 | } 1283 | ], 1284 | "source": [ 1285 | "from sklearn.metrics import 
mean_absolute_error\n", 1286 | "\n", 1287 | "mae = mean_absolute_error(y_valid, yhat)\n", 1288 | "\n", 1289 | "print(f\"MAE: {mae:.4f}\")" 1290 | ] 1291 | }, 1292 | { 1293 | "cell_type": "markdown", 1294 | "id": "25deeda3", 1295 | "metadata": { 1296 | "papermill": { 1297 | "duration": 0.006018, 1298 | "end_time": "2022-06-23T18:06:07.174455", 1299 | "exception": false, 1300 | "start_time": "2022-06-23T18:06:07.168437", 1301 | "status": "completed" 1302 | }, 1303 | "tags": [] 1304 | }, 1305 | "source": [ 1306 | "## Predict the Test Data" 1307 | ] 1308 | }, 1309 | { 1310 | "cell_type": "code", 1311 | "execution_count": 14, 1312 | "id": "a30f1ba5", 1313 | "metadata": { 1314 | "execution": { 1315 | "iopub.execute_input": "2022-06-23T18:06:07.188732Z", 1316 | "iopub.status.busy": "2022-06-23T18:06:07.188335Z", 1317 | "iopub.status.idle": "2022-06-23T18:06:07.194913Z", 1318 | "shell.execute_reply": "2022-06-23T18:06:07.194026Z" 1319 | }, 1320 | "papermill": { 1321 | "duration": 0.016032, 1322 | "end_time": "2022-06-23T18:06:07.196828", 1323 | "exception": false, 1324 | "start_time": "2022-06-23T18:06:07.180796", 1325 | "status": "completed" 1326 | }, 1327 | "tags": [] 1328 | }, 1329 | "outputs": [], 1330 | "source": [ 1331 | "preds = model.predict(X_test)" 1332 | ] 1333 | }, 1334 | { 1335 | "cell_type": "code", 1336 | "execution_count": 15, 1337 | "id": "1c7023e7", 1338 | "metadata": { 1339 | "execution": { 1340 | "iopub.execute_input": "2022-06-23T18:06:07.211864Z", 1341 | "iopub.status.busy": "2022-06-23T18:06:07.211312Z", 1342 | "iopub.status.idle": "2022-06-23T18:06:07.217621Z", 1343 | "shell.execute_reply": "2022-06-23T18:06:07.216799Z" 1344 | }, 1345 | "papermill": { 1346 | "duration": 0.016026, 1347 | "end_time": "2022-06-23T18:06:07.219512", 1348 | "exception": false, 1349 | "start_time": "2022-06-23T18:06:07.203486", 1350 | "status": "completed" 1351 | }, 1352 | "tags": [] 1353 | }, 1354 | "outputs": [ 1355 | { 1356 | "data": { 1357 | "text/plain": [ 1358 | 
"array([116729.85534672, 161107.57455766, 191854.26223268, 189292.03825976,\n", 1359 | " 156085.61557074])" 1360 | ] 1361 | }, 1362 | "execution_count": 15, 1363 | "metadata": {}, 1364 | "output_type": "execute_result" 1365 | } 1366 | ], 1367 | "source": [ 1368 | "preds[:5]" 1369 | ] 1370 | }, 1371 | { 1372 | "cell_type": "markdown", 1373 | "id": "ca630066", 1374 | "metadata": { 1375 | "papermill": { 1376 | "duration": 0.006015, 1377 | "end_time": "2022-06-23T18:06:07.231973", 1378 | "exception": false, 1379 | "start_time": "2022-06-23T18:06:07.225958", 1380 | "status": "completed" 1381 | }, 1382 | "tags": [] 1383 | }, 1384 | "source": [ 1385 | "

Submission File

\n", 1386 | "
\n", 1387 | "\n", 1388 | "The sample file and our data are in the same row order. This allows us to simply assign our predictions to the target column (`SalePrice`) in the sample submission." 1389 | ] 1390 | }, 1391 | { 1392 | "cell_type": "code", 1393 | "execution_count": 16, 1394 | "id": "3b8fc226", 1395 | "metadata": { 1396 | "execution": { 1397 | "iopub.execute_input": "2022-06-23T18:06:07.247473Z", 1398 | "iopub.status.busy": "2022-06-23T18:06:07.247014Z", 1399 | "iopub.status.idle": "2022-06-23T18:06:07.266970Z", 1400 | "shell.execute_reply": "2022-06-23T18:06:07.265892Z" 1401 | }, 1402 | "papermill": { 1403 | "duration": 0.030114, 1404 | "end_time": "2022-06-23T18:06:07.269137", 1405 | "exception": false, 1406 | "start_time": "2022-06-23T18:06:07.239023", 1407 | "status": "completed" 1408 | }, 1409 | "tags": [] 1410 | }, 1411 | "outputs": [ 1412 | { 1413 | "data": { 1414 | "text/html": [ 1415 | "
\n", 1416 | "\n", 1429 | "\n", 1430 | " \n", 1431 | " \n", 1432 | " \n", 1433 | " \n", 1434 | " \n", 1435 | " \n", 1436 | " \n", 1437 | " \n", 1438 | " \n", 1439 | " \n", 1440 | " \n", 1441 | " \n", 1442 | " \n", 1443 | " \n", 1444 | " \n", 1445 | " \n", 1446 | " \n", 1447 | " \n", 1448 | " \n", 1449 | " \n", 1450 | " \n", 1451 | " \n", 1452 | " \n", 1453 | " \n", 1454 | " \n", 1455 | " \n", 1456 | " \n", 1457 | " \n", 1458 | " \n", 1459 | " \n", 1460 | " \n", 1461 | " \n", 1462 | " \n", 1463 | " \n", 1464 | " \n", 1465 | " \n", 1466 | " \n", 1467 | " \n", 1468 | " \n", 1469 | " \n", 1470 | " \n", 1471 | " \n", 1472 | " \n", 1473 | " \n", 1474 | " \n", 1475 | " \n", 1476 | " \n", 1477 | " \n", 1478 | " \n", 1479 | " \n", 1480 | " \n", 1481 | " \n", 1482 | " \n", 1483 | " \n", 1484 | " \n", 1485 | " \n", 1486 | " \n", 1487 | " \n", 1488 | " \n", 1489 | " \n", 1490 | " \n", 1491 | " \n", 1492 | " \n", 1493 | " \n", 1494 | "
IdSalePrice
01461116729.855347
11462161107.574558
21463191854.262233
31464189292.038260
41465156085.615571
.........
14542915136817.691294
14552916136817.691294
14562917150346.233871
14572918124314.038307
14582919229877.665991
\n", 1495 | "

1459 rows × 2 columns

\n", 1496 | "
" 1497 | ], 1498 | "text/plain": [ 1499 | " Id SalePrice\n", 1500 | "0 1461 116729.855347\n", 1501 | "1 1462 161107.574558\n", 1502 | "2 1463 191854.262233\n", 1503 | "3 1464 189292.038260\n", 1504 | "4 1465 156085.615571\n", 1505 | "... ... ...\n", 1506 | "1454 2915 136817.691294\n", 1507 | "1455 2916 136817.691294\n", 1508 | "1456 2917 150346.233871\n", 1509 | "1457 2918 124314.038307\n", 1510 | "1458 2919 229877.665991\n", 1511 | "\n", 1512 | "[1459 rows x 2 columns]" 1513 | ] 1514 | }, 1515 | "execution_count": 16, 1516 | "metadata": {}, 1517 | "output_type": "execute_result" 1518 | } 1519 | ], 1520 | "source": [ 1521 | "sample_submission[TARGET] = preds\n", 1522 | "sample_submission.to_csv(f\"submission.csv\", index=False)\n", 1523 | "sample_submission" 1524 | ] 1525 | } 1526 | ], 1527 | "metadata": { 1528 | "kernelspec": { 1529 | "display_name": "Python 3", 1530 | "language": "python", 1531 | "name": "python3" 1532 | }, 1533 | "language_info": { 1534 | "codemirror_mode": { 1535 | "name": "ipython", 1536 | "version": 3 1537 | }, 1538 | "file_extension": ".py", 1539 | "mimetype": "text/x-python", 1540 | "name": "python", 1541 | "nbconvert_exporter": "python", 1542 | "pygments_lexer": "ipython3", 1543 | "version": "3.7.12" 1544 | }, 1545 | "papermill": { 1546 | "default_parameters": {}, 1547 | "duration": 12.014499, 1548 | "end_time": "2022-06-23T18:06:07.996953", 1549 | "environment_variables": {}, 1550 | "exception": null, 1551 | "input_path": "__notebook__.ipynb", 1552 | "output_path": "__notebook__.ipynb", 1553 | "parameters": {}, 1554 | "start_time": "2022-06-23T18:05:55.982454", 1555 | "version": "2.3.4" 1556 | } 1557 | }, 1558 | "nbformat": 4, 1559 | "nbformat_minor": 5 1560 | } 1561 | -------------------------------------------------------------------------------- /house-prices-support-vector-regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": 
"e8928847", 6 | "metadata": { 7 | "papermill": { 8 | "duration": 0.00973, 9 | "end_time": "2022-06-26T19:08:37.205804", 10 | "exception": false, 11 | "start_time": "2022-06-26T19:08:37.196074", 12 | "status": "completed" 13 | }, 14 | "tags": [] 15 | }, 16 | "source": [ 17 | "

House Prices: Support Vector Regression

\n", 18 | "
\n", 19 | "\n", 20 | "## Lesson\n", 21 | "\n", 22 | "\n", 23 | "|Notebook| MAE | LeaderBoard|\n", 24 | "| --- | --- | --- |\n", 25 | "|QuickStart|38341.2045|0.29234|\n", 26 | "|Extra Features|32285.7959|0.24425|\n", 27 | "|Features + Lasso|31349.8387|0.24425|\n", 28 | "|Features + Ridge|31348.1429|0.24422|\n", 29 | "|Random Forests|27414.8115|0.23152|\n" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "id": "cc750656", 35 | "metadata": { 36 | "papermill": { 37 | "duration": 0.008949, 38 | "end_time": "2022-06-26T19:08:37.223966", 39 | "exception": false, 40 | "start_time": "2022-06-26T19:08:37.215017", 41 | "status": "completed" 42 | }, 43 | "tags": [] 44 | }, 45 | "source": [ 46 | "

Import Libraries

\n", 47 | "
\n", 48 | "\n", 49 | "A best practice is to include all libraries here. However, I will put a few imports farther down where they are first used so beginners can learn with an \"as needed\" approach." 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 1, 55 | "id": "5b627a86", 56 | "metadata": { 57 | "execution": { 58 | "iopub.execute_input": "2022-06-26T19:08:37.246316Z", 59 | "iopub.status.busy": "2022-06-26T19:08:37.245627Z", 60 | "iopub.status.idle": "2022-06-26T19:08:37.257805Z", 61 | "shell.execute_reply": "2022-06-26T19:08:37.256965Z" 62 | }, 63 | "papermill": { 64 | "duration": 0.024731, 65 | "end_time": "2022-06-26T19:08:37.260273", 66 | "exception": false, 67 | "start_time": "2022-06-26T19:08:37.235542", 68 | "status": "completed" 69 | }, 70 | "tags": [] 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "import numpy as np # linear algebra\n", 75 | "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", 76 | "\n", 77 | "from pathlib import Path\n", 78 | "\n", 79 | "pd.options.display.max_columns = 100 # Want to view all the columns" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "id": "e508e14d", 85 | "metadata": { 86 | "papermill": { 87 | "duration": 0.00822, 88 | "end_time": "2022-06-26T19:08:37.277173", 89 | "exception": false, 90 | "start_time": "2022-06-26T19:08:37.268953", 91 | "status": "completed" 92 | }, 93 | "tags": [] 94 | }, 95 | "source": [ 96 | "

Library

\n", 97 | "
\n", 98 | "\n", 99 | "Creating a few functions that we will reuse in each project." 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 2, 105 | "id": "0388c5fa", 106 | "metadata": { 107 | "execution": { 108 | "iopub.execute_input": "2022-06-26T19:08:37.296629Z", 109 | "iopub.status.busy": "2022-06-26T19:08:37.295947Z", 110 | "iopub.status.idle": "2022-06-26T19:08:37.301674Z", 111 | "shell.execute_reply": "2022-06-26T19:08:37.300872Z" 112 | }, 113 | "papermill": { 114 | "duration": 0.017918, 115 | "end_time": "2022-06-26T19:08:37.303935", 116 | "exception": false, 117 | "start_time": "2022-06-26T19:08:37.286017", 118 | "status": "completed" 119 | }, 120 | "tags": [] 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "def read_data(path):\n", 125 | " data_dir = Path(path)\n", 126 | "\n", 127 | " train = pd.read_csv(data_dir / \"train.csv\")\n", 128 | " test = pd.read_csv(data_dir / \"test.csv\")\n", 129 | " submission_df = pd.read_csv(data_dir / \"sample_submission.csv\")\n", 130 | "\n", 131 | " print(f\"train data: Rows={train.shape[0]}, Columns={train.shape[1]}\")\n", 132 | " print(f\"test data : Rows={test.shape[0]}, Columns={test.shape[1]}\")\n", 133 | " return train, test, submission_df" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 3, 139 | "id": "d828b20d", 140 | "metadata": { 141 | "execution": { 142 | "iopub.execute_input": "2022-06-26T19:08:37.323608Z", 143 | "iopub.status.busy": "2022-06-26T19:08:37.322940Z", 144 | "iopub.status.idle": "2022-06-26T19:08:37.328427Z", 145 | "shell.execute_reply": "2022-06-26T19:08:37.327675Z" 146 | }, 147 | "papermill": { 148 | "duration": 0.017923, 149 | "end_time": "2022-06-26T19:08:37.330498", 150 | "exception": false, 151 | "start_time": "2022-06-26T19:08:37.312575", 152 | "status": "completed" 153 | }, 154 | "tags": [] 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "def create_submission(model_name, target, preds):\n", 159 | " sample_submission[target] = 
preds\n", 160 | " if len(model_name) > 0:\n", 161 | " sample_submission.to_csv(f\"submission_{model_name}.csv\", index=False)\n", 162 | " else:\n", 163 | " sample_submission.to_csv(f\"submission.csv\", index=False)\n", 164 | "\n", 165 | " return sample_submission[:5]" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 4, 171 | "id": "069ae166", 172 | "metadata": { 173 | "execution": { 174 | "iopub.execute_input": "2022-06-26T19:08:37.350034Z", 175 | "iopub.status.busy": "2022-06-26T19:08:37.349646Z", 176 | "iopub.status.idle": "2022-06-26T19:08:38.617881Z", 177 | "shell.execute_reply": "2022-06-26T19:08:38.616783Z" 178 | }, 179 | "papermill": { 180 | "duration": 1.281355, 181 | "end_time": "2022-06-26T19:08:38.620499", 182 | "exception": false, 183 | "start_time": "2022-06-26T19:08:37.339144", 184 | "status": "completed" 185 | }, 186 | "tags": [] 187 | }, 188 | "outputs": [], 189 | "source": [ 190 | "from sklearn.metrics import mean_squared_error\n", 191 | "from sklearn.metrics import mean_absolute_error\n", 192 | "\n", 193 | "def show_scores(gt, yhat):\n", 194 | " rmse = mean_squared_error(gt, yhat)\n", 195 | " mae = mean_absolute_error(gt, yhat)\n", 196 | "\n", 197 | " print(f\"MAE: {mae:.4f}\")\n", 198 | " print(f\"RMSE: {rmse:.4f}\")" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "id": "2e06d6a4", 204 | "metadata": { 205 | "papermill": { 206 | "duration": 0.008327, 207 | "end_time": "2022-06-26T19:08:38.637957", 208 | "exception": false, 209 | "start_time": "2022-06-26T19:08:38.629630", 210 | "status": "completed" 211 | }, 212 | "tags": [] 213 | }, 214 | "source": [ 215 | "

Load Train/Test Data

\n", 216 | "
\n", 217 | "\n", 218 | "- train.csv - Data used to build our machine learning model\n", 219 | "- test.csv - Data used to make predictions with our machine learning model. Does not contain the target variable\n", 220 | "- sample_submission.csv - A file in the proper format to submit test predictions" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 5, 226 | "id": "3baf8429", 227 | "metadata": { 228 | "execution": { 229 | "iopub.execute_input": "2022-06-26T19:08:38.657173Z", 230 | "iopub.status.busy": "2022-06-26T19:08:38.656746Z", 231 | "iopub.status.idle": "2022-06-26T19:08:38.742136Z", 232 | "shell.execute_reply": "2022-06-26T19:08:38.740852Z" 233 | }, 234 | "papermill": { 235 | "duration": 0.099881, 236 | "end_time": "2022-06-26T19:08:38.746365", 237 | "exception": false, 238 | "start_time": "2022-06-26T19:08:38.646484", 239 | "status": "completed" 240 | }, 241 | "tags": [] 242 | }, 243 | "outputs": [ 244 | { 245 | "name": "stdout", 246 | "output_type": "stream", 247 | "text": [ 248 | "train data: Rows=1460, Columns=81\n", 249 | "test data : Rows=1459, Columns=80\n" 250 | ] 251 | } 252 | ], 253 | "source": [ 254 | "train, test, sample_submission = read_data(\"../input/house-prices-advanced-regression-techniques\")" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 6, 260 | "id": "dddbdeea", 261 | "metadata": { 262 | "execution": { 263 | "iopub.execute_input": "2022-06-26T19:08:38.766405Z", 264 | "iopub.status.busy": "2022-06-26T19:08:38.765615Z", 265 | "iopub.status.idle": "2022-06-26T19:08:38.828773Z", 266 | "shell.execute_reply": "2022-06-26T19:08:38.827584Z" 267 | }, 268 | "papermill": { 269 | "duration": 0.076089, 270 | "end_time": "2022-06-26T19:08:38.831690", 271 | "exception": false, 272 | "start_time": "2022-06-26T19:08:38.755601", 273 | "status": "completed" 274 | }, 275 | "tags": [] 276 | }, 277 | "outputs": [ 278 | { 279 | "data": { 280 | "text/html": [ 281 | "
\n", 282 | "\n", 295 | "\n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " 
\n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | 
" \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 
| " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | "
IdMSSubClassMSZoningLotFrontageLotAreaStreetAlleyLotShapeLandContourUtilitiesLotConfigLandSlopeNeighborhoodCondition1Condition2BldgTypeHouseStyleOverallQualOverallCondYearBuiltYearRemodAddRoofStyleRoofMatlExterior1stExterior2ndMasVnrTypeMasVnrAreaExterQualExterCondFoundationBsmtQualBsmtCondBsmtExposureBsmtFinType1BsmtFinSF1BsmtFinType2BsmtFinSF2BsmtUnfSFTotalBsmtSFHeatingHeatingQCCentralAirElectrical1stFlrSF2ndFlrSFLowQualFinSFGrLivAreaBsmtFullBathBsmtHalfBathFullBathHalfBathBedroomAbvGrKitchenAbvGrKitchenQualTotRmsAbvGrdFunctionalFireplacesFireplaceQuGarageTypeGarageYrBltGarageFinishGarageCarsGarageAreaGarageQualGarageCondPavedDriveWoodDeckSFOpenPorchSFEnclosedPorch3SsnPorchScreenPorchPoolAreaPoolQCFenceMiscFeatureMiscValMoSoldYrSoldSaleTypeSaleConditionSalePrice
0160RL65.08450PaveNaNRegLvlAllPubInsideGtlCollgCrNormNorm1Fam2Story7520032003GableCompShgVinylSdVinylSdBrkFace196.0GdTAPConcGdTANoGLQ706Unf0150856GasAExYSBrkr85685401710102131Gd8Typ0NaNAttchd2003.0RFn2548TATAY0610000NaNNaNNaN022008WDNormal208500
1220RL80.09600PaveNaNRegLvlAllPubFR2GtlVeenkerFeedrNorm1Fam1Story6819761976GableCompShgMetalSdMetalSdNone0.0TATACBlockGdTAGdALQ978Unf02841262GasAExYSBrkr1262001262012031TA6Typ1TAAttchd1976.0RFn2460TATAY29800000NaNNaNNaN052007WDNormal181500
2360RL68.011250PaveNaNIR1LvlAllPubInsideGtlCollgCrNormNorm1Fam2Story7520012002GableCompShgVinylSdVinylSdBrkFace162.0GdTAPConcGdTAMnGLQ486Unf0434920GasAExYSBrkr92086601786102131Gd6Typ1TAAttchd2001.0RFn2608TATAY0420000NaNNaNNaN092008WDNormal223500
3470RL60.09550PaveNaNIR1LvlAllPubCornerGtlCrawforNormNorm1Fam2Story7519151970GableCompShgWd SdngWd ShngNone0.0TATABrkTilTAGdNoALQ216Unf0540756GasAGdYSBrkr96175601717101031Gd7Typ1GdDetchd1998.0Unf3642TATAY035272000NaNNaNNaN022006WDAbnorml140000
4560RL84.014260PaveNaNIR1LvlAllPubFR2GtlNoRidgeNormNorm1Fam2Story8520002000GableCompShgVinylSdVinylSdBrkFace350.0GdTAPConcGdTAAvGLQ655Unf04901145GasAExYSBrkr1145105302198102141Gd9Typ1TAAttchd2000.0RFn3836TATAY192840000NaNNaNNaN0122008WDNormal250000
\n", 805 | "
" 806 | ], 807 | "text/plain": [ 808 | " Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape \\\n", 809 | "0 1 60 RL 65.0 8450 Pave NaN Reg \n", 810 | "1 2 20 RL 80.0 9600 Pave NaN Reg \n", 811 | "2 3 60 RL 68.0 11250 Pave NaN IR1 \n", 812 | "3 4 70 RL 60.0 9550 Pave NaN IR1 \n", 813 | "4 5 60 RL 84.0 14260 Pave NaN IR1 \n", 814 | "\n", 815 | " LandContour Utilities LotConfig LandSlope Neighborhood Condition1 \\\n", 816 | "0 Lvl AllPub Inside Gtl CollgCr Norm \n", 817 | "1 Lvl AllPub FR2 Gtl Veenker Feedr \n", 818 | "2 Lvl AllPub Inside Gtl CollgCr Norm \n", 819 | "3 Lvl AllPub Corner Gtl Crawfor Norm \n", 820 | "4 Lvl AllPub FR2 Gtl NoRidge Norm \n", 821 | "\n", 822 | " Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt \\\n", 823 | "0 Norm 1Fam 2Story 7 5 2003 \n", 824 | "1 Norm 1Fam 1Story 6 8 1976 \n", 825 | "2 Norm 1Fam 2Story 7 5 2001 \n", 826 | "3 Norm 1Fam 2Story 7 5 1915 \n", 827 | "4 Norm 1Fam 2Story 8 5 2000 \n", 828 | "\n", 829 | " YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType \\\n", 830 | "0 2003 Gable CompShg VinylSd VinylSd BrkFace \n", 831 | "1 1976 Gable CompShg MetalSd MetalSd None \n", 832 | "2 2002 Gable CompShg VinylSd VinylSd BrkFace \n", 833 | "3 1970 Gable CompShg Wd Sdng Wd Shng None \n", 834 | "4 2000 Gable CompShg VinylSd VinylSd BrkFace \n", 835 | "\n", 836 | " MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure \\\n", 837 | "0 196.0 Gd TA PConc Gd TA No \n", 838 | "1 0.0 TA TA CBlock Gd TA Gd \n", 839 | "2 162.0 Gd TA PConc Gd TA Mn \n", 840 | "3 0.0 TA TA BrkTil TA Gd No \n", 841 | "4 350.0 Gd TA PConc Gd TA Av \n", 842 | "\n", 843 | " BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF \\\n", 844 | "0 GLQ 706 Unf 0 150 856 \n", 845 | "1 ALQ 978 Unf 0 284 1262 \n", 846 | "2 GLQ 486 Unf 0 434 920 \n", 847 | "3 ALQ 216 Unf 0 540 756 \n", 848 | "4 GLQ 655 Unf 0 490 1145 \n", 849 | "\n", 850 | " Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF 
LowQualFinSF \\\n", 851 | "0 GasA Ex Y SBrkr 856 854 0 \n", 852 | "1 GasA Ex Y SBrkr 1262 0 0 \n", 853 | "2 GasA Ex Y SBrkr 920 866 0 \n", 854 | "3 GasA Gd Y SBrkr 961 756 0 \n", 855 | "4 GasA Ex Y SBrkr 1145 1053 0 \n", 856 | "\n", 857 | " GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr \\\n", 858 | "0 1710 1 0 2 1 3 \n", 859 | "1 1262 0 1 2 0 3 \n", 860 | "2 1786 1 0 2 1 3 \n", 861 | "3 1717 1 0 1 0 3 \n", 862 | "4 2198 1 0 2 1 4 \n", 863 | "\n", 864 | " KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu \\\n", 865 | "0 1 Gd 8 Typ 0 NaN \n", 866 | "1 1 TA 6 Typ 1 TA \n", 867 | "2 1 Gd 6 Typ 1 TA \n", 868 | "3 1 Gd 7 Typ 1 Gd \n", 869 | "4 1 Gd 9 Typ 1 TA \n", 870 | "\n", 871 | " GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual \\\n", 872 | "0 Attchd 2003.0 RFn 2 548 TA \n", 873 | "1 Attchd 1976.0 RFn 2 460 TA \n", 874 | "2 Attchd 2001.0 RFn 2 608 TA \n", 875 | "3 Detchd 1998.0 Unf 3 642 TA \n", 876 | "4 Attchd 2000.0 RFn 3 836 TA \n", 877 | "\n", 878 | " GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch \\\n", 879 | "0 TA Y 0 61 0 0 \n", 880 | "1 TA Y 298 0 0 0 \n", 881 | "2 TA Y 0 42 0 0 \n", 882 | "3 TA Y 0 35 272 0 \n", 883 | "4 TA Y 192 84 0 0 \n", 884 | "\n", 885 | " ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold \\\n", 886 | "0 0 0 NaN NaN NaN 0 2 2008 \n", 887 | "1 0 0 NaN NaN NaN 0 5 2007 \n", 888 | "2 0 0 NaN NaN NaN 0 9 2008 \n", 889 | "3 0 0 NaN NaN NaN 0 2 2006 \n", 890 | "4 0 0 NaN NaN NaN 0 12 2008 \n", 891 | "\n", 892 | " SaleType SaleCondition SalePrice \n", 893 | "0 WD Normal 208500 \n", 894 | "1 WD Normal 181500 \n", 895 | "2 WD Normal 223500 \n", 896 | "3 WD Abnorml 140000 \n", 897 | "4 WD Normal 250000 " 898 | ] 899 | }, 900 | "execution_count": 6, 901 | "metadata": {}, 902 | "output_type": "execute_result" 903 | } 904 | ], 905 | "source": [ 906 | "train.head()" 907 | ] 908 | }, 909 | { 910 | "cell_type": "markdown", 911 | "id": "56b3c06c", 912 
| "metadata": { 913 | "papermill": { 914 | "duration": 0.009867, 915 | "end_time": "2022-06-26T19:08:38.851124", 916 | "exception": false, 917 | "start_time": "2022-06-26T19:08:38.841257", 918 | "status": "completed" 919 | }, 920 | "tags": [] 921 | }, 922 | "source": [ 923 | "In supervised learning problems, we have a label or target." 924 | ] 925 | }, 926 | { 927 | "cell_type": "code", 928 | "execution_count": 7, 929 | "id": "54548c5a", 930 | "metadata": { 931 | "execution": { 932 | "iopub.execute_input": "2022-06-26T19:08:38.872015Z", 933 | "iopub.status.busy": "2022-06-26T19:08:38.871616Z", 934 | "iopub.status.idle": "2022-06-26T19:08:38.875792Z", 935 | "shell.execute_reply": "2022-06-26T19:08:38.874984Z" 936 | }, 937 | "papermill": { 938 | "duration": 0.016975, 939 | "end_time": "2022-06-26T19:08:38.877717", 940 | "exception": false, 941 | "start_time": "2022-06-26T19:08:38.860742", 942 | "status": "completed" 943 | }, 944 | "tags": [] 945 | }, 946 | "outputs": [], 947 | "source": [ 948 | "TARGET = \"SalePrice\"" 949 | ] 950 | }, 951 | { 952 | "cell_type": "markdown", 953 | "id": "fb703318", 954 | "metadata": { 955 | "papermill": { 956 | "duration": 0.009171, 957 | "end_time": "2022-06-26T19:08:38.896368", 958 | "exception": false, 959 | "start_time": "2022-06-26T19:08:38.887197", 960 | "status": "completed" 961 | }, 962 | "tags": [] 963 | }, 964 | "source": [ 965 | "There are 79 features, but to keep it simple we are only going to start with four."
966 | ] 967 | }, 968 | { 969 | "cell_type": "code", 970 | "execution_count": 8, 971 | "id": "d6a33275", 972 | "metadata": { 973 | "execution": { 974 | "iopub.execute_input": "2022-06-26T19:08:38.918059Z", 975 | "iopub.status.busy": "2022-06-26T19:08:38.917390Z", 976 | "iopub.status.idle": "2022-06-26T19:08:38.921655Z", 977 | "shell.execute_reply": "2022-06-26T19:08:38.920844Z" 978 | }, 979 | "papermill": { 980 | "duration": 0.018064, 981 | "end_time": "2022-06-26T19:08:38.924104", 982 | "exception": false, 983 | "start_time": "2022-06-26T19:08:38.906040", 984 | "status": "completed" 985 | }, 986 | "tags": [] 987 | }, 988 | "outputs": [], 989 | "source": [ 990 | "FEATURES = [\"GrLivArea\", \"LotArea\", \"TotalBsmtSF\", \"FullBath\"]" 991 | ] 992 | }, 993 | { 994 | "cell_type": "markdown", 995 | "id": "34e18fb3", 996 | "metadata": { 997 | "papermill": { 998 | "duration": 0.009445, 999 | "end_time": "2022-06-26T19:08:38.943475", 1000 | "exception": false, 1001 | "start_time": "2022-06-26T19:08:38.934030", 1002 | "status": "completed" 1003 | }, 1004 | "tags": [] 1005 | }, 1006 | "source": [ 1007 | "

Missing Data

\n", 1008 | "
" 1009 | ] 1010 | }, 1011 | { 1012 | "cell_type": "code", 1013 | "execution_count": 9, 1014 | "id": "efa2c3f0", 1015 | "metadata": { 1016 | "execution": { 1017 | "iopub.execute_input": "2022-06-26T19:08:38.964845Z", 1018 | "iopub.status.busy": "2022-06-26T19:08:38.964244Z", 1019 | "iopub.status.idle": "2022-06-26T19:08:38.981794Z", 1020 | "shell.execute_reply": "2022-06-26T19:08:38.980875Z" 1021 | }, 1022 | "papermill": { 1023 | "duration": 0.031101, 1024 | "end_time": "2022-06-26T19:08:38.984315", 1025 | "exception": false, 1026 | "start_time": "2022-06-26T19:08:38.953214", 1027 | "status": "completed" 1028 | }, 1029 | "tags": [] 1030 | }, 1031 | "outputs": [ 1032 | { 1033 | "name": "stdout", 1034 | "output_type": "stream", 1035 | "text": [ 1036 | "===== Train =====\n", 1037 | "GrLivArea 0\n", 1038 | "LotArea 0\n", 1039 | "TotalBsmtSF 0\n", 1040 | "FullBath 0\n", 1041 | "dtype: int64\n", 1042 | "===== Test =====\n", 1043 | "GrLivArea 0\n", 1044 | "LotArea 0\n", 1045 | "TotalBsmtSF 1\n", 1046 | "FullBath 0\n", 1047 | "dtype: int64\n" 1048 | ] 1049 | } 1050 | ], 1051 | "source": [ 1052 | "print(5*\"=\",\"Train\", 5*\"=\")\n", 1053 | "print(train[FEATURES].isnull().sum())\n", 1054 | "print(5*\"=\",\"Test\", 5*\"=\")\n", 1055 | "print(test[FEATURES].isnull().sum())" 1056 | ] 1057 | }, 1058 | { 1059 | "cell_type": "code", 1060 | "execution_count": 10, 1061 | "id": "f326aeaf", 1062 | "metadata": { 1063 | "execution": { 1064 | "iopub.execute_input": "2022-06-26T19:08:39.005829Z", 1065 | "iopub.status.busy": "2022-06-26T19:08:39.005080Z", 1066 | "iopub.status.idle": "2022-06-26T19:08:39.012807Z", 1067 | "shell.execute_reply": "2022-06-26T19:08:39.011763Z" 1068 | }, 1069 | "papermill": { 1070 | "duration": 0.021473, 1071 | "end_time": "2022-06-26T19:08:39.015483", 1072 | "exception": false, 1073 | "start_time": "2022-06-26T19:08:38.994010", 1074 | "status": "completed" 1075 | }, 1076 | "tags": [] 1077 | }, 1078 | "outputs": [], 1079 | "source": [ 1080 | 
"test[\"TotalBsmtSF\"] = test[\"TotalBsmtSF\"].fillna(0)" 1081 | ] 1082 | }, 1083 | { 1084 | "cell_type": "markdown", 1085 | "id": "0d4584ab", 1086 | "metadata": { 1087 | "papermill": { 1088 | "duration": 0.009323, 1089 | "end_time": "2022-06-26T19:08:39.034419", 1090 | "exception": false, 1091 | "start_time": "2022-06-26T19:08:39.025096", 1092 | "status": "completed" 1093 | }, 1094 | "tags": [] 1095 | }, 1096 | "source": [ 1097 | "## Verify No Missing Data" 1098 | ] 1099 | }, 1100 | { 1101 | "cell_type": "code", 1102 | "execution_count": 11, 1103 | "id": "44ef427e", 1104 | "metadata": { 1105 | "execution": { 1106 | "iopub.execute_input": "2022-06-26T19:08:39.056083Z", 1107 | "iopub.status.busy": "2022-06-26T19:08:39.055301Z", 1108 | "iopub.status.idle": "2022-06-26T19:08:39.063768Z", 1109 | "shell.execute_reply": "2022-06-26T19:08:39.062640Z" 1110 | }, 1111 | "papermill": { 1112 | "duration": 0.023281, 1113 | "end_time": "2022-06-26T19:08:39.067297", 1114 | "exception": false, 1115 | "start_time": "2022-06-26T19:08:39.044016", 1116 | "status": "completed" 1117 | }, 1118 | "tags": [] 1119 | }, 1120 | "outputs": [ 1121 | { 1122 | "name": "stdout", 1123 | "output_type": "stream", 1124 | "text": [ 1125 | "GrLivArea 0\n", 1126 | "LotArea 0\n", 1127 | "TotalBsmtSF 0\n", 1128 | "FullBath 0\n", 1129 | "dtype: int64\n" 1130 | ] 1131 | } 1132 | ], 1133 | "source": [ 1134 | "print(test[FEATURES].isnull().sum())" 1135 | ] 1136 | }, 1137 | { 1138 | "cell_type": "code", 1139 | "execution_count": 12, 1140 | "id": "05271dee", 1141 | "metadata": { 1142 | "execution": { 1143 | "iopub.execute_input": "2022-06-26T19:08:39.088405Z", 1144 | "iopub.status.busy": "2022-06-26T19:08:39.087595Z", 1145 | "iopub.status.idle": "2022-06-26T19:08:39.095630Z", 1146 | "shell.execute_reply": "2022-06-26T19:08:39.094588Z" 1147 | }, 1148 | "papermill": { 1149 | "duration": 0.021808, 1150 | "end_time": "2022-06-26T19:08:39.098540", 1151 | "exception": false, 1152 | "start_time": 
"2022-06-26T19:08:39.076732", 1153 | "status": "completed" 1154 | }, 1155 | "tags": [] 1156 | }, 1157 | "outputs": [], 1158 | "source": [ 1159 | "y = train[TARGET]\n", 1160 | "X = train[FEATURES].copy()\n", 1161 | "\n", 1162 | "X_test = test[FEATURES].copy()" 1163 | ] 1164 | }, 1165 | { 1166 | "cell_type": "code", 1167 | "execution_count": 13, 1168 | "id": "ef331cfe", 1169 | "metadata": { 1170 | "execution": { 1171 | "iopub.execute_input": "2022-06-26T19:08:39.120457Z", 1172 | "iopub.status.busy": "2022-06-26T19:08:39.119696Z", 1173 | "iopub.status.idle": "2022-06-26T19:08:39.130924Z", 1174 | "shell.execute_reply": "2022-06-26T19:08:39.129824Z" 1175 | }, 1176 | "papermill": { 1177 | "duration": 0.024864, 1178 | "end_time": "2022-06-26T19:08:39.133226", 1179 | "exception": false, 1180 | "start_time": "2022-06-26T19:08:39.108362", 1181 | "status": "completed" 1182 | }, 1183 | "tags": [] 1184 | }, 1185 | "outputs": [ 1186 | { 1187 | "data": { 1188 | "text/html": [ 1189 | "
\n", 1190 | "\n", 1203 | "\n", 1204 | " \n", 1205 | " \n", 1206 | " \n", 1207 | " \n", 1208 | " \n", 1209 | " \n", 1210 | " \n", 1211 | " \n", 1212 | " \n", 1213 | " \n", 1214 | " \n", 1215 | " \n", 1216 | " \n", 1217 | " \n", 1218 | " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | "
GrLivAreaLotAreaTotalBsmtSFFullBath
0171084508562
11262960012622
21786112509202
3171795507561
421981426011452
\n", 1251 | "
" 1252 | ], 1253 | "text/plain": [ 1254 | " GrLivArea LotArea TotalBsmtSF FullBath\n", 1255 | "0 1710 8450 856 2\n", 1256 | "1 1262 9600 1262 2\n", 1257 | "2 1786 11250 920 2\n", 1258 | "3 1717 9550 756 1\n", 1259 | "4 2198 14260 1145 2" 1260 | ] 1261 | }, 1262 | "execution_count": 13, 1263 | "metadata": {}, 1264 | "output_type": "execute_result" 1265 | } 1266 | ], 1267 | "source": [ 1268 | "X.head()" 1269 | ] 1270 | }, 1271 | { 1272 | "cell_type": "markdown", 1273 | "id": "d8ba43c0", 1274 | "metadata": { 1275 | "papermill": { 1276 | "duration": 0.009481, 1277 | "end_time": "2022-06-26T19:08:39.152605", 1278 | "exception": false, 1279 | "start_time": "2022-06-26T19:08:39.143124", 1280 | "status": "completed" 1281 | }, 1282 | "tags": [] 1283 | }, 1284 | "source": [ 1285 | "## Scale the Data\n", 1286 | "\n", 1287 | "The features are standardized with `StandardScaler`: the scaler is fit on the training features and then applied to both the training and test features." 1288 | ] 1289 | }, 1290 | { 1291 | "cell_type": "code", 1292 | "execution_count": 14, 1293 | "id": "6ad0b044", 1294 | "metadata": { 1295 | "execution": { 1296 | "iopub.execute_input": "2022-06-26T19:08:39.174223Z", 1297 | "iopub.status.busy": "2022-06-26T19:08:39.173836Z", 1298 | "iopub.status.idle": "2022-06-26T19:08:39.187573Z", 1299 | "shell.execute_reply": "2022-06-26T19:08:39.186284Z" 1300 | }, 1301 | "papermill": { 1302 | "duration": 0.027871, 1303 | "end_time": "2022-06-26T19:08:39.190327", 1304 | "exception": false, 1305 | "start_time": "2022-06-26T19:08:39.162456", 1306 | "status": "completed" 1307 | }, 1308 | "tags": [] 1309 | }, 1310 | "outputs": [], 1311 | "source": [ 1312 | "from sklearn.preprocessing import StandardScaler, RobustScaler\n", 1313 | "\n", 1314 | "scaler = StandardScaler()\n", 1315 | "\n", 1316 | "X = scaler.fit(X).transform(X)\n", 1317 | "X_test = scaler.transform(X_test)" 1318 | ] 1319 | }, 1320 | { 1321 | "cell_type": "code", 1322 | "execution_count": 15, 1323 | "id": "cbfc9c45", 1324 | "metadata": { 1325 | "execution": { 1326 | "iopub.execute_input": 
"2022-06-26T19:08:39.212030Z", 1327 | "iopub.status.busy": "2022-06-26T19:08:39.211592Z", 1328 | "iopub.status.idle": "2022-06-26T19:08:39.219701Z", 1329 | "shell.execute_reply": "2022-06-26T19:08:39.218390Z" 1330 | }, 1331 | "papermill": { 1332 | "duration": 0.02213, 1333 | "end_time": "2022-06-26T19:08:39.222348", 1334 | "exception": false, 1335 | "start_time": "2022-06-26T19:08:39.200218", 1336 | "status": "completed" 1337 | }, 1338 | "tags": [] 1339 | }, 1340 | "outputs": [ 1341 | { 1342 | "data": { 1343 | "text/plain": [ 1344 | "array([[ 0.37033344, -0.20714171, -0.45930254, 0.78974052],\n", 1345 | " [-0.48251191, -0.09188637, 0.46646492, 0.78974052],\n", 1346 | " [ 0.51501256, 0.07347998, -0.31336875, 0.78974052],\n", 1347 | " [ 0.38365915, -0.09689747, -0.68732408, -1.02604084],\n", 1348 | " [ 1.2993257 , 0.37514829, 0.19967971, 0.78974052]])" 1349 | ] 1350 | }, 1351 | "execution_count": 15, 1352 | "metadata": {}, 1353 | "output_type": "execute_result" 1354 | } 1355 | ], 1356 | "source": [ 1357 | "X[:5]" 1358 | ] 1359 | }, 1360 | { 1361 | "cell_type": "markdown", 1362 | "id": "bd0d7597", 1363 | "metadata": { 1364 | "papermill": { 1365 | "duration": 0.009701, 1366 | "end_time": "2022-06-26T19:08:39.242623", 1367 | "exception": false, 1368 | "start_time": "2022-06-26T19:08:39.232922", 1369 | "status": "completed" 1370 | }, 1371 | "tags": [] 1372 | }, 1373 | "source": [ 1374 | "

Train Model with Train/Test Split

\n", 1375 | "
\n", 1376 | "\n", 1377 | "We split the training data so we can evaluate how well each model performs. We are saving 30% of the training data to validate the model(s)." 1378 | ] 1379 | }, 1380 | { 1381 | "cell_type": "code", 1382 | "execution_count": 16, 1383 | "id": "7ba8229f", 1384 | "metadata": { 1385 | "execution": { 1386 | "iopub.execute_input": "2022-06-26T19:08:39.264387Z", 1387 | "iopub.status.busy": "2022-06-26T19:08:39.263989Z", 1388 | "iopub.status.idle": "2022-06-26T19:08:39.288524Z", 1389 | "shell.execute_reply": "2022-06-26T19:08:39.287669Z" 1390 | }, 1391 | "papermill": { 1392 | "duration": 0.038305, 1393 | "end_time": "2022-06-26T19:08:39.290901", 1394 | "exception": false, 1395 | "start_time": "2022-06-26T19:08:39.252596", 1396 | "status": "completed" 1397 | }, 1398 | "tags": [] 1399 | }, 1400 | "outputs": [ 1401 | { 1402 | "data": { 1403 | "text/plain": [ 1404 | "((1022, 4), (1022,), (438, 4), (438,))" 1405 | ] 1406 | }, 1407 | "execution_count": 16, 1408 | "metadata": {}, 1409 | "output_type": "execute_result" 1410 | } 1411 | ], 1412 | "source": [ 1413 | "from sklearn.model_selection import train_test_split\n", 1414 | "\n", 1415 | "X_train, X_valid, y_train, y_valid = train_test_split(\n", 1416 | " X,\n", 1417 | " y,\n", 1418 | " test_size=0.3, # Save 30% for validation\n", 1419 | " random_state=42, # Make the split deterministic\n", 1420 | ")\n", 1421 | "X_train.shape, y_train.shape, X_valid.shape, y_valid.shape" 1422 | ] 1423 | }, 1424 | { 1425 | "cell_type": "markdown", 1426 | "id": "6a46811a", 1427 | "metadata": { 1428 | "papermill": { 1429 | "duration": 0.010196, 1430 | "end_time": "2022-06-26T19:08:39.311310", 1431 | "exception": false, 1432 | "start_time": "2022-06-26T19:08:39.301114", 1433 | "status": "completed" 1434 | }, 1435 | "tags": [] 1436 | }, 1437 | "source": [ 1438 | "

Create Models

\n", 1439 | "
\n", 1440 | "\n", 1441 | "\n", 1442 | "- [Parameters](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html)" 1443 | ] 1444 | }, 1445 | { 1446 | "cell_type": "code", 1447 | "execution_count": 17, 1448 | "id": "3ede0d2d", 1449 | "metadata": { 1450 | "execution": { 1451 | "iopub.execute_input": "2022-06-26T19:08:39.333540Z", 1452 | "iopub.status.busy": "2022-06-26T19:08:39.333122Z", 1453 | "iopub.status.idle": "2022-06-26T19:08:39.487823Z", 1454 | "shell.execute_reply": "2022-06-26T19:08:39.486003Z" 1455 | }, 1456 | "papermill": { 1457 | "duration": 0.168774, 1458 | "end_time": "2022-06-26T19:08:39.490394", 1459 | "exception": false, 1460 | "start_time": "2022-06-26T19:08:39.321620", 1461 | "status": "completed" 1462 | }, 1463 | "tags": [] 1464 | }, 1465 | "outputs": [ 1466 | { 1467 | "name": "stdout", 1468 | "output_type": "stream", 1469 | "text": [ 1470 | "MAE: 32359.5799\n", 1471 | "RMSE: 2773359243.2345\n" 1472 | ] 1473 | } 1474 | ], 1475 | "source": [ 1476 | "from sklearn.svm import SVR\n", 1477 | "\n", 1478 | "model = SVR(kernel=\"linear\", C=100, gamma=\"auto\")\n", 1479 | "model.fit(X_train, y_train)\n", 1480 | "\n", 1481 | "valid_preds = model.predict(X_valid)\n", 1482 | "show_scores(y_valid, valid_preds)" 1483 | ] 1484 | }, 1485 | { 1486 | "cell_type": "code", 1487 | "execution_count": 18, 1488 | "id": "ba5ba2f5", 1489 | "metadata": { 1490 | "execution": { 1491 | "iopub.execute_input": "2022-06-26T19:08:39.512530Z", 1492 | "iopub.status.busy": "2022-06-26T19:08:39.512094Z", 1493 | "iopub.status.idle": "2022-06-26T19:08:39.569698Z", 1494 | "shell.execute_reply": "2022-06-26T19:08:39.568547Z" 1495 | }, 1496 | "papermill": { 1497 | "duration": 0.072106, 1498 | "end_time": "2022-06-26T19:08:39.572638", 1499 | "exception": false, 1500 | "start_time": "2022-06-26T19:08:39.500532", 1501 | "status": "completed" 1502 | }, 1503 | "tags": [] 1504 | }, 1505 | "outputs": [ 1506 | { 1507 | "data": { 1508 | "text/html": [ 1509 | "
\n", 1510 | "\n", 1523 | "\n", 1524 | " \n", 1525 | " \n", 1526 | " \n", 1527 | " \n", 1528 | " \n", 1529 | " \n", 1530 | " \n", 1531 | " \n", 1532 | " \n", 1533 | " \n", 1534 | " \n", 1535 | " \n", 1536 | " \n", 1537 | " \n", 1538 | " \n", 1539 | " \n", 1540 | " \n", 1541 | " \n", 1542 | " \n", 1543 | " \n", 1544 | " \n", 1545 | " \n", 1546 | " \n", 1547 | " \n", 1548 | " \n", 1549 | " \n", 1550 | " \n", 1551 | " \n", 1552 | " \n", 1553 | " \n", 1554 | " \n", 1555 | " \n", 1556 | " \n", 1557 | " \n", 1558 | "
IdSalePrice
01461127011.171812
11462165442.766019
21463187487.183847
31464184086.496238
41465182795.993094
\n", 1559 | "
" 1560 | ], 1561 | "text/plain": [ 1562 | " Id SalePrice\n", 1563 | "0 1461 127011.171812\n", 1564 | "1 1462 165442.766019\n", 1565 | "2 1463 187487.183847\n", 1566 | "3 1464 184086.496238\n", 1567 | "4 1465 182795.993094" 1568 | ] 1569 | }, 1570 | "execution_count": 18, 1571 | "metadata": {}, 1572 | "output_type": "execute_result" 1573 | } 1574 | ], 1575 | "source": [ 1576 | "test_preds = model.predict(X_test)\n", 1577 | "\n", 1578 | "create_submission(\"svr_lin\", TARGET, test_preds)" 1579 | ] 1580 | }, 1581 | { 1582 | "cell_type": "markdown", 1583 | "id": "5508e984", 1584 | "metadata": { 1585 | "papermill": { 1586 | "duration": 0.010723, 1587 | "end_time": "2022-06-26T19:08:39.594101", 1588 | "exception": false, 1589 | "start_time": "2022-06-26T19:08:39.583378", 1590 | "status": "completed" 1591 | }, 1592 | "tags": [] 1593 | }, 1594 | "source": [ 1595 | "### RBF Kernel" 1596 | ] 1597 | }, 1598 | { 1599 | "cell_type": "code", 1600 | "execution_count": 19, 1601 | "id": "6688d4c3", 1602 | "metadata": { 1603 | "execution": { 1604 | "iopub.execute_input": "2022-06-26T19:08:39.618574Z", 1605 | "iopub.status.busy": "2022-06-26T19:08:39.618039Z", 1606 | "iopub.status.idle": "2022-06-26T19:08:39.707210Z", 1607 | "shell.execute_reply": "2022-06-26T19:08:39.705582Z" 1608 | }, 1609 | "papermill": { 1610 | "duration": 0.104665, 1611 | "end_time": "2022-06-26T19:08:39.709956", 1612 | "exception": false, 1613 | "start_time": "2022-06-26T19:08:39.605291", 1614 | "status": "completed" 1615 | }, 1616 | "tags": [] 1617 | }, 1618 | "outputs": [ 1619 | { 1620 | "name": "stdout", 1621 | "output_type": "stream", 1622 | "text": [ 1623 | "MAE: 49202.1042\n", 1624 | "RMSE: 6041789054.8483\n" 1625 | ] 1626 | } 1627 | ], 1628 | "source": [ 1629 | "model = SVR(kernel=\"rbf\", C=100, gamma=0.1, epsilon=0.1)\n", 1630 | "model.fit(X_train, y_train)\n", 1631 | "\n", 1632 | "valid_preds = model.predict(X_valid)\n", 1633 | "show_scores(y_valid, valid_preds)" 1634 | ] 1635 | }, 1636 | { 1637 | 
"cell_type": "code", 1638 | "execution_count": 20, 1639 | "id": "c1669705", 1640 | "metadata": { 1641 | "execution": { 1642 | "iopub.execute_input": "2022-06-26T19:08:39.732682Z", 1643 | "iopub.status.busy": "2022-06-26T19:08:39.732281Z", 1644 | "iopub.status.idle": "2022-06-26T19:08:39.824440Z", 1645 | "shell.execute_reply": "2022-06-26T19:08:39.823525Z" 1646 | }, 1647 | "papermill": { 1648 | "duration": 0.106608, 1649 | "end_time": "2022-06-26T19:08:39.826993", 1650 | "exception": false, 1651 | "start_time": "2022-06-26T19:08:39.720385", 1652 | "status": "completed" 1653 | }, 1654 | "tags": [] 1655 | }, 1656 | "outputs": [ 1657 | { 1658 | "data": { 1659 | "text/html": [ 1660 | "
\n", 1661 | "\n", 1674 | "\n", 1675 | " \n", 1676 | " \n", 1677 | " \n", 1678 | " \n", 1679 | " \n", 1680 | " \n", 1681 | " \n", 1682 | " \n", 1683 | " \n", 1684 | " \n", 1685 | " \n", 1686 | " \n", 1687 | " \n", 1688 | " \n", 1689 | " \n", 1690 | " \n", 1691 | " \n", 1692 | " \n", 1693 | " \n", 1694 | " \n", 1695 | " \n", 1696 | " \n", 1697 | " \n", 1698 | " \n", 1699 | " \n", 1700 | " \n", 1701 | " \n", 1702 | " \n", 1703 | " \n", 1704 | " \n", 1705 | " \n", 1706 | " \n", 1707 | " \n", 1708 | " \n", 1709 | "
IdSalePrice
01461153257.415273
11462162495.518762
21463173119.150335
31464172210.631489
41465171427.171419
\n", 1710 | "
" 1711 | ], 1712 | "text/plain": [ 1713 | " Id SalePrice\n", 1714 | "0 1461 153257.415273\n", 1715 | "1 1462 162495.518762\n", 1716 | "2 1463 173119.150335\n", 1717 | "3 1464 172210.631489\n", 1718 | "4 1465 171427.171419" 1719 | ] 1720 | }, 1721 | "execution_count": 20, 1722 | "metadata": {}, 1723 | "output_type": "execute_result" 1724 | } 1725 | ], 1726 | "source": [ 1727 | "test_preds = model.predict(X_test)\n", 1728 | "\n", 1729 | "create_submission(\"svr_rbf\", TARGET, test_preds)" 1730 | ] 1731 | }, 1732 | { 1733 | "cell_type": "markdown", 1734 | "id": "ce283f35", 1735 | "metadata": { 1736 | "papermill": { 1737 | "duration": 0.010606, 1738 | "end_time": "2022-06-26T19:08:39.848185", 1739 | "exception": false, 1740 | "start_time": "2022-06-26T19:08:39.837579", 1741 | "status": "completed" 1742 | }, 1743 | "tags": [] 1744 | }, 1745 | "source": [ 1746 | "### Polynomial Kernel" 1747 | ] 1748 | }, 1749 | { 1750 | "cell_type": "code", 1751 | "execution_count": 21, 1752 | "id": "1a11f448", 1753 | "metadata": { 1754 | "execution": { 1755 | "iopub.execute_input": "2022-06-26T19:08:39.871733Z", 1756 | "iopub.status.busy": "2022-06-26T19:08:39.871353Z", 1757 | "iopub.status.idle": "2022-06-26T19:08:40.032033Z", 1758 | "shell.execute_reply": "2022-06-26T19:08:40.030769Z" 1759 | }, 1760 | "papermill": { 1761 | "duration": 0.175448, 1762 | "end_time": "2022-06-26T19:08:40.034759", 1763 | "exception": false, 1764 | "start_time": "2022-06-26T19:08:39.859311", 1765 | "status": "completed" 1766 | }, 1767 | "tags": [] 1768 | }, 1769 | "outputs": [ 1770 | { 1771 | "name": "stdout", 1772 | "output_type": "stream", 1773 | "text": [ 1774 | "MAE: 29928.0756\n", 1775 | "RMSE: 2239106181.3367\n" 1776 | ] 1777 | } 1778 | ], 1779 | "source": [ 1780 | "model = SVR(kernel=\"poly\", C=150, gamma=\"auto\", degree=5, epsilon=0.01, coef0=1)\n", 1781 | "model.fit(X_train, y_train)\n", 1782 | "\n", 1783 | "valid_preds = model.predict(X_valid)\n", 1784 | "show_scores(y_valid, valid_preds)" 1785 
| ] 1786 | }, 1787 | { 1788 | "cell_type": "code", 1789 | "execution_count": 22, 1790 | "id": "2296155a", 1791 | "metadata": { 1792 | "execution": { 1793 | "iopub.execute_input": "2022-06-26T19:08:40.058446Z", 1794 | "iopub.status.busy": "2022-06-26T19:08:40.057755Z", 1795 | "iopub.status.idle": "2022-06-26T19:08:40.118457Z", 1796 | "shell.execute_reply": "2022-06-26T19:08:40.116917Z" 1797 | }, 1798 | "papermill": { 1799 | "duration": 0.075624, 1800 | "end_time": "2022-06-26T19:08:40.121175", 1801 | "exception": false, 1802 | "start_time": "2022-06-26T19:08:40.045551", 1803 | "status": "completed" 1804 | }, 1805 | "tags": [] 1806 | }, 1807 | "outputs": [ 1808 | { 1809 | "data": { 1810 | "text/html": [ 1811 | "
\n", 1812 | "\n", 1825 | "\n", 1826 | " \n", 1827 | " \n", 1828 | " \n", 1829 | " \n", 1830 | " \n", 1831 | " \n", 1832 | " \n", 1833 | " \n", 1834 | " \n", 1835 | " \n", 1836 | " \n", 1837 | " \n", 1838 | " \n", 1839 | " \n", 1840 | " \n", 1841 | " \n", 1842 | " \n", 1843 | " \n", 1844 | " \n", 1845 | " \n", 1846 | " \n", 1847 | " \n", 1848 | " \n", 1849 | " \n", 1850 | " \n", 1851 | " \n", 1852 | " \n", 1853 | " \n", 1854 | " \n", 1855 | " \n", 1856 | " \n", 1857 | " \n", 1858 | " \n", 1859 | " \n", 1860 | "
IdSalePrice
01461128694.315636
11462164224.048269
21463186707.703004
31464183115.317499
41465178954.288237
\n", 1861 | "
" 1862 | ], 1863 | "text/plain": [ 1864 | " Id SalePrice\n", 1865 | "0 1461 128694.315636\n", 1866 | "1 1462 164224.048269\n", 1867 | "2 1463 186707.703004\n", 1868 | "3 1464 183115.317499\n", 1869 | "4 1465 178954.288237" 1870 | ] 1871 | }, 1872 | "execution_count": 22, 1873 | "metadata": {}, 1874 | "output_type": "execute_result" 1875 | } 1876 | ], 1877 | "source": [ 1878 | "test_preds = model.predict(X_test)\n", 1879 | "\n", 1880 | "create_submission(\"svr_poly\", TARGET, test_preds)" 1881 | ] 1882 | } 1883 | ], 1884 | "metadata": { 1885 | "kernelspec": { 1886 | "display_name": "Python 3", 1887 | "language": "python", 1888 | "name": "python3" 1889 | }, 1890 | "language_info": { 1891 | "codemirror_mode": { 1892 | "name": "ipython", 1893 | "version": 3 1894 | }, 1895 | "file_extension": ".py", 1896 | "mimetype": "text/x-python", 1897 | "name": "python", 1898 | "nbconvert_exporter": "python", 1899 | "pygments_lexer": "ipython3", 1900 | "version": "3.7.12" 1901 | }, 1902 | "papermill": { 1903 | "default_parameters": {}, 1904 | "duration": 14.33741, 1905 | "end_time": "2022-06-26T19:08:40.854138", 1906 | "environment_variables": {}, 1907 | "exception": null, 1908 | "input_path": "__notebook__.ipynb", 1909 | "output_path": "__notebook__.ipynb", 1910 | "parameters": {}, 1911 | "start_time": "2022-06-26T19:08:26.516728", 1912 | "version": "2.3.4" 1913 | } 1914 | }, 1915 | "nbformat": 4, 1916 | "nbformat_minor": 5 1917 | } 1918 | -------------------------------------------------------------------------------- /tex/README.md: -------------------------------------------------------------------------------- 1 | # LaTeX Documents 2 | 3 | --------------------------------------------------------------------------------