├── .gitignore ├── Anaconda-HowTo.md ├── Data-Science-Codex.Rproj ├── Git-HowTo.md ├── LICENSE ├── Python ├── Clustering.ipynb ├── Hello_NLP.ipynb ├── LSTM-Demo.ipynb ├── Pandas_Codeblocks.ipynb ├── Pandas_Essentials.Rmd ├── Pandas_Essentials.ipynb ├── Pandas_Essentials.md ├── R-Python-Hybrid.ipynb ├── SQL_Databases.ipynb ├── Sklearn-Workflow.ipynb ├── Tidy_Pandas.ipynb ├── plotnine_ggrepel.ipynb ├── sklearn_skopt_pipeline.ipynb ├── state_of_union_embeddings.ipynb └── state_of_union_v2.ipynb ├── R-Development.md ├── R ├── Bayesian_Basics.Rmd ├── Bayesian_Basics.md ├── Bayesian_Modeling.Rmd ├── Bayesian_Modeling.md ├── Caret.Rmd ├── Caret.md ├── Clustering.Rmd ├── Clustering.md ├── Comparing_Bayesian_Packages.Rmd ├── Comparing_Bayesian_Packages.md ├── Create_Formatted_Spreadsheet.Rmd ├── Create_Formatted_Spreadsheet.md ├── Distribution_Sampling.Rmd ├── Distribution_Sampling.md ├── Geospatial_Analysis.Rmd ├── Geospatial_Analysis.md ├── Modeling_Workflow.Rmd ├── Modeling_Workflow.md ├── Multilevel-Models.Rmd ├── Multilevel-Models.md ├── Ordinal_Regression.Rmd ├── Ordinal_Regression.md ├── Parsnip.Rmd ├── Parsnip.md ├── Power_Analysis.Rmd ├── Power_Analysis.md ├── R-Quickstart.Rmd ├── R-Quickstart.md ├── R_Quotation.Rmd ├── R_Quotation.md ├── Regression Model Tidying.Rmd ├── Regression-Model-Tidying.md ├── Rethinking-Tadpoles.Rmd ├── Sentiment_Analysis.Rmd ├── Sentiment_Analysis.md ├── Survival.Rmd ├── Survival.md ├── Time_Series_Modeling.Rmd ├── Time_Series_Modeling.md ├── Titanic.Rmd ├── Titanic.md ├── Titanic_files │ └── figure-gfm │ │ ├── explore-1.png │ │ ├── explore-2.png │ │ ├── explore-3.png │ │ ├── imputation-1.png │ │ ├── imputation-2.png │ │ ├── linear-regression-1.png │ │ ├── linear-regression-2.png │ │ ├── linear-regression-3.png │ │ ├── linear-regression-4.png │ │ ├── linear-regression-5.png │ │ ├── logistic-regression-1.png │ │ ├── logistic-regression-2.png │ │ ├── logistic-regression-3.png │ │ └── logistic-regression-4.png ├── Visualization_Cookbook.Rmd 
├── Visualization_Cookbook.md ├── gapminder_summary_report.xlsx ├── hypothesis_testing.Rmd └── hypothesis_testing.md ├── README.md ├── Resources.md ├── rmd_config.R └── rmd_images ├── Bayes └── unnamed-chunk-5-1.png ├── Bayesian_Basics ├── unnamed-chunk-10-1.png ├── unnamed-chunk-3-1.png ├── unnamed-chunk-4-1.png ├── unnamed-chunk-4-2.png ├── unnamed-chunk-5-1.png ├── unnamed-chunk-6-1.png ├── unnamed-chunk-7-1.png ├── unnamed-chunk-7-2.png ├── unnamed-chunk-7-3.png ├── unnamed-chunk-8-1.png └── unnamed-chunk-9-1.png ├── Bayesian_Distributions ├── unnamed-chunk-3-1.png ├── unnamed-chunk-4-1.png └── unnamed-chunk-5-1.png ├── Bayesian_Modeling ├── unnamed-chunk-12-1.png ├── unnamed-chunk-12-2.png ├── unnamed-chunk-15-1.png ├── unnamed-chunk-15-2.png ├── unnamed-chunk-15-3.png ├── unnamed-chunk-19-1.png ├── unnamed-chunk-19-2.png ├── unnamed-chunk-19-3.png ├── unnamed-chunk-20-1.png ├── unnamed-chunk-20-2.png ├── unnamed-chunk-20-3.png ├── unnamed-chunk-21-1.png ├── unnamed-chunk-23-1.png └── unnamed-chunk-4-1.png ├── Caret ├── results-1.png └── results-2.png ├── Clustering ├── unnamed-chunk-3-1.png └── unnamed-chunk-3-2.png ├── Comparing_Bayesian_Packages ├── unnamed-chunk-10-1.png ├── unnamed-chunk-10-2.png ├── unnamed-chunk-11-1.png ├── unnamed-chunk-11-2.png ├── unnamed-chunk-11-3.png ├── unnamed-chunk-12-1.png ├── unnamed-chunk-6-1.png └── unnamed-chunk-6-2.png ├── Distribution_Sampling ├── unnamed-chunk-3-1.png └── unnamed-chunk-4-1.png ├── Geospatial_Analysis ├── locale-1.png ├── unnamed-chunk-2-1.png ├── unnamed-chunk-3-1.png └── unnamed-chunk-3-2.png ├── Modeling_Workflow ├── explore-1.png ├── explore-2.png ├── explore-3.png ├── plot-1.png └── plot-2.png ├── Multilevel-Models ├── unnamed-chunk-3-1.png ├── unnamed-chunk-3-2.png ├── unnamed-chunk-6-1.png ├── unnamed-chunk-7-1.png ├── unnamed-chunk-8-1.png ├── unnamed-chunk-9-1.png └── unnamed-chunk-9-2.png ├── Ordinal_Regression ├── unnamed-chunk-2-1.png ├── unnamed-chunk-4-1.png ├── unnamed-chunk-6-1.png ├── 
unnamed-chunk-7-1.png └── unnamed-chunk-8-1.png ├── Parsnip ├── unnamed-chunk-5-1.png └── unnamed-chunk-6-1.png ├── R-Quickstart ├── histogram-1.png ├── line-1.png ├── lollipop-1.png ├── unnamed-chunk-18-1.png └── unnamed-chunk-22-1.png ├── R_Quotation ├── unnamed-chunk-1-1.png ├── unnamed-chunk-1-2.png └── unnamed-chunk-1-3.png ├── Regression-Model-Tidying ├── unnamed-chunk-3-1.png └── unnamed-chunk-3-2.png ├── Survival ├── unnamed-chunk-10-1.png ├── unnamed-chunk-11-1.png ├── unnamed-chunk-15-1.png ├── unnamed-chunk-15-2.png ├── unnamed-chunk-16-1.png ├── unnamed-chunk-16-2.png ├── unnamed-chunk-17-1.png ├── unnamed-chunk-17-2.png ├── unnamed-chunk-18-1.png ├── unnamed-chunk-19-1.png ├── unnamed-chunk-2-1.png ├── unnamed-chunk-20-1.png ├── unnamed-chunk-21-1.png ├── unnamed-chunk-5-1.png ├── unnamed-chunk-6-1.png ├── unnamed-chunk-7-1.png ├── unnamed-chunk-7-2.png ├── unnamed-chunk-8-1.png ├── unnamed-chunk-9-1.png └── unnamed-chunk-9-2.png ├── Time_Series_Modeling ├── unnamed-chunk-2-1.png ├── unnamed-chunk-2-2.png ├── unnamed-chunk-4-1.png ├── unnamed-chunk-5-1.png ├── unnamed-chunk-5-2.png └── unnamed-chunk-5-3.png ├── Titanic ├── explore-1.png ├── explore-2.png ├── explore-3.png ├── imputation-1.png ├── imputation-2.png ├── linear-regression-1.png ├── linear-regression-2.png ├── linear-regression-3.png ├── linear-regression-4.png ├── linear-regression-5.png ├── logistic-regression-1.png ├── logistic-regression-2.png ├── logistic-regression-3.png └── logistic-regression-4.png ├── Vehicles ├── compare-models-1.png └── compare-models-2.png ├── Visualization_Cookbook ├── bar-1.png ├── bar-2.png ├── bar-3.png ├── bar-4.png ├── boxplot-1.png ├── bubbleplot-1.png ├── dotplot-1.png ├── dotplot-rank-1.png ├── heatmap-1.png ├── histogram-1.png ├── line-1.png ├── line-2.png ├── lollipop-1.png ├── pyramid-1.png ├── ridge-1.png ├── scatter-1.png ├── stackedarea-1.png ├── treemap-1.png ├── treemap-2.png └── violin-1.png └── hypothesis_testing ├── unnamed-chunk-5-1.png ├── 
unnamed-chunk-6-1.png └── unnamed-chunk-7-1.png /.gitignore: -------------------------------------------------------------------------------- 1 | source/.ipynb_checkpoints 2 | source/.ipynb_checkpoints/* 3 | source/.DS_Store 4 | .DS_Store 5 | .ipynb* 6 | .Rproj.user 7 | -------------------------------------------------------------------------------- /Anaconda-HowTo.md: -------------------------------------------------------------------------------- 1 | ## Reference for managing an Anaconda environment 2 | 3 | List all installed environments: 4 | 5 | ```conda env list``` 6 | 7 | Activate a conda environment: 8 | 9 | ```conda activate <environment-name>``` 10 | 11 | List all modules installed in current active environment: 12 | 13 | ```conda list``` 14 | 15 | [Updating all packages](https://www.anaconda.com/keeping-anaconda-date/): 16 | 17 | ``` conda update --all``` 18 | 19 | ## References 20 | 21 | * [Conda cheatsheet](https://docs.conda.io/projects/conda/en/4.6.0/_downloads/52a95608c49671267e40c689e0bc00ca/conda-cheatsheet.pdf) 22 | * [Conda Environments Documentation](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#) 23 | -------------------------------------------------------------------------------- /Data-Science-Codex.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /Git-HowTo.md: -------------------------------------------------------------------------------- 1 | Process to get your code up on the server: 2 | 3 | 1. `git init` (if the repo doesn’t already exist) 4 | 2. `git add .` (adds all the files to the repo) 5 | 3. 
`git commit -m "name"` commits changes 6 | 4. `git remote add origin https://github.com/<username>/<repository>.git` (if you haven’t set the URL yet) 7 | 5. `git push -u origin master` pushes the code to the server 8 | 9 | To remove cached files: 10 | 11 | ```git rm -r --cached . ``` 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Jesse Cambon 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Python/Hello_NLP.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Testing out some NLP techniques using the in-built Wall Street Journal dataset in NLTK" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 13, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import nltk\n", 17 | "from nltk.stem.wordnet import *\n", 18 | "from nltk.book import *\n", 19 | "from sklearn.feature_extraction.text import CountVectorizer\n", 20 | "from sklearn.decomposition import LatentDirichletAllocation\n", 21 | "from nltk.stem.snowball import PorterStemmer\n", 22 | "default_stopwords = set(nltk.corpus.stopwords.words('english'))" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "https://www.kaggle.com/arthurtok/spooky-nlp-and-topic-modelling-tutorial" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "## Topic Modeling with LDA" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 19, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "#WSJ_freq = nltk.FreqDist(text7)\n", 46 | "\n", 47 | "stemmer = PorterStemmer()\n", 48 | "lda = LatentDirichletAllocation(n_components=7, max_iter=5,\n", 49 | " learning_method = 'online',\n", 50 | " learning_offset = 50.,\n", 51 | " random_state = 0)\n", 52 | "\n", 53 | "lemm = WordNetLemmatizer()\n", 54 | "class LemmaCountVectorizer(CountVectorizer):\n", 55 | " def build_analyzer(self):\n", 56 | " analyzer = super(LemmaCountVectorizer, self).build_analyzer()\n", 57 | " return lambda doc: (lemm.lemmatize(w) for w in analyzer(doc))\n", 58 | "\n", 59 | "tf_vectorizer = LemmaCountVectorizer(max_df=0.95, \n", 60 | " min_df=2,\n", 61 | " stop_words='english',\n", 62 | " 
decode_error='ignore')\n", 63 | "\n", 64 | "tf = tf_vectorizer.fit_transform(text7)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 20, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "# train lda\n", 74 | "wsj_lda = lda.fit(tf)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 21, 80 | "metadata": {}, 81 | "outputs": [ 82 | { 83 | "name": "stdout", 84 | "output_type": "stream", 85 | "text": [ 86 | "\n", 87 | "Topics in LDA model: \n", 88 | "\n", 89 | "Topic #0:million program corp bond profit time analyst industry\n", 90 | "======================================================================\n", 91 | "\n", 92 | "Topic #1:market say trading sale rrb cent plan 30\n", 93 | "======================================================================\n", 94 | "\n", 95 | "Topic #2:company stock billion month investor bank buy ich\n", 96 | "======================================================================\n", 97 | "\n", 98 | "Topic #3:year new 000 future lrb 50 quarter service\n", 99 | "======================================================================\n", 100 | "\n", 101 | "Topic #4:said mr index business investment 10 rate contract\n", 102 | "======================================================================\n", 103 | "\n", 104 | "Topic #5:price president york day exchange rose term yesterday\n", 105 | "======================================================================\n", 106 | "\n", 107 | "Topic #6:share issue government executive house financial october trader\n", 108 | "======================================================================\n" 109 | ] 110 | } 111 | ], 112 | "source": [ 113 | "n_top_words = 8\n", 114 | "\n", 115 | "# Define helper function to print top words\n", 116 | "def print_top_words(model, feature_names, n_top_words):\n", 117 | " for index, topic in enumerate(model.components_):\n", 118 | " message = \"\\nTopic #{}:\".format(index)\n", 119 | " message += \" 
\".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1 :-1]])\n", 120 | " print(message)\n", 121 | " print(\"=\"*70)\n", 122 | "\n", 123 | "print(\"\\nTopics in LDA model: \")\n", 124 | "tf_feature_names = tf_vectorizer.get_feature_names()\n", 125 | "print_top_words(wsj_lda, tf_feature_names, n_top_words)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "## Stemming" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 63, 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "data": { 142 | "text/plain": [ 143 | "[('said', 628),\n", 144 | " ('million', 383),\n", 145 | " ('compani', 260),\n", 146 | " ('year', 212),\n", 147 | " ('say', 210),\n", 148 | " ('would', 209),\n", 149 | " ('market', 176),\n", 150 | " ('new', 165),\n", 151 | " ('new', 162),\n", 152 | " ('trade', 162),\n", 153 | " ('billion', 159),\n", 154 | " ('also', 147),\n", 155 | " ('stock', 136),\n", 156 | " ('presid', 133),\n", 157 | " ('one', 132)]" 158 | ] 159 | }, 160 | "execution_count": 63, 161 | "metadata": {}, 162 | "output_type": "execute_result" 163 | } 164 | ], 165 | "source": [ 166 | "[ (stemmer.stem(w),f) for (w,f) in WSJ_freq.most_common(80) if w.lower() not in default_stopwords and w.isalpha()]" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 61, 172 | "metadata": {}, 173 | "outputs": [ 174 | { 175 | "data": { 176 | "text/plain": [ 177 | "[(',', 4885),\n", 178 | " ('.', 3828),\n", 179 | " ('*-1', 1123),\n", 180 | " ('0', 1099),\n", 181 | " ('*', 965),\n", 182 | " (\"'s\", 864),\n", 183 | " ('*T*-1', 806),\n", 184 | " ('*U*', 744),\n", 185 | " ('$', 718),\n", 186 | " ('``', 702),\n", 187 | " (\"''\", 684)]" 188 | ] 189 | }, 190 | "execution_count": 61, 191 | "metadata": {}, 192 | "output_type": "execute_result" 193 | } 194 | ], 195 | "source": [ 196 | "[ (w,f) for (w,f) in WSJ_freq.most_common(20) if w.lower() not in default_stopwords ]" 197 | ] 198 | } 199 | ], 200 | 
"metadata": { 201 | "kernelspec": { 202 | "display_name": "Python 3", 203 | "language": "python", 204 | "name": "python3" 205 | }, 206 | "language_info": { 207 | "codemirror_mode": { 208 | "name": "ipython", 209 | "version": 3 210 | }, 211 | "file_extension": ".py", 212 | "mimetype": "text/x-python", 213 | "name": "python", 214 | "nbconvert_exporter": "python", 215 | "pygments_lexer": "ipython3", 216 | "version": "3.6.8" 217 | } 218 | }, 219 | "nbformat": 4, 220 | "nbformat_minor": 2 221 | } 222 | -------------------------------------------------------------------------------- /Python/Pandas_Essentials.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Pandas Essentials" 3 | author: "Jesse Cambon" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | output: 6 | github_document: 7 | toc: true 8 | --- 9 | 10 | https://github.com/rstudio/reticulate/#python-in-r-markdown 11 | 12 | ```{r knit-settings, include=FALSE} 13 | library(here) 14 | source(here("rmd_config.R")) 15 | ``` 16 | 17 | 18 | ```{python} 19 | import pandas as pd 20 | from rpy2.robjects import r,pandas2ri 21 | pandas2ri.activate() 22 | 23 | iris = r.data('iris') 24 | 25 | iris.info() 26 | #mtcars.info() 27 | 28 | ``` 29 | 30 | -------------------------------------------------------------------------------- /Python/Pandas_Essentials.md: -------------------------------------------------------------------------------- 1 | Pandas Essentials 2 | ================ 3 | Jesse Cambon 4 | 11 April, 2020 5 | 6 | 7 | 8 | ``` r 9 | library(reticulate) 10 | library(knitr) 11 | ``` 12 | 13 | ``` python 14 | import pandas as pd 15 | mtcars = r.mtcars 16 | ``` 17 | 18 | Counting 19 | 20 | ``` python 21 | am_vs = mtcars.groupby(['am','vs']).size().reset_index(name='count').\ 22 | sort_values('count',ascending=False) 23 | 24 | am_vs 25 | ``` 26 | 27 | ## am vs count 28 | ## 0 0.0 0.0 12 29 | ## 1 0.0 1.0 7 30 | ## 3 1.0 1.0 7 31 | ## 2 1.0 0.0 6 32 | 
-------------------------------------------------------------------------------- /Python/SQL_Databases.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Code for creating and using a SQL database in Python. We will use pandas, sqlalchemy, and sqlite\n", 8 | "\n", 9 | "References:\n", 10 | "\n", 11 | "* https://towardsdatascience.com/sqlalchemy-python-tutorial-79a577141a91\n", 12 | "* https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_sql.html\n", 13 | "* https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_sql.html" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 3, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "from sqlalchemy import create_engine\n", 23 | "import pandas as pd\n", 24 | "\n", 25 | "# Create local in-memory SQL database using sqlite\n", 26 | "# Note that you could also create this database in a .db file if desired\n", 27 | "engine = create_engine('sqlite://', echo=False)\n", 28 | " \n", 29 | "# Create some sample data \n", 30 | "df = pd.DataFrame({'name' : ['User 1', 'User 2', 'User 3']})\n", 31 | " \n", 32 | "# Put the sample data in the database\n", 33 | "df.to_sql('users', con=engine,index=False)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 4, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "text/html": [ 44 | "
\n", 45 | "\n", 58 | "\n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | "
name
0User 1
1User 2
2User 3
\n", 80 | "
" 81 | ], 82 | "text/plain": [ 83 | " name\n", 84 | "0 User 1\n", 85 | "1 User 2\n", 86 | "2 User 3" 87 | ] 88 | }, 89 | "execution_count": 4, 90 | "metadata": {}, 91 | "output_type": "execute_result" 92 | } 93 | ], 94 | "source": [ 95 | "# Examine our database\n", 96 | "pd.read_sql(\"select * from users\",engine)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 6, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "df1 = pd.DataFrame({'name' : ['User 8', 'User 9', 'User 10']}) # create more data\n", 106 | "\n", 107 | "# Append our extra data into the database\n", 108 | "df1.to_sql('users', con=engine,index=False,if_exists='append')" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 7, 114 | "metadata": {}, 115 | "outputs": [ 116 | { 117 | "data": { 118 | "text/html": [ 119 | "
\n", 120 | "\n", 133 | "\n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | "
name
0User 1
1User 2
2User 3
3User 8
4User 9
5User 10
\n", 167 | "
" 168 | ], 169 | "text/plain": [ 170 | " name\n", 171 | "0 User 1\n", 172 | "1 User 2\n", 173 | "2 User 3\n", 174 | "3 User 8\n", 175 | "4 User 9\n", 176 | "5 User 10" 177 | ] 178 | }, 179 | "execution_count": 7, 180 | "metadata": {}, 181 | "output_type": "execute_result" 182 | } 183 | ], 184 | "source": [ 185 | "# Re-examine our database\n", 186 | "pd.read_sql(\"select * from users\",engine)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [] 195 | } 196 | ], 197 | "metadata": { 198 | "kernelspec": { 199 | "display_name": "Python 3", 200 | "language": "python", 201 | "name": "python3" 202 | }, 203 | "language_info": { 204 | "codemirror_mode": { 205 | "name": "ipython", 206 | "version": 3 207 | }, 208 | "file_extension": ".py", 209 | "mimetype": "text/x-python", 210 | "name": "python", 211 | "nbconvert_exporter": "python", 212 | "pygments_lexer": "ipython3", 213 | "version": "3.7.4" 214 | } 215 | }, 216 | "nbformat": 4, 217 | "nbformat_minor": 4 218 | } 219 | -------------------------------------------------------------------------------- /R-Development.md: -------------------------------------------------------------------------------- 1 | # Notes on Developing R Packages 2 | 3 | * The man documents are created with roxygen2 based on `R/` directory code files with `devtools::document()` 4 | * Test package with `devtools::test()` (see `/tests` directory) 5 | * Use `devtools::check()` to check for issues (also runs `devtools::test()`) 6 | * Use `devtools::build()` to build the package with vignettes included (creates .tar.gz file) 7 | * Check package on other environments for CRAN release using [rhub::check_for_cran()](https://r-hub.github.io/rhub/reference/check_for_cran.html) 8 | * To run all code examples in the package documentation, use this command from devtools: `run_examples(test=TRUE)` 9 | * Use 
[pkgdown::build_site()](https://pkgdown.r-lib.org/reference/build_site.html) to build the website. 10 | 11 | ### Development Resources 12 | * General Instructions: http://r-pkgs.had.co.nz/ 13 | * More general instructions: https://rstats-pkgs.readthedocs.io 14 | * roxygen2 : https://cran.r-project.org/web/packages/roxygen2/ 15 | * Devtools cheat sheet: https://www.rstudio.com/wp-content/uploads/2015/03/devtools-cheatsheet.pdf 16 | -------------------------------------------------------------------------------- /R/Bayesian_Basics.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Bayesian Basics" 3 | author: "Jesse Cambon" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | output: 6 | github_document: 7 | toc: true 8 | --- 9 | 10 | 11 | ```{r knit-settings, include=FALSE} 12 | library(here) 13 | source(here("rmd_config.R")) 14 | ``` 15 | 16 | * https://github.com/easystats/see/issues/48 17 | * https://easystats.github.io/see/articles/bayestestR.html 18 | * https://cran.r-project.org/web/packages/bayestestR/vignettes/bayes_factors.html 19 | 20 | ```{r} 21 | library(rstanarm) 22 | library(tidyverse) 23 | library(bayestestR) 24 | library(bayesplot) 25 | library(wesanderson) 26 | library(broom.mixed) 27 | 28 | options(mc.cores = parallel::detectCores()) 29 | 30 | model <- stan_glmer(extra ~ group + (1 | ID), data = sleep, 31 | prior = normal(0, 3, autoscale = FALSE)) 32 | 33 | summary(model) 34 | 35 | tidy(model) 36 | ``` 37 | 38 | https://github.com/easystats/see/issues/48 39 | 40 | ```{r} 41 | #My_first_BF <- bayesfactor_parameters(model, null = c(-1, 1)) 42 | 43 | density <- estimate_density(model) 44 | sim_prior <- simulate_prior(model) 45 | density_prior <- estimate_density(sim_prior) 46 | 47 | # Combine density for prior and posterior distributions 48 | post_prior <- density %>% mutate(type = 'posterior') %>% 49 | bind_rows(density_prior %>% mutate(type = 'prior')) 50 | 51 | ``` 52 | 53 | Plot the prior and 
posterior distributions for the fixed effects 54 | 55 | ```{r} 56 | ggplot(data = post_prior, aes(x = x ,y = y, fill = type)) + 57 | theme_bw() + 58 | facet_wrap(~Parameter, ncol = 1, scales='free') + 59 | geom_ribbon( mapping = aes( 60 | ymin = 0, 61 | ymax = y ), 62 | alpha = .8) + 63 | scale_fill_manual(values=c('steelblue', 'grey')) 64 | 65 | 66 | # scale_x_continuous(expand=expand_scale(mult = c(-.4, -.4))) 67 | ``` 68 | 69 | ```{r} 70 | mcmc_trace(model) 71 | mcmc_areas(model) 72 | ``` 73 | 74 | 75 | https://easystats.github.io/see/articles/bayestestR.html 76 | 77 | ```{r} 78 | plot(model) 79 | 80 | p_direction(model) 81 | ``` 82 | 83 | ```{r, fig.height = 10, fig.width = 8} 84 | plot(p_direction(model, effects = "all", component = "all")) 85 | ``` 86 | 87 | 88 | Check posterior distribution 89 | 90 | ```{r} 91 | pp_check(model) 92 | 93 | pp_check(model, plotfun = "hist") 94 | 95 | pp_check(model, plotfun = "intervals") 96 | ``` 97 | 98 | 99 | ```{r} 100 | ppc_intervals_grouped( 101 | y = sleep$extra, 102 | yrep = posterior_predict(model), 103 | x = as.numeric(sleep$group), 104 | prob = 0.5, 105 | group = sleep$ID 106 | ) 107 | ``` 108 | 109 | -------------------------------------------------------------------------------- /R/Bayesian_Basics.md: -------------------------------------------------------------------------------- 1 | Bayesian Basics 2 | ================ 3 | Jesse Cambon 4 | 06 February, 2021 5 | 6 | - 7 | - 8 | - 9 | 10 | 11 | 12 | ``` r 13 | library(rstanarm) 14 | ``` 15 | 16 | ## Loading required package: Rcpp 17 | 18 | ## This is rstanarm version 2.21.1 19 | 20 | ## - See https://mc-stan.org/rstanarm/articles/priors for changes to default priors! 21 | 22 | ## - Default priors may change, so it's safest to specify priors, even if equivalent to the defaults. 
23 | 24 | ## - For execution on a local, multicore CPU with excess RAM we recommend calling 25 | 26 | ## options(mc.cores = parallel::detectCores()) 27 | 28 | ``` r 29 | library(tidyverse) 30 | ``` 31 | 32 | ## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ── 33 | 34 | ## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4 35 | ## ✓ tibble 3.0.6 ✓ dplyr 1.0.4 36 | ## ✓ tidyr 1.1.2 ✓ forcats 0.5.1 37 | ## ✓ readr 1.4.0 38 | 39 | ## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ── 40 | ## x dplyr::filter() masks stats::filter() 41 | ## x dplyr::lag() masks stats::lag() 42 | 43 | ``` r 44 | library(bayestestR) 45 | ``` 46 | 47 | ## Note: The default CI width (currently `ci=0.89`) might change in future versions (see https://github.com/easystats/bayestestR/discussions/250). To prevent any issues, please set it explicitly when using bayestestR functions, via the 'ci' argument. 48 | 49 | ``` r 50 | library(bayesplot) 51 | ``` 52 | 53 | ## This is bayesplot version 1.8.0 54 | 55 | ## - Online documentation and vignettes at mc-stan.org/bayesplot 56 | 57 | ## - bayesplot theme set to bayesplot::theme_default() 58 | 59 | ## * Does _not_ affect other ggplot2 plots 60 | 61 | ## * See ?bayesplot_theme_set for details on theme setting 62 | 63 | ``` r 64 | library(wesanderson) 65 | library(broom.mixed) 66 | ``` 67 | 68 | ## Registered S3 method overwritten by 'broom.mixed': 69 | ## method from 70 | ## tidy.gamlss broom 71 | 72 | ``` r 73 | options(mc.cores = parallel::detectCores()) 74 | 75 | model <- stan_glmer(extra ~ group + (1 | ID), data = sleep, 76 | prior = normal(0, 3, autoscale = FALSE)) 77 | ``` 78 | 79 | ## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable. 80 | ## Running the chains for more iterations may help. 
See 81 | ## http://mc-stan.org/misc/warnings.html#tail-ess 82 | 83 | ``` r 84 | summary(model) 85 | ``` 86 | 87 | ## 88 | ## Model Info: 89 | ## function: stan_glmer 90 | ## family: gaussian [identity] 91 | ## formula: extra ~ group + (1 | ID) 92 | ## algorithm: sampling 93 | ## sample: 4000 (posterior sample size) 94 | ## priors: see help('prior_summary') 95 | ## observations: 20 96 | ## groups: ID (10) 97 | ## 98 | ## Estimates: 99 | ## mean sd 10% 50% 90% 100 | ## (Intercept) 0.8 0.6 0.0 0.8 1.6 101 | ## group2 1.5 0.5 0.9 1.5 2.1 102 | ## b[(Intercept) ID:1] -0.2 0.8 -1.2 -0.2 0.8 103 | ## b[(Intercept) ID:2] -1.5 0.9 -2.6 -1.5 -0.3 104 | ## b[(Intercept) ID:3] -0.9 0.8 -1.9 -0.8 0.2 105 | ## b[(Intercept) ID:4] -1.6 0.9 -2.8 -1.7 -0.5 106 | ## b[(Intercept) ID:5] -1.3 0.9 -2.4 -1.3 -0.2 107 | ## b[(Intercept) ID:6] 1.8 0.9 0.6 1.8 3.0 108 | ## b[(Intercept) ID:7] 2.4 1.0 1.1 2.4 3.6 109 | ## b[(Intercept) ID:8] -0.3 0.8 -1.2 -0.3 0.7 110 | ## b[(Intercept) ID:9] 0.6 0.8 -0.5 0.6 1.6 111 | ## b[(Intercept) ID:10] 0.9 0.8 -0.1 0.9 1.9 112 | ## sigma 1.1 0.3 0.8 1.0 1.5 113 | ## Sigma[ID:(Intercept),(Intercept)] 2.8 1.8 1.1 2.5 5.1 114 | ## 115 | ## Fit Diagnostics: 116 | ## mean sd 10% 50% 90% 117 | ## mean_PPD 1.5 0.4 1.1 1.5 2.0 118 | ## 119 | ## The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')). 
120 | ## 121 | ## MCMC diagnostics 122 | ## mcse Rhat n_eff 123 | ## (Intercept) 0.0 1.0 1102 124 | ## group2 0.0 1.0 3499 125 | ## b[(Intercept) ID:1] 0.0 1.0 1927 126 | ## b[(Intercept) ID:2] 0.0 1.0 825 127 | ## b[(Intercept) ID:3] 0.0 1.0 1240 128 | ## b[(Intercept) ID:4] 0.0 1.0 690 129 | ## b[(Intercept) ID:5] 0.0 1.0 1027 130 | ## b[(Intercept) ID:6] 0.0 1.0 990 131 | ## b[(Intercept) ID:7] 0.0 1.0 740 132 | ## b[(Intercept) ID:8] 0.0 1.0 1652 133 | ## b[(Intercept) ID:9] 0.0 1.0 1405 134 | ## b[(Intercept) ID:10] 0.0 1.0 1447 135 | ## sigma 0.0 1.0 345 136 | ## Sigma[ID:(Intercept),(Intercept)] 0.1 1.0 984 137 | ## mean_PPD 0.0 1.0 4029 138 | ## log-posterior 0.3 1.0 299 139 | ## 140 | ## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1). 141 | 142 | ``` r 143 | tidy(model) 144 | ``` 145 | 146 | ## # A tibble: 2 x 3 147 | ## term estimate std.error 148 | ## 149 | ## 1 (Intercept) 0.781 0.616 150 | ## 2 group2 1.55 0.450 151 | 152 | 153 | 154 | ``` r 155 | #My_first_BF <- bayesfactor_parameters(model, null = c(-1, 1)) 156 | 157 | density <- estimate_density(model) 158 | sim_prior <- simulate_prior(model) 159 | density_prior <- estimate_density(sim_prior) 160 | 161 | # Combine density for prior and posterior distributions 162 | post_prior <- density %>% mutate(type = 'posterior') %>% 163 | bind_rows(density_prior %>% mutate(type = 'prior')) 164 | ``` 165 | 166 | Plot the prior and posterior distributions for the fixed effects 167 | 168 | ``` r 169 | ggplot(data = post_prior, aes(x = x ,y = y, fill = type)) + 170 | theme_bw() + 171 | facet_wrap(~Parameter, ncol = 1, scales='free') + 172 | geom_ribbon( mapping = aes( 173 | ymin = 0, 174 | ymax = y ), 175 | alpha = .8) + 176 | scale_fill_manual(values=c('steelblue', 'grey')) 177 | ``` 178 | 179 | ![](../rmd_images/Bayesian_Basics/unnamed-chunk-3-1.png) 180 | 181 | ``` 
r 182 | # scale_x_continuous(expand=expand_scale(mult = c(-.4, -.4))) 183 | ``` 184 | 185 | ``` r 186 | mcmc_trace(model) 187 | ``` 188 | 189 | ![](../rmd_images/Bayesian_Basics/unnamed-chunk-4-1.png) 190 | 191 | ``` r 192 | mcmc_areas(model) 193 | ``` 194 | 195 | ![](../rmd_images/Bayesian_Basics/unnamed-chunk-4-2.png) 196 | 197 | 198 | 199 | ``` r 200 | plot(model) 201 | ``` 202 | 203 | ![](../rmd_images/Bayesian_Basics/unnamed-chunk-5-1.png) 204 | 205 | ``` r 206 | p_direction(model) 207 | ``` 208 | 209 | ## # Probability of Direction (pd) 210 | ## 211 | ## Parameter | pd 212 | ## -------------------- 213 | ## (Intercept) | 90.60% 214 | ## group2 | 99.40% 215 | 216 | ``` r 217 | plot(p_direction(model, effects = "all", component = "all")) 218 | ``` 219 | 220 | ![](../rmd_images/Bayesian_Basics/unnamed-chunk-6-1.png) 221 | 222 | Check posterior distribution 223 | 224 | ``` r 225 | pp_check(model) 226 | ``` 227 | 228 | ![](../rmd_images/Bayesian_Basics/unnamed-chunk-7-1.png) 229 | 230 | ``` r 231 | pp_check(model, plotfun = "hist") 232 | ``` 233 | 234 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 235 | 236 | ![](../rmd_images/Bayesian_Basics/unnamed-chunk-7-2.png) 237 | 238 | ``` r 239 | pp_check(model, plotfun = "intervals") 240 | ``` 241 | 242 | ## 'x' not specified in '...'. Using x=1:length(y). 
243 | 244 | ![](../rmd_images/Bayesian_Basics/unnamed-chunk-7-3.png) 245 | 246 | ``` r 247 | ppc_intervals_grouped( 248 | y = sleep$extra, 249 | yrep = posterior_predict(model), 250 | x = as.numeric(sleep$group), 251 | prob = 0.5, 252 | group = sleep$ID 253 | ) 254 | ``` 255 | 256 | ![](../rmd_images/Bayesian_Basics/unnamed-chunk-8-1.png) 257 | -------------------------------------------------------------------------------- /R/Caret.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning with Caret" 3 | author: "Jesse Cambon" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | output: 6 | github_document: 7 | toc: true 8 | toc_depth: 2 9 | --- 10 | 11 | Demonstrate a machine learning workflow with caret 12 | 13 | ## References 14 | * https://topepo.github.io/caret/model-training-and-tuning.html 15 | * https://cran.r-project.org/web/packages/caretEnsemble/vignettes/caretEnsemble-intro.html 16 | 17 | ```{r knit-settings, include=FALSE} 18 | library(knitr) 19 | library(stringr) 20 | library(here) 21 | # get name of file during knitting and strip file extension 22 | rmd_filename <- str_remove(knitr::current_input(),"\\.Rmd") 23 | print(rmd_filename) 24 | knitr::opts_chunk$set(fig.path = str_c(here::here("rmd_images",rmd_filename),'/'),echo=TRUE) # image path 25 | ``` 26 | 27 | ## Setup 28 | 29 | ```{r setup,warning=F,message=F} 30 | library(mlbench) # machine learning reference datasets 31 | library(tidyverse) 32 | library(broom) 33 | library(caret) 34 | library(skimr) 35 | library(knitr) 36 | library(kableExtra) 37 | 38 | data(BreastCancer) # load 39 | 40 | # Set seed for reproducibility 41 | set.seed(45) 42 | ``` 43 | 44 | 45 | ## Build Model 46 | 47 | ```{r models,message=F,results=F,warning=F} 48 | #skim(BreastCancer) 49 | 50 | BC <- BreastCancer %>% as_tibble() %>% 51 | dplyr::select(-Id) %>% 52 | # should really use imputation but we'll just drop nas for now 53 | drop_na() 54 | 55 | # Use k-fold 
cross-validation 56 | TC <- trainControl(method="cv", number=5) 57 | 58 | # Train some models 59 | 60 | # Neural Net 61 | nnet <- train(Class ~ . , BC,method='nnet',trControl=TC,verbose=FALSE) 62 | 63 | # Gradient Boosted Machine 64 | gbm <- train(Class ~ . , BC,method='gbm',trControl=TC) 65 | 66 | # Radial SVM 67 | svmrad <- train(Class ~ . , BC,method='svmRadial',trControl=TC) 68 | 69 | # Elastic-net 70 | glmnet <- train(Class ~ . , BC,method='glmnet',trControl=TC,tuneLength=5) 71 | 72 | # Logistic regression - did not converge 73 | glm <- train(Class ~ . , BC,method='glm',trControl=TC) 74 | 75 | ``` 76 | 77 | 78 | ```{r} 79 | 80 | # Look at results of Glmnet model 81 | 82 | # Extract coefficients from optimal model 83 | glm_coeff <- coef(glmnet$finalModel,glmnet$finalModel$lambdaOpt) %>% 84 | as.matrix() %>% as.data.frame() %>% 85 | rownames_to_column('Variable') %>% 86 | as_tibble() %>% 87 | rename(Coefficient=2) %>% 88 | arrange(desc(abs(Coefficient))) 89 | 90 | 91 | # Combine variable importance data with coefficients 92 | varImportance <- varImp(glmnet)$importance %>% 93 | rownames_to_column('Variable') %>% 94 | rename(Importance=2) %>% 95 | arrange(desc(Importance)) %>% 96 | full_join(glm_coeff,by='Variable') %>% 97 | filter(Coefficient != 0) 98 | ``` 99 | 100 | 101 | 102 | 103 | ```{r results} 104 | resamps <- resamples(list(nnet=nnet, 105 | glmnet=glmnet, 106 | svmrad=svmrad, 107 | gbm=gbm, 108 | glm=glm)) 109 | 110 | # Accuracy comparison 111 | dotplot(resamps, metric = "Accuracy") 112 | 113 | # Difference in accuracy 114 | bwplot(diff(resamps)) 115 | ``` 116 | 117 | ## Glmnet (Elastic Net) Model 118 | 119 | ```{r,results='asis',warning=F,message=F} 120 | kable(varImportance,format='markdown') %>% 121 | kable_styling(bootstrap_options = c("striped",'border')) 122 | ``` 123 | -------------------------------------------------------------------------------- /R/Caret.md: -------------------------------------------------------------------------------- 1 | 
Machine Learning with Caret 2 | ================ 3 | Jesse Cambon 4 | 22 November, 2019 5 | 6 | Demonstrate a machine learning workflow with 7 | caret 8 | 9 | ## References 10 | 11 | - 12 | - 13 | 14 | ## Setup 15 | 16 | ``` r 17 | library(mlbench) # machine learning reference datasets 18 | library(tidyverse) 19 | library(broom) 20 | library(caret) 21 | library(skimr) 22 | library(knitr) 23 | library(kableExtra) 24 | 25 | data(BreastCancer) # load 26 | 27 | # Set seed for reproducibility 28 | set.seed(45) 29 | ``` 30 | 31 | ## Build Model 32 | 33 | ``` r 34 | #skim(BreastCancer) 35 | 36 | BC <- BreastCancer %>% as_tibble() %>% 37 | dplyr::select(-Id) %>% 38 | # should really use imputation but we'll just drop nas for now 39 | drop_na() 40 | 41 | # Use k-fold cross-validation 42 | TC <- trainControl(method="cv", number=5) 43 | 44 | # Train some models 45 | 46 | # Neural Net 47 | nnet <- train(Class ~ . , BC,method='nnet',trControl=TC,verbose=FALSE) 48 | 49 | # Gradient Boosted Machine 50 | gbm <- train(Class ~ . , BC,method='gbm',trControl=TC) 51 | 52 | # Radial SVM 53 | svmrad <- train(Class ~ . , BC,method='svmRadial',trControl=TC) 54 | 55 | # Elastic-net 56 | glmnet <- train(Class ~ . , BC,method='glmnet',trControl=TC,tuneLength=5) 57 | 58 | # Logistic regression - did not converge 59 | glm <- train(Class ~ . 
, BC,method='glm',trControl=TC) 60 | ``` 61 | 62 | ``` r 63 | # Look at results of Glmnet model 64 | 65 | # Extract coefficients from optimal model 66 | glm_coeff <- coef(glmnet$finalModel,glmnet$finalModel$lambdaOpt) %>% 67 | as.matrix() %>% as.data.frame() %>% 68 | rownames_to_column('Variable') %>% 69 | as_tibble() %>% 70 | rename(Coefficient=2) %>% 71 | arrange(desc(abs(Coefficient))) 72 | 73 | 74 | # Combine variable importance data with coefficients 75 | varImportance <- varImp(glmnet)$importance %>% 76 | rownames_to_column('Variable') %>% 77 | rename(Importance=2) %>% 78 | arrange(desc(Importance)) %>% 79 | full_join(glm_coeff,by='Variable') %>% 80 | filter(Coefficient != 0) 81 | ``` 82 | 83 | ``` r 84 | resamps <- resamples(list(nnet=nnet, 85 | glmnet=glmnet, 86 | svmrad=svmrad, 87 | gbm=gbm, 88 | glm=glm)) 89 | 90 | # Accuracy comparison 91 | dotplot(resamps, metric = "Accuracy") 92 | ``` 93 | 94 | ![](/Users/jessecambon/Documents/Data-Science-Codex/rmd_images/Caret/results-1.png) 95 | 96 | ``` r 97 | # Difference in accuracy 98 | bwplot(diff(resamps)) 99 | ``` 100 | 101 | ![](/Users/jessecambon/Documents/Data-Science-Codex/rmd_images/Caret/results-2.png) 102 | 103 | ## Glmnet (Elastic Net) Model 104 | 105 | ``` r 106 | kable(varImportance,format='markdown') %>% 107 | kable_styling(bootstrap_options = c("striped",'border')) 108 | ``` 109 | 110 | | Variable | Importance | Coefficient | 111 | | :---------------- | ----------: | ----------: | 112 | | Cl.thickness.L | 100.0000000 | 3.1361533 | 113 | | Bare.nuclei9 | 80.1349380 | 2.5131545 | 114 | | Bare.nuclei6 | 72.1692163 | 2.2633373 | 115 | | Bare.nuclei10 | 62.8228881 | 1.9702221 | 116 | | Cell.shape.L | 60.0936317 | 1.8846284 | 117 | | Marg.adhesion.L | 59.9667240 | 1.8806484 | 118 | | Cell.size.L | 54.3790530 | 1.7054105 | 119 | | Normal.nucleoli10 | 51.3425770 | 1.6101819 | 120 | | Normal.nucleoli9 | 48.7621790 | 1.5292567 | 121 | | Bl.cromatin5 | 42.9191401 | 1.3460100 | 122 | | Marg.adhesion^9 | 
38.8206640 | 1.2174755 | 123 | | Normal.nucleoli4 | 38.0200861 | 1.1923682 | 124 | | Cell.shape.Q | 34.0212706 | \-1.0669592 | 125 | | Cl.thickness^8 | 27.9829503 | \-0.8775882 | 126 | | Normal.nucleoli2 | 27.4707392 | \-0.8615245 | 127 | | Epith.c.size^4 | 27.1064505 | 0.8500998 | 128 | | Bare.nuclei4 | 26.4996140 | 0.8310685 | 129 | | Cell.size^8 | 25.6821554 | 0.8054318 | 130 | | Bare.nuclei3 | 24.5833551 | 0.7709717 | 131 | | Bare.nuclei7 | 21.8755488 | 0.6860507 | 132 | | Cell.size.C | 20.0900403 | 0.6300545 | 133 | | Bare.nuclei5 | 19.8094615 | 0.6212551 | 134 | | Bl.cromatin7 | 18.8161804 | 0.5901043 | 135 | | Cl.thickness.Q | 17.8206566 | 0.5588831 | 136 | | Epith.c.size.L | 17.2140778 | 0.5398599 | 137 | | Cell.shape.C | 15.8214699 | 0.4961855 | 138 | | Bare.nuclei8 | 15.7832696 | 0.4949875 | 139 | | Cell.shape^8 | 15.5873838 | 0.4888443 | 140 | | Normal.nucleoli6 | 15.5750643 | 0.4884579 | 141 | | Epith.c.size^8 | 15.1072819 | 0.4737875 | 142 | | Cell.size^5 | 14.7341024 | 0.4620840 | 143 | | Mitoses10 | 14.6725910 | 0.4601549 | 144 | | Cell.size.Q | 13.5285932 | \-0.4242774 | 145 | | Cl.thickness^5 | 12.5086592 | 0.3922907 | 146 | | Normal.nucleoli7 | 11.2372547 | \-0.3524175 | 147 | | Bl.cromatin4 | 11.0371135 | 0.3461408 | 148 | | Epith.c.size^5 | 10.1426736 | \-0.3180898 | 149 | | Bl.cromatin8 | 8.5926819 | 0.2694797 | 150 | | Epith.c.size^9 | 8.1088768 | 0.2543068 | 151 | | Normal.nucleoli3 | 6.1834552 | 0.1939226 | 152 | | Cell.size^6 | 6.0301535 | \-0.1891149 | 153 | | Marg.adhesion.C | 5.6067864 | 0.1758374 | 154 | | Marg.adhesion^8 | 5.0073879 | \-0.1570394 | 155 | | Epith.c.size^7 | 4.8165935 | \-0.1510558 | 156 | | Bl.cromatin10 | 3.7579941 | 0.1178565 | 157 | | Marg.adhesion^4 | 0.3997860 | \-0.0125379 | 158 | | Cell.shape^5 | 0.2113266 | \-0.0066275 | 159 | | Cl.thickness.C | 0.1668182 | 0.0052317 | 160 | | (Intercept) | NA | 0.5035466 | 161 | -------------------------------------------------------------------------------- /R/Clustering.Rmd: 
-------------------------------------------------------------------------------- 1 | --- 2 | title: "K-means Clustering" 3 | author: "Jesse Cambon" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | output: 6 | github_document: 7 | toc: true 8 | toc_depth: 2 9 | --- 10 | 11 | Demonstrate K-means clustering 12 | 13 | ## References 14 | * https://uc-r.github.io/kmeans_clustering 15 | * https://cran.r-project.org/web/packages/broom/vignettes/kmeans.html 16 | 17 | ```{r knit-settings, include=FALSE} 18 | library(here) 19 | source(here("rmd_config.R")) 20 | ``` 21 | 22 | ## Setup 23 | 24 | ```{r setup,warning=F,message=F} 25 | library(tidyverse) 26 | library(knitr) 27 | library(kableExtra) 28 | library(broom) 29 | library(factoextra) 30 | library(wesanderson) 31 | ``` 32 | 33 | ## Cluster Data 34 | 35 | ```{r} 36 | library(fueleconomy) 37 | my_vehicles <- vehicles %>% filter(year == 2015) %>% 38 | drop_na(cyl,displ,cty) 39 | 40 | vehicles_sel <- my_vehicles %>% 41 | select(cty,cyl,displ) 42 | 43 | # Scale variables for clustering 44 | vehicles_sel_scaled <- vehicles_sel %>% 45 | mutate_all(scale) 46 | 47 | # Try different numbers of clusters 48 | clust3 <- kmeans(vehicles_sel_scaled, centers = 3) 49 | clust5 <- kmeans(vehicles_sel_scaled, centers = 5) 50 | clust7 <- kmeans(vehicles_sel_scaled, centers = 7) 51 | clust10 <- kmeans(vehicles_sel_scaled, centers = 10) 52 | ``` 53 | 54 | 55 | ## View Results 56 | 57 | ```{r} 58 | combine_results <- augment(clust3, my_vehicles) 59 | 60 | combine_summ <- combine_results %>% group_by(.cluster) %>% 61 | summarize(num_vehicles=n(), 62 | mean_cty=mean(cty), 63 | min_cty=min(cty), 64 | max_cty=max(cty), 65 | mean_displ=mean(displ), 66 | mean_cyl=mean(cyl)) %>% 67 | arrange(desc(mean_cty)) 68 | ``` 69 | 70 | 71 | ```{r clusterresults,results='asis',warning=F} 72 | kable(combine_summ,format='markdown',digits=2) %>% 73 | kable_styling(bootstrap_options = c("striped",'border')) 74 | ``` 75 | 76 | ## Visualize 77 | 78 | ```{r} 79 | 
fviz_cluster(clust3,data=vehicles_sel,repel=F,ggtheme=theme_bw()) 80 | 81 | ggplot(aes(x=cyl,y=cty,color=.cluster),data=combine_results) + 82 | geom_jitter() + 83 | theme_bw() + 84 | theme(legend.position='top') + 85 | scale_color_manual(values=wes_palette('Darjeeling1')) + 86 | guides(color = guide_legend(title='Cluster',override.aes = list(size=2.5))) + 87 | xlab('Cylinders (cyl)') + 88 | ylab('City Fuel Economy (cty)') 89 | ``` 90 | -------------------------------------------------------------------------------- /R/Clustering.md: -------------------------------------------------------------------------------- 1 | K-means Clustering 2 | ================ 3 | Jesse Cambon 4 | 24 November, 2019 5 | 6 | Demonstrate K-means clustering 7 | 8 | ## References 9 | 10 | - 11 | - 12 | 13 | ## Setup 14 | 15 | ``` r 16 | library(tidyverse) 17 | library(knitr) 18 | library(kableExtra) 19 | library(broom) 20 | library(factoextra) 21 | library(wesanderson) 22 | ``` 23 | 24 | ## Cluster Data 25 | 26 | ``` r 27 | library(fueleconomy) 28 | my_vehicles <- vehicles %>% filter(year == 2015) %>% 29 | drop_na(cyl,displ,cty) 30 | 31 | vehicles_sel <- my_vehicles %>% 32 | select(cty,cyl,displ) 33 | 34 | # Scale variables for clustering 35 | vehicles_sel_scaled <- vehicles_sel %>% 36 | mutate_all(scale) 37 | 38 | # Try different numbers of clusters 39 | clust3 <- kmeans(vehicles_sel_scaled, centers = 3) 40 | clust5 <- kmeans(vehicles_sel_scaled, centers = 5) 41 | clust7 <- kmeans(vehicles_sel_scaled, centers = 7) 42 | clust10 <- kmeans(vehicles_sel_scaled, centers = 10) 43 | ``` 44 | 45 | ## View Results 46 | 47 | ``` r 48 | combine_results <- augment(clust3, my_vehicles) 49 | 50 | combine_summ <- combine_results %>% group_by(.cluster) %>% 51 | summarize(num_vehicles=n(), 52 | mean_cty=mean(cty), 53 | min_cty=min(cty), 54 | max_cty=max(cty), 55 | mean_displ=mean(displ), 56 | mean_cyl=mean(cyl)) %>% 57 | arrange(desc(mean_cty)) 58 | ``` 59 | 60 | ``` r 61 | 
kable(combine_summ,format='markdown',digits=2) %>% 62 | kable_styling(bootstrap_options = c("striped",'border')) 63 | ``` 64 | 65 | | .cluster | num\_vehicles | mean\_cty | min\_cty | max\_cty | mean\_displ | mean\_cyl | 66 | | :------- | ------------: | --------: | -------: | -------: | ----------: | --------: | 67 | | 3 | 86 | 25.05 | 20 | 33 | 1.87 | 4.05 | 68 | | 2 | 55 | 18.71 | 16 | 24 | 3.12 | 5.78 | 69 | | 1 | 63 | 15.19 | 11 | 20 | 5.12 | 8.29 | 70 | 71 | ## Visualize 72 | 73 | ``` r 74 | fviz_cluster(clust3,data=vehicles_sel,repel=F,ggtheme=theme_bw()) 75 | ``` 76 | 77 | ![](/rmd_images/Clustering/unnamed-chunk-3-1.png) 78 | 79 | ``` r 80 | ggplot(aes(x=cyl,y=cty,color=.cluster),data=combine_results) + 81 | geom_jitter() + 82 | theme_bw() + 83 | theme(legend.position='top') + 84 | scale_color_manual(values=wes_palette('Darjeeling1')) + 85 | guides(color = guide_legend(title='Cluster',override.aes = list(size=2.5))) + 86 | xlab('Cylinders (cyl)') + 87 | ylab('City Fuel Economy (cty)') 88 | ``` 89 | 90 | ![](../rmd_images/Clustering/unnamed-chunk-3-2.png) 91 | -------------------------------------------------------------------------------- /R/Comparing_Bayesian_Packages.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Comparing Bayesian Modeling Packages" 3 | author: "Jesse Cambon" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | output: 6 | github_document: 7 | toc: true 8 | --- 9 | 10 | Compare rstan, brms, and rstanarm 11 | 12 | ```{r knit-settings, include=FALSE} 13 | library(here) 14 | source(here("rmd_config.R")) 15 | ``` 16 | 17 | ```{r,message=F,warning=F} 18 | library(rstan) 19 | library(brms) 20 | library(rstanarm) 21 | library(tidyverse) 22 | library(bayesplot) 23 | options(mc.cores = parallel::detectCores()) 24 | ``` 25 | 26 | 27 | ## Rstan 28 | 29 | Walking through this example: https://cran.r-project.org/web/packages/rstan/vignettes/rstan.html#sample-from-the-posterior-distribution 30 
| 31 | ```{r} 32 | # Sample Dataset 33 | schools_data <- list( 34 | J = 8, 35 | y = c(28, 8, -3, 7, -1, 1, 18, 12), 36 | sigma = c(15, 10, 16, 11, 9, 11, 10, 18) 37 | ) 38 | 39 | stan_code <- " 40 | data { 41 | int J; // number of schools 42 | real y[J]; // estimated treatment effects 43 | real sigma[J]; // s.e. of effect estimates 44 | } 45 | parameters { 46 | real mu; 47 | real tau; 48 | vector[J] eta; 49 | } 50 | transformed parameters { 51 | vector[J] theta; 52 | theta = mu + tau * eta; 53 | } 54 | model { 55 | target += normal_lpdf(eta | 0, 1); 56 | target += normal_lpdf(y | theta, sigma); 57 | }" 58 | ``` 59 | 60 | 61 | ```{r} 62 | fit1 <- stan( 63 | model_code = stan_code, # Stan program 64 | data = schools_data, # named list of data 65 | chains = 4, # number of Markov chains 66 | warmup = 1000, # number of warmup iterations per chain 67 | iter = 2000, # total number of iterations per chain 68 | cores = 2, # number of cores (could use one per chain) 69 | refresh = 0 # no progress shown 70 | ) 71 | ``` 72 | 73 | ## Brms 74 | 75 | Example based on : https://github.com/paul-buerkner/brms 76 | 77 | * `(1 | var)` is used to specify a random intercept 78 | 79 | Mixed effect model has both random effects and fixed effects 80 | 81 | * https://www.theanalysisfactor.com/understanding-random-effects-in-mixed-models/ 82 | * https://ourcodingclub.github.io/tutorials/mixed-models/#what 83 | * https://ase.tufts.edu/gsc/gradresources/guidetomixedmodelsinr/mixed%20model%20guide.html 84 | * https://en.wikipedia.org/wiki/Mixed_model 85 | 86 | ```{r} 87 | fit1 <- brm(count ~ zAge + zBase * Trt + (1|patient), 88 | data = epilepsy, family = poisson()) 89 | fit2 <- brm(count ~ zAge + zBase * Trt + (1|patient) + (1|obs), 90 | data = epilepsy, family = poisson()) 91 | 92 | ``` 93 | 94 | ```{r} 95 | fit1 96 | ``` 97 | 98 | 99 | ```{r} 100 | plot(fit1, pars = c("Trt", "zBase")) 101 | plot(fit2, pars = c("Trt", "zBase")) 102 | 103 | ``` 104 | 105 | Compare model results with 
leave-one-out validation 106 | 107 | https://mc-stan.org/loo/ 108 | 109 | ```{r} 110 | loo(fit1, fit2) 111 | ``` 112 | 113 | ## rstanarm 114 | 115 | Rstanarm example compared with brms 116 | 117 | * https://mc-stan.org/loo/articles/loo2-example.html 118 | * http://mc-stan.org/rstanarm/articles/count.html 119 | 120 | brms prior setting: https://www.jamesrrae.com/post/bayesian-logistic-regression-using-brms-part-1/ 121 | 122 | ```{r} 123 | # Use rstanarm to fit a poisson model 124 | roach_pois <- 125 | stan_glm( 126 | formula = y ~ roach1 + treatment + senior, 127 | offset = log(exposure2), 128 | data = roaches, 129 | family = poisson(link = "log"), 130 | prior = normal(0, 2.5, autoscale = TRUE), 131 | prior_intercept = normal(0, 5, autoscale = TRUE), 132 | seed = 12345 133 | ) 134 | 135 | # # Use rstanarm to fit a negative binomial model 136 | roach_negbinom2 <- update(roach_pois, family = neg_binomial_2) 137 | ``` 138 | 139 | Fit a Brms model for comparison 140 | 141 | ```{r} 142 | # Priors to be used by brm 143 | my_priors <- c( 144 | prior(normal(0, 5), class = "Intercept"), 145 | prior(normal(0, 2.5), class = "b") 146 | ) 147 | 148 | # Fit with zero inflated negative binomial with brm 149 | roach_zinb <- 150 | brm( 151 | formula=y ~ roach1 + treatment + senior, 152 | data = roaches, prior = my_priors, # pass the priors defined above; otherwise brm falls back to its defaults 153 | family = zero_inflated_negbinomial, 154 | seed = 12345 155 | ) 156 | ``` 157 | 158 | ```{r} 159 | plot(roach_pois) 160 | plot(roach_zinb,pars=c('roach1','treatment','senior')) 161 | ``` 162 | 163 | 164 | ```{r} 165 | pp_check(roach_pois, plotfun='stat') 166 | pp_check(roach_negbinom2, plotfun='stat') 167 | pp_check(roach_zinb, plotfun='stat') 168 | ``` 169 | 170 | ```{r} 171 | prop_zero <- function(y) mean(y == 0) 172 | 173 | prop_zero_test1 <- pp_check(roach_pois, plotfun = "stat", stat = "prop_zero", binwidth = .005) 174 | prop_zero_test2 <- pp_check(roach_negbinom2, plotfun = "stat", stat = "prop_zero", 175 | binwidth = 0.01) 176 | prop_zero_test3 <- pp_check(roach_zinb,
plotfun = "stat", stat = "prop_zero", 177 | binwidth = 0.01) 178 | 179 | # Show graphs for Poisson and negative binomial side by side 180 | bayesplot_grid(prop_zero_test1 + ggtitle("Poisson"), 181 | prop_zero_test2 + ggtitle("Negative Binomial"), 182 | prop_zero_test3 + ggtitle("Zero Inflated Negative Binomial"), 183 | grid_args = list(ncol = 3)) 184 | ``` 185 | 186 | 187 | 188 | ```{r} 189 | #loo(roach_pois, roach_negbinom2) 190 | ``` 191 | 192 | -------------------------------------------------------------------------------- /R/Distribution_Sampling.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Distribution Sampling and Hypothesis Testing" 3 | author: "Jesse Cambon" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | output: 6 | github_document: 7 | toc: true 8 | --- 9 | 10 | ```{r knit-settings, include=FALSE} 11 | library(here) 12 | source(here("rmd_config.R")) 13 | ``` 14 | 15 | References: 16 | * http://appliedpredictivemodeling.com/data 17 | * http://faculty.marshall.usc.edu/gareth-james/ISL/data.html 18 | 19 | ```{r,message=F,warning=F} 20 | library(tidyverse) 21 | library(bayestestR) 22 | library(BayesFactor) 23 | library(jcolors) 24 | library(infer) 25 | library(broom) 26 | library(knitr) 27 | 28 | set.seed(42) # for reproducibility 29 | ``` 30 | 31 | Perform sampling 32 | 33 | ```{r} 34 | bernouli_sample <- rbernoulli(10,p=0.9) # T/F 35 | uniform = runif(10,-4,4) 36 | 37 | num_rows <- 1000 38 | 39 | dist <- 40 | tibble( 41 | cauchy=rcauchy(num_rows,0,0.5), 42 | norm_sample = rnorm(num_rows,0,0.5), 43 | beta_sample = rbeta(num_rows,0,1) 44 | ) %>% 45 | pivot_longer(everything(),values_to='value',names_to='distribution') 46 | 47 | 48 | # Distributions used for count data 49 | count_dist <- tibble(poisson= rpois(num_rows,2), 50 | `negative binomial`=rnbinom(num_rows,1,mu=2), 51 | binom_sample = rbinom(num_rows,9,.25), 52 | weibull=rweibull(num_rows,1.4) 53 | ) %>% 54 | 
pivot_longer(everything(),values_to='value',names_to='distribution') 55 | ``` 56 | 57 | Compare some distributions 58 | 59 | ```{r} 60 | ggplot(data=dist,aes(x=value,color=distribution)) + 61 | # facet_wrap(~distribution,ncol=1) + 62 | scale_x_continuous(limits =c(-3,3)) + 63 | theme_minimal() + 64 | theme(legend.position='top') + 65 | geom_density(alpha=0.8) + 66 | scale_color_jcolors('default') + 67 | xlab('') + ylab('') 68 | ``` 69 | 70 | Poisson v Neg Binomial v Weibull 71 | 72 | ```{r} 73 | ggplot(data=count_dist,aes(x=value,color=distribution)) + 74 | # facet_wrap(~distribution,ncol=1) + 75 | scale_x_continuous(limits =c(0,8)) + 76 | theme_minimal() + 77 | theme(legend.position='top') + 78 | geom_density(alpha=0.8) + 79 | scale_color_jcolors('default') + 80 | xlab('') + ylab('') 81 | ``` 82 | 83 | 84 | ## Significance Testing 85 | 86 | ### T-test (Frequentist version) 87 | 88 | ```{r} 89 | t.test(trees$Height) 90 | ``` 91 | 92 | Simulate some data and run more T-tests 93 | 94 | ```{r} 95 | compare_norms <- rnorm(100,25,10) %>% 96 | as_tibble() %>% rename(sample1=value) %>% 97 | mutate(sample2 = rnorm(100,28,10)) 98 | 99 | results <- t.test(compare_norms$sample1,compare_norms$sample2) 100 | results 101 | ``` 102 | 103 | Tidy T-test (infer package) 104 | 105 | https://infer.netlify.app/ 106 | 107 | ```{r} 108 | compare_norms_long <- 109 | compare_norms %>% 110 | pivot_longer(everything(),names_to='sample', values_to='value') 111 | 112 | compare_norms_long %>% 113 | t_test(value ~ sample,order=c('sample1','sample2')) %>% 114 | kable() 115 | ``` 116 | 117 | 118 | ### Bayesian T-test 119 | 120 | https://easystats.github.io/bayestestR/articles/example2.html 121 | ```{r} 122 | bayes_result <- BayesFactor::ttestBF(compare_norms$sample1,compare_norms$sample2) 123 | bayes_result 124 | ``` 125 | ```{r} 126 | describe_posterior(bayes_result) %>% kable() 127 | ``` 128 | 129 | -------------------------------------------------------------------------------- 
/R/Distribution_Sampling.md: -------------------------------------------------------------------------------- 1 | Distribution Sampling and Hypothesis Testing 2 | ================ 3 | Jesse Cambon 4 | 02 February, 2021 5 | 6 | - [Significance Testing](#significance-testing) 7 | - [T-test (Frequentist version)](#t-test-frequentist-version) 8 | - [Bayesian T-test](#bayesian-t-test) 9 | 10 | References: \* \* 11 | 12 | 13 | ``` r 14 | library(tidyverse) 15 | library(bayestestR) 16 | library(BayesFactor) 17 | library(jcolors) 18 | library(infer) 19 | library(broom) 20 | library(knitr) 21 | 22 | set.seed(42) # for reproducibility 23 | ``` 24 | 25 | Perform sampling 26 | 27 | ``` r 28 | bernouli_sample <- rbernoulli(10,p=0.9) # T/F 29 | uniform = runif(10,-4,4) 30 | 31 | num_rows <- 1000 32 | 33 | dist <- 34 | tibble( 35 | cauchy=rcauchy(num_rows,0,0.5), 36 | norm_sample = rnorm(num_rows,0,0.5), 37 | beta_sample = rbeta(num_rows,0,1) 38 | ) %>% 39 | pivot_longer(everything(),values_to='value',names_to='distribution') 40 | 41 | 42 | # Distributions used for count data 43 | count_dist <- tibble(poisson= rpois(num_rows,2), 44 | `negative binomial`=rnbinom(num_rows,1,mu=2), 45 | binom_sample = rbinom(num_rows,9,.25), 46 | weibull=rweibull(num_rows,1.4) 47 | ) %>% 48 | pivot_longer(everything(),values_to='value',names_to='distribution') 49 | ``` 50 | 51 | Compare some distributions 52 | 53 | ``` r 54 | ggplot(data=dist,aes(x=value,color=distribution)) + 55 | # facet_wrap(~distribution,ncol=1) + 56 | scale_x_continuous(limits =c(-3,3)) + 57 | theme_minimal() + 58 | theme(legend.position='top') + 59 | geom_density(alpha=0.8) + 60 | scale_color_jcolors('default') + 61 | xlab('') + ylab('') 62 | ``` 63 | 64 | ## Warning: Removed 116 rows containing non-finite values (stat_density). 
65 | 66 | ![](../rmd_images/Distribution_Sampling/unnamed-chunk-3-1.png) 67 | 68 | Poisson v Neg Binomial v Weibull 69 | 70 | ``` r 71 | ggplot(data=count_dist,aes(x=value,color=distribution)) + 72 | # facet_wrap(~distribution,ncol=1) + 73 | scale_x_continuous(limits =c(0,8)) + 74 | theme_minimal() + 75 | theme(legend.position='top') + 76 | geom_density(alpha=0.8) + 77 | scale_color_jcolors('default') + 78 | xlab('') + ylab('') 79 | ``` 80 | 81 | ## Warning: Removed 25 rows containing non-finite values (stat_density). 82 | 83 | ![](../rmd_images/Distribution_Sampling/unnamed-chunk-4-1.png) 84 | 85 | ## Significance Testing 86 | 87 | ### T-test (Frequentist version) 88 | 89 | ``` r 90 | t.test(trees$Height) 91 | ``` 92 | 93 | ## 94 | ## One Sample t-test 95 | ## 96 | ## data: trees$Height 97 | ## t = 66.41, df = 30, p-value < 2.2e-16 98 | ## alternative hypothesis: true mean is not equal to 0 99 | ## 95 percent confidence interval: 100 | ## 73.6628 78.3372 101 | ## sample estimates: 102 | ## mean of x 103 | ## 76 104 | 105 | Simulate some data and run more T-tests 106 | 107 | ``` r 108 | compare_norms <- rnorm(100,25,10) %>% 109 | as_tibble() %>% rename(sample1=value) %>% 110 | mutate(sample2 = rnorm(100,28,10)) 111 | 112 | results <- t.test(compare_norms$sample1,compare_norms$sample2) 113 | results 114 | ``` 115 | 116 | ## 117 | ## Welch Two Sample t-test 118 | ## 119 | ## data: compare_norms$sample1 and compare_norms$sample2 120 | ## t = 1.4176, df = 197.32, p-value = 0.1579 121 | ## alternative hypothesis: true difference in means is not equal to 0 122 | ## 95 percent confidence interval: 123 | ## -0.7904857 4.8324013 124 | ## sample estimates: 125 | ## mean of x mean of y 126 | ## 28.30323 26.28227 127 | 128 | Tidy T-test (infer package) 129 | 130 | 131 | 132 | ``` r 133 | compare_norms_long <- 134 | compare_norms %>% 135 | pivot_longer(everything(),names_to='sample',values_to='value') 136 | 137 | compare_norms_long %>% 138 | t_test(value ~ 
sample,order=c('sample1','sample2')) %>% 139 | kable() 140 | ``` 141 | 142 | | statistic | t\_df | p\_value | alternative | lower\_ci | upper\_ci | 143 | |----------:|---------:|----------:|:------------|-----------:|----------:| 144 | | 1.417581 | 197.3181 | 0.1578903 | two.sided | -0.7904857 | 4.832401 | 145 | 146 | ### Bayesian T-test 147 | 148 | 149 | 150 | ``` r 151 | bayes_result <- BayesFactor::ttestBF(compare_norms$sample1,compare_norms$sample2) 152 | bayes_result 153 | ``` 154 | 155 | ## Bayes factor analysis 156 | ## -------------- 157 | ## [1] Alt., r=0.707 : 0.3932028 ±0% 158 | ## 159 | ## Against denominator: 160 | ## Null, mu1-mu2 = 0 161 | ## --- 162 | ## Bayes factor type: BFindepSample, JZS 163 | 164 | ``` r 165 | describe_posterior(bayes_result) %>% kable() 166 | ``` 167 | 168 | | Parameter | Median | CI | CI\_low | CI\_high | pd | ROPE\_CI | ROPE\_low | ROPE\_high | ROPE\_Percentage | BF | Prior\_Distribution | Prior\_Location | Prior\_Scale | 169 | |:-----------|----------:|----:|----------:|----------:|--------:|---------:|----------:|-----------:|-----------------:|----------:|:--------------------|----------------:|-------------:| 170 | | Difference | -1.885353 | 89 | -4.149145 | 0.2915764 | 0.91525 | 89 | -1.010632 | 1.010632 | 0.232519 | 0.3932028 | cauchy | 0 | 0.7071068 | 171 | -------------------------------------------------------------------------------- /R/Geospatial_Analysis.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Geospatial Analysis" 3 | author: "Jesse Cambon" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | output: 6 | github_document: 7 | toc: true 8 | --- 9 | 10 | ```{r knit-settings, include=FALSE} 11 | library(here) 12 | source(here("rmd_config.R")) 13 | ``` 14 | 15 | Install fiftystater package from: https://github.com/wmurphyrd/fiftystater 16 | 17 | ## References 18 | * https://github.com/mtennekes/tmap 19 | *
https://mran.revolutionanalytics.com/snapshot/2016-03-22/web/packages/tmap/vignettes/tmap-nutshell.html 20 | 21 | ## Setup 22 | 23 | ```{r,warning=F,message=F} 24 | library(tidyverse) 25 | library(tidycensus) # census data 26 | library(ggplot2) 27 | #library(sf) # geospatial methods 28 | library(tmap) # thematic mapping 29 | library(viridis) # color scheme 30 | #library(wbstats) # world bank 31 | library(wesanderson) # colors 32 | library(fiftystater) # US state geometries 33 | 34 | options(tigris_use_cache = TRUE) 35 | 36 | ``` 37 | 38 | # Geographies 39 | 40 | ## Locales 41 | 42 | Use the tidycensus package to pull Census data and display it on a map with the tmap package. 43 | ```{r locale,warning=F,message=F} 44 | # Pull Census Rent Data for Boston using tidycensus package 45 | bos <- get_acs(geography = "tract", 46 | variables = "B25064_001E", # median gross rent 47 | state = "MA", 48 | county = c("Suffolk",'Middlesex'), 49 | geometry = TRUE) 50 | 51 | tm_shape(bos) + 52 | tm_fill('estimate',colorNA = "white",breaks=c(0,1000,1500,2000,3500), 53 | title='Median Rent') + 54 | tm_borders() + 55 | tm_style("classic") + 56 | # margin format is c(bottom,left,top,right) 57 | tm_layout(inner.margins = c(0.05, .05, .05, .05),main.title.position='center',legend.position=c('left','bottom'), 58 | legend.text.size=0.8,legend.title.size=1.3, 59 | main.title='Boston Area Rent by Census Tract', 60 | main.title.size=1.5) 61 | 62 | #vars <- load_variables(2016,'acs1') # view census variables 63 | ``` 64 | 65 | http://www.robinlovelace.net/presentations/spatial-tidyverse.html#11 66 | https://cran.r-project.org/web/packages/wbstats/vignettes/Using_the_wbstats_package.html 67 | 68 | ## United States 69 | 70 | ```{r} 71 | 72 | data("fifty_states") # fiftystater package 73 | 74 | crimes <- data.frame(state = tolower(rownames(USArrests)), USArrests) %>% 75 | # Make a categorical variable for Murder rates with a predefined interval 76 | mutate(Murder_cut = 
str_replace_all(cut_width(Murder,5,boundary=0),',',' - ')) %>% 77 | # Delete all characters except for digits, whitespace, and '-' 78 | mutate(Murder_cut = str_replace_all(Murder_cut,'[^\\d\\s-]','')) 79 | 80 | # make an ordered list of levels so our categorical variable is sorted properly 81 | Murder_cut_levels <- crimes %>% arrange(Murder) %>% pull(Murder_cut) %>% 82 | unique() 83 | 84 | 85 | # map_id creates the aesthetic mapping to the state name column in your data 86 | ggplot(crimes, aes(map_id = state)) + 87 | # map points to the fifty_states shape data 88 | geom_map(aes(fill = factor(Murder_cut,levels=Murder_cut_levels)), 89 | map = fifty_states, color='white',size=0.2) + # geometry from fiftystater package 90 | expand_limits(x = fifty_states$long, y = fifty_states$lat) + 91 | coord_map() + 92 | theme(plot.title = element_text(lineheight=1, face="bold",hjust = 0.5)) + 93 | scale_x_continuous(breaks = NULL) + 94 | scale_y_continuous(breaks = NULL) + 95 | # USArrests holds 1973 statistics (see ?USArrests); the 1975 almanac in the caption is only the source 96 | labs(x = "", y = "",title='State Murder Rates in 1973', 97 | caption='Data: World Almanac and Book of facts 1975.
(Crime rates)') + 98 | theme(legend.position = "right", 99 | panel.background = element_blank(), 100 | panel.border=element_blank()) + 101 | scale_fill_viridis_d(direction=-1,option='inferno',end=0.9) + 102 | guides(fill = guide_legend(title='Murders Per\n100,000 Residents')) 103 | 104 | ``` 105 | 106 | ## The World 107 | 108 | ```{r} 109 | # Load world map geometry 110 | data(World) 111 | 112 | # Load coordinates of cities 113 | data(metro) 114 | 115 | tm_shape(World, projection = "eck4" # Eckert IV 1906 project (preserves area) 116 | ) + 117 | tm_polygons("gdp_cap_est", 118 | palette = "Greens", 119 | breaks = c(0, 1000, 5000, 10000, 25000, 50000, Inf), 120 | title = "GDP per capita") + 121 | # tm_style("classic",frame=F, 122 | # earth.boundary = c(-180, -87, 180, 87), 123 | # legend.text.size=0.8,legend.title.size=1.3) + 124 | tm_layout(bg.color='white') + 125 | # tm_format("World", inner.margins = 0.02, frame = FALSE) 126 | tm_legend(frame = TRUE) 127 | # tm_format("World",frame=F) 128 | 129 | metro <- metro %>% 130 | mutate(growth= 100*(pop2020 - pop2010) / pop2010) 131 | 132 | tm_shape(World, projection = "eck4" # Eckert IV 1906 project (preserves area) 133 | ) + 134 | tm_polygons("life_exp", palette = "Purples", 135 | breaks=c(50,65,80,Inf), 136 | title = "Life Expectancy", contrast=0.7, border.col = "gray30", id = "name") + 137 | # tm_borders() + 138 | tm_shape(metro) + 139 | tm_bubbles("pop2010", col = "growth", border.col = "black", 140 | border.alpha = 0.6, 141 | breaks=c(0,25,50,75,Inf), 142 | palette = "-RdYlGn", 143 | title.size = "Metro population (2010)", 144 | title.col = "Projected Growth by 2020 (%)", 145 | id = "name") + 146 | # tm_style("classic",frame=F, 147 | # earth.boundary = c(-180, -87, 180, 87), 148 | # legend.text.size=0.8,legend.title.size=1.3) + 149 | tm_layout(bg.color='white') + 150 | # tm_format("World", inner.margins = 0.02, frame = FALSE) 151 | tm_legend(frame = F) 152 | 153 | ``` 154 | 155 | 156 | ```{r,include=F,eval=F} 157 | 
View(worldbank_df) 158 | 159 | qtm(world) 160 | 161 | Arrests <- USArrests %>% rownames_to_column('State') %>% 162 | as_tibble() 163 | 164 | us <- usa_composite() %>% forti 165 | %>% 166 | left_join(Arrests,by=c('name'='State')) 167 | 168 | left_join(U) 169 | 170 | 171 | us_map <- fortify(usa_composite() , region="fips_state") 172 | ggplot(us_map, aes(map_id=fips_state,fill=pop_2014)) + 173 | geom_map(map=us_map, color='#ffffff', size=0.1) + 174 | expand_limits(x=us_map$long,y=us_map$lat) + 175 | theme_map() + 176 | theme(legend.position="right") + 177 | coord_map("albers", lat0=30, lat1=40) + 178 | scale_fill_viridis(options='magma') 179 | # scale_fill_colormap("State Population\n(2014 Estimates)", labels=comma, 180 | # colormap = colormaps$copper, reverse = T, discrete = F) 181 | ``` 182 | 183 | 184 | 185 | -------------------------------------------------------------------------------- /R/Geospatial_Analysis.md: -------------------------------------------------------------------------------- 1 | Geospatial Analysis 2 | ================ 3 | Jesse Cambon 4 | 24 November, 2019 5 | 6 | Install the fiftystater package from: 7 | 8 | 9 | ## References 10 | 11 | - 12 | - 13 | 14 | ## Setup 15 | 16 | ``` r 17 | library(tidyverse) 18 | library(tidycensus) # census data 19 | library(ggplot2) 20 | #library(sf) # geospatial methods 21 | library(tmap) # thematic mapping 22 | library(viridis) # color scheme 23 | #library(wbstats) # world bank 24 | library(wesanderson) # colors 25 | library(fiftystater) # US state geometries 26 | 27 | options(tigris_use_cache = TRUE) 28 | ``` 29 | 30 | # Geographies 31 | 32 | ## Locales 33 | 34 | Use the tidycensus package to pull Census data and display it on a map 35 | with the tmap package. 
36 | 37 | ``` r 38 | # Pull Census Rent Data for Boston using tidycensus package 39 | bos <- get_acs(geography = "tract", 40 | variables = "B25064_001E", # median gross rent 41 | state = "MA", 42 | county = c("Suffolk",'Middlesex'), 43 | geometry = TRUE) 44 | 45 | tm_shape(bos) + 46 | tm_fill('estimate',colorNA = "white",breaks=c(0,1000,1500,2000,3500), 47 | title='Median Rent') + 48 | tm_borders() + 49 | tm_style("classic") + 50 | # margin format is c(bottom,left,top,right) 51 | tm_layout(inner.margins = c(0.05, .05, .05, .05),main.title.position='center',legend.position=c('left','bottom'), 52 | legend.text.size=0.8,legend.title.size=1.3, 53 | main.title='Boston Area Rent by Census Tract', 54 | main.title.size=1.5) 55 | ``` 56 | 57 | ![](../rmd_images/Geospatial_Analysis/locale-1.png) 58 | 59 | ``` r 60 | #vars <- load_variables(2016,'acs1') # view census variables 61 | ``` 62 | 63 | 64 | 65 | 66 | ## United States 67 | 68 | ``` r 69 | data("fifty_states") # fiftystater package 70 | 71 | crimes <- data.frame(state = tolower(rownames(USArrests)), USArrests) %>% 72 | # Make a categorical variable for Murder rates with a predefined interval 73 | mutate(Murder_cut = str_replace_all(cut_width(Murder,5,boundary=0),',',' - ')) %>% 74 | # Delete all characters except for digits, whitespace, and '-' 75 | mutate(Murder_cut = str_replace_all(Murder_cut,'[^\\d\\s-]','')) 76 | 77 | # make an ordered list of levels so our categorical variable is sorted properly 78 | Murder_cut_levels <- crimes %>% arrange(Murder) %>% pull(Murder_cut) %>% 79 | unique() 80 | 81 | 82 | # map_id creates the aesthetic mapping to the state name column in your data 83 | ggplot(crimes, aes(map_id = state)) + 84 | # map points to the fifty_states shape data 85 | geom_map(aes(fill = factor(Murder_cut,levels=Murder_cut_levels)), 86 | map = fifty_states, color='white',size=0.2) + # geometry from fiftystater package 87 | expand_limits(x = fifty_states$long, y = fifty_states$lat) + 88 | coord_map() + 89 | 
theme(plot.title = element_text(lineheight=1, face="bold",hjust = 0.5)) + 90 | scale_x_continuous(breaks = NULL) + 91 | scale_y_continuous(breaks = NULL) + 92 | 93 | labs(x = "", y = "",title='State Murder Rates in 1975', 94 | caption='Data: World Almanac and Book of facts 1975. (Crime rates)') + 95 | theme(legend.position = "right", 96 | panel.background = element_blank(), 97 | panel.border=element_blank()) + 98 | scale_fill_viridis_d(direction=-1,option='inferno',end=0.9) + 99 | guides(fill = guide_legend(title='Murders Per\n100,000 Residents')) 100 | ``` 101 | 102 | ![](../rmd_images/Geospatial_Analysis/unnamed-chunk-2-1.png) 103 | 104 | ## The World 105 | 106 | ``` r 107 | # Load world map geometry 108 | data(World) 109 | 110 | # Load coordinates of cities 111 | data(metro) 112 | 113 | tm_shape(World, projection = "eck4" # Eckert IV 1906 project (preserves area) 114 | ) + 115 | tm_polygons("gdp_cap_est", 116 | palette = "Greens", 117 | breaks = c(0, 1000, 5000, 10000, 25000, 50000, Inf), 118 | title = "GDP per capita") + 119 | # tm_style("classic",frame=F, 120 | # earth.boundary = c(-180, -87, 180, 87), 121 | # legend.text.size=0.8,legend.title.size=1.3) + 122 | tm_layout(bg.color='white') + 123 | # tm_format("World", inner.margins = 0.02, frame = FALSE) 124 | tm_legend(frame = TRUE) 125 | ``` 126 | 127 | ![](../rmd_images/Geospatial_Analysis/unnamed-chunk-3-1.png) 128 | 129 | ``` r 130 | # tm_format("World",frame=F) 131 | 132 | metro <- metro %>% 133 | mutate(growth= 100*(pop2020 - pop2010) / pop2010) 134 | 135 | tm_shape(World, projection = "eck4" # Eckert IV 1906 project (preserves area) 136 | ) + 137 | tm_polygons("life_exp", palette = "Purples", 138 | breaks=c(50,65,80,Inf), 139 | title = "Life Expectancy", contrast=0.7, border.col = "gray30", id = "name") + 140 | # tm_borders() + 141 | tm_shape(metro) + 142 | tm_bubbles("pop2010", col = "growth", border.col = "black", 143 | border.alpha = 0.6, 144 | breaks=c(0,25,50,75,Inf), 145 | palette = "-RdYlGn", 146 
| title.size = "Metro population (2010)", 147 | title.col = "Projected Growth by 2020 (%)", 148 | id = "name") + 149 | # tm_style("classic",frame=F, 150 | # earth.boundary = c(-180, -87, 180, 87), 151 | # legend.text.size=0.8,legend.title.size=1.3) + 152 | tm_layout(bg.color='white') + 153 | # tm_format("World", inner.margins = 0.02, frame = FALSE) 154 | tm_legend(frame = F) 155 | ``` 156 | 157 | ## Warning: Values have found that are less than the lowest break 158 | 159 | ## Warning: Values have found that are less than the lowest break 160 | 161 | ## Variable "growth" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette. 162 | 163 | ![](../rmd_images/Geospatial_Analysis/unnamed-chunk-3-2.png) 164 | -------------------------------------------------------------------------------- /R/Modeling_Workflow.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Modeling Workflow" 3 | author: "Jesse Cambon" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | output: 6 | github_document: 7 | toc: true 8 | --- 9 | 10 | Demonstrate model workflows with tidyverse, modelr, and broom. This notebook includes both a group_by and a nested approach which offer similar results. However, the nested model workflow embeds the data into the dataframe along with objects such as models. 11 | 12 | ## References 13 | * http://r4ds.had.co.nz/many-models.html 14 | 15 | ## Setup 16 | 17 | ```{r knit-settings, include=FALSE} 18 | library(here) 19 | source(here("rmd_config.R")) 20 | ``` 21 | 22 | ```{r setup,warning=F,message=F} 23 | library(tidyverse) 24 | library(gapminder) 25 | library(broom) 26 | #library(modelr) 27 | library(knitr) 28 | library(kableExtra) 29 | ``` 30 | 31 | ## Exploration 32 | 33 | These graphs show why log transforming GDP per Capita makes it correlate more linearly to our response variable, life expectancy. 
Log transformations are often useful for highly skewed variables in regression. 34 | 35 | ```{r explore} 36 | ggplot(data=gapminder, 37 | aes(x = gdpPercap, y = lifeExp, color = continent,group=1)) + 38 | geom_point(alpha=0.7) + 39 | theme_bw() + 40 | geom_smooth() + 41 | theme(legend.position='top', 42 | plot.title = element_text(lineheight=1, face="bold",hjust = 0.5)) + 43 | guides(color=guide_legend(override.aes = list(size=2.5))) 44 | 45 | ggplot(data=gapminder, 46 | aes(x = log10(gdpPercap), y = lifeExp, color = continent,group=1)) + 47 | geom_point(alpha=0.7) + 48 | theme_bw() + 49 | geom_smooth() + 50 | theme(legend.position='top', 51 | plot.title = element_text(lineheight=1, face="bold",hjust = 0.5)) + 52 | guides(color=guide_legend(override.aes = list(size=2.5))) 53 | 54 | ggplot(data=gapminder, 55 | aes(x = log10(pop), y = lifeExp, color = continent,group=1)) + 56 | geom_point(alpha=0.7) + 57 | #facet_grid(~continent) + 58 | theme_bw() + 59 | geom_smooth() + 60 | theme(legend.position='top', 61 | plot.title = element_text(lineheight=1, face="bold",hjust = 0.5)) + 62 | guides(color=guide_legend(override.aes = list(size=2.5))) 63 | 64 | 65 | 66 | ``` 67 | 68 | ## Grouped Models 69 | 70 | ```{r models} 71 | 72 | # One model per continent 73 | models <- gapminder %>% 74 | group_by(continent) %>% 75 | do(fit=lm(lifeExp ~ log10(gdpPercap)+log10(pop) + year, data=.)) 76 | 77 | stats <- glance(models,fit) %>% 78 | arrange(desc(r.squared)) 79 | 80 | coefficients <- tidy(models,fit) %>% 81 | filter(term != '(Intercept)') %>% 82 | arrange(continent,p.value) 83 | 84 | model_fit <- augment(models,fit) 85 | ``` 86 | 87 | 88 | ```{r plot} 89 | ggplot(data=model_fit, 90 | aes(x = .fitted, y = .resid, color = continent,group=1)) + 91 | geom_point(alpha=0.8) + 92 | facet_grid(~continent) + 93 | ggtitle('Fitted vs. 
Residual Check') + 94 | theme_bw() + 95 | geom_hline(yintercept=0,color='blue') + # horizontal line at 0 residual 96 | theme(legend.position='none', 97 | plot.title = element_text(lineheight=1, face="bold",hjust = 0.5)) + 98 | guides(color=guide_legend(override.aes = list(size=2.5))) + 99 | xlab('Fitted') + 100 | ylab('Residual') 101 | 102 | ggplot(data=model_fit, 103 | aes(.resid)) + 104 | geom_histogram(aes(fill=continent)) + 105 | facet_grid(~continent) + 106 | ggtitle('Residual Distribution') + 107 | theme_bw() + 108 | scale_y_continuous(expand = c(0,0,0.05,0)) + 109 | theme(legend.position='none', 110 | plot.title = element_text(lineheight=1, face="bold",hjust = 0.5)) + 111 | guides(color=guide_legend(override.aes = list(size=2.5))) + 112 | xlab('Residual') + 113 | ylab('Count') 114 | ``` 115 | 116 | ```{r modeldisplay,results='asis',warning=F} 117 | kable(stats,format='markdown',digits=2) %>% 118 | kable_styling(bootstrap_options = c("striped",'border')) 119 | 120 | kable(coefficients,format='markdown',digits=4) %>% 121 | kable_styling(bootstrap_options = c("striped",'border')) 122 | ``` 123 | 124 | ## Nested Models 125 | 126 | Now we create a similar model with nesting 127 | 128 | ```{r} 129 | my_model <- function(df) { 130 | lm(lifeExp ~ log10(gdpPercap)+log10(pop) + year, data= df) 131 | } 132 | 133 | # Nest models by continent 134 | nested_models <- gapminder %>% 135 | group_by(continent,country) %>% 136 | nest() %>% 137 | # fit models 138 | mutate(fit = map(data, my_model)) %>% 139 | # calculate residuals 140 | mutate(augment = map(fit, augment), 141 | stats = map(fit,glance), 142 | terms = map(fit,tidy)) %>% 143 | ungroup() 144 | 145 | # Dataset with predictions and residuals 146 | nest_fit <- nested_models %>% unnest(augment) 147 | 148 | nest_stats <- nested_models %>% 149 | unnest(stats,.drop=TRUE) %>% 150 | arrange(desc(r.squared)) 151 | 152 | nest_coefficients <- nested_models %>% 153 | unnest(terms,.drop=TRUE) %>% 154 | filter(term != 
'(Intercept)') %>% 155 | arrange(continent,country,p.value) 156 | # terms are sorted by ascending p-value within each country, so slice(1) keeps the most significant term 157 | most_important_vars <- nest_coefficients %>% 158 | group_by(country) %>% 159 | slice(1) 160 | 161 | summ_imp_vars <- most_important_vars %>% 162 | group_by(continent) %>% 163 | count(term) %>% 164 | arrange(continent,desc(n)) 165 | ``` 166 | 167 | 168 | -------------------------------------------------------------------------------- /R/Modeling_Workflow.md: -------------------------------------------------------------------------------- 1 | Modeling Workflow 2 | ================ 3 | Jesse Cambon 4 | 24 November, 2019 5 | 6 | Demonstrate model workflows with tidyverse, modelr, and broom. This 7 | notebook includes both a group\_by and a nested approach which offer 8 | similar results. However, the nested model workflow embeds the data into 9 | the dataframe along with objects such as models. 10 | 11 | ## References 12 | 13 | - 14 | 15 | ## Setup 16 | 17 | ``` r 18 | library(tidyverse) 19 | library(gapminder) 20 | library(broom) 21 | #library(modelr) 22 | library(knitr) 23 | library(kableExtra) 24 | ``` 25 | 26 | ## Exploration 27 | 28 | These graphs show why log transforming GDP per Capita makes it correlate 29 | more linearly to our response variable, life expectancy. Log 30 | transformations are often useful for highly skewed variables in 31 | regression. 
32 | 33 | ``` r 34 | ggplot(data=gapminder, 35 | aes(x = gdpPercap, y = lifeExp, color = continent,group=1)) + 36 | geom_point(alpha=0.7) + 37 | theme_bw() + 38 | geom_smooth() + 39 | theme(legend.position='top', 40 | plot.title = element_text(lineheight=1, face="bold",hjust = 0.5)) + 41 | guides(color=guide_legend(override.aes = list(size=2.5))) 42 | ``` 43 | 44 | ## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")' 45 | 46 | ![](../rmd_images/Modeling_Workflow/explore-1.png) 47 | 48 | ``` r 49 | ggplot(data=gapminder, 50 | aes(x = log10(gdpPercap), y = lifeExp, color = continent,group=1)) + 51 | geom_point(alpha=0.7) + 52 | theme_bw() + 53 | geom_smooth() + 54 | theme(legend.position='top', 55 | plot.title = element_text(lineheight=1, face="bold",hjust = 0.5)) + 56 | guides(color=guide_legend(override.aes = list(size=2.5))) 57 | ``` 58 | 59 | ## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")' 60 | 61 | ![](../rmd_images/Modeling_Workflow/explore-2.png) 62 | 63 | ``` r 64 | ggplot(data=gapminder, 65 | aes(x = log10(pop), y = lifeExp, color = continent,group=1)) + 66 | geom_point(alpha=0.7) + 67 | #facet_grid(~continent) + 68 | theme_bw() + 69 | geom_smooth() + 70 | theme(legend.position='top', 71 | plot.title = element_text(lineheight=1, face="bold",hjust = 0.5)) + 72 | guides(color=guide_legend(override.aes = list(size=2.5))) 73 | ``` 74 | 75 | ## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")' 76 | 77 | ![](../rmd_images/Modeling_Workflow/explore-3.png) 78 | 79 | ## Grouped Models 80 | 81 | ``` r 82 | # One model per continent 83 | models <- gapminder %>% 84 | group_by(continent) %>% 85 | do(fit=lm(lifeExp ~ log10(gdpPercap)+log10(pop) + year, data=.)) 86 | 87 | stats <- glance(models,fit) %>% 88 | arrange(desc(r.squared)) 89 | 90 | coefficients <- tidy(models,fit) %>% 91 | filter(term != '(Intercept)') %>% 92 | arrange(continent,p.value) 93 | 94 | model_fit <- augment(models,fit) 95 | ``` 96 | 
97 | ``` r 98 | ggplot(data=model_fit, 99 | aes(x = .fitted, y = .resid, color = continent,group=1)) + 100 | geom_point(alpha=0.8) + 101 | facet_grid(~continent) + 102 | ggtitle('Fitted vs. Residual Check') + 103 | theme_bw() + 104 | geom_hline(yintercept=0,color='blue') + # horizontal line at 0 residual 105 | theme(legend.position='none', 106 | plot.title = element_text(lineheight=1, face="bold",hjust = 0.5)) + 107 | guides(color=guide_legend(override.aes = list(size=2.5))) + 108 | xlab('Fitted') + 109 | ylab('Residual') 110 | ``` 111 | 112 | ![](../rmd_images/Modeling_Workflow/plot-1.png) 113 | 114 | ``` r 115 | ggplot(data=model_fit, 116 | aes(.resid)) + 117 | geom_histogram(aes(fill=continent)) + 118 | facet_grid(~continent) + 119 | ggtitle('Residual Distribution') + 120 | theme_bw() + 121 | scale_y_continuous(expand = c(0,0,0.05,0)) + 122 | theme(legend.position='none', 123 | plot.title = element_text(lineheight=1, face="bold",hjust = 0.5)) + 124 | guides(color=guide_legend(override.aes = list(size=2.5))) + 125 | xlab('Residual') + 126 | ylab('Count') 127 | ``` 128 | 129 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 
130 | 131 | ![](../rmd_images/Modeling_Workflow/plot-2.png) 132 | 133 | ``` r 134 | kable(stats,format='markdown',digits=2) %>% 135 | kable_styling(bootstrap_options = c("striped",'border')) 136 | ``` 137 | 138 | | continent | r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | 139 | | :-------- | --------: | ------------: | ----: | --------: | ------: | -: | --------: | ------: | ------: | -------: | ----------: | 140 | | Oceania | 0.96 | 0.96 | 0.78 | 172.97 | 0 | 4 | \-26.03 | 62.06 | 67.95 | 12.30 | 20 | 141 | | Europe | 0.80 | 0.80 | 2.41 | 487.82 | 0 | 4 | \-825.98 | 1661.96 | 1681.39 | 2073.54 | 356 | 142 | | Americas | 0.72 | 0.72 | 4.96 | 255.52 | 0 | 4 | \-903.93 | 1817.85 | 1836.37 | 7274.08 | 296 | 143 | | Asia | 0.70 | 0.70 | 6.50 | 308.12 | 0 | 4 | \-1301.08 | 2612.15 | 2632.06 | 16558.14 | 392 | 144 | | Africa | 0.50 | 0.50 | 6.48 | 207.77 | 0 | 4 | \-2049.22 | 4108.45 | 4130.63 | 26011.51 | 620 | 145 | 146 | ``` r 147 | kable(coefficients,format='markdown',digits=4) %>% 148 | kable_styling(bootstrap_options = c("striped",'border')) 149 | ``` 150 | 151 | | continent | term | estimate | std.error | statistic | p.value | 152 | | :-------- | :--------------- | -------: | --------: | --------: | ------: | 153 | | Africa | year | 0.2551 | 0.0160 | 15.8991 | 0.0000 | 154 | | Africa | log10(gdpPercap) | 11.0142 | 0.7141 | 15.4237 | 0.0000 | 155 | | Africa | log10(pop) | \-0.5390 | 0.4192 | \-1.2857 | 0.1990 | 156 | | Americas | log10(gdpPercap) | 18.5492 | 1.1513 | 16.1118 | 0.0000 | 157 | | Americas | year | 0.2690 | 0.0179 | 15.0519 | 0.0000 | 158 | | Americas | log10(pop) | \-1.9190 | 0.5545 | \-3.4607 | 0.0006 | 159 | | Asia | log10(gdpPercap) | 12.6233 | 0.7074 | 17.8454 | 0.0000 | 160 | | Asia | year | 0.2974 | 0.0219 | 13.5703 | 0.0000 | 161 | | Asia | log10(pop) | 2.0425 | 0.4854 | 4.2077 | 0.0000 | 162 | | Europe | log10(gdpPercap) | 11.5695 | 0.4930 | 23.4667 | 0.0000 | 163 | | Europe | year 
| 0.1005 | 0.0091 | 11.0939 | 0.0000 | 164 | | Europe | log10(pop) | \-1.0054 | 0.2244 | \-4.4804 | 0.0000 | 165 | | Oceania | year | 0.1737 | 0.0384 | 4.5299 | 0.0002 | 166 | | Oceania | log10(pop) | 0.6644 | 0.5984 | 1.1102 | 0.2801 | 167 | | Oceania | log10(gdpPercap) | 4.1229 | 4.9721 | 0.8292 | 0.4168 | 168 | 169 | ## Nested Models 170 | 171 | Now we create a similar model with nesting 172 | 173 | ``` r 174 | my_model <- function(df) { 175 | lm(lifeExp ~ log10(gdpPercap)+log10(pop) + year, data= df) 176 | } 177 | 178 | # Nest models by continent 179 | nested_models <- gapminder %>% 180 | group_by(continent,country) %>% 181 | nest() %>% 182 | # fit models 183 | mutate(fit = map(data, my_model)) %>% 184 | # calculate residuals 185 | mutate(augment = map(fit, augment), 186 | stats = map(fit,glance), 187 | terms = map(fit,tidy)) %>% 188 | ungroup() 189 | 190 | # Dataset with predictions and residuals 191 | nest_fit <- nested_models %>% unnest(augment) 192 | 193 | nest_stats <- nested_models %>% 194 | unnest(stats,.drop=TRUE) %>% 195 | arrange(desc(r.squared)) 196 | ``` 197 | 198 | ## Warning: The `.drop` argument of `unnest()` is deprecated as of tidyr 1.0.0. 199 | ## All list-columns are now preserved. 200 | ## This warning is displayed once per session. 201 | ## Call `lifecycle::last_warnings()` to see where this warning was generated. 
202 | 203 | ``` r 204 | nest_coefficients <- nested_models %>% 205 | unnest(terms,.drop=TRUE) %>% 206 | filter(term != '(Intercept)') %>% 207 | arrange(continent,country,desc(p.value)) 208 | 209 | most_important_vars <- nest_coefficients %>% 210 | group_by(country) %>% 211 | slice(1) 212 | 213 | summ_imp_vars <- most_important_vars %>% 214 | group_by(continent) %>% 215 | count(term) %>% 216 | arrange(continent,desc(n)) 217 | ``` 218 | -------------------------------------------------------------------------------- /R/Multilevel-Models.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Multilevel Models" 3 | author: "Jesse Cambon" 4 | date: "`r format(Sys.time(), '%B, %Y')`" 5 | output: 6 | github_document: 7 | toc: true 8 | --- 9 | 10 | ```{r knit-settings, include=FALSE} 11 | library(here) 12 | source(here("rmd_config.R")) 13 | ``` 14 | 15 | References: 16 | 17 | - https://cran.r-project.org/web/packages/lme4/vignettes/lmer.pdf 18 | - https://www.rensvandeschoot.com/tutorials/lme4/ 19 | 20 | 21 | ```{r setup, message = F, warning = F} 22 | library(lme4) 23 | library(broom.mixed) 24 | library(rstanarm) 25 | library(bayesplot) 26 | library(tidyverse) 27 | library(bayestestR) 28 | ``` 29 | 30 | 31 | ```{r} 32 | fm1 <- lmer(Reaction ~ Days + (1 + Days | Subject), data = sleepstudy) 33 | ``` 34 | 35 | ```{r} 36 | #sleepstudy 37 | ``` 38 | ```{r} 39 | # Overall Trend 40 | sleepstudy %>% 41 | ggplot(aes(x = Days, y = Reaction)) + 42 | geom_point() + geom_smooth(method = 'lm') 43 | 44 | # 45 | sleepstudy %>% 46 | ggplot(aes(x = Days, y = Reaction)) + 47 | facet_wrap(~Subject) + 48 | geom_point() + geom_smooth(method = 'lm') 49 | ``` 50 | 51 | 52 | 53 | ```{r} 54 | tidy(fm1) 55 | glance(fm1) 56 | ``` 57 | 58 | ## Bayesian approach 59 | 60 | https://mc-stan.org/users/documentation/case-studies/tutorial_rstanarm.html 61 | 62 | ```{r} 63 | bm1 <- stan_lmer(Reaction ~ Days + (1 + Days | Subject), data = sleepstudy) 64 | 
``` 65 | 66 | ```{r} 67 | pp_check(bm1) 68 | ``` 69 | 70 | 71 | 72 | ```{r, fig.height = 8, fig.width = 5} 73 | mcmc_areas(bm1) 74 | ``` 75 | 76 | Posterior predictive check 77 | 78 | ```{r} 79 | ppc_ribbon_grouped( 80 | y = sleepstudy$Reaction, 81 | yrep = posterior_predict(bm1), 82 | x = sleepstudy$Days, 83 | prob = 0.5, 84 | group = sleepstudy$Subject 85 | ) 86 | ``` 87 | 88 | ```{r} 89 | plot(p_direction(bm1, effects = "fixed", component = "all")) 90 | plot(p_direction(bm1, effects = "random", component = "all")) 91 | 92 | p_direction(bm1, effects = 'all') 93 | ``` 94 | 95 | 96 | ```{r} 97 | summary(bm1) 98 | ``` 99 | 100 | -------------------------------------------------------------------------------- /R/Ordinal_Regression.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Ordinal Regression" 3 | author: "Jesse Cambon" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | output: 6 | github_document: 7 | toc: true 8 | --- 9 | 10 | GAM ordinal regression: https://stat.ethz.ch/R-manual/R-devel/library/mgcv/html/ocat.html 11 | Example using polr: https://stats.idre.ucla.edu/r/dae/ordinal-logistic-regression/ 12 | Explanation of GAM interpretation: https://stats.stackexchange.com/questions/226645/generalized-additive-model-interpretation-with-ordered-categorical-family-in-r 13 | 14 | ```{r knit-settings, include=FALSE} 15 | library(here) 16 | source(here("rmd_config.R")) 17 | ``` 18 | 19 | ```{r, warning=F} 20 | #library(Hmisc) 21 | library(MASS) # polr() 22 | library(car) 23 | library(mgcv) # gam model 24 | library(mgcViz) # gam visualization 25 | library(ordinal) # clm() 26 | library(broom) 27 | library(tidyverse) 28 | 29 | # Find frequency counts for all variables in var list 30 | 31 | var_freq <- function(data,var) { 32 | var <- rlang::sym(var) 33 | print(var) 34 | # print(quo_name(var)) 35 | if (is.factor(data %>% pull(!!var)) | is.character(data %>% pull(!!var))) { 36 | return(data %>% count(!!var) %>% 
mutate(term=quo_name(var)) %>% 37 | rename(level=!!var) %>% 38 | mutate(level=as.character(level), # convert to char 39 | is_categorical=1)) 40 | } else { 41 | return(tibble()) 42 | } 43 | } 44 | 45 | # Call var_freq() on each variable name in var_list and stack the results into 46 | # one dataset of level frequencies (non-categorical variables contribute no rows) 47 | find_all_freqs <- function(data,var_list) { 48 | all_freqs <- tibble() 49 | for (var in var_list) { 50 | all_freqs <- all_freqs %>% 51 | bind_rows(var_freq(data,var)) 52 | } 53 | return(all_freqs) 54 | } 55 | 56 | # Obtain the right-hand-side terms of a model formula (split on ' + '), dropping smooth terms (s()) 57 | obtain_model_varlist <- function(model_obj) { 58 | var_list_raw <- unlist(strsplit(as.character(model_obj$formula[3]),split=' \\+ ')) 59 | # Remove smooth terms (s()) 60 | return(var_list_raw[!str_detect(var_list_raw,'^s\\(')]) 61 | } 62 | 63 | # Adds term_name, level, frequency count (n) and a display label to a tidy dataframe of model terms 64 | add_termnames <- function(data,term_freqs,var_list) { 65 | # Regex alternation intended to match any varname when it begins a string 66 | varregex <- paste(str_replace(var_list,'^','\\^'), collapse = "|") 67 | 68 | return( 69 | data %>% 70 | mutate(term_name = str_extract(term,varregex), 71 | level = case_when(!is.na(term_name) ~ str_replace(term,varregex,""))) %>% 72 | # add in frequency counts and labels 73 | left_join(term_freqs,by=c('term_name'='term','level')) %>% 74 | mutate(label=case_when(is.na(n) ~ term, # if not categorical then use original label 75 | is_categorical == 1 ~ str_c(term_name,': ', level,' (',scales::comma(n),')'), 76 | TRUE ~ str_c(level,' (',scales::comma(n),')'))) 77 | 78 | ) 79 | 80 | } 81 | 82 | 83 | ``` 84 | 85 | 86 | ```{r} 87 | 88 | Mydiamonds <- diamonds %>% 89 | # convert factor to numeric for gam model 90 | mutate(cutN=as.numeric(cut), 91 | # convert to non-ordered factors 92 | color=factor(color,ordered=F), 93 | clarity=factor(clarity,ordered=F) 94 | ) 95 | 96 | # NOTE(review): the original comment here referred to a 'wine' dataset, which this notebook does not use 97 | 98 | outcomeVar <- 'cut' 99 | predictors <- 'carat + 
color + clarity' 100 | 101 | # Construct formula from strings 102 | lmformula <- as.formula(str_c(outcomeVar,' ~ ',predictors)) 103 | 104 | # train ordinal logistic models 105 | clm_model <- clm(lmformula, data=Mydiamonds) 106 | polr_model <- polr(lmformula, data=Mydiamonds) 107 | # train ordinal GAM model (R is the number of outcome categories) 108 | gam_model <- gam(cutN ~ s(carat) + color + clarity,family=ocat(R=5),data=Mydiamonds) 109 | 110 | gam.check(gam_model) 111 | 112 | # Check for collinearity 113 | concurvity(gam_model) 114 | vif(polr_model) 115 | 116 | ``` 117 | 118 | 119 | ```{r} 120 | 121 | # Find categorical variables and the 122 | # frequency counts of their levels 123 | gam_varlist <- obtain_model_varlist(gam_model) 124 | gam_varfreqs <- find_all_freqs(Mydiamonds,gam_varlist) 125 | 126 | # Evaluate models 127 | clm_stats <- glance(clm_model) 128 | clm_coef <- tidy(clm_model,exponentiate=T) 129 | 130 | polr_stats <- glance(polr_model) 131 | polr_coef <- tidy(polr_model,exponentiate=T) 132 | 133 | gam_stats <- glance(gam_model) 134 | gam_Lcoef <- tidy(gam_model,parametric=T) %>% # get parametric coefficients 135 | add_termnames(gam_varfreqs,gam_varlist) 136 | gam_Scoef <- tidy(gam_model,parametric=F) # get smooth term coefficients 137 | 138 | # gam_allpvalues <- gam_Lcoef %>% 139 | # dplyr::select(term,p.value) %>% 140 | # bind_rows(gam_Scoef %>% select(term,p.value)) %>% 141 | # arrange(p.value) 142 | 143 | # Extract probability predictions from GAM 144 | gam_probs <- predict(gam_model,type='response') %>% 145 | # remove "V" from column names so we now have the class labels 146 | as.data.frame() %>% rename_all(list(replace= ~str_replace_all(.,'V',''))) %>% 147 | mutate(obs_num=1:nrow(.)) %>% 148 | gather(class,prob,-obs_num) %>% 149 | mutate(class=as.numeric(class)) %>% arrange(obs_num,class) 150 | 151 | # Extract class predictions 152 | gam_pred <- gam_probs %>% group_by(obs_num) %>% 153 | filter(prob==max(prob)) 154 | 155 | # Compare predictions 
of polr() and clm() 156 | compare_models <- Mydiamonds %>% 157 | # clm predictions returned as list for some reason 158 | # have to unlist it so we can put it in a column 159 | mutate(clm_pred=unlist(predict(clm_model,type='class')), 160 | polr_pred=predict(polr_model,type='class'), 161 | gam_pred=gam_pred %>% pull(class)) %>% 162 | mutate_all(as.numeric) # convert from factor to numeric 163 | 164 | # Make frequency tables 165 | # freq_preds <- compare_models %>% count(polr_pred,clm_pred) 166 | # freq_predcheck <- compare_models %>% count(cut,clm_pred) 167 | 168 | # Chi square test 169 | # chisq.test(freq_preds) 170 | # chisq.test(freq_predcheck) 171 | 172 | #Spearman correlations 173 | cor(compare_models$cut,compare_models$clm_pred,method='spearman') 174 | cor(compare_models$cut,compare_models$polr_pred,method='spearman') 175 | cor(compare_models$cut,compare_models$gam_pred,method='spearman') 176 | 177 | 178 | ``` 179 | 180 | ```{r,results='asis'} 181 | ggplot(data=gam_Lcoef %>% filter(label != '(Intercept)'), 182 | aes(x = reorder(label,-estimate), y = exp(estimate))) + 183 | geom_point() + 184 | scale_y_continuous(breaks=seq(0,10,2),limits=c(0,10)) + 185 | geom_hline(yintercept=1,color='grey') + 186 | coord_flip() + 187 | theme_classic() + 188 | #geom_pointrange(mapping=aes(ymin=LCLM, ymax=UCLM)) + 189 | labs(title='Odds Ratios of Parametric Terms', 190 | caption='Sample sizes shown in ()') + 191 | xlab('Term') + ylab('Odds Ratio') 192 | ``` 193 | 194 | 195 | 196 | 197 | ```{r} 198 | # Confusion matrixes 199 | 200 | check_gam <- compare_models %>% count(cut,gam_pred) %>% 201 | spread(cut,n,fill=0) 202 | 203 | check_clm <- compare_models %>% count(cut,clm_pred) %>% 204 | spread(cut,n,fill=0) 205 | 206 | ``` 207 | 208 | ## Extract data from smooths and plot 209 | 210 | This method allows us some more direct control over how we plot the smooth terms since we extract the plot data. Alternatively, mgcViz (shown below) can be used. 
211 | 212 | ```{r} 213 | 214 | # Returns the data to plot all smooth terms in a gam model object 215 | # 100 points per plot 216 | smooth_data <- function(gam_model) { 217 | # select=0 prevents plots being shown on screen 218 | gam_viz <- plot(gam_model, rug=FALSE,select=0) 219 | 220 | num_smooths <- length(gam_viz) # number of smooth terms 221 | smooth_df <- tibble() # initialize a dataframe 222 | # seq_len() gives an empty loop when there are no smooth terms (1:0 would iterate over c(1, 0)) 223 | for (i in seq_len(num_smooths)) { 224 | print(gam_viz[[i]]$xlab) 225 | # extract and append data we want 226 | smooth_df <- smooth_df %>% 227 | bind_rows(tibble( xlab=gam_viz[[i]]$xlab, 228 | ylab=gam_viz[[i]]$ylab, 229 | x=gam_viz[[i]]$x, 230 | fit=gam_viz[[i]]$fit, 231 | se=gam_viz[[i]]$se 232 | )) 233 | } 234 | return(smooth_df) 235 | } 236 | 237 | gam_smoothdata <- smooth_data(gam_model) 238 | 239 | ggplot(gam_smoothdata, 240 | aes(x, fit)) + 241 | facet_wrap(~xlab,scales='free') + 242 | geom_line() + 243 | theme_minimal() + 244 | geom_line(aes(y=fit+(2*se)),linetype='dashed') + 245 | geom_line(aes(y=fit-(2*se)),linetype='dashed') + 246 | scale_y_continuous() + 247 | scale_x_continuous(labels=scales::comma) 248 | ``` 249 | 250 | 251 | 252 | ## Alternatively, Plot Smooth Terms with MgcViz 253 | 254 | ```{r} 255 | gam_viz <- getViz(gam_model) 256 | 257 | plot(sm(gam_viz, 1)) + 258 | l_fitLine(colour = "red") + 259 | # l_rug(mapping = aes(x=x, y=y), alpha = 0.8) + 260 | l_ciLine(mul = 5, colour = "blue", linetype = 2) + 261 | # l_points(shape = 19, size = 1, alpha = 0.1) + 262 | theme_classic() 263 | ``` 264 | 265 | ```{r} 266 | print(plot(gam_viz, allTerms = T), pages = 1) 267 | ``` 268 | 269 | -------------------------------------------------------------------------------- /R/Parsnip.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Parsnip" 3 | author: "Jesse Cambon" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | output: 6 | github_document: 7 | toc: true 8 | --- 9 | 10 | 11 | ```{r knit-settings, 
include=FALSE} 12 | library(here) 13 | source(here("rmd_config.R")) 14 | ``` 15 | 16 | References: 17 | * https://tidymodels.github.io/parsnip/articles/parsnip_Intro.html 18 | * https://www.tidyverse.org/blog/2018/11/parsnip-0-0-1/ 19 | * https://m-clark.github.io/workshops/bayesian/04_R.html 20 | 21 | Requires packages: ranger, randomForest, rstanarm 22 | 23 | ## Setup 24 | 25 | ```{r} 26 | library(parsnip) 27 | library(tidymodels) 28 | 29 | set.seed(4831) # set seed for reproducibility 30 | # Split data: 90% training, 10% testing 31 | split <- initial_split(mtcars, prop = 9/10) 32 | car_train <- training(split) 33 | car_test <- testing(split) 34 | 35 | # Defines type of model we want 36 | car_model <- linear_reg() 37 | ``` 38 | 39 | ## Bayesian Model 40 | 41 | 42 | ```{r} 43 | library(rstanarm) 44 | 45 | wide_prior <- normal(0, 10) 46 | 47 | stan_car_model <- 48 | car_model %>% 49 | set_engine("stan", iter = 5000, prior = wide_prior, seed = 2347) 50 | stan_car_model 51 | ``` 52 | 53 | ```{r} 54 | # don't print anything: 55 | ctrl <- fit_control(verbosity = 0) 56 | 57 | stan_fit <- 58 | stan_car_model %>% 59 | fit(mpg ~ ., data = car_train, control = ctrl) 60 | stan_fit 61 | ``` 62 | 63 | 64 | ```{r} 65 | predict(stan_fit, car_test,type='conf_int') 66 | ``` 67 | 68 | 69 | ```{r} 70 | library(tidybayes) 71 | library(bayesplot) 72 | 73 | # tidybayes to extract info 74 | stan_fit$fit %>% get_variables() 75 | 76 | 77 | ## Use bayesplot to plot 78 | posterior <- as.matrix(stan_fit$fit) 79 | plot_title <- ggtitle("Posterior distributions", 80 | "with medians and 80% intervals") 81 | mcmc_areas(posterior, 82 | pars = c("cyl", "drat", "am", "wt"), 83 | prob = .8) + plot_title 84 | 85 | #stan_fit %>% spread_draws() 86 | ``` 87 | 88 | ```{r} 89 | pp_check(stan_fit$fit) 90 | ``` 91 | 92 | ```{r} 93 | library(shinystan) 94 | launch_shinystan(stan_fit$fit) 95 | 96 | ``` 97 | 98 | 99 | 100 | ## Random Forests 101 | 102 | ```{r} 103 | 104 | rf_with_seed <- 105 | rand_forest(trees = 2000, mtry = 
varying(), mode = "regression") %>% 106 | set_engine("ranger", seed = 63233) 107 | 108 | # Fit with ranger (set_engine() replaces the engine arguments, so the seed is passed again) 109 | ranger_model <- rf_with_seed %>% 110 | set_args(mtry = 4) %>% 111 | set_engine("ranger",keep.inbag=TRUE, seed = 63233) %>% 112 | fit(mpg ~ ., data = mtcars) 113 | 114 | # Fit with the randomForest package 115 | rf_model <- rf_with_seed %>% 116 | set_args(mtry = 4) %>% 117 | set_engine("randomForest") %>% 118 | fit(mpg ~ ., data = mtcars) 119 | ``` 120 | 121 | ```{r} 122 | ranger_predictions <- predict(ranger_model, mtcars, type = "conf_int") 123 | ``` 124 | 125 | -------------------------------------------------------------------------------- /R/Parsnip.md: -------------------------------------------------------------------------------- 1 | Parsnip 2 | ================ 3 | Jesse Cambon 4 | 12 April, 2020 5 | 6 | References: \* 7 | \* 8 | \* 9 | 10 | 11 | Requires packages: ranger, randomForest, rstanarm 12 | 13 | ## Setup 14 | 15 | ``` r 16 | library(parsnip) 17 | library(tidymodels) 18 | ``` 19 | 20 | ## ── Attaching packages ────────────────────────────────────────────────────────────────────── tidymodels 0.1.0 ── 21 | 22 | ## ✓ broom 0.5.5 ✓ recipes 0.1.9 23 | ## ✓ dials 0.0.6 ✓ rsample 0.0.6 24 | ## ✓ dplyr 0.8.5 ✓ tibble 2.1.3 25 | ## ✓ ggplot2 3.3.0 ✓ tune 0.1.0 26 | ## ✓ infer 0.5.1 ✓ workflows 0.1.1 27 | ## ✓ purrr 0.3.3 ✓ yardstick 0.0.6 28 | 29 | ## ── Conflicts ───────────────────────────────────────────────────────────────────────── tidymodels_conflicts() ── 30 | ## x purrr::discard() masks scales::discard() 31 | ## x dplyr::filter() masks stats::filter() 32 | ## x recipes::fixed() masks stringr::fixed() 33 | ## x dplyr::lag() masks stats::lag() 34 | ## x ggplot2::margin() masks dials::margin() 35 | ## x recipes::step() masks stats::step() 36 | ## x recipes::yj_trans() masks scales::yj_trans() 37 | 38 | ``` r 39 | set.seed(4831) # set seed for reproducibility 40 | # Split data 41 | split <- initial_split(mtcars, props = 9/10) 42 | car_train <- 
training(split) 43 | car_test <- testing(split) 44 | 45 | # Defines type of model we want 46 | car_model <- linear_reg() 47 | ``` 48 | 49 | ## Bayesian Model 50 | 51 | ``` r 52 | library(rstanarm) 53 | ``` 54 | 55 | ## Loading required package: Rcpp 56 | 57 | ## 58 | ## Attaching package: 'Rcpp' 59 | 60 | ## The following object is masked from 'package:rsample': 61 | ## 62 | ## populate 63 | 64 | ## rstanarm (Version 2.19.3, packaged: 2020-02-11 05:16:41 UTC) 65 | 66 | ## - Do not expect the default priors to remain the same in future rstanarm versions. 67 | 68 | ## Thus, R scripts should specify priors explicitly, even if they are just the defaults. 69 | 70 | ## - For execution on a local, multicore CPU with excess RAM we recommend calling 71 | 72 | ## options(mc.cores = parallel::detectCores()) 73 | 74 | ## - bayesplot theme set to bayesplot::theme_default() 75 | 76 | ## * Does _not_ affect other ggplot2 plots 77 | 78 | ## * See ?bayesplot_theme_set for details on theme setting 79 | 80 | ``` r 81 | wide_prior <- normal(0, 10) 82 | 83 | stan_car_model <- 84 | car_model %>% 85 | set_engine("stan", iter = 5000, prior = wide_prior, seed = 2347) 86 | stan_car_model 87 | ``` 88 | 89 | ## Linear Regression Model Specification (regression) 90 | ## 91 | ## Engine-Specific Arguments: 92 | ## iter = 5000 93 | ## prior = wide_prior 94 | ## seed = 2347 95 | ## 96 | ## Computational engine: stan 97 | 98 | ``` r 99 | # don't print anything: 100 | ctrl <- fit_control(verbosity = 0) 101 | 102 | stan_fit <- 103 | stan_car_model %>% 104 | fit(mpg ~ ., data = car_train, control = ctrl) 105 | stan_fit 106 | ``` 107 | 108 | ## parsnip model object 109 | ## 110 | ## Fit time: 6s 111 | ## stan_glm 112 | ## family: gaussian [identity] 113 | ## formula: mpg ~ . 
114 | ## observations: 24 115 | ## predictors: 11 116 | ## ------ 117 | ## Median MAD_SD 118 | ## (Intercept) -10.9 32.4 119 | ## cyl 0.8 1.9 120 | ## disp 0.0 0.0 121 | ## hp 0.0 0.0 122 | ## drat 2.4 2.3 123 | ## wt -3.3 2.3 124 | ## qsec 1.0 0.9 125 | ## vs 1.5 2.9 126 | ## am 3.4 2.8 127 | ## gear 2.6 2.8 128 | ## carb -0.9 1.3 129 | ## 130 | ## Auxiliary parameter(s): 131 | ## Median MAD_SD 132 | ## sigma 3.0 0.6 133 | ## 134 | ## ------ 135 | ## * For help interpreting the printed output see ?print.stanreg 136 | ## * For info on the priors used see ?prior_summary.stanreg 137 | 138 | ``` r 139 | predict(stan_fit, car_test,type='conf_int') 140 | ``` 141 | 142 | ## # A tibble: 8 x 2 143 | ## .pred_lower .pred_upper 144 | ## 145 | ## 1 17.2 26.1 146 | ## 2 11.1 18.9 147 | ## 3 11.2 19.3 148 | ## 4 6.95 18.1 149 | ## 5 13.2 29.4 150 | ## 6 13.5 21.7 151 | ## 7 26.0 31.3 152 | ## 8 12.5 36.4 153 | 154 | ``` r 155 | library(tidybayes) 156 | ``` 157 | 158 | ## 159 | ## Attaching package: 'tidybayes' 160 | 161 | ## The following object is masked from 'package:tune': 162 | ## 163 | ## parameters 164 | 165 | ## The following object is masked from 'package:dials': 166 | ## 167 | ## parameters 168 | 169 | ``` r 170 | library(bayesplot) 171 | ``` 172 | 173 | ## This is bayesplot version 1.7.1 174 | 175 | ## - Online documentation and vignettes at mc-stan.org/bayesplot 176 | 177 | ## - bayesplot theme set to bayesplot::theme_default() 178 | 179 | ## * Does _not_ affect other ggplot2 plots 180 | 181 | ## * See ?bayesplot_theme_set for details on theme setting 182 | 183 | ``` r 184 | # tidybayes to extract info 185 | stan_fit$fit %>% get_variables() 186 | ``` 187 | 188 | ## [1] "(Intercept)" "cyl" "disp" "hp" 189 | ## [5] "drat" "wt" "qsec" "vs" 190 | ## [9] "am" "gear" "carb" "sigma" 191 | ## [13] "accept_stat__" "stepsize__" "treedepth__" "n_leapfrog__" 192 | ## [17] "divergent__" "energy__" 193 | 194 | ``` r 195 | ## Use bayesplot to plot 196 | posterior <- 
as.matrix(stan_fit$fit) 197 | plot_title <- ggtitle("Posterior distributions", 198 | "with medians and 80% intervals") 199 | mcmc_areas(posterior, 200 | pars = c("cyl", "drat", "am", "wt"), 201 | prob = .8) + plot_title 202 | ``` 203 | 204 | ## Warning: `expand_scale()` is deprecated; use `expansion()` instead. 205 | 206 | ![](../rmd_images/Parsnip/unnamed-chunk-5-1.png) 207 | 208 | ``` r 209 | #stan_fit %>% spread_draws() 210 | ``` 211 | 212 | ``` r 213 | pp_check(stan_fit$fit) 214 | ``` 215 | 216 | ![](../rmd_images/Parsnip/unnamed-chunk-6-1.png) 217 | 218 | ``` r 219 | library(shinystan) 220 | ``` 221 | 222 | ## Loading required package: shiny 223 | 224 | ## 225 | ## This is shinystan version 2.5.0 226 | 227 | ``` r 228 | launch_shinystan(stan_fit$fit) 229 | ``` 230 | 231 | ## 232 | ## Hang on... preparing graphical posterior predictive checks for rstanarm model. 233 | ## See help('shinystan', 'rstanarm') for how to disable this feature. 234 | 235 | ## 236 | ## Launching ShinyStan interface... for large models this may take some time. 
237 | 238 | ## 239 | ## Listening on http://127.0.0.1:6429 240 | 241 | ## Random Forests 242 | 243 | ``` r 244 | rf_with_seed <- 245 | rand_forest(trees = 2000, mtry = varying(), mode = "regression") %>% 246 | set_engine("ranger", seed = 63233) 247 | 248 | # Fig with ranger 249 | ranger_model <- rf_with_seed %>% 250 | set_args(mtry = 4) %>% 251 | set_engine("ranger",keep.inbag=TRUE) %>% 252 | fit(mpg ~ ., data = mtcars) 253 | 254 | # First with random forest package 255 | rf_model <- rf_with_seed %>% 256 | set_args(mtry = 4) %>% 257 | set_engine("randomForest") %>% 258 | fit(mpg ~ ., data = mtcars) 259 | ``` 260 | 261 | ``` r 262 | ranger_predictions <- predict(ranger_model, mtcars, type = "conf_int") 263 | ``` 264 | -------------------------------------------------------------------------------- /R/Power_Analysis.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Power Analysis" 3 | author: "Jesse Cambon" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | output: 6 | github_document: 7 | toc: true 8 | --- 9 | 10 | ```{r knit-settings, include=FALSE} 11 | library(here) 12 | source(here("rmd_config.R")) 13 | ``` 14 | 15 | ```{r} 16 | library(pwr) 17 | pwr.2p.test( 18 | h=ES.h(0.6,0.8), 19 | n=NULL, 20 | sig.level=0.05, 21 | power=0.80, 22 | alternative="two.sided") 23 | 24 | ``` 25 | 26 | 27 | ```{r} 28 | library(pwr) 29 | power.prop.test(n = NULL, 30 | p1 = .6, 31 | p2 = .8, 32 | power=0.8, 33 | sig.level=0.05, 34 | alternative="two.sided" 35 | ) 36 | ``` 37 | 38 | 39 | ### Additional References 40 | 41 | * [Biostat Handbook – Power Analysis](http://www.biostathandbook.com/power.html) 42 | * [Biostat Handbook - Hypothesis Testing](http://www.biostathandbook.com/hypothesistesting.html) 43 | * [UCLA Intro to Power Analysis](https://stats.idre.ucla.edu/other/mult-pkg/seminars/intro-power/) 44 | * [An online power calculator for proportions](https://www.stat.ubc.ca/~rollin/stats/ssize/b2.html) 45 | * [The pwr R 
package](https://cran.r-project.org/web/packages/pwr/vignettes/pwr-vignette.html) – a popular R package used for power analysis 46 | 47 | -------------------------------------------------------------------------------- /R/Power_Analysis.md: -------------------------------------------------------------------------------- 1 | Power Analysis 2 | ================ 3 | Jesse Cambon 4 | 22 November, 2019 5 | 6 | Check with 7 | 8 | ``` r 9 | library(pwr) 10 | pwr.2p.test( 11 | h=ES.h(0.6,0.8), 12 | n=NULL, 13 | sig.level=0.05, 14 | power=0.80, 15 | alternative="two.sided") 16 | ``` 17 | 18 | ## 19 | ## Difference of proportion power calculation for binomial distribution (arcsine transformation) 20 | ## 21 | ## h = 0.4421432 22 | ## n = 80.29912 23 | ## sig.level = 0.05 24 | ## power = 0.8 25 | ## alternative = two.sided 26 | ## 27 | ## NOTE: same sample sizes 28 | 29 | ``` r 30 | library(pwr) 31 | power.prop.test(n = NULL, 32 | p1 = .6, 33 | p2 = .8, 34 | power=0.8, 35 | sig.level=0.05, 36 | alternative="two.sided" 37 | ) 38 | ``` 39 | 40 | ## 41 | ## Two-sample comparison of proportions power calculation 42 | ## 43 | ## n = 81.22424 44 | ## p1 = 0.6 45 | ## p2 = 0.8 46 | ## sig.level = 0.05 47 | ## power = 0.8 48 | ## alternative = two.sided 49 | ## 50 | ## NOTE: n is number in *each* group 51 | 52 | ### Additional References 53 | 54 | - [Biostat Handbook – Power 55 | Analysis](http://www.biostathandbook.com/power.html) 56 | - [Biostat Handbook - Hypothesis 57 | Testing](http://www.biostathandbook.com/hypothesistesting.html) 58 | - [UCLA Intro to Power 59 | Analysis](https://stats.idre.ucla.edu/other/mult-pkg/seminars/intro-power/) 60 | - [An online power calculator for 61 | proportions](https://www.stat.ubc.ca/~rollin/stats/ssize/b2.html) 62 | - [The pwr R 63 | package](https://cran.r-project.org/web/packages/pwr/vignettes/pwr-vignette.html) 64 | – a popular R package used for power analysis 65 | 
-------------------------------------------------------------------------------- /R/R-Quickstart.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R Quickstart" 3 | author: "Jesse Cambon" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | output: 6 | github_document: 7 | toc: true 8 | --- 9 | 10 | ```{r knit-settings, include=FALSE} 11 | library(here) 12 | source(here("rmd_config.R")) 13 | ``` 14 | 15 | Simple tidyverse code for common data science operations in R. 16 | 17 | ## Setup 18 | 19 | ```{r setup, message=FALSE, results=FALSE, warning=FALSE} 20 | library(tidyverse) 21 | library(ggplot2) 22 | 23 | # Set default ggplot theme 24 | theme_set(theme_bw()+ 25 | theme(legend.position = "top", 26 | plot.subtitle= element_text(face="bold",hjust=0.5), 27 | plot.title = element_text(lineheight=1, face="bold",hjust = 0.5))) 28 | ``` 29 | 30 | ## Data Manipulation 31 | 32 | ### Warm Up 33 | 34 | Initial 'mpg' Dataset: 35 | ```{r,echo=F} 36 | kable(head(mpg,3)) 37 | ``` 38 | 39 | Use `View(mpg)` to preview the dataset in R. 
40 | 41 | ```{r} 42 | mpg_subset <- mpg %>% 43 | filter(cyl==4 & year >= 2005 & manufacturer == "nissan") %>% 44 | mutate(ratio=hwy/cty, 45 | make_model=str_c(manufacturer,' ',model)) %>% 46 | select(make_model,cyl,year,hwy,cty,ratio) 47 | ``` 48 | 49 | ```{r,echo=F} 50 | kable(mpg_subset) 51 | ``` 52 | 53 | 54 | ### Counting 55 | ```{r} 56 | count_cyl <- mpg %>% 57 | count(cyl) 58 | ``` 59 | 60 | ```{r,echo=F} 61 | kable(count_cyl) 62 | ``` 63 | 64 | ### Calculate Summary Stats 65 | ```{r} 66 | mpg_stats <- mpg %>% select(class,hwy) %>% 67 | mutate(class_c=case_when(class %in% c("2seater","subcompact") ~ "subcompact", 68 | TRUE ~ class)) %>% 69 | group_by(class_c) %>% 70 | summarize(count=n(), 71 | max_hwy=max(hwy), 72 | min_hwy=min(hwy), 73 | median_hwy=median(hwy), 74 | mean_hwy=mean(hwy)) %>% 75 | ungroup() %>% 76 | arrange(desc(count)) # sort dataset 77 | ``` 78 | 79 | Note that '2seater' is reclassified as 'subcompact' 80 | 81 | ```{r,echo=F} 82 | kable(mpg_stats) 83 | ``` 84 | 85 | ### Stacking Data 86 | 87 | 88 | Initial 'mpg' Dataset: 89 | ```{r,echo=F} 90 | kable(head(mpg,3)) 91 | ``` 92 | 93 | 94 | ```{r} 95 | mpg1 <- mpg %>% slice(1:2) %>% 96 | select(manufacturer,model,hwy,cty) %>% 97 | mutate(dataset=1) 98 | 99 | mpg2 <- mpg %>% slice(44:45) %>% 100 | select(manufacturer,model,hwy,cty) %>% 101 | mutate(dataset=2) 102 | 103 | mpg3 <- mpg %>% slice(1:2,5:6) %>% 104 | select(displ,year) 105 | ``` 106 | 107 | Stack vertically and horizontally 108 | ```{r} 109 | mpg_stack_vert <- mpg1 %>% 110 | bind_rows(mpg2) 111 | 112 | mpg_stack_horz <- mpg_stack_vert %>% 113 | bind_cols(mpg3) 114 | ``` 115 | 116 | ### Joining 117 | 118 | ```{r} 119 | car_type <- mpg %>% select(manufacturer,model,class) %>% 120 | distinct() # distinct rows only 121 | 122 | joined <- mpg_stack_horz %>% 123 | left_join(car_type,by=c('manufacturer','model')) %>% 124 | select(-dataset,everything()) 125 | ``` 126 | 127 | ### Long to Wide 128 | 129 | Initial Data: 130 | ```{r,echo=F} 131 | 
kable(head(us_rent_income,4)) 132 | ``` 133 | 134 | * pivot_wider 135 | * names_from: column containing values that we will use for our new column names 136 | 137 | ```{r} 138 | col_ratio <- us_rent_income %>% 139 | select(-GEOID,-moe) %>% 140 | pivot_wider(names_from = variable, values_from = estimate) %>% 141 | drop_na() %>% # drop missing values 142 | mutate(income_rent_ratio = income / (12*rent)) 143 | ``` 144 | 145 | Income and Rent are now in separate columns: 146 | 147 | ```{r,echo=F} 148 | kable(head(col_ratio,4)) 149 | ``` 150 | 151 | ### Wide to Long 152 | 153 | Initial Data: 154 | ```{r,echo=F} 155 | kable(head(world_bank_pop,3)) 156 | ``` 157 | 158 | * pivot_longer 159 | * cols (1st arg): what columns do we want to pivot? (i.e. exclude the ones we don't want to pivot) 160 | * names_to : the name of new column holding the column names as values 161 | * values_to : name of new column containing values 162 | * seq(start, stop, increment) -> generates sequence 163 | 164 | ```{r} 165 | wb_pop <- world_bank_pop %>% 166 | pivot_longer(c(-country,-indicator), names_to = "year", values_to = "value") %>% 167 | mutate(year=as.numeric(year)) %>% # convert to numeric 168 | filter(year %in% seq(2000,2016,2)) 169 | ``` 170 | 171 | After: 172 | ```{r,echo=F} 173 | kable(head(wb_pop,3)) 174 | ``` 175 | 176 | 177 | ## Visualizations 178 | 179 | ### Bar Chart 180 | 181 | * use fill argument in ggplot() to set bar color based on a variable 182 | * reorder() orders the bars 183 | ```{r} 184 | # A simple bar chart - mean highway mpg for each vehicle class 185 | # the reorder command orders our bars in order of descending mean highway mpg 186 | ggplot(data=mpg_stats, 187 | aes(x = reorder(class_c,-mean_hwy), y=mean_hwy)) + 188 | geom_bar(stat='identity',position='dodge',color='black') + 189 | scale_y_continuous(expand = expansion(mult = c(0, .1))) + # plot margins 190 | geom_text(aes(label=round(mean_hwy)), vjust=-0.5) + # labelling 191 | theme(legend.position="none", # no legend (in case we want to 
use fill) 192 | panel.grid = element_blank()) + # turn off grid 193 | labs(title='') + 194 | xlab('') + 195 | ylab('') 196 | ``` 197 | 198 | 199 | ```{r histogram} 200 | # Histogram of highway mpg (binwidth of 1), bars filled by number of cylinders 201 | ggplot(mpg,aes(hwy)) + 202 | geom_histogram(aes(fill=factor(cyl)),binwidth=1) + 203 | scale_y_continuous(expand = expansion(mult = c(0, .05))) + 204 | xlab('Highway mpg') + ylab('Count') + labs(fill='Cylinders') 205 | ``` 206 | 207 | ## Line 208 | 209 | We divide the `value` field by 100 to convert it from a percentage to a decimal proportion. 210 | 211 | SP.POP.GROW is the % population growth 212 | 213 | ```{r line} 214 | ggplot(wb_pop %>% filter(country %in% c("USA","CAN","MEX") & indicator == "SP.POP.GROW"), 215 | aes(x=year,y=value/100,color = country)) + 216 | theme_classic() + 217 | geom_line() + geom_point() + # lines and points 218 | scale_x_continuous(expand = expansion(mult = c(.05, .05))) + 219 | scale_y_continuous(labels=scales::percent) + 220 | labs(title='', 221 | caption='') + 222 | theme(legend.title = element_blank(), 223 | panel.grid.minor.x = element_blank(), 224 | legend.text=element_text(size=10), 225 | legend.position='right') + 226 | xlab('Year') + 227 | ylab('Population Growth') + 228 | # make legend items bigger 229 | guides(colour = guide_legend(override.aes = list(size=2))) 230 | 231 | ``` 232 | 233 | ## Lollipop 234 | 235 | ```{r lollipop} 236 | ggplot(data=col_ratio %>% arrange(desc(rent)) %>% head(15), aes(x=NAME, y=rent) ) + 237 | geom_segment( aes(x=reorder(NAME,rent) ,xend=NAME, y=0, yend=rent), color="grey") + 238 | geom_point(size=3) + 239 | theme_minimal() + 240 | theme(plot.subtitle= element_text(face="bold",hjust=0.5), 241 | plot.title = element_text(lineheight=1, face="bold",hjust = 0.5), 242 | panel.grid.minor.y = element_blank(), 243 | panel.grid.major.y = element_blank(), 244 | panel.grid.minor.x = element_blank() 245 | ) + 246 | coord_flip() + 247 | scale_y_continuous(labels=scales::dollar,expand = expansion(mult = c(0, .1))) + 248 | 
labs(title='States With Highest Rent', 249 | caption='Source: 2017 American Community Survey (Census)') + 250 | xlab('') + ylab('Median Monthly Rent') 251 | ``` 252 | 253 | 254 | -------------------------------------------------------------------------------- /R/R_Quotation.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R Quotation Methods" 3 | author: "Jesse Cambon" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | output: 6 | github_document: 7 | toc: true 8 | --- 9 | 10 | Demonstrate the use of the quo_name() and enquo() functions 11 | to pass variable names to functions and utilize both the variables 12 | and the variable names 13 | 14 | ```{r knit-settings, include=FALSE} 15 | library(here) 16 | source(here("rmd_config.R")) 17 | ``` 18 | 19 | ```{r, warning=F,message=F} 20 | library(tidyverse) 21 | library(knitr) 22 | 23 | # Create a heatmap with two axis variables 24 | # of the mean of a given metric variable 25 | car_heatmap <- function(data,axis1,axis2,metric) { 26 | # handle quotations 27 | 28 | # The dataset can be passed easily as a function argument 29 | # but we need to use enquo() and the !! 
operator (below) 30 | # for variable names 31 | # If you were doing this outside of function you would use quo() instead of enquo() 32 | axis1 <- enquo(axis1) 33 | axis2 <- enquo(axis2) 34 | metric <- enquo(metric) 35 | 36 | 37 | print(quo_name(metric)) 38 | 39 | cars_agg <- data %>% 40 | group_by(!!axis1,!!axis2) %>% 41 | summarize(fill_metric = mean(!!metric),n=n()) 42 | 43 | ggplot(cars_agg, aes(factor(!!axis1), factor(!!axis2))) + 44 | geom_tile(aes(fill = fill_metric), colour = "grey") + 45 | scale_fill_gradient(low = "white",high = "steelblue") + 46 | geom_text(aes(label=round(fill_metric,1))) + 47 | theme_minimal() + 48 | theme(panel.grid = element_blank()) + 49 | scale_x_discrete(expand=c(0,0,0,0)) + 50 | # quo_name() access the character name of a variable 51 | guides(fill=guide_legend(title=str_c('mean ',quo_name(metric)))) + 52 | xlab(quo_name(axis1)) + ylab(quo_name(axis2)) 53 | } 54 | 55 | car_heatmap(mtcars,cyl,gear,mpg) 56 | car_heatmap(mtcars,cyl,gear,hp) 57 | car_heatmap(mtcars,gear,carb,mpg) 58 | ``` 59 | 60 | To quote a character list of variables, use rlang::syms() and the !!! 
operator 61 | 62 | ```{r} 63 | # Summarize horsepower (count, mean, min, max) for each combination of the grouping variables named in `variables` 64 | 65 | hp_calc <- function(data,variables) { 66 | variables <- rlang::syms(variables) 67 | return(data %>% group_by(!!!variables) %>% 68 | summarize(n=n(), 69 | mean_hp=mean(hp), 70 | min_hp=min(hp), 71 | max_hp=max(hp)) 72 | ) 73 | } 74 | 75 | gear_hp <- hp_calc(mtcars,c('gear')) 76 | vs_am_hp <- hp_calc(mtcars,c('vs','am')) 77 | 78 | kable(gear_hp) 79 | kable(vs_am_hp) 80 | ``` 81 | 82 | 83 | -------------------------------------------------------------------------------- /R/R_Quotation.md: -------------------------------------------------------------------------------- 1 | R Quotation Methods 2 | ================ 3 | Jesse Cambon 4 | 24 November, 2019 5 | 6 | Demonstrate the use of the quo\_name() and enquo() functions to pass 7 | variable names to functions and utilize both the variables and the 8 | variable names 9 | 10 | ``` r 11 | library(tidyverse) 12 | library(knitr) 13 | 14 | # Create a heatmap with two axis variables 15 | # of the mean of a given metric variable 16 | car_heatmap <- function(data,axis1,axis2,metric) { 17 | # handle quotations 18 | 19 | # The dataset can be passed easily as a function argument 20 | # but we need to use enquo() and the !! 
operator (below) 21 | # for variable names 22 | # If you were doing this outside of function you would use quo() instead of enquo() 23 | axis1 <- enquo(axis1) 24 | axis2 <- enquo(axis2) 25 | metric <- enquo(metric) 26 | 27 | 28 | print(quo_name(metric)) 29 | 30 | cars_agg <- data %>% 31 | group_by(!!axis1,!!axis2) %>% 32 | summarize(fill_metric = mean(!!metric),n=n()) 33 | 34 | ggplot(cars_agg, aes(factor(!!axis1), factor(!!axis2))) + 35 | geom_tile(aes(fill = fill_metric), colour = "grey") + 36 | scale_fill_gradient(low = "white",high = "steelblue") + 37 | geom_text(aes(label=round(fill_metric,1))) + 38 | theme_minimal() + 39 | theme(panel.grid = element_blank()) + 40 | scale_x_discrete(expand=c(0,0,0,0)) + 41 | # quo_name() access the character name of a variable 42 | guides(fill=guide_legend(title=str_c('mean ',quo_name(metric)))) + 43 | xlab(quo_name(axis1)) + ylab(quo_name(axis2)) 44 | } 45 | 46 | car_heatmap(mtcars,cyl,gear,mpg) 47 | ``` 48 | 49 | ## [1] "mpg" 50 | 51 | ![](../rmd_images/R_Quotation/unnamed-chunk-1-1.png) 52 | 53 | ``` r 54 | car_heatmap(mtcars,cyl,gear,hp) 55 | ``` 56 | 57 | ## [1] "hp" 58 | 59 | ![](../rmd_images/R_Quotation/unnamed-chunk-1-2.png) 60 | 61 | ``` r 62 | car_heatmap(mtcars,gear,carb,mpg) 63 | ``` 64 | 65 | ## [1] "mpg" 66 | 67 | ![](../rmd_images/R_Quotation/unnamed-chunk-1-3.png) 68 | 69 | To quote a character list of variables, use rlang:syms() and the \!\!\! 
70 | operator 71 | 72 | ``` r 73 | # Find frequency counts for all variables in var list 74 | 75 | hp_calc <- function(data,variables) { 76 | variables <- rlang::syms(variables) 77 | return(data %>% group_by(!!!variables) %>% 78 | summarize(n=n(), 79 | mean_hp=mean(hp), 80 | min_hp=min(hp), 81 | max_hp=max(hp)) 82 | ) 83 | } 84 | 85 | gear_hp <- hp_calc(mtcars,c('gear')) 86 | vs_am_hp <- hp_calc(mtcars,c('vs','am')) 87 | 88 | kable(gear_hp) 89 | ``` 90 | 91 | | gear | n | mean\_hp | min\_hp | max\_hp | 92 | | ---: | -: | -------: | ------: | ------: | 93 | | 3 | 15 | 176.1333 | 97 | 245 | 94 | | 4 | 12 | 89.5000 | 52 | 123 | 95 | | 5 | 5 | 195.6000 | 91 | 335 | 96 | 97 | ``` r 98 | kable(vs_am_hp) 99 | ``` 100 | 101 | | vs | am | n | mean\_hp | min\_hp | max\_hp | 102 | | -: | -: | -: | --------: | ------: | ------: | 103 | | 0 | 0 | 12 | 194.16667 | 150 | 245 | 104 | | 0 | 1 | 6 | 180.83333 | 91 | 335 | 105 | | 1 | 0 | 7 | 102.14286 | 62 | 123 | 106 | | 1 | 1 | 7 | 80.57143 | 52 | 113 | 107 | -------------------------------------------------------------------------------- /R/Regression Model Tidying.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Regression Model Tidying" 3 | author: "Jesse Cambon" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | output: 6 | github_document: 7 | toc: true 8 | --- 9 | 10 | Example of labeling categorical variables in a regression model 11 | 12 | ```{r knit-settings, include=FALSE} 13 | source(here::here("rmd_config.R")) 14 | ``` 15 | 16 | ## Setup 17 | 18 | ```{r, message = FALSE, warning = FALSE} 19 | library(broom) 20 | library(tidyverse) 21 | 22 | # obtain character list of independent variables in a model object. 
23 | obtain_model_varlist <- function(model_obj) { 24 | var_list_raw <- unlist(strsplit(as.character(formula(model_obj)[3]), split = " \\+ ")) 25 | # Remove smooth terms (s()) 26 | return(var_list_raw[!str_detect(var_list_raw, "^s\\(")]) 27 | } 28 | 29 | # Find frequency counts for all categorical variables in var list 30 | var_freq <- function(data, var) { 31 | var <- rlang::sym(var) 32 | print(var) 33 | 34 | if (is.factor(data %>% pull(!!var)) | is.character(data %>% pull(!!var))) { 35 | return(data %>% count(!!var) %>% mutate(term = quo_name(var)) %>% 36 | rename(level = !!var) %>% 37 | mutate( 38 | level = as.character(level), # convert to char 39 | is_categorical = 1 40 | ) %>% 41 | select(term, everything())) 42 | } else { 43 | return(tibble()) 44 | } 45 | } 46 | 47 | # Iterate through an entire dataset and return a dataset with sample 48 | # sizes for all levels of categorical variables 49 | find_all_freqs <- function(data, var_list) { 50 | all_freqs <- tibble() 51 | for (var in var_list) { 52 | all_freqs <- all_freqs %>% 53 | bind_rows(var_freq(data, var)) 54 | } 55 | return(all_freqs) 56 | } 57 | 58 | # adds term_name field to a tidy dataframe which includes sample sizes 59 | add_termnames <- function(data, term_freqs, var_list) { 60 | # Regexs to match the varname (when it begins a string) 61 | varregex <- paste(str_replace(var_list, "^", "\\^"), collapse = "|") 62 | 63 | return( 64 | data %>% 65 | mutate( 66 | term_name = coalesce( 67 | str_extract(term, varregex), 68 | term 69 | ), 70 | level = case_when(!is.na(term_name) ~ str_replace(term, varregex, "")) 71 | ) %>% 72 | # add in frequency counts and labels 73 | left_join(term_freqs, by = c("term_name" = "term", "level")) %>% 74 | mutate(label = case_when( 75 | is_categorical == 1 ~ str_c(term_name, ": ", level, " (", scales::comma(n), ")"), 76 | TRUE ~ str_c(term_name) 77 | )) 78 | ) 79 | } 80 | ``` 81 | 82 | ## Build Linear Model 83 | 84 | ```{r} 85 | Mymtcars <- mtcars %>% 86 | mutate( 87 | 
Cylinders = factor(cyl), 88 | Gears = factor(gear) 89 | ) 90 | 91 | car_model <- lm(mpg ~ Cylinders + disp + Gears, data = Mymtcars) 92 | 93 | # obtain list of independent variables 94 | car_varlist <- obtain_model_varlist(car_model) 95 | # sample sizes for categorical variable levels 96 | car_freqs <- find_all_freqs(Mymtcars, car_varlist) 97 | 98 | tidy_car <- tidy(car_model, conf.int = T) %>% 99 | add_termnames(car_freqs, car_varlist) 100 | 101 | glance_car <- glance(car_model) 102 | ``` 103 | 104 | ## Plot Coefficients 105 | 106 | ```{r} 107 | 108 | tidy_car %>% 109 | ggplot( 110 | aes(x = reorder(term, -estimate), y = estimate) 111 | ) + 112 | geom_point() + 113 | scale_y_continuous() + 114 | geom_hline(yintercept = 0, color = "grey") + 115 | coord_flip() + 116 | theme_bw() + 117 | theme(plot.title = element_text(lineheight = 1, face = "bold", hjust = 0.5)) + 118 | geom_pointrange(mapping = aes(ymin = conf.low, ymax = conf.high)) + 119 | labs( 120 | title = "MPG Linear Model - Default Labels", 121 | caption = "Horizontal lines represents 95% confidence intervals." 122 | ) + 123 | xlab("Term") + 124 | ylab("Coefficient") 125 | 126 | 127 | tidy_car %>% 128 | ggplot( 129 | aes(x = reorder(label, -estimate), y = estimate) 130 | ) + 131 | geom_point() + 132 | scale_y_continuous() + 133 | geom_hline(yintercept = 0, color = "grey") + 134 | coord_flip() + 135 | theme_bw() + 136 | theme(plot.title = element_text(lineheight = 1, face = "bold", hjust = 0.5)) + 137 | geom_pointrange(mapping = aes(ymin = conf.low, ymax = conf.high)) + 138 | labs( 139 | title = "MPG Linear Model - With Improved Labels", 140 | caption = "Sample sizes shown in (). Horizontal lines represents 95% confidence intervals." 
141 | ) + 142 | xlab("Term") + 143 | ylab("Coefficient") 144 | ``` 145 | -------------------------------------------------------------------------------- /R/Regression-Model-Tidying.md: -------------------------------------------------------------------------------- 1 | Regression Model Tidying 2 | ================ 3 | Jesse Cambon 4 | 12 September, 2021 5 | 6 | - [Setup](#setup) 7 | - [Build Linear Model](#build-linear-model) 8 | - [Plot Coefficients](#plot-coefficients) 9 | 10 | Example of labeling categorical variables in a regression model 11 | 12 | ## Setup 13 | 14 | ``` r 15 | library(broom) 16 | library(tidyverse) 17 | 18 | # obtain character list of independent variables in a model object. 19 | obtain_model_varlist <- function(model_obj) { 20 | var_list_raw <- unlist(strsplit(as.character(formula(model_obj)[3]), split = " \\+ ")) 21 | # Remove smooth terms (s()) 22 | return(var_list_raw[!str_detect(var_list_raw, "^s\\(")]) 23 | } 24 | 25 | # Find frequency counts for all categorical variables in var list 26 | var_freq <- function(data, var) { 27 | var <- rlang::sym(var) 28 | print(var) 29 | 30 | if (is.factor(data %>% pull(!!var)) | is.character(data %>% pull(!!var))) { 31 | return(data %>% count(!!var) %>% mutate(term = quo_name(var)) %>% 32 | rename(level = !!var) %>% 33 | mutate( 34 | level = as.character(level), # convert to char 35 | is_categorical = 1 36 | ) %>% 37 | select(term, everything())) 38 | } else { 39 | return(tibble()) 40 | } 41 | } 42 | 43 | # Iterate through an entire dataset and return a dataset with sample 44 | # sizes for all levels of categorical variables 45 | find_all_freqs <- function(data, var_list) { 46 | all_freqs <- tibble() 47 | for (var in var_list) { 48 | all_freqs <- all_freqs %>% 49 | bind_rows(var_freq(data, var)) 50 | } 51 | return(all_freqs) 52 | } 53 | 54 | # adds term_name field to a tidy dataframe which includes sample sizes 55 | add_termnames <- function(data, term_freqs, var_list) { 56 | # Regexs to match the 
varname (when it begins a string) 57 | varregex <- paste(str_replace(var_list, "^", "\\^"), collapse = "|") 58 | 59 | return( 60 | data %>% 61 | mutate( 62 | term_name = coalesce( 63 | str_extract(term, varregex), 64 | term 65 | ), 66 | level = case_when(!is.na(term_name) ~ str_replace(term, varregex, "")) 67 | ) %>% 68 | # add in frequency counts and labels 69 | left_join(term_freqs, by = c("term_name" = "term", "level")) %>% 70 | mutate(label = case_when( 71 | is_categorical == 1 ~ str_c(term_name, ": ", level, " (", scales::comma(n), ")"), 72 | TRUE ~ str_c(term_name) 73 | )) 74 | ) 75 | } 76 | ``` 77 | 78 | ## Build Linear Model 79 | 80 | ``` r 81 | Mymtcars <- mtcars %>% 82 | mutate( 83 | Cylinders = factor(cyl), 84 | Gears = factor(gear) 85 | ) 86 | 87 | car_model <- lm(mpg ~ Cylinders + disp + Gears, data = Mymtcars) 88 | 89 | # obtain list of independent variables 90 | car_varlist <- obtain_model_varlist(car_model) 91 | # sample sizes for categorical variable levels 92 | car_freqs <- find_all_freqs(Mymtcars, car_varlist) 93 | ``` 94 | 95 | ## Cylinders 96 | ## disp 97 | ## Gears 98 | 99 | ``` r 100 | tidy_car <- tidy(car_model, conf.int = T) %>% 101 | add_termnames(car_freqs, car_varlist) 102 | 103 | glance_car <- glance(car_model) 104 | ``` 105 | 106 | ## Plot Coefficients 107 | 108 | ``` r 109 | tidy_car %>% 110 | ggplot( 111 | aes(x = reorder(term, -estimate), y = estimate) 112 | ) + 113 | geom_point() + 114 | scale_y_continuous() + 115 | geom_hline(yintercept = 0, color = "grey") + 116 | coord_flip() + 117 | theme_bw() + 118 | theme(plot.title = element_text(lineheight = 1, face = "bold", hjust = 0.5)) + 119 | geom_pointrange(mapping = aes(ymin = conf.low, ymax = conf.high)) + 120 | labs( 121 | title = "MPG Linear Model - Default Labels", 122 | caption = "Horizontal lines represents 95% confidence intervals." 
123 | ) + 124 | xlab("Term") + 125 | ylab("Coefficient") 126 | ``` 127 | 128 | ![](../rmd_images/Regression-Model-Tidying/unnamed-chunk-3-1.png) 129 | 130 | ``` r 131 | tidy_car %>% 132 | ggplot( 133 | aes(x = reorder(label, -estimate), y = estimate) 134 | ) + 135 | geom_point() + 136 | scale_y_continuous() + 137 | geom_hline(yintercept = 0, color = "grey") + 138 | coord_flip() + 139 | theme_bw() + 140 | theme(plot.title = element_text(lineheight = 1, face = "bold", hjust = 0.5)) + 141 | geom_pointrange(mapping = aes(ymin = conf.low, ymax = conf.high)) + 142 | labs( 143 | title = "MPG Linear Model - With Improved Labels", 144 | caption = "Sample sizes shown in (). Horizontal lines represents 95% confidence intervals." 145 | ) + 146 | xlab("Term") + 147 | ylab("Coefficient") 148 | ``` 149 | 150 | ![](../rmd_images/Regression-Model-Tidying/unnamed-chunk-3-2.png) 151 | -------------------------------------------------------------------------------- /R/Rethinking-Tadpoles.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Multilevel Tadpoles" 3 | author: "Jesse Cambon" 4 | date: "`r format(Sys.time(), '%B, %Y')`" 5 | output: 6 | github_document: 7 | toc: true 8 | --- 9 | 10 | ```{r knit-settings, include=FALSE} 11 | library(here) 12 | source(here("rmd_config.R")) 13 | ``` 14 | 15 | Reproduce the multilevel model made for the tadpole data in Chapter 13 of [Statistical Rethinking](https://xcelab.net/rm/statistical-rethinking/). 
16 | 17 | Use this code as a starting point: 18 | 19 | https://bookdown.org/ajkurz/Statistical_Rethinking_recoded/multilevel-models.html 20 | 21 | Also see: 22 | 23 | - https://cran.r-project.org/web/packages/brms/vignettes/brms_multilevel.pdf 24 | - https://www.rensvandeschoot.com/tutorials/brms-started/ 25 | - https://www.rensvandeschoot.com/tutorials/brms/ 26 | 27 | Info on 'trials' function: https://cran.r-project.org/web/packages/brms/vignettes/brms_customfamilies.html 28 | 29 | ```{r setup, message = F, warning = F} 30 | library(rethinking) 31 | library(brms) 32 | library(tidyverse) 33 | library(bayesplot) 34 | data("reedfrogs") 35 | ``` 36 | 37 | 38 | 39 | ```{r} 40 | d <- reedfrogs %>% 41 | mutate(tank = as.factor(1:nrow(.)), 42 | across(tank, as.factor)) 43 | ``` 44 | 45 | ```{r} 46 | 47 | # No pooling 48 | b12.1 <- 49 | brm(data = d, family = binomial, 50 | surv | trials(density) ~ tank, 51 | prior(normal(0, 5), class = b), 52 | iter = 2000, warmup = 500, chains = 4, cores = 4, 53 | seed = 12) 54 | 55 | # Partial pooling (multilevel model) 56 | b12.2 <- 57 | brm(data = d, family = binomial, 58 | surv | trials(density) ~ (1 | tank), 59 | prior = c(prior(normal(0, 1), class = Intercept), 60 | prior(cauchy(0, 1), class = sd)), 61 | iter = 4000, warmup = 1000, chains = 4, cores = 4, 62 | seed = 12) 63 | 64 | ``` 65 | 66 | 67 | ```{r} 68 | pp_check(b12.1) 69 | pp_check(b12.2) 70 | ``` 71 | 72 | ```{r} 73 | mcmc_areas(b12.2, pars = c('r_*', 'b_*')) 74 | ``` 75 | ```{r} 76 | summary(b12.2) 77 | 78 | post <- posterior_samples(b12.2) 79 | ``` 80 | 81 | -------------------------------------------------------------------------------- /R/Sentiment_Analysis.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Sentiment Analysis with R" 3 | author: "Jesse Cambon" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | output: 6 | github_document: 7 | toc: true 8 | --- 9 | 10 | 11 | ```{r knit-settings, include=FALSE} 12 
| source(here::here("rmd_config.R")) 13 | ``` 14 | 15 | ## Tidytext 16 | 17 | Using tidytext for sentiment analysis 18 | 19 | * https://www.tidytextmining.com/sentiment.html 20 | 21 | ```{r, warning=F, message=F} 22 | library(janeaustenr) 23 | library(tidyverse) 24 | library(tidytext) 25 | library(knitr) 26 | library(sentimentr) 27 | 28 | # import original dataset - one row per line of each jane austen book 29 | austen_df <- austen_books() 30 | 31 | # tokenize each jane austen book 32 | tidy_books <- austen_df %>% 33 | group_by(book) %>% 34 | # add some variables 35 | mutate( 36 | linenumber = row_number(), 37 | chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]", 38 | ignore_case = TRUE 39 | ))) 40 | ) %>% 41 | ungroup() %>% 42 | unnest_tokens(word, text) 43 | ``` 44 | 45 | Sentiment Analysis (grouping by book) 46 | 47 | ```{r} 48 | jane_austen_sentiment <- tidy_books %>% 49 | inner_join(get_sentiments("afinn")) 50 | 51 | kable(head(jane_austen_sentiment, 5)) 52 | 53 | # Summarize sentiment by book 54 | sentiment_summ <- jane_austen_sentiment %>% 55 | group_by(book) %>% 56 | summarize( 57 | mean_sentiment = mean(value), 58 | num_sentiment_words = n() 59 | ) %>% 60 | ungroup() %>% 61 | arrange(desc(mean_sentiment)) 62 | 63 | kable(sentiment_summ) 64 | ``` 65 | 66 | ## Sentiment Aggregation with tidytext 67 | 68 | Make a function for quickly tokenizing a string and returning the mean sentiment 69 | 70 | ```{r} 71 | mean_sentiment <- function(.tbl, text) { 72 | # Returns mean sentiment 73 | # Args: 74 | # .tbl : tibble dataframe 75 | # text (STRING) : quoted column name in .tbl of text content 76 | # Returns: 77 | # Dataframe with mean sentiment and counts of both total words 78 | # and words that had a sentiment value 79 | 80 | # text <- enquo(text) 81 | 82 | # number each row 83 | # use this to join text column back on later 84 | text_num <- .tbl %>% 85 | mutate(row_num = row_number()) 86 | 87 | # tokenize the dataset (one row per token) 88 | tokens <- 
text_num %>% 89 | unnest_tokens(word, {{ text }}) %>% 90 | left_join(get_sentiments("afinn"), by = "word") %>% 91 | mutate(non_missing = case_when(!is.na(value) ~ 1, TRUE ~ 0)) # record if missing sentiment value 92 | 93 | # summarize the sentiment (value column contains sentiment of each token) 94 | summ <- tokens %>% 95 | group_by(row_num) %>% 96 | summarize( 97 | mean_sentiment = mean(value, na.rm = TRUE), 98 | num_words = n(), 99 | num_sentiment_words = sum(non_missing) 100 | ) %>% 101 | ungroup() %>% 102 | left_join(text_num, by = "row_num") %>% 103 | select(row_num, {{ text }}, everything()) 104 | 105 | return(summ) 106 | } 107 | ``` 108 | 109 | Note that the tidytext approach doesn't handle negation in sentiment 110 | 111 | ```{r} 112 | test_df <- tribble( 113 | ~review, 114 | "This is the worst restaurant I have ever eaten at. It's service is abysmal.", 115 | "Wow, amazing food, great atmosphere. Will definitely be coming back.", 116 | "The restaurant was okay. Not good or bad", 117 | "The restaurant was okay. Not good or bad", # duplicate row 118 | "The stock market crashed and it was a disaster", 119 | "This place was not terrible.", # test negation 120 | "Really wasn't the best meal.", # test negation 121 | "Sunshine and rainbows. Everything is fantastic, couldn't be better." 122 | ) 123 | 124 | test_sentiment <- test_df %>% 125 | mean_sentiment(review) %>% 126 | arrange(desc(mean_sentiment)) 127 | 128 | # test <- test_df %>% 129 | # unnest_tokens(word,review) %>% 130 | # left_join(get_sentiments("afinn"), by='word') 131 | 132 | kable(test_sentiment) 133 | ``` 134 | 135 | ## Sentimentr 136 | 137 | https://github.com/trinker/sentimentr 138 | 139 | An example of using the `sentimentr` package for sentiment analysis. This approach does handle negation. 
140 | 141 | ```{r} 142 | # Split entities into sentences, use 'element_id' column to aggregate back 143 | # to the original entities 144 | sentences_df <- get_sentences(test_df) 145 | 146 | kable(sentences_df) 147 | # Sentiment by sentence 148 | sentiment_df <- sentiment(sentences_df) 149 | 150 | # Aggregate sentiment to original entities 151 | sentiment_summ <- sentiment_by(sentences_df, by = "element_id") %>% 152 | bind_cols(test_df %>% select(review)) %>% 153 | select(element_id, review, everything()) %>% 154 | arrange(desc(ave_sentiment)) 155 | 156 | kable(sentiment_summ) 157 | ``` 158 | -------------------------------------------------------------------------------- /R/Survival.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Survival Models" 3 | author: "Jesse Cambon" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | output: 6 | github_document: 7 | toc: true 8 | --- 9 | 10 | Survival Analysis 11 | 12 | * Kaplan-Meier Plots 13 | * Log-rank test 14 | * Cox Proportional Hazard Model 15 | * Parametric survival models 16 | * Bayesian Approaches 17 | 18 | ```{r knit-settings, include=FALSE} 19 | source(here::here("rmd_config.R")) 20 | ``` 21 | 22 | Reference: 23 | 24 | https://www.emilyzabor.com/tutorials/survival_analysis_in_r_tutorial.html 25 | 26 | 27 | ```{r setup, message = F, warning = F} 28 | library(survival) 29 | library(survminer) 30 | library(tidyverse) 31 | library(broom) 32 | library(broom.mixed) 33 | library(brms) 34 | library(bayesplot) 35 | options(mc.cores = parallel::detectCores()) 36 | ``` 37 | 38 | 39 | # Kaplan-Meier 40 | 41 | status: censoring status 1=censored, 2=dead.
Can also use TRUE/FALSE see documentation for 42 | event in ?Surv 43 | 44 | ```{r} 45 | lung %>% count(status) 46 | ``` 47 | 48 | 49 | ```{r} 50 | ggsurvplot( 51 | fit = survfit(Surv(time, status) ~ sex, data = lung), 52 | xlab = "Days", 53 | ylab = "Survival Probability")$plot 54 | ``` 55 | 56 | # Log-Rank Test 57 | 58 | Test if there was a statistically significant difference in survival time between the groups 59 | 60 | ```{r} 61 | survdiff(Surv(time, status) ~ sex, 62 | data = lung, 63 | rho = 0 # log-rank, see ?survdiff 64 | ) 65 | ``` 66 | 67 | # Cox Proportional Hazard Model 68 | 69 | * Multivariate "semi-parametric" regression approach 70 | * Assumes hazard can change over time, but is proportional between groups at all points in time (ie. hazard ratio is constant over time). 71 | 72 | 73 | ```{r} 74 | cox_fit <- coxph(Surv(time, status) ~ sex + age + ph.ecog, 75 | data = lung) 76 | 77 | # Exponentiate coefficients to get hazard ratios 78 | cox_hr <- tidy(cox_fit, exponentiate = TRUE, conf.int = TRUE) 79 | ``` 80 | 81 | Survival curve 82 | 83 | ```{r} 84 | ggsurvplot(survfit(cox_fit), data = lung, risk.table = TRUE) 85 | ``` 86 | 87 | Plot Hazard Ratios 88 | 89 | ```{r} 90 | ggplot(data=cox_hr, 91 | aes(x = term, y = estimate)) + 92 | geom_point() + 93 | scale_y_continuous() + 94 | geom_hline(yintercept=0,color='grey') + 95 | coord_flip() + 96 | theme_bw() + 97 | theme(plot.title = element_text(lineheight = 1, face="bold",hjust = 0.5)) + 98 | geom_pointrange(mapping = aes(ymin = conf.low, ymax = conf.high)) + 99 | xlab('Term') + ylab('HR') + geom_hline(yintercept = 1, color = "grey") 100 | ``` 101 | 102 | ### Predictions 103 | 104 | ```{r} 105 | sample_obs <- lung %>% 106 | sample_n(2, seed = 104) %>% 107 | mutate(id = 1:n()) %>% 108 | select(id, status, everything()) 109 | 110 | cox_pred <- predict(cox_fit, newdata = sample_obs, type = 'expected') 111 | 112 | ``` 113 | 114 | 115 | 116 | ### Validation 117 | 118 | Reference: 119 | 
http://www.sthda.com/english/wiki/cox-model-assumptions 120 | 121 | 122 | ```{r} 123 | concordance(cox_fit) 124 | ``` 125 | 126 | 127 | Look at residuals 128 | 129 | ```{r} 130 | ggcoxdiagnostics(cox_fit, type = "deviance", ox.scale = 'observation.id') 131 | ggcoxdiagnostics(cox_fit, type = "deviance", ox.scale = 'linear.predictions') 132 | ``` 133 | 134 | ```{r} 135 | ggcoxdiagnostics(cox_fit, type = 'dfbeta') 136 | ``` 137 | 138 | 139 | 140 | Test proportional hazards assumption 141 | 142 | ```{r, fig.height = 7, fig.width = 5} 143 | zph_fit <- cox.zph(cox_fit) 144 | ggcoxzph(zph_fit) 145 | ``` 146 | 147 | ## Parametric Survival Model 148 | 149 | Accelerated Failure Time models, an alternative to Cox regression 150 | 151 | ```{r} 152 | aft_fit <- survreg(Surv(time, status) ~ sex + age + ph.ecog, 153 | dist = 'weibull', 154 | data = lung) 155 | 156 | # Exponentiated AFT coefficients are time ratios (acceleration factors), not hazard ratios 157 | aft_hr <- tidy(aft_fit, exponentiate = TRUE, conf.int = TRUE) 158 | 159 | aft_hr 160 | ``` 161 | 162 | 163 | 164 | 165 | 166 | ## Bayesian Survival Models 167 | 168 | - http://paul-buerkner.github.io/brms/reference/kidney.html 169 | - https://mc-stan.org/rstanarm/reference/adapt_delta.html 170 | 171 | 172 | ```{r} 173 | print('Default priors:') 174 | get_prior(time | cens(censored) ~ sex + disease + age + (1 | patient), 175 | data = kidney, family = weibull() 176 | ) 177 | 178 | print('Horseshoe priors:') 179 | 180 | get_prior(time | cens(censored) ~ sex + disease + age + (1 | patient), 181 | data = kidney, family = weibull(), 182 | prior = set_prior("horseshoe(3)", class = 'b') + 183 | set_prior("horseshoe(3)", class = 'Intercept') + 184 | set_prior("horseshoe(3)", class = 'sd') 185 | ) 186 | ``` 187 | 188 | 189 | ```{r, message = FALSE, warning = F, error = F, results = 'hide'} 190 | # fit weibull model 191 | fit2 <- brm(time | cens(censored) ~ sex + disease + (1 | patient), 192 | data = kidney, family = weibull(), 193 | prior = set_prior("horseshoe(3)"), 194 |
iter = 3000, 195 | control = list(adapt_delta = 0.98)) 196 | ``` 197 | 198 | 199 | ```{r} 200 | summary(fit2) 201 | tidy(fit2) 202 | prior_summary(fit2) 203 | ``` 204 | 205 | ```{r, fig.height = 12, fig.width = 10} 206 | mcmc_trace(fit2) 207 | ``` 208 | 209 | 210 | ```{r} 211 | pp_check(fit2) 212 | 213 | pp_check(fit2, type = 'intervals') 214 | ``` 215 | ```{r, fig.height = 8, fig.width = 4} 216 | mcmc_areas(fit2, regex_pars = c('b_*', 'r_*')) 217 | ``` 218 | 219 | https://mc-stan.org/bayesplot/reference/PPC-censoring.html 220 | 221 | ```{r} 222 | yrep <- posterior_predict(fit2) 223 | 224 | loo(fit2) 225 | ``` 226 | 227 | 228 | ```{r} 229 | hist(kidney$time) 230 | ``` 231 | 232 | 233 | ```{r} 234 | ppc_km_overlay(kidney$time, yrep, status_y = kidney$censored) + 235 | xlim(0, 200) 236 | ``` 237 | 238 | -------------------------------------------------------------------------------- /R/Time_Series_Modeling.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Time Series Modeling" 3 | author: "Jesse Cambon" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | output: 6 | github_document: 7 | toc: true 8 | --- 9 | 10 | ```{r knit-settings, include=FALSE} 11 | source(here::here("rmd_config.R")) 12 | ``` 13 | 14 | ## References 15 | * https://github.com/christophsax/tsbox 16 | * https://github.com/tidyverts/tsibble 17 | * http://pkg.robjhyndman.com/forecast/ 18 | * https://business-science.github.io/sweep/index.html 19 | * https://cran.rstudio.com/web/packages/sweep/vignettes/SW01_Forecasting_Time_Series_Groups.html 20 | * https://www.r-bloggers.com/climate-change-modeling-140-years-of-temperature-data-with-tsibble-and-fable/ 21 | * https://github.com/tidyverts/fable 22 | * https://feasts.tidyverts.org/ 23 | 24 | 25 | ## Setup 26 | 27 | ```{r setup} 28 | library(nycflights13) 29 | library(tsibbledata) 30 | library(wesanderson) # color palettes 31 | library(tidyverse) 32 | library(tsibble) 33 | # library(skimr) 34 | # 
library(feasts) # Removed for now since it breaks the forecast package 35 | library(lubridate) 36 | library(forecast) 37 | library(tsbox) 38 | library(prophet) 39 | 40 | 41 | # Set default ggplot theme 42 | theme_set(theme_bw() + 43 | theme( 44 | legend.position = "top", 45 | plot.subtitle = element_text(face = "bold", hjust = 0.5), 46 | plot.title = element_text(lineheight = 1, face = "bold", hjust = 0.5) 47 | )) 48 | ``` 49 | 50 | ## Importing Data 51 | 52 | Import Data and Convert to Tsibble format 53 | 54 | ```{r} 55 | weather <- nycflights13::weather %>% 56 | select(origin, time_hour, temp, humid, precip) 57 | 58 | weather_tsbl <- as_tsibble(weather, key = origin, index = time_hour) 59 | ``` 60 | 61 | 62 | ## Forecasting with Prophet 63 | 64 | https://facebook.github.io/prophet 65 | 66 | ```{r} 67 | # convert to format needed by prophet (needs specific column names) 68 | weather_ts <- weather_tsbl %>% 69 | filter(origin == "EWR") %>% 70 | select(time_hour, temp) %>% 71 | rename(ds = time_hour, y = temp) 72 | 73 | # create prophet model 74 | m <- prophet(weather_ts, yearly.seasonality = TRUE) 75 | 76 | future <- make_future_dataframe(m, periods = 400) 77 | 78 | # Create forecast 79 | prophet_forecast <- predict(m, future) 80 | 81 | # Plot 82 | plot(m, prophet_forecast) 83 | 84 | prophet_plot_components(m, prophet_forecast) 85 | ``` 86 | 87 | ## Data Cleaning 88 | 89 | Fill Missing Gaps in Data 90 | 91 | ```{r} 92 | nrow(ansett) 93 | 94 | # Fill gaps and filter 95 | ansett_fill <- ansett %>% 96 | filter(Airports == "MEL-SYD") %>% 97 | fill_gaps(Passengers = 0) 98 | 99 | nrow(ansett_fill) 100 | 101 | # Aggregate all classes together , limit to 1990 onward 102 | ansett_summ <- ansett_fill %>% 103 | group_by() %>% 104 | summarize(Passengers = sum(Passengers, na.rm = TRUE)) %>% 105 | filter_index("1990-01" ~ .) 
%>% 106 | as_tsibble(index = Week) 107 | ``` 108 | 109 | ```{r} 110 | ggplot( 111 | ansett_fill, 112 | aes(x = Week, y = Passengers) 113 | ) + 114 | geom_area(aes(fill = Class), alpha = 1.0) + 115 | scale_fill_manual(values = wes_palette("Moonrise2")) + 116 | scale_y_continuous(labels = scales::comma) + 117 | labs( 118 | title = "", 119 | caption = "" 120 | ) + 121 | theme( 122 | legend.title = element_blank(), 123 | legend.position = "right" 124 | ) 125 | ``` 126 | 127 | Test forecast package 128 | 129 | ```{r} 130 | USAccDeaths %>% 131 | stl(s.window = "periodic") %>% 132 | forecast() %>% 133 | autoplot() 134 | 135 | AirPassengers %>% 136 | stlf(lambda = 0) %>% 137 | autoplot() 138 | 139 | # Have to convert this dataset to time series format with tsbox::ts_ts() 140 | ansett_summ %>% 141 | ts_ts() %>% 142 | stlf(lambda = 0) %>% 143 | autoplot() 144 | ``` 145 | 146 | Feasts package unfortunately breaks the forecast package 147 | -------------------------------------------------------------------------------- /R/Time_Series_Modeling.md: -------------------------------------------------------------------------------- 1 | Time Series Modeling 2 | ================ 3 | Jesse Cambon 4 | 24 November, 2019 5 | 6 | ## References 7 | 8 | - 9 | - 10 | - 11 | - 12 | - 13 | - 14 | - 15 | - 16 | 17 | ## Setup 18 | 19 | ``` r 20 | library(nycflights13) 21 | library(tsibbledata) 22 | library(wesanderson) # color palettes 23 | library(tidyverse) 24 | ``` 25 | 26 | ## ── Attaching packages ───────────────────────────── tidyverse 1.3.0 ── 27 | 28 | ## ✔ ggplot2 3.2.1 ✔ purrr 0.3.3 29 | ## ✔ tibble 2.1.3 ✔ dplyr 0.8.3 30 | ## ✔ tidyr 1.0.0 ✔ forcats 0.4.0 31 | ## ✔ readr 1.3.1 32 | 33 | ## ── Conflicts ──────────────────────────────── tidyverse_conflicts() ── 34 | ## ✖ dplyr::filter() masks stats::filter() 35 | ## ✖ dplyr::lag() masks stats::lag() 36 | 37 | ``` r 38 | library(tsibble) 39 | ``` 40 | 41 | ## 42 | ## Attaching package: 'tsibble' 43 | 44 | ## The following object is 
masked from 'package:dplyr': 45 | ## 46 | ## id 47 | 48 | ``` r 49 | #library(skimr) 50 | #library(feasts) # Removed for now since it breaks the forecast package 51 | library(lubridate) 52 | ``` 53 | 54 | ## 55 | ## Attaching package: 'lubridate' 56 | 57 | ## The following objects are masked from 'package:tsibble': 58 | ## 59 | ## interval, new_interval 60 | 61 | ## The following object is masked from 'package:here': 62 | ## 63 | ## here 64 | 65 | ## The following object is masked from 'package:base': 66 | ## 67 | ## date 68 | 69 | ``` r 70 | library(forecast) 71 | ``` 72 | 73 | ## Registered S3 method overwritten by 'xts': 74 | ## method from 75 | ## as.zoo.xts zoo 76 | 77 | ## Registered S3 method overwritten by 'quantmod': 78 | ## method from 79 | ## as.zoo.data.frame zoo 80 | 81 | ## Registered S3 methods overwritten by 'forecast': 82 | ## method from 83 | ## fitted.fracdiff fracdiff 84 | ## residuals.fracdiff fracdiff 85 | 86 | ``` r 87 | library(tsbox) 88 | library(prophet) 89 | ``` 90 | 91 | ## Loading required package: Rcpp 92 | 93 | ## Loading required package: rlang 94 | 95 | ## 96 | ## Attaching package: 'rlang' 97 | 98 | ## The following objects are masked from 'package:purrr': 99 | ## 100 | ## %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int, 101 | ## flatten_lgl, flatten_raw, invoke, list_along, modify, prepend, 102 | ## splice 103 | 104 | ``` r 105 | # Set default ggplot theme 106 | theme_set(theme_bw() + 107 | theme(legend.position = "top", 108 | plot.subtitle= element_text(face="bold",hjust=0.5), 109 | plot.title = element_text(lineheight=1, face="bold",hjust = 0.5))) 110 | ``` 111 | 112 | \#Importing Data Import Data and Convert to Tsibble format 113 | 114 | ``` r 115 | weather <- nycflights13::weather %>% 116 | select(origin, time_hour, temp, humid, precip) 117 | 118 | weather_tsbl <- as_tsibble(weather, key = origin, index = time_hour) 119 | ``` 120 | 121 | # Forecasting with Prophet 122 | 123 | 124 | 125 | ``` r 126 | # convert 
to format needed by prophet (needs specific column names) 127 | weather_ts <- weather_tsbl %>% filter(origin == 'EWR') %>% select(time_hour,temp) %>% 128 | rename(ds=time_hour,y=temp) 129 | 130 | # create prophet model 131 | m <- prophet(weather_ts,yearly.seasonality=TRUE) 132 | 133 | future <- make_future_dataframe(m, periods = 400) 134 | 135 | # Create forecast 136 | prophet_forecast <- predict(m, future) 137 | 138 | # Plot 139 | plot(m, prophet_forecast) 140 | ``` 141 | 142 | ![](../rmd_images/Time_Series_Modeling/unnamed-chunk-2-1.png) 143 | 144 | ``` r 145 | prophet_plot_components(m, prophet_forecast) 146 | ``` 147 | 148 | ![](../rmd_images/Time_Series_Modeling/unnamed-chunk-2-2.png) 149 | 150 | ## Data Cleaning 151 | 152 | Fill Missing Gaps in Data 153 | 154 | ``` r 155 | nrow(ansett) 156 | ``` 157 | 158 | ## [1] 7407 159 | 160 | ``` r 161 | # Fill gaps and filter 162 | ansett_fill <- ansett %>% 163 | filter(Airports == 'MEL-SYD') %>% 164 | fill_gaps(Passengers = 0) 165 | 166 | nrow(ansett_fill) 167 | ``` 168 | 169 | ## [1] 742 170 | 171 | ``` r 172 | # Aggregate all classes together , limit to 1990 onward 173 | ansett_summ <- ansett_fill %>% group_by %>% 174 | summarize(Passengers=sum(Passengers,na.rm=TRUE)) %>% 175 | filter_index("1990-01" ~ .) 
%>% as_tsibble(index = Week) 176 | ``` 177 | 178 | ``` r 179 | ggplot(ansett_fill, 180 | aes(x=Week,y=Passengers)) + 181 | geom_area(aes(fill = Class), alpha = 1.0) + 182 | scale_fill_manual(values=wes_palette('Moonrise2')) + 183 | scale_y_continuous(labels=scales::comma) + 184 | labs(title='', 185 | caption='') + 186 | theme(legend.title = element_blank(), 187 | legend.position='right') 188 | ``` 189 | 190 | ![](../rmd_images/Time_Series_Modeling/unnamed-chunk-4-1.png) 191 | 192 | Test forecast package 193 | 194 | ``` r 195 | USAccDeaths %>% 196 | stl(s.window='periodic') %>% 197 | forecast() %>% 198 | autoplot() 199 | ``` 200 | 201 | ![](../rmd_images/Time_Series_Modeling/unnamed-chunk-5-1.png) 202 | 203 | ``` r 204 | AirPassengers %>% 205 | stlf(lambda=0) %>% 206 | autoplot() 207 | ``` 208 | 209 | ![](../rmd_images/Time_Series_Modeling/unnamed-chunk-5-2.png) 210 | 211 | ``` r 212 | # Have to convert this dataset to time series format with tsbox::ts_ts() 213 | ansett_summ %>% ts_ts(.) %>% 214 | stlf(lambda=0) %>% 215 | autoplot() 216 | ``` 217 | 218 | ## [time]: 'Week' [value]: 'Passengers' 219 | 220 | ![](../rmd_images/Time_Series_Modeling/unnamed-chunk-5-3.png) 221 | 222 | Feasts package unfortunately breaks the forecast package 223 | -------------------------------------------------------------------------------- /R/Titanic_files/figure-gfm/explore-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/R/Titanic_files/figure-gfm/explore-1.png -------------------------------------------------------------------------------- /R/Titanic_files/figure-gfm/explore-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/R/Titanic_files/figure-gfm/explore-2.png 
-------------------------------------------------------------------------------- /R/Titanic_files/figure-gfm/explore-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/R/Titanic_files/figure-gfm/explore-3.png -------------------------------------------------------------------------------- /R/Titanic_files/figure-gfm/imputation-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/R/Titanic_files/figure-gfm/imputation-1.png -------------------------------------------------------------------------------- /R/Titanic_files/figure-gfm/imputation-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/R/Titanic_files/figure-gfm/imputation-2.png -------------------------------------------------------------------------------- /R/Titanic_files/figure-gfm/linear-regression-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/R/Titanic_files/figure-gfm/linear-regression-1.png -------------------------------------------------------------------------------- /R/Titanic_files/figure-gfm/linear-regression-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/R/Titanic_files/figure-gfm/linear-regression-2.png -------------------------------------------------------------------------------- /R/Titanic_files/figure-gfm/linear-regression-3.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/R/Titanic_files/figure-gfm/linear-regression-3.png -------------------------------------------------------------------------------- /R/Titanic_files/figure-gfm/linear-regression-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/R/Titanic_files/figure-gfm/linear-regression-4.png -------------------------------------------------------------------------------- /R/Titanic_files/figure-gfm/linear-regression-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/R/Titanic_files/figure-gfm/linear-regression-5.png -------------------------------------------------------------------------------- /R/Titanic_files/figure-gfm/logistic-regression-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/R/Titanic_files/figure-gfm/logistic-regression-1.png -------------------------------------------------------------------------------- /R/Titanic_files/figure-gfm/logistic-regression-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/R/Titanic_files/figure-gfm/logistic-regression-2.png -------------------------------------------------------------------------------- /R/Titanic_files/figure-gfm/logistic-regression-3.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/R/Titanic_files/figure-gfm/logistic-regression-3.png -------------------------------------------------------------------------------- /R/Titanic_files/figure-gfm/logistic-regression-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/R/Titanic_files/figure-gfm/logistic-regression-4.png -------------------------------------------------------------------------------- /R/gapminder_summary_report.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/R/gapminder_summary_report.xlsx -------------------------------------------------------------------------------- /R/hypothesis_testing.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Hypothesis Testing" 3 | author: "Jesse Cambon" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | output: 6 | github_document: 7 | toc: true 8 | --- 9 | 10 | ```{r knit-settings, include=FALSE} 11 | library(here) 12 | source(here("rmd_config.R")) 13 | ``` 14 | 15 | References: 16 | * https://www.tidymodels.org/learn/statistics/xtabs/ 17 | 18 | ```{r,message=F,warning=F} 19 | library(tidymodels) # Includes the infer package 20 | library(knitr) 21 | 22 | # Set ggplot theme 23 | theme_set(theme_minimal()) 24 | 25 | 26 | data(ad_data, package = "modeldata") 27 | 28 | ``` 29 | 30 | ```{r} 31 | ad_data %>% 32 | count(Genotype, Class,sort=T) %>% head(5) %>% kable() 33 | ``` 34 | 35 | Chi-Squared Test of Independence 36 | 37 | ```{r} 38 | ad_data %>% 39 | chisq_test(Genotype ~ Class) %>% 40 | kable() 41 | ``` 42 | 43 | 44 | ```{r} 45 | observed_indep_statistic <- ad_data %>% 46 | specify(Genotype ~ Class)
%>% 47 | calculate(stat = "Chisq") 48 | 49 | 50 | # generate the null distribution using randomization 51 | null_distribution_simulated <- ad_data %>% 52 | specify(Genotype ~ Class) %>% 53 | hypothesize(null = "independence") %>% 54 | generate(reps = 500, type = "permute") %>% 55 | calculate(stat = "Chisq") 56 | ``` 57 | 58 | ```{r} 59 | null_distribution_simulated %>% 60 | visualize() + 61 | shade_p_value(observed_indep_statistic, 62 | direction = "greater") + theme_minimal() 63 | ``` 64 | 65 | ```{r} 66 | ad_data %>% 67 | specify(Genotype ~ Class) %>% 68 | hypothesize(null = "independence") %>% 69 | visualize(method = "theoretical") + 70 | shade_p_value(observed_indep_statistic, 71 | direction = "greater") 72 | ``` 73 | 74 | 75 | ```{r} 76 | null_distribution_simulated %>% 77 | visualize(method = "both") + 78 | shade_p_value(observed_indep_statistic, 79 | direction = "greater") 80 | ``` 81 | 82 | -------------------------------------------------------------------------------- /R/hypothesis_testing.md: -------------------------------------------------------------------------------- 1 | Hypothesis Testing 2 | ================ 3 | Jesse Cambon 4 | 02 February, 2021 5 | 6 | References: \* 7 | 8 | ``` r 9 | library(tidymodels) # Includes the infer package 10 | library(knitr) 11 | 12 | # Set ggplot theme 13 | theme_set(theme_minimal()) 14 | 15 | 16 | data(ad_data, package = "modeldata") 17 | ``` 18 | 19 | ``` r 20 | ad_data %>% 21 | count(Genotype, Class,sort=T) %>% head(5) %>% kable() 22 | ``` 23 | 24 | | Genotype | Class | n | 25 | |:---------|:---------|----:| 26 | | E3E3 | Control | 133 | 27 | | E3E4 | Control | 65 | 28 | | E3E4 | Impaired | 41 | 29 | | E3E3 | Impaired | 34 | 30 | | E2E3 | Control | 30 | 31 | 32 | Chi Squared Test of Independences 33 | 34 | ``` r 35 | ad_data %>% 36 | chisq_test(Genotype ~ Class) %>% 37 | kable() 38 | ``` 39 | 40 | ## Warning in stats::chisq.test(table(x), ...): Chi-squared approximation may be 41 | ## incorrect 42 | 43 | | 
statistic | chisq\_df | p\_value | 44 | |----------:|----------:|----------:| 45 | | 21.57748 | 5 | 0.0006298 | 46 | 47 | ``` r 48 | observed_indep_statistic <- ad_data %>% 49 | specify(Genotype ~ Class) %>% 50 | calculate(stat = "Chisq") 51 | 52 | 53 | # generate the null distribution using randomization 54 | null_distribution_simulated <- ad_data %>% 55 | specify(Genotype ~ Class) %>% 56 | hypothesize(null = "independence") %>% 57 | generate(reps = 500, type = "permute") %>% 58 | calculate(stat = "Chisq") 59 | ``` 60 | 61 | ``` r 62 | null_distribution_simulated %>% 63 | visualize() + 64 | shade_p_value(observed_indep_statistic, 65 | direction = "greater") + theme_minimal() 66 | ``` 67 | 68 | ![](../rmd_images/hypothesis_testing/unnamed-chunk-5-1.png) 69 | 70 | ``` r 71 | ad_data %>% 72 | specify(Genotype ~ Class) %>% 73 | hypothesize(null = "independence") %>% 74 | visualize(method = "theoretical") + 75 | shade_p_value(observed_indep_statistic, 76 | direction = "greater") 77 | ``` 78 | 79 | ## Warning: Check to make sure the conditions have been met for the theoretical 80 | ## method. {infer} currently does not check these for you. 81 | 82 | ![](../rmd_images/hypothesis_testing/unnamed-chunk-6-1.png) 83 | 84 | ``` r 85 | null_distribution_simulated %>% 86 | visualize(method = "both") + 87 | shade_p_value(observed_indep_statistic, 88 | direction = "greater") 89 | ``` 90 | 91 | ## Warning: Check to make sure the conditions have been met for the theoretical 92 | ## method. {infer} currently does not check these for you. 93 | 94 | ![](../rmd_images/hypothesis_testing/unnamed-chunk-7-1.png) 95 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # The Data Science Codex 2 | 3 | A collection of code and resources to serve as a starting point for data science projects. 
For more explanation and material on R visit [my blog](https://jessecambon.github.io/). 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | ## Notes 15 | * [Resources](Resources.md) - Websites and references that I find helpful for data science projects 16 | * [Developing With R](R-Development.md) - Notes on R package development 17 | * [How to Git](Git-HowTo.md) - version control with git 18 | * [How to Anaconda](Anaconda-HowTo.md) - managing environments with Anaconda 19 | 20 | ## Data Visualization 21 | * [Visualization Cookbook](R/Visualization_Cookbook.md) (R) - A wide variety of data visualizations demonstrated. 22 | * [Geospatial Data Analysis](R/Geospatial_Analysis.md) (R) - Making maps with R. 23 | 24 | ## Statistical Modeling and Machine Learning 25 | * [Modeling Fundamentals](R/Titanic.md) (R) - A primer on logistic and linear regression modeling with the classic Titanic dataset. 26 | * [Survival Analysis](R/Survival.md) (R) - Survival analysis methods such as Cox proportional hazards models and Kaplan-Meier curves. 27 | * [Modeling Workflows](R/Modeling_Workflow.md) (R) - Streamlined Tidyverse modeling workflows with the gapminder dataset. 28 | * [Multilevel Models](R/Multilevel-Models.md) (R) - Multilevel (a.k.a. mixed-effects) models 29 | * [Time Series Modeling](R/Time_Series_Modeling.md) (R) - Experimenting with time series modeling (tsibble, forecast libraries, prophet, etc.) 30 | * [Ordinal Regression](R/Ordinal_Regression.md) (R) - Experimenting with ordinal (ranked categorical outcome) regression 31 | * [Presenting Regression Models](R/Regression-Model-Tidying.md) (R) - Code for cleaning the outputs of regression models for presentations. 32 | * [Sklearn Modeling Workflows](Python/Sklearn-Workflow.ipynb) (Python) - Modeling workflows with sklearn (cross-validation, randomized search for optimizing hyperparameters, lift curves). 
33 | * [Sklearn - Skopt Workflow](Python/sklearn_skopt_pipeline.ipynb) (Python) - Modeling workflow with sklearn and scikit-optimize (Bayesian hyperparameter optimization). 34 | * [Machine Learning with Caret](R/Caret.md) (R) - Using the Caret library for machine learning. 35 | * [Parsnip](R/Parsnip.md) (R) - fitting models with the parsnip package (from tidymodels) 36 | 37 | ## Bayesian Models 38 | * [Bayesian Basics](R/Bayesian_Basics.md) (R) - exploring a simple Bayesian multilevel model 39 | * [Bayesian Modeling](R/Bayesian_Modeling.md) (R) - Experimenting with Bayesian models using rstanarm 40 | * [Comparing Bayesian Packages](R/Comparing_Bayesian_Packages.md) (R) - Comparing rstanarm, brms, and rstan. 41 | 42 | ## Clustering 43 | * [k-means clustering](R/Clustering.md) (R) - Using the k-means algorithm to cluster data. 44 | * [Clustering](Python/Clustering.ipynb) (Python) - Agglomerative (Hierarchical) clustering, k-means clustering, and Gaussian mixture models 45 | 46 | ## Stats Analysis 47 | * [Power Analysis](R/Power_Analysis.md) (R) - Statistical power analysis 48 | * [Distribution Sampling and Hypothesis Testing](R/Distribution_Sampling.md) (R) 49 | * [Hypothesis Testing](R/hypothesis_testing.md) (R) 50 | 51 | ## NLP 52 | * [Document Embeddings](Python/state_of_union_embeddings.ipynb) (Python) - Using word embeddings to compare the similarity of State of the Union addresses. 53 | * [State of the Union Analysis](Python/state_of_union_v2.ipynb) (Python) - An exploration of state of the union addresses with topic modeling and sentiment analysis. 54 | * [Sentiment Analysis](R/Sentiment_Analysis.md) (R) - Exploring sentiment analysis in R. 55 | * [LSTM Demo](Python/LSTM-Demo.ipynb) (Python) - An LSTM network for predicting if a company review from Glassdoor is positive 56 | 57 | ## Miscellaneous 58 | * [R-Quickstart](R/R-Quickstart.md) (R) - Minimal data analysis and visualization workflows. 
See [the blog post "Data Science Essentials"](https://jessecambon.github.io/2020/01/12/data-science-essentials.html) for more details and explanation. 59 | * [Creating Formatted Spreadsheets](R/Create_Formatted_Spreadsheet.md) (R) - How to create a custom formatted spreadsheet report with the openxlsx R package. 60 | * [Using Python and R Together](Python/R-Python-Hybrid.ipynb) - How to use python and R code together in the same Jupyter notebook with the rpy2 python package. 61 | * [R Quotation](R/R_Quotation.md) (R) - If you want to do certain things such as pass variable names as arguments to a function in R, you have to use quotation methods like `quo()` and `enquo()`. This notebook demonstrates how to do this. See [my blog post on Tidy Evaluation](https://jessecambon.github.io/2019/12/08/practical-tidy-evaluation.html) for more details and explanation. 62 | * [SQL Databases](Python/SQL_Databases.ipynb) (Python) - Code for creating and manipulating a SQL database. 63 | -------------------------------------------------------------------------------- /Resources.md: -------------------------------------------------------------------------------- 1 | A categorized list of data science resources. 2 | 3 | ## General 4 | * [RStudio Cheatsheets](https://www.rstudio.com/resources/cheatsheets/) 5 | * [Bookdown Books](https://bookdown.org/) - A great collection of free R books. 6 | * [R For Data Science](http://r4ds.had.co.nz/index.html) - Classic text by Hadley Wickham, chief overlord of all things Tidyverse. 7 | * [R-Cookbook](http://www.cookbook-r.com) - Categorized useful R code. 8 | * [R for Public Policy](http://www.lecy.info/r-for-public-policy/) - List of resources. 9 | * [R-Bloggers](https://www.r-bloggers.com) - Great resource for the latest developments in the R community. Subscribe to their emails. 10 | * [Tidyverse](https://www.tidyverse.org/index.html) - A well documented ecosystem of packages for elegant data manipulation and visualization. 
11 | * [Data Science with Python](https://jakevdp.github.io/PythonDataScienceHandbook/) - Book by Jake VanderPlas 12 | * [Pandas Cookbook](http://pandas.pydata.org/pandas-docs/stable/user_guide/cookbook.html) - Code for data manipulation in python 13 | 14 | ## Data 15 | * [An Incredibly Comprehensive List of APIs for R](https://github.com/ropensci/opendata/blob/master/README.md) 16 | * [Google Dataset Search](https://toolbox.google.com/datasetsearch) 17 | * [KDNuggets](https://www.kdnuggets.com/datasets/index.html) 18 | * [Microsoft's List of R Data Sources](https://mran.microsoft.com/documents/data) 19 | * [Kaggle](https://www.kaggle.com/datasets) - Datasets used for Kaggle competitions. 20 | * [UCI Machine Learning Repository](http://archive.ics.uci.edu/ml) 21 | * [R Datasets](http://stat.ethz.ch/R-manual/R-devel/library/datasets/html/00Index.html) - Documentation on the inbuilt datasets in R. 22 | * [Data.gov](https://www.data.gov/) - A good place to start for government data. 23 | * [R Packages for Importing Data](https://www.computerworld.com/article/3109890/data-analytics/these-r-packages-import-sports-weather-stock-data-and-more.html) 24 | 25 | ## Visualization Cookbooks 26 | * [Data-To-Viz](https://www.data-to-viz.com) - A comprehensive data viz reference with lots of great code. 27 | * [BBC R Cookbook](https://bbc.github.io/rcookbook) - R code to create plots in the style of the BBC 28 | * [Top 50 Ggplot Visualizations](http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html) 29 | * [My R Cookbook](https://github.com/jessecambon/Data-Science-Codex/blob/master/source/Chart_Collection.md) - My hodge-podge collection of visualizations with R code. 
30 | * [D3 Blocks](https://bl.ocks.org/) 31 | * [D3.JS Gallery](https://github.com/d3/d3/wiki/Gallery) 32 | * [R-Shiny Gallery](https://shiny.rstudio.com/gallery/) 33 | * [Seaborn Gallery](https://seaborn.pydata.org/examples/index.html) 34 | * [Matplotlib Gallery](https://matplotlib.org/gallery.html) 35 | 36 | ## Visualization References 37 | * [Dataviz Project](http://datavizproject.com/) - Categorized data visualizations. 38 | * [Visual Capitalist](http://www.visualcapitalist.com/) 39 | * [Flowing Data](http://flowingdata.com/) - Good newsletter and site on data viz techniques. 40 | * [Stephen Few](http://www.perceptualedge.com/) - A data visualization expert. 41 | * [Edward Tufte](https://www.edwardtufte.com/tufte/) - Another data viz expert. 42 | 43 | ## Visualization Libraries 44 | * [ggplot](https://ggplot2.tidyverse.org/index.html) 45 | * [cowplot](https://cran.r-project.org/web/packages/cowplot/vignettes/introduction.html) - Tweaks to ggplot for publication charts. 46 | * [plotly](https://plot.ly/r/) - Good tool for interactive graphs. 47 | * [dygraphs](https://rstudio.github.io/dygraphs/) 48 | * [rbokeh](http://hafen.github.io/rbokeh/) 49 | * [C3](https://github.com/mrjoh3/c3) 50 | * [rCharts](https://github.com/ramnathv/rCharts) 51 | * [tmap](https://github.com/mtennekes/tmap) - Geospatial mapping. 52 | * [ggalluvial](https://github.com/corybrunson/ggalluvial) - Flow diagrams 53 | 54 | ## Statistics 55 | * [Cross Validated](https://stats.stackexchange.com/) - StackExchange for Statistics 56 | * [Intro to Statistical Learning Textbook](http://www-bcf.usc.edu/~gareth/ISL/) - Freely available statistics textbook that is tailored to application with lots of code examples in R. This is a reworked version of the classic 'Elements of Statistical Learning' text, which is heavier on statistical theory. 57 | * [Elements of Statistical Learning](https://web.stanford.edu/~hastie/ElemStatLearn/) - Freely available classic Statistics textbook. 
58 | * [Biostat Handbook](http://www.biostathandbook.com/) - A great concise and accessible reference for statistical methods which have relevance in a wide variety of fields (not just Biostatistics). 59 | * [Frank Harrell](http://www.fharrell.com/) - Head of the Biostats department @ Vanderbilt and author of several R packages (Hmisc and rms). Good resource on regression modeling. 60 | * [Survival analysis tutorial](http://rpubs.com/sinhrks/plot_surv) 61 | * [A very comprehensive article on regression](https://www.r-bloggers.com/15-types-of-regression-you-should-know/) 62 | * [Tidyverse style survey package](https://cran.r-project.org/web/packages/srvyr/vignettes/srvyr-vs-survey.html) 63 | * [Course on Generalized Linear Models](http://data.princeton.edu/wws509/notes/#) - Princeton course on GLMs including logistic and Poisson regression. 64 | 65 | ## Code Reference 66 | * [Code for NLP Models](https://github.com/bicepjai/Deep-Survey-Text-Classification) - Code implementations for 14 NLP text classification papers 67 | * [Papers With Code (Github)](https://github.com/zziz/pwc) 68 | * [Papers With Code (Site)](https://paperswithcode.com/) 69 | 70 | ## Maps 71 | * [Sample maps](https://bhaskarvk.github.io/user2017.geodataviz/notebooks/02-Static-Maps.nb.html) 72 | * [Making Maps with R](http://eriqande.github.io/rep-res-web/lectures/making-maps-with-R.html) 73 | * [Wind Maps](http://www.hilltop-analytics.com/2018/08/football-wind-maps/) 74 | 75 | ## Miscellaneous Packages 76 | * [TidyText](https://github.com/juliasilge/tidytext) 77 | * [Broom](https://github.com/tidymodels/broom) - Useful functions for formatting the output of statistical models. 78 | 79 | ## Creating Deliverables 80 | * [R Markdown](https://rmarkdown.rstudio.com/) 81 | * [R Markdown - The Definitive Guide](https://bookdown.org/yihui/rmarkdown/) - Free book on RMarkdown. 
82 | * [Tables in R Markdown](https://haozhu233.github.io/kableExtra/awesome_table_in_html.html) 83 | * [RMarkdown Cheatsheet](https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet) 84 | * [officer](https://davidgohel.github.io/officer/index.html) - Creating Microsoft Office deliverables in R. 85 | 86 | ## Colors 87 | * [R-Cookbook Color Tutorial](http://www.cookbook-r.com/Graphs/Colors_(ggplot2)/) - Includes a good color blind palette 88 | * [Datawrapper - Colorguide](https://blog.datawrapper.de/colorguide/) 89 | * [Color Brewer](http://colorbrewer2.org) 90 | * [Viz Palette](http://projects.susielu.com/viz-palette?colors=%5B%22#1DABE6%22,%22#1C366A%22,%22#C3CED0%22,%22#E43034%22,%22#FC4E51%22,%22#AF060F%22%5D&backgroundColor=%22white%22&fontColor=%22black%22) 91 | * [Viridis](https://cran.r-project.org/web/packages/viridis/vignettes/intro-to-viridis.html#gallery) 92 | 93 | ## Census Data 94 | * [IPUMS](https://usa.ipums.org/) - Documented Census Microdata 95 | * [ipumsr](https://cran.r-project.org/web/packages/ipumsr/vignettes/ipums.html) – for loading and manipulating IPUMs data in R 96 | * [Tidycensus](https://walkerke.github.io/tidycensus/) - Great package for Census data analysis, particularly for geospatial analysis. 97 | * [Tidycensus Tutorial](https://www.mytinyshinys.com/2017/06/30/tidycensus/) 98 | * [CensusReporter](https://censusreporter.org/) - Tools for exploring Census data. 
99 | -------------------------------------------------------------------------------- /rmd_config.R: -------------------------------------------------------------------------------- 1 | # This file contains knitr settings for Rmarkdown files 2 | # run this file via source() in all RMarkdown files 3 | library(knitr) 4 | library(stringr) 5 | # get name of file during knitting and strip file extension 6 | rmd_filename <- str_remove(knitr::current_input(), "\\.Rmd") 7 | 8 | # Figure path on disk = base.dir + fig.path 9 | # Figure URL online = base.url + fig.path 10 | knitr::opts_knit$set(base.dir = str_c(here::here(), "/"), base.url = "../") # project root folder 11 | knitr::opts_chunk$set(fig.path = str_c("rmd_images/", rmd_filename, "/"), echo = TRUE) 12 | -------------------------------------------------------------------------------- /rmd_images/Bayes/unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayes/unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /rmd_images/Bayesian_Basics/unnamed-chunk-10-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Basics/unnamed-chunk-10-1.png -------------------------------------------------------------------------------- /rmd_images/Bayesian_Basics/unnamed-chunk-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Basics/unnamed-chunk-3-1.png -------------------------------------------------------------------------------- /rmd_images/Bayesian_Basics/unnamed-chunk-4-1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Basics/unnamed-chunk-4-1.png -------------------------------------------------------------------------------- /rmd_images/Bayesian_Basics/unnamed-chunk-4-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Basics/unnamed-chunk-4-2.png -------------------------------------------------------------------------------- /rmd_images/Bayesian_Basics/unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Basics/unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /rmd_images/Bayesian_Basics/unnamed-chunk-6-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Basics/unnamed-chunk-6-1.png -------------------------------------------------------------------------------- /rmd_images/Bayesian_Basics/unnamed-chunk-7-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Basics/unnamed-chunk-7-1.png -------------------------------------------------------------------------------- /rmd_images/Bayesian_Basics/unnamed-chunk-7-2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Basics/unnamed-chunk-7-2.png -------------------------------------------------------------------------------- /rmd_images/Bayesian_Basics/unnamed-chunk-7-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Basics/unnamed-chunk-7-3.png -------------------------------------------------------------------------------- /rmd_images/Bayesian_Basics/unnamed-chunk-8-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Basics/unnamed-chunk-8-1.png -------------------------------------------------------------------------------- /rmd_images/Bayesian_Basics/unnamed-chunk-9-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Basics/unnamed-chunk-9-1.png -------------------------------------------------------------------------------- /rmd_images/Bayesian_Distributions/unnamed-chunk-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Distributions/unnamed-chunk-3-1.png -------------------------------------------------------------------------------- /rmd_images/Bayesian_Distributions/unnamed-chunk-4-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Distributions/unnamed-chunk-4-1.png -------------------------------------------------------------------------------- /rmd_images/Bayesian_Distributions/unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Distributions/unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /rmd_images/Bayesian_Modeling/unnamed-chunk-12-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Modeling/unnamed-chunk-12-1.png -------------------------------------------------------------------------------- /rmd_images/Bayesian_Modeling/unnamed-chunk-12-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Modeling/unnamed-chunk-12-2.png -------------------------------------------------------------------------------- /rmd_images/Bayesian_Modeling/unnamed-chunk-15-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Modeling/unnamed-chunk-15-1.png -------------------------------------------------------------------------------- /rmd_images/Bayesian_Modeling/unnamed-chunk-15-2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Modeling/unnamed-chunk-15-2.png -------------------------------------------------------------------------------- /rmd_images/Bayesian_Modeling/unnamed-chunk-15-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Modeling/unnamed-chunk-15-3.png -------------------------------------------------------------------------------- /rmd_images/Bayesian_Modeling/unnamed-chunk-19-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Modeling/unnamed-chunk-19-1.png -------------------------------------------------------------------------------- /rmd_images/Bayesian_Modeling/unnamed-chunk-19-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Modeling/unnamed-chunk-19-2.png -------------------------------------------------------------------------------- /rmd_images/Bayesian_Modeling/unnamed-chunk-19-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Modeling/unnamed-chunk-19-3.png -------------------------------------------------------------------------------- /rmd_images/Bayesian_Modeling/unnamed-chunk-20-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Modeling/unnamed-chunk-20-1.png -------------------------------------------------------------------------------- /rmd_images/Bayesian_Modeling/unnamed-chunk-20-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Modeling/unnamed-chunk-20-2.png -------------------------------------------------------------------------------- /rmd_images/Bayesian_Modeling/unnamed-chunk-20-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Modeling/unnamed-chunk-20-3.png -------------------------------------------------------------------------------- /rmd_images/Bayesian_Modeling/unnamed-chunk-21-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Modeling/unnamed-chunk-21-1.png -------------------------------------------------------------------------------- /rmd_images/Bayesian_Modeling/unnamed-chunk-23-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Modeling/unnamed-chunk-23-1.png -------------------------------------------------------------------------------- /rmd_images/Bayesian_Modeling/unnamed-chunk-4-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Modeling/unnamed-chunk-4-1.png -------------------------------------------------------------------------------- /rmd_images/Caret/results-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Caret/results-1.png -------------------------------------------------------------------------------- /rmd_images/Caret/results-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Caret/results-2.png -------------------------------------------------------------------------------- /rmd_images/Clustering/unnamed-chunk-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Clustering/unnamed-chunk-3-1.png -------------------------------------------------------------------------------- /rmd_images/Clustering/unnamed-chunk-3-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Clustering/unnamed-chunk-3-2.png -------------------------------------------------------------------------------- /rmd_images/Comparing_Bayesian_Packages/unnamed-chunk-10-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Comparing_Bayesian_Packages/unnamed-chunk-10-1.png 
-------------------------------------------------------------------------------- /rmd_images/Comparing_Bayesian_Packages/unnamed-chunk-10-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Comparing_Bayesian_Packages/unnamed-chunk-10-2.png -------------------------------------------------------------------------------- /rmd_images/Comparing_Bayesian_Packages/unnamed-chunk-11-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Comparing_Bayesian_Packages/unnamed-chunk-11-1.png -------------------------------------------------------------------------------- /rmd_images/Comparing_Bayesian_Packages/unnamed-chunk-11-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Comparing_Bayesian_Packages/unnamed-chunk-11-2.png -------------------------------------------------------------------------------- /rmd_images/Comparing_Bayesian_Packages/unnamed-chunk-11-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Comparing_Bayesian_Packages/unnamed-chunk-11-3.png -------------------------------------------------------------------------------- /rmd_images/Comparing_Bayesian_Packages/unnamed-chunk-12-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Comparing_Bayesian_Packages/unnamed-chunk-12-1.png 
-------------------------------------------------------------------------------- /rmd_images/Comparing_Bayesian_Packages/unnamed-chunk-6-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Comparing_Bayesian_Packages/unnamed-chunk-6-1.png -------------------------------------------------------------------------------- /rmd_images/Comparing_Bayesian_Packages/unnamed-chunk-6-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Comparing_Bayesian_Packages/unnamed-chunk-6-2.png -------------------------------------------------------------------------------- /rmd_images/Distribution_Sampling/unnamed-chunk-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Distribution_Sampling/unnamed-chunk-3-1.png -------------------------------------------------------------------------------- /rmd_images/Distribution_Sampling/unnamed-chunk-4-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Distribution_Sampling/unnamed-chunk-4-1.png -------------------------------------------------------------------------------- /rmd_images/Geospatial_Analysis/locale-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Geospatial_Analysis/locale-1.png -------------------------------------------------------------------------------- 
/rmd_images/Geospatial_Analysis/unnamed-chunk-2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Geospatial_Analysis/unnamed-chunk-2-1.png -------------------------------------------------------------------------------- /rmd_images/Geospatial_Analysis/unnamed-chunk-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Geospatial_Analysis/unnamed-chunk-3-1.png -------------------------------------------------------------------------------- /rmd_images/Geospatial_Analysis/unnamed-chunk-3-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Geospatial_Analysis/unnamed-chunk-3-2.png -------------------------------------------------------------------------------- /rmd_images/Modeling_Workflow/explore-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Modeling_Workflow/explore-1.png -------------------------------------------------------------------------------- /rmd_images/Modeling_Workflow/explore-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Modeling_Workflow/explore-2.png -------------------------------------------------------------------------------- /rmd_images/Modeling_Workflow/explore-3.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Modeling_Workflow/explore-3.png -------------------------------------------------------------------------------- /rmd_images/Modeling_Workflow/plot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Modeling_Workflow/plot-1.png -------------------------------------------------------------------------------- /rmd_images/Modeling_Workflow/plot-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Modeling_Workflow/plot-2.png -------------------------------------------------------------------------------- /rmd_images/Multilevel-Models/unnamed-chunk-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Multilevel-Models/unnamed-chunk-3-1.png -------------------------------------------------------------------------------- /rmd_images/Multilevel-Models/unnamed-chunk-3-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Multilevel-Models/unnamed-chunk-3-2.png -------------------------------------------------------------------------------- /rmd_images/Multilevel-Models/unnamed-chunk-6-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Multilevel-Models/unnamed-chunk-6-1.png 
-------------------------------------------------------------------------------- /rmd_images/Multilevel-Models/unnamed-chunk-7-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Multilevel-Models/unnamed-chunk-7-1.png -------------------------------------------------------------------------------- /rmd_images/Multilevel-Models/unnamed-chunk-8-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Multilevel-Models/unnamed-chunk-8-1.png -------------------------------------------------------------------------------- /rmd_images/Multilevel-Models/unnamed-chunk-9-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Multilevel-Models/unnamed-chunk-9-1.png -------------------------------------------------------------------------------- /rmd_images/Multilevel-Models/unnamed-chunk-9-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Multilevel-Models/unnamed-chunk-9-2.png -------------------------------------------------------------------------------- /rmd_images/Ordinal_Regression/unnamed-chunk-2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Ordinal_Regression/unnamed-chunk-2-1.png -------------------------------------------------------------------------------- 
/rmd_images/Ordinal_Regression/unnamed-chunk-4-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Ordinal_Regression/unnamed-chunk-4-1.png -------------------------------------------------------------------------------- /rmd_images/Ordinal_Regression/unnamed-chunk-6-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Ordinal_Regression/unnamed-chunk-6-1.png -------------------------------------------------------------------------------- /rmd_images/Ordinal_Regression/unnamed-chunk-7-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Ordinal_Regression/unnamed-chunk-7-1.png -------------------------------------------------------------------------------- /rmd_images/Ordinal_Regression/unnamed-chunk-8-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Ordinal_Regression/unnamed-chunk-8-1.png -------------------------------------------------------------------------------- /rmd_images/Parsnip/unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Parsnip/unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /rmd_images/Parsnip/unnamed-chunk-6-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Parsnip/unnamed-chunk-6-1.png -------------------------------------------------------------------------------- /rmd_images/R-Quickstart/histogram-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/R-Quickstart/histogram-1.png -------------------------------------------------------------------------------- /rmd_images/R-Quickstart/line-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/R-Quickstart/line-1.png -------------------------------------------------------------------------------- /rmd_images/R-Quickstart/lollipop-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/R-Quickstart/lollipop-1.png -------------------------------------------------------------------------------- /rmd_images/R-Quickstart/unnamed-chunk-18-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/R-Quickstart/unnamed-chunk-18-1.png -------------------------------------------------------------------------------- /rmd_images/R-Quickstart/unnamed-chunk-22-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/R-Quickstart/unnamed-chunk-22-1.png 
-------------------------------------------------------------------------------- /rmd_images/R_Quotation/unnamed-chunk-1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/R_Quotation/unnamed-chunk-1-1.png -------------------------------------------------------------------------------- /rmd_images/R_Quotation/unnamed-chunk-1-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/R_Quotation/unnamed-chunk-1-2.png -------------------------------------------------------------------------------- /rmd_images/R_Quotation/unnamed-chunk-1-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/R_Quotation/unnamed-chunk-1-3.png -------------------------------------------------------------------------------- /rmd_images/Regression-Model-Tidying/unnamed-chunk-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Regression-Model-Tidying/unnamed-chunk-3-1.png -------------------------------------------------------------------------------- /rmd_images/Regression-Model-Tidying/unnamed-chunk-3-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Regression-Model-Tidying/unnamed-chunk-3-2.png -------------------------------------------------------------------------------- /rmd_images/Survival/unnamed-chunk-10-1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-10-1.png -------------------------------------------------------------------------------- /rmd_images/Survival/unnamed-chunk-11-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-11-1.png -------------------------------------------------------------------------------- /rmd_images/Survival/unnamed-chunk-15-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-15-1.png -------------------------------------------------------------------------------- /rmd_images/Survival/unnamed-chunk-15-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-15-2.png -------------------------------------------------------------------------------- /rmd_images/Survival/unnamed-chunk-16-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-16-1.png -------------------------------------------------------------------------------- /rmd_images/Survival/unnamed-chunk-16-2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-16-2.png -------------------------------------------------------------------------------- /rmd_images/Survival/unnamed-chunk-17-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-17-1.png -------------------------------------------------------------------------------- /rmd_images/Survival/unnamed-chunk-17-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-17-2.png -------------------------------------------------------------------------------- /rmd_images/Survival/unnamed-chunk-18-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-18-1.png -------------------------------------------------------------------------------- /rmd_images/Survival/unnamed-chunk-19-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-19-1.png -------------------------------------------------------------------------------- /rmd_images/Survival/unnamed-chunk-2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-2-1.png 
-------------------------------------------------------------------------------- /rmd_images/Survival/unnamed-chunk-20-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-20-1.png -------------------------------------------------------------------------------- /rmd_images/Survival/unnamed-chunk-21-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-21-1.png -------------------------------------------------------------------------------- /rmd_images/Survival/unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /rmd_images/Survival/unnamed-chunk-6-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-6-1.png -------------------------------------------------------------------------------- /rmd_images/Survival/unnamed-chunk-7-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-7-1.png -------------------------------------------------------------------------------- /rmd_images/Survival/unnamed-chunk-7-2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-7-2.png -------------------------------------------------------------------------------- /rmd_images/Survival/unnamed-chunk-8-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-8-1.png -------------------------------------------------------------------------------- /rmd_images/Survival/unnamed-chunk-9-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-9-1.png -------------------------------------------------------------------------------- /rmd_images/Survival/unnamed-chunk-9-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-9-2.png -------------------------------------------------------------------------------- /rmd_images/Time_Series_Modeling/unnamed-chunk-2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Time_Series_Modeling/unnamed-chunk-2-1.png -------------------------------------------------------------------------------- /rmd_images/Time_Series_Modeling/unnamed-chunk-2-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Time_Series_Modeling/unnamed-chunk-2-2.png 
-------------------------------------------------------------------------------- /rmd_images/Time_Series_Modeling/unnamed-chunk-4-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Time_Series_Modeling/unnamed-chunk-4-1.png -------------------------------------------------------------------------------- /rmd_images/Time_Series_Modeling/unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Time_Series_Modeling/unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /rmd_images/Time_Series_Modeling/unnamed-chunk-5-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Time_Series_Modeling/unnamed-chunk-5-2.png -------------------------------------------------------------------------------- /rmd_images/Time_Series_Modeling/unnamed-chunk-5-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Time_Series_Modeling/unnamed-chunk-5-3.png -------------------------------------------------------------------------------- /rmd_images/Titanic/explore-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Titanic/explore-1.png -------------------------------------------------------------------------------- /rmd_images/Titanic/explore-2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Titanic/explore-2.png -------------------------------------------------------------------------------- /rmd_images/Titanic/explore-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Titanic/explore-3.png -------------------------------------------------------------------------------- /rmd_images/Titanic/imputation-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Titanic/imputation-1.png -------------------------------------------------------------------------------- /rmd_images/Titanic/imputation-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Titanic/imputation-2.png -------------------------------------------------------------------------------- /rmd_images/Titanic/linear-regression-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Titanic/linear-regression-1.png -------------------------------------------------------------------------------- /rmd_images/Titanic/linear-regression-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Titanic/linear-regression-2.png 
-------------------------------------------------------------------------------- /rmd_images/Titanic/linear-regression-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Titanic/linear-regression-3.png -------------------------------------------------------------------------------- /rmd_images/Titanic/linear-regression-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Titanic/linear-regression-4.png -------------------------------------------------------------------------------- /rmd_images/Titanic/linear-regression-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Titanic/linear-regression-5.png -------------------------------------------------------------------------------- /rmd_images/Titanic/logistic-regression-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Titanic/logistic-regression-1.png -------------------------------------------------------------------------------- /rmd_images/Titanic/logistic-regression-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Titanic/logistic-regression-2.png -------------------------------------------------------------------------------- /rmd_images/Titanic/logistic-regression-3.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Titanic/logistic-regression-3.png -------------------------------------------------------------------------------- /rmd_images/Titanic/logistic-regression-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Titanic/logistic-regression-4.png -------------------------------------------------------------------------------- /rmd_images/Vehicles/compare-models-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Vehicles/compare-models-1.png -------------------------------------------------------------------------------- /rmd_images/Vehicles/compare-models-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Vehicles/compare-models-2.png -------------------------------------------------------------------------------- /rmd_images/Visualization_Cookbook/bar-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/bar-1.png -------------------------------------------------------------------------------- /rmd_images/Visualization_Cookbook/bar-2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/bar-2.png -------------------------------------------------------------------------------- /rmd_images/Visualization_Cookbook/bar-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/bar-3.png -------------------------------------------------------------------------------- /rmd_images/Visualization_Cookbook/bar-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/bar-4.png -------------------------------------------------------------------------------- /rmd_images/Visualization_Cookbook/boxplot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/boxplot-1.png -------------------------------------------------------------------------------- /rmd_images/Visualization_Cookbook/bubbleplot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/bubbleplot-1.png -------------------------------------------------------------------------------- /rmd_images/Visualization_Cookbook/dotplot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/dotplot-1.png 
-------------------------------------------------------------------------------- /rmd_images/Visualization_Cookbook/dotplot-rank-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/dotplot-rank-1.png -------------------------------------------------------------------------------- /rmd_images/Visualization_Cookbook/heatmap-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/heatmap-1.png -------------------------------------------------------------------------------- /rmd_images/Visualization_Cookbook/histogram-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/histogram-1.png -------------------------------------------------------------------------------- /rmd_images/Visualization_Cookbook/line-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/line-1.png -------------------------------------------------------------------------------- /rmd_images/Visualization_Cookbook/line-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/line-2.png -------------------------------------------------------------------------------- /rmd_images/Visualization_Cookbook/lollipop-1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/lollipop-1.png -------------------------------------------------------------------------------- /rmd_images/Visualization_Cookbook/pyramid-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/pyramid-1.png -------------------------------------------------------------------------------- /rmd_images/Visualization_Cookbook/ridge-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/ridge-1.png -------------------------------------------------------------------------------- /rmd_images/Visualization_Cookbook/scatter-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/scatter-1.png -------------------------------------------------------------------------------- /rmd_images/Visualization_Cookbook/stackedarea-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/stackedarea-1.png -------------------------------------------------------------------------------- /rmd_images/Visualization_Cookbook/treemap-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/treemap-1.png -------------------------------------------------------------------------------- /rmd_images/Visualization_Cookbook/treemap-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/treemap-2.png -------------------------------------------------------------------------------- /rmd_images/Visualization_Cookbook/violin-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/violin-1.png -------------------------------------------------------------------------------- /rmd_images/hypothesis_testing/unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/hypothesis_testing/unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /rmd_images/hypothesis_testing/unnamed-chunk-6-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/hypothesis_testing/unnamed-chunk-6-1.png -------------------------------------------------------------------------------- /rmd_images/hypothesis_testing/unnamed-chunk-7-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/hypothesis_testing/unnamed-chunk-7-1.png --------------------------------------------------------------------------------