├── .gitignore
├── Anaconda-HowTo.md
├── Data-Science-Codex.Rproj
├── Git-HowTo.md
├── LICENSE
├── Python
    ├── Clustering.ipynb
    ├── Hello_NLP.ipynb
    ├── LSTM-Demo.ipynb
    ├── Pandas_Codeblocks.ipynb
    ├── Pandas_Essentials.Rmd
    ├── Pandas_Essentials.ipynb
    ├── Pandas_Essentials.md
    ├── R-Python-Hybrid.ipynb
    ├── SQL_Databases.ipynb
    ├── Sklearn-Workflow.ipynb
    ├── Tidy_Pandas.ipynb
    ├── plotnine_ggrepel.ipynb
    ├── sklearn_skopt_pipeline.ipynb
    ├── state_of_union_embeddings.ipynb
    └── state_of_union_v2.ipynb
├── R-Development.md
├── R
    ├── Bayesian_Basics.Rmd
    ├── Bayesian_Basics.md
    ├── Bayesian_Modeling.Rmd
    ├── Bayesian_Modeling.md
    ├── Caret.Rmd
    ├── Caret.md
    ├── Clustering.Rmd
    ├── Clustering.md
    ├── Comparing_Bayesian_Packages.Rmd
    ├── Comparing_Bayesian_Packages.md
    ├── Create_Formatted_Spreadsheet.Rmd
    ├── Create_Formatted_Spreadsheet.md
    ├── Distribution_Sampling.Rmd
    ├── Distribution_Sampling.md
    ├── Geospatial_Analysis.Rmd
    ├── Geospatial_Analysis.md
    ├── Modeling_Workflow.Rmd
    ├── Modeling_Workflow.md
    ├── Multilevel-Models.Rmd
    ├── Multilevel-Models.md
    ├── Ordinal_Regression.Rmd
    ├── Ordinal_Regression.md
    ├── Parsnip.Rmd
    ├── Parsnip.md
    ├── Power_Analysis.Rmd
    ├── Power_Analysis.md
    ├── R-Quickstart.Rmd
    ├── R-Quickstart.md
    ├── R_Quotation.Rmd
    ├── R_Quotation.md
    ├── Regression Model Tidying.Rmd
    ├── Regression-Model-Tidying.md
    ├── Rethinking-Tadpoles.Rmd
    ├── Sentiment_Analysis.Rmd
    ├── Sentiment_Analysis.md
    ├── Survival.Rmd
    ├── Survival.md
    ├── Time_Series_Modeling.Rmd
    ├── Time_Series_Modeling.md
    ├── Titanic.Rmd
    ├── Titanic.md
    ├── Titanic_files
    │   └── figure-gfm
    │   │   ├── explore-1.png
    │   │   ├── explore-2.png
    │   │   ├── explore-3.png
    │   │   ├── imputation-1.png
    │   │   ├── imputation-2.png
    │   │   ├── linear-regression-1.png
    │   │   ├── linear-regression-2.png
    │   │   ├── linear-regression-3.png
    │   │   ├── linear-regression-4.png
    │   │   ├── linear-regression-5.png
    │   │   ├── logistic-regression-1.png
    │   │   ├── logistic-regression-2.png
    │   │   ├── logistic-regression-3.png
    │   │   └── logistic-regression-4.png
    ├── Visualization_Cookbook.Rmd
    ├── Visualization_Cookbook.md
    ├── gapminder_summary_report.xlsx
    ├── hypothesis_testing.Rmd
    └── hypothesis_testing.md
├── README.md
├── Resources.md
├── rmd_config.R
└── rmd_images
    ├── Bayes
        └── unnamed-chunk-5-1.png
    ├── Bayesian_Basics
        ├── unnamed-chunk-10-1.png
        ├── unnamed-chunk-3-1.png
        ├── unnamed-chunk-4-1.png
        ├── unnamed-chunk-4-2.png
        ├── unnamed-chunk-5-1.png
        ├── unnamed-chunk-6-1.png
        ├── unnamed-chunk-7-1.png
        ├── unnamed-chunk-7-2.png
        ├── unnamed-chunk-7-3.png
        ├── unnamed-chunk-8-1.png
        └── unnamed-chunk-9-1.png
    ├── Bayesian_Distributions
        ├── unnamed-chunk-3-1.png
        ├── unnamed-chunk-4-1.png
        └── unnamed-chunk-5-1.png
    ├── Bayesian_Modeling
        ├── unnamed-chunk-12-1.png
        ├── unnamed-chunk-12-2.png
        ├── unnamed-chunk-15-1.png
        ├── unnamed-chunk-15-2.png
        ├── unnamed-chunk-15-3.png
        ├── unnamed-chunk-19-1.png
        ├── unnamed-chunk-19-2.png
        ├── unnamed-chunk-19-3.png
        ├── unnamed-chunk-20-1.png
        ├── unnamed-chunk-20-2.png
        ├── unnamed-chunk-20-3.png
        ├── unnamed-chunk-21-1.png
        ├── unnamed-chunk-23-1.png
        └── unnamed-chunk-4-1.png
    ├── Caret
        ├── results-1.png
        └── results-2.png
    ├── Clustering
        ├── unnamed-chunk-3-1.png
        └── unnamed-chunk-3-2.png
    ├── Comparing_Bayesian_Packages
        ├── unnamed-chunk-10-1.png
        ├── unnamed-chunk-10-2.png
        ├── unnamed-chunk-11-1.png
        ├── unnamed-chunk-11-2.png
        ├── unnamed-chunk-11-3.png
        ├── unnamed-chunk-12-1.png
        ├── unnamed-chunk-6-1.png
        └── unnamed-chunk-6-2.png
    ├── Distribution_Sampling
        ├── unnamed-chunk-3-1.png
        └── unnamed-chunk-4-1.png
    ├── Geospatial_Analysis
        ├── locale-1.png
        ├── unnamed-chunk-2-1.png
        ├── unnamed-chunk-3-1.png
        └── unnamed-chunk-3-2.png
    ├── Modeling_Workflow
        ├── explore-1.png
        ├── explore-2.png
        ├── explore-3.png
        ├── plot-1.png
        └── plot-2.png
    ├── Multilevel-Models
        ├── unnamed-chunk-3-1.png
        ├── unnamed-chunk-3-2.png
        ├── unnamed-chunk-6-1.png
        ├── unnamed-chunk-7-1.png
        ├── unnamed-chunk-8-1.png
        ├── unnamed-chunk-9-1.png
        └── unnamed-chunk-9-2.png
    ├── Ordinal_Regression
        ├── unnamed-chunk-2-1.png
        ├── unnamed-chunk-4-1.png
        ├── unnamed-chunk-6-1.png
        ├── unnamed-chunk-7-1.png
        └── unnamed-chunk-8-1.png
    ├── Parsnip
        ├── unnamed-chunk-5-1.png
        └── unnamed-chunk-6-1.png
    ├── R-Quickstart
        ├── histogram-1.png
        ├── line-1.png
        ├── lollipop-1.png
        ├── unnamed-chunk-18-1.png
        └── unnamed-chunk-22-1.png
    ├── R_Quotation
        ├── unnamed-chunk-1-1.png
        ├── unnamed-chunk-1-2.png
        └── unnamed-chunk-1-3.png
    ├── Regression-Model-Tidying
        ├── unnamed-chunk-3-1.png
        └── unnamed-chunk-3-2.png
    ├── Survival
        ├── unnamed-chunk-10-1.png
        ├── unnamed-chunk-11-1.png
        ├── unnamed-chunk-15-1.png
        ├── unnamed-chunk-15-2.png
        ├── unnamed-chunk-16-1.png
        ├── unnamed-chunk-16-2.png
        ├── unnamed-chunk-17-1.png
        ├── unnamed-chunk-17-2.png
        ├── unnamed-chunk-18-1.png
        ├── unnamed-chunk-19-1.png
        ├── unnamed-chunk-2-1.png
        ├── unnamed-chunk-20-1.png
        ├── unnamed-chunk-21-1.png
        ├── unnamed-chunk-5-1.png
        ├── unnamed-chunk-6-1.png
        ├── unnamed-chunk-7-1.png
        ├── unnamed-chunk-7-2.png
        ├── unnamed-chunk-8-1.png
        ├── unnamed-chunk-9-1.png
        └── unnamed-chunk-9-2.png
    ├── Time_Series_Modeling
        ├── unnamed-chunk-2-1.png
        ├── unnamed-chunk-2-2.png
        ├── unnamed-chunk-4-1.png
        ├── unnamed-chunk-5-1.png
        ├── unnamed-chunk-5-2.png
        └── unnamed-chunk-5-3.png
    ├── Titanic
        ├── explore-1.png
        ├── explore-2.png
        ├── explore-3.png
        ├── imputation-1.png
        ├── imputation-2.png
        ├── linear-regression-1.png
        ├── linear-regression-2.png
        ├── linear-regression-3.png
        ├── linear-regression-4.png
        ├── linear-regression-5.png
        ├── logistic-regression-1.png
        ├── logistic-regression-2.png
        ├── logistic-regression-3.png
        └── logistic-regression-4.png
    ├── Vehicles
        ├── compare-models-1.png
        └── compare-models-2.png
    ├── Visualization_Cookbook
        ├── bar-1.png
        ├── bar-2.png
        ├── bar-3.png
        ├── bar-4.png
        ├── boxplot-1.png
        ├── bubbleplot-1.png
        ├── dotplot-1.png
        ├── dotplot-rank-1.png
        ├── heatmap-1.png
        ├── histogram-1.png
        ├── line-1.png
        ├── line-2.png
        ├── lollipop-1.png
        ├── pyramid-1.png
        ├── ridge-1.png
        ├── scatter-1.png
        ├── stackedarea-1.png
        ├── treemap-1.png
        ├── treemap-2.png
        └── violin-1.png
    └── hypothesis_testing
        ├── unnamed-chunk-5-1.png
        ├── unnamed-chunk-6-1.png
        └── unnamed-chunk-7-1.png


/.gitignore:
--------------------------------------------------------------------------------
1 | source/.ipynb_checkpoints
2 | source/.ipynb_checkpoints/*
3 | source/.DS_Store
4 | .DS_Store
5 | .ipynb*
6 | .Rproj.user
7 | 


--------------------------------------------------------------------------------
/Anaconda-HowTo.md:
--------------------------------------------------------------------------------
 1 | ## Reference for managing an Anaconda environment
 2 | 
 3 | List all installed environments:
 4 | 
 5 | ```conda env list```
 6 | 
 7 | Activate a conda environment:
 8 | 
 9 | ```conda activate <env-name>```
10 | 
11 | List all packages installed in the currently active environment:
12 | 
13 | ```conda list```
14 | 
15 | [Updating all packages](https://www.anaconda.com/keeping-anaconda-date/):
16 | 
17 | ```conda update --all```
18 | 
19 | ## References
20 | 
21 | * [Conda cheatsheet](https://docs.conda.io/projects/conda/en/4.6.0/_downloads/52a95608c49671267e40c689e0bc00ca/conda-cheatsheet.pdf)
22 | * [Conda Environments Documentation](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#)
23 | 


--------------------------------------------------------------------------------
/Data-Science-Codex.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 


--------------------------------------------------------------------------------
/Git-HowTo.md:
--------------------------------------------------------------------------------
 1 | Process to get your code up on the server:
 2 | 
 3 | 1. `git init` (if the repo doesn’t already exist)
 4 | 2. `git add .` (stages all files in the current directory)
 5 | 3. `git commit -m "name"` (commits the staged changes with the message "name")
 6 | 4. `git remote add origin https://github.com/<username>/<reponame>.git` (if you haven’t set the URL yet)
 7 | 5. `git push -u origin master` (pushes the code to the server)
 8 | 
 9 | To untrack all files (remove them from the index while keeping them on disk), e.g. after updating `.gitignore`:
10 | 
11 | ```git rm -r --cached .```
12 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 Jesse Cambon
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Python/Hello_NLP.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "Testing out some NLP techniques using the built-in Wall Street Journal dataset in NLTK"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 13,
 13 |    "metadata": {},
 14 |    "outputs": [],
 15 |    "source": [
 16 |     "import nltk\n",
 17 |     "from nltk.stem.wordnet import *\n",
 18 |     "from nltk.book import *\n",
 19 |     "from sklearn.feature_extraction.text import CountVectorizer\n",
 20 |     "from sklearn.decomposition import LatentDirichletAllocation\n",
 21 |     "from nltk.stem.snowball import PorterStemmer\n",
 22 |     "default_stopwords = set(nltk.corpus.stopwords.words('english'))"
 23 |    ]
 24 |   },
 25 |   {
 26 |    "cell_type": "markdown",
 27 |    "metadata": {},
 28 |    "source": [
 29 |     "https://www.kaggle.com/arthurtok/spooky-nlp-and-topic-modelling-tutorial"
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "markdown",
 34 |    "metadata": {},
 35 |    "source": [
 36 |     "## Topic Modeling with LDA"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": 19,
 42 |    "metadata": {},
 43 |    "outputs": [],
 44 |    "source": [
 45 |     "WSJ_freq = nltk.FreqDist(text7)  # token counts; the Stemming cells below call WSJ_freq.most_common()\n",
 46 |     "# Stemmer and 7-component online LDA model; both are reused by later cells\n",
 47 |     "stemmer = PorterStemmer()\n",
 48 |     "lda = LatentDirichletAllocation(n_components=7, max_iter=5,\n",
 49 |     "                                learning_method = 'online',\n",
 50 |     "                                learning_offset = 50.,\n",
 51 |     "                                random_state = 0)\n",
 52 |     "# CountVectorizer subclass whose analyzer lemmatizes every token with WordNetLemmatizer\n",
 53 |     "lemm = WordNetLemmatizer()\n",
 54 |     "class LemmaCountVectorizer(CountVectorizer):\n",
 55 |     "    def build_analyzer(self):\n",
 56 |     "        analyzer = super(LemmaCountVectorizer, self).build_analyzer()\n",
 57 |     "        return lambda doc: (lemm.lemmatize(w) for w in analyzer(doc))\n",
 58 |     "# Vectorizer settings: drop English stop words and terms outside the min_df/max_df bounds\n",
 59 |     "tf_vectorizer = LemmaCountVectorizer(max_df=0.95, \n",
 60 |     "                                     min_df=2,\n",
 61 |     "                                     stop_words='english',\n",
 62 |     "                                     decode_error='ignore')\n",
 63 |     "\n",
 64 |     "tf = tf_vectorizer.fit_transform(text7)"
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "code",
 69 |    "execution_count": 20,
 70 |    "metadata": {},
 71 |    "outputs": [],
 72 |    "source": [
 73 |     "# train lda\n",
 74 |     "wsj_lda = lda.fit(tf)"
 75 |    ]
 76 |   },
 77 |   {
 78 |    "cell_type": "code",
 79 |    "execution_count": 21,
 80 |    "metadata": {},
 81 |    "outputs": [
 82 |     {
 83 |      "name": "stdout",
 84 |      "output_type": "stream",
 85 |      "text": [
 86 |       "\n",
 87 |       "Topics in LDA model: \n",
 88 |       "\n",
 89 |       "Topic #0:million program corp bond profit time analyst industry\n",
 90 |       "======================================================================\n",
 91 |       "\n",
 92 |       "Topic #1:market say trading sale rrb cent plan 30\n",
 93 |       "======================================================================\n",
 94 |       "\n",
 95 |       "Topic #2:company stock billion month investor bank buy ich\n",
 96 |       "======================================================================\n",
 97 |       "\n",
 98 |       "Topic #3:year new 000 future lrb 50 quarter service\n",
 99 |       "======================================================================\n",
100 |       "\n",
101 |       "Topic #4:said mr index business investment 10 rate contract\n",
102 |       "======================================================================\n",
103 |       "\n",
104 |       "Topic #5:price president york day exchange rose term yesterday\n",
105 |       "======================================================================\n",
106 |       "\n",
107 |       "Topic #6:share issue government executive house financial october trader\n",
108 |       "======================================================================\n"
109 |      ]
110 |     }
111 |    ],
112 |    "source": [
113 |     "n_top_words = 8\n",
114 |     "\n",
115 |     "# Print the n_top_words highest-weighted feature names for each row of model.components_\n",
116 |     "def print_top_words(model, feature_names, n_top_words):\n",
117 |     "    for index, topic in enumerate(model.components_):\n",
118 |     "        message = \"\\nTopic #{}:\".format(index)\n",
119 |     "        message += \" \".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1 :-1]])\n",
120 |     "        print(message)\n",
121 |     "        print(\"=\"*70)\n",
122 |     "# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out() when it exists\n",
123 |     "print(\"\\nTopics in LDA model: \")\n",
124 |     "tf_feature_names = tf_vectorizer.get_feature_names_out() if hasattr(tf_vectorizer, 'get_feature_names_out') else tf_vectorizer.get_feature_names()\n",
125 |     "print_top_words(wsj_lda, tf_feature_names, n_top_words)"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "markdown",
130 |    "metadata": {},
131 |    "source": [
132 |     "## Stemming"
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "code",
137 |    "execution_count": 63,
138 |    "metadata": {},
139 |    "outputs": [
140 |     {
141 |      "data": {
142 |       "text/plain": [
143 |        "[('said', 628),\n",
144 |        " ('million', 383),\n",
145 |        " ('compani', 260),\n",
146 |        " ('year', 212),\n",
147 |        " ('say', 210),\n",
148 |        " ('would', 209),\n",
149 |        " ('market', 176),\n",
150 |        " ('new', 165),\n",
151 |        " ('new', 162),\n",
152 |        " ('trade', 162),\n",
153 |        " ('billion', 159),\n",
154 |        " ('also', 147),\n",
155 |        " ('stock', 136),\n",
156 |        " ('presid', 133),\n",
157 |        " ('one', 132)]"
158 |       ]
159 |      },
160 |      "execution_count": 63,
161 |      "metadata": {},
162 |      "output_type": "execute_result"
163 |     }
164 |    ],
165 |    "source": [
166 |     "[ (stemmer.stem(w),f) for (w,f) in WSJ_freq.most_common(80) if w.lower() not in default_stopwords and w.isalpha()]"
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "code",
171 |    "execution_count": 61,
172 |    "metadata": {},
173 |    "outputs": [
174 |     {
175 |      "data": {
176 |       "text/plain": [
177 |        "[(',', 4885),\n",
178 |        " ('.', 3828),\n",
179 |        " ('*-1', 1123),\n",
180 |        " ('0', 1099),\n",
181 |        " ('*', 965),\n",
182 |        " (\"'s\", 864),\n",
183 |        " ('*T*-1', 806),\n",
184 |        " ('*U*', 744),\n",
185 |        " ('$', 718),\n",
186 |        " ('``', 702),\n",
187 |        " (\"''\", 684)]"
188 |       ]
189 |      },
190 |      "execution_count": 61,
191 |      "metadata": {},
192 |      "output_type": "execute_result"
193 |     }
194 |    ],
195 |    "source": [
196 |     "[ (w,f) for (w,f) in WSJ_freq.most_common(20) if w.lower() not in default_stopwords ]"
197 |    ]
198 |   }
199 |  ],
200 |  "metadata": {
201 |   "kernelspec": {
202 |    "display_name": "Python 3",
203 |    "language": "python",
204 |    "name": "python3"
205 |   },
206 |   "language_info": {
207 |    "codemirror_mode": {
208 |     "name": "ipython",
209 |     "version": 3
210 |    },
211 |    "file_extension": ".py",
212 |    "mimetype": "text/x-python",
213 |    "name": "python",
214 |    "nbconvert_exporter": "python",
215 |    "pygments_lexer": "ipython3",
216 |    "version": "3.6.8"
217 |   }
218 |  },
219 |  "nbformat": 4,
220 |  "nbformat_minor": 2
221 | }
222 | 


--------------------------------------------------------------------------------
/Python/Pandas_Essentials.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Pandas Essentials"
 3 | author: "Jesse Cambon"
 4 | date: "`r format(Sys.time(), '%d %B, %Y')`"
 5 | output:
 6 |   github_document:
 7 |     toc: true
 8 | ---
 9 | 
10 | https://github.com/rstudio/reticulate/#python-in-r-markdown 
11 | 
12 | ```{r knit-settings, include=FALSE}
13 | library(here)
14 | source(here("rmd_config.R"))
15 | ```
16 | 
17 | 
18 | ```{python}
19 | import pandas as pd
20 | from rpy2.robjects import r,pandas2ri
21 | pandas2ri.activate()
22 | 
23 | iris = r.data('iris')
24 | 
25 | iris.info()
26 | #mtcars.info()
27 | 
28 | ```
29 | 
30 | 


--------------------------------------------------------------------------------
/Python/Pandas_Essentials.md:
--------------------------------------------------------------------------------
 1 | Pandas Essentials
 2 | ================
 3 | Jesse Cambon
 4 | 11 April, 2020
 5 | 
 6 | <https://github.com/rstudio/reticulate/#python-in-r-markdown>
 7 | 
 8 | ``` r
 9 | library(reticulate)
10 | library(knitr)
11 | ```
12 | 
13 | ``` python
14 | import pandas as pd
15 | mtcars = r.mtcars
16 | ```
17 | 
18 | Counting
19 | 
20 | ``` python
21 | am_vs = mtcars.groupby(['am','vs']).size().reset_index(name='count').\
22 |   sort_values('count',ascending=False)
23 |   
24 | am_vs
25 | ```
26 | 
27 |     ##     am   vs  count
28 |     ## 0  0.0  0.0     12
29 |     ## 1  0.0  1.0      7
30 |     ## 3  1.0  1.0      7
31 |     ## 2  1.0  0.0      6
32 | 


--------------------------------------------------------------------------------
/Python/SQL_Databases.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "Code for creating and using a SQL database in Python. We will use pandas, SQLAlchemy, and SQLite.\n",
  8 |     "\n",
  9 |     "References:\n",
 10 |     "\n",
 11 |     "* https://towardsdatascience.com/sqlalchemy-python-tutorial-79a577141a91\n",
 12 |     "* https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_sql.html\n",
 13 |     "* https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_sql.html"
 14 |    ]
 15 |   },
 16 |   {
 17 |    "cell_type": "code",
 18 |    "execution_count": 3,
 19 |    "metadata": {},
 20 |    "outputs": [],
 21 |    "source": [
 22 |     "from sqlalchemy import create_engine\n",
 23 |     "import pandas as pd\n",
 24 |     "\n",
 25 |     "# Create local in-memory SQL database using sqlite\n",
 26 |     "# Note that you could also create this database in a .db file if desired\n",
 27 |     "engine = create_engine('sqlite://', echo=False)\n",
 28 |     " \n",
 29 |     "# Create some sample data \n",
 30 |     "df = pd.DataFrame({'name' : ['User 1', 'User 2', 'User 3']})\n",
 31 |     " \n",
 32 |     "# Put the sample data in the database\n",
 33 |     "df.to_sql('users', con=engine,index=False)"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "code",
 38 |    "execution_count": 4,
 39 |    "metadata": {},
 40 |    "outputs": [
 41 |     {
 42 |      "data": {
 43 |       "text/html": [
 44 |        "<div>\n",
 45 |        "<style scoped>\n",
 46 |        "    .dataframe tbody tr th:only-of-type {\n",
 47 |        "        vertical-align: middle;\n",
 48 |        "    }\n",
 49 |        "\n",
 50 |        "    .dataframe tbody tr th {\n",
 51 |        "        vertical-align: top;\n",
 52 |        "    }\n",
 53 |        "\n",
 54 |        "    .dataframe thead th {\n",
 55 |        "        text-align: right;\n",
 56 |        "    }\n",
 57 |        "</style>\n",
 58 |        "<table border=\"1\" class=\"dataframe\">\n",
 59 |        "  <thead>\n",
 60 |        "    <tr style=\"text-align: right;\">\n",
 61 |        "      <th></th>\n",
 62 |        "      <th>name</th>\n",
 63 |        "    </tr>\n",
 64 |        "  </thead>\n",
 65 |        "  <tbody>\n",
 66 |        "    <tr>\n",
 67 |        "      <td>0</td>\n",
 68 |        "      <td>User 1</td>\n",
 69 |        "    </tr>\n",
 70 |        "    <tr>\n",
 71 |        "      <td>1</td>\n",
 72 |        "      <td>User 2</td>\n",
 73 |        "    </tr>\n",
 74 |        "    <tr>\n",
 75 |        "      <td>2</td>\n",
 76 |        "      <td>User 3</td>\n",
 77 |        "    </tr>\n",
 78 |        "  </tbody>\n",
 79 |        "</table>\n",
 80 |        "</div>"
 81 |       ],
 82 |       "text/plain": [
 83 |        "     name\n",
 84 |        "0  User 1\n",
 85 |        "1  User 2\n",
 86 |        "2  User 3"
 87 |       ]
 88 |      },
 89 |      "execution_count": 4,
 90 |      "metadata": {},
 91 |      "output_type": "execute_result"
 92 |     }
 93 |    ],
 94 |    "source": [
 95 |     "# Examine our database\n",
 96 |     "pd.read_sql(\"select * from users\",engine)"
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "code",
101 |    "execution_count": 6,
102 |    "metadata": {},
103 |    "outputs": [],
104 |    "source": [
105 |     "df1 = pd.DataFrame({'name' : ['User 8', 'User 9', 'User 10']}) # create more data\n",
106 |     "\n",
107 |     "# Append our extra data into the database\n",
108 |     "df1.to_sql('users', con=engine,index=False,if_exists='append')"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": 7,
114 |    "metadata": {},
115 |    "outputs": [
116 |     {
117 |      "data": {
118 |       "text/html": [
119 |        "<div>\n",
120 |        "<style scoped>\n",
121 |        "    .dataframe tbody tr th:only-of-type {\n",
122 |        "        vertical-align: middle;\n",
123 |        "    }\n",
124 |        "\n",
125 |        "    .dataframe tbody tr th {\n",
126 |        "        vertical-align: top;\n",
127 |        "    }\n",
128 |        "\n",
129 |        "    .dataframe thead th {\n",
130 |        "        text-align: right;\n",
131 |        "    }\n",
132 |        "</style>\n",
133 |        "<table border=\"1\" class=\"dataframe\">\n",
134 |        "  <thead>\n",
135 |        "    <tr style=\"text-align: right;\">\n",
136 |        "      <th></th>\n",
137 |        "      <th>name</th>\n",
138 |        "    </tr>\n",
139 |        "  </thead>\n",
140 |        "  <tbody>\n",
141 |        "    <tr>\n",
142 |        "      <td>0</td>\n",
143 |        "      <td>User 1</td>\n",
144 |        "    </tr>\n",
145 |        "    <tr>\n",
146 |        "      <td>1</td>\n",
147 |        "      <td>User 2</td>\n",
148 |        "    </tr>\n",
149 |        "    <tr>\n",
150 |        "      <td>2</td>\n",
151 |        "      <td>User 3</td>\n",
152 |        "    </tr>\n",
153 |        "    <tr>\n",
154 |        "      <td>3</td>\n",
155 |        "      <td>User 8</td>\n",
156 |        "    </tr>\n",
157 |        "    <tr>\n",
158 |        "      <td>4</td>\n",
159 |        "      <td>User 9</td>\n",
160 |        "    </tr>\n",
161 |        "    <tr>\n",
162 |        "      <td>5</td>\n",
163 |        "      <td>User 10</td>\n",
164 |        "    </tr>\n",
165 |        "  </tbody>\n",
166 |        "</table>\n",
167 |        "</div>"
168 |       ],
169 |       "text/plain": [
170 |        "      name\n",
171 |        "0   User 1\n",
172 |        "1   User 2\n",
173 |        "2   User 3\n",
174 |        "3   User 8\n",
175 |        "4   User 9\n",
176 |        "5  User 10"
177 |       ]
178 |      },
179 |      "execution_count": 7,
180 |      "metadata": {},
181 |      "output_type": "execute_result"
182 |     }
183 |    ],
184 |    "source": [
185 |     "# Re-examine our database\n",
186 |     "pd.read_sql(\"select * from users\",engine)"
187 |    ]
188 |   },
189 |   {
190 |    "cell_type": "code",
191 |    "execution_count": null,
192 |    "metadata": {},
193 |    "outputs": [],
194 |    "source": []
195 |   }
196 |  ],
197 |  "metadata": {
198 |   "kernelspec": {
199 |    "display_name": "Python 3",
200 |    "language": "python",
201 |    "name": "python3"
202 |   },
203 |   "language_info": {
204 |    "codemirror_mode": {
205 |     "name": "ipython",
206 |     "version": 3
207 |    },
208 |    "file_extension": ".py",
209 |    "mimetype": "text/x-python",
210 |    "name": "python",
211 |    "nbconvert_exporter": "python",
212 |    "pygments_lexer": "ipython3",
213 |    "version": "3.7.4"
214 |   }
215 |  },
216 |  "nbformat": 4,
217 |  "nbformat_minor": 4
218 | }
219 | 


--------------------------------------------------------------------------------
/R-Development.md:
--------------------------------------------------------------------------------
 1 | # Notes on Developing R Packages
 2 | 
 3 | * The `man/` documentation files are generated by roxygen2 from the code files in the `R/` directory; run `devtools::document()` to regenerate them
 4 | * Test package with `devtools::test()` (see `/tests` directory)
 5 | * Use `devtools::check()` to check for issues (also runs `devtools::test()`)
 6 | * Use `devtools::build()` to build the package with vignettes included (creates .tar.gz file)
 7 | * Check package on other environments for CRAN release using [rhub::check_for_cran()](https://r-hub.github.io/rhub/reference/check_for_cran.html)
 8 | * To run all code examples in the package documentation, use this command from devtools: `run_examples(test=TRUE)`
 9 | * Use [pkgdown::build_site()](https://pkgdown.r-lib.org/reference/build_site.html) to build the website.
10 | 
11 | ### Development Resources
12 | * General Instructions: http://r-pkgs.had.co.nz/
13 | * More general instructions: https://rstats-pkgs.readthedocs.io
14 | * roxygen2 : https://cran.r-project.org/web/packages/roxygen2/
15 | * Devtools cheat sheet: https://www.rstudio.com/wp-content/uploads/2015/03/devtools-cheatsheet.pdf
16 | 


--------------------------------------------------------------------------------
/R/Bayesian_Basics.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Bayesian Basics"
  3 | author: "Jesse Cambon"
  4 | date: "`r format(Sys.time(), '%d %B, %Y')`"
  5 | output:
  6 |   github_document:
  7 |     toc: true
  8 | ---
  9 | 
 10 | 
 11 | ```{r knit-settings, include=FALSE}
 12 | library(here)
 13 | source(here("rmd_config.R"))
 14 | ```
 15 | 
 16 | * https://github.com/easystats/see/issues/48
 17 | * https://easystats.github.io/see/articles/bayestestR.html
 18 | * https://cran.r-project.org/web/packages/bayestestR/vignettes/bayes_factors.html
 19 | 
 20 | ```{r}
 21 | library(rstanarm)
 22 | library(tidyverse)
 23 | library(bayestestR)
 24 | library(bayesplot)
 25 | library(wesanderson)
 26 | library(broom.mixed)
 27 | # Set mc.cores to the detected core count, as the rstanarm startup message recommends
 28 | options(mc.cores = parallel::detectCores()) 
 29 | # Model `extra` on `group` with a `(1 | ID)` group-level intercept term, fit to the `sleep` data
 30 | model <- stan_glmer(extra ~ group + (1 | ID), data = sleep,
 31 |                   prior = normal(0, 3, autoscale = FALSE))
 32 | # Print the fitted model's summary
 33 | summary(model)
 34 | # NOTE(review): tidy() presumably dispatches to the broom.mixed method for this fit — confirm
 35 | tidy(model)
 36 | ```
 37 | 
 38 | https://github.com/easystats/see/issues/48
 39 | 
 40 | ```{r}
 41 | #My_first_BF <- bayesfactor_parameters(model, null = c(-1, 1))
 42 | # Density estimates for the fitted model and for draws simulated from its priors
 43 | density <- estimate_density(model)
 44 | sim_prior <- simulate_prior(model)
 45 | density_prior <- estimate_density(sim_prior)
 46 | 
 47 | # Stack both density tables into one data frame, tagging each row's `type` as 'posterior' or 'prior'
 48 | post_prior <- density %>% mutate(type = 'posterior') %>%
 49 |   bind_rows(density_prior %>% mutate(type = 'prior'))
 50 | 
 51 | ```
 52 | 
 53 | Plot the prior and posterior distributions for the fixed effects
 54 | 
 55 | ```{r}
 56 | ggplot(data = post_prior, aes(x = x ,y = y, fill = type)) + # density curves, filled by `type` (prior vs posterior)
 57 |   theme_bw() +
 58 |   facet_wrap(~Parameter, ncol = 1, scales='free') + # one panel per Parameter in a single column, each with free scales
 59 |   geom_ribbon( mapping = aes(
 60 |     ymin = 0,
 61 |     ymax = y  ),
 62 |   alpha = .8) +   # shade from 0 up to the density value; partial transparency keeps overlapping areas visible
 63 |   scale_fill_manual(values=c('steelblue', 'grey'))
 64 | 
 65 | 
 66 | #  scale_x_continuous(expand=expand_scale(mult = c(-.4, -.4)))
 67 | ```
 68 | 
 69 | ```{r}
 70 | mcmc_trace(model)
 71 | mcmc_areas(model)
 72 | ```
 73 | 
 74 | 
 75 | https://easystats.github.io/see/articles/bayestestR.html
 76 | 
 77 | ```{r}
 78 | plot(model)
 79 | 
 80 | p_direction(model)
 81 | ```
 82 | 
 83 | ```{r, fig.height = 10, fig.width = 8}
 84 | plot(p_direction(model, effects = "all", component = "all"))
 85 | ```
 86 | 
 87 | 
 88 | Posterior predictive checks: compare data simulated from the fitted model against the observed data
 89 | 
 90 | ```{r}
 91 | pp_check(model)
 92 | 
 93 | pp_check(model, plotfun = "hist")
 94 | 
 95 | pp_check(model, plotfun = "intervals")
 96 | ```
 97 | 
 98 | 
 99 | ```{r}
100 | ppc_intervals_grouped(
101 |   y = sleep$extra,
102 |   yrep = posterior_predict(model),
103 |   x = as.numeric(sleep$group),
104 |   prob = 0.5,
105 |   group = sleep$ID
106 | ) 
107 | ```
108 | 
109 | 


--------------------------------------------------------------------------------
/R/Bayesian_Basics.md:
--------------------------------------------------------------------------------
  1 | Bayesian Basics
  2 | ================
  3 | Jesse Cambon
  4 | 06 February, 2021
  5 | 
  6 |   - <https://github.com/easystats/see/issues/48>
  7 |   - <https://easystats.github.io/see/articles/bayestestR.html>
  8 |   - <https://cran.r-project.org/web/packages/bayestestR/vignettes/bayes_factors.html>
  9 | 
 10 | <!-- end list -->
 11 | 
 12 | ``` r
 13 | library(rstanarm)
 14 | ```
 15 | 
 16 |     ## Loading required package: Rcpp
 17 | 
 18 |     ## This is rstanarm version 2.21.1
 19 | 
 20 |     ## - See https://mc-stan.org/rstanarm/articles/priors for changes to default priors!
 21 | 
 22 |     ## - Default priors may change, so it's safest to specify priors, even if equivalent to the defaults.
 23 | 
 24 |     ## - For execution on a local, multicore CPU with excess RAM we recommend calling
 25 | 
 26 |     ##   options(mc.cores = parallel::detectCores())
 27 | 
 28 | ``` r
 29 | library(tidyverse)
 30 | ```
 31 | 
 32 |     ## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
 33 | 
 34 |     ## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
 35 |     ## ✓ tibble  3.0.6     ✓ dplyr   1.0.4
 36 |     ## ✓ tidyr   1.1.2     ✓ forcats 0.5.1
 37 |     ## ✓ readr   1.4.0
 38 | 
 39 |     ## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
 40 |     ## x dplyr::filter() masks stats::filter()
 41 |     ## x dplyr::lag()    masks stats::lag()
 42 | 
 43 | ``` r
 44 | library(bayestestR)
 45 | ```
 46 | 
 47 |     ## Note: The default CI width (currently `ci=0.89`) might change in future versions (see https://github.com/easystats/bayestestR/discussions/250). To prevent any issues, please set it explicitly when using bayestestR functions, via the 'ci' argument.
 48 | 
 49 | ``` r
 50 | library(bayesplot)
 51 | ```
 52 | 
 53 |     ## This is bayesplot version 1.8.0
 54 | 
 55 |     ## - Online documentation and vignettes at mc-stan.org/bayesplot
 56 | 
 57 |     ## - bayesplot theme set to bayesplot::theme_default()
 58 | 
 59 |     ##    * Does _not_ affect other ggplot2 plots
 60 | 
 61 |     ##    * See ?bayesplot_theme_set for details on theme setting
 62 | 
 63 | ``` r
 64 | library(wesanderson)
 65 | library(broom.mixed)
 66 | ```
 67 | 
 68 |     ## Registered S3 method overwritten by 'broom.mixed':
 69 |     ##   method      from 
 70 |     ##   tidy.gamlss broom
 71 | 
 72 | ``` r
 73 | options(mc.cores = parallel::detectCores()) 
 74 | 
 75 | model <- stan_glmer(extra ~ group + (1 | ID), data = sleep,
 76 |                   prior = normal(0, 3, autoscale = FALSE))
 77 | ```
 78 | 
 79 |     ## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
 80 |     ## Running the chains for more iterations may help. See
 81 |     ## http://mc-stan.org/misc/warnings.html#tail-ess
 82 | 
 83 | ``` r
 84 | summary(model)
 85 | ```
 86 | 
 87 |     ## 
 88 |     ## Model Info:
 89 |     ##  function:     stan_glmer
 90 |     ##  family:       gaussian [identity]
 91 |     ##  formula:      extra ~ group + (1 | ID)
 92 |     ##  algorithm:    sampling
 93 |     ##  sample:       4000 (posterior sample size)
 94 |     ##  priors:       see help('prior_summary')
 95 |     ##  observations: 20
 96 |     ##  groups:       ID (10)
 97 |     ## 
 98 |     ## Estimates:
 99 |     ##                                     mean   sd   10%   50%   90%
100 |     ## (Intercept)                        0.8    0.6  0.0   0.8   1.6 
101 |     ## group2                             1.5    0.5  0.9   1.5   2.1 
102 |     ## b[(Intercept) ID:1]               -0.2    0.8 -1.2  -0.2   0.8 
103 |     ## b[(Intercept) ID:2]               -1.5    0.9 -2.6  -1.5  -0.3 
104 |     ## b[(Intercept) ID:3]               -0.9    0.8 -1.9  -0.8   0.2 
105 |     ## b[(Intercept) ID:4]               -1.6    0.9 -2.8  -1.7  -0.5 
106 |     ## b[(Intercept) ID:5]               -1.3    0.9 -2.4  -1.3  -0.2 
107 |     ## b[(Intercept) ID:6]                1.8    0.9  0.6   1.8   3.0 
108 |     ## b[(Intercept) ID:7]                2.4    1.0  1.1   2.4   3.6 
109 |     ## b[(Intercept) ID:8]               -0.3    0.8 -1.2  -0.3   0.7 
110 |     ## b[(Intercept) ID:9]                0.6    0.8 -0.5   0.6   1.6 
111 |     ## b[(Intercept) ID:10]               0.9    0.8 -0.1   0.9   1.9 
112 |     ## sigma                              1.1    0.3  0.8   1.0   1.5 
113 |     ## Sigma[ID:(Intercept),(Intercept)]  2.8    1.8  1.1   2.5   5.1 
114 |     ## 
115 |     ## Fit Diagnostics:
116 |     ##            mean   sd   10%   50%   90%
117 |     ## mean_PPD 1.5    0.4  1.1   1.5   2.0  
118 |     ## 
119 |     ## The mean_ppd is the sample average posterior predictive distribution of the outcome variable (for details see help('summary.stanreg')).
120 |     ## 
121 |     ## MCMC diagnostics
122 |     ##                                   mcse Rhat n_eff
123 |     ## (Intercept)                       0.0  1.0  1102 
124 |     ## group2                            0.0  1.0  3499 
125 |     ## b[(Intercept) ID:1]               0.0  1.0  1927 
126 |     ## b[(Intercept) ID:2]               0.0  1.0   825 
127 |     ## b[(Intercept) ID:3]               0.0  1.0  1240 
128 |     ## b[(Intercept) ID:4]               0.0  1.0   690 
129 |     ## b[(Intercept) ID:5]               0.0  1.0  1027 
130 |     ## b[(Intercept) ID:6]               0.0  1.0   990 
131 |     ## b[(Intercept) ID:7]               0.0  1.0   740 
132 |     ## b[(Intercept) ID:8]               0.0  1.0  1652 
133 |     ## b[(Intercept) ID:9]               0.0  1.0  1405 
134 |     ## b[(Intercept) ID:10]              0.0  1.0  1447 
135 |     ## sigma                             0.0  1.0   345 
136 |     ## Sigma[ID:(Intercept),(Intercept)] 0.1  1.0   984 
137 |     ## mean_PPD                          0.0  1.0  4029 
138 |     ## log-posterior                     0.3  1.0   299 
139 |     ## 
140 |     ## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
141 | 
142 | ``` r
143 | tidy(model)
144 | ```
145 | 
146 |     ## # A tibble: 2 x 3
147 |     ##   term        estimate std.error
148 |     ##   <chr>          <dbl>     <dbl>
149 |     ## 1 (Intercept)    0.781     0.616
150 |     ## 2 group2         1.55      0.450
151 | 
152 | <https://github.com/easystats/see/issues/48>
153 | 
154 | ``` r
155 | #My_first_BF <- bayesfactor_parameters(model, null = c(-1, 1))
156 | 
157 | density <- estimate_density(model)
158 | sim_prior <- simulate_prior(model)
159 | density_prior <- estimate_density(sim_prior)
160 | 
161 | # Combine density for prior and posterior distributions
162 | post_prior <- density %>% mutate(type = 'posterior') %>%
163 |   bind_rows(density_prior %>% mutate(type = 'prior'))
164 | ```
165 | 
166 | Plot the prior and posterior distributions for the fixed effects
167 | 
168 | ``` r
169 | ggplot(data = post_prior, aes(x = x ,y = y, fill = type)) + 
170 |   theme_bw() +
171 |   facet_wrap(~Parameter, ncol = 1, scales='free') +
172 |   geom_ribbon( mapping = aes(
173 |     ymin = 0,
174 |     ymax = y  ),
175 |   alpha = .8) +   
176 |   scale_fill_manual(values=c('steelblue', 'grey'))
177 | ```
178 | 
179 | ![](../rmd_images/Bayesian_Basics/unnamed-chunk-3-1.png)<!-- -->
180 | 
181 | ``` r
182 | #  scale_x_continuous(expand=expand_scale(mult = c(-.4, -.4)))
183 | ```
184 | 
185 | ``` r
186 | mcmc_trace(model)
187 | ```
188 | 
189 | ![](../rmd_images/Bayesian_Basics/unnamed-chunk-4-1.png)<!-- -->
190 | 
191 | ``` r
192 | mcmc_areas(model)
193 | ```
194 | 
195 | ![](../rmd_images/Bayesian_Basics/unnamed-chunk-4-2.png)<!-- -->
196 | 
197 | <https://easystats.github.io/see/articles/bayestestR.html>
198 | 
199 | ``` r
200 | plot(model)
201 | ```
202 | 
203 | ![](../rmd_images/Bayesian_Basics/unnamed-chunk-5-1.png)<!-- -->
204 | 
205 | ``` r
206 | p_direction(model)
207 | ```
208 | 
209 |     ## # Probability of Direction (pd)
210 |     ## 
211 |     ## Parameter   |     pd
212 |     ## --------------------
213 |     ## (Intercept) | 90.60%
214 |     ## group2      | 99.40%
215 | 
216 | ``` r
217 | plot(p_direction(model, effects = "all", component = "all"))
218 | ```
219 | 
220 | ![](../rmd_images/Bayesian_Basics/unnamed-chunk-6-1.png)<!-- -->
221 | 
222 | Posterior predictive checks: compare data simulated from the fitted model with the observed data
223 | 
224 | ``` r
225 | pp_check(model)
226 | ```
227 | 
228 | ![](../rmd_images/Bayesian_Basics/unnamed-chunk-7-1.png)<!-- -->
229 | 
230 | ``` r
231 | pp_check(model, plotfun = "hist")
232 | ```
233 | 
234 |     ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
235 | 
236 | ![](../rmd_images/Bayesian_Basics/unnamed-chunk-7-2.png)<!-- -->
237 | 
238 | ``` r
239 | pp_check(model, plotfun = "intervals")
240 | ```
241 | 
242 |     ## 'x' not specified in '...'. Using x=1:length(y).
243 | 
244 | ![](../rmd_images/Bayesian_Basics/unnamed-chunk-7-3.png)<!-- -->
245 | 
246 | ``` r
247 | ppc_intervals_grouped(
248 |   y = sleep$extra,
249 |   yrep = posterior_predict(model),
250 |   x = as.numeric(sleep$group),
251 |   prob = 0.5,
252 |   group = sleep$ID
253 | ) 
254 | ```
255 | 
256 | ![](../rmd_images/Bayesian_Basics/unnamed-chunk-8-1.png)<!-- -->
257 | 


--------------------------------------------------------------------------------
/R/Caret.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Machine Learning with Caret"
  3 | author: "Jesse Cambon"
  4 | date: "`r format(Sys.time(), '%d %B, %Y')`"
  5 | output:
  6 |   github_document:
  7 |     toc: true
  8 |     toc_depth: 2
  9 | ---
 10 | 
 11 | Demonstrate a machine learning workflow with caret
 12 | 
 13 | ## References
 14 | * https://topepo.github.io/caret/model-training-and-tuning.html
 15 | * https://cran.r-project.org/web/packages/caretEnsemble/vignettes/caretEnsemble-intro.html 
 16 | 
 17 | ```{r knit-settings, include=FALSE}
 18 | library(knitr)
 19 | library(stringr)
 20 | library(here)
 21 | # Name of the file being knit, with only the trailing .Rmd extension stripped
 22 | rmd_filename <- str_remove(knitr::current_input(), "\\.Rmd$")
 23 | # Use a path relative to this document: here::here() yields an absolute path that ends up in the rendered .md image links
 24 | knitr::opts_chunk$set(fig.path = str_c("../rmd_images/", rmd_filename, "/"), echo = TRUE) # image path
 25 | ```
 26 | 
 27 | ## Setup
 28 | 
 29 | ```{r setup,warning=F,message=F}
 30 | library(mlbench) # machine learning reference datasets
 31 | library(tidyverse)
 32 | library(broom)
 33 | library(caret)
 34 | library(skimr)
 35 | library(knitr)
 36 | library(kableExtra)
 37 | 
 38 | data(BreastCancer) # load
 39 | 
 40 | # Set seed for reproducibility
 41 | set.seed(45)
 42 | ```
 43 | 
 44 | 
 45 | ## Build Model
 46 | 
 47 | ```{r models,message=F,results=F,warning=F}
 48 | #skim(BreastCancer)
 49 | 
 50 | BC <- BreastCancer %>% as_tibble() %>%
 51 |   dplyr::select(-Id) %>%
 52 |   # should really use imputation but we'll just drop nas for now
 53 |   drop_na() 
 54 | 
 55 | # 5-fold cross-validation; folds are created once and shared so resamples()/diff() compare models on identical splits
 56 | TC <- trainControl(method="cv", number=5, index=createFolds(BC$Class, k=5, returnTrain=TRUE))
 57 | 
 58 | # Train some models (extra arguments are forwarded by train() to the underlying fitting function)
 59 | 
 60 | # Neural Net - nnet() is silenced with trace=FALSE (it has no `verbose` argument)
 61 | nnet <- train(Class ~ . , BC,method='nnet',trControl=TC,trace=FALSE)
 62 | 
 63 | # Gradient Boosted Machine - gbm() is silenced with verbose=FALSE
 64 | gbm <- train(Class ~ . , BC,method='gbm',trControl=TC,verbose=FALSE)
 65 | 
 66 | # Radial SVM
 67 | svmrad <- train(Class ~ . , BC,method='svmRadial',trControl=TC)
 68 | 
 69 | # Elastic-net (tuneLength=5 sets the size of the tuning grid per parameter)
 70 | glmnet <- train(Class ~ . , BC,method='glmnet',trControl=TC,tuneLength=5)
 71 | 
 72 | # Logistic regression - did not converge
 73 | glm <- train(Class ~ . , BC,method='glm',trControl=TC)
 74 | 
 75 | ```
 76 | 
 77 | 
 78 | ```{r}
 79 | 
 80 | # Look at results of Glmnet model
 81 | 
 82 | # Extract the final model's coefficients at the tuned lambda (lambdaOpt), largest absolute value first
 83 | glm_coeff <- coef(glmnet$finalModel,glmnet$finalModel$lambdaOpt) %>% 
 84 |   as.matrix() %>% as.data.frame() %>%
 85 |   rownames_to_column('Variable') %>%
 86 |   as_tibble() %>%
 87 |   rename(Coefficient=2) %>%  # column 2 holds the coefficient values
 88 |   arrange(desc(abs(Coefficient)))
 89 | 
 90 | 
 91 | # Join variable importance scores to the coefficients and keep only terms with a non-zero coefficient
 92 | varImportance <- varImp(glmnet)$importance %>% 
 93 |   rownames_to_column('Variable') %>%
 94 |   rename(Importance=2) %>%  # column 2 holds the importance score
 95 |   arrange(desc(Importance)) %>%
 96 |   full_join(glm_coeff,by='Variable') %>%  # full join keeps terms present in only one of the two tables
 97 |   filter(Coefficient != 0) 
 98 | ```
 99 | 
100 | 
101 | 
102 | 
103 | ```{r results}
104 | resamps <- resamples(list(nnet=nnet,  # gather the resampling results of every fitted model
105 |                           glmnet=glmnet,
106 |                           svmrad=svmrad,
107 |                           gbm=gbm,
108 |                           glm=glm))
109 | 
110 | # Accuracy comparison across models (dot plot)
111 | dotplot(resamps, metric = "Accuracy")
112 | 
113 | # Pairwise differences in accuracy between models (box-and-whisker plot)
114 | bwplot(diff(resamps))
115 | ```
116 | 
117 | ## Glmnet (Elastic Net) Model
118 | 
119 | ```{r,results='asis',warning=F,message=F}
120 | kable(varImportance,format='markdown') %>%
121 |   kable_styling(bootstrap_options = c("striped",'border'))
122 | ```
123 | 


--------------------------------------------------------------------------------
/R/Caret.md:
--------------------------------------------------------------------------------
  1 | Machine Learning with Caret
  2 | ================
  3 | Jesse Cambon
  4 | 22 November, 2019
  5 | 
  6 | Demonstrate a machine learning workflow with caret
  7 | 
  8 | 
  9 | ## References
 10 | 
 11 |   - <https://topepo.github.io/caret/model-training-and-tuning.html>
 12 |   - <https://cran.r-project.org/web/packages/caretEnsemble/vignettes/caretEnsemble-intro.html>
 13 | 
 14 | ## Setup
 15 | 
 16 | ``` r
 17 | library(mlbench) # machine learning reference datasets
 18 | library(tidyverse)
 19 | library(broom)
 20 | library(caret)
 21 | library(skimr)
 22 | library(knitr)
 23 | library(kableExtra)
 24 | 
 25 | data(BreastCancer) # load
 26 | 
 27 | # Set seed for reproducibility
 28 | set.seed(45)
 29 | ```
 30 | 
 31 | ## Build Model
 32 | 
 33 | ``` r
 34 | #skim(BreastCancer)
 35 | 
 36 | BC <- BreastCancer %>% as_tibble() %>%
 37 |   dplyr::select(-Id) %>%
 38 |   # should really use imputation but we'll just drop nas for now
 39 |   drop_na() 
 40 | 
 41 | # Use k-fold cross-validation
 42 | TC <- trainControl(method="cv", number=5)
 43 | 
 44 | # Train some models
 45 | 
 46 | # Neural Net
 47 | nnet <- train(Class ~ . , BC,method='nnet',trControl=TC,verbose=FALSE)
 48 | 
 49 | # Gradient Boosted Machine
 50 | gbm <- train(Class ~ . , BC,method='gbm',trControl=TC)
 51 | 
 52 | # Radial SVM
 53 | svmrad <- train(Class ~ . , BC,method='svmRadial',trControl=TC)
 54 | 
 55 | # Elastic-net
 56 | glmnet <- train(Class ~ . , BC,method='glmnet',trControl=TC,tuneLength=5)
 57 | 
 58 | # Logistic regression - did not converge
 59 | glm <- train(Class ~ . , BC,method='glm',trControl=TC)
 60 | ```
 61 | 
 62 | ``` r
 63 | # Look at results of Glmnet model
 64 | 
 65 | # Extract coefficients from optimal model
 66 | glm_coeff <- coef(glmnet$finalModel,glmnet$finalModel$lambdaOpt) %>% 
 67 |   as.matrix() %>% as.data.frame() %>%
 68 |   rownames_to_column('Variable') %>%
 69 |   as_tibble() %>%
 70 |   rename(Coefficient=2) %>%
 71 |   arrange(desc(abs(Coefficient)))
 72 | 
 73 | 
 74 | # Combine variable importance data with coefficients
 75 | varImportance <- varImp(glmnet)$importance %>% 
 76 |   rownames_to_column('Variable') %>%
 77 |   rename(Importance=2) %>%
 78 |   arrange(desc(Importance)) %>%
 79 |   full_join(glm_coeff,by='Variable') %>%
 80 |   filter(Coefficient != 0) 
 81 | ```
 82 | 
 83 | ``` r
 84 | resamps <- resamples(list(nnet=nnet,
 85 |                           glmnet=glmnet,
 86 |                           svmrad=svmrad,
 87 |                           gbm=gbm,
 88 |                           glm=glm))
 89 | 
 90 | # Accuracy comparison
 91 | dotplot(resamps, metric = "Accuracy")
 92 | ```
 93 | 
 94 | ![](../rmd_images/Caret/results-1.png)<!-- -->
 95 | 
 96 | ``` r
 97 | # Difference in accuracy
 98 | bwplot(diff(resamps))
 99 | ```
100 | 
101 | ![](../rmd_images/Caret/results-2.png)<!-- -->
102 | 
103 | ## Glmnet (Elastic Net) Model
104 | 
105 | ``` r
106 | kable(varImportance,format='markdown') %>%
107 |   kable_styling(bootstrap_options = c("striped",'border'))
108 | ```
109 | 
110 | | Variable          |  Importance | Coefficient |
111 | | :---------------- | ----------: | ----------: |
112 | | Cl.thickness.L    | 100.0000000 |   3.1361533 |
113 | | Bare.nuclei9      |  80.1349380 |   2.5131545 |
114 | | Bare.nuclei6      |  72.1692163 |   2.2633373 |
115 | | Bare.nuclei10     |  62.8228881 |   1.9702221 |
116 | | Cell.shape.L      |  60.0936317 |   1.8846284 |
117 | | Marg.adhesion.L   |  59.9667240 |   1.8806484 |
118 | | Cell.size.L       |  54.3790530 |   1.7054105 |
119 | | Normal.nucleoli10 |  51.3425770 |   1.6101819 |
120 | | Normal.nucleoli9  |  48.7621790 |   1.5292567 |
121 | | Bl.cromatin5      |  42.9191401 |   1.3460100 |
122 | | Marg.adhesion^9   |  38.8206640 |   1.2174755 |
123 | | Normal.nucleoli4  |  38.0200861 |   1.1923682 |
124 | | Cell.shape.Q      |  34.0212706 | \-1.0669592 |
125 | | Cl.thickness^8    |  27.9829503 | \-0.8775882 |
126 | | Normal.nucleoli2  |  27.4707392 | \-0.8615245 |
127 | | Epith.c.size^4    |  27.1064505 |   0.8500998 |
128 | | Bare.nuclei4      |  26.4996140 |   0.8310685 |
129 | | Cell.size^8       |  25.6821554 |   0.8054318 |
130 | | Bare.nuclei3      |  24.5833551 |   0.7709717 |
131 | | Bare.nuclei7      |  21.8755488 |   0.6860507 |
132 | | Cell.size.C       |  20.0900403 |   0.6300545 |
133 | | Bare.nuclei5      |  19.8094615 |   0.6212551 |
134 | | Bl.cromatin7      |  18.8161804 |   0.5901043 |
135 | | Cl.thickness.Q    |  17.8206566 |   0.5588831 |
136 | | Epith.c.size.L    |  17.2140778 |   0.5398599 |
137 | | Cell.shape.C      |  15.8214699 |   0.4961855 |
138 | | Bare.nuclei8      |  15.7832696 |   0.4949875 |
139 | | Cell.shape^8      |  15.5873838 |   0.4888443 |
140 | | Normal.nucleoli6  |  15.5750643 |   0.4884579 |
141 | | Epith.c.size^8    |  15.1072819 |   0.4737875 |
142 | | Cell.size^5       |  14.7341024 |   0.4620840 |
143 | | Mitoses10         |  14.6725910 |   0.4601549 |
144 | | Cell.size.Q       |  13.5285932 | \-0.4242774 |
145 | | Cl.thickness^5    |  12.5086592 |   0.3922907 |
146 | | Normal.nucleoli7  |  11.2372547 | \-0.3524175 |
147 | | Bl.cromatin4      |  11.0371135 |   0.3461408 |
148 | | Epith.c.size^5    |  10.1426736 | \-0.3180898 |
149 | | Bl.cromatin8      |   8.5926819 |   0.2694797 |
150 | | Epith.c.size^9    |   8.1088768 |   0.2543068 |
151 | | Normal.nucleoli3  |   6.1834552 |   0.1939226 |
152 | | Cell.size^6       |   6.0301535 | \-0.1891149 |
153 | | Marg.adhesion.C   |   5.6067864 |   0.1758374 |
154 | | Marg.adhesion^8   |   5.0073879 | \-0.1570394 |
155 | | Epith.c.size^7    |   4.8165935 | \-0.1510558 |
156 | | Bl.cromatin10     |   3.7579941 |   0.1178565 |
157 | | Marg.adhesion^4   |   0.3997860 | \-0.0125379 |
158 | | Cell.shape^5      |   0.2113266 | \-0.0066275 |
159 | | Cl.thickness.C    |   0.1668182 |   0.0052317 |
160 | | (Intercept)       |          NA |   0.5035466 |
161 | 


--------------------------------------------------------------------------------
/R/Clustering.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "K-means Clustering"
 3 | author: "Jesse Cambon"
 4 | date: "`r format(Sys.time(), '%d %B, %Y')`"
 5 | output:
 6 |   github_document:
 7 |     toc: true
 8 |     toc_depth: 2
 9 | ---
10 | 
11 | Demonstrate K-means clustering 
12 | 
13 | ## References
14 | * https://uc-r.github.io/kmeans_clustering
15 | * https://cran.r-project.org/web/packages/broom/vignettes/kmeans.html 
16 | 
17 | ```{r knit-settings, include=FALSE}
18 | library(here)
19 | source(here("rmd_config.R"))
20 | ```
21 | 
22 | ## Setup
23 | 
24 | ```{r setup,warning=F,message=F}
25 | library(tidyverse)
26 | library(knitr)
27 | library(kableExtra)
28 | library(broom)
29 | library(factoextra)
30 | library(wesanderson)
31 | ```
32 | 
33 | ## Cluster Data
34 | 
35 | ```{r}
36 | library(fueleconomy)
37 | my_vehicles <- vehicles %>% filter(year == 2015) %>%  # 2015 models with complete clustering variables
38 |   drop_na(cyl,displ,cty)
39 | 
40 | vehicles_sel <- my_vehicles %>%
41 |   select(cty,cyl,displ)
42 | 
43 | # Scale variables for clustering
44 | vehicles_sel_scaled <- vehicles_sel %>%
45 |   mutate_all(scale)
46 | set.seed(42) # kmeans() starts from random centers; fix the seed so cluster labels and sizes are reproducible
47 | # Try different numbers of clusters (nstart = 25 keeps the best of 25 random starts)
48 | clust3 <- kmeans(vehicles_sel_scaled, centers = 3, nstart = 25)
49 | clust5 <- kmeans(vehicles_sel_scaled, centers = 5, nstart = 25)
50 | clust7 <- kmeans(vehicles_sel_scaled, centers = 7, nstart = 25)
51 | clust10 <- kmeans(vehicles_sel_scaled, centers = 10, nstart = 25)
52 | ```
53 | 
54 | 
55 | ## View Results
56 | 
57 | ```{r}
58 | combine_results <- augment(clust3, my_vehicles)  # add each vehicle's cluster assignment (.cluster) to the data
59 | # Per-cluster size plus city-mileage, displacement and cylinder summaries, highest mean city mileage first
60 | combine_summ <- combine_results %>% group_by(.cluster) %>% 
61 |   summarize(num_vehicles=n(),
62 |             mean_cty=mean(cty),
63 |             min_cty=min(cty),
64 |             max_cty=max(cty),
65 |             mean_displ=mean(displ),
66 |             mean_cyl=mean(cyl)) %>%
67 |   arrange(desc(mean_cty))
68 | ```
69 | 
70 | 
71 | ```{r clusterresults,results='asis',warning=F}
72 | kable(combine_summ,format='markdown',digits=2) %>%
73 |   kable_styling(bootstrap_options = c("striped",'border'))
74 | ```
75 | 
76 | ## Visualize 
77 | 
78 | ```{r}
79 | fviz_cluster(clust3,data=vehicles_sel,repel=F,ggtheme=theme_bw())
80 | # Cylinders vs. city fuel economy, one jittered point per vehicle, colored by cluster
81 | ggplot(aes(x=cyl,y=cty,color=.cluster),data=combine_results) + 
82 |   geom_jitter() +
83 |   theme_bw() +
84 |   theme(legend.position='top') +
85 |   scale_color_manual(values=wes_palette('Darjeeling1')) +
86 |   guides(color = guide_legend(title='Cluster',override.aes = list(size=2.5))) +  # larger legend keys
87 |   xlab('Cylinders (cyl)') +
88 |   ylab('City Fuel Economy (cty)')
89 | ```
90 | 


--------------------------------------------------------------------------------
/R/Clustering.md:
--------------------------------------------------------------------------------
 1 | K-means Clustering
 2 | ================
 3 | Jesse Cambon
 4 | 24 November, 2019
 5 | 
 6 | Demonstrate K-means clustering
 7 | 
 8 | ## References
 9 | 
10 |   - <https://uc-r.github.io/kmeans_clustering>
11 |   - <https://cran.r-project.org/web/packages/broom/vignettes/kmeans.html>
12 | 
13 | ## Setup
14 | 
15 | ``` r
16 | library(tidyverse)
17 | library(knitr)
18 | library(kableExtra)
19 | library(broom)
20 | library(factoextra)
21 | library(wesanderson)
22 | ```
23 | 
24 | ## Cluster Data
25 | 
26 | ``` r
27 | library(fueleconomy)
28 | my_vehicles <- vehicles %>% filter(year == 2015) %>%
29 |   drop_na(cyl,displ,cty)
30 | 
31 | vehicles_sel <- my_vehicles %>%
32 |   select(cty,cyl,displ)
33 | 
34 | # Scale variables for clustering
35 | vehicles_sel_scaled <- vehicles_sel %>%
36 |   mutate_all(scale)
37 | 
38 | # Try different numbers of clusters
39 | clust3 <- kmeans(vehicles_sel_scaled, centers = 3)
40 | clust5 <- kmeans(vehicles_sel_scaled, centers = 5)
41 | clust7 <- kmeans(vehicles_sel_scaled, centers = 7) 
42 | clust10 <- kmeans(vehicles_sel_scaled, centers = 10)
43 | ```
44 | 
45 | ## View Results
46 | 
47 | ``` r
48 | combine_results <- augment(clust3, my_vehicles)  
49 | 
50 | combine_summ <- combine_results %>% group_by(.cluster) %>% 
51 |   summarize(num_vehicles=n(),
52 |             mean_cty=mean(cty),
53 |             min_cty=min(cty),
54 |             max_cty=max(cty),
55 |             mean_displ=mean(displ),
56 |             mean_cyl=mean(cyl)) %>%
57 |   arrange(desc(mean_cty))
58 | ```
59 | 
60 | ``` r
61 | kable(combine_summ,format='markdown',digits=2) %>%
62 |   kable_styling(bootstrap_options = c("striped",'border'))
63 | ```
64 | 
65 | | .cluster | num\_vehicles | mean\_cty | min\_cty | max\_cty | mean\_displ | mean\_cyl |
66 | | :------- | ------------: | --------: | -------: | -------: | ----------: | --------: |
67 | | 3        |            86 |     25.05 |       20 |       33 |        1.87 |      4.05 |
68 | | 2        |            55 |     18.71 |       16 |       24 |        3.12 |      5.78 |
69 | | 1        |            63 |     15.19 |       11 |       20 |        5.12 |      8.29 |
70 | 
71 | ## Visualize
72 | 
73 | ``` r
74 | fviz_cluster(clust3,data=vehicles_sel,repel=F,ggtheme=theme_bw())
75 | ```
76 | 
77 | ![](../rmd_images/Clustering/unnamed-chunk-3-1.png)<!-- -->
78 | 
79 | ``` r
80 | ggplot(aes(x=cyl,y=cty,color=.cluster),data=combine_results) + 
81 |   geom_jitter() +
82 |   theme_bw() +
83 |   theme(legend.position='top') +
84 |   scale_color_manual(values=wes_palette('Darjeeling1')) +
85 |   guides(color = guide_legend(title='Cluster',override.aes = list(size=2.5))) +
86 |   xlab('Cylinders (cyl)') +
87 |   ylab('City Fuel Economy (cty)')
88 | ```
89 | 
90 | ![](../rmd_images/Clustering/unnamed-chunk-3-2.png)<!-- -->
91 | 


--------------------------------------------------------------------------------
/R/Comparing_Bayesian_Packages.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Comparing Bayesian Modeling Packages"
  3 | author: "Jesse Cambon"
  4 | date: "`r format(Sys.time(), '%d %B, %Y')`"
  5 | output:
  6 |   github_document:
  7 |     toc: true
  8 | ---
  9 | 
 10 | Compare rstan, brms, and rstanarm
 11 | 
 12 | ```{r knit-settings, include=FALSE}
 13 | library(here)
 14 | source(here("rmd_config.R"))
 15 | ```
 16 | 
 17 | ```{r,message=F,warning=F}
 18 | library(rstan)
 19 | library(brms)
 20 | library(rstanarm)
 21 | library(tidyverse)
 22 | library(bayesplot)
 23 | options(mc.cores = parallel::detectCores())
 24 | ```
 25 | 
 26 | 
 27 | ## Rstan
 28 | 
 29 | Walking through this example: https://cran.r-project.org/web/packages/rstan/vignettes/rstan.html#sample-from-the-posterior-distribution
 30 | 
 31 | ```{r}
 32 | # Sample dataset: J schools, each with an estimated treatment effect (y) and its standard error (sigma)
 33 | schools_data <- list(
 34 |   J = 8,
 35 |   y = c(28,  8, -3,  7, -1,  1, 18, 12),
 36 |   sigma = c(15, 10, 16, 11,  9, 11, 10, 18)
 37 | )
 38 | 
 39 | stan_code <- "
 40 | data {
 41 |   int<lower=0> J;          // number of schools 
 42 |   real y[J];               // estimated treatment effects
 43 |   real<lower=0> sigma[J];  // s.e. of effect estimates 
 44 | }
 45 | parameters {
 46 |   real mu; 
 47 |   real<lower=0> tau;
 48 |   vector[J] eta;
 49 | }
 50 | transformed parameters {
 51 |   vector[J] theta;
 52 |   theta = mu + tau * eta;
 53 | }
 54 | model {
 55 |   target += normal_lpdf(eta | 0, 1);
 56 |   target += normal_lpdf(y | theta, sigma);
 57 | }"
 58 | ```
 59 | 
 60 | 
 61 | ```{r}
 62 | fit1 <- stan(
 63 |   model_code = stan_code,  # Stan program
 64 |   data = schools_data,    # named list of data
 65 |   chains = 4,             # number of Markov chains
 66 |   warmup = 1000,          # number of warmup iterations per chain
 67 |   iter = 2000,            # total number of iterations per chain
 68 |   cores = 2,              # number of cores (could use one per chain)
 69 |   refresh = 0             # no progress shown
 70 |   )
 71 | ```
 72 | 
 73 | ## Brms
 74 | 
 75 | Example based on : https://github.com/paul-buerkner/brms
 76 | 
 77 | * `(1 | var)` is used to specify a random intercept 
 78 | 
 79 | A mixed-effects model has both random effects and fixed effects:
 80 | 
 81 | * https://www.theanalysisfactor.com/understanding-random-effects-in-mixed-models/
 82 | * https://ourcodingclub.github.io/tutorials/mixed-models/#what
 83 | * https://ase.tufts.edu/gsc/gradresources/guidetomixedmodelsinr/mixed%20model%20guide.html 
 84 | * https://en.wikipedia.org/wiki/Mixed_model
 85 | 
 86 | ```{r}
 87 | fit1 <- brm(count ~ zAge + zBase * Trt + (1|patient), 
 88 |             data = epilepsy, family = poisson())
 89 | fit2 <- brm(count ~ zAge + zBase * Trt + (1|patient) + (1|obs), 
 90 |             data = epilepsy, family = poisson())
 91 | 
 92 | ```
 93 | 
 94 | ```{r}
 95 | fit1
 96 | ```
 97 | 
 98 | 
 99 | ```{r}
100 | plot(fit1, pars = c("Trt", "zBase")) 
101 | plot(fit2, pars = c("Trt", "zBase"))
102 | 
103 | ```
104 | 
105 | Compare model results with leave-one-out validation
106 | 
107 | https://mc-stan.org/loo/
108 | 
109 | ```{r}
110 | loo(fit1, fit2)
111 | ```
112 | 
113 | ## rstanarm
114 | 
115 | rstanarm example compared with brms
116 | 
117 | * https://mc-stan.org/loo/articles/loo2-example.html
118 | * http://mc-stan.org/rstanarm/articles/count.html
119 | 
120 | brms prior setting: https://www.jamesrrae.com/post/bayesian-logistic-regression-using-brms-part-1/
121 | 
122 | ```{r}
123 | # Use rstanarm to fit a Poisson model of the roach counts
124 | roach_pois <-
125 |   stan_glm(
126 |     formula = y ~ roach1 + treatment + senior,
127 |     offset = log(exposure2),  # log of exposure2 enters the linear predictor as an offset
128 |     data = roaches,
129 |     family = poisson(link = "log"),
130 |     prior = normal(0, 2.5, autoscale = TRUE),  # prior on the regression coefficients
131 |     prior_intercept = normal(0, 5, autoscale = TRUE),
132 |     seed = 12345
133 |   )
134 | 
135 | # Refit with a negative binomial family; update() reuses the remaining arguments of roach_pois
136 | roach_negbinom2 <- update(roach_pois, family = neg_binomial_2)
137 | ```
138 | 
139 | Fit a Brms model for comparison
140 | 
141 | ```{r}
142 | # Priors to be used by brm: normal(0, 5) on the intercept, normal(0, 2.5) on the regression coefficients
143 | my_priors <- c(
144 |   prior(normal(0, 5), class = "Intercept"),
145 |   prior(normal(0, 2.5), class = "b")
146 | )
147 | # NOTE(review): roach_pois also uses offset = log(exposure2); add offset(log(exposure2)) here if the fits should match - confirm
148 | # Fit with zero inflated negative binomial with brm
149 | roach_zinb <-
150 |   brm(
151 |     formula=y ~ roach1 + treatment + senior,
152 |     data = roaches,
153 |     family = zero_inflated_negbinomial,
154 |     prior = my_priors,  # previously defined but never passed, so brm silently used its default priors
155 |     seed = 12345)
156 | ```
157 | 
158 | ```{r}
159 | plot(roach_pois)
160 | plot(roach_zinb,pars=c('roach1','treatment','senior'))
161 | ```
162 | 
163 | 
164 | ```{r}
165 | pp_check(roach_pois, plotfun='stat')  # rstanarm models: plot chosen with `plotfun`
166 | pp_check(roach_negbinom2, plotfun='stat')
167 | pp_check(roach_zinb, type='stat')  # brms model: plot chosen with `type` (`plotfun` is not a brms argument)
168 | ```
169 | 
170 | ```{r}
171 | prop_zero <- function(y) mean(y == 0)  # test statistic: share of observations equal to zero
172 | 
173 | prop_zero_test1 <- pp_check(roach_pois, plotfun = "stat", stat = "prop_zero", binwidth = .005)
174 | prop_zero_test2 <- pp_check(roach_negbinom2, plotfun = "stat", stat = "prop_zero", 
175 |                             binwidth = 0.01)
176 | prop_zero_test3 <- pp_check(roach_zinb, type = "stat", stat = "prop_zero",  # brms uses `type`, not `plotfun`
177 |                             binwidth = 0.01)
178 | 
179 | # Show graphs for Poisson, negative binomial and zero-inflated negative binomial side by side
180 | bayesplot_grid(prop_zero_test1 + ggtitle("Poisson"), 
181 |                prop_zero_test2 + ggtitle("Negative Binomial"), 
182 |                prop_zero_test3 + ggtitle("Zero Inflated Negative Binomial"),
183 |                grid_args = list(ncol = 3))
184 | ```
185 | 
186 | 
187 | 
188 | ```{r}
189 | #loo(roach_pois, roach_negbinom2)
190 | ```
191 | 
192 | 


--------------------------------------------------------------------------------
/R/Distribution_Sampling.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Distribution Sampling and Hypothesis Testing"
  3 | author: "Jesse Cambon"
  4 | date: "`r format(Sys.time(), '%d %B, %Y')`"
  5 | output:
  6 |   github_document:
  7 |     toc: true
  8 | ---
  9 | 
 10 | ```{r knit-settings, include=FALSE}
 11 | library(here)
 12 | source(here("rmd_config.R"))
 13 | ```
 14 | 
 15 | References:
 16 | * http://appliedpredictivemodeling.com/data
 17 | * http://faculty.marshall.usc.edu/gareth-james/ISL/data.html
 18 | 
 19 | ```{r,message=F,warning=F}
 20 | library(tidyverse)
 21 | library(bayestestR)
 22 | library(BayesFactor)
 23 | library(jcolors)
 24 | library(infer)
 25 | library(broom)
 26 | library(knitr)
 27 | 
 28 | set.seed(42) # for reproducibility
 29 | ```
 30 | 
 31 | Perform sampling
 32 | 
 33 | ```{r}
 34 | bernouli_sample <- rbernoulli(10,p=0.9) # T/F
 35 | uniform <- runif(10,-4,4) # 10 draws from Uniform(-4, 4)
 36 | 
 37 | num_rows <- 1000 # draws per distribution
 38 | 
 39 | dist <- 
 40 |   tibble(
 41 |     cauchy=rcauchy(num_rows,0,0.5),
 42 |     norm_sample = rnorm(num_rows,0,0.5),
 43 |     beta_sample = rbeta(num_rows,2,2) # was shape1 = 0, a point mass at 0; both shapes must be > 0 for a real density (2, 2 is illustrative)
 44 | ) %>%
 45 |   pivot_longer(everything(),values_to='value',names_to='distribution') # long format: one row per draw
 46 | 
 47 | 
 48 | # Distributions used for count data (the Weibull is continuous but is drawn alongside them)
 49 | count_dist <- tibble(poisson= rpois(num_rows,2),
 50 |                       `negative binomial`=rnbinom(num_rows,1,mu=2),
 51 |                       binom_sample = rbinom(num_rows,9,.25),
 52 |                       weibull=rweibull(num_rows,1.4)
 53 |                       ) %>%
 54 |   pivot_longer(everything(),values_to='value',names_to='distribution')
 55 | ```
 56 | 
 57 | Compare some distributions
 58 | 
 59 | ```{r}
 60 | ggplot(data=dist,aes(x=value,color=distribution)) + 
 61 | #  facet_wrap(~distribution,ncol=1) +
 62 |   scale_x_continuous(limits =c(-3,3)) +
 63 |   theme_minimal() +
 64 |   theme(legend.position='top') +
 65 |   geom_density(alpha=0.8) +
 66 |   scale_color_jcolors('default') + 
 67 |   xlab('') + ylab('')
 68 | ```
 69 | 
 70 | Poisson v Neg Binomial v Binomial v Weibull
 71 | 
 72 | ```{r}
 73 | ggplot(data=count_dist,aes(x=value,color=distribution)) + 
 74 | #  facet_wrap(~distribution,ncol=1) +
 75 |   scale_x_continuous(limits =c(0,8)) +
 76 |   theme_minimal() +
 77 |   theme(legend.position='top') +
 78 |   geom_density(alpha=0.8) +
 79 |   scale_color_jcolors('default') + 
 80 |   xlab('') + ylab('')
 81 | ```
 82 | 
 83 | 
 84 | ## Significance Testing
 85 | 
 86 | ### T-test (Frequentist version)
 87 | 
 88 | ```{r}
 89 | t.test(trees$Height)
 90 | ```
 91 | 
 92 | Simulate some data and run more T-tests
 93 | 
 94 | ```{r}
 95 | # Two columns of 100 normal draws each (sd 10): sample1 has mean 25, sample2 has mean 28
 96 | compare_norms <- tibble(sample1 = rnorm(100,25,10),
 97 |                         sample2 = rnorm(100,28,10))
 98 | 
 99 | results <- t.test(compare_norms$sample1,compare_norms$sample2)
100 | results
101 | ```
102 | 
103 | Tidy T-test (infer package)
104 | 
105 | https://infer.netlify.app/
106 | 
107 | ```{r}
108 | compare_norms_long <- 
109 |   compare_norms %>%
110 |   pivot_longer(everything(),names_to='sample', values_to='value')
111 | 
112 | compare_norms_long %>%
113 |   t_test(value ~ sample,order=c('sample1','sample2')) %>%
114 |   kable()
115 | ```
116 | 
117 | 
118 | ### Bayesian T-test
119 | 
120 | https://easystats.github.io/bayestestR/articles/example2.html
121 | ```{r}
122 | bayes_result <- BayesFactor::ttestBF(compare_norms$sample1,compare_norms$sample2)
123 | bayes_result
124 | ```
125 | ```{r}
126 | describe_posterior(bayes_result) %>% kable()
127 | ```
128 | 
129 | 


--------------------------------------------------------------------------------
/R/Distribution_Sampling.md:
--------------------------------------------------------------------------------
  1 | Distribution Sampling and Hypothesis Testing
  2 | ================
  3 | Jesse Cambon
  4 | 02 February, 2021
  5 | 
  6 | -   [Significance Testing](#significance-testing)
  7 |     -   [T-test (Frequentist version)](#t-test-frequentist-version)
  8 |     -   [Bayesian T-test](#bayesian-t-test)
  9 | 
 10 | References: \* <http://appliedpredictivemodeling.com/data> \*
 11 | <http://faculty.marshall.usc.edu/gareth-james/ISL/data.html>
 12 | 
 13 | ``` r
 14 | library(tidyverse)
 15 | library(bayestestR)
 16 | library(BayesFactor)
 17 | library(jcolors)
 18 | library(infer)
 19 | library(broom)
 20 | library(knitr)
 21 | 
 22 | set.seed(42) # for reproducibility
 23 | ```
 24 | 
 25 | Perform sampling
 26 | 
 27 | ``` r
 28 | bernouli_sample <- rbernoulli(10,p=0.9) # T/F
 29 | uniform = runif(10,-4,4)
 30 | 
 31 | num_rows <- 1000
 32 | 
 33 | dist <- 
 34 |   tibble(
 35 |     cauchy=rcauchy(num_rows,0,0.5),
 36 |     norm_sample = rnorm(num_rows,0,0.5),
 37 |     beta_sample = rbeta(num_rows,0,1)
 38 | ) %>%
 39 |   pivot_longer(everything(),values_to='value',names_to='distribution')
 40 | 
 41 | 
 42 | # Distributions used for count data
 43 | count_dist <- tibble(poisson= rpois(num_rows,2),
 44 |                       `negative binomial`=rnbinom(num_rows,1,mu=2),
 45 |                       binom_sample = rbinom(num_rows,9,.25),
 46 |                       weibull=rweibull(num_rows,1.4)
 47 |                       ) %>%
 48 |   pivot_longer(everything(),values_to='value',names_to='distribution')
 49 | ```
 50 | 
 51 | Compare some distributions
 52 | 
 53 | ``` r
 54 | ggplot(data=dist,aes(x=value,color=distribution)) + 
 55 | #  facet_wrap(~distribution,ncol=1) +
 56 |   scale_x_continuous(limits =c(-3,3)) +
 57 |   theme_minimal() +
 58 |   theme(legend.position='top') +
 59 |   geom_density(alpha=0.8) +
 60 |   scale_color_jcolors('default') + 
 61 |   xlab('') + ylab('')
 62 | ```
 63 | 
 64 |     ## Warning: Removed 116 rows containing non-finite values (stat_density).
 65 | 
 66 | ![](../rmd_images/Distribution_Sampling/unnamed-chunk-3-1.png)<!-- -->
 67 | 
 68 | Poisson v Neg Binomial v Weibull
 69 | 
 70 | ``` r
 71 | ggplot(data=count_dist,aes(x=value,color=distribution)) + 
 72 | #  facet_wrap(~distribution,ncol=1) +
 73 |   scale_x_continuous(limits =c(0,8)) +
 74 |   theme_minimal() +
 75 |   theme(legend.position='top') +
 76 |   geom_density(alpha=0.8) +
 77 |   scale_color_jcolors('default') + 
 78 |   xlab('') + ylab('')
 79 | ```
 80 | 
 81 |     ## Warning: Removed 25 rows containing non-finite values (stat_density).
 82 | 
 83 | ![](../rmd_images/Distribution_Sampling/unnamed-chunk-4-1.png)<!-- -->
 84 | 
 85 | ## Significance Testing
 86 | 
 87 | ### T-test (Frequentist version)
 88 | 
 89 | ``` r
 90 | t.test(trees$Height)
 91 | ```
 92 | 
 93 |     ## 
 94 |     ##  One Sample t-test
 95 |     ## 
 96 |     ## data:  trees$Height
 97 |     ## t = 66.41, df = 30, p-value < 2.2e-16
 98 |     ## alternative hypothesis: true mean is not equal to 0
 99 |     ## 95 percent confidence interval:
100 |     ##  73.6628 78.3372
101 |     ## sample estimates:
102 |     ## mean of x 
103 |     ##        76
104 | 
105 | Simulate some data and run more T-tests
106 | 
107 | ``` r
108 | compare_norms <- rnorm(100,25,10) %>%
109 |   as_tibble() %>% rename(sample1=value) %>%
110 |   mutate(sample2 = rnorm(100,28,10))
111 | 
112 | results <- t.test(compare_norms$sample1,compare_norms$sample2)
113 | results
114 | ```
115 | 
116 |     ## 
117 |     ##  Welch Two Sample t-test
118 |     ## 
119 |     ## data:  compare_norms$sample1 and compare_norms$sample2
120 |     ## t = 1.4176, df = 197.32, p-value = 0.1579
121 |     ## alternative hypothesis: true difference in means is not equal to 0
122 |     ## 95 percent confidence interval:
123 |     ##  -0.7904857  4.8324013
124 |     ## sample estimates:
125 |     ## mean of x mean of y 
126 |     ##  28.30323  26.28227
127 | 
128 | Tidy T-test (infer package)
129 | 
130 | <https://infer.netlify.app/>
131 | 
132 | ``` r
133 | compare_norms_long <- 
134 |   compare_norms %>%
135 |   pivot_longer(everything(),names_to='sample',values_to='value')
136 | 
137 | compare_norms_long %>%
138 |   t_test(value ~ sample,order=c('sample1','sample2')) %>%
139 |   kable()
140 | ```
141 | 
142 | | statistic |    t\_df |  p\_value | alternative |  lower\_ci | upper\_ci |
143 | |----------:|---------:|----------:|:------------|-----------:|----------:|
144 | |  1.417581 | 197.3181 | 0.1578903 | two.sided   | -0.7904857 |  4.832401 |
145 | 
146 | ### Bayesian T-test
147 | 
148 | <https://easystats.github.io/bayestestR/articles/example2.html>
149 | 
150 | ``` r
151 | bayes_result <- BayesFactor::ttestBF(compare_norms$sample1,compare_norms$sample2)
152 | bayes_result
153 | ```
154 | 
155 |     ## Bayes factor analysis
156 |     ## --------------
157 |     ## [1] Alt., r=0.707 : 0.3932028 ±0%
158 |     ## 
159 |     ## Against denominator:
160 |     ##   Null, mu1-mu2 = 0 
161 |     ## ---
162 |     ## Bayes factor type: BFindepSample, JZS
163 | 
164 | ``` r
165 | describe_posterior(bayes_result) %>% kable()
166 | ```
167 | 
168 | | Parameter  |    Median |  CI |   CI\_low |  CI\_high |      pd | ROPE\_CI | ROPE\_low | ROPE\_high | ROPE\_Percentage |        BF | Prior\_Distribution | Prior\_Location | Prior\_Scale |
169 | |:-----------|----------:|----:|----------:|----------:|--------:|---------:|----------:|-----------:|-----------------:|----------:|:--------------------|----------------:|-------------:|
170 | | Difference | -1.885353 |  89 | -4.149145 | 0.2915764 | 0.91525 |       89 | -1.010632 |   1.010632 |         0.232519 | 0.3932028 | cauchy              |               0 |    0.7071068 |
171 | 


--------------------------------------------------------------------------------
/R/Geospatial_Analysis.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Geospatial Analysis"
  3 | author: "Jesse Cambon"
  4 | date: "`r format(Sys.time(), '%d %B, %Y')`"
  5 | output:
  6 |   github_document:
  7 |     toc: true
  8 | ---
  9 | 
 10 | ```{r knit-settings, include=FALSE}
 11 | library(here)
 12 | source(here("rmd_config.R"))
 13 | ```
 14 | 
 15 | Install the fiftystater package from: https://github.com/wmurphyrd/fiftystater
 16 | 
 17 | ## References
 18 | * https://github.com/mtennekes/tmap
 19 | * https://mran.revolutionanalytics.com/snapshot/2016-03-22/web/packages/tmap/vignettes/tmap-nutshell.html
 20 | 
 21 | ## Setup
 22 | 
 23 | ```{r,warning=F,message=F}
 24 | library(tidyverse)
 25 | library(tidycensus) # census data
 26 | library(ggplot2)
 27 | #library(sf) # geospatial methods
 28 | library(tmap) # thematic mapping
 29 | library(viridis) # color scheme
 30 | #library(wbstats) # world bank
 31 | library(wesanderson) # colors
 32 | library(fiftystater) # US state geometries
 33 | 
 34 | options(tigris_use_cache = TRUE)
 35 | 
 36 | ```
 37 | 
 38 | # Geographies
 39 | 
 40 | ## Locales 
 41 | 
 42 | Use the tidycensus package to pull Census data and display it on a map with the tmap package.
 43 | ```{r locale,warning=F,message=F}
 44 | # Pull Census Rent Data for Boston using tidycensus package
 45 | bos <- get_acs(geography = "tract", 
 46 |               variables = "B25064_001E",  # median gross rent
 47 |               state = "MA", 
 48 |               county = c("Suffolk",'Middlesex'), 
 49 |               geometry = TRUE)
 50 | 
 51 | tm_shape(bos) +
 52 |   tm_fill('estimate',colorNA = "white",breaks=c(0,1000,1500,2000,3500),
 53 |           title='Median Rent') +
 54 |   tm_borders() +
 55 |    tm_style("classic") +
 56 |   # margin format is c(bottom,left,top,right)
 57 |   tm_layout(inner.margins = c(0.05, .05, .05, .05),main.title.position='center',legend.position=c('left','bottom'),
 58 |             legend.text.size=0.8,legend.title.size=1.3,
 59 |             main.title='Boston Area Rent by Census Tract',
 60 |             main.title.size=1.5) 
 61 | 
 62 | #vars <- load_variables(2016,'acs1') # view census variables
 63 | ```
 64 |     
 65 | http://www.robinlovelace.net/presentations/spatial-tidyverse.html#11
 66 | https://cran.r-project.org/web/packages/wbstats/vignettes/Using_the_wbstats_package.html
 67 | 
 68 | ## United States
 69 | 
 70 | ```{r}
 71 | 
 72 | data("fifty_states") # fiftystater package
 73 | 
 74 | crimes <- data.frame(state = tolower(rownames(USArrests)), USArrests) %>%
 75 |   # Make a categorical variable for Murder rates with a predefined interval (width 5, starting at 0)
 76 |   mutate(Murder_cut = str_replace_all(cut_width(Murder,5,boundary=0),',',' - ')) %>%
 77 |   # Delete all characters except for digits, whitespace, and '-' (strips the interval brackets)
 78 |   mutate(Murder_cut = str_replace_all(Murder_cut,'[^\\d\\s-]',''))
 79 | 
 80 | # make an ordered list of levels so our categorical variable is sorted properly
 81 | Murder_cut_levels <- crimes %>% arrange(Murder) %>% pull(Murder_cut) %>%
 82 |   unique()
 83 | 
 84 | 
 85 | # map_id creates the aesthetic mapping to the state name column in your data
 86 | ggplot(crimes, aes(map_id = state)) + 
 87 |   # map points to the fifty_states shape data
 88 |   geom_map(aes(fill = factor(Murder_cut,levels=Murder_cut_levels)), 
 89 |            map = fifty_states, color='white',size=0.2) +  # geometry from fiftystater package
 90 |   expand_limits(x = fifty_states$long, y = fifty_states$lat) +
 91 |   coord_map() +
 92 |   theme(plot.title = element_text(lineheight=1, face="bold",hjust = 0.5)) + 
 93 |   scale_x_continuous(breaks = NULL) + 
 94 |   scale_y_continuous(breaks = NULL) +
 95 |   
 96 |   labs(x = "", y = "",title='State Murder Arrest Rates in 1973', # USArrests holds 1973 figures; 1975 is the almanac edition
 97 |    caption='Data: World Almanac and Book of facts 1975. (Crime rates)') +
 98 |   theme(legend.position = "right", 
 99 |         panel.background = element_blank(),
100 |         panel.border=element_blank())  +
101 |   scale_fill_viridis_d(direction=-1,option='inferno',end=0.9)  +
102 |   guides(fill = guide_legend(title='Murder Arrests Per\n100,000 Residents'))
103 | 
104 | ```
105 | 
106 | ## The World
107 | 
108 | ```{r}
109 | # Load world map geometry
110 | data(World)
111 | 
112 | # Load coordinates of cities
113 | data(metro) 
114 | 
115 | tm_shape(World, projection = "eck4" # Eckert IV 1906 projection (preserves area)
116 |          ) +
117 |   tm_polygons("gdp_cap_est",
118 |               palette = "Greens",
119 |               breaks = c(0, 1000, 5000, 10000, 25000, 50000, Inf),
120 |               title = "GDP per capita") +
121 |   # tm_style("classic",frame=F,
122 |   #          earth.boundary = c(-180, -87, 180, 87),
123 |   #          legend.text.size=0.8,legend.title.size=1.3)   +
124 |   tm_layout(bg.color='white') +
125 | #  tm_format("World", inner.margins = 0.02, frame = FALSE) 
126 |   tm_legend(frame = TRUE) 
127 | # tm_format("World",frame=F) 
128 | 
129 | metro <- metro %>%
130 |   mutate(growth= 100*(pop2020 - pop2010) / pop2010) # percent change from pop2010 to pop2020
131 | 
132 | tm_shape(World, projection = "eck4" # Eckert IV 1906 projection (preserves area)
133 |          ) +
134 |     tm_polygons("life_exp", palette = "Purples", 
135 |         breaks=c(50,65,80,Inf),
136 |     title = "Life Expectancy", contrast=0.7, border.col = "gray30", id = "name") +
137 | #  tm_borders() +
138 |   tm_shape(metro) +
139 |   tm_bubbles("pop2010", col = "growth", border.col = "black", 
140 |     border.alpha = 0.6,
141 |     breaks=c(0,25,50,75,Inf),
142 |     palette = "-RdYlGn",
143 |     title.size = "Metro population (2010)", 
144 |     title.col = "Projected Growth by 2020 (%)",
145 |     id = "name") +
146 |   # tm_style("classic",frame=F,
147 |   #          earth.boundary = c(-180, -87, 180, 87),
148 |   #          legend.text.size=0.8,legend.title.size=1.3)   +
149 |   tm_layout(bg.color='white') +
150 | #  tm_format("World", inner.margins = 0.02, frame = FALSE) 
151 |   tm_legend(frame = F) 
152 | 
153 | ```
154 | 
155 | 
156 | ```{r,include=F,eval=F}
157 | View(worldbank_df)
158 | 
159 | qtm(world)
160 | 
161 | Arrests <- USArrests %>% rownames_to_column('State') %>%
162 |   as_tibble()
163 | # NOTE(review): the unfinished join draft below did not parse, so it is kept commented out
164 | # us <- usa_composite() %>% forti
165 | # %>% 
166 | #   left_join(Arrests,by=c('name'='State')) 
167 | 
168 | #   left_join(U)
169 | 
170 | 
171 | us_map <- fortify(usa_composite() , region="fips_state")
172 | ggplot(us_map, aes(map_id=fips_state,fill=pop_2014)) +
173 |   geom_map(map=us_map, color='#ffffff', size=0.1) + 
174 |   expand_limits(x=us_map$long,y=us_map$lat) +
175 |   theme_map() +  
176 |   theme(legend.position="right") +
177 |   coord_map("albers", lat0=30, lat1=40) +
178 |    scale_fill_viridis(option='magma') # the palette argument is `option`, not `options`
179 |   # scale_fill_colormap("State Population\n(2014 Estimates)", labels=comma,
180 |                       # colormap = colormaps$copper, reverse = T, discrete = F)
181 | ```
182 | 
183 | 
184 | 
185 | 


--------------------------------------------------------------------------------
/R/Geospatial_Analysis.md:
--------------------------------------------------------------------------------
  1 | Geospatial Analysis
  2 | ================
  3 | Jesse Cambon
  4 | 24 November, 2019
  5 | 
  6 | Install fifystater package from:
  7 | <https://github.com/wmurphyrd/fiftystater>
  8 | 
  9 | ## References
 10 | 
 11 |   - <https://github.com/mtennekes/tmap>
 12 |   - <https://mran.revolutionanalytics.com/snapshot/2016-03-22/web/packages/tmap/vignettes/tmap-nutshell.html>
 13 | 
 14 | ## Setup
 15 | 
 16 | ``` r
 17 | library(tidyverse)
 18 | library(tidycensus) # census data
 19 | library(ggplot2)
 20 | #library(sf) # geospatial methods
 21 | library(tmap) # thematic mapping
 22 | library(viridis) # color scheme
 23 | #library(wbstats) # world bank
 24 | library(wesanderson) # colors
 25 | library(fiftystater) # US state geometries
 26 | 
 27 | options(tigris_use_cache = TRUE)
 28 | ```
 29 | 
 30 | # Geographies
 31 | 
 32 | ## Locales
 33 | 
 34 | Use the tidycensus package to pull Census data and display it on a map
 35 | with the tmap package.
 36 | 
 37 | ``` r
 38 | # Pull Census Rent Data for Boston using tidycensus package
 39 | bos <- get_acs(geography = "tract", 
 40 |               variables = "B25064_001E",  # median gross rent
 41 |               state = "MA", 
 42 |               county = c("Suffolk",'Middlesex'), 
 43 |               geometry = TRUE)
 44 | 
 45 | tm_shape(bos) +
 46 |   tm_fill('estimate',colorNA = "white",breaks=c(0,1000,1500,2000,3500),
 47 |           title='Median Rent') +
 48 |   tm_borders() +
 49 |    tm_style("classic") +
 50 |   # margin format is c(bottom,left,top,right)
 51 |   tm_layout(inner.margins = c(0.05, .05, .05, .05),main.title.position='center',legend.position=c('left','bottom'),
 52 |             legend.text.size=0.8,legend.title.size=1.3,
 53 |             main.title='Boston Area Rent by Census Tract',
 54 |             main.title.size=1.5) 
 55 | ```
 56 | 
 57 | ![](../rmd_images/Geospatial_Analysis/locale-1.png)<!-- -->
 58 | 
 59 | ``` r
 60 | #vars <- load_variables(2016,'acs1') # view census variables
 61 | ```
 62 | 
 63 | <http://www.robinlovelace.net/presentations/spatial-tidyverse.html#11>
 64 | <https://cran.r-project.org/web/packages/wbstats/vignettes/Using_the_wbstats_package.html>
 65 | 
 66 | ## United States
 67 | 
 68 | ``` r
 69 | data("fifty_states") # fiftystater package
 70 | 
 71 | crimes <- data.frame(state = tolower(rownames(USArrests)), USArrests) %>%
 72 |   # Make a categorical variable for Murder rates with a predefined interval
 73 |   mutate(Murder_cut = str_replace_all(cut_width(Murder,5,boundary=0),',',' - ')) %>%
 74 |   # Delete all characters except for digits, whitespace, and '-'
 75 |   mutate(Murder_cut = str_replace_all(Murder_cut,'[^\\d\\s-]',''))
 76 | 
 77 | # make an ordered list of levels so our categorical variable is sorted properly
 78 | Murder_cut_levels <- crimes %>% arrange(Murder) %>% pull(Murder_cut) %>%
 79 |   unique()
 80 | 
 81 | 
 82 | # map_id creates the aesthetic mapping to the state name column in your data
 83 | ggplot(crimes, aes(map_id = state)) + 
 84 |   # map points to the fifty_states shape data
 85 |   geom_map(aes(fill = factor(Murder_cut,levels=Murder_cut_levels)), 
 86 |            map = fifty_states, color='white',size=0.2) +  # geometry from fiftystater package
 87 |   expand_limits(x = fifty_states$long, y = fifty_states$lat) +
 88 |   coord_map() +
 89 |   theme(plot.title = element_text(lineheight=1, face="bold",hjust = 0.5)) + 
 90 |   scale_x_continuous(breaks = NULL) + 
 91 |   scale_y_continuous(breaks = NULL) +
 92 |   
 93 |   labs(x = "", y = "",title='State Murder Rates in 1975',
 94 |    caption='Data: World Almanac and Book of facts 1975. (Crime rates)') +
 95 |   theme(legend.position = "right", 
 96 |         panel.background = element_blank(),
 97 |         panel.border=element_blank())  +
 98 |   scale_fill_viridis_d(direction=-1,option='inferno',end=0.9)  +
 99 |   guides(fill = guide_legend(title='Murders Per\n100,000 Residents'))
100 | ```
101 | 
102 | ![](../rmd_images/Geospatial_Analysis/unnamed-chunk-2-1.png)<!-- -->
103 | 
104 | ## The World
105 | 
106 | ``` r
107 | # Load world map geometry
108 | data(World)
109 | 
110 | # Load coordinates of cities
111 | data(metro) 
112 | 
113 | tm_shape(World, projection = "eck4" # Eckert IV 1906 project (preserves area)
114 |          ) +
115 |   tm_polygons("gdp_cap_est",
116 |               palette = "Greens",
117 |               breaks = c(0, 1000, 5000, 10000, 25000, 50000, Inf),
118 |               title = "GDP per capita") +
119 |   # tm_style("classic",frame=F,
120 |   #          earth.boundary = c(-180, -87, 180, 87),
121 |   #          legend.text.size=0.8,legend.title.size=1.3)   +
122 |   tm_layout(bg.color='white') +
123 | #  tm_format("World", inner.margins = 0.02, frame = FALSE) 
124 |   tm_legend(frame = TRUE) 
125 | ```
126 | 
127 | ![](../rmd_images/Geospatial_Analysis/unnamed-chunk-3-1.png)<!-- -->
128 | 
129 | ``` r
130 | # tm_format("World",frame=F) 
131 | 
132 | metro <- metro %>%
133 |   mutate(growth= 100*(pop2020 - pop2010) / pop2010)
134 | 
135 | tm_shape(World, projection = "eck4" # Eckert IV 1906 project (preserves area)
136 |          ) +
137 |     tm_polygons("life_exp", palette = "Purples", 
138 |         breaks=c(50,65,80,Inf),
139 |     title = "Life Expectancy", contrast=0.7, border.col = "gray30", id = "name") +
140 | #  tm_borders() +
141 |   tm_shape(metro) +
142 |   tm_bubbles("pop2010", col = "growth", border.col = "black", 
143 |     border.alpha = 0.6,
144 |     breaks=c(0,25,50,75,Inf),
145 |     palette = "-RdYlGn",
146 |     title.size = "Metro population (2010)", 
147 |     title.col = "Projected Growth by 2020 (%)",
148 |     id = "name") +
149 |   # tm_style("classic",frame=F,
150 |   #          earth.boundary = c(-180, -87, 180, 87),
151 |   #          legend.text.size=0.8,legend.title.size=1.3)   +
152 |   tm_layout(bg.color='white') +
153 | #  tm_format("World", inner.margins = 0.02, frame = FALSE) 
154 |   tm_legend(frame = F) 
155 | ```
156 | 
157 |     ## Warning: Values have found that are less than the lowest break
158 |     
159 |     ## Warning: Values have found that are less than the lowest break
160 | 
161 |     ## Variable "growth" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.
162 | 
163 | ![](../rmd_images/Geospatial_Analysis/unnamed-chunk-3-2.png)<!-- -->
164 | 


--------------------------------------------------------------------------------
/R/Modeling_Workflow.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Modeling Workflow"
  3 | author: "Jesse Cambon"
  4 | date: "`r format(Sys.time(), '%d %B, %Y')`"
  5 | output:
  6 |   github_document:
  7 |     toc: true
  8 | ---
  9 | 
 10 | Demonstrates model workflows with the tidyverse and broom. This notebook includes both a `group_by` approach, which fits one model per continent, and a nested approach, which fits one model per country. The nested workflow additionally stores each group's data in the data frame alongside objects such as the fitted models. 
 11 | 
 12 | ## References
 13 | * http://r4ds.had.co.nz/many-models.html 
 14 | 
 15 | ## Setup
 16 | 
 17 | ```{r knit-settings, include=FALSE}
 18 | library(here)
 19 | source(here("rmd_config.R"))
 20 | ```
 21 | 
 22 | ```{r setup,warning=F,message=F}
 23 | library(tidyverse)
 24 | library(gapminder)
 25 | library(broom)
 26 | #library(modelr)
 27 | library(knitr)
 28 | library(kableExtra)
 29 | ```
 30 | 
 31 | ## Exploration
 32 | 
 33 | These graphs show why log transforming GDP per Capita makes it correlate more linearly to our response variable, life expectancy. Log transformations are often useful for highly skewed variables in regression.
 34 | 
 35 | ```{r explore}
 36 | ggplot(data=gapminder,
 37 |           aes(x = gdpPercap, y = lifeExp, color = continent,group=1)) +
 38 | geom_point(alpha=0.7) +
 39 | theme_bw() +
 40 | geom_smooth() +
 41 | theme(legend.position='top',
 42 |   plot.title = element_text(lineheight=1, face="bold",hjust = 0.5)) + 
 43 | guides(color=guide_legend(override.aes = list(size=2.5))) 
 44 | 
 45 | ggplot(data=gapminder,
 46 |           aes(x = log10(gdpPercap), y = lifeExp, color = continent,group=1)) +
 47 | geom_point(alpha=0.7) +
 48 | theme_bw() +
 49 | geom_smooth() +
 50 | theme(legend.position='top',
 51 |   plot.title = element_text(lineheight=1, face="bold",hjust = 0.5)) + 
 52 | guides(color=guide_legend(override.aes = list(size=2.5))) 
 53 | 
 54 | ggplot(data=gapminder,
 55 |           aes(x = log10(pop), y = lifeExp, color = continent,group=1)) +
 56 | geom_point(alpha=0.7) +
 57 | #facet_grid(~continent) +
 58 | theme_bw() +
 59 | geom_smooth() +
 60 | theme(legend.position='top',
 61 |   plot.title = element_text(lineheight=1, face="bold",hjust = 0.5)) + 
 62 | guides(color=guide_legend(override.aes = list(size=2.5))) 
 63 | 
 64 | 
 65 | 
 66 | ```
 67 | 
 68 | ## Grouped Models
 69 | 
 70 | ```{r models}
 71 | 
 72 | # One model per continent; do() stores each lm object in the `fit` list-column
 73 | models <- gapminder %>%
 74 |   group_by(continent) %>%
 75 |   do(fit=lm(lifeExp ~ log10(gdpPercap)+log10(pop) + year, data=.)) 
 76 | # broom no longer tidies rowwise data frames directly, so map each tidier over `fit` and unnest
 77 | stats <- models %>% ungroup() %>% mutate(res=map(fit,glance)) %>% select(-fit) %>% unnest(res) %>%
 78 |   arrange(desc(r.squared))
 79 | 
 80 | coefficients <- models %>% ungroup() %>% mutate(res=map(fit,tidy)) %>% select(-fit) %>% unnest(res) %>%
 81 |   filter(term != '(Intercept)') %>%
 82 |   arrange(continent,p.value)
 83 | 
 84 | model_fit <- models %>% ungroup() %>% mutate(res=map(fit,augment)) %>% select(-fit) %>% unnest(res)
 85 | ```
 86 | 
 87 | 
 88 | ```{r plot}
 89 | ggplot(data=model_fit,
 90 |           aes(x = .fitted, y = .resid, color = continent,group=1)) +
 91 | geom_point(alpha=0.8) +
 92 | facet_grid(~continent) +
 93 | ggtitle('Fitted vs. Residual Check') +
 94 | theme_bw() +
 95 | geom_hline(yintercept=0,color='blue') + # horizontal line at 0 residual
 96 | theme(legend.position='none',
 97 |   plot.title = element_text(lineheight=1, face="bold",hjust = 0.5)) + 
 98 | guides(color=guide_legend(override.aes = list(size=2.5))) +
 99 | xlab('Fitted') +
100 | ylab('Residual')
101 | 
102 | ggplot(data=model_fit,
103 |           aes(.resid)) +
104 | geom_histogram(aes(fill=continent)) +
105 | facet_grid(~continent) +
106 | ggtitle('Residual Distribution') +
107 | theme_bw() +
108 | scale_y_continuous(expand = c(0,0,0.05,0)) + 
109 | theme(legend.position='none',
110 |   plot.title = element_text(lineheight=1, face="bold",hjust = 0.5)) + 
111 | guides(color=guide_legend(override.aes = list(size=2.5))) +
112 | xlab('Residual') +
113 | ylab('Count')
114 | ```
115 | 
116 | ```{r modeldisplay,results='asis',warning=F}
117 | # kable_styling() only customizes HTML/LaTeX tables; on markdown tables it
118 | # just warns and returns its input, so the tables are printed directly.
119 | kable(stats,format='markdown',digits=2)
120 | 
121 | kable(coefficients,format='markdown',digits=4)
122 | ```
123 | 
124 | ## Nested Models
125 | 
126 | Now we fit the same model specification separately for each country, using nested data frames.
127 | 
128 | ```{r}
129 | my_model <- function(df) { # fit the life-expectancy regression to one data frame
130 |   lm(lifeExp ~ log10(gdpPercap)+log10(pop) + year, data= df)
131 | }
132 | 
133 | # Nest the data by continent and country (one model per country)
134 | nested_models <- gapminder %>% 
135 |   group_by(continent,country) %>% 
136 |   nest() %>%
137 |   # fit models
138 |   mutate(fit = map(data, my_model)) %>%
139 |   # per-model observation-level output, fit statistics and coefficient tables
140 |   mutate(augment = map(fit, augment),
141 |     stats = map(fit,glance),
142 |     terms = map(fit,tidy)) %>%
143 |   ungroup()
144 | 
145 | # Dataset with predictions and residuals
146 | nest_fit <- nested_models %>% unnest(augment)
147 | 
148 | nest_stats <- nested_models %>%
149 |   select(continent,country,stats) %>% unnest(stats) %>% # replaces deprecated unnest(.drop=TRUE)
150 |   arrange(desc(r.squared)) 
151 | 
152 | nest_coefficients <- nested_models %>%
153 |   select(continent,country,terms) %>% unnest(terms) %>% # replaces deprecated unnest(.drop=TRUE)
154 |   filter(term != '(Intercept)') %>%
155 |   arrange(continent,country,p.value) # ascending: most significant term first within each country
156 | 
157 | most_important_vars <- nest_coefficients %>%
158 |   group_by(country) %>% 
159 |   slice(1) # first row per country = term with the smallest p-value
160 | 
161 | summ_imp_vars <- most_important_vars %>%
162 |   group_by(continent) %>%
163 |   count(term) %>%
164 |   arrange(continent,desc(n))
165 | ```
166 | 
167 | 
168 | 


--------------------------------------------------------------------------------
/R/Modeling_Workflow.md:
--------------------------------------------------------------------------------
  1 | Modeling Workflow
  2 | ================
  3 | Jesse Cambon
  4 | 24 November, 2019
  5 | 
  6 | Demonstrate model workflows with tidyverse, modelr, and broom. This
  7 | notebook includes both a group\_by and a nested approach which offer
  8 | similar results. However, the nested model workflow embeds the data into
  9 | the dataframe along with objects such as models.
 10 | 
 11 | ## References
 12 | 
 13 |   - <http://r4ds.had.co.nz/many-models.html>
 14 | 
 15 | ## Setup
 16 | 
 17 | ``` r
 18 | library(tidyverse)
 19 | library(gapminder)
 20 | library(broom)
 21 | #library(modelr)
 22 | library(knitr)
 23 | library(kableExtra)
 24 | ```
 25 | 
 26 | ## Exploration
 27 | 
 28 | These graphs show why log transforming GDP per Capita makes it correlate
 29 | more linearly to our response variable, life expectancy. Log
 30 | transformations are often useful for highly skewed variables in
 31 | regression.
 32 | 
 33 | ``` r
 34 | ggplot(data=gapminder,
 35 |           aes(x = gdpPercap, y = lifeExp, color = continent,group=1)) +
 36 | geom_point(alpha=0.7) +
 37 | theme_bw() +
 38 | geom_smooth() +
 39 | theme(legend.position='top',
 40 |   plot.title = element_text(lineheight=1, face="bold",hjust = 0.5)) + 
 41 | guides(color=guide_legend(override.aes = list(size=2.5))) 
 42 | ```
 43 | 
 44 |     ## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
 45 | 
 46 | ![](../rmd_images/Modeling_Workflow/explore-1.png)<!-- -->
 47 | 
 48 | ``` r
 49 | ggplot(data=gapminder,
 50 |           aes(x = log10(gdpPercap), y = lifeExp, color = continent,group=1)) +
 51 | geom_point(alpha=0.7) +
 52 | theme_bw() +
 53 | geom_smooth() +
 54 | theme(legend.position='top',
 55 |   plot.title = element_text(lineheight=1, face="bold",hjust = 0.5)) + 
 56 | guides(color=guide_legend(override.aes = list(size=2.5))) 
 57 | ```
 58 | 
 59 |     ## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
 60 | 
 61 | ![](../rmd_images/Modeling_Workflow/explore-2.png)<!-- -->
 62 | 
 63 | ``` r
 64 | ggplot(data=gapminder,
 65 |           aes(x = log10(pop), y = lifeExp, color = continent,group=1)) +
 66 | geom_point(alpha=0.7) +
 67 | #facet_grid(~continent) +
 68 | theme_bw() +
 69 | geom_smooth() +
 70 | theme(legend.position='top',
 71 |   plot.title = element_text(lineheight=1, face="bold",hjust = 0.5)) + 
 72 | guides(color=guide_legend(override.aes = list(size=2.5))) 
 73 | ```
 74 | 
 75 |     ## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
 76 | 
 77 | ![](../rmd_images/Modeling_Workflow/explore-3.png)<!-- -->
 78 | 
 79 | ## Grouped Models
 80 | 
 81 | ``` r
 82 | # One model per continent
 83 | models <- gapminder %>%
 84 |   group_by(continent) %>%
 85 |   do(fit=lm(lifeExp ~ log10(gdpPercap)+log10(pop) + year, data=.)) 
 86 | 
 87 | stats <- glance(models,fit) %>%
 88 |   arrange(desc(r.squared))
 89 | 
 90 | coefficients <- tidy(models,fit) %>%
 91 |   filter(term != '(Intercept)') %>%
 92 |   arrange(continent,p.value)
 93 | 
 94 | model_fit <- augment(models,fit)
 95 | ```
 96 | 
 97 | ``` r
 98 | ggplot(data=model_fit,
 99 |           aes(x = .fitted, y = .resid, color = continent,group=1)) +
100 | geom_point(alpha=0.8) +
101 | facet_grid(~continent) +
102 | ggtitle('Fitted vs. Residual Check') +
103 | theme_bw() +
104 | geom_hline(yintercept=0,color='blue') + # horizontal line at 0 residual
105 | theme(legend.position='none',
106 |   plot.title = element_text(lineheight=1, face="bold",hjust = 0.5)) + 
107 | guides(color=guide_legend(override.aes = list(size=2.5))) +
108 | xlab('Fitted') +
109 | ylab('Residual')
110 | ```
111 | 
112 | ![](../rmd_images/Modeling_Workflow/plot-1.png)<!-- -->
113 | 
114 | ``` r
# Histogram of the residuals, one panel per continent
model_fit %>%
  ggplot(aes(.resid)) +
  geom_histogram(aes(fill = continent)) +
  facet_grid(~continent) +
  ggtitle('Residual Distribution') +
  theme_bw() +
  # no padding below the bars, 5% multiplicative padding above them
  scale_y_continuous(expand = c(0, 0, 0.05, 0)) +
  theme(
    legend.position = 'none',
    plot.title = element_text(lineheight = 1, face = "bold", hjust = 0.5)
  ) +
  guides(color = guide_legend(override.aes = list(size = 2.5))) +
  xlab('Residual') +
  ylab('Count')
127 | ```
128 | 
129 |     ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
130 | 
131 | ![](../rmd_images/Modeling_Workflow/plot-2.png)<!-- -->
132 | 
133 | ``` r
# Per-continent fit statistics as a markdown table, rounded to 2 digits
stats %>%
  kable(format = 'markdown', digits = 2) %>%
  kable_styling(bootstrap_options = c("striped", 'border'))
136 | ```
137 | 
138 | | continent | r.squared | adj.r.squared | sigma | statistic | p.value | df |    logLik |     AIC |     BIC | deviance | df.residual |
139 | | :-------- | --------: | ------------: | ----: | --------: | ------: | -: | --------: | ------: | ------: | -------: | ----------: |
140 | | Oceania   |      0.96 |          0.96 |  0.78 |    172.97 |       0 |  4 |   \-26.03 |   62.06 |   67.95 |    12.30 |          20 |
141 | | Europe    |      0.80 |          0.80 |  2.41 |    487.82 |       0 |  4 |  \-825.98 | 1661.96 | 1681.39 |  2073.54 |         356 |
142 | | Americas  |      0.72 |          0.72 |  4.96 |    255.52 |       0 |  4 |  \-903.93 | 1817.85 | 1836.37 |  7274.08 |         296 |
143 | | Asia      |      0.70 |          0.70 |  6.50 |    308.12 |       0 |  4 | \-1301.08 | 2612.15 | 2632.06 | 16558.14 |         392 |
144 | | Africa    |      0.50 |          0.50 |  6.48 |    207.77 |       0 |  4 | \-2049.22 | 4108.45 | 4130.63 | 26011.51 |         620 |
145 | 
146 | ``` r
# Per-continent coefficients as a markdown table, rounded to 4 digits
coefficients %>%
  kable(format = 'markdown', digits = 4) %>%
  kable_styling(bootstrap_options = c("striped", 'border'))
149 | ```
150 | 
151 | | continent | term             | estimate | std.error | statistic | p.value |
152 | | :-------- | :--------------- | -------: | --------: | --------: | ------: |
153 | | Africa    | year             |   0.2551 |    0.0160 |   15.8991 |  0.0000 |
154 | | Africa    | log10(gdpPercap) |  11.0142 |    0.7141 |   15.4237 |  0.0000 |
155 | | Africa    | log10(pop)       | \-0.5390 |    0.4192 |  \-1.2857 |  0.1990 |
156 | | Americas  | log10(gdpPercap) |  18.5492 |    1.1513 |   16.1118 |  0.0000 |
157 | | Americas  | year             |   0.2690 |    0.0179 |   15.0519 |  0.0000 |
158 | | Americas  | log10(pop)       | \-1.9190 |    0.5545 |  \-3.4607 |  0.0006 |
159 | | Asia      | log10(gdpPercap) |  12.6233 |    0.7074 |   17.8454 |  0.0000 |
160 | | Asia      | year             |   0.2974 |    0.0219 |   13.5703 |  0.0000 |
161 | | Asia      | log10(pop)       |   2.0425 |    0.4854 |    4.2077 |  0.0000 |
162 | | Europe    | log10(gdpPercap) |  11.5695 |    0.4930 |   23.4667 |  0.0000 |
163 | | Europe    | year             |   0.1005 |    0.0091 |   11.0939 |  0.0000 |
164 | | Europe    | log10(pop)       | \-1.0054 |    0.2244 |  \-4.4804 |  0.0000 |
165 | | Oceania   | year             |   0.1737 |    0.0384 |    4.5299 |  0.0002 |
166 | | Oceania   | log10(pop)       |   0.6644 |    0.5984 |    1.1102 |  0.2801 |
167 | | Oceania   | log10(gdpPercap) |   4.1229 |    4.9721 |    0.8292 |  0.4168 |
168 | 
169 | ## Nested Models
170 | 
Now we fit the same model with nested data frames, this time one model per country.
172 | 
173 | ``` r
# Fit the shared life-expectancy model to a single data frame.
#
# df: data frame with columns lifeExp, gdpPercap, pop and year
#
# Returns the fitted lm object.
my_model <- function(df) {
  return(
    lm(formula = lifeExp ~ log10(gdpPercap) + log10(pop) + year, data = df)
  )
}
177 | 
# Nest the data by continent and country, so one model is fitted per country
nested_models <- gapminder %>% 
  group_by(continent,country) %>% 
  nest() %>%
  # fit models
  mutate(fit = map(data, my_model)) %>%
  # apply the three broom tidiers to every fit and keep each result as a
  # list-column: augment (observation level), glance (model level) and
  # tidy (coefficient level).
  # NOTE(review): the new column `augment` shares its name with the function;
  # any later expression in this pipeline that mentions `augment` would see
  # the column instead — keep the order as is.
  mutate(augment = map(fit, augment),
    stats = map(fit,glance),
    terms = map(fit,tidy)) %>%
  ungroup()

# Dataset with predictions and residuals
nest_fit <- nested_models %>% unnest(augment)

# Model-level statistics per country, highest R-squared first.
# `.drop` triggers the tidyr deprecation warning printed below this chunk;
# according to that warning all list-columns are now preserved.
nest_stats <- nested_models %>%
  unnest(stats,.drop=TRUE) %>%
  arrange(desc(r.squared)) 
196 | ```
197 | 
198 |     ## Warning: The `.drop` argument of `unnest()` is deprecated as of tidyr 1.0.0.
199 |     ## All list-columns are now preserved.
200 |     ## This warning is displayed once per session.
201 |     ## Call `lifecycle::last_warnings()` to see where this warning was generated.
202 | 
203 | ``` r
# Coefficient table: one row per country and model term (intercept removed).
# Only the keys and `terms` are selected before unnesting, because unnest()
# has deprecated `.drop` (tidyr 1.0.0) and now preserves every other
# list-column.
nest_coefficients <- nested_models %>%
  dplyr::select(continent, country, terms) %>%
  unnest(terms) %>%
  filter(term != '(Intercept)') %>%
  # ascending p-value: the most significant term comes first in each country
  # (sorting by desc(p.value) would put the LEAST significant term first)
  arrange(continent, country, p.value)

# First row per country = the term with the smallest p-value
most_important_vars <- nest_coefficients %>%
  group_by(country) %>%
  slice(1)

# How often each term is the most significant one, by continent
summ_imp_vars <- most_important_vars %>%
  group_by(continent) %>%
  count(term) %>%
  arrange(continent, desc(n))
217 | ```
218 | 


--------------------------------------------------------------------------------
/R/Multilevel-Models.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Multilevel Models"
  3 | author: "Jesse Cambon"
  4 | date: "`r format(Sys.time(), '%B, %Y')`"
  5 | output:
  6 |   github_document:
  7 |     toc: true
  8 | ---
  9 | 
 10 | ```{r knit-settings, include=FALSE}
 11 | library(here)
 12 | source(here("rmd_config.R"))
 13 | ```
 14 | 
 15 | References:
 16 | 
 17 | - https://cran.r-project.org/web/packages/lme4/vignettes/lmer.pdf
 18 | - https://www.rensvandeschoot.com/tutorials/lme4/
 19 | 
 20 | 
 21 | ```{r setup, message = F, warning = F}
 22 | library(lme4)
 23 | library(broom.mixed)
 24 | library(rstanarm)
 25 | library(bayesplot)
 26 | library(tidyverse)
 27 | library(bayestestR)
 28 | ```
 29 | 
 30 | 
 31 | ```{r}
# Mixed model of Reaction on Days for the sleepstudy data; the term
# (1 + Days | Subject) gives every Subject its own intercept and Days slope.
fm1 <- lmer(Reaction ~ Days + (1 + Days | Subject), data = sleepstudy)
 33 | ```
 34 | 
 35 | ```{r}
 36 | #sleepstudy
 37 | ```
 38 | ```{r}
 39 | # Overall Trend
 40 | sleepstudy %>%
 41 |   ggplot(aes(x = Days, y = Reaction)) +
 42 |   geom_point() + geom_smooth(method = 'lm')
 43 | 
 44 | #
 45 | sleepstudy %>%
 46 |   ggplot(aes(x = Days, y = Reaction)) +
 47 |   facet_wrap(~Subject) +
 48 |   geom_point() + geom_smooth(method = 'lm')
 49 | ```
 50 | 
 51 | 
 52 | 
 53 | ```{r}
# Term-level estimates of fm1 (tidy) followed by its model-level
# summary (glance)
tidy(fm1)
glance(fm1)
 56 | ```
 57 | 
 58 | ## Bayesian approach
 59 | 
 60 | https://mc-stan.org/users/documentation/case-studies/tutorial_rstanarm.html
 61 | 
 62 | ```{r}
 63 | bm1 <- stan_lmer(Reaction ~ Days + (1 + Days | Subject), data = sleepstudy)
 64 | ```
 65 | 
 66 | ```{r}
# Posterior predictive check of bm1
pp_check(bm1)
 68 | ```
 69 | 
 70 | 
 71 | 
 72 | ```{r, fig.height = 8, fig.width = 5}
# Area plot of the posterior draws of bm1's parameters; the chunk options
# above make the figure tall (8 x 5)
mcmc_areas(bm1)
 74 | ```
 75 | 
 76 | Posterior predictive check
 77 | 
 78 | ```{r}
 79 | ppc_ribbon_grouped(
 80 |   y = sleepstudy$Reaction,
 81 |   yrep = posterior_predict(bm1),
 82 |   x = sleepstudy$Days,
 83 |   prob = 0.5,
 84 |   group = sleepstudy$Subject
 85 | ) 
 86 | ```
 87 | 
 88 | ```{r}
 89 | plot(p_direction(bm1, effects = "fixed", component = "all"))
 90 | plot(p_direction(bm1, effects = "random", component = "all"))
 91 | 
 92 | p_direction(bm1, effects = 'all')
 93 | ```
 94 | 
 95 | 
 96 | ```{r}
# Full printed summary of the Bayesian fit
summary(bm1)
 98 | ```
 99 | 
100 | 


--------------------------------------------------------------------------------
/R/Ordinal_Regression.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Ordinal Regression"
  3 | author: "Jesse Cambon"
  4 | date: "`r format(Sys.time(), '%d %B, %Y')`"
  5 | output:
  6 |   github_document:
  7 |     toc: true
  8 | ---
  9 | 
References:

- GAM ordinal regression: https://stat.ethz.ch/R-manual/R-devel/library/mgcv/html/ocat.html
- Example using polr: https://stats.idre.ucla.edu/r/dae/ordinal-logistic-regression/
- Explanation of GAM interpretation: https://stats.stackexchange.com/questions/226645/generalized-additive-model-interpretation-with-ordered-categorical-family-in-r
 13 | 
 14 | ```{r knit-settings, include=FALSE}
 15 | library(here)
 16 | source(here("rmd_config.R"))
 17 | ```
 18 | 
 19 | ```{r, warning=F}
 20 | #library(Hmisc)
 21 | library(MASS) # polr()
 22 | library(car)
 23 | library(mgcv) # gam model
 24 | library(mgcViz) # gam visualization
 25 | library(ordinal) # clm()
 26 | library(broom)
 27 | library(tidyverse)
 28 | 
 29 | # Find frequency counts for all variables in var list
 30 | 
 31 | var_freq <- function(data,var) {
 32 |   var <- rlang::sym(var)
 33 |   print(var)
 34 | #  print(quo_name(var))
 35 |   if (is.factor(data %>% pull(!!var)) | is.character(data %>% pull(!!var))) {
 36 |   return(data %>% count(!!var) %>% mutate(term=quo_name(var)) %>%
 37 |           rename(level=!!var) %>%
 38 |           mutate(level=as.character(level), # convert to char
 39 |                  is_categorical=1))
 40 |   } else {
 41 |     return(tibble())
 42 |   }
 43 | }
 44 | 
 45 | # Iterate through an entire dataset and return a dataset with all
 46 | # frequencies
 47 | find_all_freqs <- function(data,var_list) {
 48 |   all_freqs <- tibble()
 49 |   for (var in var_list) {
 50 |     all_freqs <- all_freqs %>%
 51 |       bind_rows(var_freq(data,var))
 52 |   }
 53 |   return(all_freqs)
 54 | }
 55 | 
 56 | # obtain list of variables in a model. Remove smooth terms (s())
 57 | obtain_model_varlist <- function(model_obj) {
 58 |     var_list_raw <- unlist(strsplit(as.character(model_obj$formula[3]),split=' \\+ '))
 59 |     # Remove smooth terms (s())
 60 |     return(var_list_raw[!str_detect(var_list_raw,'^s\\(')])
 61 | }
 62 | 
# Adds term_name, level, frequency count and a display label to a tidy
# coefficient data frame.
#
# data:       tidy coefficient table with a `term` column
# term_freqs: level frequencies with columns term, level, n, is_categorical
#             (the shape var_freq()/find_all_freqs() return)
# var_list:   names of the model variables, as strings
#
# Returns `data` with the extra columns term_name, level, n, is_categorical
# and label.
add_termnames <- function(data,term_freqs,var_list) {
  # Regexs to match the varname (when it begins a string).
  # Each name is prefixed through a replacement at '^' and the results are
  # joined with '|' into one alternation.
  # NOTE(review): presumably the replacement '\\^' yields a literal '^'
  # anchor in front of each name — confirm against stringr's replacement rules.
  varregex <- paste(str_replace(var_list,'^','\\^'), collapse = "|")

  return(
  data %>%
  # term_name = the variable part of the term; level = whatever is left of
  # the term once the variable name is removed (NA when no variable matched)
  mutate(term_name = str_extract(term,varregex),
         level = case_when(!is.na(term_name) ~ str_replace(term,varregex,""))) %>%
  # add in frequency counts and labels
  left_join(term_freqs,by=c('term_name'='term','level')) %>%
  mutate(label=case_when(is.na(n) ~ term, # if not categorical then use original label
        is_categorical == 1 ~ str_c(term_name,': ', level,' (',scales::comma(n),')'),
                TRUE ~ str_c(level,' (',scales::comma(n),')')))

  )

}
 81 | 
 82 | 
 83 | ```
 84 | 
 85 | 
 86 | ```{r}
 87 | 
 88 | Mydiamonds <- diamonds %>% 
 89 |   # convert factor to numeric for gam model
 90 |   mutate(cutN=as.numeric(cut),
 91 |           # convert to non-ordered factors
 92 |          color=factor(color,ordered=F),
 93 |          clarity=factor(clarity,ordered=F)
 94 |          )
 95 | 
 96 |     # make wine show up in the R studio environment
 97 | 
 98 | outcomeVar <- 'cut'
 99 | predictors <- 'carat + color + clarity'
100 | 
101 | # Construct formula from strings
102 | lmformula <- as.formula(str_c(outcomeVar,' ~ ',predictors))
103 | 
104 | # train ordinal logistic models
105 | clm_model <- clm(lmformula, data=Mydiamonds)
106 | polr_model <- polr(lmformula, data=Mydiamonds)
107 | # train ordinal GAM model (R is the number of outcome categories)
108 | gam_model <- gam(cutN ~ s(carat) + color + clarity,family=ocat(R=5),data=Mydiamonds) 
109 | 
110 | gam.check(gam_model)
111 | 
112 | # Check for collinearity
113 | concurvity(gam_model)
114 | vif(polr_model)
115 | 
116 | ```
117 | 
118 | 
119 | ```{r}
120 | 
# Categorical variables of the GAM and the frequency counts of their levels
gam_varlist <- obtain_model_varlist(gam_model)
gam_varfreqs <- find_all_freqs(Mydiamonds, gam_varlist)

# clm: model-level statistics and exponentiated coefficients
clm_stats <- clm_model %>% glance()
clm_coef <- clm_model %>% tidy(exponentiate = T)

# polr: model-level statistics and exponentiated coefficients
polr_stats <- polr_model %>% glance()
polr_coef <- polr_model %>% tidy(exponentiate = T)

# gam: statistics, parametric coefficients (with labels) and smooth terms
gam_stats <- gam_model %>% glance()
gam_Lcoef <- gam_model %>%
  tidy(parametric = T) %>%
  add_termnames(gam_varfreqs, gam_varlist)
gam_Scoef <- gam_model %>% tidy(parametric = F)

# gam_allpvalues <- gam_Lcoef %>%
#   dplyr::select(term,p.value) %>%
#   bind_rows(gam_Scoef %>% select(term,p.value)) %>%
#   arrange(p.value)
142 | 
# Extract probability predictions from GAM: one row per observation and class
gam_probs <- predict(gam_model,type='response') %>% 
  # remove "V" from column names so we now have the class labels
  as.data.frame() %>% rename_all(list(replace= ~str_replace_all(.,'V',''))) %>% 
  # row_number() also behaves for an empty frame, unlike 1:nrow(.)
  mutate(obs_num=row_number()) %>%
  gather(class,prob,-obs_num) %>%
  mutate(class=as.numeric(class)) %>% arrange(obs_num,class)

# Extract class predictions: the most probable class of each observation.
# which.max() keeps exactly one row per observation (the lowest class on a
# tie); filter(prob == max(prob)) would return several rows on ties and make
# the result longer than the data it is later joined to column-wise.
gam_pred <- gam_probs %>% group_by(obs_num) %>%
  slice(which.max(prob)) %>%
  ungroup()
154 | 
# Put the class predictions of clm(), polr() and the GAM next to the data
compare_models <- Mydiamonds %>%
  mutate(
    # predict() on the clm model returns a list, so unlist it to get a column
    clm_pred = unlist(predict(clm_model, type = 'class')),
    polr_pred = predict(polr_model, type = 'class'),
    gam_pred = gam_pred %>% pull(class)
  ) %>%
  mutate_all(as.numeric)  # convert from factor to numeric

# Make frequency tables
# freq_preds <- compare_models %>% count(polr_pred,clm_pred)
# freq_predcheck <- compare_models %>% count(cut,clm_pred)

# Chi square test
# chisq.test(freq_preds)
# chisq.test(freq_predcheck)

# Spearman correlation between the observed cut and each model's prediction
with(compare_models, cor(cut, clm_pred, method = 'spearman'))
with(compare_models, cor(cut, polr_pred, method = 'spearman'))
with(compare_models, cor(cut, gam_pred, method = 'spearman'))
176 | 
177 | 
178 | ```
179 | 
180 | ```{r,results='asis'}
# Odds ratios (exponentiated estimates) of the GAM's parametric terms
gam_Lcoef %>%
  filter(label != '(Intercept)') %>%
  ggplot(aes(x = reorder(label, -estimate), y = exp(estimate))) +
  geom_point() +
  # axis fixed to 0-10 with a break every 2
  scale_y_continuous(breaks = seq(0, 10, 2), limits = c(0, 10)) +
  # reference line at an odds ratio of 1
  geom_hline(yintercept = 1, color = 'grey') +
  coord_flip() +
  theme_classic() +
  #geom_pointrange(mapping=aes(ymin=LCLM, ymax=UCLM)) +
  labs(title = 'Odds Ratios of Parametric Terms',
       caption = 'Sample sizes shown in ()') +
  xlab('Term') +
  ylab('Odds Ratio')
192 | ```
193 | 
194 | 
195 | 
196 | 
197 | ```{r}
# Confusion matrices: predicted class in rows, observed cut in columns,
# combinations that never occur filled with 0

check_gam <- count(compare_models, cut, gam_pred) %>%
  spread(cut, n, fill = 0)

check_clm <- count(compare_models, cut, clm_pred) %>%
  spread(cut, n, fill = 0)
205 | 
206 | ```
207 | 
208 | ## Extract data from smooths and plot
209 | 
210 | This method allows us some more direct control over how we plot the smooth terms since we extract the plot data. Alternatively, mgcViz (shown below) can be used.
211 | 
212 | ```{r}
213 | 
# Returns the data needed to plot all smooth terms in a gam model object
# (the original note says 100 points per plot — presumably the plot()
# default; verify)
#
# gam_model: fitted gam model object
#
# Returns a tibble with the columns xlab, ylab, x, fit and se, holding the
# rows of every smooth term stacked; an empty tibble when plot() returns no
# smooth terms.
smooth_data <- function(gam_model) {
  # select=0 prevents plots being shown on screen
  gam_viz <- plot(gam_model, rug=FALSE,select=0)
  
  smooth_df <- tibble() # initialize a dataframe
  
  # seq_along() gives zero iterations for an empty list, whereas
  # 1:length(gam_viz) would count 1, 0 and index past the end
  for (i in seq_along(gam_viz)) {
     print(gam_viz[[i]]$xlab)
    # extract and append data we want
    smooth_df <- smooth_df %>%
      bind_rows(tibble( xlab=gam_viz[[i]]$xlab,
                        ylab=gam_viz[[i]]$ylab,
                        x=gam_viz[[i]]$x,
                        fit=gam_viz[[i]]$fit,
                        se=gam_viz[[i]]$se
                        ))
  }
  return(smooth_df)
} 
236 | 
# Plot data for every smooth term of the GAM
gam_smoothdata <- smooth_data(gam_model)

# Fitted smooth (solid) with fit +/- 2 * se bands (dashed), one panel per term
gam_smoothdata %>%
  ggplot(aes(x, fit)) +
  facet_wrap(~xlab, scales = 'free') +
  geom_line() +
  theme_minimal() +
  geom_line(aes(y = fit + (2 * se)), linetype = 'dashed') +
  geom_line(aes(y = fit - (2 * se)), linetype = 'dashed') +
  scale_y_continuous() +
  scale_x_continuous(labels = scales::comma)
248 | ```
249 | 
250 | 
251 | 
252 | ## Alternatively, Plot Smooth Terms with MgcViz
253 | 
254 | ```{r}
# Wrap the fitted GAM for mgcViz
gam_viz <- getViz(gam_model)

# First smooth term: red fit line and blue dashed interval lines (mul = 5)
first_smooth <- sm(gam_viz, 1)
plot(first_smooth) +
  l_fitLine(colour = "red") +
  #  l_rug(mapping = aes(x=x, y=y), alpha = 0.8) +
  l_ciLine(mul = 5, colour = "blue", linetype = 2) +
  #   l_points(shape = 19, size = 1, alpha = 0.1) +
  theme_classic()
263 | ```
264 | 
265 | ```{r}
# Plot every model term (allTerms = T) and print them on one page (pages = 1)
print(plot(gam_viz, allTerms = T), pages = 1)
267 | ```
268 | 
269 | 


--------------------------------------------------------------------------------
/R/Parsnip.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Parsnip"
  3 | author: "Jesse Cambon"
  4 | date: "`r format(Sys.time(), '%d %B, %Y')`"
  5 | output:
  6 |   github_document:
  7 |     toc: true
  8 | ---
  9 | 
 10 | 
 11 | ```{r knit-settings, include=FALSE}
 12 | library(here)
 13 | source(here("rmd_config.R"))
 14 | ```
 15 | 
References:

* https://tidymodels.github.io/parsnip/articles/parsnip_Intro.html
* https://www.tidyverse.org/blog/2018/11/parsnip-0-0-1/
* https://m-clark.github.io/workshops/bayesian/04_R.html
 20 | 
 21 | Requires packages: ranger, randomForest, rstanarm
 22 | 
 23 | ## Setup
 24 | 
 25 | ```{r}
 26 | library(parsnip)
 27 | library(tidymodels)
 28 | 
 29 | set.seed(4831) # set seed for reproducibility
 30 | # Split data
 31 | split <- initial_split(mtcars, props = 9/10)
 32 | car_train <- training(split)
 33 | car_test  <- testing(split)
 34 | 
 35 | # Defines type of model we want
 36 | car_model <- linear_reg()
 37 | ```
 38 | 
 39 | ## Bayesian Model
 40 | 
 41 | 
 42 | ```{r}
 43 | library(rstanarm)
 44 | 
 45 | wide_prior <- normal(0, 10)
 46 | 
 47 | stan_car_model <- 
 48 |   car_model %>%
 49 |   set_engine("stan", iter = 5000, prior = wide_prior, seed = 2347)
 50 | stan_car_model
 51 | ```
 52 | 
 53 | ```{r}
 54 | # don't print anything:
 55 | ctrl <- fit_control(verbosity = 0)
 56 | 
 57 | stan_fit <- 
 58 |   stan_car_model %>%
 59 |     fit(mpg ~ ., data = car_train, control = ctrl)
 60 | stan_fit
 61 | ```
 62 | 
 63 | 
 64 | ```{r}
# Lower and upper interval bounds (type = 'conf_int') for the test set
predict(stan_fit, car_test,type='conf_int')
 66 | ```
 67 | 
 68 | 
 69 | ```{r}
 70 | library(tidybayes)
 71 | library(bayesplot)
 72 | 
 73 | # tidybayes to extract info
 74 | stan_fit$fit %>% get_variables()
 75 | 
 76 | 
 77 | ## Use bayesplot to plot
 78 | posterior <- as.matrix(stan_fit$fit)
 79 | plot_title <- ggtitle("Posterior distributions",
 80 |                       "with medians and 80% intervals")
 81 | mcmc_areas(posterior,
 82 |            pars = c("cyl", "drat", "am", "wt"),
 83 |            prob = .8) + plot_title
 84 | 
 85 | #stan_fit %>% spread_draws()
 86 | ```
 87 | 
 88 | ```{r}
# Posterior predictive check on the engine-level fit stored in stan_fit$fit
pp_check(stan_fit$fit)
 90 | ```
 91 | 
 92 | ```{r}
 93 | library(shinystan)
 94 | launch_shinystan(stan_fit$fit)
 95 | 
 96 | ```
 97 | 
 98 | 
 99 | 
100 | ## Random Forests 
101 | 
102 | ```{r}
103 | 
# Random forest regression spec: 2000 trees, mtry left open, ranger engine
# with a fixed seed
rf_with_seed <- 
  rand_forest(trees = 2000, mtry = varying(), mode = "regression") %>%
  set_engine("ranger", seed = 63233)

# Fit with ranger.
# set_engine() replaces the engine arguments rather than adding to them, so
# the seed has to be repeated next to keep.inbag or it is silently dropped.
ranger_model <- rf_with_seed %>% 
  set_args(mtry = 4) %>% 
  set_engine("ranger", keep.inbag = TRUE, seed = 63233) %>%
  fit(mpg ~ ., data = mtcars)

# Fit with the randomForest package.
# NOTE(review): switching the engine also discards the ranger seed; call
# set.seed() before this fit if it needs to be reproducible.
rf_model <- rf_with_seed %>% 
  set_args(mtry = 4) %>% 
  set_engine("randomForest") %>%
  fit(mpg ~ ., data = mtcars)
119 | ```
120 | 
121 | ```{r}
# Confidence intervals from the ranger fit for every row of mtcars, the same
# data the model was fitted on.
# NOTE(review): keep.inbag = TRUE on the engine is presumably what makes
# these intervals available — confirm against the ranger documentation.
ranger_predictions <- predict(ranger_model, mtcars, type = "conf_int")
123 | ```
124 | 
125 | 


--------------------------------------------------------------------------------
/R/Parsnip.md:
--------------------------------------------------------------------------------
  1 | Parsnip
  2 | ================
  3 | Jesse Cambon
  4 | 12 April, 2020
  5 | 
References:

  - <https://tidymodels.github.io/parsnip/articles/parsnip_Intro.html>
  - <https://www.tidyverse.org/blog/2018/11/parsnip-0-0-1/>
  - <https://m-clark.github.io/workshops/bayesian/04_R.html>
 10 | 
 11 | Requires packages: ranger, randomForest, rstanarm
 12 | 
 13 | ## Setup
 14 | 
 15 | ``` r
 16 | library(parsnip)
 17 | library(tidymodels)
 18 | ```
 19 | 
 20 |     ## ── Attaching packages ────────────────────────────────────────────────────────────────────── tidymodels 0.1.0 ──
 21 | 
 22 |     ## ✓ broom     0.5.5     ✓ recipes   0.1.9
 23 |     ## ✓ dials     0.0.6     ✓ rsample   0.0.6
 24 |     ## ✓ dplyr     0.8.5     ✓ tibble    2.1.3
 25 |     ## ✓ ggplot2   3.3.0     ✓ tune      0.1.0
 26 |     ## ✓ infer     0.5.1     ✓ workflows 0.1.1
 27 |     ## ✓ purrr     0.3.3     ✓ yardstick 0.0.6
 28 | 
 29 |     ## ── Conflicts ───────────────────────────────────────────────────────────────────────── tidymodels_conflicts() ──
 30 |     ## x purrr::discard()    masks scales::discard()
 31 |     ## x dplyr::filter()     masks stats::filter()
 32 |     ## x recipes::fixed()    masks stringr::fixed()
 33 |     ## x dplyr::lag()        masks stats::lag()
 34 |     ## x ggplot2::margin()   masks dials::margin()
 35 |     ## x recipes::step()     masks stats::step()
 36 |     ## x recipes::yj_trans() masks scales::yj_trans()
 37 | 
 38 | ``` r
set.seed(4831) # set seed for reproducibility
# Split data
# NOTE(review): `props` does not look like initial_split()'s proportion
# argument (`prop`), so the default split appears to be used — the output
# further down reports 24 training observations and 8 test rows.
split <- initial_split(mtcars, props = 9/10)
car_train <- training(split)
car_test  <- testing(split)

# Defines type of model we want
car_model <- linear_reg()
 47 | ```
 48 | 
 49 | ## Bayesian Model
 50 | 
 51 | ``` r
 52 | library(rstanarm)
 53 | ```
 54 | 
 55 |     ## Loading required package: Rcpp
 56 | 
 57 |     ## 
 58 |     ## Attaching package: 'Rcpp'
 59 | 
 60 |     ## The following object is masked from 'package:rsample':
 61 |     ## 
 62 |     ##     populate
 63 | 
 64 |     ## rstanarm (Version 2.19.3, packaged: 2020-02-11 05:16:41 UTC)
 65 | 
 66 |     ## - Do not expect the default priors to remain the same in future rstanarm versions.
 67 | 
 68 |     ## Thus, R scripts should specify priors explicitly, even if they are just the defaults.
 69 | 
 70 |     ## - For execution on a local, multicore CPU with excess RAM we recommend calling
 71 | 
 72 |     ## options(mc.cores = parallel::detectCores())
 73 | 
 74 |     ## - bayesplot theme set to bayesplot::theme_default()
 75 | 
 76 |     ##    * Does _not_ affect other ggplot2 plots
 77 | 
 78 |     ##    * See ?bayesplot_theme_set for details on theme setting
 79 | 
 80 | ``` r
# Prior handed to the Stan engine
wide_prior <- normal(0, 10)

# Linear regression spec with the "stan" engine and its engine arguments;
# the last line prints the specification shown below
stan_car_model <- 
  car_model %>%
  set_engine("stan", iter = 5000, prior = wide_prior, seed = 2347)
stan_car_model
 87 | ```
 88 | 
 89 |     ## Linear Regression Model Specification (regression)
 90 |     ## 
 91 |     ## Engine-Specific Arguments:
 92 |     ##   iter = 5000
 93 |     ##   prior = wide_prior
 94 |     ##   seed = 2347
 95 |     ## 
 96 |     ## Computational engine: stan
 97 | 
 98 | ``` r
# don't print anything:
ctrl <- fit_control(verbosity = 0)

# Fit mpg on all other columns of the training set, then print the fit
stan_fit <- 
  stan_car_model %>%
    fit(mpg ~ ., data = car_train, control = ctrl)
stan_fit
106 | ```
107 | 
108 |     ## parsnip model object
109 |     ## 
110 |     ## Fit time:  6s 
111 |     ## stan_glm
112 |     ##  family:       gaussian [identity]
113 |     ##  formula:      mpg ~ .
114 |     ##  observations: 24
115 |     ##  predictors:   11
116 |     ## ------
117 |     ##             Median MAD_SD
118 |     ## (Intercept) -10.9   32.4 
119 |     ## cyl           0.8    1.9 
120 |     ## disp          0.0    0.0 
121 |     ## hp            0.0    0.0 
122 |     ## drat          2.4    2.3 
123 |     ## wt           -3.3    2.3 
124 |     ## qsec          1.0    0.9 
125 |     ## vs            1.5    2.9 
126 |     ## am            3.4    2.8 
127 |     ## gear          2.6    2.8 
128 |     ## carb         -0.9    1.3 
129 |     ## 
130 |     ## Auxiliary parameter(s):
131 |     ##       Median MAD_SD
132 |     ## sigma 3.0    0.6   
133 |     ## 
134 |     ## ------
135 |     ## * For help interpreting the printed output see ?print.stanreg
136 |     ## * For info on the priors used see ?prior_summary.stanreg
137 | 
138 | ``` r
# Interval bounds for the test set; returns .pred_lower and .pred_upper
predict(stan_fit, car_test,type='conf_int')
140 | ```
141 | 
142 |     ## # A tibble: 8 x 2
143 |     ##   .pred_lower .pred_upper
144 |     ##         <dbl>       <dbl>
145 |     ## 1       17.2         26.1
146 |     ## 2       11.1         18.9
147 |     ## 3       11.2         19.3
148 |     ## 4        6.95        18.1
149 |     ## 5       13.2         29.4
150 |     ## 6       13.5         21.7
151 |     ## 7       26.0         31.3
152 |     ## 8       12.5         36.4
153 | 
154 | ``` r
155 | library(tidybayes)
156 | ```
157 | 
158 |     ## 
159 |     ## Attaching package: 'tidybayes'
160 | 
161 |     ## The following object is masked from 'package:tune':
162 |     ## 
163 |     ##     parameters
164 | 
165 |     ## The following object is masked from 'package:dials':
166 |     ## 
167 |     ##     parameters
168 | 
169 | ``` r
170 | library(bayesplot)
171 | ```
172 | 
173 |     ## This is bayesplot version 1.7.1
174 | 
175 |     ## - Online documentation and vignettes at mc-stan.org/bayesplot
176 | 
177 |     ## - bayesplot theme set to bayesplot::theme_default()
178 | 
179 |     ##    * Does _not_ affect other ggplot2 plots
180 | 
181 |     ##    * See ?bayesplot_theme_set for details on theme setting
182 | 
183 | ``` r
# tidybayes to extract info: names of the variables held by the Stan fit
stan_fit$fit %>% get_variables()
186 | ```
187 | 
188 |     ##  [1] "(Intercept)"   "cyl"           "disp"          "hp"           
189 |     ##  [5] "drat"          "wt"            "qsec"          "vs"           
190 |     ##  [9] "am"            "gear"          "carb"          "sigma"        
191 |     ## [13] "accept_stat__" "stepsize__"    "treedepth__"   "n_leapfrog__" 
192 |     ## [17] "divergent__"   "energy__"
193 | 
194 | ``` r
## Use bayesplot to plot
# posterior draws as a matrix
posterior <- as.matrix(stan_fit$fit)
plot_title <- ggtitle("Posterior distributions",
                      "with medians and 80% intervals")
# area plots for four of the coefficients; prob = .8 matches the subtitle
mcmc_areas(posterior,
           pars = c("cyl", "drat", "am", "wt"),
           prob = .8) + plot_title
202 | ```
203 | 
204 |     ## Warning: `expand_scale()` is deprecated; use `expansion()` instead.
205 | 
206 | ![](../rmd_images/Parsnip/unnamed-chunk-5-1.png)<!-- -->
207 | 
208 | ``` r
209 | #stan_fit %>% spread_draws()
210 | ```
211 | 
212 | ``` r
# Posterior predictive check on the engine-level fit stored in stan_fit$fit
pp_check(stan_fit$fit)
214 | ```
215 | 
216 | ![](../rmd_images/Parsnip/unnamed-chunk-6-1.png)<!-- -->
217 | 
218 | ``` r
219 | library(shinystan)
220 | ```
221 | 
222 |     ## Loading required package: shiny
223 | 
224 |     ## 
225 |     ## This is shinystan version 2.5.0
226 | 
227 | ``` r
# Opens the ShinyStan app for the fit; the output below shows it listening
# on a local port
launch_shinystan(stan_fit$fit)
229 | ```
230 | 
231 |     ## 
232 |     ## Hang on... preparing graphical posterior predictive checks for rstanarm model.
233 |     ## See help('shinystan', 'rstanarm') for how to disable this feature.
234 | 
235 |     ## 
236 |     ## Launching ShinyStan interface... for large models this  may take some time.
237 | 
238 |     ## 
239 |     ## Listening on http://127.0.0.1:6429
240 | 
241 | ## Random Forests
242 | 
243 | ``` r
# Random forest regression spec: 2000 trees, mtry left open, ranger engine
# with a seed
rf_with_seed <- 
  rand_forest(trees = 2000, mtry = varying(), mode = "regression") %>%
  set_engine("ranger", seed = 63233)

# Fit with ranger
# NOTE(review): this second set_engine() call lists only keep.inbag;
# presumably it replaces the engine arguments, which would drop the seed
# set above — verify against the parsnip documentation.
ranger_model <- rf_with_seed %>% 
  set_args(mtry = 4) %>% 
  set_engine("ranger",keep.inbag=TRUE) %>%
  fit(mpg ~ ., data = mtcars)

# Fit with the randomForest package
rf_model <- rf_with_seed %>% 
  set_args(mtry = 4) %>% 
  set_engine("randomForest") %>%
  fit(mpg ~ ., data = mtcars)
259 | ```
260 | 
261 | ``` r
# Confidence intervals from the ranger fit for every row of mtcars
ranger_predictions <- predict(ranger_model, mtcars, type = "conf_int")
263 | ```
264 | 


--------------------------------------------------------------------------------
/R/Power_Analysis.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Power Analysis"
 3 | author: "Jesse Cambon"
 4 | date: "`r format(Sys.time(), '%d %B, %Y')`"
 5 | output:
 6 |   github_document:
 7 |     toc: true
 8 | ---
 9 | 
10 | ```{r knit-settings, include=FALSE}
11 | library(here)
12 | source(here("rmd_config.R"))
13 | ```
14 | 
15 | ```{r}
library(pwr)

# Effect size h for comparing the proportions 0.6 and 0.8
effect_h <- ES.h(0.6, 0.8)

# n is left NULL so that it is the quantity solved for, given the
# significance level and power
pwr.2p.test(h = effect_h,
            n = NULL,
            sig.level = 0.05,
            power = 0.80,
            alternative = "two.sided")
23 | 
24 | ```
25 | 
26 | 
27 | ```{r}
# Same calculation with power.prop.test(), which ships with base R's stats
# package — no library(pwr) call is needed for this chunk.
# n is left NULL so that the sample size per group is the quantity solved
# for, given both proportions, the power and the significance level.
power.prop.test(n = NULL,
                p1 = .6,
                p2 = .8,
                power = 0.8,
                sig.level = 0.05,
                alternative = "two.sided")
36 | ```
37 | 
38 | 
39 | ### Additional References 
40 | 
41 | * [Biostat Handbook – Power Analysis](http://www.biostathandbook.com/power.html)
42 | * [Biostat Handbook - Hypothesis Testing](http://www.biostathandbook.com/hypothesistesting.html)
43 | * [UCLA Intro to Power Analysis](https://stats.idre.ucla.edu/other/mult-pkg/seminars/intro-power/)
44 | * [An online power calculator for proportions](https://www.stat.ubc.ca/~rollin/stats/ssize/b2.html)
45 | * [The pwr R package](https://cran.r-project.org/web/packages/pwr/vignettes/pwr-vignette.html) – a popular R package used for power analysis
46 | 
47 | 


--------------------------------------------------------------------------------
/R/Power_Analysis.md:
--------------------------------------------------------------------------------
 1 | Power Analysis
 2 | ================
 3 | Jesse Cambon
 4 | 22 November, 2019
 5 | 
 6 | Check with <https://www.stat.ubc.ca/~rollin/stats/ssize/b2.html>
 7 | 
 8 | ``` r
library(pwr)
# Solve for n (left NULL) from the effect size h of proportions 0.6 vs 0.8,
# the significance level and the power
pwr.2p.test(
       h=ES.h(0.6,0.8),
       n=NULL,
       sig.level=0.05,
       power=0.80,
       alternative="two.sided")
16 | ```
17 | 
18 |     ## 
19 |     ##      Difference of proportion power calculation for binomial distribution (arcsine transformation) 
20 |     ## 
21 |     ##               h = 0.4421432
22 |     ##               n = 80.29912
23 |     ##       sig.level = 0.05
24 |     ##           power = 0.8
25 |     ##     alternative = two.sided
26 |     ## 
27 |     ## NOTE: same sample sizes
28 | 
29 | ``` r
library(pwr)
# Solve for n (left NULL) directly from the two proportions; the output
# notes that n is the number in each group
power.prop.test(n = NULL, 
                p1 = .6, 
                p2 = .8,
                power=0.8,
                sig.level=0.05,
                alternative="two.sided"
                ) 
38 | ```
39 | 
40 |     ## 
41 |     ##      Two-sample comparison of proportions power calculation 
42 |     ## 
43 |     ##               n = 81.22424
44 |     ##              p1 = 0.6
45 |     ##              p2 = 0.8
46 |     ##       sig.level = 0.05
47 |     ##           power = 0.8
48 |     ##     alternative = two.sided
49 |     ## 
50 |     ## NOTE: n is number in *each* group
51 | 
52 | ### Additional References
53 | 
54 |   - [Biostat Handbook – Power
55 |     Analysis](http://www.biostathandbook.com/power.html)
56 |   - [Biostat Handbook - Hypothesis
57 |     Testing](http://www.biostathandbook.com/hypothesistesting.html)
58 |   - [UCLA Intro to Power
59 |     Analysis](https://stats.idre.ucla.edu/other/mult-pkg/seminars/intro-power/)
60 |   - [An online power calculator for
61 |     proportions](https://www.stat.ubc.ca/~rollin/stats/ssize/b2.html)
62 |   - [The pwr R
63 |     package](https://cran.r-project.org/web/packages/pwr/vignettes/pwr-vignette.html)
64 |     – a popular R package used for power analysis
65 | 


--------------------------------------------------------------------------------
/R/R-Quickstart.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "R Quickstart"
  3 | author: "Jesse Cambon"
  4 | date: "`r format(Sys.time(), '%d %B, %Y')`"
  5 | output:
  6 |   github_document:
  7 |     toc: true
  8 | ---
  9 | 
 10 | ```{r knit-settings, include=FALSE}
 11 | library(here)
 12 | source(here("rmd_config.R"))
 13 | ```
 14 | 
 15 | Simple tidyverse code for common data science operations in R.
 16 | 
 17 | ## Setup
 18 | 
 19 | ```{r setup, message=FALSE, results=FALSE, warning=FALSE}
 20 | library(tidyverse)
 21 | library(ggplot2)
 22 | 
 23 | # Set default ggplot theme
 24 | theme_set(theme_bw()+
 25 |   theme(legend.position = "top",
 26 |             plot.subtitle= element_text(face="bold",hjust=0.5),
 27 |             plot.title = element_text(lineheight=1, face="bold",hjust = 0.5)))
 28 | ```
 29 | 
 30 | ## Data Manipulation
 31 | 
 32 | ### Warm Up
 33 | 
 34 | Initial 'mpg' Dataset:
 35 | ```{r,echo=F} 
 36 | kable(head(mpg,3))
 37 | ```
 38 | 
 39 | Use `View(mpg)` to preview the dataset in R. 
 40 | 
 41 | ```{r}
 42 | mpg_subset <- mpg %>%
 43 |   filter(cyl==4 & year >= 2005  & manufacturer == "nissan") %>%
 44 |   mutate(ratio=hwy/cty,
 45 |          make_model=str_c(manufacturer,' ',model)) %>%
 46 |   select(make_model,cyl,year,hwy,cty,ratio)
 47 | ```
 48 | 
 49 | ```{r,echo=F} 
 50 | kable(mpg_subset)
 51 | ```
 52 | 
 53 | 
 54 | ### Counting
 55 | ```{r}
 56 | count_cyl <- mpg %>%
 57 |   count(cyl)
 58 | ```
 59 | 
 60 | ```{r,echo=F}
 61 | kable(count_cyl)
 62 | ```
 63 | 
 64 | ### Calculate Summary Stats
 65 | ```{r}
 66 | mpg_stats <- mpg %>% select(class,hwy) %>%
 67 |   mutate(class_c=case_when(class %in% c("2seater","subcompact") ~ "subcompact",
 68 |                                TRUE ~ class)) %>%
 69 |   group_by(class_c) %>%
 70 |   summarize(count=n(),
 71 |             max_hwy=max(hwy),
 72 |             min_hwy=min(hwy),
 73 |             median_hwy=median(hwy),
 74 |             mean_hwy=mean(hwy)) %>%
 75 |   ungroup() %>%
 76 |   arrange(desc(count)) # sort dataset
 77 | ```
 78 | 
 79 | Note that '2seater' is reclassified as 'subcompact'
 80 | 
 81 | ```{r,echo=F}
 82 | kable(mpg_stats)
 83 | ```
 84 | 
 85 | ### Stacking Data
 86 | 
 87 | 
 88 | Initial 'mpg' Dataset:
 89 | ```{r,echo=F} 
 90 | kable(head(mpg,3))
 91 | ```
 92 | 
 93 | 
 94 | ```{r}
 95 | mpg1 <- mpg %>% slice(1:2) %>% 
 96 |   select(manufacturer,model,hwy,cty) %>%
 97 |   mutate(dataset=1)
 98 | 
 99 | mpg2 <- mpg %>% slice(44:45) %>%
100 |   select(manufacturer,model,hwy,cty) %>%
101 |   mutate(dataset=2)
102 | 
103 | mpg3 <- mpg %>% slice(1:2,5:6) %>%
104 |   select(displ,year)
105 | ```
106 | 
107 | Stack vertically and horizontally
108 | ```{r}
109 | mpg_stack_vert <- mpg1 %>% 
110 |   bind_rows(mpg2)
111 | 
112 | mpg_stack_horz <- mpg_stack_vert %>%
113 |   bind_cols(mpg3)
114 | ```
115 | 
116 | ### Joining
117 | 
118 | ```{r}
119 | car_type <- mpg %>% select(manufacturer,model,class) %>%
120 |   distinct() # distinct rows only
121 | 
122 | joined <- mpg_stack_horz %>%
123 |   left_join(car_type,by=c('manufacturer','model')) %>% 
124 |   select(-dataset,everything())
125 | ```
126 | 
127 | ### Long to Wide
128 | 
129 | Initial Data:
130 | ```{r,echo=F}
131 | kable(head(us_rent_income,4))
132 | ```
133 | 
134 | * pivot_wider
135 |   * names_from: column containing values that we will use for our new column names
136 |   
137 | ```{r}
138 | col_ratio <- us_rent_income %>%
139 |   select(-GEOID,-moe) %>%
140 |   pivot_wider(names_from = variable, values_from = estimate) %>% 
141 |   drop_na() %>%   # drop missing values
142 |   mutate(income_rent_ratio = income / (12*rent))
143 | ```
144 | 
145 | Income and Rent are now in separate columns:
146 | 
147 | ```{r,echo=F}
148 | kable(head(col_ratio,4))
149 | ```
150 | 
151 | ### Wide to Long
152 | 
153 | Initial Data:
154 | ```{r,echo=F}
155 | kable(head(world_bank_pop,3))
156 | ```
157 | 
158 | * pivot_longer
159 |   * cols (1st arg): what columns do we want to pivot? (ie. subtract ones we don't want to)
160 |   * names_to : the name of new column holding the column names as values
161 |   * values_to : name of new column containing values
162 | * seq(start, stop, increment)  ->  generates sequence
163 | 
164 | ```{r} 
165 | wb_pop <- world_bank_pop %>%
166 |   pivot_longer(c(-country,-indicator), names_to = "year", values_to = "value") %>%
167 |   mutate(year=as.numeric(year)) %>% # convert to numeric
168 |   filter(year %in% seq(2000,2016,2))
169 | ```
170 | 
171 | After:
172 | ```{r,echo=F} 
173 | kable(head(wb_pop,3))
174 | ```
175 | 
176 | 
177 | ## Visualizations
178 | 
179 | ### Bar Chart
180 | 
181 | * use fill argument in ggplot() to set bar color based on a variable
182 | * reorder() orders the bars
183 | ```{r}
184 | # A simple bar chart - mean highway mpg for each vehicle class
185 | # reorder() sorts the bars by descending mean_hwy
186 | ggplot(data=mpg_stats,
187 |     aes(x = reorder(class_c,-mean_hwy), y=mean_hwy)) +
188 | geom_bar(stat='identity',position='dodge',color='black') +
189 | scale_y_continuous(expand = expand_scale(mult = c(0, .1))) +    # plot margins
190 | geom_text(aes(label=round(mean_hwy)), vjust=-0.5) +  # labelling
191 | theme(legend.position="none", # no legend (in case we want to use fill)
192 |       panel.grid = element_blank()) + # turn off grid
193 | labs(title='') +
194 | xlab('') +
195 | ylab('')
196 | ```
197 | 
198 | 
199 | ```{r histogram}
200 | # Histogram of highway mpg (bins 1 mpg wide), filled by number of cylinders
201 | ggplot(mpg,aes(hwy)) +
202 | geom_histogram(aes(fill=factor(cyl)),binwidth=1) + # factor(): numeric cyl would not split the bars
203 | scale_y_continuous(expand = expand_scale(mult = c(0, .05))) +
204 | xlab('Highway mpg') + ylab('Count') + labs(fill='Cylinders')
205 | ```
206 | 
207 | ## Line
208 | 
209 | We divide the `value` field by 100 to convert it from a percentage to a decimal fraction, which `scales::percent` then formats as a percentage.
210 | 
211 | SP.POP.GROW is the % population growth
212 | 
213 | ```{r line}
214 | ggplot(wb_pop %>% filter(country %in% c("USA","CAN","MEX") & indicator == "SP.POP.GROW"),
215 |           aes(x=year,y=value/100,color = country)) +
216 |   theme_classic() +
217 | geom_line() + geom_point() + # lines and points
218 | scale_x_continuous(expand = expand_scale(mult = c(.05, .05))) +
219 | scale_y_continuous(labels=scales::percent) + 
220 | labs(title='',
221 |      caption='') +
222 | theme(legend.title = element_blank(),
223 |       panel.grid.minor.x = element_blank(),
224 |       legend.text=element_text(size=10),
225 |       legend.position='right') +
226 | xlab('Year') +
227 | ylab('Population Growth') +
228 | # make legend items bigger
229 | guides(colour = guide_legend(override.aes = list(size=2))) 
230 | 
231 | ```
232 | 
233 | ## Lollipop
234 | 
235 | ```{r lollipop}
236 |   ggplot(data=col_ratio %>% arrange(desc(rent)) %>% head(15), aes(x=NAME, y=rent) ) +
237 |     geom_segment( aes(x=reorder(NAME,rent) ,xend=NAME, y=0, yend=rent), color="grey") +
238 |     geom_point(size=3) +
239 |    theme_minimal() +
240 |   theme(plot.subtitle= element_text(face="bold",hjust=0.5),
241 |       plot.title = element_text(lineheight=1, face="bold",hjust = 0.5),
242 |       panel.grid.minor.y = element_blank(),
243 |       panel.grid.major.y = element_blank(),
244 |       panel.grid.minor.x = element_blank()
245 |     ) +
246 |   coord_flip() +
247 |     scale_y_continuous(labels=scales::dollar,expand = expand_scale(mult = c(0, .1))) + 
248 |     labs(title='States With Highest Rent',
249 |         caption='Source: 2017 American Community Survey (Census)') +
250 |     xlab('') + ylab('Median Monthly Rent')
251 | ```
252 | 
253 | 
254 | 


--------------------------------------------------------------------------------
/R/R_Quotation.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "R Quotation Methods"
 3 | author: "Jesse Cambon"
 4 | date: "`r format(Sys.time(), '%d %B, %Y')`"
 5 | output:
 6 |   github_document:
 7 |     toc: true
 8 | ---
 9 | 
10 | Demonstrate the use of the quo_name() and enquo() functions
11 | to pass variable names to functions and utilize both the variables
12 | and the variable names 
13 | 
14 | ```{r knit-settings, include=FALSE}
15 | library(here)
16 | source(here("rmd_config.R"))
17 | ```
18 | 
19 | ```{r, warning=F,message=F}
20 | library(tidyverse)
21 | library(knitr)
22 | 
23 | # Draw a heatmap of the mean of `metric` for each combination of the two
24 | # axis variables; columns are passed as bare (unquoted) names
25 | car_heatmap <- function(data,axis1,axis2,metric) {
26 |   # handle quotations
27 |   
28 |   # The dataset can be passed easily as a function argument
29 |   # but we need to use enquo() and the !! operator (below) 
30 |   # for variable names
31 |   # If you were doing this outside of a function you would use quo() instead of enquo()
32 |   axis1 <- enquo(axis1)
33 |   axis2 <- enquo(axis2)
34 |   metric <- enquo(metric)
35 |   
36 |   
37 |   print(quo_name(metric)) # show which metric is being plotted
38 | 
39 |   cars_agg <- data %>%
40 |     group_by(!!axis1,!!axis2) %>%
41 |     summarize(fill_metric = mean(!!metric),n=n())
42 |   
43 |    ggplot(cars_agg, aes(factor(!!axis1), factor(!!axis2))) + 
44 |      geom_tile(aes(fill = fill_metric), colour = "grey") + 
45 |      scale_fill_gradient(low = "white",high = "steelblue") +
46 |      geom_text(aes(label=round(fill_metric,1))) +
47 |      theme_minimal() +
48 |      theme(panel.grid = element_blank()) +
49 |      scale_x_discrete(expand=c(0,0,0,0)) +
50 |      # quo_name() returns the name of a quoted variable as a character string
51 |      guides(fill=guide_legend(title=str_c('mean ',quo_name(metric)))) +
52 |      xlab(quo_name(axis1)) + ylab(quo_name(axis2))
53 | }
54 | 
55 | car_heatmap(mtcars,cyl,gear,mpg)
56 | car_heatmap(mtcars,cyl,gear,hp)
57 | car_heatmap(mtcars,gear,carb,mpg)
58 | ```
59 | 
60 | To quote a character vector of variable names, use `rlang::syms()` and the `!!!` operator
61 | 
62 | ```{r}
63 | # Summarise horsepower (n, mean, min, max) for each combination of the
64 | # grouping columns named in the character vector `variables`
65 | hp_calc <- function(data,variables) {
66 |   group_syms <- rlang::syms(variables)
67 |   hp_summary <- data %>%
68 |     group_by(!!!group_syms) %>%
69 |     summarize(n=n(),
70 |               mean_hp=mean(hp),
71 |               min_hp=min(hp), max_hp=max(hp))
72 |   return(hp_summary)
73 | }
74 |  
75 | gear_hp <- hp_calc(mtcars,c('gear')) 
76 | vs_am_hp <- hp_calc(mtcars,c('vs','am')) 
77 | 
78 | kable(gear_hp)
79 | kable(vs_am_hp)
80 | ```
81 | 
82 | 
83 | 


--------------------------------------------------------------------------------
/R/R_Quotation.md:
--------------------------------------------------------------------------------
  1 | R Quotation Methods
  2 | ================
  3 | Jesse Cambon
  4 | 24 November, 2019
  5 | 
  6 | Demonstrate the use of the quo\_name() and enquo() functions to pass
  7 | variable names to functions and utilize both the variables and the
  8 | variable names
  9 | 
 10 | ``` r
 11 | library(tidyverse)
 12 | library(knitr)
 13 | 
 14 | # Create a heatmap with two axis variables
 15 | # of the mean of a given metric variable
 16 | car_heatmap <- function(data,axis1,axis2,metric) {
 17 |   # handle quotations
 18 |   
 19 |   # The dataset can be passed easily as a function argument
 20 |   # but we need to use enquo() and the !! operator (below) 
 21 |   # for variable names
 22 |   # If you were doing this outside of function you would use quo() instead of enquo()
 23 |   axis1 <- enquo(axis1)
 24 |   axis2 <- enquo(axis2)
 25 |   metric <- enquo(metric)
 26 |   
 27 |   
 28 |   print(quo_name(metric))
 29 | 
 30 |   cars_agg <- data %>%
 31 |     group_by(!!axis1,!!axis2) %>%
 32 |     summarize(fill_metric = mean(!!metric),n=n())
 33 |   
 34 |    ggplot(cars_agg, aes(factor(!!axis1), factor(!!axis2))) + 
 35 |      geom_tile(aes(fill = fill_metric), colour = "grey") + 
 36 |      scale_fill_gradient(low = "white",high = "steelblue") +
 37 |      geom_text(aes(label=round(fill_metric,1))) +
 38 |      theme_minimal() +
 39 |      theme(panel.grid = element_blank()) +
 40 |      scale_x_discrete(expand=c(0,0,0,0)) +
 41 |      # quo_name() access the character name of a variable
 42 |      guides(fill=guide_legend(title=str_c('mean ',quo_name(metric)))) +
 43 |      xlab(quo_name(axis1)) + ylab(quo_name(axis2))
 44 | }
 45 | 
 46 | car_heatmap(mtcars,cyl,gear,mpg)
 47 | ```
 48 | 
 49 |     ## [1] "mpg"
 50 | 
 51 | ![](../rmd_images/R_Quotation/unnamed-chunk-1-1.png)<!-- -->
 52 | 
 53 | ``` r
 54 | car_heatmap(mtcars,cyl,gear,hp)
 55 | ```
 56 | 
 57 |     ## [1] "hp"
 58 | 
 59 | ![](../rmd_images/R_Quotation/unnamed-chunk-1-2.png)<!-- -->
 60 | 
 61 | ``` r
 62 | car_heatmap(mtcars,gear,carb,mpg)
 63 | ```
 64 | 
 65 |     ## [1] "mpg"
 66 | 
 67 | ![](../rmd_images/R_Quotation/unnamed-chunk-1-3.png)<!-- -->
 68 | 
 69 | To quote a character list of variables, use rlang::syms() and the \!\!\!
 70 | operator
 71 | 
 72 | ``` r
 73 | # Find frequency counts for all variables in var list
 74 | 
 75 | hp_calc <- function(data,variables) {
 76 |   variables <- rlang::syms(variables)
 77 |   return(data %>% group_by(!!!variables) %>%
 78 |            summarize(n=n(),
 79 |                      mean_hp=mean(hp),
 80 |                      min_hp=min(hp),
 81 |                      max_hp=max(hp))
 82 |          )
 83 | }
 84 |  
 85 | gear_hp <- hp_calc(mtcars,c('gear')) 
 86 | vs_am_hp <- hp_calc(mtcars,c('vs','am')) 
 87 | 
 88 | kable(gear_hp)
 89 | ```
 90 | 
 91 | | gear |  n | mean\_hp | min\_hp | max\_hp |
 92 | | ---: | -: | -------: | ------: | ------: |
 93 | |    3 | 15 | 176.1333 |      97 |     245 |
 94 | |    4 | 12 |  89.5000 |      52 |     123 |
 95 | |    5 |  5 | 195.6000 |      91 |     335 |
 96 | 
 97 | ``` r
 98 | kable(vs_am_hp)
 99 | ```
100 | 
101 | | vs | am |  n |  mean\_hp | min\_hp | max\_hp |
102 | | -: | -: | -: | --------: | ------: | ------: |
103 | |  0 |  0 | 12 | 194.16667 |     150 |     245 |
104 | |  0 |  1 |  6 | 180.83333 |      91 |     335 |
105 | |  1 |  0 |  7 | 102.14286 |      62 |     123 |
106 | |  1 |  1 |  7 |  80.57143 |      52 |     113 |
107 | 


--------------------------------------------------------------------------------
/R/Regression Model Tidying.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Regression Model Tidying"
  3 | author: "Jesse Cambon"
  4 | date: "`r format(Sys.time(), '%d %B, %Y')`"
  5 | output:
  6 |   github_document:
  7 |     toc: true
  8 | ---
  9 | 
 10 | Example of labeling categorical variables in a regression model
 11 | 
 12 | ```{r knit-settings, include=FALSE}
 13 | source(here::here("rmd_config.R"))
 14 | ```
 15 | 
 16 | ## Setup
 17 | 
 18 | ```{r, message = FALSE, warning = FALSE}
 19 | library(broom)
 20 | library(tidyverse)
 21 | 
 22 | # obtain character vector of predictor term labels in a model object (smooth terms s() removed).
 23 | obtain_model_varlist <- function(model_obj) {
 24 |   # terms() parses the formula, so `0 +`/`- 1` or a long right-hand side cannot yield bogus names
 25 |   term_labels <- attr(terms(formula(model_obj)), "term.labels")
 26 |   return(term_labels[!grepl("^s\\(", term_labels)]) # drop smooth terms such as s(x)
 27 | }
 28 | 
 29 | # Frequency counts for each level of one categorical (factor/character) column
 30 | var_freq <- function(data, var) {
 31 |   var <- rlang::sym(var)
 32 |   print(var)
 33 | 
 34 |   column_values <- data %>% pull(!!var)
 35 |   # non-categorical columns contribute no rows
 36 |   if (!(is.factor(column_values) || is.character(column_values))) {
 37 |     return(tibble())
 38 |   }
 39 |   level_counts <- data %>%
 40 |     count(!!var) %>%
 41 |     mutate(term = quo_name(var)) %>%
 42 |     rename(level = !!var) %>%
 43 |     mutate(level = as.character(level), is_categorical = 1)
 44 |   return(level_counts %>% select(term, everything()))
 45 | }
 46 | 
 47 | # Stack the level counts of every categorical variable named in var_list
 48 | # into a single data frame (non-categorical variables add no rows)
 49 | find_all_freqs <- function(data, var_list) {
 50 |   per_variable <- lapply(
 51 |     var_list,
 52 |     function(var_name) var_freq(data, var_name)
 53 |   )
 54 |   # bind_rows() accepts a list of data frames; tibble() seeds the empty case
 55 |   return(bind_rows(tibble(), per_variable))
 56 | }
 57 | 
 58 | # Adds term_name, level and label columns to a tidy model data frame; labels for categorical terms include the sample size
 59 | add_termnames <- function(data, term_freqs, var_list) {
 60 |   # Regex alternation matching any variable name at the start of a term (names are inserted as-is, not regex-escaped)
 61 |   varregex <- paste(str_replace(var_list, "^", "\\^"), collapse = "|")
 62 | 
 63 |   return(
 64 |     data %>%
 65 |       mutate(
 66 |         term_name = coalesce(
 67 |           str_extract(term, varregex),
 68 |           term
 69 |           ),
 70 |         level = case_when(!is.na(term_name) ~ str_replace(term, varregex, ""))
 71 |       ) %>%
 72 |       # join the per-level frequency counts, then build the display label
 73 |       left_join(term_freqs, by = c("term_name" = "term", "level")) %>%
 74 |       mutate(label = case_when(
 75 |         is_categorical == 1 ~ str_c(term_name, ": ", level, " (", scales::comma(n), ")"),
 76 |         TRUE ~ str_c(term_name)
 77 |       ))
 78 |   )
 79 | }
 80 | ```
 81 | 
 82 | ## Build Linear Model
 83 | 
 84 | ```{r}
 85 | Mymtcars <- mtcars %>%
 86 |   mutate(
 87 |     Cylinders = factor(cyl),
 88 |     Gears = factor(gear)
 89 |   )
 90 | 
 91 | car_model <- lm(mpg ~ Cylinders + disp + Gears, data = Mymtcars)
 92 | 
 93 | # obtain list of independent variables
 94 | car_varlist <- obtain_model_varlist(car_model)
 95 | # sample sizes for categorical variable levels
 96 | car_freqs <- find_all_freqs(Mymtcars, car_varlist)
 97 | 
 98 | tidy_car <- tidy(car_model, conf.int = T) %>%
 99 |   add_termnames(car_freqs, car_varlist)
100 | 
101 | glance_car <- glance(car_model)
102 | ```
103 | 
104 | ## Plot Coefficients 
105 | 
106 | ```{r}
107 | 
108 | tidy_car %>%
109 |   ggplot(
110 |     aes(x = reorder(term, -estimate), y = estimate)
111 |   ) +
112 |   geom_point() +
113 |   scale_y_continuous() +
114 |   geom_hline(yintercept = 0, color = "grey") +
115 |   coord_flip() +
116 |   theme_bw() +
117 |   theme(plot.title = element_text(lineheight = 1, face = "bold", hjust = 0.5)) +
118 |   geom_pointrange(mapping = aes(ymin = conf.low, ymax = conf.high)) +
119 |   labs(
120 |     title = "MPG Linear Model - Default Labels",
121 |     caption = "Horizontal lines represents 95% confidence intervals."
122 |   ) +
123 |   xlab("Term") +
124 |   ylab("Coefficient")
125 | 
126 | 
127 | tidy_car %>%
128 |   ggplot(
129 |     aes(x = reorder(label, -estimate), y = estimate)
130 |   ) +
131 |   geom_point() +
132 |   scale_y_continuous() +
133 |   geom_hline(yintercept = 0, color = "grey") +
134 |   coord_flip() +
135 |   theme_bw() +
136 |   theme(plot.title = element_text(lineheight = 1, face = "bold", hjust = 0.5)) +
137 |   geom_pointrange(mapping = aes(ymin = conf.low, ymax = conf.high)) +
138 |   labs(
139 |     title = "MPG Linear Model - With Improved Labels",
140 |     caption = "Sample sizes shown in (). Horizontal lines represents 95% confidence intervals."
141 |   ) +
142 |   xlab("Term") +
143 |   ylab("Coefficient")
144 | ```
145 | 


--------------------------------------------------------------------------------
/R/Regression-Model-Tidying.md:
--------------------------------------------------------------------------------
  1 | Regression Model Tidying
  2 | ================
  3 | Jesse Cambon
  4 | 12 September, 2021
  5 | 
  6 | -   [Setup](#setup)
  7 | -   [Build Linear Model](#build-linear-model)
  8 | -   [Plot Coefficients](#plot-coefficients)
  9 | 
 10 | Example of labeling categorical variables in a regression model
 11 | 
 12 | ## Setup
 13 | 
 14 | ``` r
 15 | library(broom)
 16 | library(tidyverse)
 17 | 
 18 | # obtain character list of independent variables in a model object.
 19 | obtain_model_varlist <- function(model_obj) {
 20 |   var_list_raw <- unlist(strsplit(as.character(formula(model_obj)[3]), split = " \\+ "))
 21 |   # Remove smooth terms (s())
 22 |   return(var_list_raw[!str_detect(var_list_raw, "^s\\(")])
 23 | }
 24 | 
 25 | # Find frequency counts for all categorical variables in var list
 26 | var_freq <- function(data, var) {
 27 |   var <- rlang::sym(var)
 28 |   print(var)
 29 | 
 30 |   if (is.factor(data %>% pull(!!var)) | is.character(data %>% pull(!!var))) {
 31 |     return(data %>% count(!!var) %>% mutate(term = quo_name(var)) %>%
 32 |       rename(level = !!var) %>%
 33 |       mutate(
 34 |         level = as.character(level), # convert to char
 35 |         is_categorical = 1
 36 |       ) %>%
 37 |       select(term, everything()))
 38 |   } else {
 39 |     return(tibble())
 40 |   }
 41 | }
 42 | 
 43 | # Iterate through an entire dataset and return a dataset with sample
 44 | # sizes for all levels of categorical variables
 45 | find_all_freqs <- function(data, var_list) {
 46 |   all_freqs <- tibble()
 47 |   for (var in var_list) {
 48 |     all_freqs <- all_freqs %>%
 49 |       bind_rows(var_freq(data, var))
 50 |   }
 51 |   return(all_freqs)
 52 | }
 53 | 
 54 | # adds term_name field to a tidy dataframe which includes sample sizes
 55 | add_termnames <- function(data, term_freqs, var_list) {
 56 |   # Regexs to match the varname (when it begins a string)
 57 |   varregex <- paste(str_replace(var_list, "^", "\\^"), collapse = "|")
 58 | 
 59 |   return(
 60 |     data %>%
 61 |       mutate(
 62 |         term_name = coalesce(
 63 |           str_extract(term, varregex),
 64 |           term
 65 |           ),
 66 |         level = case_when(!is.na(term_name) ~ str_replace(term, varregex, ""))
 67 |       ) %>%
 68 |       # add in frequency counts and labels
 69 |       left_join(term_freqs, by = c("term_name" = "term", "level")) %>%
 70 |       mutate(label = case_when(
 71 |         is_categorical == 1 ~ str_c(term_name, ": ", level, " (", scales::comma(n), ")"),
 72 |         TRUE ~ str_c(term_name)
 73 |       ))
 74 |   )
 75 | }
 76 | ```
 77 | 
 78 | ## Build Linear Model
 79 | 
 80 | ``` r
 81 | Mymtcars <- mtcars %>%
 82 |   mutate(
 83 |     Cylinders = factor(cyl),
 84 |     Gears = factor(gear)
 85 |   )
 86 | 
 87 | car_model <- lm(mpg ~ Cylinders + disp + Gears, data = Mymtcars)
 88 | 
 89 | # obtain list of independent variables
 90 | car_varlist <- obtain_model_varlist(car_model)
 91 | # sample sizes for categorical variable levels
 92 | car_freqs <- find_all_freqs(Mymtcars, car_varlist)
 93 | ```
 94 | 
 95 |     ## Cylinders
 96 |     ## disp
 97 |     ## Gears
 98 | 
 99 | ``` r
100 | tidy_car <- tidy(car_model, conf.int = T) %>%
101 |   add_termnames(car_freqs, car_varlist)
102 | 
103 | glance_car <- glance(car_model)
104 | ```
105 | 
106 | ## Plot Coefficients
107 | 
108 | ``` r
109 | tidy_car %>%
110 |   ggplot(
111 |     aes(x = reorder(term, -estimate), y = estimate)
112 |   ) +
113 |   geom_point() +
114 |   scale_y_continuous() +
115 |   geom_hline(yintercept = 0, color = "grey") +
116 |   coord_flip() +
117 |   theme_bw() +
118 |   theme(plot.title = element_text(lineheight = 1, face = "bold", hjust = 0.5)) +
119 |   geom_pointrange(mapping = aes(ymin = conf.low, ymax = conf.high)) +
120 |   labs(
121 |     title = "MPG Linear Model - Default Labels",
122 |     caption = "Horizontal lines represents 95% confidence intervals."
123 |   ) +
124 |   xlab("Term") +
125 |   ylab("Coefficient")
126 | ```
127 | 
128 | ![](../rmd_images/Regression-Model-Tidying/unnamed-chunk-3-1.png)<!-- -->
129 | 
130 | ``` r
131 | tidy_car %>%
132 |   ggplot(
133 |     aes(x = reorder(label, -estimate), y = estimate)
134 |   ) +
135 |   geom_point() +
136 |   scale_y_continuous() +
137 |   geom_hline(yintercept = 0, color = "grey") +
138 |   coord_flip() +
139 |   theme_bw() +
140 |   theme(plot.title = element_text(lineheight = 1, face = "bold", hjust = 0.5)) +
141 |   geom_pointrange(mapping = aes(ymin = conf.low, ymax = conf.high)) +
142 |   labs(
143 |     title = "MPG Linear Model - With Improved Labels",
144 |     caption = "Sample sizes shown in (). Horizontal lines represents 95% confidence intervals."
145 |   ) +
146 |   xlab("Term") +
147 |   ylab("Coefficient")
148 | ```
149 | 
150 | ![](../rmd_images/Regression-Model-Tidying/unnamed-chunk-3-2.png)<!-- -->
151 | 


--------------------------------------------------------------------------------
/R/Rethinking-Tadpoles.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Multilevel Tadpoles"
 3 | author: "Jesse Cambon"
 4 | date: "`r format(Sys.time(), '%B, %Y')`"
 5 | output:
 6 |   github_document:
 7 |     toc: true
 8 | ---
 9 | 
10 | ```{r knit-settings, include=FALSE}
11 | library(here)
12 | source(here("rmd_config.R"))
13 | ```
14 | 
15 | Reproduce the multilevel model made for the tadpole data in Chapter 13 of [Statistical Rethinking](https://xcelab.net/rm/statistical-rethinking/).
16 | 
17 | Use this code as a starting point:
18 | 
19 | https://bookdown.org/ajkurz/Statistical_Rethinking_recoded/multilevel-models.html
20 | 
21 | Also see:
22 | 
23 | - https://cran.r-project.org/web/packages/brms/vignettes/brms_multilevel.pdf
24 | - https://www.rensvandeschoot.com/tutorials/brms-started/
25 | - https://www.rensvandeschoot.com/tutorials/brms/
26 | 
27 | Info on 'trials' function: https://cran.r-project.org/web/packages/brms/vignettes/brms_customfamilies.html
28 | 
29 | ```{r setup, message = F, warning = F}
30 | library(rethinking)
31 | library(brms)
32 | library(tidyverse)
33 | library(bayesplot)
34 | data("reedfrogs")
35 | ```
36 | 
37 | 
38 | 
39 | ```{r}
40 | d <- reedfrogs %>%
41 |   # one tank id per row, stored as a factor
42 |   mutate(tank = as.factor(seq_len(nrow(.))))
43 | ```
44 | 
45 | ```{r}
46 | 
47 | # No pooling: tank enters as an ordinary (unpooled) predictor
48 | b12.1 <- 
49 |   brm(formula = surv | trials(density) ~ tank,
50 |       data = d, family = binomial,
51 |       prior = prior(normal(0, 5), class = b),
52 |       iter = 2000, warmup = 500, chains = 4, cores = 4,
53 |       seed = 12)
54 | 
55 | # Partial pooling (multilevel model): varying intercept for each tank
56 | b12.2 <- 
57 |   brm(formula = surv | trials(density) ~ (1 | tank),
58 |       data = d, family = binomial,
59 |       prior = c(prior(normal(0, 1), class = Intercept),
60 |                 prior(cauchy(0, 1), class = sd)),
61 |       iter = 4000, warmup = 1000, chains = 4, cores = 4,
62 |       seed = 12)
63 | 
64 | ```
65 | 
66 | 
67 | ```{r}
68 | pp_check(b12.1)
69 | pp_check(b12.2)
70 | ```
71 | 
72 | ```{r}
73 | mcmc_areas(b12.2, regex_pars = c('^r_', '^b_')) # `pars` needs exact names; `regex_pars` takes patterns
74 | ```
75 | ```{r}
76 | summary(b12.2)
77 | 
78 | post <- posterior_samples(b12.2)
79 | ```
80 | 
81 | 


--------------------------------------------------------------------------------
/R/Sentiment_Analysis.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Sentiment Analysis with R"
  3 | author: "Jesse Cambon"
  4 | date: "`r format(Sys.time(), '%d %B, %Y')`"
  5 | output:
  6 |   github_document:
  7 |     toc: true
  8 | ---
  9 | 
 10 | 
 11 | ```{r knit-settings, include=FALSE}
 12 | source(here::here("rmd_config.R"))
 13 | ```
 14 | 
 15 | ## Tidytext
 16 | 
 17 | Using tidytext for sentiment analysis
 18 | 
 19 | * https://www.tidytextmining.com/sentiment.html
 20 | 
 21 | ```{r, warning=F, message=F}
 22 | library(janeaustenr)
 23 | library(tidyverse)
 24 | library(tidytext)
 25 | library(knitr)
 26 | library(sentimentr)
 27 | 
 28 | # import original dataset - one row per line of each jane austen book
 29 | austen_df <- austen_books()
 30 | 
 31 | # tokenize each jane austen book
 32 | tidy_books <- austen_df %>%
 33 |   group_by(book) %>%
 34 |   # add some variables
 35 |   mutate(
 36 |     linenumber = row_number(),
 37 |     chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
 38 |       ignore_case = TRUE
 39 |     )))
 40 |   ) %>%
 41 |   ungroup() %>%
 42 |   unnest_tokens(word, text)
 43 | ```
 44 | 
 45 | Sentiment Analysis (grouping by book)
 46 | 
 47 | ```{r}
 48 | jane_austen_sentiment <- tidy_books %>%
 49 |   inner_join(get_sentiments("afinn"))
 50 | 
 51 | kable(head(jane_austen_sentiment, 5))
 52 | 
 53 | # Summarize sentiment by book
 54 | sentiment_summ <- jane_austen_sentiment %>%
 55 |   group_by(book) %>%
 56 |   summarize(
 57 |     mean_sentiment = mean(value),
 58 |     num_sentiment_words = n()
 59 |   ) %>%
 60 |   ungroup() %>%
 61 |   arrange(desc(mean_sentiment))
 62 | 
 63 | kable(sentiment_summ)
 64 | ```
 65 | 
 66 | ## Sentiment Aggregation with tidytext
 67 | 
 68 | Make a function for quickly tokenizing a string and returning the mean sentiment
 69 | 
 70 | ```{r}
 71 | mean_sentiment <- function(.tbl, text) {
 72 |   # Computes the mean AFINN sentiment of the text in each row of .tbl
 73 |   # Args:
 74 |   #   .tbl : tibble dataframe
 75 |   #   text : unquoted (bare) name of the column in .tbl holding the text content
 76 |   # Returns:
 77 |   #   Dataframe with mean sentiment and counts of both total words
 78 |   #   and words that had a sentiment value; mean_sentiment is NaN when no word was scored
 79 | 
 80 |   # `text` is forwarded with {{ }} below, so no explicit enquo() is needed
 81 | 
 82 |   # number each row
 83 |   # use this to join text column back on later
 84 |   text_num <- .tbl %>%
 85 |     mutate(row_num = row_number())
 86 | 
 87 |   # tokenize the dataset (one row per token)
 88 |   tokens <- text_num %>%
 89 |     unnest_tokens(word, {{ text }}) %>%
 90 |     left_join(get_sentiments("afinn"), by = "word") %>%
 91 |     mutate(non_missing = case_when(!is.na(value) ~ 1, TRUE ~ 0)) # 1 if the token has a sentiment value, else 0
 92 | 
 93 |   # summarize the sentiment (value column contains sentiment of each token)
 94 |   summ <- tokens %>%
 95 |     group_by(row_num) %>%
 96 |     summarize(
 97 |       mean_sentiment = mean(value, na.rm = TRUE),
 98 |       num_words = n(),
 99 |       num_sentiment_words = sum(non_missing)
100 |     ) %>%
101 |     ungroup() %>%
102 |     left_join(text_num, by = "row_num") %>%
103 |     select(row_num, {{ text }}, everything())
104 | 
105 |   return(summ)
106 | }
107 | ```
108 | 
109 | Note that the tidytext approach doesn't handle negation in sentiment
110 | 
111 | ```{r}
112 | test_df <- tribble(
113 |   ~review,
114 |   "This is the worst restaurant I have ever eaten at. It's service is abysmal.",
115 |   "Wow, amazing food, great atmosphere. Will definitely be coming back.",
116 |   "The restaurant was okay. Not good or bad",
117 |   "The restaurant was okay. Not good or bad", # duplicate row
118 |   "The stock market crashed and it was a disaster",
119 |   "This place was not terrible.", # test negation
120 |   "Really wasn't the best meal.", # test negation
121 |   "Sunshine and rainbows. Everything is fantastic, couldn't be better."
122 | )
123 | 
124 | test_sentiment <- test_df %>%
125 |   mean_sentiment(review) %>%
126 |   arrange(desc(mean_sentiment))
127 | 
128 | # test <- test_df %>%
129 | #   unnest_tokens(word,review) %>%
130 | #   left_join(get_sentiments("afinn"), by='word')
131 | 
132 | kable(test_sentiment)
133 | ```
134 | 
135 | ## Sentimentr
136 | 
137 | https://github.com/trinker/sentimentr
138 | 
139 | An example of using the `sentimentr` package for sentiment analysis. This approach does handle negation.
140 | 
141 | ```{r}
142 | # Split entities into sentences, use 'element_id' column to aggregate back
143 | # to the original entities
144 | sentences_df <- get_sentences(test_df)
145 | 
146 | kable(sentences_df)
147 | # Sentiment by sentence
148 | sentiment_df <- sentiment(sentences_df)
149 | 
150 | # Aggregate sentiment to original entities
151 | sentiment_summ <- sentiment_by(sentences_df, by = "element_id") %>%
152 |   bind_cols(test_df %>% select(review)) %>%
153 |   select(element_id, review, everything()) %>%
154 |   arrange(desc(ave_sentiment))
155 | 
156 | kable(sentiment_summ)
157 | ```
158 | 


--------------------------------------------------------------------------------
/R/Survival.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Survival Models"
  3 | author: "Jesse Cambon"
  4 | date: "`r format(Sys.time(), '%d %B, %Y')`"
  5 | output:
  6 |   github_document:
  7 |     toc: true
  8 | ---
  9 | 
 10 | Survival Analysis 
 11 | 
 12 | * Kaplan-Meier Plots
 13 | * Log-rank test
 14 | * Cox Proportional Hazard Model
 15 | * Parametric survival models
 16 | * Bayesian Approaches
 17 | 
 18 | ```{r knit-settings, include=FALSE}
 19 | source(here::here("rmd_config.R"))
 20 | ```
 21 | 
 22 | Reference:
 23 | 
 24 | https://www.emilyzabor.com/tutorials/survival_analysis_in_r_tutorial.html
 25 | 
 26 | 
 27 | ```{r setup, message = F, warning = F}
 28 | library(survival)
 29 | library(survminer)
 30 | library(tidyverse)
 31 | library(broom)
 32 | library(broom.mixed)
 33 | library(brms)
 34 | library(bayesplot)
 35 | options(mc.cores = parallel::detectCores())
 36 | ```
 37 | 
 38 | 
 39 | # Kaplan-Meier 
 40 | 
 41 | `status`: censoring status (1 = censored, 2 = dead). TRUE/FALSE can also be used; see the
 42 | documentation for `event` in `?Surv`.
 43 | 
 44 | ```{r}
 45 | lung %>% count(status)
 46 | ```
 47 | 
 48 | 
 49 | ```{r}
 50 | ggsurvplot(
 51 |     fit = survfit(Surv(time, status) ~ sex, data = lung), 
 52 |     xlab = "Days", 
 53 |     ylab = "Survival Probability")$plot
 54 | ```
 55 | 
 56 | # Log-Rank Test
 57 | 
 58 | Test if there was a statistically significant difference in survival time between the groups
 59 | 
 60 | ```{r}
 61 | survdiff(Surv(time, status) ~ sex, 
 62 |          data = lung, 
 63 |          rho = 0 # log-rank, see ?survdiff
 64 |          )
 65 | ```
 66 | 
 67 | # Cox Proportional Hazard Model
 68 | 
 69 | * Multivariate "semi-parametric" regression approach
 70 | * Assumes the hazard can change over time but is proportional between groups at all points in time (i.e., the hazard ratio is constant over time).
 71 | 
 72 | 
 73 | ```{r}
 74 | cox_fit <- coxph(Surv(time, status) ~ sex + age + ph.ecog,
 75 |         data = lung)
 76 | 
 77 | # Exponentiate coefficients to get hazard ratios
 78 | cox_hr <- tidy(cox_fit, exponentiate = TRUE, conf.int = TRUE)
 79 | ```
 80 | 
 81 | Survival curve
 82 | 
 83 | ```{r}
 84 | ggsurvplot(survfit(cox_fit), data = lung, risk.table = TRUE)
 85 | ```
 86 | 
 87 | Plot Hazard Ratios
 88 | 
 89 | ```{r}
 90 | # Plot the hazard ratios in cox_hr with their confidence intervals.
 91 | # The no-effect reference for a hazard ratio is 1 (not 0), so only that line is drawn.
 92 | ggplot(data = cox_hr, aes(x = term, y = estimate)) +
 93 |   geom_hline(yintercept = 1, color = "grey") +
 94 |   geom_pointrange(mapping = aes(ymin = conf.low, ymax = conf.high)) +
 95 |   coord_flip() +
 96 |   theme_bw() +
 97 |   theme(plot.title = element_text(lineheight = 1, face = "bold", hjust = 0.5)) +
 98 |   xlab("Term") +
 99 |   ylab("HR")
100 | ```
101 | 
102 | ### Predictions 
103 | 
104 | ```{r}
105 | set.seed(104) # sample_n() has no `seed` argument, so seed the RNG explicitly for a reproducible draw
106 | sample_obs <- lung %>%
107 |   sample_n(2) %>%
108 |   mutate(id = 1:n()) %>%
109 |   select(id, status, everything())
110 | 
111 | cox_pred <- predict(cox_fit, newdata = sample_obs, type = 'expected')
112 | ```
113 | 
114 | 
115 | 
116 | ### Validation
117 | 
118 | Reference: 
119 | http://www.sthda.com/english/wiki/cox-model-assumptions 
120 | 
121 | 
122 | ```{r}
123 | concordance(cox_fit)
124 | ```
125 | 
126 | 
127 | Look at residuals
128 | 
129 | ```{r}
130 | ggcoxdiagnostics(cox_fit, type = "deviance", ox.scale = 'observation.id')
131 | ggcoxdiagnostics(cox_fit, type = "deviance", ox.scale = 'linear.predictions')
132 | ```
133 | 
134 | ```{r}
135 | ggcoxdiagnostics(cox_fit, type = 'dfbeta')
136 | ```
137 | 
138 | 
139 | 
140 | Test proportional hazards assumption
141 | 
142 | ```{r, fig.height = 7, fig.width = 5}
143 | zph_fit <- cox.zph(cox_fit) # proportional hazards check on the fitted Cox model
144 | ggcoxzph(zph_fit) # plot the cox.zph result
145 | ```
146 | 
147 | ## Parametric Survival Model
148 | 
149 | Accelerated Failure Time (AFT) models, a parametric alternative to Cox regression.
150 | 
151 | ```{r}
152 | aft_fit <- survreg(Surv(time, status) ~ sex + age + ph.ecog,
153 |         dist = 'weibull',
154 |         data = lung)
155 | 
156 | # NOTE(review): exponentiated AFT coefficients are usually read as time ratios, not hazard ratios; confirm tidy() honors `exponentiate` for survreg fits
157 | aft_hr <- tidy(aft_fit, exponentiate = TRUE, conf.int = TRUE)
158 | 
159 | aft_hr
160 | ```
161 | 
162 | 
163 | 
164 | 
165 | 
166 | ## Bayesian Survival Models
167 | 
168 | - http://paul-buerkner.github.io/brms/reference/kidney.html
169 | - https://mc-stan.org/rstanarm/reference/adapt_delta.html 
170 | 
171 | 
172 | ```{r}
173 | print('Default priors:')
174 | get_prior(time | cens(censored) ~ sex + disease + age + (1 | patient),
175 |             data = kidney, family = weibull()
176 |           )
177 | 
178 | print('Horseshoe priors:')
179 | 
180 | get_prior(time | cens(censored) ~ sex + disease + age + (1 | patient),
181 |             data = kidney, family = weibull(), 
182 |           prior = set_prior("horseshoe(3)", class = 'b') + 
183 |               set_prior("horseshoe(3)", class = 'Intercept') +
184 |               set_prior("horseshoe(3)", class = 'sd')
185 |           )
186 | ```
187 | 
188 | 
189 | ```{r, message = FALSE,  warning = F, error = F, results = 'hide'}
190 | # fit weibull model
191 | fit2 <- brm(time | cens(censored) ~ sex + disease + (1 | patient),
192 |             data = kidney, family = weibull(), 
193 |             prior = set_prior("horseshoe(3)"),
194 |             iter = 3000,
195 |             control = list(adapt_delta = 0.98))
196 | ```
197 | 
198 | 
199 | ```{r}
200 | summary(fit2)
201 | tidy(fit2)
202 | prior_summary(fit2)
203 | ```
204 | 
205 | ```{r, fig.height = 12, fig.width = 10}
206 | mcmc_trace(fit2)
207 | ```
208 | 
209 | 
210 | ```{r}
211 | pp_check(fit2)
212 | 
213 | pp_check(fit2, type = 'intervals')
214 | ```
215 | ```{r, fig.height =  8, fig.width = 4}
216 | mcmc_areas(fit2, regex_pars = c('^b_', '^r_')) # anchored: 'b_*' / 'r_*' match any name containing "b" / "r"
217 | ```
218 | 
219 | https://mc-stan.org/bayesplot/reference/PPC-censoring.html
220 | 
221 | ```{r}
222 | yrep <- posterior_predict(fit2)
223 | 
224 | loo(fit2)
225 | ```
226 | 
227 | 
228 | ```{r}
229 | hist(kidney$time)
230 | ```
231 | 
232 | 
233 | ```{r}
234 | # status_y must be 1 = event, 0 = right censored; kidney$censored is the cens() indicator (1 = censored), so invert it
235 | ppc_km_overlay(kidney$time, yrep, status_y = 1 - kidney$censored) + xlim(0, 200)
236 | ```
237 | 
238 | 


--------------------------------------------------------------------------------
/R/Time_Series_Modeling.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Time Series Modeling"
  3 | author: "Jesse Cambon"
  4 | date: "`r format(Sys.time(), '%d %B, %Y')`"
  5 | output:
  6 |   github_document:
  7 |     toc: true
  8 | ---
  9 | 
 10 | ```{r knit-settings, include=FALSE}
 11 | source(here::here("rmd_config.R"))
 12 | ```
 13 | 
 14 | ## References
 15 | * https://github.com/christophsax/tsbox
 16 | * https://github.com/tidyverts/tsibble
 17 | * http://pkg.robjhyndman.com/forecast/
 18 | * https://business-science.github.io/sweep/index.html
 19 | * https://cran.rstudio.com/web/packages/sweep/vignettes/SW01_Forecasting_Time_Series_Groups.html
 20 | * https://www.r-bloggers.com/climate-change-modeling-140-years-of-temperature-data-with-tsibble-and-fable/
 21 | * https://github.com/tidyverts/fable
 22 | * https://feasts.tidyverts.org/
 23 | 
 24 | 
 25 | ## Setup
 26 | 
 27 | ```{r setup}
 28 | library(nycflights13)
 29 | library(tsibbledata)
 30 | library(wesanderson) # color palettes
 31 | library(tidyverse)
 32 | library(tsibble)
 33 | # library(skimr)
 34 | # library(feasts) # Removed for now since it breaks the forecast package
 35 | library(lubridate)
 36 | library(forecast)
 37 | library(tsbox)
 38 | library(prophet)
 39 | 
 40 | 
 41 | # Set default ggplot theme
 42 | theme_set(theme_bw() +
 43 |   theme(
 44 |     legend.position = "top",
 45 |     plot.subtitle = element_text(face = "bold", hjust = 0.5),
 46 |     plot.title = element_text(lineheight = 1, face = "bold", hjust = 0.5)
 47 |   ))
 48 | ```
 49 | 
 50 | ## Importing Data
 51 | 
 52 | Import Data and Convert to Tsibble format
 53 | 
 54 | ```{r}
 55 | weather <- nycflights13::weather %>%
 56 |   select(origin, time_hour, temp, humid, precip)
 57 | 
 58 | weather_tsbl <- as_tsibble(weather, key = origin, index = time_hour)
 59 | ```
 60 | 
 61 | 
 62 | ## Forecasting with Prophet
 63 | 
 64 | https://facebook.github.io/prophet
 65 | 
 66 | ```{r}
 67 | # convert to format needed by prophet (needs specific column names)
 68 | weather_ts <- weather_tsbl %>%
 69 |   filter(origin == "EWR") %>%
 70 |   select(time_hour, temp) %>%
 71 |   rename(ds = time_hour, y = temp)
 72 | 
 73 | # create prophet model
 74 | m <- prophet(weather_ts, yearly.seasonality = TRUE)
 75 | 
 76 | future <- make_future_dataframe(m, periods = 400)
 77 | 
 78 | # Create forecast
 79 | prophet_forecast <- predict(m, future)
 80 | 
 81 | # Plot
 82 | plot(m, prophet_forecast)
 83 | 
 84 | prophet_plot_components(m, prophet_forecast)
 85 | ```
 86 | 
 87 | ## Data Cleaning
 88 | 
 89 | Fill Missing Gaps in Data
 90 | 
 91 | ```{r}
 92 | nrow(ansett)
 93 | 
 94 | # Fill gaps and filter
 95 | ansett_fill <- ansett %>%
 96 |   filter(Airports == "MEL-SYD") %>%
 97 |   fill_gaps(Passengers = 0)
 98 | 
 99 | nrow(ansett_fill)
100 | 
101 | # Aggregate all classes together , limit to 1990 onward
102 | ansett_summ <- ansett_fill %>%
103 |   group_by() %>%
104 |   summarize(Passengers = sum(Passengers, na.rm = TRUE)) %>%
105 |   filter_index("1990-01" ~ .) %>%
106 |   as_tsibble(index = Week)
107 | ```
108 | 
109 | ```{r}
110 | ggplot(
111 |   ansett_fill,
112 |   aes(x = Week, y = Passengers)
113 | ) +
114 |   geom_area(aes(fill = Class), alpha = 1.0) +
115 |   scale_fill_manual(values = wes_palette("Moonrise2")) +
116 |   scale_y_continuous(labels = scales::comma) +
117 |   labs(
118 |     title = "",
119 |     caption = ""
120 |   ) +
121 |   theme(
122 |     legend.title = element_blank(),
123 |     legend.position = "right"
124 |   )
125 | ```
126 | 
127 | Test forecast package
128 | 
129 | ```{r}
130 | USAccDeaths %>%
131 |   stl(s.window = "periodic") %>%
132 |   forecast() %>%
133 |   autoplot()
134 | 
135 | AirPassengers %>%
136 |   stlf(lambda = 0) %>%
137 |   autoplot()
138 | 
139 | # Have to convert this dataset to time series format with tsbox::ts_ts()
140 | ansett_summ %>%
141 |   ts_ts() %>%
142 |   stlf(lambda = 0) %>%
143 |   autoplot()
144 | ```
145 | 
146 | Feasts package unfortunately breaks the forecast package
147 | 


--------------------------------------------------------------------------------
/R/Time_Series_Modeling.md:
--------------------------------------------------------------------------------
  1 | Time Series Modeling
  2 | ================
  3 | Jesse Cambon
  4 | 24 November, 2019
  5 | 
  6 | ## References
  7 | 
  8 |   - <https://github.com/christophsax/tsbox>
  9 |   - <https://github.com/tidyverts/tsibble>
 10 |   - <http://pkg.robjhyndman.com/forecast/>
 11 |   - <https://business-science.github.io/sweep/index.html>
 12 |   - <https://cran.rstudio.com/web/packages/sweep/vignettes/SW01_Forecasting_Time_Series_Groups.html>
 13 |   - <https://www.r-bloggers.com/climate-change-modeling-140-years-of-temperature-data-with-tsibble-and-fable/>
 14 |   - <https://github.com/tidyverts/fable>
 15 |   - <https://feasts.tidyverts.org/>
 16 | 
 17 | ## Setup
 18 | 
 19 | ``` r
 20 | library(nycflights13) 
 21 | library(tsibbledata)
 22 | library(wesanderson) # color palettes
 23 | library(tidyverse)
 24 | ```
 25 | 
 26 |     ## ── Attaching packages ───────────────────────────── tidyverse 1.3.0 ──
 27 | 
 28 |     ## ✔ ggplot2 3.2.1     ✔ purrr   0.3.3
 29 |     ## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
 30 |     ## ✔ tidyr   1.0.0     ✔ forcats 0.4.0
 31 |     ## ✔ readr   1.3.1
 32 | 
 33 |     ## ── Conflicts ──────────────────────────────── tidyverse_conflicts() ──
 34 |     ## ✖ dplyr::filter() masks stats::filter()
 35 |     ## ✖ dplyr::lag()    masks stats::lag()
 36 | 
 37 | ``` r
 38 | library(tsibble)
 39 | ```
 40 | 
 41 |     ## 
 42 |     ## Attaching package: 'tsibble'
 43 | 
 44 |     ## The following object is masked from 'package:dplyr':
 45 |     ## 
 46 |     ##     id
 47 | 
 48 | ``` r
 49 | #library(skimr)
 50 | #library(feasts) # Removed for now since it breaks the forecast package
 51 | library(lubridate)
 52 | ```
 53 | 
 54 |     ## 
 55 |     ## Attaching package: 'lubridate'
 56 | 
 57 |     ## The following objects are masked from 'package:tsibble':
 58 |     ## 
 59 |     ##     interval, new_interval
 60 | 
 61 |     ## The following object is masked from 'package:here':
 62 |     ## 
 63 |     ##     here
 64 | 
 65 |     ## The following object is masked from 'package:base':
 66 |     ## 
 67 |     ##     date
 68 | 
 69 | ``` r
 70 | library(forecast)
 71 | ```
 72 | 
 73 |     ## Registered S3 method overwritten by 'xts':
 74 |     ##   method     from
 75 |     ##   as.zoo.xts zoo
 76 | 
 77 |     ## Registered S3 method overwritten by 'quantmod':
 78 |     ##   method            from
 79 |     ##   as.zoo.data.frame zoo
 80 | 
 81 |     ## Registered S3 methods overwritten by 'forecast':
 82 |     ##   method             from    
 83 |     ##   fitted.fracdiff    fracdiff
 84 |     ##   residuals.fracdiff fracdiff
 85 | 
 86 | ``` r
 87 | library(tsbox)
 88 | library(prophet)
 89 | ```
 90 | 
 91 |     ## Loading required package: Rcpp
 92 | 
 93 |     ## Loading required package: rlang
 94 | 
 95 |     ## 
 96 |     ## Attaching package: 'rlang'
 97 | 
 98 |     ## The following objects are masked from 'package:purrr':
 99 |     ## 
100 |     ##     %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
101 |     ##     flatten_lgl, flatten_raw, invoke, list_along, modify, prepend,
102 |     ##     splice
103 | 
104 | ``` r
105 | # Set default ggplot theme
106 | theme_set(theme_bw() +
107 |   theme(legend.position = "top",
108 |             plot.subtitle= element_text(face="bold",hjust=0.5),
109 |             plot.title = element_text(lineheight=1, face="bold",hjust = 0.5)))
110 | ```
111 | 
112 | \#Importing Data Import Data and Convert to Tsibble format
113 | 
114 | ``` r
115 | weather <- nycflights13::weather %>% 
116 |   select(origin, time_hour, temp, humid, precip)
117 | 
118 | weather_tsbl <- as_tsibble(weather, key = origin, index = time_hour)
119 | ```
120 | 
121 | # Forecasting with Prophet
122 | 
123 | <https://facebook.github.io/prophet>
124 | 
125 | ``` r
126 | # convert to format needed by prophet (needs specific column names)
127 | weather_ts <- weather_tsbl %>% filter(origin == 'EWR') %>% select(time_hour,temp) %>% 
128 |   rename(ds=time_hour,y=temp)
129 | 
130 | # create prophet model
131 | m <- prophet(weather_ts,yearly.seasonality=TRUE)
132 | 
133 | future <- make_future_dataframe(m, periods = 400)
134 | 
135 | # Create forecast
136 | prophet_forecast <- predict(m, future)
137 | 
138 | # Plot
139 | plot(m, prophet_forecast)
140 | ```
141 | 
142 | ![](../rmd_images/Time_Series_Modeling/unnamed-chunk-2-1.png)<!-- -->
143 | 
144 | ``` r
145 | prophet_plot_components(m, prophet_forecast)
146 | ```
147 | 
148 | ![](../rmd_images/Time_Series_Modeling/unnamed-chunk-2-2.png)<!-- -->
149 | 
150 | ## Data Cleaning
151 | 
152 | Fill Missing Gaps in Data
153 | 
154 | ``` r
155 | nrow(ansett)
156 | ```
157 | 
158 |     ## [1] 7407
159 | 
160 | ``` r
161 | # Fill gaps and filter
162 | ansett_fill <- ansett %>%
163 |   filter(Airports == 'MEL-SYD') %>%
164 |   fill_gaps(Passengers = 0)
165 | 
166 | nrow(ansett_fill)
167 | ```
168 | 
169 |     ## [1] 742
170 | 
171 | ``` r
172 | # Aggregate all classes together , limit to 1990 onward
173 | ansett_summ <- ansett_fill %>% group_by %>%
174 |   summarize(Passengers=sum(Passengers,na.rm=TRUE)) %>%
175 |   filter_index("1990-01" ~ .) %>% as_tsibble(index = Week)
176 | ```
177 | 
178 | ``` r
179 | ggplot(ansett_fill,
180 |           aes(x=Week,y=Passengers)) +
181 |   geom_area(aes(fill = Class), alpha = 1.0) +
182 | scale_fill_manual(values=wes_palette('Moonrise2')) +
183 | scale_y_continuous(labels=scales::comma) +
184 | labs(title='',
185 |      caption='') +
186 | theme(legend.title = element_blank(),
187 |       legend.position='right') 
188 | ```
189 | 
190 | ![](../rmd_images/Time_Series_Modeling/unnamed-chunk-4-1.png)<!-- -->
191 | 
192 | Test forecast package
193 | 
194 | ``` r
195 | USAccDeaths %>% 
196 |   stl(s.window='periodic') %>%
197 |   forecast() %>%
198 |   autoplot()
199 | ```
200 | 
201 | ![](../rmd_images/Time_Series_Modeling/unnamed-chunk-5-1.png)<!-- -->
202 | 
203 | ``` r
204 | AirPassengers %>%
205 |   stlf(lambda=0) %>%
206 |   autoplot()
207 | ```
208 | 
209 | ![](../rmd_images/Time_Series_Modeling/unnamed-chunk-5-2.png)<!-- -->
210 | 
211 | ``` r
212 | # Have to convert this dataset to time series format with tsbox::ts_ts()
213 | ansett_summ %>% ts_ts(.) %>%
214 |   stlf(lambda=0) %>%
215 |   autoplot()
216 | ```
217 | 
218 |     ## [time]: 'Week' [value]: 'Passengers'
219 | 
220 | ![](../rmd_images/Time_Series_Modeling/unnamed-chunk-5-3.png)<!-- -->
221 | 
222 | Feasts package unfortunately breaks the forecast package
223 | 


--------------------------------------------------------------------------------
/R/Titanic_files/figure-gfm/explore-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/R/Titanic_files/figure-gfm/explore-1.png


--------------------------------------------------------------------------------
/R/Titanic_files/figure-gfm/explore-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/R/Titanic_files/figure-gfm/explore-2.png


--------------------------------------------------------------------------------
/R/Titanic_files/figure-gfm/explore-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/R/Titanic_files/figure-gfm/explore-3.png


--------------------------------------------------------------------------------
/R/Titanic_files/figure-gfm/imputation-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/R/Titanic_files/figure-gfm/imputation-1.png


--------------------------------------------------------------------------------
/R/Titanic_files/figure-gfm/imputation-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/R/Titanic_files/figure-gfm/imputation-2.png


--------------------------------------------------------------------------------
/R/Titanic_files/figure-gfm/linear-regression-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/R/Titanic_files/figure-gfm/linear-regression-1.png


--------------------------------------------------------------------------------
/R/Titanic_files/figure-gfm/linear-regression-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/R/Titanic_files/figure-gfm/linear-regression-2.png


--------------------------------------------------------------------------------
/R/Titanic_files/figure-gfm/linear-regression-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/R/Titanic_files/figure-gfm/linear-regression-3.png


--------------------------------------------------------------------------------
/R/Titanic_files/figure-gfm/linear-regression-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/R/Titanic_files/figure-gfm/linear-regression-4.png


--------------------------------------------------------------------------------
/R/Titanic_files/figure-gfm/linear-regression-5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/R/Titanic_files/figure-gfm/linear-regression-5.png


--------------------------------------------------------------------------------
/R/Titanic_files/figure-gfm/logistic-regression-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/R/Titanic_files/figure-gfm/logistic-regression-1.png


--------------------------------------------------------------------------------
/R/Titanic_files/figure-gfm/logistic-regression-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/R/Titanic_files/figure-gfm/logistic-regression-2.png


--------------------------------------------------------------------------------
/R/Titanic_files/figure-gfm/logistic-regression-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/R/Titanic_files/figure-gfm/logistic-regression-3.png


--------------------------------------------------------------------------------
/R/Titanic_files/figure-gfm/logistic-regression-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/R/Titanic_files/figure-gfm/logistic-regression-4.png


--------------------------------------------------------------------------------
/R/gapminder_summary_report.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/R/gapminder_summary_report.xlsx


--------------------------------------------------------------------------------
/R/hypothesis_testing.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Hypothesis Testing"
 3 | author: "Jesse Cambon"
 4 | date: "`r format(Sys.time(), '%d %B, %Y')`"
 5 | output:
 6 |   github_document:
 7 |     toc: true
 8 | ---
 9 | 
10 | ```{r knit-settings, include=FALSE}
11 | library(here)
12 | source(here("rmd_config.R"))
13 | ```
14 | References:
15 | 
16 | * https://www.tidymodels.org/learn/statistics/xtabs/
17 | 
18 | ```{r,message=F,warning=F}
19 | library(tidymodels) # Includes the infer package
20 | library(knitr)
21 | 
22 | # Set ggplot theme
23 | theme_set(theme_minimal())
24 | 
25 | 
26 | data(ad_data, package = "modeldata")
27 | 
28 | ```
29 | 
30 | ```{r}
31 | ad_data %>%
32 |   count(Genotype, Class,sort=T) %>% head(5) %>% kable()
33 | ```
34 | 
35 | Chi-Squared Test of Independence
36 | 
37 | ```{r}
38 | ad_data %>% 
39 |   chisq_test(Genotype ~ Class) %>%
40 |   kable()
41 | ```
42 | 
43 | 
44 | ```{r}
45 | observed_indep_statistic <- ad_data %>%
46 |   specify(Genotype ~ Class) %>%
47 |   calculate(stat = "Chisq")
48 | 
49 | 
50 | # generate the null distribution using randomization
51 | null_distribution_simulated <- ad_data %>%
52 |   specify(Genotype ~ Class) %>%
53 |   hypothesize(null = "independence") %>%
54 |   generate(reps = 500, type = "permute") %>%
55 |   calculate(stat = "Chisq")
56 | ```
57 | 
58 | ```{r}
59 | null_distribution_simulated %>%
60 |   visualize() + 
61 |   shade_p_value(observed_indep_statistic,
62 |                 direction = "greater") + theme_minimal()
63 | ```
64 | 
65 | ```{r}
66 | ad_data %>%
67 |   specify(Genotype ~ Class) %>%
68 |   hypothesize(null = "independence") %>%
69 |   visualize(method = "theoretical") + 
70 |   shade_p_value(observed_indep_statistic,
71 |                 direction = "greater")
72 | ```
73 | 
74 | 
75 | ```{r}
76 | null_distribution_simulated %>%
77 |   visualize(method = "both") + 
78 |   shade_p_value(observed_indep_statistic,
79 |                 direction = "greater")
80 | ```
81 | 
82 | 


--------------------------------------------------------------------------------
/R/hypothesis_testing.md:
--------------------------------------------------------------------------------
 1 | Hypothesis Testing
 2 | ================
 3 | Jesse Cambon
 4 | 02 February, 2021
 5 | 
 6 | References: \* <https://www.tidymodels.org/learn/statistics/xtabs/>
 7 | 
 8 | ``` r
 9 | library(tidymodels) # Includes the infer package
10 | library(knitr)
11 | 
12 | # Set ggplot theme
13 | theme_set(theme_minimal())
14 | 
15 | 
16 | data(ad_data, package = "modeldata")
17 | ```
18 | 
19 | ``` r
20 | ad_data %>%
21 |   count(Genotype, Class,sort=T) %>% head(5) %>% kable()
22 | ```
23 | 
24 | | Genotype | Class    |   n |
25 | |:---------|:---------|----:|
26 | | E3E3     | Control  | 133 |
27 | | E3E4     | Control  |  65 |
28 | | E3E4     | Impaired |  41 |
29 | | E3E3     | Impaired |  34 |
30 | | E2E3     | Control  |  30 |
31 | 
32 | Chi Squared Test of Independences
33 | 
34 | ``` r
35 | ad_data %>% 
36 |   chisq_test(Genotype ~ Class) %>%
37 |   kable()
38 | ```
39 | 
40 |     ## Warning in stats::chisq.test(table(x), ...): Chi-squared approximation may be
41 |     ## incorrect
42 | 
43 | | statistic | chisq\_df |  p\_value |
44 | |----------:|----------:|----------:|
45 | |  21.57748 |         5 | 0.0006298 |
46 | 
47 | ``` r
48 | observed_indep_statistic <- ad_data %>%
49 |   specify(Genotype ~ Class) %>%
50 |   calculate(stat = "Chisq")
51 | 
52 | 
53 | # generate the null distribution using randomization
54 | null_distribution_simulated <- ad_data %>%
55 |   specify(Genotype ~ Class) %>%
56 |   hypothesize(null = "independence") %>%
57 |   generate(reps = 500, type = "permute") %>%
58 |   calculate(stat = "Chisq")
59 | ```
60 | 
61 | ``` r
62 | null_distribution_simulated %>%
63 |   visualize() + 
64 |   shade_p_value(observed_indep_statistic,
65 |                 direction = "greater") + theme_minimal()
66 | ```
67 | 
68 | ![](../rmd_images/hypothesis_testing/unnamed-chunk-5-1.png)<!-- -->
69 | 
70 | ``` r
71 | ad_data %>%
72 |   specify(Genotype ~ Class) %>%
73 |   hypothesize(null = "independence") %>%
74 |   visualize(method = "theoretical") + 
75 |   shade_p_value(observed_indep_statistic,
76 |                 direction = "greater")
77 | ```
78 | 
79 |     ## Warning: Check to make sure the conditions have been met for the theoretical
80 |     ## method. {infer} currently does not check these for you.
81 | 
82 | ![](../rmd_images/hypothesis_testing/unnamed-chunk-6-1.png)<!-- -->
83 | 
84 | ``` r
85 | null_distribution_simulated %>%
86 |   visualize(method = "both") + 
87 |   shade_p_value(observed_indep_statistic,
88 |                 direction = "greater")
89 | ```
90 | 
91 |     ## Warning: Check to make sure the conditions have been met for the theoretical
92 |     ## method. {infer} currently does not check these for you.
93 | 
94 | ![](../rmd_images/hypothesis_testing/unnamed-chunk-7-1.png)<!-- -->
95 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # The Data Science Codex
 2 | 
 3 | A collection of code and resources to serve as a starting point for data science projects. For more explanation and material on R visit [my blog](https://jessecambon.github.io/). 
 4 | 
 5 | <span>
 6 | <a href = "https://github.com/jessecambon/Data-Science-Cookbook/blob/master/R/Visualization_Cookbook.md#lollipop"><img src="https://github.com/jessecambon/Data-Science-Cookbook/blob/master/rmd_images/Visualization_Cookbook/lollipop-1.png" height="150px"/></a>
 7 | <a href = "https://github.com/jessecambon/Data-Science-Cookbook/blob/master/R/Visualization_Cookbook.md#bubbleplot"><img src="https://github.com/jessecambon/Data-Science-Cookbook/blob/master/rmd_images/Visualization_Cookbook/bubbleplot-1.png" height="150px"/></a>
 8 | <a href = "https://github.com/jessecambon/Data-Science-Cookbook/blob/master/R/Geospatial_Analysis.md"><img src="https://github.com/jessecambon/Data-Science-Codex/blob/master/rmd_images/Geospatial_Analysis/unnamed-chunk-2-1.png" height="150px"/></a> 
 9 | <a href = "https://github.com/jessecambon/Data-Science-Cookbook/blob/master/R/Visualization_Cookbook.md#ridgeplot"><img src="https://raw.githubusercontent.com/jessecambon/Data-Science-Codex/master/rmd_images/Visualization_Cookbook/ridge-1.png" height="150px"/></a> 
10 | <a href = "https://github.com/jessecambon/Data-Science-Cookbook/blob/master/R/Titanic.md#logistic-regression-model"><img src="https://github.com/jessecambon/Data-Science-Codex/blob/master/rmd_images/Titanic/logistic-regression-2.png" height="150px"/></a> 
11 | <a href = "https://github.com/jessecambon/Data-Science-Cookbook/blob/master/R/Titanic.md#logistic-regression-model"><img src="https://github.com/jessecambon/Data-Science-Cookbook/blob/master/rmd_images/Titanic/logistic-regression-1.png" height="150px"/></a> 
12 | </span>
13 | 
14 | ## Notes 
15 | * [Resources](Resources.md) - Websites and references that I find helpful for data science projects
16 | * [Developing With R](R-Development.md) - Notes on R package development
17 | * [How to Git](Git-HowTo.md) - version control with git
18 | * [How to Anaconda](Anaconda-HowTo.md) - managing environments with Anaconda
19 | 
20 | ## Data Visualization
21 | * [Visualization Cookbook](R/Visualization_Cookbook.md) (R) - A wide variety of data visualizations demonstrated.
22 | * [Geospatial Data Analysis](R/Geospatial_Analysis.md) (R) - Making maps with R.
23 | 
24 | ## Statistical Modeling and Machine Learning
25 | * [Modeling Fundamentals](R/Titanic.md) (R) - A primer on logistic and linear regression modeling with the classic Titanic dataset.
26 | * [Survival Analysis](R/Survival.md) (R) - Survival analysis methods such as Cox proportional hazards models and Kaplan-Meier curves.
27 | * [Modeling Workflows](R/Modeling_Workflow.md) (R) - Streamlined Tidyverse modeling workflows with the gapminder dataset.
28 | * [Multilevel Models](R/Multilevel-Models.md) (R) - Multilevel (a.k.a. mixed-effects) models
29 | * [Time Series Modeling](R/Time_Series_Modeling.md) (R) - Experimenting with time series modeling (tsibble, forecast libraries, prophet, etc.)
30 | * [Ordinal Regression](R/Ordinal_Regression.md) (R) - Experimenting with ordinal (ranked categorical outcome) regression
31 | * [Presenting Regression Models](R/Regression-Model-Tidying.md) (R) - Code for cleaning the outputs of regression models for presentations.
32 | * [Sklearn Modeling Workflows](Python/Sklearn-Workflow.ipynb) (Python) - Modeling workflows with sklearn (cross-validation, randomized search for optimizing hyperparameters, lift curves).
33 | * [Sklearn - Skopt Workflow](Python/sklearn_skopt_pipeline.ipynb) (Python) - Modeling workflow with sklearn and scikit-optimize (Bayesian hyperparameter optimization).
34 | * [Machine Learning with Caret](R/Caret.md) (R) - Using the Caret library for machine learning.
35 | * [Parsnip](R/Parsnip.md) (R) - fitting models with the parsnip package (from tidymodels)
36 | 
37 | ## Bayesian Models
38 | * [Bayesian Basics](R/Bayesian_Basics.md) (R) - exploring a simple Bayesian multilevel model
39 | * [Bayesian Modeling](R/Bayesian_Modeling.md) (R) - Experimenting with Bayesian models using rstanarm
40 | * [Comparing Bayesian Packages](R/Comparing_Bayesian_Packages.md) (R) - Comparing rstanarm, brms, and rstan.
41 | 
42 | ##  Clustering 
43 | * [k-means clustering](R/Clustering.md) (R) - Using the k-means algorithm to cluster data.
44 | * [Clustering](Python/Clustering.ipynb) (Python) - Agglomerative (Hierarchical) clustering, k-means clustering, and Gaussian mixture models
45 | 
46 | ## Stats Analysis
47 | * [Power Analysis](R/Power_Analysis.md) (R) - Statistical power analysis
48 | * [Distribution Sampling and Hypothesis Testing](R/Distribution_Sampling.md) (R)
49 | * [Hypothesis Testing](R/hypothesis_testing.md) (R)
50 | 
51 | ## NLP 
52 | * [Document Embeddings](Python/state_of_union_embeddings.ipynb) (Python) - Using word embeddings to compare the similarity of State of the Union addresses.
53 | * [State of the Union Analysis](Python/state_of_union_v2.ipynb) (Python) - An exploration of state of the union addresses with topic modeling and sentiment analysis. 
54 | * [Sentiment Analysis](R/Sentiment_Analysis.md) (R) - Exploring sentiment analysis in R.
55 | * [LSTM Demo](Python/LSTM-Demo.ipynb) (Python) - An LSTM network for predicting if a company review from glassdoor is positive
56 | 
57 | ## Miscellaneous
58 | * [R-Quickstart](R/R-Quickstart.md) (R) - Minimal data analysis and visualization workflows. See [the blog post "Data Science Essentials"](https://jessecambon.github.io/2020/01/12/data-science-essentials.html) for more details and explanation.
59 | * [Creating Formatted Spreadsheets](R/Create_Formatted_Spreadsheet.md) (R) - How to create a custom formatted spreadsheet report with the openxlsx R package.
60 | * [Using Python and R Together](Python/R-Python-Hybrid.ipynb) - How to use python and R code together in the same Jupyter notebook with the rpy2 python package.
61 | * [R Quotation](R/R_Quotation.md) (R) - If you want to do certain things such as pass variable names as arguments to a function in R, you have to use quotation methods like `quo()` and `enquo()`. This notebook demonstrates how to do this. See [my blog post on Tidy Evaluation](https://jessecambon.github.io/2019/12/08/practical-tidy-evaluation.html) for more details and explanation.
62 | * [SQL Databases](Python/SQL_Databases.ipynb) (Python) - Code for creating and manipulating a SQL database.
63 | 


--------------------------------------------------------------------------------
/Resources.md:
--------------------------------------------------------------------------------
 1 | A categorized list of data science resources. 
 2 | 
 3 | ## General
 4 | * [RStudio Cheatsheets](https://www.rstudio.com/resources/cheatsheets/) 
 5 | * [Bookdown Books](https://bookdown.org/) - A great collection of free R books.
 6 | * [R For Data Science](http://r4ds.had.co.nz/index.html) - Classic text by Hadley Wickham, chief overlord of all things Tidyverse.
 7 | * [R-Cookbook](http://www.cookbook-r.com) - Categorized useful R code. 
 8 | * [R for Public Policy](http://www.lecy.info/r-for-public-policy/) - List of resources.
 9 | * [R-Bloggers](https://www.r-bloggers.com) - Great resource for the latest developments in the R community. Subscribe to their emails.
10 | * [Tidyverse](https://www.tidyverse.org/index.html) - A well documented ecosystem of packages for elegant data manipulation and visualization. 
11 | * [Data Science with Python](https://jakevdp.github.io/PythonDataScienceHandbook/) - Book by Jake VanderPlas
12 | * [Pandas Cookbook](http://pandas.pydata.org/pandas-docs/stable/user_guide/cookbook.html) - Code for data manipulation in python
13 | 
14 | ## Data
15 | * [An Incredibly Comprehensive List of APIs for R](https://github.com/ropensci/opendata/blob/master/README.md)
16 | * [Google Dataset Search](https://toolbox.google.com/datasetsearch)
17 | * [KDNuggets](https://www.kdnuggets.com/datasets/index.html)
18 | * [Microsoft's List of R Data Sources](https://mran.microsoft.com/documents/data)
19 | * [Kaggle](https://www.kaggle.com/datasets) - Datasets used for Kaggle competitions.
20 | * [UCI Machine Learning Repository](http://archive.ics.uci.edu/ml)
21 | * [R Datasets](http://stat.ethz.ch/R-manual/R-devel/library/datasets/html/00Index.html) - Documentation on the inbuilt datasets in R.
22 | * [Data.gov](https://www.data.gov/) - A good place to start for government data.
23 | * [R Packages for Importing Data](https://www.computerworld.com/article/3109890/data-analytics/these-r-packages-import-sports-weather-stock-data-and-more.html)
24 | 
25 | ## Visualization Cookbooks
26 | * [Data-To-Viz](https://www.data-to-viz.com) - A comprehensive data viz reference with lots of great code.
27 | * [BBC R Cookbook](https://bbc.github.io/rcookbook) - R code to create plots in the style of the BBC.
28 | * [Top 50 Ggplot Visualizations](http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html)
29 | * [My R Cookbook](https://github.com/jessecambon/Data-Science-Codex/blob/master/source/Chart_Collection.md) - My hodge-podge collection of visualizations with R code.
30 | * [D3 Blocks](https://bl.ocks.org/)
31 | * [D3.JS Gallery](https://github.com/d3/d3/wiki/Gallery)
32 | * [R-Shiny Gallery](https://shiny.rstudio.com/gallery/)
33 | * [Seaborn Gallery](https://seaborn.pydata.org/examples/index.html)
34 | * [Matplotlib Gallery](https://matplotlib.org/gallery.html)
35 | 
36 | ## Visualization References
37 | * [Dataviz Project](http://datavizproject.com/) - Categorized data visualizations.
38 | * [Visual Capitalist](http://www.visualcapitalist.com/)
39 | * [Flowing Data](http://flowingdata.com/) - Good newsletter and site on data viz techniques.
40 | * [Stephen Few](http://www.perceptualedge.com/) - A data visualization expert.
41 | * [Edward Tufte](https://www.edwardtufte.com/tufte/) - Another data viz expert.
42 | 
43 | ## Visualization Libraries
44 | * [ggplot](https://ggplot2.tidyverse.org/index.html)
45 | * [cowplot](https://cran.r-project.org/web/packages/cowplot/vignettes/introduction.html) - Tweaks to ggplot for publication charts.
46 | * [plotly](https://plot.ly/r/) - Good tool for interactive graphs.
47 | * [dygraphs](https://rstudio.github.io/dygraphs/)
48 | * [rbokeh](http://hafen.github.io/rbokeh/)
49 | * [C3](https://github.com/mrjoh3/c3)
50 | * [rCharts](https://github.com/ramnathv/rCharts)
51 | * [tmap](https://github.com/mtennekes/tmap) - Geospatial mapping.
52 | * [ggalluvial](https://github.com/corybrunson/ggalluvial) - Flow diagrams
53 | 
54 | ## Statistics
55 | * [Cross Validated](https://stats.stackexchange.com/) - StackExchange for Statistics
56 | * [Intro to Statistical Learning Textbook](http://www-bcf.usc.edu/~gareth/ISL/) - Freely available statistics textbook that is tailored to application with lots of code examples in R. This is a reworked version of the classic 'Elements of Statistical Learning' text, which is heavier on statistical theory.
57 | * [Elements of Statistical Learning](https://web.stanford.edu/~hastie/ElemStatLearn/) - Freely available classic Statistics textbook.
58 | * [Biostat Handbook](http://www.biostathandbook.com/) - A great concise and accessible reference for statistical methods which have relevance in a wide variety of fields (not just Biostatistics).
59 | * [Frank Harrell](http://www.fharrell.com/) - Head of the Biostats department @ Vanderbilt and author of several R packages (Hmisc and rms). Good resource on regression modeling.
60 | * [Survival analysis tutorial](http://rpubs.com/sinhrks/plot_surv)
61 | * [A very comprehensive article on regression](https://www.r-bloggers.com/15-types-of-regression-you-should-know/)
62 | * [Tidyverse style survey package](https://cran.r-project.org/web/packages/srvyr/vignettes/srvyr-vs-survey.html)
63 | * [Course on Generalized Linear Models](http://data.princeton.edu/wws509/notes/#) - Princeton course on GLMs including logistic and poisson regression.
64 | 
65 | ## Code Reference
66 | * [Code for NLP Models](https://github.com/bicepjai/Deep-Survey-Text-Classification) - Code implementations for 14 NLP text classification papers
67 | * [Papers With Code (Github)](https://github.com/zziz/pwc)
68 | * [Papers With Code (Site)](https://paperswithcode.com/)
69 | 
70 | ## Maps
71 | * [Sample maps](https://bhaskarvk.github.io/user2017.geodataviz/notebooks/02-Static-Maps.nb.html)
72 | * [Making Maps with R](http://eriqande.github.io/rep-res-web/lectures/making-maps-with-R.html)
73 | * [Wind Maps](http://www.hilltop-analytics.com/2018/08/football-wind-maps/)
74 | 
75 | ## Miscellaneous Packages
76 | * [TidyText](https://github.com/juliasilge/tidytext)
77 | * [Broom](https://github.com/tidymodels/broom) - Useful functions for formatting the output of statistical models. 
78 | 
79 | ## Creating Deliverables
80 | * [R Markdown](https://rmarkdown.rstudio.com/)
81 | * [R Markdown - The Definitive Guide](https://bookdown.org/yihui/rmarkdown/) - Free book on RMarkdown.
82 | * [Tables in R Markdown](https://haozhu233.github.io/kableExtra/awesome_table_in_html.html)
83 | * [RMarkdown Cheatsheet](https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet)
84 | * [officer](https://davidgohel.github.io/officer/index.html) - Creating Microsoft Office deliverables in R.
85 | 
86 | ## Colors
87 | * [R-Cookbook Color Tutorial](http://www.cookbook-r.com/Graphs/Colors_(ggplot2)/) - Includes a good color blind palette
88 | * [Datawrapper - Colorguide](https://blog.datawrapper.de/colorguide/)
89 | * [Color Brewer](http://colorbrewer2.org)
90 | * [Viz Palette](http://projects.susielu.com/viz-palette?colors=%5B%22#1DABE6%22,%22#1C366A%22,%22#C3CED0%22,%22#E43034%22,%22#FC4E51%22,%22#AF060F%22%5D&backgroundColor=%22white%22&fontColor=%22black%22)
91 | * [Viridis](https://cran.r-project.org/web/packages/viridis/vignettes/intro-to-viridis.html#gallery)
92 | 
93 | ## Census Data
94 | * [IPUMS](https://usa.ipums.org/) - Documented Census Microdata
95 | * [ipumsr](https://cran.r-project.org/web/packages/ipumsr/vignettes/ipums.html) – for loading and manipulating IPUMS data in R
96 | * [Tidycensus](https://walkerke.github.io/tidycensus/) - Great package for Census data analysis, particularly for geospatial analysis.
97 | * [Tidycensus Tutorial](https://www.mytinyshinys.com/2017/06/30/tidycensus/)
98 | * [CensusReporter](https://censusreporter.org/) - Tools for exploring Census data.
99 | 


--------------------------------------------------------------------------------
/rmd_config.R:
--------------------------------------------------------------------------------
 1 | # knitr settings shared by the project's R Markdown files.
 2 | # Run this file via source() in every R Markdown file.
 3 | library(knitr)
 4 | library(stringr)
 5 | # Name of the file being knitted with only a trailing .Rmd/.rmd extension removed
 6 | rmd_filename <- str_remove(knitr::current_input(), "\\.[Rr]md$")
 7 | 
 8 | # Figure path on disk = base.dir + fig.path
 9 | # Figure URL online = base.url + fig.path
10 | knitr::opts_knit$set(base.dir = str_c(here::here(), "/"), base.url = "../") # project root folder
11 | knitr::opts_chunk$set(fig.path = str_c("rmd_images/", rmd_filename, "/"), echo = TRUE)
12 | 


--------------------------------------------------------------------------------
/rmd_images/Bayes/unnamed-chunk-5-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayes/unnamed-chunk-5-1.png


--------------------------------------------------------------------------------
/rmd_images/Bayesian_Basics/unnamed-chunk-10-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Basics/unnamed-chunk-10-1.png


--------------------------------------------------------------------------------
/rmd_images/Bayesian_Basics/unnamed-chunk-3-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Basics/unnamed-chunk-3-1.png


--------------------------------------------------------------------------------
/rmd_images/Bayesian_Basics/unnamed-chunk-4-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Basics/unnamed-chunk-4-1.png


--------------------------------------------------------------------------------
/rmd_images/Bayesian_Basics/unnamed-chunk-4-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Basics/unnamed-chunk-4-2.png


--------------------------------------------------------------------------------
/rmd_images/Bayesian_Basics/unnamed-chunk-5-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Basics/unnamed-chunk-5-1.png


--------------------------------------------------------------------------------
/rmd_images/Bayesian_Basics/unnamed-chunk-6-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Basics/unnamed-chunk-6-1.png


--------------------------------------------------------------------------------
/rmd_images/Bayesian_Basics/unnamed-chunk-7-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Basics/unnamed-chunk-7-1.png


--------------------------------------------------------------------------------
/rmd_images/Bayesian_Basics/unnamed-chunk-7-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Basics/unnamed-chunk-7-2.png


--------------------------------------------------------------------------------
/rmd_images/Bayesian_Basics/unnamed-chunk-7-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Basics/unnamed-chunk-7-3.png


--------------------------------------------------------------------------------
/rmd_images/Bayesian_Basics/unnamed-chunk-8-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Basics/unnamed-chunk-8-1.png


--------------------------------------------------------------------------------
/rmd_images/Bayesian_Basics/unnamed-chunk-9-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Basics/unnamed-chunk-9-1.png


--------------------------------------------------------------------------------
/rmd_images/Bayesian_Distributions/unnamed-chunk-3-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Distributions/unnamed-chunk-3-1.png


--------------------------------------------------------------------------------
/rmd_images/Bayesian_Distributions/unnamed-chunk-4-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Distributions/unnamed-chunk-4-1.png


--------------------------------------------------------------------------------
/rmd_images/Bayesian_Distributions/unnamed-chunk-5-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Distributions/unnamed-chunk-5-1.png


--------------------------------------------------------------------------------
/rmd_images/Bayesian_Modeling/unnamed-chunk-12-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Modeling/unnamed-chunk-12-1.png


--------------------------------------------------------------------------------
/rmd_images/Bayesian_Modeling/unnamed-chunk-12-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Modeling/unnamed-chunk-12-2.png


--------------------------------------------------------------------------------
/rmd_images/Bayesian_Modeling/unnamed-chunk-15-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Modeling/unnamed-chunk-15-1.png


--------------------------------------------------------------------------------
/rmd_images/Bayesian_Modeling/unnamed-chunk-15-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Modeling/unnamed-chunk-15-2.png


--------------------------------------------------------------------------------
/rmd_images/Bayesian_Modeling/unnamed-chunk-15-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Modeling/unnamed-chunk-15-3.png


--------------------------------------------------------------------------------
/rmd_images/Bayesian_Modeling/unnamed-chunk-19-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Modeling/unnamed-chunk-19-1.png


--------------------------------------------------------------------------------
/rmd_images/Bayesian_Modeling/unnamed-chunk-19-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Modeling/unnamed-chunk-19-2.png


--------------------------------------------------------------------------------
/rmd_images/Bayesian_Modeling/unnamed-chunk-19-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Modeling/unnamed-chunk-19-3.png


--------------------------------------------------------------------------------
/rmd_images/Bayesian_Modeling/unnamed-chunk-20-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Modeling/unnamed-chunk-20-1.png


--------------------------------------------------------------------------------
/rmd_images/Bayesian_Modeling/unnamed-chunk-20-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Modeling/unnamed-chunk-20-2.png


--------------------------------------------------------------------------------
/rmd_images/Bayesian_Modeling/unnamed-chunk-20-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Modeling/unnamed-chunk-20-3.png


--------------------------------------------------------------------------------
/rmd_images/Bayesian_Modeling/unnamed-chunk-21-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Modeling/unnamed-chunk-21-1.png


--------------------------------------------------------------------------------
/rmd_images/Bayesian_Modeling/unnamed-chunk-23-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Modeling/unnamed-chunk-23-1.png


--------------------------------------------------------------------------------
/rmd_images/Bayesian_Modeling/unnamed-chunk-4-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Bayesian_Modeling/unnamed-chunk-4-1.png


--------------------------------------------------------------------------------
/rmd_images/Caret/results-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Caret/results-1.png


--------------------------------------------------------------------------------
/rmd_images/Caret/results-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Caret/results-2.png


--------------------------------------------------------------------------------
/rmd_images/Clustering/unnamed-chunk-3-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Clustering/unnamed-chunk-3-1.png


--------------------------------------------------------------------------------
/rmd_images/Clustering/unnamed-chunk-3-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Clustering/unnamed-chunk-3-2.png


--------------------------------------------------------------------------------
/rmd_images/Comparing_Bayesian_Packages/unnamed-chunk-10-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Comparing_Bayesian_Packages/unnamed-chunk-10-1.png


--------------------------------------------------------------------------------
/rmd_images/Comparing_Bayesian_Packages/unnamed-chunk-10-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Comparing_Bayesian_Packages/unnamed-chunk-10-2.png


--------------------------------------------------------------------------------
/rmd_images/Comparing_Bayesian_Packages/unnamed-chunk-11-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Comparing_Bayesian_Packages/unnamed-chunk-11-1.png


--------------------------------------------------------------------------------
/rmd_images/Comparing_Bayesian_Packages/unnamed-chunk-11-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Comparing_Bayesian_Packages/unnamed-chunk-11-2.png


--------------------------------------------------------------------------------
/rmd_images/Comparing_Bayesian_Packages/unnamed-chunk-11-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Comparing_Bayesian_Packages/unnamed-chunk-11-3.png


--------------------------------------------------------------------------------
/rmd_images/Comparing_Bayesian_Packages/unnamed-chunk-12-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Comparing_Bayesian_Packages/unnamed-chunk-12-1.png


--------------------------------------------------------------------------------
/rmd_images/Comparing_Bayesian_Packages/unnamed-chunk-6-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Comparing_Bayesian_Packages/unnamed-chunk-6-1.png


--------------------------------------------------------------------------------
/rmd_images/Comparing_Bayesian_Packages/unnamed-chunk-6-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Comparing_Bayesian_Packages/unnamed-chunk-6-2.png


--------------------------------------------------------------------------------
/rmd_images/Distribution_Sampling/unnamed-chunk-3-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Distribution_Sampling/unnamed-chunk-3-1.png


--------------------------------------------------------------------------------
/rmd_images/Distribution_Sampling/unnamed-chunk-4-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Distribution_Sampling/unnamed-chunk-4-1.png


--------------------------------------------------------------------------------
/rmd_images/Geospatial_Analysis/locale-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Geospatial_Analysis/locale-1.png


--------------------------------------------------------------------------------
/rmd_images/Geospatial_Analysis/unnamed-chunk-2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Geospatial_Analysis/unnamed-chunk-2-1.png


--------------------------------------------------------------------------------
/rmd_images/Geospatial_Analysis/unnamed-chunk-3-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Geospatial_Analysis/unnamed-chunk-3-1.png


--------------------------------------------------------------------------------
/rmd_images/Geospatial_Analysis/unnamed-chunk-3-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Geospatial_Analysis/unnamed-chunk-3-2.png


--------------------------------------------------------------------------------
/rmd_images/Modeling_Workflow/explore-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Modeling_Workflow/explore-1.png


--------------------------------------------------------------------------------
/rmd_images/Modeling_Workflow/explore-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Modeling_Workflow/explore-2.png


--------------------------------------------------------------------------------
/rmd_images/Modeling_Workflow/explore-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Modeling_Workflow/explore-3.png


--------------------------------------------------------------------------------
/rmd_images/Modeling_Workflow/plot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Modeling_Workflow/plot-1.png


--------------------------------------------------------------------------------
/rmd_images/Modeling_Workflow/plot-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Modeling_Workflow/plot-2.png


--------------------------------------------------------------------------------
/rmd_images/Multilevel-Models/unnamed-chunk-3-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Multilevel-Models/unnamed-chunk-3-1.png


--------------------------------------------------------------------------------
/rmd_images/Multilevel-Models/unnamed-chunk-3-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Multilevel-Models/unnamed-chunk-3-2.png


--------------------------------------------------------------------------------
/rmd_images/Multilevel-Models/unnamed-chunk-6-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Multilevel-Models/unnamed-chunk-6-1.png


--------------------------------------------------------------------------------
/rmd_images/Multilevel-Models/unnamed-chunk-7-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Multilevel-Models/unnamed-chunk-7-1.png


--------------------------------------------------------------------------------
/rmd_images/Multilevel-Models/unnamed-chunk-8-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Multilevel-Models/unnamed-chunk-8-1.png


--------------------------------------------------------------------------------
/rmd_images/Multilevel-Models/unnamed-chunk-9-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Multilevel-Models/unnamed-chunk-9-1.png


--------------------------------------------------------------------------------
/rmd_images/Multilevel-Models/unnamed-chunk-9-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Multilevel-Models/unnamed-chunk-9-2.png


--------------------------------------------------------------------------------
/rmd_images/Ordinal_Regression/unnamed-chunk-2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Ordinal_Regression/unnamed-chunk-2-1.png


--------------------------------------------------------------------------------
/rmd_images/Ordinal_Regression/unnamed-chunk-4-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Ordinal_Regression/unnamed-chunk-4-1.png


--------------------------------------------------------------------------------
/rmd_images/Ordinal_Regression/unnamed-chunk-6-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Ordinal_Regression/unnamed-chunk-6-1.png


--------------------------------------------------------------------------------
/rmd_images/Ordinal_Regression/unnamed-chunk-7-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Ordinal_Regression/unnamed-chunk-7-1.png


--------------------------------------------------------------------------------
/rmd_images/Ordinal_Regression/unnamed-chunk-8-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Ordinal_Regression/unnamed-chunk-8-1.png


--------------------------------------------------------------------------------
/rmd_images/Parsnip/unnamed-chunk-5-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Parsnip/unnamed-chunk-5-1.png


--------------------------------------------------------------------------------
/rmd_images/Parsnip/unnamed-chunk-6-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Parsnip/unnamed-chunk-6-1.png


--------------------------------------------------------------------------------
/rmd_images/R-Quickstart/histogram-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/R-Quickstart/histogram-1.png


--------------------------------------------------------------------------------
/rmd_images/R-Quickstart/line-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/R-Quickstart/line-1.png


--------------------------------------------------------------------------------
/rmd_images/R-Quickstart/lollipop-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/R-Quickstart/lollipop-1.png


--------------------------------------------------------------------------------
/rmd_images/R-Quickstart/unnamed-chunk-18-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/R-Quickstart/unnamed-chunk-18-1.png


--------------------------------------------------------------------------------
/rmd_images/R-Quickstart/unnamed-chunk-22-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/R-Quickstart/unnamed-chunk-22-1.png


--------------------------------------------------------------------------------
/rmd_images/R_Quotation/unnamed-chunk-1-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/R_Quotation/unnamed-chunk-1-1.png


--------------------------------------------------------------------------------
/rmd_images/R_Quotation/unnamed-chunk-1-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/R_Quotation/unnamed-chunk-1-2.png


--------------------------------------------------------------------------------
/rmd_images/R_Quotation/unnamed-chunk-1-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/R_Quotation/unnamed-chunk-1-3.png


--------------------------------------------------------------------------------
/rmd_images/Regression-Model-Tidying/unnamed-chunk-3-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Regression-Model-Tidying/unnamed-chunk-3-1.png


--------------------------------------------------------------------------------
/rmd_images/Regression-Model-Tidying/unnamed-chunk-3-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Regression-Model-Tidying/unnamed-chunk-3-2.png


--------------------------------------------------------------------------------
/rmd_images/Survival/unnamed-chunk-10-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-10-1.png


--------------------------------------------------------------------------------
/rmd_images/Survival/unnamed-chunk-11-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-11-1.png


--------------------------------------------------------------------------------
/rmd_images/Survival/unnamed-chunk-15-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-15-1.png


--------------------------------------------------------------------------------
/rmd_images/Survival/unnamed-chunk-15-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-15-2.png


--------------------------------------------------------------------------------
/rmd_images/Survival/unnamed-chunk-16-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-16-1.png


--------------------------------------------------------------------------------
/rmd_images/Survival/unnamed-chunk-16-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-16-2.png


--------------------------------------------------------------------------------
/rmd_images/Survival/unnamed-chunk-17-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-17-1.png


--------------------------------------------------------------------------------
/rmd_images/Survival/unnamed-chunk-17-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-17-2.png


--------------------------------------------------------------------------------
/rmd_images/Survival/unnamed-chunk-18-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-18-1.png


--------------------------------------------------------------------------------
/rmd_images/Survival/unnamed-chunk-19-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-19-1.png


--------------------------------------------------------------------------------
/rmd_images/Survival/unnamed-chunk-2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-2-1.png


--------------------------------------------------------------------------------
/rmd_images/Survival/unnamed-chunk-20-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-20-1.png


--------------------------------------------------------------------------------
/rmd_images/Survival/unnamed-chunk-21-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-21-1.png


--------------------------------------------------------------------------------
/rmd_images/Survival/unnamed-chunk-5-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-5-1.png


--------------------------------------------------------------------------------
/rmd_images/Survival/unnamed-chunk-6-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-6-1.png


--------------------------------------------------------------------------------
/rmd_images/Survival/unnamed-chunk-7-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-7-1.png


--------------------------------------------------------------------------------
/rmd_images/Survival/unnamed-chunk-7-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-7-2.png


--------------------------------------------------------------------------------
/rmd_images/Survival/unnamed-chunk-8-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-8-1.png


--------------------------------------------------------------------------------
/rmd_images/Survival/unnamed-chunk-9-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-9-1.png


--------------------------------------------------------------------------------
/rmd_images/Survival/unnamed-chunk-9-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Survival/unnamed-chunk-9-2.png


--------------------------------------------------------------------------------
/rmd_images/Time_Series_Modeling/unnamed-chunk-2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Time_Series_Modeling/unnamed-chunk-2-1.png


--------------------------------------------------------------------------------
/rmd_images/Time_Series_Modeling/unnamed-chunk-2-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Time_Series_Modeling/unnamed-chunk-2-2.png


--------------------------------------------------------------------------------
/rmd_images/Time_Series_Modeling/unnamed-chunk-4-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Time_Series_Modeling/unnamed-chunk-4-1.png


--------------------------------------------------------------------------------
/rmd_images/Time_Series_Modeling/unnamed-chunk-5-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Time_Series_Modeling/unnamed-chunk-5-1.png


--------------------------------------------------------------------------------
/rmd_images/Time_Series_Modeling/unnamed-chunk-5-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Time_Series_Modeling/unnamed-chunk-5-2.png


--------------------------------------------------------------------------------
/rmd_images/Time_Series_Modeling/unnamed-chunk-5-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Time_Series_Modeling/unnamed-chunk-5-3.png


--------------------------------------------------------------------------------
/rmd_images/Titanic/explore-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Titanic/explore-1.png


--------------------------------------------------------------------------------
/rmd_images/Titanic/explore-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Titanic/explore-2.png


--------------------------------------------------------------------------------
/rmd_images/Titanic/explore-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Titanic/explore-3.png


--------------------------------------------------------------------------------
/rmd_images/Titanic/imputation-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Titanic/imputation-1.png


--------------------------------------------------------------------------------
/rmd_images/Titanic/imputation-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Titanic/imputation-2.png


--------------------------------------------------------------------------------
/rmd_images/Titanic/linear-regression-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Titanic/linear-regression-1.png


--------------------------------------------------------------------------------
/rmd_images/Titanic/linear-regression-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Titanic/linear-regression-2.png


--------------------------------------------------------------------------------
/rmd_images/Titanic/linear-regression-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Titanic/linear-regression-3.png


--------------------------------------------------------------------------------
/rmd_images/Titanic/linear-regression-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Titanic/linear-regression-4.png


--------------------------------------------------------------------------------
/rmd_images/Titanic/linear-regression-5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Titanic/linear-regression-5.png


--------------------------------------------------------------------------------
/rmd_images/Titanic/logistic-regression-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Titanic/logistic-regression-1.png


--------------------------------------------------------------------------------
/rmd_images/Titanic/logistic-regression-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Titanic/logistic-regression-2.png


--------------------------------------------------------------------------------
/rmd_images/Titanic/logistic-regression-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Titanic/logistic-regression-3.png


--------------------------------------------------------------------------------
/rmd_images/Titanic/logistic-regression-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Titanic/logistic-regression-4.png


--------------------------------------------------------------------------------
/rmd_images/Vehicles/compare-models-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Vehicles/compare-models-1.png


--------------------------------------------------------------------------------
/rmd_images/Vehicles/compare-models-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Vehicles/compare-models-2.png


--------------------------------------------------------------------------------
/rmd_images/Visualization_Cookbook/bar-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/bar-1.png


--------------------------------------------------------------------------------
/rmd_images/Visualization_Cookbook/bar-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/bar-2.png


--------------------------------------------------------------------------------
/rmd_images/Visualization_Cookbook/bar-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/bar-3.png


--------------------------------------------------------------------------------
/rmd_images/Visualization_Cookbook/bar-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/bar-4.png


--------------------------------------------------------------------------------
/rmd_images/Visualization_Cookbook/boxplot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/boxplot-1.png


--------------------------------------------------------------------------------
/rmd_images/Visualization_Cookbook/bubbleplot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/bubbleplot-1.png


--------------------------------------------------------------------------------
/rmd_images/Visualization_Cookbook/dotplot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/dotplot-1.png


--------------------------------------------------------------------------------
/rmd_images/Visualization_Cookbook/dotplot-rank-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/dotplot-rank-1.png


--------------------------------------------------------------------------------
/rmd_images/Visualization_Cookbook/heatmap-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/heatmap-1.png


--------------------------------------------------------------------------------
/rmd_images/Visualization_Cookbook/histogram-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/histogram-1.png


--------------------------------------------------------------------------------
/rmd_images/Visualization_Cookbook/line-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/line-1.png


--------------------------------------------------------------------------------
/rmd_images/Visualization_Cookbook/line-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/line-2.png


--------------------------------------------------------------------------------
/rmd_images/Visualization_Cookbook/lollipop-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/lollipop-1.png


--------------------------------------------------------------------------------
/rmd_images/Visualization_Cookbook/pyramid-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/pyramid-1.png


--------------------------------------------------------------------------------
/rmd_images/Visualization_Cookbook/ridge-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/ridge-1.png


--------------------------------------------------------------------------------
/rmd_images/Visualization_Cookbook/scatter-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/scatter-1.png


--------------------------------------------------------------------------------
/rmd_images/Visualization_Cookbook/stackedarea-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/stackedarea-1.png


--------------------------------------------------------------------------------
/rmd_images/Visualization_Cookbook/treemap-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/treemap-1.png


--------------------------------------------------------------------------------
/rmd_images/Visualization_Cookbook/treemap-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/treemap-2.png


--------------------------------------------------------------------------------
/rmd_images/Visualization_Cookbook/violin-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/Visualization_Cookbook/violin-1.png


--------------------------------------------------------------------------------
/rmd_images/hypothesis_testing/unnamed-chunk-5-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/hypothesis_testing/unnamed-chunk-5-1.png


--------------------------------------------------------------------------------
/rmd_images/hypothesis_testing/unnamed-chunk-6-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/hypothesis_testing/unnamed-chunk-6-1.png


--------------------------------------------------------------------------------
/rmd_images/hypothesis_testing/unnamed-chunk-7-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jessecambon/Data-Science-Sandbox/8715a7d0b6eb2efcb4a87517d2fb87bf385992e3/rmd_images/hypothesis_testing/unnamed-chunk-7-1.png


--------------------------------------------------------------------------------