├── .gitattributes ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── bayes.png ├── case_studies ├── Case Study 1 - Diabetes dataset.Rmd ├── Case_Study_1_-_Diabetes_dataset.md ├── data │ └── diabetes.sav └── figures │ ├── cs1-unnamed-chunk-12-1.pdf │ ├── cs1-unnamed-chunk-12-1.png │ ├── cs1-unnamed-chunk-15-1.png │ ├── cs1-unnamed-chunk-18-1.png │ ├── cs1-unnamed-chunk-19-1.png │ ├── cs1-unnamed-chunk-22-1.png │ ├── cs1-unnamed-chunk-23-1.png │ ├── cs1-unnamed-chunk-24-1.png │ ├── cs1-unnamed-chunk-25-1.png │ ├── cs1-unnamed-chunk-26-1.png │ ├── cs1-unnamed-chunk-29-1.png │ ├── cs1-unnamed-chunk-33-1.png │ ├── cs1-unnamed-chunk-9-1.pdf │ └── cs1-unnamed-chunk-9-1.png ├── ci └── scripts │ └── runAllModels.sh ├── data ├── aircraft.csv ├── awards.csv ├── bfi.csv ├── binary.dta ├── cereals.txt ├── child_data.csv ├── drugtrial.csv ├── hsbdemo.dta ├── iqdata.csv ├── ologit.dta ├── scents.sav └── temprate.sav ├── models ├── linearRegression.stan ├── logisticRegression.stan ├── multinomialLogisticRegression.stan ├── multipleLinearRegression.stan ├── onewayANOVA.stan ├── orderedLogisticRegression.stan ├── robustRegression.stan └── twowayANOVA.stan ├── notebooks ├── Bayes Factor.Rmd ├── Bayes_Factor.md ├── Correlation Analysis.Rmd ├── Correlation_Analysis.md ├── Factor Analysis.Rmd ├── Factor_Analysis.md ├── Multiple Linear Regression with interaction terms.Rmd ├── Multiple_Linear_Regression_with_interaction_terms.md ├── Poisson Regression.Rmd ├── Poisson_Regression.md └── figures │ ├── corr-unnamed-chunk-5-1.png │ ├── factor-unnamed-chunk-5-1.png │ ├── factor-unnamed-chunk-6-1.png │ ├── multipleLin-unnamed-chunk-4-1.png │ ├── multipleLin-unnamed-chunk-5-1.png │ ├── poisson-unnamed-chunk-10-1.png │ ├── poisson-unnamed-chunk-5-1.png │ └── poisson-unnamed-chunk-9-1.png ├── requirements.txt └── scripts ├── Multiple linear regression with interaction terms.py ├── Poisson Regression.py ├── helper ├── psis.py └── stan_utility.py ├── linearRegression.py ├── logisticRegression.py ├── multinomialLogisticRegression.py ├── multipleLinearRegression.py ├── onewayANOVA.py ├── orderedLogisticRegression.py ├── robustRegression.py └── twowayANOVA.py /.gitattributes: -------------------------------------------------------------------------------- 1 | ci/* linguist-vendored 2 | data/* linguist-vendored 3 | *.ipynb linguist-language=R 4 | *.py linguist-language=R 5 | *.rmd linguist-language=R 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # vscode 107 | .vscode/ 108 | notebooks/.RData 109 | notebooks/.Rhistory 110 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | os: 3 | - "linux" 4 | python: 5 | - "3.6" 6 | install: 7 | - pip install -r requirements.txt 8 | before_script: 9 | - "export MPLBACKEND=Agg" 10 | - "export DISPLAY=:99.0" 11 | - "sh -e /etc/init.d/xvfb start" 12 | - sleep 3 13 | script: 14 | - sh ci/scripts/runAllModels.sh 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 
33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. 
Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. 
This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. 
This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. 
Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 
336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 
397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. 
If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 
512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. 
If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 
633 | 634 | <one line to give the program's name and a brief idea of what it does.> 635 | Copyright (C) <year> <name of author> 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see <https://www.gnu.org/licenses/>. 649 | 650 | Also add information on how to contact you by electronic and paper mail. 651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | <program> Copyright (C) <year> <name of author> 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | <https://www.gnu.org/licenses/>. 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | <https://www.gnu.org/licenses/why-not-lgpl.html>. 675 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Statistical Modeling Examples 2 | 3 |

All we need is just ![Bayes' theorem](bayes.png)

4 | 5 | 6 | [![GitHub license](https://img.shields.io/github/license/mrtkp9993/Statistical-Modelling-Examples.svg)](https://github.com/mrtkp9993/Statistical-Modelling-Examples/blob/master/LICENSE) 7 | [![DOI](https://zenodo.org/badge/143592387.svg)](https://zenodo.org/badge/latestdoi/143592387) 8 | 9 | --- 10 | 11 | ## Case Studies 12 | 13 | * Diabetes dataset: [Dataset info](http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/diabetes.html), [Markdown](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/case_studies/Case_Study_1_-_Diabetes_dataset.md). 14 | 15 | ## Examples 16 | 17 | The PyStan examples include these methods: 18 | 19 | * Linear Regression [Model](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/models/linearRegression.stan), [Script](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/scripts/linearRegression.py). 20 | * Multiple Linear Regression [Model](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/models/multipleLinearRegression.stan), [Script](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/scripts/multipleLinearRegression.py). 21 | * Robust Regression [Model](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/models/robustRegression.stan), [Script](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/scripts/robustRegression.py). 22 | * Logistic Regression [Model](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/models/logisticRegression.stan), [Script](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/scripts/logisticRegression.py). 23 | * Multinomial Logistic Regression [Model](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/models/multinomialLogisticRegression.stan), [Script](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/scripts/multinomialLogisticRegression.py). 24 | * Ordered Logistic Regression [Model](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/models/orderedLogisticRegression.stan), [Script](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/scripts/orderedLogisticRegression.py). 25 | * One-way ANOVA [Model](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/models/onewayANOVA.stan), [Script](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/scripts/onewayANOVA.py). 26 | * Two-way ANOVA [Model](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/models/twowayANOVA.stan), [Script](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/scripts/twowayANOVA.py). 27 | 28 | The R examples include these methods: 29 | 30 | * Factor analysis [Markdown](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/notebooks/Factor_Analysis.md). 31 | * Correlation analysis [Markdown](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/notebooks/Correlation_Analysis.md). 32 | * Multiple Linear Regression with interaction terms [Markdown](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/notebooks/Multiple_Linear_Regression_with_interaction_terms.md). 33 | * Poisson Regression [Markdown](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/notebooks/Poisson_Regression.md). 34 | * Bayes Factors [Markdown](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/notebooks/Bayes_Factor.md).
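Each example follows the same basic workflow: fit one or more candidate models, check the posterior summaries and diagnostic plots, and compare the candidates with an information criterion. A minimal sketch of that workflow in R (with made-up data, and assuming the `rstanarm` and `loo` packages are installed):

```r
library(rstanarm)
library(loo)

# Hypothetical data, for illustration only
df <- data.frame(y = rnorm(100), x1 = rnorm(100), x2 = rnorm(100))

# Fit two candidate Bayesian regressions (MCMC via Stan under the hood)
fit1 <- stan_glm(y ~ x1, data = df)
fit2 <- stan_glm(y ~ x1 + x2, data = df)

# Compare them by WAIC (lower is better), as the diabetes case study does;
# row 3, column 1 of the estimates matrix is the WAIC point estimate
waic(fit1)$estimates[3, 1]
waic(fit2)$estimates[3, 1]
```

The PyStan scripts pair each Stan program under `models/` with a driver script of the same name under `scripts/`.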
35 | 36 | ## Useful Resources 37 | 38 | ### General 39 | 40 | * Glossary of statistical terms [Link](https://www.stat.berkeley.edu/~stark/SticiGui/Text/gloss.htm). 41 | * Statistical tests with Python [Link](https://machinelearningmastery.com/statistical-hypothesis-tests-in-python-cheat-sheet/). 42 | * Michael Betancourt: “A Conceptual Introduction to Hamiltonian Monte Carlo”, 2017; arXiv:1701.02434. 43 | * Hamiltonian Monte Carlo explained [Link](http://arogozhnikov.github.io/2016/12/19/markov_chain_monte_carlo.html). 44 | 45 | ### Stan 46 | 47 | * Stan Reference Manual [Link](https://github.com/stan-dev/stan/releases/download/v2.17.0/stan-reference-2.17.0.pdf). 48 | * PyStan Getting Started [Link](https://pystan.readthedocs.io/en/latest/getting_started.html). 49 | * Stan example models [Link](https://github.com/stan-dev/example-models/tree/master/misc). 50 | * Prior choices [Link](https://github.com/stan-dev/stan/wiki/Prior-Choice-Recommendations). 51 | 52 | ### R 53 | 54 | * R-bloggers [Link](https://www.r-bloggers.com/). 55 | * Quick-R [Link](https://www.statmethods.net/index.html). 56 | 57 | 58 | ## Datasets 59 | 60 | * R datasets [Link](https://vincentarelbundock.github.io/Rdatasets/datasets.html). 61 | * Datasets for teaching [Link](https://www.sheffield.ac.uk/mash/data). 62 | 63 | ## Books 64 | 65 | * Korner-Nievergelt, F., Korner-Nievergelt, P., Roth, T., Almasi, B., Felten, S. V., & Guélat, J. (2016). Bayesian data analysis in ecology using linear models with R, BUGS and Stan. Amsterdam: Elsevier/Academic Press. 66 | -------------------------------------------------------------------------------- /bayes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/bayes.png -------------------------------------------------------------------------------- /case_studies/Case Study 1 - Diabetes dataset.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Case Study 1 - Diabetes dataset" 3 | author: "Murat Koptur" 4 | date: "`r format(Sys.time(), '%d %B %Y')`" 5 | output: 6 | rmarkdown::github_document: default 7 | --- 8 | 9 | ```{r echo=FALSE} 10 | knitr::opts_chunk$set(fig.path='figures/cs1-') 11 | ``` 12 | 13 | ```{r} 14 | library(dplyr) 15 | library(fastDummies) 16 | library(GGally) 17 | library(lavaan) 18 | library(loo) 19 | library(magrittr) 20 | library(mice) 21 | library(psych) 22 | library(rstanarm) 23 | library(semPlot) 24 | ``` 25 | 26 | ```{r} 27 | # http://biostat.mc.vanderbilt.edu/wiki/Main/DataSets 28 | load("./data/diabetes.sav") 29 | ``` 30 | 31 | ```{r} 32 | str(diabetes) 33 | ``` 34 | 35 | ```{r} 36 | # I'll not use location or id in this analysis 37 | diabetes <- select(diabetes, -location, -id) 38 | ``` 39 | 40 | ```{r} 41 | # Let's look at a summary of the data 42 | summary(diabetes) 43 | ``` 44 | 45 | ```{r} 46 | # Investigate NA counts 47 | colSums(is.na(diabetes)) 48 | ``` 49 | 50 | ```{r} 51 | # bp.2s and bp.2d variables have too many missing values 52 | 53 | # Glycosolated hemoglobin (glyhb) column has 13 NAs 54 | # I'll drop these observations 55 | diabetes <- filter(diabetes, !is.na(glyhb)) 56 | ``` 57 | 58 | ```{r} 59 | # inspect the missing-data pattern before imputing 60 | md.pattern(diabetes) 61 | ``` 62 | 63 | ```{r results='hide'} 64 | diabetes_imp <- 65 | mice( 66 | data = diabetes, 67 | m = 5, 68 | maxit = 50, 69 | method = "pmm" 70 | ) 71 | ``` 72 | 73 | ```{r} 74 | # Take the first imputed dataset (we have 5
imputed datasets, m=5) 75 | diabetes_completed <- complete(diabetes_imp, 1) 76 | # Investigate NA counts again 77 | colSums(is.na(diabetes_completed)) 78 | ``` 79 | 80 | ```{r} 81 | # correlation analysis 82 | ggcorr(diabetes_completed, label = TRUE, label_alpha = .7) 83 | ``` 84 | 85 | ```{r} 86 | corr_table <- 87 | cor(diabetes_completed[, sapply(diabetes_completed, is.numeric)]) 88 | subset(as.data.frame(as.table(corr_table)), abs(Freq) > 0.5) 89 | ``` 90 | 91 | ```{r} 92 | # since bp.2d and bp.2s seem highly correlated with bp.1d and bp.1s and 93 | # they have a lot of missing values, I decided to discard them from the analysis 94 | 95 | # also, I'll create two new variables, 96 | # BMI (body mass index) and waist-to-hip ratio 97 | 98 | diabetes_completed$bmi <- 99 | (diabetes_completed$weight / (diabetes_completed$height ** 2) * 703) 100 | diabetes_completed$waist_to_hip_rat <- 101 | diabetes_completed$waist / diabetes_completed$hip 102 | 103 | # take a subset of relatively uncorrelated variables 104 | diabetes_completed_subset <- select( 105 | diabetes_completed, 106 | chol, 107 | ratio, 108 | glyhb, 109 | age, 110 | gender, 111 | bmi, 112 | waist_to_hip_rat, 113 | frame, 114 | bp.1s, 115 | bp.1d, 116 | time.ppn 117 | ) 118 | head(diabetes_completed_subset) 119 | ``` 120 | 121 | ```{r} 122 | # pairs plot 123 | ggpairs(diabetes_completed_subset) 124 | ``` 125 | 126 | ```{r} 127 | # standardize all numeric variables 128 | diabetes_completed_subset %<>% 129 | mutate_at( 130 | funs(scale), 131 | .vars = c( 132 | "chol", 133 | "ratio", 134 | "glyhb", 135 | "age", 136 | "bmi", 137 | "waist_to_hip_rat", 138 | "bp.1s", 139 | "bp.1d", 140 | "time.ppn" 141 | ) 142 | ) 143 | ``` 144 | 145 | ```{r} 146 | # Create dummy variables for gender and frame 147 | library(fastDummies) 148 | diabetes_completed_subset <- 149 | dummy_cols(diabetes_completed_subset, remove_first_dummy = TRUE) 150 | diabetes_completed_subset <- 151 | select(diabetes_completed_subset, -gender, -frame) 152 | head(diabetes_completed_subset) 153 | ``` 154 | 155 | ```{r} 156 | # Exploratory factor analysis 157 | fa.parallel(select(diabetes_completed_subset, -glyhb)) 158 | ``` 159 | 160 | ```{r} 161 | diabetes_completed_subset_fi <- 162 | fa( 163 | select(diabetes_completed_subset, -glyhb), 164 | nfactors = 6, 165 | fm = "pa", 166 | max.iter = 200 167 | ) 168 | fa.diagram(diabetes_completed_subset_fi) 169 | ``` 170 | 171 | ```{r} 172 | fl <- round(unclass(diabetes_completed_subset_fi$loadings), 2) 173 | fl 174 | ``` 175 | 176 | ```{r} 177 | # Let's start building models 178 | model1 <- stan_glm('glyhb ~ .', data = diabetes_completed_subset) 179 | model1 180 | summary(model1) 181 | ``` 182 | 183 | ```{r} 184 | par(mfrow = c(2, 2), mar = c(3, 5, 3, 3)) 185 | plot(model1) 186 | ``` 187 | 188 | ```{r} 189 | model2 <- 190 | stan_glm('glyhb ~ ratio + age', data = diabetes_completed_subset) 191 | model2 192 | summary(model2) 193 | 194 | par(mfrow = c(2, 2), mar = c(3, 5, 3, 3)) 195 | plot(model2) 196 | ``` 197 | 198 | ```{r} 199 | model3 <- 200 | stan_glm('glyhb ~ bmi + waist_to_hip_rat', data = diabetes_completed_subset) 201 | model3 202 | summary(model3) 203 | 204 | par(mfrow = c(2, 2), mar = c(3, 5, 3, 3)) 205 | plot(model3) 206 | ``` 207 | 208 | ```{r} 209 | model4 <- 210 | stan_glm('glyhb ~ ratio + age + bmi + waist_to_hip_rat', data = diabetes_completed_subset) 211 | model4 212 | summary(model4) 213 | 214 | par(mfrow = c(2, 2), mar = c(3, 5, 3, 3)) 215 | plot(model4) 216 | ``` 217 | 218 | ```{r} 219 | model5 <- 220 | stan_glm('glyhb ~ ratio + age + bmi',
data = diabetes_completed_subset) 221 | model5 222 | summary(model5) 223 | 224 | par(mfrow = c(2, 2), mar = c(3, 5, 3, 3)) 225 | plot(model5) 226 | ``` 227 | 228 | ```{r} 229 | ic <- data.frame( 230 | Model = c("model1", "model2", "model3", "model4", "model5"), 231 | WAIC = c(waic(model1)$estimates[3,1], waic(model2)$estimates[3,1], waic(model3)$estimates[3,1], waic(model4)$estimates[3,1], waic(model5)$estimates[3,1]), 232 | stringsAsFactors = FALSE 233 | ) 234 | ic 235 | ``` 236 | 237 | ```{r} 238 | # Let's build a SEM model 239 | library(lavaan) 240 | semModel1 <- ' 241 | pa1 =~ age 242 | pa2 =~ bp.1d + bp.1s 243 | pa3 =~ bmi + frame_large + frame_small 244 | pa4 =~ gender_male + waist_to_hip_rat 245 | pa5 =~ ratio + chol 246 | pa6 =~ time.ppn 247 | 248 | glyhb ~ pa1 + pa2 + pa3 + pa4 + pa5 + pa6 249 | ' 250 | fit1 <- sem(semModel1, 251 | data = diabetes_completed_subset) 252 | fit1 253 | ``` 254 | 255 | ```{r} 256 | semPaths(fit1) 257 | ``` 258 | 259 | ```{r} 260 | summary(fit1, standardized = TRUE, fit.measures = TRUE) 261 | ``` 262 | 263 | ```{r} 264 | parameterEstimates(fit1) 265 | ``` 266 | 267 | ```{r} 268 | # Second SEM model 269 | semModel2 <- ' 270 | pa1 =~ age 271 | pa5 =~ ratio + chol 272 | 273 | glyhb ~ pa1 + pa5 274 | ' 275 | fit2 <- sem(semModel2, 276 | data = diabetes_completed_subset) 277 | fit2 278 | ``` 279 | 280 | ```{r} 281 | semPaths(fit2) 282 | ``` 283 | 284 | ```{r} 285 | summary(fit2, standardized = TRUE, fit.measures = TRUE) 286 | ``` 287 | 288 | ```{r} 289 | parameterEstimates(fit2) 290 | ``` -------------------------------------------------------------------------------- /case_studies/Case_Study_1_-_Diabetes_dataset.md: -------------------------------------------------------------------------------- 1 | Case Study 1 - Diabetes dataset 2 | ================ 3 | Murat Koptur 4 | 26 August 2018 5 | 6 | ``` r 7 | library(dplyr) 8 | library(fastDummies) 9 | library(GGally) 10 | library(lavaan) 11 | library(loo) 12 | library(magrittr) 13 | library(mice) 14 | library(psych) 15 | library(rstanarm) 16 | library(semPlot) 17 | ``` 18 | 19 | ``` r 20 | # http://biostat.mc.vanderbilt.edu/wiki/Main/DataSets 21 | load("./data/diabetes.sav") 22 | ``` 23 | 24 | ``` r 25 | str(diabetes) 26 | ``` 27 | 28 | ## 'data.frame': 403 obs. of 19 variables: 29 | ## $ id : 'labelled' int 1000 1001 1002 1003 1005 1008 1011 1015 1016 1022 ... 30 | ## ..- attr(*, "label")= chr "Subject ID" 31 | ## $ chol : 'labelled' int 203 165 228 78 249 248 195 227 177 263 ... 32 | ## ..- attr(*, "label")= chr "Total Cholesterol" 33 | ## $ stab.glu: 'labelled' int 82 97 92 93 90 94 92 75 87 89 ... 34 | ## ..- attr(*, "label")= chr "Stabilized Glucose" 35 | ## $ hdl : 'labelled' int 56 24 37 12 28 69 41 44 49 40 ... 36 | ## ..- attr(*, "label")= chr "High Density Lipoprotein" 37 | ## $ ratio : 'labelled' num 3.6 6.9 6.2 6.5 8.9 ... 38 | ## ..- attr(*, "label")= chr "Cholesterol/HDL Ratio" 39 | ## $ glyhb : 'labelled' num 4.31 4.44 4.64 4.63 7.72 ... 40 | ## ..- attr(*, "label")= chr "Glycosolated Hemoglobin" 41 | ## $ location: Factor w/ 2 levels "Buckingham","Louisa": 1 1 1 1 1 1 1 1 1 1 ... 42 | ## $ age : int 46 29 58 67 64 34 30 37 45 55 ... 43 | ## ..- attr(*, "units")= chr "years" 44 | ## $ gender : Factor w/ 2 levels "male","female": 2 2 2 1 1 1 1 1 1 2 ... 45 | ## $ height : int 62 64 61 67 68 71 69 59 69 63 ... 46 | ## ..- attr(*, "units")= chr "inches" 47 | ## $ weight : int 121 218 256 119 183 190 191 170 166 202 ...
48 | ## ..- attr(*, "units")= chr "pounds" 49 | ## $ frame : Factor w/ 3 levels "small","medium",..: 2 3 3 3 2 3 2 2 3 1 ... 50 | ## $ bp.1s : 'labelled' int 118 112 190 110 138 132 161 NA 160 108 ... 51 | ## ..- attr(*, "label")= chr "First Systolic Blood Pressure" 52 | ## $ bp.1d : 'labelled' int 59 68 92 50 80 86 112 NA 80 72 ... 53 | ## ..- attr(*, "label")= chr "First Diastolic Blood Pressure" 54 | ## $ bp.2s : 'labelled' int NA NA 185 NA NA NA 161 NA 128 NA ... 55 | ## ..- attr(*, "label")= chr "Second Systolic Blood Pressure" 56 | ## ..- attr(*, "comment")= chr "equals first measurement if it was not high" 57 | ## $ bp.2d : 'labelled' int NA NA 92 NA NA NA 112 NA 86 NA ... 58 | ## ..- attr(*, "comment")= chr "equals first measurement if it was not high" 59 | ## ..- attr(*, "label")= chr "Second Diastolic Blood Pressure" 60 | ## $ waist : int 29 46 49 33 44 36 46 34 34 45 ... 61 | ## ..- attr(*, "units")= chr "inches" 62 | ## $ hip : int 38 48 57 38 41 42 49 39 40 50 ... 63 | ## ..- attr(*, "units")= chr "inches" 64 | ## $ time.ppn: 'labelled' int 720 360 180 480 300 195 720 1020 300 240 ... 65 | ## ..- attr(*, "label")= chr "Postprandial Time when Labs were Drawn" 66 | ## ..- attr(*, "units")= chr "minutes" 67 | 68 | ``` r 69 | # I'll not use location in this analysis 70 | diabetes <- select(diabetes, -location, -id) 71 | ``` 72 | 73 | ``` r 74 | # Let's look at summary of data 75 | summary(diabetes) 76 | ``` 77 | 78 | ## chol stab.glu hdl ratio 79 | ## Min. : 78.0 Min. : 48.0 Min. : 12.00 Min. : 1.500 80 | ## 1st Qu.:179.0 1st Qu.: 81.0 1st Qu.: 38.00 1st Qu.: 3.200 81 | ## Median :204.0 Median : 89.0 Median : 46.00 Median : 4.200 82 | ## Mean :207.8 Mean :106.7 Mean : 50.45 Mean : 4.522 83 | ## 3rd Qu.:230.0 3rd Qu.:106.0 3rd Qu.: 59.00 3rd Qu.: 5.400 84 | ## Max. :443.0 Max. :385.0 Max. :120.00 Max. :19.300 85 | ## NA's :1 NA's :1 NA's :1 86 | ## glyhb age gender height 87 | ## Min. : 2.68 Min. :19.00 male :169 Min. :52.00 88 | ## 1st Qu.: 4.38 1st Qu.:34.00 female:234 1st Qu.:63.00 89 | ## Median : 4.84 Median :45.00 Median :66.00 90 | ## Mean : 5.59 Mean :46.85 Mean :66.02 91 | ## 3rd Qu.: 5.60 3rd Qu.:60.00 3rd Qu.:69.00 92 | ## Max. :16.11 Max. :92.00 Max. :76.00 93 | ## NA's :13 NA's :5 94 | ## weight frame bp.1s bp.1d 95 | ## Min. : 99.0 small :104 Min. : 90.0 Min. : 48.00 96 | ## 1st Qu.:151.0 medium:184 1st Qu.:121.2 1st Qu.: 75.00 97 | ## Median :172.5 large :103 Median :136.0 Median : 82.00 98 | ## Mean :177.6 NA's : 12 Mean :136.9 Mean : 83.32 99 | ## 3rd Qu.:200.0 3rd Qu.:146.8 3rd Qu.: 90.00 100 | ## Max. :325.0 Max. :250.0 Max. :124.00 101 | ## NA's :1 NA's :5 NA's :5 102 | ## bp.2s bp.2d waist hip 103 | ## Min. :110.0 Min. : 60.00 Min. :26.0 Min. :30.00 104 | ## 1st Qu.:138.0 1st Qu.: 84.00 1st Qu.:33.0 1st Qu.:39.00 105 | ## Median :149.0 Median : 92.00 Median :37.0 Median :42.00 106 | ## Mean :152.4 Mean : 92.52 Mean :37.9 Mean :43.04 107 | ## 3rd Qu.:161.0 3rd Qu.:100.00 3rd Qu.:41.0 3rd Qu.:46.00 108 | ## Max. :238.0 Max. :124.00 Max. :56.0 Max. :64.00 109 | ## NA's :262 NA's :262 NA's :2 NA's :2 110 | ## time.ppn 111 | ## Min. : 5.0 112 | ## 1st Qu.: 90.0 113 | ## Median : 240.0 114 | ## Mean : 341.2 115 | ## 3rd Qu.: 517.5 116 | ## Max. 
:1560.0 117 | ## NA's :3 118 | 119 | ``` r 120 | # Investigate NA counts 121 | colSums(is.na(diabetes)) 122 | ``` 123 | 124 | ## chol stab.glu hdl ratio glyhb age gender height 125 | ## 1 0 1 1 13 0 0 5 126 | ## weight frame bp.1s bp.1d bp.2s bp.2d waist hip 127 | ## 1 12 5 5 262 262 2 2 128 | ## time.ppn 129 | ## 3 130 | 131 | ``` r 132 | # bp.2s and bp.2d variables has too much missing values 133 | 134 | # Glycosolated hemoglobin (glyhb) column has 13 NAs 135 | # I'll drop these observations 136 | diabetes <- filter(diabetes, !is.na(glyhb)) 137 | ``` 138 | 139 | ``` r 140 | # impute 141 | md.pattern(diabetes) 142 | ``` 143 | 144 | ![](figures/cs1-unnamed-chunk-9-1.png) 145 | 146 | ## stab.glu glyhb age gender chol hdl ratio weight waist hip time.ppn 147 | ## 130 1 1 1 1 1 1 1 1 1 1 1 148 | ## 236 1 1 1 1 1 1 1 1 1 1 1 149 | ## 6 1 1 1 1 1 1 1 1 1 1 1 150 | ## 3 1 1 1 1 1 1 1 1 1 1 1 151 | ## 3 1 1 1 1 1 1 1 1 1 1 1 152 | ## 4 1 1 1 1 1 1 1 1 1 1 1 153 | ## 1 1 1 1 1 1 1 1 1 1 1 1 154 | ## 1 1 1 1 1 1 1 1 1 1 1 0 155 | ## 1 1 1 1 1 1 1 1 1 1 1 0 156 | ## 1 1 1 1 1 1 1 1 1 1 1 0 157 | ## 1 1 1 1 1 1 1 1 1 0 0 1 158 | ## 1 1 1 1 1 1 1 1 1 0 0 1 159 | ## 1 1 1 1 1 1 1 1 0 1 1 1 160 | ## 1 1 1 1 1 0 0 0 1 1 1 1 161 | ## 0 0 0 0 1 1 1 1 2 2 3 162 | ## height bp.1s bp.1d frame bp.2s bp.2d 163 | ## 130 1 1 1 1 1 1 0 164 | ## 236 1 1 1 1 0 0 2 165 | ## 6 1 1 1 0 1 1 1 166 | ## 3 1 1 1 0 0 0 3 167 | ## 3 1 0 0 1 0 0 4 168 | ## 4 0 1 1 1 0 0 3 169 | ## 1 0 0 0 0 0 0 6 170 | ## 1 1 1 1 1 1 1 1 171 | ## 1 1 1 1 0 0 0 4 172 | ## 1 1 0 0 1 0 0 5 173 | ## 1 1 1 1 1 1 1 2 174 | ## 1 1 1 1 1 0 0 4 175 | ## 1 1 1 1 1 0 0 3 176 | ## 1 1 1 1 1 0 0 5 177 | ## 5 5 5 11 252 252 541 178 | 179 | ``` r 180 | diabetes_imp <- 181 | mice( 182 | data = diabetes, 183 | m = 5, 184 | maxit = 50, 185 | method = "pmm" 186 | ) 187 | ``` 188 | 189 | ``` r 190 | # Take first imputed dataset (we have 5 imputed datasets, m=5) 191 | diabetes_completed <- complete(diabetes_imp, 1) 192 | ``` 193 | 194 | ``` r 195 | # Investigate NA counts again 196 | colSums(is.na(diabetes_completed)) 197 | ``` 198 | 199 | ## chol stab.glu hdl ratio glyhb age gender height 200 | ## 0 0 0 0 0 0 0 0 201 | ## weight frame bp.1s bp.1d bp.2s bp.2d waist hip 202 | ## 0 0 0 0 0 0 0 0 203 | ## time.ppn 204 | ## 0 205 | 206 | ``` r 207 | # correlation analysis 208 | ggcorr(diabetes_completed, label = TRUE, label_alpha = .7) 209 | ``` 210 | 211 | ![](figures/cs1-unnamed-chunk-12-1.png) 212 | 213 | ``` r 214 | corr_table <- 215 | cor(diabetes_completed[, sapply(diabetes_completed, is.numeric)]) 216 | subset(as.data.frame(as.table(corr_table)), abs(Freq) > 0.5) 217 | ``` 218 | 219 | ## Var1 Var2 Freq 220 | ## 1 chol chol 1.0000000 221 | ## 17 stab.glu stab.glu 1.0000000 222 | ## 20 glyhb stab.glu 0.7492355 223 | ## 33 hdl hdl 1.0000000 224 | ## 34 ratio hdl -0.6826599 225 | ## 48 hdl ratio -0.6826599 226 | ## 49 ratio ratio 1.0000000 227 | ## 62 stab.glu glyhb 0.7492355 228 | ## 65 glyhb glyhb 1.0000000 229 | ## 81 age age 1.0000000 230 | ## 97 height height 1.0000000 231 | ## 113 weight weight 1.0000000 232 | ## 118 waist weight 0.8522011 233 | ## 119 hip weight 0.8307025 234 | ## 129 bp.1s bp.1s 1.0000000 235 | ## 130 bp.1d bp.1s 0.6054981 236 | ## 131 bp.2s bp.1s 0.8778776 237 | ## 132 bp.2d bp.1s 0.5162788 238 | ## 144 bp.1s bp.1d 0.6054981 239 | ## 145 bp.1d bp.1d 1.0000000 240 | ## 146 bp.2s bp.1d 0.5814284 241 | ## 147 bp.2d bp.1d 0.8272843 242 | ## 159 bp.1s bp.2s 0.8778776 243 | ## 160 bp.1d bp.2s 0.5814284 244 | ## 161 bp.2s bp.2s 1.0000000 245 | ## 
162 bp.2d bp.2s 0.5746704 246 | ## 174 bp.1s bp.2d 0.5162788 247 | ## 175 bp.1d bp.2d 0.8272843 248 | ## 176 bp.2s bp.2d 0.5746704 249 | ## 177 bp.2d bp.2d 1.0000000 250 | ## 188 weight waist 0.8522011 251 | ## 193 waist waist 1.0000000 252 | ## 194 hip waist 0.8341216 253 | ## 203 weight hip 0.8307025 254 | ## 208 waist hip 0.8341216 255 | ## 209 hip hip 1.0000000 256 | ## 225 time.ppn time.ppn 1.0000000 257 | 258 | ``` r 259 | # since bp.2d and bp.2s seems highly correlated with bp.1d and bp.1s and 260 | # they have a lot of missing values, I decided to discard them from analysis 261 | 262 | # also, I'll create two new variables, 263 | # BMI (body mass index) and waist-to-hip ratio 264 | 265 | diabetes_completed$bmi <- 266 | (diabetes_completed$weight / (diabetes_completed$height ** 2) * 703) 267 | diabetes_completed$waist_to_hip_rat <- 268 | diabetes_completed$waist / diabetes_completed$hip 269 | 270 | # take a subset of uncorrelated variables 271 | diabetes_completed_subset <- select( 272 | diabetes_completed, 273 | chol, 274 | ratio, 275 | glyhb, 276 | age, 277 | gender, 278 | bmi, 279 | waist_to_hip_rat, 280 | frame, 281 | bp.1s, 282 | bp.1d, 283 | time.ppn 284 | ) 285 | head(diabetes_completed_subset) 286 | ``` 287 | 288 | ## chol ratio glyhb age gender bmi waist_to_hip_rat frame bp.1s bp.1d 289 | ## 1 203 3.6 4.31 46 female 22.12877 0.7631579 medium 118 59 290 | ## 2 165 6.9 4.44 29 female 37.41553 0.9583333 large 112 68 291 | ## 3 228 6.2 4.64 58 female 48.36549 0.8596491 large 190 92 292 | ## 4 78 6.5 4.63 67 male 18.63600 0.8684211 large 110 50 293 | ## 5 249 8.9 7.72 64 male 27.82202 1.0731707 medium 138 80 294 | ## 6 248 3.6 4.81 34 male 26.49673 0.8571429 large 132 86 295 | ## time.ppn 296 | ## 1 720 297 | ## 2 360 298 | ## 3 180 299 | ## 4 480 300 | ## 5 300 301 | ## 6 195 302 | 303 | ``` r 304 | # pairs plot 305 | ggpairs(diabetes_completed_subset) 306 | ``` 307 | 308 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 309 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 310 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 311 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 312 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 313 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 314 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 315 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 316 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 317 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 318 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 319 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 320 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 321 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 322 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 323 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 324 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 325 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 
326 | 327 | ![](figures/cs1-unnamed-chunk-15-1.png) 328 | 329 | ``` r 330 | # standardize all variables 331 | diabetes_completed_subset %<>% 332 | mutate_at( 333 | funs(scale), 334 | .vars = c( 335 | "chol", 336 | "ratio", 337 | "glyhb", 338 | "age", 339 | "bmi", 340 | "waist_to_hip_rat", 341 | "bp.1s", 342 | "bp.1d", 343 | "time.ppn" 344 | ) 345 | ) 346 | ``` 347 | 348 | ``` r 349 | # Create dummy variables for gender and frame 350 | library(fastDummies) 351 | diabetes_completed_subset <- 352 | dummy_cols(diabetes_completed_subset, remove_first_dummy = TRUE) 353 | diabetes_completed_subset <- 354 | select(diabetes_completed_subset,-gender,-frame) 355 | head(diabetes_completed_subset) 356 | ``` 357 | 358 | ## chol ratio glyhb age bmi 359 | ## 1 -0.09319585 -0.5301616 -0.5706645 -0.04711384 -0.9973448 360 | ## 2 -0.94314197 1.3678022 -0.5126959 -1.08143433 1.3055456 361 | ## 3 0.46597923 0.9652036 -0.4235136 0.68299474 2.9551156 362 | ## 4 -2.88907124 1.1377459 -0.4279726 1.23057617 -1.5235175 363 | ## 5 0.93568630 2.5180829 0.9498954 1.04804903 -0.1396797 364 | ## 6 0.91331929 -0.5301616 -0.3477085 -0.77722242 -0.3393293 365 | ## waist_to_hip_rat bp.1s bp.1d time.ppn gender_male 366 | ## 1 -1.6083402 -0.82988906 -1.7821860 1.25031434 0 367 | ## 2 1.0550300 -1.09181790 -1.1181935 0.08200624 0 368 | ## 3 -0.2916179 2.31325699 0.6524530 -0.50214781 0 369 | ## 4 -0.1719158 -1.17912751 -2.4461784 0.47144227 1 370 | ## 5 2.6221047 0.04320706 -0.2328703 -0.11271177 1 371 | ## 6 -0.3258185 -0.21872177 0.2097913 -0.45346830 1 372 | ## frame_large frame_small 373 | ## 1 0 0 374 | ## 2 1 0 375 | ## 3 1 0 376 | ## 4 1 0 377 | ## 5 0 0 378 | ## 6 1 0 379 | 380 | ``` r 381 | # Explonatory Factor analysis 382 | fa.parallel(select(diabetes_completed_subset,-glyhb)) 383 | ``` 384 | 385 | ![](figures/cs1-unnamed-chunk-18-1.png) 386 | 387 | ## Parallel analysis suggests that the number of factors = 6 and the number of components = 4 388 | 389 | ``` r 390 | diabetes_completed_subset_fi <- 391 | fa( 392 | select(diabetes_completed_subset,-glyhb), 393 | nfactors = 6, 394 | fm = "pa", 395 | max.iter = 200 396 | ) 397 | ``` 398 | 399 | ``` r 400 | fa.diagram(diabetes_completed_subset_fi) 401 | ``` 402 | 403 | ![](figures/cs1-unnamed-chunk-19-1.png) 404 | 405 | ``` r 406 | fl <- round(unclass(diabetes_completed_subset_fi$loadings), 2) 407 | fl 408 | ``` 409 | 410 | ## PA2 PA3 PA1 PA5 PA4 PA6 411 | ## chol 0.07 -0.10 0.05 0.75 -0.12 0.09 412 | ## ratio -0.08 0.17 -0.01 0.67 0.19 -0.12 413 | ## age -0.02 -0.02 0.99 0.02 -0.01 0.00 414 | ## bmi 0.06 0.84 -0.06 0.03 -0.15 -0.04 415 | ## waist_to_hip_rat 0.01 0.19 0.18 0.08 0.47 -0.05 416 | ## bp.1s 0.58 0.05 0.38 0.02 -0.02 0.00 417 | ## bp.1d 0.98 0.01 -0.07 0.00 0.03 0.00 418 | ## time.ppn -0.09 -0.04 -0.10 0.04 -0.03 0.36 419 | ## gender_male 0.06 -0.15 -0.04 0.00 0.79 0.04 420 | ## frame_large -0.07 0.49 0.15 -0.09 0.31 0.18 421 | ## frame_small -0.05 -0.42 -0.03 -0.14 -0.13 -0.29 422 | 423 | ``` r 424 | # Let's start to build models 425 | model1 <- stan_glm('glyhb ~ .', data = diabetes_completed_subset) 426 | ``` 427 | 428 | ``` r 429 | model1 430 | ``` 431 | 432 | ## stan_glm 433 | ## family: gaussian [identity] 434 | ## formula: "glyhb ~ ." 
435 | ## observations: 390 436 | ## predictors: 12 437 | ## ------ 438 | ## Median MAD_SD 439 | ## (Intercept) 0.0 0.1 440 | ## chol 0.1 0.1 441 | ## ratio 0.2 0.1 442 | ## age 0.3 0.1 443 | ## bmi 0.1 0.1 444 | ## waist_to_hip_rat 0.0 0.1 445 | ## bp.1s 0.1 0.1 446 | ## bp.1d 0.0 0.1 447 | ## time.ppn 0.1 0.0 448 | ## gender_male 0.0 0.1 449 | ## frame_large 0.0 0.1 450 | ## frame_small 0.0 0.1 451 | ## sigma 0.9 0.0 452 | ## 453 | ## Sample avg. posterior predictive distribution of y: 454 | ## Median MAD_SD 455 | ## mean_PPD 0.0 0.1 456 | ## 457 | ## ------ 458 | ## For info on the priors used see help('prior_summary.stanreg'). 459 | 460 | ``` r 461 | summary(model1) 462 | ``` 463 | 464 | ## 465 | ## Model Info: 466 | ## 467 | ## function: stan_glm 468 | ## family: gaussian [identity] 469 | ## formula: "glyhb ~ ." 470 | ## algorithm: sampling 471 | ## priors: see help('prior_summary') 472 | ## sample: 4000 (posterior sample size) 473 | ## observations: 390 474 | ## predictors: 12 475 | ## 476 | ## Estimates: 477 | ## mean sd 2.5% 25% 50% 75% 97.5% 478 | ## (Intercept) 0.0 0.1 -0.2 -0.1 0.0 0.1 0.2 479 | ## chol 0.1 0.1 -0.1 0.0 0.1 0.1 0.2 480 | ## ratio 0.2 0.1 0.1 0.2 0.2 0.3 0.3 481 | ## age 0.3 0.1 0.1 0.2 0.3 0.3 0.4 482 | ## bmi 0.1 0.1 0.0 0.0 0.1 0.1 0.2 483 | ## waist_to_hip_rat 0.0 0.1 -0.1 0.0 0.0 0.1 0.2 484 | ## bp.1s 0.1 0.1 -0.1 0.0 0.1 0.1 0.2 485 | ## bp.1d 0.0 0.1 -0.2 -0.1 0.0 0.0 0.1 486 | ## time.ppn 0.1 0.0 0.0 0.0 0.1 0.1 0.1 487 | ## gender_male 0.0 0.1 -0.2 0.0 0.0 0.1 0.2 488 | ## frame_large 0.0 0.1 -0.3 -0.1 0.0 0.0 0.2 489 | ## frame_small 0.0 0.1 -0.2 -0.1 0.0 0.1 0.2 490 | ## sigma 0.9 0.0 0.8 0.9 0.9 0.9 1.0 491 | ## mean_PPD 0.0 0.1 -0.1 0.0 0.0 0.0 0.1 492 | ## log-posterior -529.0 2.6 -534.8 -530.6 -528.6 -527.1 -524.9 493 | ## 494 | ## Diagnostics: 495 | ## mcse Rhat n_eff 496 | ## (Intercept) 0.0 1.0 4000 497 | ## chol 0.0 1.0 4000 498 | ## ratio 0.0 1.0 4000 499 | ## age 0.0 1.0 4000 500 | ## bmi 0.0 1.0 4000 501 | ## waist_to_hip_rat 0.0 1.0 4000 502 | ## bp.1s 0.0 1.0 3638 503 | ## bp.1d 0.0 1.0 3939 504 | ## time.ppn 0.0 1.0 4000 505 | ## gender_male 0.0 1.0 4000 506 | ## frame_large 0.0 1.0 4000 507 | ## frame_small 0.0 1.0 4000 508 | ## sigma 0.0 1.0 4000 509 | ## mean_PPD 0.0 1.0 4000 510 | ## log-posterior 0.1 1.0 1764 511 | ## 512 | ## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1). 513 | 514 | ``` r 515 | par(mfrow = c(2, 2), mar = c(3, 5, 3, 3)) 516 | plot(model1) 517 | ``` 518 | 519 | ![](figures/cs1-unnamed-chunk-22-1.png) 520 | 521 | ``` r 522 | model2 <- 523 | stan_glm('glyhb ~ ratio + age', data = diabetes_completed_subset) 524 | ``` 525 | 526 | ``` r 527 | model2 528 | ``` 529 | 530 | ## stan_glm 531 | ## family: gaussian [identity] 532 | ## formula: "glyhb ~ ratio + age" 533 | ## observations: 390 534 | ## predictors: 3 535 | ## ------ 536 | ## Median MAD_SD 537 | ## (Intercept) 0.0 0.0 538 | ## ratio 0.3 0.0 539 | ## age 0.3 0.0 540 | ## sigma 0.9 0.0 541 | ## 542 | ## Sample avg. posterior predictive distribution of y: 543 | ## Median MAD_SD 544 | ## mean_PPD 0.0 0.1 545 | ## 546 | ## ------ 547 | ## For info on the priors used see help('prior_summary.stanreg'). 
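Before reading the numerical summary, a graphical posterior predictive check is a quick sanity test. A minimal sketch, assuming the `model2` fit above (rstanarm's `pp_check()` overlays densities of replicated outcomes on the observed `glyhb`):

``` r
# Sketch (not run in the original): does model2 reproduce the shape of glyhb?
pp_check(model2, plotfun = "dens_overlay", nreps = 50)
```

A systematic mismatch between the replicated and observed densities would argue against relying on the information-criterion comparison alone.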
548 | 549 | ``` r 550 | summary(model2) 551 | ``` 552 | 553 | ## 554 | ## Model Info: 555 | ## 556 | ## function: stan_glm 557 | ## family: gaussian [identity] 558 | ## formula: "glyhb ~ ratio + age" 559 | ## algorithm: sampling 560 | ## priors: see help('prior_summary') 561 | ## sample: 4000 (posterior sample size) 562 | ## observations: 390 563 | ## predictors: 3 564 | ## 565 | ## Estimates: 566 | ## mean sd 2.5% 25% 50% 75% 97.5% 567 | ## (Intercept) 0.0 0.0 -0.1 0.0 0.0 0.0 0.1 568 | ## ratio 0.3 0.0 0.2 0.3 0.3 0.3 0.4 569 | ## age 0.3 0.0 0.2 0.3 0.3 0.3 0.4 570 | ## sigma 0.9 0.0 0.8 0.9 0.9 0.9 1.0 571 | ## mean_PPD 0.0 0.1 -0.1 0.0 0.0 0.0 0.1 572 | ## log-posterior -519.3 1.4 -522.8 -520.1 -519.0 -518.3 -517.6 573 | ## 574 | ## Diagnostics: 575 | ## mcse Rhat n_eff 576 | ## (Intercept) 0.0 1.0 4000 577 | ## ratio 0.0 1.0 4000 578 | ## age 0.0 1.0 4000 579 | ## sigma 0.0 1.0 4000 580 | ## mean_PPD 0.0 1.0 4000 581 | ## log-posterior 0.0 1.0 1855 582 | ## 583 | ## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1). 584 | 585 | ``` r 586 | par(mfrow = c(2, 2), mar = c(3, 5, 3, 3)) 587 | plot(model2) 588 | ``` 589 | 590 | ![](figures/cs1-unnamed-chunk-23-1.png) 591 | 592 | ``` r 593 | model3 <- 594 | stan_glm('glyhb ~ bmi + waist_to_hip_rat', data = diabetes_completed_subset) 595 | ``` 596 | 597 | ``` r 598 | model3 599 | ``` 600 | 601 | ## stan_glm 602 | ## family: gaussian [identity] 603 | ## formula: "glyhb ~ bmi + waist_to_hip_rat" 604 | ## observations: 390 605 | ## predictors: 3 606 | ## ------ 607 | ## Median MAD_SD 608 | ## (Intercept) 0.0 0.0 609 | ## bmi 0.1 0.0 610 | ## waist_to_hip_rat 0.2 0.1 611 | ## sigma 1.0 0.0 612 | ## 613 | ## Sample avg. posterior predictive distribution of y: 614 | ## Median MAD_SD 615 | ## mean_PPD 0.0 0.1 616 | ## 617 | ## ------ 618 | ## For info on the priors used see help('prior_summary.stanreg'). 619 | 620 | ``` r 621 | summary(model3) 622 | ``` 623 | 624 | ## 625 | ## Model Info: 626 | ## 627 | ## function: stan_glm 628 | ## family: gaussian [identity] 629 | ## formula: "glyhb ~ bmi + waist_to_hip_rat" 630 | ## algorithm: sampling 631 | ## priors: see help('prior_summary') 632 | ## sample: 4000 (posterior sample size) 633 | ## observations: 390 634 | ## predictors: 3 635 | ## 636 | ## Estimates: 637 | ## mean sd 2.5% 25% 50% 75% 97.5% 638 | ## (Intercept) 0.0 0.0 -0.1 0.0 0.0 0.0 0.1 639 | ## bmi 0.1 0.1 0.0 0.1 0.1 0.1 0.2 640 | ## waist_to_hip_rat 0.2 0.1 0.1 0.1 0.2 0.2 0.3 641 | ## sigma 1.0 0.0 0.9 1.0 1.0 1.0 1.1 642 | ## mean_PPD 0.0 0.1 -0.1 0.0 0.0 0.0 0.1 643 | ## log-posterior -551.6 1.5 -555.3 -552.3 -551.2 -550.5 -549.8 644 | ## 645 | ## Diagnostics: 646 | ## mcse Rhat n_eff 647 | ## (Intercept) 0.0 1.0 4000 648 | ## bmi 0.0 1.0 4000 649 | ## waist_to_hip_rat 0.0 1.0 4000 650 | ## sigma 0.0 1.0 4000 651 | ## mean_PPD 0.0 1.0 4000 652 | ## log-posterior 0.0 1.0 1792 653 | ## 654 | ## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1). 
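The 2.5%–97.5% columns in the summary above can also be extracted directly rather than read off the printout. A minimal sketch, assuming the `model3` object from the previous chunk:

``` r
# Sketch: 90% central posterior intervals for the model3 coefficients,
# the programmatic counterpart of the quantile columns printed above.
posterior_interval(model3, prob = 0.9, pars = c("bmi", "waist_to_hip_rat"))
```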
655 | 656 | ``` r 657 | par(mfrow = c(2, 2), mar = c(3, 5, 3, 3)) 658 | plot(model3) 659 | ``` 660 | 661 | ![](figures/cs1-unnamed-chunk-24-1.png) 662 | 663 | ``` r 664 | model4 <- 665 | stan_glm('glyhb ~ ratio + age + bmi + waist_to_hip_rat', data = diabetes_completed_subset) 666 | ``` 667 | 668 | ``` r 669 | model4 670 | ``` 671 | 672 | ## stan_glm 673 | ## family: gaussian [identity] 674 | ## formula: "glyhb ~ ratio + age + bmi + waist_to_hip_rat" 675 | ## observations: 390 676 | ## predictors: 5 677 | ## ------ 678 | ## Median MAD_SD 679 | ## (Intercept) 0.0 0.0 680 | ## ratio 0.3 0.0 681 | ## age 0.3 0.0 682 | ## bmi 0.1 0.0 683 | ## waist_to_hip_rat 0.0 0.1 684 | ## sigma 0.9 0.0 685 | ## 686 | ## Sample avg. posterior predictive distribution of y: 687 | ## Median MAD_SD 688 | ## mean_PPD 0.0 0.1 689 | ## 690 | ## ------ 691 | ## For info on the priors used see help('prior_summary.stanreg'). 692 | 693 | ``` r 694 | summary(model4) 695 | ``` 696 | 697 | ## 698 | ## Model Info: 699 | ## 700 | ## function: stan_glm 701 | ## family: gaussian [identity] 702 | ## formula: "glyhb ~ ratio + age + bmi + waist_to_hip_rat" 703 | ## algorithm: sampling 704 | ## priors: see help('prior_summary') 705 | ## sample: 4000 (posterior sample size) 706 | ## observations: 390 707 | ## predictors: 5 708 | ## 709 | ## Estimates: 710 | ## mean sd 2.5% 25% 50% 75% 97.5% 711 | ## (Intercept) 0.0 0.0 -0.1 0.0 0.0 0.0 0.1 712 | ## ratio 0.3 0.0 0.2 0.2 0.3 0.3 0.4 713 | ## age 0.3 0.0 0.2 0.3 0.3 0.3 0.4 714 | ## bmi 0.1 0.0 0.0 0.0 0.1 0.1 0.2 715 | ## waist_to_hip_rat 0.0 0.0 -0.1 0.0 0.0 0.1 0.1 716 | ## sigma 0.9 0.0 0.8 0.9 0.9 0.9 1.0 717 | ## mean_PPD 0.0 0.1 -0.1 0.0 0.0 0.0 0.1 718 | ## log-posterior -521.0 1.7 -525.2 -521.9 -520.6 -519.7 -518.6 719 | ## 720 | ## Diagnostics: 721 | ## mcse Rhat n_eff 722 | ## (Intercept) 0.0 1.0 4000 723 | ## ratio 0.0 1.0 4000 724 | ## age 0.0 1.0 4000 725 | ## bmi 0.0 1.0 4000 726 | ## waist_to_hip_rat 0.0 1.0 4000 727 | ## sigma 0.0 1.0 4000 728 | ## mean_PPD 0.0 1.0 4000 729 | ## log-posterior 0.0 1.0 1911 730 | ## 731 | ## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1). 732 | 733 | ``` r 734 | par(mfrow = c(2, 2), mar = c(3, 5, 3, 3)) 735 | plot(model4) 736 | ``` 737 | 738 | ![](figures/cs1-unnamed-chunk-25-1.png) 739 | 740 | ``` r 741 | model5 <- 742 | stan_glm('glyhb ~ ratio + age + bmi', data = diabetes_completed_subset) 743 | ``` 744 | 745 | ``` r 746 | model5 747 | ``` 748 | 749 | ## stan_glm 750 | ## family: gaussian [identity] 751 | ## formula: "glyhb ~ ratio + age + bmi" 752 | ## observations: 390 753 | ## predictors: 4 754 | ## ------ 755 | ## Median MAD_SD 756 | ## (Intercept) 0.0 0.0 757 | ## ratio 0.3 0.0 758 | ## age 0.3 0.0 759 | ## bmi 0.1 0.0 760 | ## sigma 0.9 0.0 761 | ## 762 | ## Sample avg. posterior predictive distribution of y: 763 | ## Median MAD_SD 764 | ## mean_PPD 0.0 0.1 765 | ## 766 | ## ------ 767 | ## For info on the priors used see help('prior_summary.stanreg'). 
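Because every variable (including `glyhb`) was standardized earlier, predictions from these fits live on the standardized scale. A minimal sketch with a hypothetical profile (`ratio` one standard deviation above the mean, average `age` and `bmi`), assuming the `model5` fit above:

``` r
# Sketch (hypothetical newdata on the standardized scale; not in the original):
new_obs <- data.frame(ratio = 1, age = 0, bmi = 0)
draws <- posterior_predict(model5, newdata = new_obs)
quantile(draws, probs = c(0.05, 0.5, 0.95)) # 90% predictive interval for glyhb
```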
768 | 769 | ``` r 770 | summary(model5) 771 | ``` 772 | 773 | ## 774 | ## Model Info: 775 | ## 776 | ## function: stan_glm 777 | ## family: gaussian [identity] 778 | ## formula: "glyhb ~ ratio + age + bmi" 779 | ## algorithm: sampling 780 | ## priors: see help('prior_summary') 781 | ## sample: 4000 (posterior sample size) 782 | ## observations: 390 783 | ## predictors: 4 784 | ## 785 | ## Estimates: 786 | ## mean sd 2.5% 25% 50% 75% 97.5% 787 | ## (Intercept) 0.0 0.0 -0.1 0.0 0.0 0.0 0.1 788 | ## ratio 0.3 0.0 0.2 0.2 0.3 0.3 0.4 789 | ## age 0.3 0.0 0.2 0.3 0.3 0.3 0.4 790 | ## bmi 0.1 0.0 0.0 0.0 0.1 0.1 0.2 791 | ## sigma 0.9 0.0 0.8 0.9 0.9 0.9 1.0 792 | ## mean_PPD 0.0 0.1 -0.1 0.0 0.0 0.0 0.1 793 | ## log-posterior -519.8 1.5 -523.4 -520.6 -519.5 -518.7 -517.8 794 | ## 795 | ## Diagnostics: 796 | ## mcse Rhat n_eff 797 | ## (Intercept) 0.0 1.0 4000 798 | ## ratio 0.0 1.0 4000 799 | ## age 0.0 1.0 4000 800 | ## bmi 0.0 1.0 4000 801 | ## sigma 0.0 1.0 4000 802 | ## mean_PPD 0.0 1.0 4000 803 | ## log-posterior 0.0 1.0 1941 804 | ## 805 | ## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1). 806 | 807 | ``` r 808 | par(mfrow = c(2, 2), mar = c(3, 5, 3, 3)) 809 | plot(model5) 810 | ``` 811 | 812 | ![](figures/cs1-unnamed-chunk-26-1.png) 813 | 814 | ``` r 815 | ic <- data.frame( 816 | Model = c("model1", "model2", "model3", "model4", "model5"), 817 | WAIC = c(waic(model1)$estimates[3,1], waic(model2)$estimates[3,1], waic(model3)$estimates[3,1], waic(model4)$estimates[3,1], waic(model5)$estimates[3,1]), 818 | stringsAsFactors = FALSE 819 | ) 820 | ``` 821 | 822 | ``` r 823 | ic 824 | ``` 825 | 826 | ## Model WAIC 827 | ## 1 model1 1045.760 828 | ## 2 model2 1033.905 829 | ## 3 model3 1097.760 830 | ## 4 model4 1035.492 831 | ## 5 model5 1034.094 832 | 833 | ``` r 834 | # Let's build a SEM model 835 | library(lavaan) 836 | semModel1 <- ' 837 | pa1 =~ age 838 | pa2 =~ bp.1d + bp.1s 839 | pa3 =~ bmi + frame_large + frame_small 840 | pa4 =~ gender_male + waist_to_hip_rat 841 | pa5 =~ ratio + chol 842 | pa6 =~ time.ppn 843 | 844 | glyhb ~ pa1 + pa2 + pa3 + pa4 + pa5 + pa6 845 | ' 846 | fit1 <- sem(semModel1, 847 | data = diabetes_completed_subset) 848 | ``` 849 | 850 | ## Warning in lav_object_post_check(object): lavaan WARNING: some estimated ov 851 | ## variances are negative 852 | 853 | ``` r 854 | fit1 855 | ``` 856 | 857 | ## lavaan 0.6-2 ended normally after 144 iterations 858 | ## 859 | ## Optimization method NLMINB 860 | ## Number of free parameters 42 861 | ## 862 | ## Number of observations 390 863 | ## 864 | ## Estimator ML 865 | ## Model Fit Test Statistic 178.781 866 | ## Degrees of freedom 36 867 | ## P-value (Chi-square) 0.000 868 | 869 | ``` r 870 | semPaths(fit1) 871 | ``` 872 | 873 | ![](figures/cs1-unnamed-chunk-29-1.png) 874 | 875 | ``` r 876 | summary(fit1, standardized = TRUE, fit.measures = TRUE) 877 | ``` 878 | 879 | ## lavaan 0.6-2 ended normally after 144 iterations 880 | ## 881 | ## Optimization method NLMINB 882 | ## Number of free parameters 42 883 | ## 884 | ## Number of observations 390 885 | ## 886 | ## Estimator ML 887 | ## Model Fit Test Statistic 178.781 888 | ## Degrees of freedom 36 889 | ## P-value (Chi-square) 0.000 890 | ## 891 | ## Model test baseline model: 892 | ## 893 | ## Minimum Function Test Statistic 974.533 894 | ## Degrees of freedom 66 895 | ## P-value 0.000 896 | ## 897 | ## User model versus baseline 
model: 898 | ## 899 | ## Comparative Fit Index (CFI) 0.843 900 | ## Tucker-Lewis Index (TLI) 0.712 901 | ## 902 | ## Loglikelihood and Information Criteria: 903 | ## 904 | ## Loglikelihood user model (H0) -5323.322 905 | ## Loglikelihood unrestricted model (H1) -5233.931 906 | ## 907 | ## Number of free parameters 42 908 | ## Akaike (AIC) 10730.643 909 | ## Bayesian (BIC) 10897.222 910 | ## Sample-size adjusted Bayesian (BIC) 10763.958 911 | ## 912 | ## Root Mean Square Error of Approximation: 913 | ## 914 | ## RMSEA 0.101 915 | ## 90 Percent Confidence Interval 0.086 0.116 916 | ## P-value RMSEA <= 0.05 0.000 917 | ## 918 | ## Standardized Root Mean Square Residual: 919 | ## 920 | ## SRMR 0.064 921 | ## 922 | ## Parameter Estimates: 923 | ## 924 | ## Information Expected 925 | ## Information saturated (h1) model Structured 926 | ## Standard Errors Standard 927 | ## 928 | ## Latent Variables: 929 | ## Estimate Std.Err z-value P(>|z|) Std.lv Std.all 930 | ## pa1 =~ 931 | ## age 1.000 0.999 1.000 932 | ## pa2 =~ 933 | ## bp.1d 1.000 0.340 0.340 934 | ## bp.1s 5.235 2.648 1.977 0.048 1.778 1.780 935 | ## pa3 =~ 936 | ## bmi 1.000 0.532 0.533 937 | ## frame_large 0.483 0.072 6.705 0.000 0.257 0.586 938 | ## frame_small -0.543 0.080 -6.793 0.000 -0.289 -0.652 939 | ## pa4 =~ 940 | ## gender_male 1.000 0.157 0.320 941 | ## waist_to_hp_rt 6.946 2.881 2.411 0.016 1.094 1.095 942 | ## pa5 =~ 943 | ## ratio 1.000 0.882 0.883 944 | ## chol 0.612 0.106 5.760 0.000 0.539 0.540 945 | ## pa6 =~ 946 | ## time.ppn 1.000 0.999 1.000 947 | ## 948 | ## Regressions: 949 | ## Estimate Std.Err z-value P(>|z|) Std.lv Std.all 950 | ## glyhb ~ 951 | ## pa1 0.257 0.049 5.254 0.000 0.256 0.257 952 | ## pa2 0.055 0.061 0.892 0.372 0.019 0.019 953 | ## pa3 0.063 0.134 0.469 0.639 0.033 0.033 954 | ## pa4 0.132 0.290 0.456 0.648 0.021 0.021 955 | ## pa5 0.351 0.090 3.916 0.000 0.310 0.310 956 | ## pa6 0.056 0.046 1.222 0.222 0.056 0.056 957 | ## 958 | ## Covariances: 959 | ## Estimate Std.Err z-value P(>|z|) Std.lv Std.all 960 | ## pa1 ~~ 961 | ## pa2 0.088 0.050 1.766 0.077 0.259 0.259 962 | ## pa3 0.130 0.037 3.510 0.000 0.244 0.244 963 | ## pa4 0.040 0.019 2.163 0.031 0.256 0.256 964 | ## pa5 0.187 0.051 3.681 0.000 0.213 0.213 965 | ## pa6 -0.039 0.051 -0.778 0.437 -0.039 -0.039 966 | ## pa2 ~~ 967 | ## pa3 0.019 0.012 1.564 0.118 0.108 0.108 968 | ## pa4 0.003 0.003 1.249 0.212 0.059 0.059 969 | ## pa5 0.023 0.015 1.499 0.134 0.077 0.077 970 | ## pa6 -0.008 0.010 -0.840 0.401 -0.024 -0.024 971 | ## pa3 ~~ 972 | ## pa4 0.027 0.013 2.113 0.035 0.322 0.322 973 | ## pa5 0.182 0.039 4.618 0.000 0.388 0.388 974 | ## pa6 0.036 0.034 1.062 0.288 0.069 0.069 975 | ## pa4 ~~ 976 | ## pa5 0.034 0.016 2.109 0.035 0.248 0.248 977 | ## pa6 0.000 0.007 0.003 0.998 0.000 0.000 978 | ## pa5 ~~ 979 | ## pa6 -0.037 0.050 -0.733 0.464 -0.042 -0.042 980 | ## 981 | ## Variances: 982 | ## Estimate Std.Err z-value P(>|z|) Std.lv Std.all 983 | ## .age 0.000 0.000 0.000 984 | ## .bp.1d 0.882 0.084 10.560 0.000 0.882 0.884 985 | ## .bp.1s -2.164 1.506 -1.437 0.151 -2.164 -2.170 986 | ## .bmi 0.714 0.065 10.975 0.000 0.714 0.716 987 | ## .frame_large 0.126 0.013 9.944 0.000 0.126 0.656 988 | ## .frame_small 0.113 0.014 8.366 0.000 0.113 0.575 989 | ## .gender_male 0.218 0.018 11.936 0.000 0.218 0.898 990 | ## .waist_to_hp_rt -0.199 0.458 -0.435 0.664 -0.199 -0.200 991 | ## .ratio 0.219 0.124 1.771 0.077 0.219 0.220 992 | ## .chol 0.706 0.068 10.337 0.000 0.706 0.708 993 | ## .time.ppn 0.000 0.000 0.000 994 | ## .glyhb 0.777 0.059 13.243 0.000 
0.777 0.779 995 | ## pa1 0.997 0.071 13.964 0.000 1.000 1.000 996 | ## pa2 0.115 0.064 1.802 0.072 1.000 1.000 997 | ## pa3 0.283 0.064 4.423 0.000 1.000 1.000 998 | ## pa4 0.025 0.012 2.035 0.042 1.000 1.000 999 | ## pa5 0.778 0.141 5.508 0.000 1.000 1.000 1000 | ## pa6 0.997 0.071 13.964 0.000 1.000 1.000 1001 | 1002 | ``` r 1003 | parameterEstimates(fit1) 1004 | ``` 1005 | 1006 | ## lhs op rhs est se z pvalue 1007 | ## 1 pa1 =~ age 1.000 0.000 NA NA 1008 | ## 2 pa2 =~ bp.1d 1.000 0.000 NA NA 1009 | ## 3 pa2 =~ bp.1s 5.235 2.648 1.977 0.048 1010 | ## 4 pa3 =~ bmi 1.000 0.000 NA NA 1011 | ## 5 pa3 =~ frame_large 0.483 0.072 6.705 0.000 1012 | ## 6 pa3 =~ frame_small -0.543 0.080 -6.793 0.000 1013 | ## 7 pa4 =~ gender_male 1.000 0.000 NA NA 1014 | ## 8 pa4 =~ waist_to_hip_rat 6.946 2.881 2.411 0.016 1015 | ## 9 pa5 =~ ratio 1.000 0.000 NA NA 1016 | ## 10 pa5 =~ chol 0.612 0.106 5.760 0.000 1017 | ## 11 pa6 =~ time.ppn 1.000 0.000 NA NA 1018 | ## 12 glyhb ~ pa1 0.257 0.049 5.254 0.000 1019 | ## 13 glyhb ~ pa2 0.055 0.061 0.892 0.372 1020 | ## 14 glyhb ~ pa3 0.063 0.134 0.469 0.639 1021 | ## 15 glyhb ~ pa4 0.132 0.290 0.456 0.648 1022 | ## 16 glyhb ~ pa5 0.351 0.090 3.916 0.000 1023 | ## 17 glyhb ~ pa6 0.056 0.046 1.222 0.222 1024 | ## 18 age ~~ age 0.000 0.000 NA NA 1025 | ## 19 bp.1d ~~ bp.1d 0.882 0.084 10.560 0.000 1026 | ## 20 bp.1s ~~ bp.1s -2.164 1.506 -1.437 0.151 1027 | ## 21 bmi ~~ bmi 0.714 0.065 10.975 0.000 1028 | ## 22 frame_large ~~ frame_large 0.126 0.013 9.944 0.000 1029 | ## 23 frame_small ~~ frame_small 0.113 0.014 8.366 0.000 1030 | ## 24 gender_male ~~ gender_male 0.218 0.018 11.936 0.000 1031 | ## 25 waist_to_hip_rat ~~ waist_to_hip_rat -0.199 0.458 -0.435 0.664 1032 | ## 26 ratio ~~ ratio 0.219 0.124 1.771 0.077 1033 | ## 27 chol ~~ chol 0.706 0.068 10.337 0.000 1034 | ## 28 time.ppn ~~ time.ppn 0.000 0.000 NA NA 1035 | ## 29 glyhb ~~ glyhb 0.777 0.059 13.243 0.000 1036 | ## 30 pa1 ~~ pa1 0.997 0.071 13.964 0.000 1037 | ## 31 pa2 ~~ pa2 0.115 0.064 1.802 0.072 1038 | ## 32 pa3 ~~ pa3 0.283 0.064 4.423 0.000 1039 | ## 33 pa4 ~~ pa4 0.025 0.012 2.035 0.042 1040 | ## 34 pa5 ~~ pa5 0.778 0.141 5.508 0.000 1041 | ## 35 pa6 ~~ pa6 0.997 0.071 13.964 0.000 1042 | ## 36 pa1 ~~ pa2 0.088 0.050 1.766 0.077 1043 | ## 37 pa1 ~~ pa3 0.130 0.037 3.510 0.000 1044 | ## 38 pa1 ~~ pa4 0.040 0.019 2.163 0.031 1045 | ## 39 pa1 ~~ pa5 0.187 0.051 3.681 0.000 1046 | ## 40 pa1 ~~ pa6 -0.039 0.051 -0.778 0.437 1047 | ## 41 pa2 ~~ pa3 0.019 0.012 1.564 0.118 1048 | ## 42 pa2 ~~ pa4 0.003 0.003 1.249 0.212 1049 | ## 43 pa2 ~~ pa5 0.023 0.015 1.499 0.134 1050 | ## 44 pa2 ~~ pa6 -0.008 0.010 -0.840 0.401 1051 | ## 45 pa3 ~~ pa4 0.027 0.013 2.113 0.035 1052 | ## 46 pa3 ~~ pa5 0.182 0.039 4.618 0.000 1053 | ## 47 pa3 ~~ pa6 0.036 0.034 1.062 0.288 1054 | ## 48 pa4 ~~ pa5 0.034 0.016 2.109 0.035 1055 | ## 49 pa4 ~~ pa6 0.000 0.007 0.003 0.998 1056 | ## 50 pa5 ~~ pa6 -0.037 0.050 -0.733 0.464 1057 | ## ci.lower ci.upper 1058 | ## 1 1.000 1.000 1059 | ## 2 1.000 1.000 1060 | ## 3 0.046 10.425 1061 | ## 4 1.000 1.000 1062 | ## 5 0.341 0.624 1063 | ## 6 -0.700 -0.386 1064 | ## 7 1.000 1.000 1065 | ## 8 1.300 12.592 1066 | ## 9 1.000 1.000 1067 | ## 10 0.403 0.820 1068 | ## 11 1.000 1.000 1069 | ## 12 0.161 0.353 1070 | ## 13 -0.065 0.174 1071 | ## 14 -0.199 0.325 1072 | ## 15 -0.436 0.700 1073 | ## 16 0.175 0.527 1074 | ## 17 -0.034 0.146 1075 | ## 18 0.000 0.000 1076 | ## 19 0.718 1.046 1077 | ## 20 -5.116 0.787 1078 | ## 21 0.587 0.842 1079 | ## 22 0.101 0.151 1080 | ## 23 0.087 0.140 1081 | ## 24 
0.182 0.254 1082 | ## 25 -1.096 0.698 1083 | ## 26 -0.023 0.462 1084 | ## 27 0.573 0.840 1085 | ## 28 0.000 0.000 1086 | ## 29 0.662 0.892 1087 | ## 30 0.857 1.137 1088 | ## 31 -0.010 0.241 1089 | ## 32 0.158 0.409 1090 | ## 33 0.001 0.049 1091 | ## 34 0.501 1.055 1092 | ## 35 0.857 1.137 1093 | ## 36 -0.010 0.186 1094 | ## 37 0.057 0.203 1095 | ## 38 0.004 0.077 1096 | ## 39 0.088 0.287 1097 | ## 40 -0.138 0.060 1098 | ## 41 -0.005 0.044 1099 | ## 42 -0.002 0.008 1100 | ## 43 -0.007 0.053 1101 | ## 44 -0.027 0.011 1102 | ## 45 0.002 0.052 1103 | ## 46 0.105 0.260 1104 | ## 47 -0.031 0.104 1105 | ## 48 0.002 0.067 1106 | ## 49 -0.014 0.014 1107 | ## 50 -0.135 0.061 1108 | 1109 | ``` r 1110 | # Second SEM model 1111 | semModel2 <- ' 1112 | pa1 =~ age 1113 | pa5 =~ ratio + chol 1114 | 1115 | glyhb ~ pa1 + pa5 1116 | ' 1117 | fit2 <- sem(semModel2, 1118 | data = diabetes_completed_subset) 1119 | fit2 1120 | ``` 1121 | 1122 | ## lavaan 0.6-2 ended normally after 21 iterations 1123 | ## 1124 | ## Optimization method NLMINB 1125 | ## Number of free parameters 9 1126 | ## 1127 | ## Number of observations 390 1128 | ## 1129 | ## Estimator ML 1130 | ## Model Fit Test Statistic 7.350 1131 | ## Degrees of freedom 1 1132 | ## P-value (Chi-square) 0.007 1133 | 1134 | ``` r 1135 | semPaths(fit2) 1136 | ``` 1137 | 1138 | ![](figures/cs1-unnamed-chunk-33-1.png) 1139 | 1140 | ``` r 1141 | summary(fit2, standardized = TRUE, fit.measures = TRUE) 1142 | ``` 1143 | 1144 | ## lavaan 0.6-2 ended normally after 21 iterations 1145 | ## 1146 | ## Optimization method NLMINB 1147 | ## Number of free parameters 9 1148 | ## 1149 | ## Number of observations 390 1150 | ## 1151 | ## Estimator ML 1152 | ## Model Fit Test Statistic 7.350 1153 | ## Degrees of freedom 1 1154 | ## P-value (Chi-square) 0.007 1155 | ## 1156 | ## Model test baseline model: 1157 | ## 1158 | ## Minimum Function Test Statistic 210.710 1159 | ## Degrees of freedom 6 1160 | ## P-value 0.000 1161 | ## 1162 | ## User model versus baseline model: 1163 | ## 1164 | ## Comparative Fit Index (CFI) 0.969 1165 | ## Tucker-Lewis Index (TLI) 0.814 1166 | ## 1167 | ## Loglikelihood and Information Criteria: 1168 | ## 1169 | ## Loglikelihood user model (H0) -2109.862 1170 | ## Loglikelihood unrestricted model (H1) -2106.186 1171 | ## 1172 | ## Number of free parameters 9 1173 | ## Akaike (AIC) 4237.723 1174 | ## Bayesian (BIC) 4273.418 1175 | ## Sample-size adjusted Bayesian (BIC) 4244.862 1176 | ## 1177 | ## Root Mean Square Error of Approximation: 1178 | ## 1179 | ## RMSEA 0.128 1180 | ## 90 Percent Confidence Interval 0.054 0.221 1181 | ## P-value RMSEA <= 0.05 0.042 1182 | ## 1183 | ## Standardized Root Mean Square Residual: 1184 | ## 1185 | ## SRMR 0.027 1186 | ## 1187 | ## Parameter Estimates: 1188 | ## 1189 | ## Information Expected 1190 | ## Information saturated (h1) model Structured 1191 | ## Standard Errors Standard 1192 | ## 1193 | ## Latent Variables: 1194 | ## Estimate Std.Err z-value P(>|z|) Std.lv Std.all 1195 | ## pa1 =~ 1196 | ## age 1.000 0.999 1.000 1197 | ## pa5 =~ 1198 | ## ratio 1.000 0.733 0.734 1199 | ## chol 0.885 0.149 5.938 0.000 0.649 0.650 1200 | ## 1201 | ## Regressions: 1202 | ## Estimate Std.Err z-value P(>|z|) Std.lv Std.all 1203 | ## glyhb ~ 1204 | ## pa1 0.238 0.050 4.789 0.000 0.238 0.238 1205 | ## pa5 0.485 0.099 4.903 0.000 0.355 0.356 1206 | ## 1207 | ## Covariances: 1208 | ## Estimate Std.Err z-value P(>|z|) Std.lv Std.all 1209 | ## pa1 ~~ 1210 | ## pa5 0.207 0.049 4.237 0.000 0.283 0.283 1211 | ## 1212 | ## Variances: 1213 
| ## Estimate Std.Err z-value P(>|z|) Std.lv Std.all 1214 | ## .age 0.000 0.000 0.000 1215 | ## .ratio 0.460 0.092 4.991 0.000 0.460 0.461 1216 | ## .chol 0.576 0.079 7.283 0.000 0.576 0.578 1217 | ## .glyhb 0.767 0.060 12.771 0.000 0.767 0.769 1218 | ## pa1 0.997 0.071 13.964 0.000 1.000 1.000 1219 | ## pa5 0.537 0.107 5.027 0.000 1.000 1.000 1220 | 1221 | ``` r 1222 | parameterEstimates(fit1) 1223 | ``` 1224 | 1225 | ## lhs op rhs est se z pvalue 1226 | ## 1 pa1 =~ age 1.000 0.000 NA NA 1227 | ## 2 pa2 =~ bp.1d 1.000 0.000 NA NA 1228 | ## 3 pa2 =~ bp.1s 5.235 2.648 1.977 0.048 1229 | ## 4 pa3 =~ bmi 1.000 0.000 NA NA 1230 | ## 5 pa3 =~ frame_large 0.483 0.072 6.705 0.000 1231 | ## 6 pa3 =~ frame_small -0.543 0.080 -6.793 0.000 1232 | ## 7 pa4 =~ gender_male 1.000 0.000 NA NA 1233 | ## 8 pa4 =~ waist_to_hip_rat 6.946 2.881 2.411 0.016 1234 | ## 9 pa5 =~ ratio 1.000 0.000 NA NA 1235 | ## 10 pa5 =~ chol 0.612 0.106 5.760 0.000 1236 | ## 11 pa6 =~ time.ppn 1.000 0.000 NA NA 1237 | ## 12 glyhb ~ pa1 0.257 0.049 5.254 0.000 1238 | ## 13 glyhb ~ pa2 0.055 0.061 0.892 0.372 1239 | ## 14 glyhb ~ pa3 0.063 0.134 0.469 0.639 1240 | ## 15 glyhb ~ pa4 0.132 0.290 0.456 0.648 1241 | ## 16 glyhb ~ pa5 0.351 0.090 3.916 0.000 1242 | ## 17 glyhb ~ pa6 0.056 0.046 1.222 0.222 1243 | ## 18 age ~~ age 0.000 0.000 NA NA 1244 | ## 19 bp.1d ~~ bp.1d 0.882 0.084 10.560 0.000 1245 | ## 20 bp.1s ~~ bp.1s -2.164 1.506 -1.437 0.151 1246 | ## 21 bmi ~~ bmi 0.714 0.065 10.975 0.000 1247 | ## 22 frame_large ~~ frame_large 0.126 0.013 9.944 0.000 1248 | ## 23 frame_small ~~ frame_small 0.113 0.014 8.366 0.000 1249 | ## 24 gender_male ~~ gender_male 0.218 0.018 11.936 0.000 1250 | ## 25 waist_to_hip_rat ~~ waist_to_hip_rat -0.199 0.458 -0.435 0.664 1251 | ## 26 ratio ~~ ratio 0.219 0.124 1.771 0.077 1252 | ## 27 chol ~~ chol 0.706 0.068 10.337 0.000 1253 | ## 28 time.ppn ~~ time.ppn 0.000 0.000 NA NA 1254 | ## 29 glyhb ~~ glyhb 0.777 0.059 13.243 0.000 1255 | ## 30 pa1 ~~ pa1 0.997 0.071 13.964 0.000 1256 | ## 31 pa2 ~~ pa2 0.115 0.064 1.802 0.072 1257 | ## 32 pa3 ~~ pa3 0.283 0.064 4.423 0.000 1258 | ## 33 pa4 ~~ pa4 0.025 0.012 2.035 0.042 1259 | ## 34 pa5 ~~ pa5 0.778 0.141 5.508 0.000 1260 | ## 35 pa6 ~~ pa6 0.997 0.071 13.964 0.000 1261 | ## 36 pa1 ~~ pa2 0.088 0.050 1.766 0.077 1262 | ## 37 pa1 ~~ pa3 0.130 0.037 3.510 0.000 1263 | ## 38 pa1 ~~ pa4 0.040 0.019 2.163 0.031 1264 | ## 39 pa1 ~~ pa5 0.187 0.051 3.681 0.000 1265 | ## 40 pa1 ~~ pa6 -0.039 0.051 -0.778 0.437 1266 | ## 41 pa2 ~~ pa3 0.019 0.012 1.564 0.118 1267 | ## 42 pa2 ~~ pa4 0.003 0.003 1.249 0.212 1268 | ## 43 pa2 ~~ pa5 0.023 0.015 1.499 0.134 1269 | ## 44 pa2 ~~ pa6 -0.008 0.010 -0.840 0.401 1270 | ## 45 pa3 ~~ pa4 0.027 0.013 2.113 0.035 1271 | ## 46 pa3 ~~ pa5 0.182 0.039 4.618 0.000 1272 | ## 47 pa3 ~~ pa6 0.036 0.034 1.062 0.288 1273 | ## 48 pa4 ~~ pa5 0.034 0.016 2.109 0.035 1274 | ## 49 pa4 ~~ pa6 0.000 0.007 0.003 0.998 1275 | ## 50 pa5 ~~ pa6 -0.037 0.050 -0.733 0.464 1276 | ## ci.lower ci.upper 1277 | ## 1 1.000 1.000 1278 | ## 2 1.000 1.000 1279 | ## 3 0.046 10.425 1280 | ## 4 1.000 1.000 1281 | ## 5 0.341 0.624 1282 | ## 6 -0.700 -0.386 1283 | ## 7 1.000 1.000 1284 | ## 8 1.300 12.592 1285 | ## 9 1.000 1.000 1286 | ## 10 0.403 0.820 1287 | ## 11 1.000 1.000 1288 | ## 12 0.161 0.353 1289 | ## 13 -0.065 0.174 1290 | ## 14 -0.199 0.325 1291 | ## 15 -0.436 0.700 1292 | ## 16 0.175 0.527 1293 | ## 17 -0.034 0.146 1294 | ## 18 0.000 0.000 1295 | ## 19 0.718 1.046 1296 | ## 20 -5.116 0.787 1297 | ## 21 0.587 0.842 1298 | ## 22 0.101 0.151 1299 
| ## 23 0.087 0.140 1300 | ## 24 0.182 0.254 1301 | ## 25 -1.096 0.698 1302 | ## 26 -0.023 0.462 1303 | ## 27 0.573 0.840 1304 | ## 28 0.000 0.000 1305 | ## 29 0.662 0.892 1306 | ## 30 0.857 1.137 1307 | ## 31 -0.010 0.241 1308 | ## 32 0.158 0.409 1309 | ## 33 0.001 0.049 1310 | ## 34 0.501 1.055 1311 | ## 35 0.857 1.137 1312 | ## 36 -0.010 0.186 1313 | ## 37 0.057 0.203 1314 | ## 38 0.004 0.077 1315 | ## 39 0.088 0.287 1316 | ## 40 -0.138 0.060 1317 | ## 41 -0.005 0.044 1318 | ## 42 -0.002 0.008 1319 | ## 43 -0.007 0.053 1320 | ## 44 -0.027 0.011 1321 | ## 45 0.002 0.052 1322 | ## 46 0.105 0.260 1323 | ## 47 -0.031 0.104 1324 | ## 48 0.002 0.067 1325 | ## 49 -0.014 0.014 1326 | ## 50 -0.135 0.061 1327 | -------------------------------------------------------------------------------- /case_studies/data/diabetes.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/data/diabetes.sav -------------------------------------------------------------------------------- /case_studies/figures/cs1-unnamed-chunk-12-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-12-1.pdf -------------------------------------------------------------------------------- /case_studies/figures/cs1-unnamed-chunk-12-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-12-1.png -------------------------------------------------------------------------------- /case_studies/figures/cs1-unnamed-chunk-15-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-15-1.png -------------------------------------------------------------------------------- /case_studies/figures/cs1-unnamed-chunk-18-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-18-1.png -------------------------------------------------------------------------------- /case_studies/figures/cs1-unnamed-chunk-19-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-19-1.png -------------------------------------------------------------------------------- /case_studies/figures/cs1-unnamed-chunk-22-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-22-1.png -------------------------------------------------------------------------------- /case_studies/figures/cs1-unnamed-chunk-23-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-23-1.png -------------------------------------------------------------------------------- /case_studies/figures/cs1-unnamed-chunk-24-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-24-1.png -------------------------------------------------------------------------------- /case_studies/figures/cs1-unnamed-chunk-25-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-25-1.png -------------------------------------------------------------------------------- /case_studies/figures/cs1-unnamed-chunk-26-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-26-1.png -------------------------------------------------------------------------------- /case_studies/figures/cs1-unnamed-chunk-29-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-29-1.png -------------------------------------------------------------------------------- /case_studies/figures/cs1-unnamed-chunk-33-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-33-1.png -------------------------------------------------------------------------------- /case_studies/figures/cs1-unnamed-chunk-9-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-9-1.pdf -------------------------------------------------------------------------------- /case_studies/figures/cs1-unnamed-chunk-9-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-9-1.png -------------------------------------------------------------------------------- /ci/scripts/runAllModels.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | cd scripts 3 | for f in *.py; do python "$f"; done -------------------------------------------------------------------------------- /data/aircraft.csv: -------------------------------------------------------------------------------- 1 | "","X1","X2","X3","X4","Y" 2 | "1",6.3,1.7,8176,4500,2.76 3 | "2",6,1.9,6699,3120,4.76 4 | "3",5.9,1.5,9663,6300,8.75 5 | "4",3,1.2,12837,9800,7.78 6 | "5",5,1.8,10205,4900,6.18 7 | "6",6.3,2,14890,6500,9.5 8 | "7",5.6,1.6,13836,8920,5.14 9 | "8",3.6,1.2,11628,14500,4.76 10 | "9",2,1.4,15225,14800,16.7 11 | 
"10",2.9,2.3,18691,10900,27.68 12 | "11",2.2,1.9,19350,16000,26.64 13 | "12",3.9,2.6,20638,16000,13.71 14 | "13",4.5,2,12843,7800,12.31 15 | "14",4.3,9.7,13384,17900,15.73 16 | "15",4,2.9,13307,10500,13.59 17 | "16",3.2,4.3,29855,24500,51.9 18 | "17",4.3,4.3,29277,30000,20.78 19 | "18",2.4,2.6,24651,24500,29.82 20 | "19",2.8,3.7,28539,34000,32.78 21 | "20",3.9,3.3,8085,8160,10.12 22 | "21",2.8,3.9,30328,35800,27.84 23 | "22",1.6,4.1,46172,37000,107.1 24 | "23",3.4,2.5,17836,19600,11.19 25 | -------------------------------------------------------------------------------- /data/awards.csv: -------------------------------------------------------------------------------- 1 | id,num_awards,prog,math 2 | 45,1,3,41 3 | 108,1,1,41 4 | 15,1,3,44 5 | 67,1,3,42 6 | 153,1,3,40 7 | 51,1,1,42 8 | 164,1,3,46 9 | 133,1,3,40 10 | 2,1,3,33 11 | 53,1,3,46 12 | 1,1,3,40 13 | 128,0,2,38 14 | 16,1,3,44 15 | 106,1,3,37 16 | 89,1,3,40 17 | 134,1,1,39 18 | 19,1,1,43 19 | 145,0,3,38 20 | 11,1,2,45 21 | 117,0,3,39 22 | 109,1,1,42 23 | 12,1,3,45 24 | 37,1,3,40 25 | 69,0,3,40 26 | 43,1,2,43 27 | 196,1,2,49 28 | 36,1,1,44 29 | 155,1,1,46 30 | 6,0,2,46 31 | 4,1,2,41 32 | 25,0,1,42 33 | 107,0,3,47 34 | 5,1,2,43 35 | 47,1,2,49 36 | 140,1,3,40 37 | 22,1,3,39 38 | 18,1,3,49 39 | 30,0,2,42 40 | 40,0,1,43 41 | 176,0,2,41 42 | 126,0,1,57 43 | 197,0,2,50 44 | 46,0,2,44 45 | 49,0,3,39 46 | 8,0,2,52 47 | 124,1,3,41 48 | 13,0,3,39 49 | 111,0,1,39 50 | 142,0,3,52 51 | 193,1,2,48 52 | 105,3,2,45 53 | 58,2,3,40 54 | 129,3,1,46 55 | 38,3,2,50 56 | 182,0,2,43 57 | 115,0,1,43 58 | 14,1,2,54 59 | 175,1,1,42 60 | 44,2,3,45 61 | 86,2,1,54 62 | 72,3,3,47 63 | 41,1,2,45 64 | 191,0,2,43 65 | 138,1,3,40 66 | 9,0,3,52 67 | 151,1,3,52 68 | 119,0,1,45 69 | 55,1,2,49 70 | 73,1,2,53 71 | 28,0,1,54 72 | 90,2,2,50 73 | 17,0,2,48 74 | 102,0,2,51 75 | 70,0,1,41 76 | 148,1,3,51 77 | 54,0,1,46 78 | 42,0,3,55 79 | 87,0,1,46 80 | 21,2,1,61 81 | 181,1,2,45 82 | 165,1,3,54 83 | 78,1,2,54 84 | 76,1,2,51 85 | 29,0,1,49 86 | 91,1,3,56 87 | 52,2,2,53 88 | 10,1,1,49 89 | 85,3,1,57 90 | 50,0,1,42 91 | 56,1,3,46 92 | 64,1,3,45 93 | 130,1,1,55 94 | 141,1,3,47 95 | 74,0,2,50 96 | 83,1,3,41 97 | 31,0,1,52 98 | 172,1,2,57 99 | 184,1,3,53 100 | 75,1,3,51 101 | 187,1,1,57 102 | 113,1,2,51 103 | 162,0,3,40 104 | 110,2,3,50 105 | 150,2,3,57 106 | 167,0,1,35 107 | 77,1,2,49 108 | 35,0,1,50 109 | 158,1,1,55 110 | 112,0,2,48 111 | 48,0,2,52 112 | 147,1,2,53 113 | 7,1,2,59 114 | 65,2,2,66 115 | 168,0,2,57 116 | 190,1,2,54 117 | 178,0,3,57 118 | 159,1,2,54 119 | 120,0,2,54 120 | 116,0,2,54 121 | 79,2,2,49 122 | 98,1,3,51 123 | 122,3,2,58 124 | 179,1,2,60 125 | 198,1,2,51 126 | 189,1,2,63 127 | 199,1,2,50 128 | 156,1,2,53 129 | 166,0,2,53 130 | 160,0,2,55 131 | 152,1,2,56 132 | 183,0,2,49 133 | 94,1,2,61 134 | 149,0,1,49 135 | 131,0,2,57 136 | 24,0,2,66 137 | 99,0,1,56 138 | 171,3,2,60 139 | 104,1,2,57 140 | 81,1,2,59 141 | 97,1,2,58 142 | 20,0,2,57 143 | 163,3,2,64 144 | 195,0,1,60 145 | 84,0,1,54 146 | 27,1,2,61 147 | 118,1,1,58 148 | 71,0,1,56 149 | 63,0,1,60 150 | 185,0,2,55 151 | 127,3,2,57 152 | 177,0,2,62 153 | 188,0,2,56 154 | 60,0,2,51 155 | 66,2,3,56 156 | 173,0,1,61 157 | 186,1,2,63 158 | 96,5,2,61 159 | 101,0,2,67 160 | 3,0,2,48 161 | 170,1,2,61 162 | 92,0,1,57 163 | 62,0,1,48 164 | 135,2,2,65 165 | 26,4,2,62 166 | 139,1,2,61 167 | 121,0,3,53 168 | 144,1,1,58 169 | 146,1,2,64 170 | 137,3,2,65 171 | 123,1,1,56 172 | 169,1,1,63 173 | 34,3,2,57 174 | 33,2,2,72 175 | 32,0,3,66 176 | 114,0,2,62 177 | 125,1,2,58 178 | 59,1,2,63 179 | 23,3,2,64 180 | 161,2,2,72 181 | 
103,0,2,64 182 | 194,6,2,69 183 | 136,4,2,70 184 | 154,1,2,66 185 | 157,0,1,58 186 | 93,2,2,62 187 | 39,2,2,67 188 | 88,1,2,64 189 | 192,2,2,63 190 | 80,1,2,68 191 | 200,1,2,75 192 | 180,0,2,69 193 | 82,1,2,65 194 | 174,2,2,71 195 | 95,5,2,71 196 | 61,1,2,60 197 | 100,2,2,71 198 | 143,2,3,75 199 | 68,1,2,71 200 | 57,0,2,72 201 | 132,3,2,73 202 | -------------------------------------------------------------------------------- /data/binary.dta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/data/binary.dta -------------------------------------------------------------------------------- /data/cereals.txt: -------------------------------------------------------------------------------- 1 | name mfr type calories protein fat sodium fiber carbo sugars potass vitamins shelf weight cups rating 2 | 100%_Bran N C 70 4 1 130 10 5 6 280 25 3 1 0.33 68.402973 3 | 100%_Natural_Bran Q C 120 3 5 15 2 8 8 135 0 3 1 1 33.983679 4 | All-Bran K C 70 4 1 260 9 7 5 320 25 3 1 0.33 59.425505 5 | All-Bran_with_Extra_Fiber K C 50 4 0 140 14 8 0 330 25 3 1 0.5 93.704912 6 | Almond_Delight R C 110 2 2 200 1 14 8 -1 25 3 1 0.75 34.384843 7 | Apple_Cinnamon_Cheerios G C 110 2 2 180 1.5 10.5 10 70 25 1 1 0.75 29.509541 8 | Apple_Jacks K C 110 2 0 125 1 11 14 30 25 2 1 1 33.174094 9 | Basic_4 G C 130 3 2 210 2 18 8 100 25 3 1.33 0.75 37.038562 10 | Bran_Chex R C 90 2 1 200 4 15 6 125 25 1 1 0.67 49.120253 11 | Bran_Flakes P C 90 3 0 210 5 13 5 190 25 3 1 0.67 53.313813 12 | Cap'n'Crunch Q C 120 1 2 220 0 12 12 35 25 2 1 0.75 18.042851 13 | Cheerios G C 110 6 2 290 2 17 1 105 25 1 1 1.25 50.764999 14 | Cinnamon_Toast_Crunch G C 120 1 3 210 0 13 9 45 25 2 1 0.75 19.823573 15 | Clusters G C 110 3 2 140 2 13 7 105 25 3 1 0.5 40.400208 16 | Cocoa_Puffs G C 110 1 1 180 0 12 13 55 25 2 1 1 22.736446 17 | Corn_Chex R C 110 2 0 280 0 22 3 25 25 1 1 1 41.445019 18 | Corn_Flakes K C 100 2 0 290 1 21 2 35 25 1 1 1 45.863324 19 | Corn_Pops K C 110 1 0 90 1 13 12 20 25 2 1 1 35.782791 20 | Count_Chocula G C 110 1 1 180 0 12 13 65 25 2 1 1 22.396513 21 | Cracklin'_Oat_Bran K C 110 3 3 140 4 10 7 160 25 3 1 0.5 40.448772 22 | Cream_of_Wheat_(Quick) N H 100 3 0 80 1 21 0 -1 0 2 1 1 64.533816 23 | Crispix K C 110 2 0 220 1 21 3 30 25 3 1 1 46.895644 24 | Crispy_Wheat_&_Raisins G C 100 2 1 140 2 11 10 120 25 3 1 0.75 36.176196 25 | Double_Chex R C 100 2 0 190 1 18 5 80 25 3 1 0.75 44.330856 26 | Froot_Loops K C 110 2 1 125 1 11 13 30 25 2 1 1 32.207582 27 | Frosted_Flakes K C 110 1 0 200 1 14 11 25 25 1 1 0.75 31.435973 28 | Frosted_Mini-Wheats K C 100 3 0 0 3 14 7 100 25 2 1 0.8 58.345141 29 | Fruit_&_Fibre_Dates,_Walnuts,_and_Oats P C 120 3 2 160 5 12 10 200 25 3 1.25 0.67 40.917047 30 | Fruitful_Bran K C 120 3 0 240 5 14 12 190 25 3 1.33 0.67 41.015492 31 | Fruity_Pebbles P C 110 1 1 135 0 13 12 25 25 2 1 0.75 28.025765 32 | Golden_Crisp P C 100 2 0 45 0 11 15 40 25 1 1 0.88 35.252444 33 | Golden_Grahams G C 110 1 1 280 0 15 9 45 25 2 1 0.75 23.804043 34 | Grape_Nuts_Flakes P C 100 3 1 140 3 15 5 85 25 3 1 0.88 52.076897 35 | Grape-Nuts P C 110 3 0 170 3 17 3 90 25 3 1 0.25 53.371007 36 | Great_Grains_Pecan P C 120 3 3 75 3 13 4 100 25 3 1 0.33 45.811716 37 | Honey_Graham_Ohs Q C 120 1 2 220 1 12 11 45 25 2 1 1 21.871292 38 | Honey_Nut_Cheerios G C 110 3 1 250 1.5 11.5 10 90 25 1 1 0.75 31.072217 39 | Honey-comb P C 110 1 0 180 0 14 11 35 25 1 1 1.33 28.742414 40 | 
Just_Right_Crunchy__Nuggets K C 110 2 1 170 1 17 6 60 100 3 1 1 36.523683 41 | Just_Right_Fruit_&_Nut K C 140 3 1 170 2 20 9 95 100 3 1.3 0.75 36.471512 42 | Kix G C 110 2 1 260 0 21 3 40 25 2 1 1.5 39.241114 43 | Life Q C 100 4 2 150 2 12 6 95 25 2 1 0.67 45.328074 44 | Lucky_Charms G C 110 2 1 180 0 12 12 55 25 2 1 1 26.734515 45 | Maypo A H 100 4 1 0 0 16 3 95 25 2 1 1 54.850917 46 | Muesli_Raisins,_Dates,_&_Almonds R C 150 4 3 95 3 16 11 170 25 3 1 1 37.136863 47 | Muesli_Raisins,_Peaches,_&_Pecans R C 150 4 3 150 3 16 11 170 25 3 1 1 34.139765 48 | Mueslix_Crispy_Blend K C 160 3 2 150 3 17 13 160 25 3 1.5 0.67 30.313351 49 | Multi-Grain_Cheerios G C 100 2 1 220 2 15 6 90 25 1 1 1 40.105965 50 | Nut&Honey_Crunch K C 120 2 1 190 0 15 9 40 25 2 1 0.67 29.924285 51 | Nutri-Grain_Almond-Raisin K C 140 3 2 220 3 21 7 130 25 3 1.33 0.67 40.692320 52 | Nutri-grain_Wheat K C 90 3 0 170 3 18 2 90 25 3 1 1 59.642837 53 | Oatmeal_Raisin_Crisp G C 130 3 2 170 1.5 13.5 10 120 25 3 1.25 0.5 30.450843 54 | Post_Nat._Raisin_Bran P C 120 3 1 200 6 11 14 260 25 3 1.33 0.67 37.840594 55 | Product_19 K C 100 3 0 320 1 20 3 45 100 3 1 1 41.503540 56 | Puffed_Rice Q C 50 1 0 0 0 13 0 15 0 3 0.5 1 60.756112 57 | Puffed_Wheat Q C 50 2 0 0 1 10 0 50 0 3 0.5 1 63.005645 58 | Quaker_Oat_Squares Q C 100 4 1 135 2 14 6 110 25 3 1 0.5 49.511874 59 | Quaker_Oatmeal Q H 100 5 2 0 2.7 -1 -1 110 0 1 1 0.67 50.828392 60 | Raisin_Bran K C 120 3 1 210 5 14 12 240 25 2 1.33 0.75 39.259197 61 | Raisin_Nut_Bran G C 100 3 2 140 2.5 10.5 8 140 25 3 1 0.5 39.703400 62 | Raisin_Squares K C 90 2 0 0 2 15 6 110 25 3 1 0.5 55.333142 63 | Rice_Chex R C 110 1 0 240 0 23 2 30 25 1 1 1.13 41.998933 64 | Rice_Krispies K C 110 2 0 290 0 22 3 35 25 1 1 1 40.560159 65 | Shredded_Wheat N C 80 2 0 0 3 16 0 95 0 1 0.83 1 68.235885 66 | Shredded_Wheat_'n'Bran N C 90 3 0 0 4 19 0 140 0 1 1 0.67 74.472949 67 | Shredded_Wheat_spoon_size N C 90 3 0 0 3 20 0 120 0 1 1 0.67 72.801787 68 | Smacks K C 110 2 1 70 1 9 15 40 25 2 1 0.75 31.230054 69 | Special_K K C 110 6 0 230 1 16 3 55 25 1 1 1 53.131324 70 | Strawberry_Fruit_Wheats N C 90 2 0 15 3 15 5 90 25 2 1 1 59.363993 71 | Total_Corn_Flakes G C 110 2 1 200 0 21 3 35 100 3 1 1 38.839746 72 | Total_Raisin_Bran G C 140 3 1 190 4 15 14 230 100 3 1.5 1 28.592785 73 | Total_Whole_Grain G C 100 3 1 200 3 16 3 110 100 3 1 1 46.658844 74 | Triples G C 110 2 1 250 0 21 3 60 25 3 1 0.75 39.106174 75 | Trix G C 110 1 1 140 0 13 12 25 25 2 1 1 27.753301 76 | Wheat_Chex R C 100 3 1 230 3 17 3 115 25 1 1 0.67 49.787445 77 | Wheaties G C 100 3 1 200 3 17 3 110 25 1 1 1 51.592193 78 | Wheaties_Honey_Gold G C 110 2 1 200 1 16 8 60 25 1 1 0.75 36.187559 79 | -------------------------------------------------------------------------------- /data/child_data.csv: -------------------------------------------------------------------------------- 1 | age,mem_span,iq,read_ab 2 | 6.7,4.4,95,7.2 3 | 5.9,4,90,6 4 | 5.5,4.1,105,6 5 | 6.2,4.8,98,6.6 6 | 6.4,5,106,7 7 | 7.3,5.5,100,7.2 8 | 5.7,3.6,88,5.3 9 | 6.15,5,95,6.4 10 | 7.5,5.4,96,6.6 11 | 6.9,5,104,7.3 12 | 4.1,3.9,108,5 13 | 5.5,4.2,90,5.8 14 | 6.9,4.5,91,6.6 15 | 7.2,5,92,6.8 16 | 4,4.2,101,5.6 17 | 7.3,5.5,100,7.2 18 | 5.9,4,90,6 19 | 5.5,4.2,90,5.8 20 | 4,4.2,101,5.6 21 | 5.9,4,90,6 -------------------------------------------------------------------------------- /data/drugtrial.csv: -------------------------------------------------------------------------------- 1 | subject,gender,dose,score 2 | 1,1,1,6 3 | 2,1,1,6 4 | 3,1,1,3 5 | 4,1,1,5 6 | 5,1,1,6 7 | 6,1,1,4 8 | 
7,1,1,5 9 | 8,1,1,4 10 | 9,1,1,4 11 | 10,1,1,5 12 | 11,1,1,4 13 | 12,1,1,3 14 | 13,1,2,6 15 | 14,1,2,8 16 | 15,1,2,7 17 | 16,1,2,8 18 | 17,1,2,6 19 | 18,1,2,8 20 | 19,1,2,8 21 | 20,1,2,6 22 | 21,1,2,7 23 | 22,1,2,8 24 | 23,1,2,6 25 | 24,1,2,7 26 | 25,2,1,2 27 | 26,2,1,5 28 | 27,2,1,2 29 | 28,2,1,4 30 | 29,2,1,5 31 | 30,2,1,7 32 | 31,2,1,4 33 | 32,2,1,1 34 | 33,2,1,2 35 | 34,2,1,7 36 | 35,2,1,4 37 | 36,2,1,0 38 | 37,2,2,2 39 | 38,2,2,3 40 | 39,2,2,4 41 | 40,2,2,0 42 | 41,2,2,0 43 | 42,2,2,1 44 | 43,2,2,2 45 | 44,2,2,2 46 | 45,2,2,4 47 | 46,2,2,3 48 | 47,2,2,6 49 | 48,2,2,3 -------------------------------------------------------------------------------- /data/hsbdemo.dta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/data/hsbdemo.dta -------------------------------------------------------------------------------- /data/iqdata.csv: -------------------------------------------------------------------------------- 1 | group,iq 2 | 1,44 3 | 1,40 4 | 1,44 5 | 1,39 6 | 1,25 7 | 1,37 8 | 1,31 9 | 1,40 10 | 1,22 11 | 1,34 12 | 1,39 13 | 1,20 14 | 1,39 15 | 1,42 16 | 1,41 17 | 2,36 18 | 2,40 19 | 2,37 20 | 2,35 21 | 2,39 22 | 2,40 23 | 2,36 24 | 2,38 25 | 2,24 26 | 2,27 27 | 2,29 28 | 2,24 29 | 2,45 30 | 2,44 31 | 2,44 32 | 3,52 33 | 3,50 34 | 3,51 35 | 3,52 36 | 3,45 37 | 3,49 38 | 3,47 39 | 3,46 40 | 3,47 41 | 3,47 42 | 3,46 43 | 3,45 44 | 3,50 45 | -------------------------------------------------------------------------------- /data/ologit.dta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/data/ologit.dta -------------------------------------------------------------------------------- /data/scents.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/data/scents.sav -------------------------------------------------------------------------------- /data/temprate.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/data/temprate.sav -------------------------------------------------------------------------------- /models/linearRegression.stan: -------------------------------------------------------------------------------- 1 | /* 2 | * Linear Regression 3 | * ----------------------------------------------------- 4 | * Copyright: Murat Koptur 5 | * Date: 05/08/2018 6 | * License: GPLv3 7 | */ 8 | 9 | data { 10 | int N; // number of observations 11 | vector[N] x; // day x_i 12 | vector[N] y; // weight in grams on day x_i 13 | } 14 | 15 | parameters { 16 | real alpha; // intercept 17 | real beta; // slope 18 | real<lower=0> sigma; // std deviation, constrained positive 19 | } 20 | 21 | model { 22 | alpha ~ normal(0, 100); 23 | beta ~ normal(0, 100); 24 | sigma ~ cauchy(0, 10); 25 | y ~ normal(alpha + beta * x, sigma); 26 | } 27 | 28 | generated quantities { 29 | // http://mc-stan.org/loo/reference/extract_log_lik.html 30 | vector[N] log_lik; 31 | for (n in 1:N) 32 | log_lik[n] = normal_lpdf(y[n] | alpha + beta * x[n], sigma); 33 | } --------------------------------------------------------------------------------
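The normal(0, 100) and cauchy(0, 10) priors above are very diffuse relative to the rat-growth data this model is fitted to in scripts/linearRegression.py. A minimal prior-predictive sketch (not a repo file; plain numpy, with the day values taken from that script) shows what the priors imply before any fitting:

```python
# Hypothetical prior-predictive check for linearRegression.stan (illustrative).
import numpy as np

rng = np.random.RandomState(1)
days = np.array([8.0, 15.0, 22.0, 29.0, 36.0])   # x from scripts/linearRegression.py

alpha = rng.normal(0, 100, size=1000)            # intercept prior
beta = rng.normal(0, 100, size=1000)             # slope prior
sigma = np.abs(rng.standard_cauchy(1000)) * 10   # half-Cauchy(0, 10) noise scale

y_sim = alpha[:, None] + beta[:, None] * days + rng.normal(0.0, sigma[:, None], size=(1000, 5))
print(np.percentile(y_sim, [5, 50, 95], axis=0))
```

Simulated weights routinely span several thousand grams, positive or negative, against observed values of 160-324 g; that is acceptable for weakly informative priors on five observations, but worth knowing when interpreting the posterior.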
/models/logisticRegression.stan: -------------------------------------------------------------------------------- 1 | /* 2 | * Logistic Regression 3 | * ----------------------------------------------------- 4 | * Copyright: Murat Koptur 5 | * Date: 09/08/2018 6 | * License: GPLv3 7 | */ 8 | 9 | data { 10 | int N_train; 11 | int N_test; 12 | int D; 13 | row_vector[D] x_train[N_train]; 14 | row_vector[D] x_test[N_test]; 15 | int y_train[N_train]; 16 | } 17 | 18 | parameters { 19 | real alpha; 20 | vector[D] beta; 21 | } 22 | 23 | model { 24 | alpha ~ normal(0, 10); 25 | beta ~ student_t(1, 0, 2.5); // weakly informative priors 26 | for (n in 1:N_train) 27 | y_train[n] ~ bernoulli_logit(x_train[n] * beta + alpha); 28 | } 29 | 30 | generated quantities { 31 | int y_pred[N_test]; 32 | for (n in 1:N_test) { 33 | y_pred[n] = bernoulli_logit_rng(x_test[n] * beta + alpha); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /models/multinomialLogisticRegression.stan: -------------------------------------------------------------------------------- 1 | /* 2 | * Multinomial Logistic Regression 3 | * ----------------------------------------------------- 4 | * Copyright: Murat Koptur 5 | * Date: 11/08/2018 6 | * License: GPLv3 7 | */ 8 | 9 | data { 10 | int N; // number of observations 11 | int K; // number of possible outcomes 12 | int D; // D is dimension of x_n vectors 13 | vector[D] x[N]; 14 | int y[N]; 15 | } 16 | 17 | parameters { 18 | matrix[K, D] beta; 19 | } 20 | 21 | model { 22 | for (k in 1:K) 23 | beta[k] ~ normal(0, 1); 24 | for (n in 1:N) 25 | y[n] ~ categorical_logit(beta * x[n]); 26 | } 27 | -------------------------------------------------------------------------------- /models/multipleLinearRegression.stan: -------------------------------------------------------------------------------- 1 | /* 2 | * Multiple Linear Regression 3 | * ----------------------------------------------------- 4 | * Copyright: Murat Koptur 5 | * Date: 06/08/2018 6 | * License: GPLv3 7 | */ 8 | 9 | data { 10 | int N; // number of observations 11 | vector[N] fat; // grams of fat 12 | vector[N] weight; // weight in ounces of one serving 13 | vector[N] cups; // number of cups in one serving 14 | vector[N] rating; // a rating of the cereals 15 | } 16 | 17 | parameters { 18 | real b_fat; // coefficients 19 | real b_weight; 20 | real b_cups; 21 | real beta; 22 | real<lower=0> sigma; // std deviation, constrained positive 23 | } 24 | 25 | model { 26 | b_fat ~ normal(0, 10); 27 | b_weight ~ normal(0, 10); 28 | b_cups ~ normal(0, 10); 29 | beta ~ normal(0, 10); 30 | sigma ~ cauchy(0, 5); 31 | rating ~ normal(beta + b_fat * fat + b_weight * weight + 32 | b_cups * cups, sigma); 33 | } 34 | 35 | generated quantities { 36 | real rating_pred[N]; // predictions 37 | real log_lik[N]; 38 | for (n in 1:N) 39 | rating_pred[n] = normal_rng(beta + b_fat * fat[n] + b_weight * weight[n] + 40 | b_cups * cups[n], sigma); 41 | for (n in 1:N) 42 | log_lik[n] = normal_lpdf(rating[n] | beta + b_fat * fat[n] + b_weight * weight[n] + 43 | b_cups * cups[n], sigma); 44 | } 45 | --------------------------------------------------------------------------------
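This model's data block expects four parallel vectors taken from the cereal ratings table shown earlier. A minimal, illustrative sketch of one way to assemble and pass them (column names follow the header of data/cereals.txt; not a repo file):

```python
# Hypothetical data assembly for multipleLinearRegression.stan (illustrative).
import pandas as pd
import pystan

cereals = pd.read_csv("../data/cereals.txt", sep=r"\s+")  # whitespace-delimited table
stan_data = {
    'N': len(cereals),
    'fat': cereals['fat'].values,
    'weight': cereals['weight'].values,
    'cups': cereals['cups'].values,
    'rating': cereals['rating'].values,
}
sm = pystan.StanModel(file="../models/multipleLinearRegression.stan")
fit = sm.sampling(data=stan_data)
```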
/models/onewayANOVA.stan: -------------------------------------------------------------------------------- 1 | /* 2 | * One-way ANOVA 3 | * ----------------------------------------------------- 4 | * Copyright: Murat Koptur 5 | * Date: 17/08/2018 6 | * License: GPLv3 7 | */ 8 | 9 | data { 10 | int N; 11 | int x1[N]; 12 | int x2[N]; 13 | int y[N]; 14 | } 15 | 16 | parameters { 17 | real alpha; 18 | real beta_x1; 19 | real beta_x2; 20 | real<lower=0> sigma; 21 | } 22 | 23 | model { 24 | alpha ~ normal(0, 10); 25 | beta_x1 ~ normal(0, 10); 26 | beta_x2 ~ normal(0, 10); 27 | sigma ~ normal(0, 5); 28 | for (i in 1:N) 29 | y[i] ~ normal(alpha + beta_x1 * x1[i] + beta_x2 * x2[i], sigma); 30 | } 31 | -------------------------------------------------------------------------------- /models/orderedLogisticRegression.stan: -------------------------------------------------------------------------------- 1 | /* 2 | * Ordered Logistic Regression 3 | * ----------------------------------------------------- 4 | * Copyright: Murat Koptur 5 | * Date: 13/08/2018 6 | * License: GPLv3 7 | */ 8 | 9 | data { 10 | int N; 11 | int D; 12 | int K; 13 | row_vector[D] x[N]; 14 | int y[N]; 15 | } 16 | 17 | parameters { 18 | vector[D] beta; 19 | ordered[K-1] c; 20 | } 21 | 22 | model { 23 | for (n in 1:N) 24 | y[n] ~ ordered_logistic(x[n] * beta, c); 25 | } 26 | -------------------------------------------------------------------------------- /models/robustRegression.stan: -------------------------------------------------------------------------------- 1 | /* 2 | * Robust Regression 3 | * ----------------------------------------------------- 4 | * Copyright: Murat Koptur 5 | * Date: 08/08/2018 6 | * License: GPLv3 7 | */ 8 | 9 | data { 10 | int N; // number of observations 11 | vector[N] X1; // Aspect Ratio 12 | vector[N] X2; // Lift-to-Drag Ratio 13 | vector[N] X3; // Weight 14 | vector[N] X4; // Thrust 15 | vector[N] Y; // Cost 16 | } 17 | 18 | parameters { 19 | real b_X1; 20 | real b_X2; 21 | real b_X3; 22 | real b_X4; 23 | real beta; 24 | real<lower=0> sigma; 25 | real<lower=0> nu; 26 | } 27 | 28 | model { 29 | b_X1 ~ normal(0, 1e6); 30 | b_X2 ~ normal(0, 1e6); 31 | b_X3 ~ normal(0, 1e6); 32 | b_X4 ~ normal(0, 1e6); 33 | beta ~ normal(0, 1e3); 34 | sigma ~ normal(0, 5); 35 | nu ~ gamma(2, 0.1); 36 | Y ~ student_t(nu, 37 | beta + b_X1 * X1 + b_X2 * X2 + b_X3 * X3 + b_X4 * X4, 38 | sigma); 39 | } 40 | 41 | generated quantities { 42 | real Y_pred[N]; // predictions 43 | for (n in 1:N) { 44 | Y_pred[n] = student_t_rng(nu, 45 | beta + b_X1 * X1[n] + 46 | b_X2 * X2[n] + b_X3 * X3[n] + 47 | b_X4 * X4[n], 48 | sigma); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /models/twowayANOVA.stan: -------------------------------------------------------------------------------- 1 | /* 2 | * Two-way ANOVA 3 | * ----------------------------------------------------- 4 | * Copyright: Murat Koptur 5 | * Date: 17/08/2018 6 | * License: GPLv3 7 | */ 8 | 9 | data { 10 | int N; 11 | int x1[N]; 12 | int x2[N]; 13 | int y[N]; 14 | } 15 | 16 | parameters { 17 | real alpha; 18 | real beta_x1; 19 | real beta_x2; 20 | real beta_x3; 21 | real<lower=0> sigma; 22 | } 23 | 24 | model { 25 | alpha ~ normal(0, 10); 26 | beta_x1 ~ normal(0, 10); 27 | beta_x2 ~ normal(0, 10); 28 | beta_x3 ~ normal(0, 10); 29 | sigma ~ normal(0, 5); 30 | for (i in 1:N) 31 | y[i] ~ normal(alpha + beta_x1 * x1[i] + beta_x2 * x2[i] + beta_x3 * x1[i] * x2[i], sigma); 32 | } 33 | -------------------------------------------------------------------------------- /notebooks/Bayes Factor.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Bayes Factors" 3 | author: "Murat Koptur" 4 | date: "`r format(Sys.time(), '%d %B %Y')`" 5 | output: rmarkdown::github_document 6 | --- 7 | 8 | ```{r echo=FALSE} 9 | knitr::opts_chunk$set(fig.path='figures/bf-') 10 | ``` 11 | 12 | ```{r} 13 | library(haven) 14 |
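# haven reads the SPSS file below; BayesFactor (loaded next) supplies ttestBF() for the JZS one-sample test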
library(BayesFactor) 15 | ``` 16 | 17 | ```{r} 18 | scents <- read_spss("../data/scents.sav") 19 | head(scents) 20 | ``` 21 | 22 | ```{r} 23 | scents$diffs <- scents$noscent - scents$scent 24 | head(scents) 25 | ``` 26 | 27 | ```{r} 28 | bf <- ttestBF(scents$diffs) 29 | bf 30 | ``` 31 | 32 | ```{r} 33 | sprintf("Bayes factor: %f", exp(bf@bayesFactor$bf)) 34 | ``` 35 | 36 | -------------------------------------------------------------------------------- /notebooks/Bayes_Factor.md: -------------------------------------------------------------------------------- 1 | Bayes Factors 2 | ================ 3 | Murat Koptur 4 | 25 Ağustos 2018 5 | 6 | ``` r 7 | library(haven) 8 | library(BayesFactor) 9 | ``` 10 | 11 | ## Loading required package: coda 12 | 13 | ## Loading required package: Matrix 14 | 15 | ## ************ 16 | ## Welcome to BayesFactor 0.9.12-4.2. If you have questions, please contact Richard Morey (richarddmorey@gmail.com). 17 | ## 18 | ## Type BFManual() to open the manual. 19 | ## ************ 20 | 21 | ``` r 22 | scents <- read_spss("../data/scents.sav") 23 | head(scents) 24 | ``` 25 | 26 | ## # A tibble: 6 x 4 27 | ## part sex noscent scent 28 | ## 29 | ## 1 1 1 27.7 30.6 30 | ## 2 2 2 57.2 43.3 31 | ## 3 3 1 57.9 53.4 32 | ## 4 4 1 38 37.4 33 | ## 5 5 1 57.9 48.6 34 | ## 6 6 2 32 35.5 35 | 36 | ``` r 37 | scents$diffs <- scents$noscent - scents$scent 38 | head(scents) 39 | ``` 40 | 41 | ## # A tibble: 6 x 5 42 | ## part sex noscent scent diffs 43 | ## 44 | ## 1 1 1 27.7 30.6 -2.9 45 | ## 2 2 2 57.2 43.3 13.9 46 | ## 3 3 1 57.9 53.4 4.5 47 | ## 4 4 1 38 37.4 0.6 48 | ## 5 5 1 57.9 48.6 9.30 49 | ## 6 6 2 32 35.5 -3.5 50 | 51 | ``` r 52 | bf <- ttestBF(scents$diffs) 53 | bf 54 | ``` 55 | 56 | ## Bayes factor analysis 57 | ## -------------- 58 | ## [1] Alt., r=0.707 : 0.2294321 ±0.03% 59 | ## 60 | ## Against denominator: 61 | ## Null, mu = 0 62 | ## --- 63 | ## Bayes factor type: BFoneSample, JZS 64 | 65 | ``` r 66 | sprintf("Bayes factor: %f", exp(bf@bayesFactor$bf)) 67 | ``` 68 | 69 | ## [1] "Bayes factor: 0.229432" 70 | -------------------------------------------------------------------------------- /notebooks/Correlation Analysis.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Correlation Analysis" 3 | author: "Murat Koptur" 4 | date: "`r format(Sys.time(), '%d %B %Y')`" 5 | output: rmarkdown::github_document 6 | --- 7 | 8 | ```{r echo=FALSE} 9 | knitr::opts_chunk$set(fig.path='figures/corr-') 10 | ``` 11 | 12 | ```{r} 13 | library(ggpubr) 14 | library(haven) 15 | ``` 16 | 17 | ```{r} 18 | temprate <- read_sav("../data/temprate.sav") 19 | head(temprate) 20 | ``` 21 | 22 | ```{r} 23 | cor.test(temprate$temp, temprate$hrtrate, method = "pearson") 24 | ``` 25 | 26 | ```{r} 27 | ggscatter( 28 | data = temprate, 29 | x = "temp", 30 | y = "hrtrate", 31 | add = "reg.line", 32 | conf.int = TRUE, 33 | cor.coef = TRUE, 34 | cor.method = "pearson", 35 | xlab = "Temperature", 36 | ylab = "Heart Rate" 37 | ) 38 | ``` 39 | 40 | -------------------------------------------------------------------------------- /notebooks/Correlation_Analysis.md: -------------------------------------------------------------------------------- 1 | Correlation Analysis 2 | ================ 3 | Murat Koptur 4 | 24 Ağustos 2018 5 | 6 | ``` r 7 | library(ggpubr) 8 | ``` 9 | 10 | ## Loading required package: ggplot2 11 | 12 | ## Loading required package: magrittr 13 | 14 | ``` r 15 | library(haven) 16 | ``` 17 | 18 | ``` r 19 | temprate <- 
read_sav("../data/temprate.sav") 20 | head(temprate) 21 | ``` 22 | 23 | ## # A tibble: 6 x 2 24 | ## temp hrtrate 25 | ## 26 | ## 1 35.7 70 27 | ## 2 35.9 71 28 | ## 3 36.1 74 29 | ## 4 36.1 80 30 | ## 5 36.2 73 31 | ## 6 36.2 75 32 | 33 | ``` r 34 | cor.test(temprate$temp, temprate$hrtrate, method = "pearson") 35 | ``` 36 | 37 | ## 38 | ## Pearson's product-moment correlation 39 | ## 40 | ## data: temprate$temp and temprate$hrtrate 41 | ## t = 2.9668, df = 128, p-value = 0.003591 42 | ## alternative hypothesis: true correlation is not equal to 0 43 | ## 95 percent confidence interval: 44 | ## 0.08519113 0.40802170 45 | ## sample estimates: 46 | ## cor 47 | ## 0.2536564 48 | 49 | ``` r 50 | ggscatter( 51 | data = temprate, 52 | x = "temp", 53 | y = "hrtrate", 54 | add = "reg.line", 55 | conf.int = TRUE, 56 | cor.coef = TRUE, 57 | cor.method = "pearson", 58 | xlab = "Temperature", 59 | ylab = "Heart Rate" 60 | ) 61 | ``` 62 | 63 | ![](figures/corr-unnamed-chunk-5-1.png) 64 | -------------------------------------------------------------------------------- /notebooks/Factor Analysis.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Factor Analysis" 3 | author: "Murat Koptur" 4 | date: "`r format(Sys.time(), '%d %B %Y')`" 5 | output: rmarkdown::github_document 6 | --- 7 | 8 | ```{r echo=FALSE} 9 | knitr::opts_chunk$set(fig.path='figures/factor-') 10 | ``` 11 | 12 | ```{r} 13 | library(readr) 14 | library(knitr) 15 | library(psych) 16 | ``` 17 | 18 | ```{r results='asis'} 19 | bfi <- read_csv("../data/bfi.csv", 20 | col_types = cols(X1 = col_skip(), age = col_skip(), 21 | education = col_skip(), gender = col_skip())) 22 | kable(head(bfi)) 23 | ``` 24 | ```{r} 25 | KMO(bfi) 26 | ``` 27 | ```{r} 28 | fa.parallel(bfi) 29 | ``` 30 | ```{r} 31 | bfi.fa <- fa(bfi, nfactors = 6, fm="pa", max.iter = 100) 32 | fa.diagram(bfi.fa) 33 | ``` 34 | 35 | -------------------------------------------------------------------------------- /notebooks/Factor_Analysis.md: -------------------------------------------------------------------------------- 1 | Factor Analysis 2 | ================ 3 | Murat Koptur 4 | 24 Ağustos 2018 5 | 6 | ``` r 7 | library(readr) 8 | library(knitr) 9 | library(psych) 10 | ``` 11 | 12 | ``` r 13 | bfi <- read_csv("../data/bfi.csv", 14 | col_types = cols(X1 = col_skip(), age = col_skip(), 15 | education = col_skip(), gender = col_skip())) 16 | ``` 17 | 18 | ## Warning: Missing column names filled in: 'X1' [1] 19 | 20 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, : 21 | ## length of NULL cannot be changed 22 | 23 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, : 24 | ## length of NULL cannot be changed 25 | 26 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, : 27 | ## length of NULL cannot be changed 28 | 29 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, : 30 | ## length of NULL cannot be changed 31 | 32 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, : 33 | ## length of NULL cannot be changed 34 | 35 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, : 36 | ## length of NULL cannot be changed 37 | 38 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, : 39 | ## length of NULL cannot be changed 40 | 41 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, : 42 | ## length of NULL cannot be changed 43 | 44 | ## Warning 
in read_tokens_(data, tokenizer, col_specs, col_names, locale_, : 45 | ## length of NULL cannot be changed 46 | 47 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, : 48 | ## length of NULL cannot be changed 49 | 50 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, : 51 | ## length of NULL cannot be changed 52 | 53 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, : 54 | ## length of NULL cannot be changed 55 | 56 | ``` r 57 | kable(head(bfi)) 58 | ``` 59 | 60 | | A1| A2| A3| A4| A5| C1| C2| C3| C4| C5| E1| E2| E3| E4| E5| N1| N2| N3| N4| N5| O1| O2| O3| O4| O5| 61 | |----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:| 62 | | 2| 4| 3| 4| 4| 2| 3| 3| 4| 4| 3| 3| 3| 4| 4| 3| 4| 2| 2| 3| 3| 6| 3| 4| 3| 63 | | 2| 4| 5| 2| 5| 5| 4| 4| 3| 4| 1| 1| 6| 4| 3| 3| 3| 3| 5| 5| 4| 2| 4| 3| 3| 64 | | 5| 4| 5| 4| 4| 4| 5| 4| 2| 5| 2| 4| 4| 4| 5| 4| 5| 4| 2| 3| 4| 2| 5| 5| 2| 65 | | 4| 4| 6| 5| 5| 4| 4| 3| 5| 5| 5| 3| 4| 4| 4| 2| 5| 2| 4| 1| 3| 3| 4| 3| 5| 66 | | 2| 3| 3| 4| 5| 4| 4| 5| 3| 2| 2| 2| 5| 4| 5| 2| 3| 4| 4| 3| 3| 3| 4| 3| 3| 67 | | 6| 6| 5| 6| 5| 6| 6| 6| 1| 3| 2| 1| 6| 5| 6| 3| 5| 2| 2| 3| 4| 3| 5| 6| 1| 68 | 69 | ``` r 70 | KMO(bfi) 71 | ``` 72 | 73 | ## Kaiser-Meyer-Olkin factor adequacy 74 | ## Call: KMO(r = bfi) 75 | ## Overall MSA = 0.85 76 | ## MSA for each item = 77 | ## A1 A2 A3 A4 A5 C1 C2 C3 C4 C5 E1 E2 E3 E4 E5 78 | ## 0.74 0.84 0.87 0.87 0.90 0.83 0.79 0.85 0.82 0.86 0.83 0.88 0.89 0.87 0.89 79 | ## N1 N2 N3 N4 N5 O1 O2 O3 O4 O5 80 | ## 0.78 0.78 0.86 0.88 0.86 0.85 0.78 0.84 0.76 0.76 81 | 82 | ``` r 83 | fa.parallel(bfi) 84 | ``` 85 | 86 | ![](figures/factor-unnamed-chunk-5-1.png) 87 | 88 | ## Parallel analysis suggests that the number of factors = 6 and the number of components = 6 89 | 90 | ``` r 91 | bfi.fa <- fa(bfi, nfactors = 6, fm="pa", max.iter = 100) 92 | ``` 93 | 94 | ## Loading required namespace: GPArotation 95 | 96 | ``` r 97 | fa.diagram(bfi.fa) 98 | ``` 99 | 100 | ![](figures/factor-unnamed-chunk-6-1.png) 101 | -------------------------------------------------------------------------------- /notebooks/Multiple Linear Regression with interaction terms.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Multiple Linear Regression with interaction terms" 3 | author: "Murat Koptur" 4 | date: "`r format(Sys.time(), '%d %B %Y')`" 5 | output: rmarkdown::github_document 6 | --- 7 | 8 | ```{r echo=FALSE} 9 | knitr::opts_chunk$set(fig.path='figures/multipleLin-') 10 | ``` 11 | 12 | ```{r} 13 | library(GGally) 14 | library(ggplot2) 15 | library(readr) 16 | library(reshape2) 17 | ``` 18 | 19 | ```{r} 20 | child_data <- read_csv("../data/child_data.csv") 21 | head(child_data) 22 | ``` 23 | 24 | ```{r} 25 | child_data_melted <- melt(child_data) 26 | head(child_data_melted) 27 | 28 | ggplot(data = child_data_melted, aes(x = value)) + 29 | geom_histogram(aes(y = ..ncount..)) + 30 | geom_density(aes(y = ..scaled..)) + 31 | facet_wrap(~variable, scales = "free") + 32 | labs(x = "Values", y = "Frequencies", title = "Histograms") 33 | ``` 34 | 35 | ```{r} 36 | ggpairs(child_data) 37 | ``` 38 | 39 | ```{r} 40 | child_data_scaled <- scale(child_data) 41 | head(child_data_scaled) 42 | 43 | model1 <- lm(read_ab ~ age + iq, data = as.data.frame(child_data_scaled)) 44 | summary(model1) 45 | 46 | model2 <- lm(read_ab ~ age + mem_span, data = 
as.data.frame(child_data_scaled)) 47 | summary(model2) 48 | 49 | model3 <- lm(read_ab ~ age + iq + age:iq, data = as.data.frame(child_data_scaled)) 50 | summary(model3) 51 | ``` 52 | 53 | -------------------------------------------------------------------------------- /notebooks/Multiple_Linear_Regression_with_interaction_terms.md: -------------------------------------------------------------------------------- 1 | Multiple Linear Regression with interaction terms 2 | ================ 3 | Murat Koptur 4 | 24 Ağustos 2018 5 | 6 | ``` r 7 | library(GGally) 8 | ``` 9 | 10 | ## Loading required package: ggplot2 11 | 12 | ``` r 13 | library(ggplot2) 14 | library(readr) 15 | library(reshape2) 16 | ``` 17 | 18 | ``` r 19 | child_data <- read_csv("../data/child_data.csv") 20 | ``` 21 | 22 | ## Parsed with column specification: 23 | ## cols( 24 | ## age = col_double(), 25 | ## mem_span = col_double(), 26 | ## iq = col_integer(), 27 | ## read_ab = col_double() 28 | ## ) 29 | 30 | ``` r 31 | head(child_data) 32 | ``` 33 | 34 | ## # A tibble: 6 x 4 35 | ## age mem_span iq read_ab 36 | ## 37 | ## 1 6.7 4.4 95 7.2 38 | ## 2 5.9 4 90 6 39 | ## 3 5.5 4.1 105 6 40 | ## 4 6.2 4.8 98 6.6 41 | ## 5 6.4 5 106 7 42 | ## 6 7.3 5.5 100 7.2 43 | 44 | ``` r 45 | child_data_melted <- melt(child_data) 46 | ``` 47 | 48 | ## No id variables; using all as measure variables 49 | 50 | ``` r 51 | head(child_data_melted) 52 | ``` 53 | 54 | ## variable value 55 | ## 1 age 6.7 56 | ## 2 age 5.9 57 | ## 3 age 5.5 58 | ## 4 age 6.2 59 | ## 5 age 6.4 60 | ## 6 age 7.3 61 | 62 | ``` r 63 | ggplot(data = child_data_melted, aes(x = value)) + 64 | geom_histogram(aes(y = ..ncount..)) + 65 | geom_density(aes(y = ..scaled..)) + 66 | facet_wrap(~variable, scales = "free") + 67 | labs(x = "Values", y = "Frequencies", title = "Histograms") 68 | ``` 69 | 70 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 71 | 72 | ![](figures/multipleLin-unnamed-chunk-4-1.png) 73 | 74 | ``` r 75 | ggpairs(child_data) 76 | ``` 77 | 78 | ![](figures/multipleLin-unnamed-chunk-5-1.png) 79 | 80 | ``` r 81 | child_data_scaled <- scale(child_data) 82 | head(child_data_scaled) 83 | ``` 84 | 85 | ## age mem_span iq read_ab 86 | ## [1,] 0.6268603 -0.2164352 -0.2376403 1.309125 87 | ## [2,] -0.1188471 -0.9090277 -1.0297747 -0.436375 88 | ## [3,] -0.4917008 -0.7358796 1.3466285 -0.436375 89 | ## [4,] 0.1607932 0.4761574 0.2376403 0.436375 90 | ## [5,] 0.3472200 0.8224536 1.5050553 1.018208 91 | ## [6,] 1.1861409 1.6881943 0.5544941 1.309125 92 | 93 | ``` r 94 | model1 <- lm(read_ab ~ age + iq, data = as.data.frame(child_data_scaled)) 95 | summary(model1) 96 | ``` 97 | 98 | ## 99 | ## Call: 100 | ## lm(formula = read_ab ~ age + iq, data = as.data.frame(child_data_scaled)) 101 | ## 102 | ## Residuals: 103 | ## Min 1Q Median 3Q Max 104 | ## -0.85644 -0.02059 0.04402 0.20506 0.81633 105 | ## 106 | ## Coefficients: 107 | ## Estimate Std. Error t value Pr(>|t|) 108 | ## (Intercept) -2.302e-16 9.998e-02 0.000 1.00000 109 | ## age 9.117e-01 1.047e-01 8.711 1.12e-07 *** 110 | ## iq 3.313e-01 1.047e-01 3.165 0.00565 ** 111 | ## --- 112 | ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1 113 | ## 114 | ## Residual standard error: 0.4471 on 17 degrees of freedom 115 | ## Multiple R-squared: 0.8211, Adjusted R-squared: 0.8001 116 | ## F-statistic: 39.02 on 2 and 17 DF, p-value: 4.434e-07 117 | 118 | ``` r 119 | model2 <- lm(read_ab ~ age + mem_span, data = as.data.frame(child_data_scaled)) 120 | summary(model2) 121 | ``` 122 | 123 | ## 124 | ## Call: 125 | ## lm(formula = read_ab ~ age + mem_span, data = as.data.frame(child_data_scaled)) 126 | ## 127 | ## Residuals: 128 | ## Min 1Q Median 3Q Max 129 | ## -0.9536 -0.2206 0.0244 0.1668 1.0719 130 | ## 131 | ## Coefficients: 132 | ## Estimate Std. Error t value Pr(>|t|) 133 | ## (Intercept) 1.363e-16 1.038e-01 0.000 1.00000 134 | ## age 5.296e-01 1.542e-01 3.435 0.00316 ** 135 | ## mem_span 4.377e-01 1.542e-01 2.839 0.01135 * 136 | ## --- 137 | ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 138 | ## 139 | ## Residual standard error: 0.4643 on 17 degrees of freedom 140 | ## Multiple R-squared: 0.8071, Adjusted R-squared: 0.7844 141 | ## F-statistic: 35.57 on 2 and 17 DF, p-value: 8.414e-07 142 | 143 | ``` r 144 | model3 <- lm(read_ab ~ age + iq + age:iq, data = as.data.frame(child_data_scaled)) 145 | summary(model3) 146 | ``` 147 | 148 | ## 149 | ## Call: 150 | ## lm(formula = read_ab ~ age + iq + age:iq, data = as.data.frame(child_data_scaled)) 151 | ## 152 | ## Residuals: 153 | ## Min 1Q Median 3Q Max 154 | ## -0.82042 -0.08630 -0.01172 0.18550 0.89331 155 | ## 156 | ## Coefficients: 157 | ## Estimate Std. Error t value Pr(>|t|) 158 | ## (Intercept) 0.03942 0.09964 0.396 0.69764 159 | ## age 0.79560 0.12613 6.308 1.04e-05 *** 160 | ## iq 0.38369 0.10642 3.605 0.00237 ** 161 | ## age:iq 0.20914 0.13667 1.530 0.14549 162 | ## --- 163 | ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1 164 | ## 165 | ## Residual standard error: 0.4305 on 16 degrees of freedom 166 | ## Multiple R-squared: 0.844, Adjusted R-squared: 0.8147 167 | ## F-statistic: 28.85 on 3 and 16 DF, p-value: 1.089e-06 168 | -------------------------------------------------------------------------------- /notebooks/Poisson Regression.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Poisson Regression" 3 | author: "Murat Koptur" 4 | date: "`r format(Sys.time(), '%d %B %Y')`" 5 | output: rmarkdown::github_document 6 | --- 7 | 8 | ```{r echo=FALSE} 9 | knitr::opts_chunk$set(fig.path='figures/poisson-') 10 | ``` 11 | 12 | 13 | ```{r} 14 | library(bayesplot) 15 | library(ggplot2) 16 | library(readr) 17 | library(reshape2) 18 | library(rstanarm) 19 | ``` 20 | 21 | ```{r} 22 | awards <- read_csv("../data/awards.csv", 23 | col_types = cols(id = col_skip(), prog = col_factor(levels = c("1", "2", "3")))) 24 | head(awards) 25 | ``` 26 | 27 | ```{r} 28 | awards_melted <- melt(awards) 29 | head(awards_melted) 30 | ``` 31 | 32 | ```{r} 33 | ggplot(data = awards_melted, aes(x = value)) + 34 | geom_histogram(aes(y = ..ncount..)) + 35 | geom_density(aes(y = ..scaled..)) + 36 | facet_wrap(~variable, scales = "free") + 37 | labs(x = "Values", y = "Frequencies", title = "Histograms") 38 | ``` 39 | 40 | ```{r} 41 | awards$math <- scale(awards$math) 42 | ``` 43 | 44 | ```{r} 45 | model1 <- glm(num_awards ~ math + prog, data = awards, family = poisson) 46 | summary(model1) 47 | ``` 48 | 49 | ```{r} 50 | model2 <- stan_glm(num_awards ~ math + prog, data = awards, family = poisson, 51 | prior = normal(0, 10), prior_intercept = normal(0, 10)) 52 | summary(model2) 53 | ``` 54 | 55 | ```{r} 56 | posterior_interval(model2, prob = 0.95) 57 | plot(model2, plotfun = "areas", prob = 0.95) 58 | ``` 59 | 60 | ```{r} 61 | pp_check(model2) 62 | ``` 63 | 64 | -------------------------------------------------------------------------------- /notebooks/Poisson_Regression.md: -------------------------------------------------------------------------------- 1 | Poisson Regression 2 | ================ 3 | Murat Koptur 4 | 24 Ağustos 2018 5 | 6 | ``` r 7 | library(bayesplot) 8 | ``` 9 | 10 | ## This is bayesplot version 1.6.0 11 | 12 | ## - Online documentation and vignettes at mc-stan.org/bayesplot 13 | 14 | ## - bayesplot theme set to bayesplot::theme_default() 15 | 16 | ## * Does _not_ affect other ggplot2 plots 17 | 18 | ## * See ?bayesplot_theme_set for details on theme setting 19 | 20 | ``` r 21 | library(ggplot2) 22 | library(readr) 23 | library(reshape2) 24 | library(rstanarm) 25 | ``` 26 | 27 | ## Loading required package: Rcpp 28 | 29 | ## rstanarm (Version 2.17.4, packaged: 2018-04-13 01:51:52 UTC) 30 | 31 | ## - Do not expect the default priors to remain the same in future rstanarm versions. 32 | 33 | ## Thus, R scripts should specify priors explicitly, even if they are just the defaults. 34 | 35 | ## - For execution on a local, multicore CPU with excess RAM we recommend calling 36 | 37 | ## options(mc.cores = parallel::detectCores()) 38 | 39 | ## - Plotting theme set to bayesplot::theme_default(). 
40 | 41 | ``` r 42 | awards <- read_csv("../data/awards.csv", 43 | col_types = cols(id = col_skip(), prog = col_factor(levels = c("1", "2", "3")))) 44 | ``` 45 | 46 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, : 47 | ## length of NULL cannot be changed 48 | 49 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, : 50 | ## length of NULL cannot be changed 51 | 52 | ``` r 53 | head(awards) 54 | ``` 55 | 56 | ## # A tibble: 6 x 3 57 | ## num_awards prog math 58 | ## 59 | ## 1 1 3 41 60 | ## 2 1 1 41 61 | ## 3 1 3 44 62 | ## 4 1 3 42 63 | ## 5 1 3 40 64 | ## 6 1 1 42 65 | 66 | ``` r 67 | awards_melted <- melt(awards) 68 | ``` 69 | 70 | ## Using prog as id variables 71 | 72 | ``` r 73 | head(awards_melted) 74 | ``` 75 | 76 | ## prog variable value 77 | ## 1 3 num_awards 1 78 | ## 2 1 num_awards 1 79 | ## 3 3 num_awards 1 80 | ## 4 3 num_awards 1 81 | ## 5 3 num_awards 1 82 | ## 6 1 num_awards 1 83 | 84 | ``` r 85 | ggplot(data = awards_melted, aes(x = value)) + 86 | geom_histogram(aes(y = ..ncount..)) + 87 | geom_density(aes(y = ..scaled..)) + 88 | facet_wrap(~variable, scales = "free") + 89 | labs(x = "Values", y = "Frequencies", title = "Histograms") 90 | ``` 91 | 92 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 93 | 94 | ![](figures/poisson-unnamed-chunk-5-1.png) 95 | 96 | ``` r 97 | awards$math <- scale(awards$math) 98 | ``` 99 | 100 | ``` r 101 | model1 <- glm(num_awards ~ math + prog, data = awards, family = poisson) 102 | summary(model1) 103 | ``` 104 | 105 | ## 106 | ## Call: 107 | ## glm(formula = num_awards ~ math + prog, family = poisson, data = awards) 108 | ## 109 | ## Deviance Residuals: 110 | ## Min 1Q Median 3Q Max 111 | ## -1.96335 -1.14818 -0.01392 0.35710 2.52541 112 | ## 113 | ## Coefficients: 114 | ## Estimate Std. Error z value Pr(>|z|) 115 | ## (Intercept) -0.48897 0.19620 -2.492 0.0127 * 116 | ## math 0.33520 0.07817 4.288 1.8e-05 *** 117 | ## prog2 0.45262 0.22475 2.014 0.0440 * 118 | ## prog3 0.56172 0.24748 2.270 0.0232 * 119 | ## --- 120 | ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 121 | ## 122 | ## (Dispersion parameter for poisson family taken to be 1) 123 | ## 124 | ## Null deviance: 228.83 on 199 degrees of freedom 125 | ## Residual deviance: 198.05 on 196 degrees of freedom 126 | ## AIC: 496.36 127 | ## 128 | ## Number of Fisher Scoring iterations: 5 129 | 130 | ``` r 131 | model2 <- stan_glm(num_awards ~ math + prog, data = awards, family = poisson, 132 | prior = normal(0, 10), prior_intercept = normal(0, 10)) 133 | ``` 134 | 135 | ## 136 | ## SAMPLING FOR MODEL 'count' NOW (CHAIN 1). 137 | ## 138 | ## Gradient evaluation took 0.000117 seconds 139 | ## 1000 transitions using 10 leapfrog steps per transition would take 1.17 seconds. 140 | ## Adjust your expectations accordingly! 
141 | ## 142 | ## 143 | ## Iteration: 1 / 2000 [ 0%] (Warmup) 144 | ## Iteration: 200 / 2000 [ 10%] (Warmup) 145 | ## Iteration: 400 / 2000 [ 20%] (Warmup) 146 | ## Iteration: 600 / 2000 [ 30%] (Warmup) 147 | ## Iteration: 800 / 2000 [ 40%] (Warmup) 148 | ## Iteration: 1000 / 2000 [ 50%] (Warmup) 149 | ## Iteration: 1001 / 2000 [ 50%] (Sampling) 150 | ## Iteration: 1200 / 2000 [ 60%] (Sampling) 151 | ## Iteration: 1400 / 2000 [ 70%] (Sampling) 152 | ## Iteration: 1600 / 2000 [ 80%] (Sampling) 153 | ## Iteration: 1800 / 2000 [ 90%] (Sampling) 154 | ## Iteration: 2000 / 2000 [100%] (Sampling) 155 | ## 156 | ## Elapsed Time: 0.289811 seconds (Warm-up) 157 | ## 0.270276 seconds (Sampling) 158 | ## 0.560087 seconds (Total) 159 | ## 160 | ## 161 | ## SAMPLING FOR MODEL 'count' NOW (CHAIN 2). 162 | ## 163 | ## Gradient evaluation took 3.1e-05 seconds 164 | ## 1000 transitions using 10 leapfrog steps per transition would take 0.31 seconds. 165 | ## Adjust your expectations accordingly! 166 | ## 167 | ## 168 | ## Iteration: 1 / 2000 [ 0%] (Warmup) 169 | ## Iteration: 200 / 2000 [ 10%] (Warmup) 170 | ## Iteration: 400 / 2000 [ 20%] (Warmup) 171 | ## Iteration: 600 / 2000 [ 30%] (Warmup) 172 | ## Iteration: 800 / 2000 [ 40%] (Warmup) 173 | ## Iteration: 1000 / 2000 [ 50%] (Warmup) 174 | ## Iteration: 1001 / 2000 [ 50%] (Sampling) 175 | ## Iteration: 1200 / 2000 [ 60%] (Sampling) 176 | ## Iteration: 1400 / 2000 [ 70%] (Sampling) 177 | ## Iteration: 1600 / 2000 [ 80%] (Sampling) 178 | ## Iteration: 1800 / 2000 [ 90%] (Sampling) 179 | ## Iteration: 2000 / 2000 [100%] (Sampling) 180 | ## 181 | ## Elapsed Time: 0.281356 seconds (Warm-up) 182 | ## 0.258399 seconds (Sampling) 183 | ## 0.539755 seconds (Total) 184 | ## 185 | ## 186 | ## SAMPLING FOR MODEL 'count' NOW (CHAIN 3). 187 | ## 188 | ## Gradient evaluation took 3e-05 seconds 189 | ## 1000 transitions using 10 leapfrog steps per transition would take 0.3 seconds. 190 | ## Adjust your expectations accordingly! 191 | ## 192 | ## 193 | ## Iteration: 1 / 2000 [ 0%] (Warmup) 194 | ## Iteration: 200 / 2000 [ 10%] (Warmup) 195 | ## Iteration: 400 / 2000 [ 20%] (Warmup) 196 | ## Iteration: 600 / 2000 [ 30%] (Warmup) 197 | ## Iteration: 800 / 2000 [ 40%] (Warmup) 198 | ## Iteration: 1000 / 2000 [ 50%] (Warmup) 199 | ## Iteration: 1001 / 2000 [ 50%] (Sampling) 200 | ## Iteration: 1200 / 2000 [ 60%] (Sampling) 201 | ## Iteration: 1400 / 2000 [ 70%] (Sampling) 202 | ## Iteration: 1600 / 2000 [ 80%] (Sampling) 203 | ## Iteration: 1800 / 2000 [ 90%] (Sampling) 204 | ## Iteration: 2000 / 2000 [100%] (Sampling) 205 | ## 206 | ## Elapsed Time: 0.273531 seconds (Warm-up) 207 | ## 0.267135 seconds (Sampling) 208 | ## 0.540666 seconds (Total) 209 | ## 210 | ## 211 | ## SAMPLING FOR MODEL 'count' NOW (CHAIN 4). 212 | ## 213 | ## Gradient evaluation took 3.1e-05 seconds 214 | ## 1000 transitions using 10 leapfrog steps per transition would take 0.31 seconds. 215 | ## Adjust your expectations accordingly! 
216 | ## 217 | ## 218 | ## Iteration: 1 / 2000 [ 0%] (Warmup) 219 | ## Iteration: 200 / 2000 [ 10%] (Warmup) 220 | ## Iteration: 400 / 2000 [ 20%] (Warmup) 221 | ## Iteration: 600 / 2000 [ 30%] (Warmup) 222 | ## Iteration: 800 / 2000 [ 40%] (Warmup) 223 | ## Iteration: 1000 / 2000 [ 50%] (Warmup) 224 | ## Iteration: 1001 / 2000 [ 50%] (Sampling) 225 | ## Iteration: 1200 / 2000 [ 60%] (Sampling) 226 | ## Iteration: 1400 / 2000 [ 70%] (Sampling) 227 | ## Iteration: 1600 / 2000 [ 80%] (Sampling) 228 | ## Iteration: 1800 / 2000 [ 90%] (Sampling) 229 | ## Iteration: 2000 / 2000 [100%] (Sampling) 230 | ## 231 | ## Elapsed Time: 0.248926 seconds (Warm-up) 232 | ## 0.250404 seconds (Sampling) 233 | ## 0.49933 seconds (Total) 234 | 235 | ``` r 236 | summary(model2) 237 | ``` 238 | 239 | ## 240 | ## Model Info: 241 | ## 242 | ## function: stan_glm 243 | ## family: poisson [log] 244 | ## formula: num_awards ~ math + prog 245 | ## algorithm: sampling 246 | ## priors: see help('prior_summary') 247 | ## sample: 4000 (posterior sample size) 248 | ## observations: 200 249 | ## predictors: 4 250 | ## 251 | ## Estimates: 252 | ## mean sd 2.5% 25% 50% 75% 97.5% 253 | ## (Intercept) -0.5 0.2 -0.9 -0.6 -0.5 -0.4 -0.1 254 | ## math 0.3 0.1 0.2 0.3 0.3 0.4 0.5 255 | ## prog2 0.5 0.2 0.0 0.3 0.5 0.6 0.9 256 | ## prog3 0.6 0.3 0.1 0.4 0.6 0.7 1.0 257 | ## mean_PPD 1.0 0.1 0.8 0.9 1.0 1.0 1.2 258 | ## log-posterior -252.2 1.4 -255.8 -252.9 -251.9 -251.1 -250.4 259 | ## 260 | ## Diagnostics: 261 | ## mcse Rhat n_eff 262 | ## (Intercept) 0.0 1.0 1997 263 | ## math 0.0 1.0 2485 264 | ## prog2 0.0 1.0 2291 265 | ## prog3 0.0 1.0 2054 266 | ## mean_PPD 0.0 1.0 3751 267 | ## log-posterior 0.0 1.0 1624 268 | ## 269 | ## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1). 
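Because the model uses a log link, exponentiating these estimates turns them into multiplicative effects on the expected number of awards. Using the rounded posterior means above (illustrative arithmetic; shown in Python since the repo carries parallel Python scripts):

```python
import numpy as np
# posterior means for math, prog2, prog3 from the summary above
print(np.exp([0.3, 0.5, 0.6]))  # ~ [1.35, 1.65, 1.82]
# e.g. a one-SD increase in math multiplies the expected award count by ~1.35
```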
270 | 271 | ``` r 272 | posterior_interval(model2, prob = 0.95) 273 | ``` 274 | 275 | ## 2.5% 97.5% 276 | ## (Intercept) -0.89457959 -0.1447066 277 | ## math 0.18111692 0.4915252 278 | ## prog2 0.03168288 0.9214785 279 | ## prog3 0.07135645 1.0449510 280 | 281 | ``` r 282 | plot(model2, plotfun = "areas", prob = 0.95) 283 | ``` 284 | 285 | ![](figures/poisson-unnamed-chunk-9-1.png) 286 | 287 | ``` r 288 | pp_check(model2) 289 | ``` 290 | 291 | ![](figures/poisson-unnamed-chunk-10-1.png) 292 | -------------------------------------------------------------------------------- /notebooks/figures/corr-unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/notebooks/figures/corr-unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /notebooks/figures/factor-unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/notebooks/figures/factor-unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /notebooks/figures/factor-unnamed-chunk-6-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/notebooks/figures/factor-unnamed-chunk-6-1.png -------------------------------------------------------------------------------- /notebooks/figures/multipleLin-unnamed-chunk-4-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/notebooks/figures/multipleLin-unnamed-chunk-4-1.png -------------------------------------------------------------------------------- /notebooks/figures/multipleLin-unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/notebooks/figures/multipleLin-unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /notebooks/figures/poisson-unnamed-chunk-10-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/notebooks/figures/poisson-unnamed-chunk-10-1.png -------------------------------------------------------------------------------- /notebooks/figures/poisson-unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/notebooks/figures/poisson-unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /notebooks/figures/poisson-unnamed-chunk-9-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/notebooks/figures/poisson-unnamed-chunk-9-1.png 
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | seaborn==0.9.0 2 | pandas==0.23.4 3 | pystan==2.17.1.0 4 | matplotlib==2.2.2 5 | numpy==1.13.3 6 | scikit_learn==0.19.2 7 | statsmodels==0.9.0 -------------------------------------------------------------------------------- /scripts/Multiple linear regression with interaction terms.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import pandas as pd 4 | import seaborn as sns 5 | import statsmodels.api as sm 6 | 7 | # http://staff.bath.ac.uk/pssiw/stats2/page16/page16.html 8 | df = pd.read_csv("../data/child_data.csv") 9 | print(df.head()) 10 | 11 | sns.set(style="white", palette="muted", color_codes=True) 12 | 13 | f, axes = plt.subplots(2, 2, figsize=(7, 7)) 14 | 15 | sns.distplot(df.age, ax=axes[0, 0]) 16 | sns.distplot(df.mem_span, ax=axes[0, 1]) 17 | sns.distplot(df.iq, ax=axes[1, 0]) 18 | sns.distplot(df.read_ab, ax=axes[1, 1]) 19 | plt.show() 20 | 21 | sns.pairplot(df, vars=['age', 'mem_span', 'iq']) 22 | plt.show() 23 | 24 | # Rescale all variables 25 | for col in df.columns.values: 26 | df[col] = (df[col] - np.mean(df[col]))/(2 * np.std(df[col])) 27 | 28 | print(df.head()) 29 | 30 | # Ordinary multiple linear regression 31 | # Mem_span and age seem correlated, so I'll use one of them 32 | mod1 = sm.formula.ols('read_ab ~ age + iq', data=df).fit() 33 | print(mod1.summary()) 34 | 35 | mod2 = sm.formula.ols('read_ab ~ age + mem_span', data=df).fit() 36 | print(mod2.summary()) 37 | 38 | # Now, add interaction term 39 | mod1 = sm.formula.ols('read_ab ~ age + iq + age:iq', data=df).fit() 40 | print(mod1.summary()) 41 | -------------------------------------------------------------------------------- /scripts/Poisson Regression.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import pandas as pd 4 | import seaborn as sns 5 | import statsmodels.api as sm 6 | 7 | # http://staff.bath.ac.uk/pssiw/stats2/page16/page16.html 8 | df = pd.read_csv("../data/awards.csv", index_col=0) 9 | print(df.head()) 10 | 11 | print(df.describe()) 12 | 13 | df = pd.get_dummies(df, columns=["prog"]) 14 | del df['prog_1'] 15 | print(df.head()) 16 | 17 | df['math'] = (df['math'] - np.mean(df['math']))/(2 * np.std(df['math'])) 18 | print(df.head()) 19 | 20 | X = np.column_stack( 21 | (np.ones((df.shape[0], 1)), df[['math', 'prog_2', 'prog_3']])) 22 | y = df['num_awards'] 23 | 24 | mod = sm.formula.GLM(y, X, family=sm.families.Poisson()).fit() 25 | print(mod.summary()) 26 | 27 | model_fitted_y = mod.fittedvalues 28 | model_residuals = mod.resid_deviance  # deviance residuals of the GLM fit 29 | model_abs_resid = np.abs(model_residuals) 30 | 31 | # https://medium.com/@emredjan/emulating-r-regression-plots-in-python-43741952c034 32 | plot_lm_1 = plt.figure(1) 33 | plot_lm_1.set_figheight(8) 34 | plot_lm_1.set_figwidth(12) 35 | 36 | plot_lm_1.axes[0] = sns.residplot(model_fitted_y, 'num_awards', data=df, 37 | lowess=True, 38 | scatter_kws={'alpha': 0.5}, 39 | line_kws={'color': 'red', 'lw': 2, 'alpha': 0.8}) 40 | 41 | plot_lm_1.axes[0].set_title('Residuals vs Fitted') 42 | plot_lm_1.axes[0].set_xlabel('Fitted values') 43 | plot_lm_1.axes[0].set_ylabel('Residuals') 44 | plt.show() 45 | --------------------------------------------------------------------------------
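The two helper modules that follow are what the model scripts lean on: stan_utility for MCMC diagnostics and psis for PSIS-LOO scoring of the log_lik matrices the Stan models emit. A minimal sketch of the intended call pattern, including the Pareto-k screening recommended in the psis docstring (the fit argument is any pystan fit whose model defines log_lik; this wrapper itself is hypothetical, not a repo file):

```python
# Hypothetical convenience wrapper around the helper modules (illustrative).
import numpy as np
from helper import psis, stan_utility

def diagnose_and_score(fit):
    """Run all MCMC diagnostics, then PSIS-LOO; flag observations with k > 0.7."""
    stan_utility.check_all_diagnostics(fit)   # n_eff, Rhat, divergences, tree depth, E-BFMI
    loo, loos, ks = psis.psisloo(fit.extract()['log_lik'])
    unstable = np.where(ks > 0.7)[0]          # threshold from the references in psis.py
    if unstable.size:
        print('unreliable PSIS-LOO estimates for observations:', unstable)
    return loo
```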
/scripts/helper/psis.py: -------------------------------------------------------------------------------- 1 | """Pareto smoothed importance sampling (PSIS) 2 | 3 | This module implements Pareto smoothed importance sampling (PSIS) and PSIS 4 | leave-one-out (LOO) cross-validation for Python (Numpy). 5 | 6 | Included functions 7 | ------------------ 8 | psisloo 9 | Pareto smoothed importance sampling leave-one-out log predictive densities. 10 | 11 | psislw 12 | Pareto smoothed importance sampling. 13 | 14 | gpdfitnew 15 | Estimate the paramaters for the Generalized Pareto Distribution (GPD). 16 | 17 | gpinv 18 | Inverse Generalised Pareto distribution function. 19 | 20 | sumlogs 21 | Sum of vector where numbers are represented by their logarithms. 22 | 23 | References 24 | ---------- 25 | Aki Vehtari, Andrew Gelman and Jonah Gabry (2017). Practical 26 | Bayesian model evaluation using leave-one-out cross-validation 27 | and WAIC. Statistics and Computing, 27(5):1413–1432. 28 | doi:10.1007/s11222-016-9696-4. https://arxiv.org/abs/1507.04544 29 | 30 | Aki Vehtari, Andrew Gelman and Jonah Gabry (2017). Pareto 31 | smoothed importance sampling. https://arxiv.org/abs/arXiv:1507.02646v5 32 | 33 | """ 34 | 35 | from __future__ import division # For Python 2 compatibility 36 | import numpy as np 37 | 38 | # 3-Clause BSD License 39 | """ 40 | Copyright 2017 Aki Vehtari, Tuomas Sivula 41 | 42 | Redistribution and use in source and binary forms, with or without modification, 43 | are permitted provided that the following conditions are met: 44 | 45 | 1. Redistributions of source code must retain the above copyright notice, this 46 | list of conditions and the following disclaimer. 47 | 48 | 2. Redistributions in binary form must reproduce the above copyright notice, 49 | this list of conditions and the following disclaimer in the documentation and/or 50 | other materials provided with the distribution. 51 | 52 | 3. Neither the name of the copyright holder nor the names of its contributors 53 | may be used to endorse or promote products derived from this software without 54 | specific prior written permission. 55 | 56 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 57 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 58 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 59 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 60 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 61 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 62 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 63 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 64 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 65 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """ 66 | 67 | 68 | def psisloo(log_lik, **kwargs): 69 | r"""PSIS leave-one-out log predictive densities. 70 | 71 | Computes the log predictive densities given posterior samples of the log 72 | likelihood terms :math:`p(y_i|\theta^s)` in input parameter `log_lik`. 73 | Returns a sum of the leave-one-out log predictive densities `loo`, 74 | individual leave-one-out log predictive density terms `loos` and an estimate 75 | of Pareto tail indeces `ks`. The estimates are unreliable if tail index 76 | ``k > 0.7`` (see more in the references listed in the module docstring). 
77 | 78 | Additional keyword arguments are passed to the :meth:`psislw()` function 79 | (see the corresponding documentation). 80 | 81 | Parameters 82 | ---------- 83 | log_lik : ndarray 84 | Array of size n x m containing n posterior samples of the log likelihood 85 | terms :math:`p(y_i|\theta^s)`. 86 | 87 | Returns 88 | ------- 89 | loo : scalar 90 | sum of the leave-one-out log predictive densities 91 | 92 | loos : ndarray 93 | individual leave-one-out log predictive density terms 94 | 95 | ks : ndarray 96 | estimated Pareto tail indeces 97 | 98 | """ 99 | # ensure overwrite flag in passed arguments 100 | kwargs['overwrite_lw'] = True 101 | # log raw weights from log_lik 102 | lw = -log_lik 103 | # compute Pareto smoothed log weights given raw log weights 104 | lw, ks = psislw(lw, **kwargs) 105 | # compute 106 | lw += log_lik 107 | loos = sumlogs(lw, axis=0) 108 | loo = loos.sum() 109 | return loo, loos, ks 110 | 111 | 112 | def psislw(lw, Reff=1.0, overwrite_lw=False): 113 | """Pareto smoothed importance sampling (PSIS). 114 | 115 | Parameters 116 | ---------- 117 | lw : ndarray 118 | Array of size n x m containing m sets of n log weights. It is also 119 | possible to provide one dimensional array of length n. 120 | 121 | Reff : scalar, optional 122 | relative MCMC efficiency ``N_eff / N`` 123 | 124 | overwrite_lw : bool, optional 125 | If True, the input array `lw` is smoothed in-place, assuming the array 126 | is F-contiguous. By default, a new array is allocated. 127 | 128 | Returns 129 | ------- 130 | lw_out : ndarray 131 | smoothed log weights 132 | kss : ndarray 133 | Pareto tail indices 134 | 135 | """ 136 | if lw.ndim == 2: 137 | n, m = lw.shape 138 | elif lw.ndim == 1: 139 | n = len(lw) 140 | m = 1 141 | else: 142 | raise ValueError("Argument `lw` must be 1 or 2 dimensional.") 143 | if n <= 1: 144 | raise ValueError("More than one log-weight needed.") 145 | 146 | if overwrite_lw and lw.flags.f_contiguous: 147 | # in-place operation 148 | lw_out = lw 149 | else: 150 | # allocate new array for output 151 | lw_out = np.copy(lw, order='F') 152 | 153 | # allocate output array for kss 154 | kss = np.empty(m) 155 | 156 | # precalculate constants 157 | cutoff_ind = - int(np.ceil(min(0.2 * n, 3 * np.sqrt(n / Reff)))) - 1 158 | cutoffmin = np.log(np.finfo(float).tiny) 159 | logn = np.log(n) 160 | k_min = 1/3 161 | 162 | # loop over sets of log weights 163 | for i, x in enumerate(lw_out.T if lw_out.ndim == 2 else lw_out[None, :]): 164 | # improve numerical accuracy 165 | x -= np.max(x) 166 | # sort the array 167 | x_sort_ind = np.argsort(x) 168 | # divide log weights into body and right tail 169 | xcutoff = max( 170 | x[x_sort_ind[cutoff_ind]], 171 | cutoffmin 172 | ) 173 | expxcutoff = np.exp(xcutoff) 174 | tailinds, = np.where(x > xcutoff) 175 | x2 = x[tailinds] 176 | n2 = len(x2) 177 | if n2 <= 4: 178 | # not enough tail samples for gpdfitnew 179 | k = np.inf 180 | else: 181 | # order of tail samples 182 | x2si = np.argsort(x2) 183 | # fit generalized Pareto distribution to the right tail samples 184 | np.exp(x2, out=x2) 185 | x2 -= expxcutoff 186 | k, sigma = gpdfitnew(x2, sort=x2si) 187 | if k >= k_min and not np.isinf(k): 188 | # no smoothing if short tail or GPD fit failed 189 | # compute ordered statistic for the fit 190 | sti = np.arange(0.5, n2) 191 | sti /= n2 192 | qq = gpinv(sti, k, sigma) 193 | qq += expxcutoff 194 | np.log(qq, out=qq) 195 | # place the smoothed tail into the output array 196 | x[tailinds[x2si]] = qq 197 | # truncate smoothed values to the largest raw 
weight 0 198 | x[x > 0] = 0 199 | # renormalize weights 200 | x -= sumlogs(x) 201 | # store tail index k 202 | kss[i] = k 203 | 204 | # If the provided input array is one dimensional, return kss as scalar. 205 | if lw_out.ndim == 1: 206 | kss = kss[0] 207 | 208 | return lw_out, kss 209 | 210 | 211 | def gpdfitnew(x, sort=True, sort_in_place=False, return_quadrature=False): 212 | """Estimate the paramaters for the Generalized Pareto Distribution (GPD) 213 | 214 | Returns empirical Bayes estimate for the parameters of the two-parameter 215 | generalized Parato distribution given the data. 216 | 217 | Parameters 218 | ---------- 219 | x : ndarray 220 | One dimensional data array 221 | 222 | sort : bool or ndarray, optional 223 | If known in advance, one can provide an array of indices that would 224 | sort the input array `x`. If the input array is already sorted, provide 225 | False. If True (default behaviour), the array is sorted internally. 226 | 227 | sort_in_place : bool, optional 228 | If `sort` is True and `sort_in_place` is True, the array is sorted 229 | in-place (False by default). 230 | 231 | return_quadrature : bool, optional 232 | If True, quadrature points and weight `ks` and `w` of the marginal posterior distribution of k are also calculated and returned. False by 233 | default. 234 | 235 | Returns 236 | ------- 237 | k, sigma : float 238 | estimated parameter values 239 | 240 | ks, w : ndarray 241 | Quadrature points and weights of the marginal posterior distribution 242 | of `k`. Returned only if `return_quadrature` is True. 243 | 244 | Notes 245 | ----- 246 | This function returns a negative of Zhang and Stephens's k, because it is 247 | more common parameterisation. 248 | 249 | """ 250 | if x.ndim != 1 or len(x) <= 1: 251 | raise ValueError("Invalid input array.") 252 | 253 | # check if x should be sorted 254 | if sort is True: 255 | if sort_in_place: 256 | x.sort() 257 | xsorted = True 258 | else: 259 | sort = np.argsort(x) 260 | xsorted = False 261 | elif sort is False: 262 | xsorted = True 263 | else: 264 | xsorted = False 265 | 266 | n = len(x) 267 | PRIOR = 3 268 | m = 30 + int(np.sqrt(n)) 269 | 270 | bs = np.arange(1, m + 1, dtype=float) 271 | bs -= 0.5 272 | np.divide(m, bs, out=bs) 273 | np.sqrt(bs, out=bs) 274 | np.subtract(1, bs, out=bs) 275 | if xsorted: 276 | bs /= PRIOR * x[int(n/4 + 0.5) - 1] 277 | bs += 1 / x[-1] 278 | else: 279 | bs /= PRIOR * x[sort[int(n/4 + 0.5) - 1]] 280 | bs += 1 / x[sort[-1]] 281 | 282 | ks = np.negative(bs) 283 | temp = ks[:,None] * x 284 | np.log1p(temp, out=temp) 285 | np.mean(temp, axis=1, out=ks) 286 | 287 | L = bs / ks 288 | np.negative(L, out=L) 289 | np.log(L, out=L) 290 | L -= ks 291 | L -= 1 292 | L *= n 293 | 294 | temp = L - L[:,None] 295 | np.exp(temp, out=temp) 296 | w = np.sum(temp, axis=1) 297 | np.divide(1, w, out=w) 298 | 299 | # remove negligible weights 300 | dii = w >= 10 * np.finfo(float).eps 301 | if not np.all(dii): 302 | w = w[dii] 303 | bs = bs[dii] 304 | # normalise w 305 | w /= w.sum() 306 | 307 | # posterior mean for b 308 | b = np.sum(bs * w) 309 | # Estimate for k, note that we return a negative of Zhang and 310 | # Stephens's k, because it is more common parameterisation. 
311 | temp = (-b) * x 312 | np.log1p(temp, out=temp) 313 | k = np.mean(temp) 314 | if return_quadrature: 315 | np.negative(x, out=temp) 316 | temp = bs[:, None] * temp 317 | np.log1p(temp, out=temp) 318 | ks = np.mean(temp, axis=1) 319 | # estimate for sigma 320 | sigma = -k / b * n / (n - 0) 321 | # weakly informative prior for k 322 | a = 10 323 | k = k * n / (n+a) + a * 0.5 / (n+a) 324 | if return_quadrature: 325 | ks *= n / (n+a) 326 | ks += a * 0.5 / (n+a) 327 | 328 | if return_quadrature: 329 | return k, sigma, ks, w 330 | else: 331 | return k, sigma 332 | 333 | 334 | def gpinv(p, k, sigma): 335 | """Inverse Generalised Pareto distribution function.""" 336 | x = np.empty(p.shape) 337 | x.fill(np.nan) 338 | if sigma <= 0: 339 | return x 340 | ok = (p > 0) & (p < 1) 341 | if np.all(ok): 342 | if np.abs(k) < np.finfo(float).eps: 343 | np.negative(p, out=x) 344 | np.log1p(x, out=x) 345 | np.negative(x, out=x) 346 | else: 347 | np.negative(p, out=x) 348 | np.log1p(x, out=x) 349 | x *= -k 350 | np.expm1(x, out=x) 351 | x /= k 352 | x *= sigma 353 | else: 354 | if np.abs(k) < np.finfo(float).eps: 355 | # x[ok] = - np.log1p(-p[ok]) 356 | temp = p[ok] 357 | np.negative(temp, out=temp) 358 | np.log1p(temp, out=temp) 359 | np.negative(temp, out=temp) 360 | x[ok] = temp 361 | else: 362 | # x[ok] = np.expm1(-k * np.log1p(-p[ok])) / k 363 | temp = p[ok] 364 | np.negative(temp, out=temp) 365 | np.log1p(temp, out=temp) 366 | temp *= -k 367 | np.expm1(temp, out=temp) 368 | temp /= k 369 | x[ok] = temp 370 | x *= sigma 371 | x[p == 0] = 0 372 | if k >= 0: 373 | x[p == 1] = np.inf 374 | else: 375 | x[p == 1] = -sigma / k 376 | return x 377 | 378 | 379 | def sumlogs(x, axis=None, out=None): 380 | """Sum of vector where numbers are represented by their logarithms. 381 | 382 | Calculates ``np.log(np.sum(np.exp(x), axis=axis))`` in such a fashion that 383 | it works even when elements have large magnitude. 
384 | 385 | """ 386 | maxx = x.max(axis=axis, keepdims=True) 387 | xnorm = x - maxx 388 | np.exp(xnorm, out=xnorm) 389 | out = np.sum(xnorm, axis=axis, out=out) 390 | if isinstance(out, np.ndarray): 391 | np.log(out, out=out) 392 | else: 393 | out = np.log(out) 394 | out += np.squeeze(maxx) 395 | return out 396 | -------------------------------------------------------------------------------- /scripts/helper/stan_utility.py: -------------------------------------------------------------------------------- 1 | import pystan 2 | import pickle 3 | import numpy 4 | 5 | def check_div(fit): 6 | """Check transitions that ended with a divergence""" 7 | sampler_params = fit.get_sampler_params(inc_warmup=False) 8 | divergent = [x for y in sampler_params for x in y['divergent__']] 9 | n = sum(divergent) 10 | N = len(divergent) 11 | print('{} of {} iterations ended with a divergence ({}%)'.format(n, N, 12 | 100 * n / N)) 13 | if n > 0: 14 | print(' Try running with larger adapt_delta to remove the divergences') 15 | 16 | def check_treedepth(fit, max_depth = 10): 17 | """Check transitions that ended prematurely due to maximum tree depth limit""" 18 | sampler_params = fit.get_sampler_params(inc_warmup=False) 19 | depths = [x for y in sampler_params for x in y['treedepth__']] 20 | n = sum(1 for x in depths if x == max_depth) 21 | N = len(depths) 22 | print(('{} of {} iterations saturated the maximum tree depth of {}' 23 | + ' ({}%)').format(n, N, max_depth, 100 * n / N)) 24 | if n > 0: 25 | print(' Run again with max_depth set to a larger value to avoid saturation') 26 | 27 | def check_energy(fit): 28 | """Checks the energy Bayesian fraction of missing information (E-BFMI)""" 29 | sampler_params = fit.get_sampler_params(inc_warmup=False) 30 | no_warning = True 31 | for chain_num, s in enumerate(sampler_params): 32 | energies = s['energy__'] 33 | numer = sum((energies[i] - energies[i - 1])**2 for i in range(1, len(energies))) / len(energies) 34 | denom = numpy.var(energies) 35 | if numer / denom < 0.2: 36 | print('Chain {}: E-BFMI = {}'.format(chain_num, numer / denom)) 37 | no_warning = False 38 | if no_warning: 39 | print('E-BFMI indicated no pathological behavior') 40 | else: 41 | print(' E-BFMI below 0.2 indicates you may need to reparameterize your model') 42 | 43 | def check_n_eff(fit): 44 | """Checks the effective sample size per iteration""" 45 | fit_summary = fit.summary(probs=[0.5]) 46 | n_effs = [x[4] for x in fit_summary['summary']] 47 | names = fit_summary['summary_rownames'] 48 | n_iter = len(fit.extract()['lp__']) 49 | 50 | no_warning = True 51 | for n_eff, name in zip(n_effs, names): 52 | ratio = n_eff / n_iter 53 | if (ratio < 0.001): 54 | print('n_eff / iter for parameter {} is {}!'.format(name, ratio)) 55 | print(' n_eff / iter below 0.001 indicates that the effective sample size has likely been overestimated') 56 | no_warning = False 57 | if no_warning: 58 | print('n_eff / iter looks reasonable for all parameters') 59 | else: 60 | print(' n_eff / iter below 0.001 indicates that the effective sample size has likely been overestimated') 61 | 62 | def check_rhat(fit): 63 | """Checks the potential scale reduction factors""" 64 | from math import isnan 65 | from math import isinf 66 | 67 | fit_summary = fit.summary(probs=[0.5]) 68 | rhats = [x[5] for x in fit_summary['summary']] 69 | names = fit_summary['summary_rownames'] 70 | 71 | no_warning = True 72 | for rhat, name in zip(rhats, names): 73 | if (rhat > 1.1 or isnan(rhat) or isinf(rhat)): 74 | print('Rhat for parameter {} is {}!'.format(name, rhat)) 75 | no_warning
402 | -------------------------------------------------------------------------------- /scripts/helper/stan_utility.py: --------------------------------------------------------------------------------
1 | import pystan
2 | import pickle
3 | import numpy
4 | 
5 | def check_div(fit):
6 |     """Check transitions that ended with a divergence"""
7 |     sampler_params = fit.get_sampler_params(inc_warmup=False)
8 |     divergent = [x for y in sampler_params for x in y['divergent__']]
9 |     n = sum(divergent)
10 |     N = len(divergent)
11 |     print('{} of {} iterations ended with a divergence ({}%)'.format(n, N,
12 |             100 * n / N))
13 |     if n > 0:
14 |         print('  Try running with larger adapt_delta to remove the divergences')
15 | 
16 | def check_treedepth(fit, max_depth = 10):
17 |     """Check transitions that ended prematurely due to maximum tree depth limit"""
18 |     sampler_params = fit.get_sampler_params(inc_warmup=False)
19 |     depths = [x for y in sampler_params for x in y['treedepth__']]
20 |     n = sum(1 for x in depths if x == max_depth)
21 |     N = len(depths)
22 |     print(('{} of {} iterations saturated the maximum tree depth of {}'
23 |             + ' ({}%)').format(n, N, max_depth, 100 * n / N))
24 |     if n > 0:
25 |         print('  Run again with max_depth set to a larger value to avoid saturation')
26 | 
27 | def check_energy(fit):
28 |     """Checks the energy Bayesian fraction of missing information (E-BFMI)"""
29 |     sampler_params = fit.get_sampler_params(inc_warmup=False)
30 |     no_warning = True
31 |     for chain_num, s in enumerate(sampler_params):
32 |         energies = s['energy__']
33 |         numer = sum((energies[i] - energies[i - 1])**2 for i in range(1, len(energies))) / len(energies)
34 |         denom = numpy.var(energies)
35 |         if numer / denom < 0.2:
36 |             print('Chain {}: E-BFMI = {}'.format(chain_num, numer / denom))
37 |             no_warning = False
38 |     if no_warning:
39 |         print('E-BFMI indicated no pathological behavior')
40 |     else:
41 |         print('  E-BFMI below 0.2 indicates you may need to reparameterize your model')
42 | 
43 | def check_n_eff(fit):
44 |     """Checks the effective sample size per iteration"""
45 |     fit_summary = fit.summary(probs=[0.5])
46 |     n_effs = [x[4] for x in fit_summary['summary']]
47 |     names = fit_summary['summary_rownames']
48 |     n_iter = len(fit.extract()['lp__'])
49 | 
50 |     no_warning = True
51 |     for n_eff, name in zip(n_effs, names):
52 |         ratio = n_eff / n_iter
53 |         if (ratio < 0.001):
54 |             print('n_eff / iter for parameter {} is {}!'.format(name, ratio))
55 |             no_warning = False
56 |     if no_warning:
57 |         print('n_eff / iter looks reasonable for all parameters')
58 |     else:
59 |         print('  n_eff / iter below 0.001 indicates that the effective sample size has likely been overestimated')
60 | 
61 | def check_rhat(fit):
62 |     """Checks the potential scale reduction factors"""
63 |     from math import isnan
64 |     from math import isinf
65 | 
66 |     fit_summary = fit.summary(probs=[0.5])
67 |     rhats = [x[5] for x in fit_summary['summary']]
68 |     names = fit_summary['summary_rownames']
69 | 
70 |     no_warning = True
71 |     for rhat, name in zip(rhats, names):
72 |         if (rhat > 1.1 or isnan(rhat) or isinf(rhat)):
73 |             print('Rhat for parameter {} is {}!'.format(name, rhat))
74 |             no_warning = False
75 |     if no_warning:
76 |         print('Rhat looks reasonable for all parameters')
77 |     else:
78 |         print('  Rhat above 1.1 indicates that the chains very likely have not mixed')
79 | 
80 | def check_all_diagnostics(fit):
81 |     """Checks all MCMC diagnostics"""
82 |     check_n_eff(fit)
83 |     check_rhat(fit)
84 |     check_div(fit)
85 |     check_treedepth(fit)
86 |     check_energy(fit)
87 | 
88 | def _by_chain(unpermuted_extraction):
89 |     num_chains = len(unpermuted_extraction[0])
90 |     result = [[] for _ in range(num_chains)]
91 |     for c in range(num_chains):
92 |         for i in range(len(unpermuted_extraction)):
93 |             result[c].append(unpermuted_extraction[i][c])
94 |     return numpy.array(result)
95 | 
96 | def _shaped_ordered_params(fit):
97 |     ef = fit.extract(permuted=False, inc_warmup=False)  # flattened, unpermuted, by (iteration, chain)
98 |     ef = _by_chain(ef)
99 |     ef = ef.reshape(-1, len(ef[0][0]))
100 |     ef = ef[:, 0:len(fit.flatnames)]  # drop lp__
101 |     shaped = {}
102 |     idx = 0
103 |     for dim, param_name in zip(fit.par_dims, fit.extract().keys()):
104 |         length = int(numpy.prod(dim))
105 |         # reshape returns a new array, so the result must be assigned back
106 |         shaped[param_name] = ef[:, idx:idx + length].reshape(*([-1] + dim))
107 |         idx += length
108 |     return shaped
109 | 
110 | def partition_div(fit):
111 |     """Returns parameter arrays separated into divergent and non-divergent transitions"""
112 |     sampler_params = fit.get_sampler_params(inc_warmup=False)
113 |     div = numpy.concatenate([x['divergent__'] for x in sampler_params]).astype('int')
114 |     params = _shaped_ordered_params(fit)
115 |     nondiv_params = dict((key, params[key][div == 0]) for key in params)
116 |     div_params = dict((key, params[key][div == 1]) for key in params)
117 |     return nondiv_params, div_params
118 | 
119 | def compile_model(filename, model_name=None, **kwargs):
120 |     """This will automatically cache models - great if you're just running a
121 |     script on the command line.
122 | 
123 |     See http://pystan.readthedocs.io/en/latest/avoiding_recompilation.html"""
124 |     from hashlib import md5
125 | 
126 |     with open(filename) as f:
127 |         model_code = f.read()
128 |     code_hash = md5(model_code.encode('ascii')).hexdigest()
129 |     if model_name is None:
130 |         cache_fn = 'cached-model-{}.pkl'.format(code_hash)
131 |     else:
132 |         cache_fn = 'cached-{}-{}.pkl'.format(model_name, code_hash)
133 |     try:
134 |         with open(cache_fn, 'rb') as f:
135 |             sm = pickle.load(f)
136 |     except FileNotFoundError:
137 |         sm = pystan.StanModel(model_code=model_code, **kwargs)
138 |         with open(cache_fn, 'wb') as f:
139 |             pickle.dump(sm, f)
140 |     else:
141 |         print("Using cached StanModel")
142 |     return sm
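143 | 
144 | # Usage sketch (hedged; mirrors how the scripts below use this module):
145 | #   sm = compile_model('../models/linearRegression.stan')  # compile once, then cache
146 | #   fit = sm.sampling(data=data, control=dict(adapt_delta=0.95))
147 | #   check_all_diagnostics(fit)  # n_eff, Rhat, divergences, tree depth, E-BFMI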
148 | -------------------------------------------------------------------------------- /scripts/linearRegression.py: --------------------------------------------------------------------------------
1 | import pickle
2 | 
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 | import pystan
6 | 
7 | from helper import psis, stan_utility
8 | 
9 | model_file = "../models/linearRegression.stan"
10 | # Data from http://www.openbugs.net/Examples/Ratsdata.html
11 | data = {'N': 5,
12 |         'x': [8.0, 15.0, 22.0, 29.0, 36.0],
13 |         'y': [160, 207, 248, 288, 324]
14 |         }
15 | 
16 | sm = pystan.StanModel(file=model_file)
17 | fit = sm.sampling(data=data, control=dict(adapt_delta=0.95))
18 | print(fit)
19 | fit.plot(['alpha', 'beta', 'sigma'])
20 | plt.show()
21 | 
22 | # model diagnostics
23 | # https://github.com/betanalpha/jupyter_case_studies/blob/master/pystan_workflow/stan_utility.py
24 | stan_utility.check_all_diagnostics(fit)
25 | 
26 | # visualize model
27 | fit_dict = fit.extract()
28 | m_alpha = np.mean(fit_dict['alpha'])
29 | m_beta = np.mean(fit_dict['beta'])
30 | x = np.linspace(min(data['x']), max(data['x']))
31 | y = m_alpha + m_beta * x
32 | plt.scatter(data['x'], data['y'], c="#1f77b4", label="Observed Data")
33 | plt.plot(x, y, c='#7f7f7f', label="Our Model")
34 | plt.title("Rat weights")
35 | plt.xlabel("Days")
36 | plt.ylabel("Weights in grams")
37 | plt.legend()
38 | plt.show()
39 | 
40 | # Log-likelihood
41 | log_lik = fit.extract()['log_lik']
42 | print(psis.psisloo(log_lik)[0])
43 | 
44 | # Save model for later use
45 | with open('../models/saved/linearRegression.pkl', 'wb') as f:
46 |     pickle.dump(sm, f)
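47 | 
48 | # Sketch (hedged): the mean line above hides posterior uncertainty; a 95%
49 | # credible band can be drawn from the draws already extracted:
50 | #   ys = fit_dict['alpha'][:, None] + fit_dict['beta'][:, None] * x
51 | #   lo, hi = np.percentile(ys, [2.5, 97.5], axis=0)
52 | #   plt.fill_between(x, lo, hi, color='#7f7f7f', alpha=0.3)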
53 | -------------------------------------------------------------------------------- /scripts/logisticRegression.py: --------------------------------------------------------------------------------
1 | import pickle
2 | 
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 | import pandas as pd
6 | import pystan
7 | import seaborn as sns
8 | from sklearn.metrics import confusion_matrix
9 | from sklearn.model_selection import train_test_split
10 | 
11 | from helper import stan_utility
12 | 
13 | model_file = "../models/logisticRegression.stan"
14 | # https://stats.idre.ucla.edu/stata/dae/logistic-regression/
15 | data_file = "../data/binary.dta"
16 | 
17 | data = pd.read_stata(data_file)
18 | 
19 | # Data preprocessing
20 | # Convert the rank categorical variable to dummies
21 | data = pd.get_dummies(data=data, columns=['rank'])
22 | del data['rank_1.0']  # avoid dummy variable trap
23 | 
24 | # Rescale gpa and gre variables
25 | data['gre'] = (data['gre'] - np.mean(data['gre'])) / np.std(data['gre'])
26 | data['gpa'] = (data['gpa'] - np.mean(data['gpa'])) / np.std(data['gpa'])
27 | 
28 | # Split data as train/test
29 | data_train, data_test = train_test_split(data, test_size=0.2)
30 | 
31 | model_data = {'N_train': 320,  # 80/20 split of the 400 applications
32 |               'N_test': 80,
33 |               'D': 5,
34 |               # keep the predictors as floats; casting the standardized
35 |               # gre/gpa columns to int32 would truncate them to (mostly) zero
36 |               'x_train': data_train[['gre', 'gpa', 'rank_2.0',
37 |                                      'rank_3.0', 'rank_4.0']].astype(np.float64),
38 |               'x_test': data_test[['gre', 'gpa', 'rank_2.0',
39 |                                    'rank_3.0', 'rank_4.0']].astype(np.float64),
40 |               'y_train': data_train['admit'].astype(np.int32)}
41 | 
42 | sm = pystan.StanModel(file=model_file)
43 | fit = sm.sampling(data=model_data, control=dict(adapt_delta=0.95))
44 | print(fit)
45 | fit.plot(['alpha', 'beta'])
46 | plt.show()
47 | 
48 | sns.pairplot(pd.DataFrame(fit.extract()['beta']))
49 | plt.show()
50 | 
51 | # model diagnostics
52 | # https://github.com/betanalpha/jupyter_case_studies/blob/master/pystan_workflow/stan_utility.py
53 | stan_utility.check_all_diagnostics(fit)
54 | 
55 | # Confusion matrix
56 | y_pred = fit.extract()['y_pred']
57 | y_pred = np.median(y_pred, axis=0)
58 | print(confusion_matrix(data_test['admit'], y_pred))
59 | 
60 | # Save model for later use
61 | with open('../models/saved/logisticRegression.pkl', 'wb') as f:
62 |     pickle.dump(sm, f)
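63 | 
64 | # Sketch (hedged): overall accuracy from the same posterior predictive draws:
65 | #   acc = np.mean(y_pred == data_test['admit'].values)
66 | #   print('posterior predictive accuracy: {:.2f}'.format(acc))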
67 | -------------------------------------------------------------------------------- /scripts/multinomialLogisticRegression.py: --------------------------------------------------------------------------------
1 | import pickle
2 | 
3 | import numpy as np
4 | import pandas as pd
5 | import pystan
6 | 
7 | from helper import stan_utility
8 | 
9 | model_file = "../models/multinomialLogisticRegression.stan"
10 | # https://stats.idre.ucla.edu/stata/dae/multinomiallogistic-regression/
11 | data_file = "../data/hsbdemo.dta"
12 | 
13 | data = pd.read_stata(data_file)
14 | 
15 | data = pd.get_dummies(data=data, columns=['ses', 'schtyp', 'honors'])
16 | 
17 | map_prog = {'general': 1,
18 |             'academic': 2,
19 |             'vocation': 3}
20 | data['prog'] = data['prog'].map(map_prog)
21 | 
22 | # Standardize read and write (divide by the standard deviation, not the mean)
23 | data['read'] = (data['read'] - np.mean(data['read'])) / np.std(data['read'])
24 | data['write'] = (data['write'] - np.mean(data['write'])) / \
25 |     np.std(data['write'])
26 | 
27 | data = {'N': 200,
28 |         'K': 3,
29 |         'D': 6,
30 |         'x': data[['ses_low', 'ses_middle', 'schtyp_public',
31 |                    'honors_enrolled', 'read', 'write']],
32 |         'y': data['prog']
33 |         }
34 | 
35 | sm = pystan.StanModel(file=model_file)
36 | fit = sm.sampling(data=data, control=dict(adapt_delta=0.95))
37 | print(fit)
38 | 
39 | # model diagnostics
40 | # https://github.com/betanalpha/jupyter_case_studies/blob/master/pystan_workflow/stan_utility.py
41 | stan_utility.check_all_diagnostics(fit)
42 | 
43 | # Save model for later use
44 | with open('../models/saved/multinomialLogisticRegression.pkl', 'wb') as f:
45 |     pickle.dump(sm, f)
46 | -------------------------------------------------------------------------------- /scripts/multipleLinearRegression.py: --------------------------------------------------------------------------------
1 | import pickle
2 | 
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 | import pandas as pd
6 | import pystan
7 | import seaborn as sns
8 | 
9 | from helper import psis, stan_utility
10 | 
11 | model_file = "../models/multipleLinearRegression.stan"
12 | # http://lib.stat.cmu.edu/DASL/Datafiles/Cereals.html
13 | data_file = "../data/cereals.txt"
14 | data = pd.read_table(data_file)
15 | 
16 | data = data[['fat', 'weight', 'cups', 'rating']]
17 | data = {'N': 77,
18 |         'fat': data['fat'],
19 |         'weight': data['weight'],
20 |         'cups': data['cups'],
21 |         'rating': data['rating']}
22 | 
23 | sm = pystan.StanModel(file=model_file)
24 | fit = sm.sampling(data=data, control=dict(adapt_delta=0.95))
25 | print(fit)
26 | fit.plot(['b_fat', 'b_weight', 'b_cups'])
27 | plt.show()
28 | 
29 | # model diagnostics
30 | # https://github.com/betanalpha/jupyter_case_studies/blob/master/pystan_workflow/stan_utility.py
31 | stan_utility.check_all_diagnostics(fit)
32 | 
33 | # visualize model
34 | # we'll plot a histogram of our absolute errors
35 | rating_pred = fit.extract()['rating_pred'].mean(axis=0)
36 | rating = data['rating'].values
37 | abs_err = np.abs(rating - rating_pred)
38 | sns.distplot(abs_err)
39 | plt.title("Histogram of absolute errors")
40 | plt.xlabel("Errors")
41 | plt.ylabel("Frequency")
42 | plt.show()
43 | 
44 | # Log-likelihood
45 | log_lik = fit.extract()['log_lik']
46 | print(psis.psisloo(log_lik)[0])
47 | 
48 | # Save model for later use
49 | with open('../models/saved/multipleLinearRegression.pkl', 'wb') as f:
50 |     pickle.dump(sm, f)
51 | -------------------------------------------------------------------------------- /scripts/onewayANOVA.py: --------------------------------------------------------------------------------
1 | import pickle
2 | 
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 | import pandas as pd
6 | import pystan
7 | 
8 | from helper import stan_utility
9 | 
10 | model_file = "../models/onewayANOVA.stan"
11 | # http://staff.bath.ac.uk/pssiw/stats2/page16/page16.html
12 | data_file = "../data/iqdata.csv"
13 | 
14 | data = pd.read_csv(data_file)
15 | 
16 | data = pd.get_dummies(data, columns=['group'])
17 | del data['group_1']
18 | 
19 | data = {'N': 43,
20 |         'x1': data['group_2'],
21 |         'x2': data['group_3'],
22 |         'y': data['iq']}
23 | 
24 | sm = pystan.StanModel(file=model_file)
25 | fit = sm.sampling(data=data, control=dict(adapt_delta=0.95))
26 | print(fit)
27 | fit.plot()
28 | plt.show()
29 | 
30 | # model diagnostics
31 | # https://github.com/betanalpha/jupyter_case_studies/blob/master/pystan_workflow/stan_utility.py
32 | stan_utility.check_all_diagnostics(fit)
33 | 
34 | # extract coefficients
35 | fit_dict = fit.extract()
36 | alpha = fit_dict['alpha']
37 | beta_x1 = fit_dict['beta_x1']
38 | beta_x2 = fit_dict['beta_x2']
39 | # calculate group means from coefficients
40 | mean_1 = alpha.mean(axis=0)
41 | mean_2 = alpha.mean(axis=0) + beta_x1.mean(axis=0)
42 | mean_3 = alpha.mean(axis=0) + beta_x2.mean(axis=0)
43 | print(
44 |     f'Mean of group 1: {mean_1},\nMean of group 2: {mean_2},\nMean of group 3: {mean_3}\n')
45 | # calculate the posterior distribution of the difference between the means of group 1 and 3
46 | diffs13 = alpha - (alpha + beta_x2)
47 | # 95% credible intervals
48 | diffs13_ci = np.percentile(diffs13, [2.5, 97.5], axis=0)
49 | print(
50 |     f"Estimated difference between the means of group 1 and 3: {diffs13.mean(axis=0)}\n")
51 | print(f"\t95% credible interval: ({diffs13_ci[0]}, {diffs13_ci[1]})\n")
52 | # How strongly do the data support the hypothesis that the mean of group 3 is larger than the mean of group 1?
53 | print(f"{np.sum(alpha + beta_x2 > alpha) / np.size(alpha)}")
54 | # Because the posterior probability is never exactly 1, report it as >0.999
55 | 
56 | # Save model for later use
57 | with open('../models/saved/onewayANOVA.pkl', 'wb') as f:
58 |     pickle.dump(sm, f)
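59 | 
60 | # Sketch (hedged): the probability above simplifies because alpha cancels:
61 | #   print(np.mean(beta_x2 > 0))  # P(mean of group 3 > mean of group 1)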
62 | -------------------------------------------------------------------------------- /scripts/orderedLogisticRegression.py: --------------------------------------------------------------------------------
1 | import pickle
2 | 
3 | import matplotlib.pyplot as plt
4 | import pandas as pd
5 | import pystan
6 | 
7 | from helper import stan_utility
8 | 
9 | model_file = "../models/orderedLogisticRegression.stan"
10 | # https://stats.idre.ucla.edu/stata/dae/ordered-logistic-regression/
11 | data_file = "../data/ologit.dta"
12 | 
13 | data = pd.read_stata(data_file)
14 | 
15 | x = data[['pared', 'public', 'gpa']]
16 | y = data['apply'].map({'unlikely': 1, 'somewhat likely': 2, 'very likely': 3})
17 | 
18 | data = {'N': 400,
19 |         'D': 3,
20 |         'K': 3,
21 |         'x': x,
22 |         'y': y}
23 | 
24 | sm = pystan.StanModel(file=model_file)
25 | fit = sm.sampling(data=data, control=dict(adapt_delta=0.95))
26 | print(fit)
27 | 
28 | fit.plot()
29 | plt.show()
30 | 
31 | # model diagnostics
32 | # https://github.com/betanalpha/jupyter_case_studies/blob/master/pystan_workflow/stan_utility.py
33 | stan_utility.check_all_diagnostics(fit)
34 | 
35 | # Save model for later use
36 | with open('../models/saved/orderedLogisticRegression.pkl', 'wb') as f:
37 |     pickle.dump(sm, f)
38 | -------------------------------------------------------------------------------- /scripts/robustRegression.py: --------------------------------------------------------------------------------
1 | import pickle
2 | 
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 | import pandas as pd
6 | import pystan
7 | import seaborn as sns
8 | 
9 | from helper import stan_utility
10 | 
11 | model_file = "../models/robustRegression.stan"
12 | # http://vincentarelbundock.github.io/Rdatasets/datasets.html
13 | data_file = "../data/aircraft.csv"
14 | 
15 | data = pd.read_csv(data_file)
16 | 
17 | data = {'N': 23,
18 |         'X1': data['X1'],
19 |         'X2': data['X2'],
20 |         'X3': (data['X3'] - np.mean(data['X3'])) / np.std(data['X3']),
21 |         'X4': (data['X4'] - np.mean(data['X4'])) / np.std(data['X4']),
22 |         'Y': data['Y'],
23 |         }
24 | 
25 | sm = pystan.StanModel(file=model_file)
26 | fit = sm.sampling(data=data, control=dict(adapt_delta=0.95))
27 | print(fit)
28 | fit.plot(['b_X1', 'b_X2', 'b_X3', 'b_X4'])
29 | plt.show()
30 | 
31 | # model diagnostics
32 | # https://github.com/betanalpha/jupyter_case_studies/blob/master/pystan_workflow/stan_utility.py
33 | stan_utility.check_all_diagnostics(fit)
34 | 
35 | # visualize model
36 | # we'll plot a histogram of our absolute errors
37 | Y_pred = fit.extract()['Y_pred'].mean(axis=0)
38 | Y = data['Y'].values
39 | abs_err = np.abs(Y - Y_pred)
40 | sns.distplot(abs_err)
41 | plt.title("Histogram of absolute errors")
42 | plt.xlabel("Errors")
43 | plt.ylabel("Frequency")
44 | plt.show()
45 | 
46 | # Save model for later use
47 | with open('../models/saved/robustRegression.pkl', 'wb') as f:
48 |     pickle.dump(sm, f)
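49 | 
50 | # Sketch (hedged): if robustRegression.stan uses Student-t errors with a
51 | # degrees-of-freedom parameter (commonly named `nu`; an assumption here,
52 | # check the .stan file), small posterior values indicate heavy tails:
53 | #   print(np.mean(fit.extract()['nu']))  # hypothetical parameter name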
"../models/twowayANOVA.stan" 11 | # http://staff.bath.ac.uk/pssiw/stats2/page16/page16.html 12 | data_file = "../data/drugtrial.csv" 13 | 14 | data = pd.read_csv(data_file, index_col=0) 15 | 16 | data = pd.get_dummies(data, columns=['gender', 'dose']) 17 | data.drop(columns=['gender_1', 'dose_1'], inplace=True) 18 | 19 | data = {'N': 48, 20 | 'x1': data['gender_2'], 21 | 'x2': data['dose_2'], 22 | 'y': data['score']} 23 | 24 | sm = pystan.StanModel(file=model_file) 25 | fit = sm.sampling(data=data, control=dict(adapt_delta=0.95)) 26 | print(fit) 27 | fit.plot() 28 | plt.show() 29 | 30 | # model diagnostics 31 | # https://github.com/betanalpha/jupyter_case_studies/blob/master/pystan_workflow/stan_utility.py 32 | stan_utility.check_all_diagnostics(fit) 33 | 34 | # extract coefficents 35 | fit_dict = fit.extract() 36 | alpha = fit_dict['alpha'] 37 | beta_x1 = fit_dict['beta_x1'] 38 | beta_x2 = fit_dict['beta_x2'] 39 | beta_x3 = fit_dict['beta_x3'] 40 | # calculate group means from coefficents 41 | mean_11 = alpha.mean(axis=0) 42 | mean_12 = alpha.mean(axis=0) + beta_x1.mean(axis=0) 43 | mean_21 = alpha.mean(axis=0) + beta_x2.mean(axis=0) 44 | mean_22 = alpha.mean(axis=0) + beta_x1.mean(axis=0) + \ 45 | beta_x2.mean(axis=0) + beta_x3.mean(axis=0) 46 | print( 47 | f'Mean of gender=1, dose=1: {mean_11},\n' 48 | f'Mean of gender=1, dose=2: {mean_12},\n' 49 | f'Mean of gender=2, dose=1: {mean_21},\n' 50 | f'Mean of gender=2, dose=2: {mean_22}\n') 51 | 52 | # Save model for later use 53 | with open('../models/saved/twowayANOVA.pkl', 'wb') as f: 54 | pickle.dump(sm, f) 55 | --------------------------------------------------------------------------------