├── .gitattributes
├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── bayes.png
├── case_studies
│   ├── Case Study 1 - Diabetes dataset.Rmd
│   ├── Case_Study_1_-_Diabetes_dataset.md
│   ├── data
│   │   └── diabetes.sav
│   └── figures
│       ├── cs1-unnamed-chunk-12-1.pdf
│       ├── cs1-unnamed-chunk-12-1.png
│       ├── cs1-unnamed-chunk-15-1.png
│       ├── cs1-unnamed-chunk-18-1.png
│       ├── cs1-unnamed-chunk-19-1.png
│       ├── cs1-unnamed-chunk-22-1.png
│       ├── cs1-unnamed-chunk-23-1.png
│       ├── cs1-unnamed-chunk-24-1.png
│       ├── cs1-unnamed-chunk-25-1.png
│       ├── cs1-unnamed-chunk-26-1.png
│       ├── cs1-unnamed-chunk-29-1.png
│       ├── cs1-unnamed-chunk-33-1.png
│       ├── cs1-unnamed-chunk-9-1.pdf
│       └── cs1-unnamed-chunk-9-1.png
├── ci
│   └── scripts
│       └── runAllModels.sh
├── data
│   ├── aircraft.csv
│   ├── awards.csv
│   ├── bfi.csv
│   ├── binary.dta
│   ├── cereals.txt
│   ├── child_data.csv
│   ├── drugtrial.csv
│   ├── hsbdemo.dta
│   ├── iqdata.csv
│   ├── ologit.dta
│   ├── scents.sav
│   └── temprate.sav
├── models
│   ├── linearRegression.stan
│   ├── logisticRegression.stan
│   ├── multinomialLogisticRegression.stan
│   ├── multipleLinearRegression.stan
│   ├── onewayANOVA.stan
│   ├── orderedLogisticRegression.stan
│   ├── robustRegression.stan
│   └── twowayANOVA.stan
├── notebooks
│   ├── Bayes Factor.Rmd
│   ├── Bayes_Factor.md
│   ├── Correlation Analysis.Rmd
│   ├── Correlation_Analysis.md
│   ├── Factor Analysis.Rmd
│   ├── Factor_Analysis.md
│   ├── Multiple Linear Regression with interaction terms.Rmd
│   ├── Multiple_Linear_Regression_with_interaction_terms.md
│   ├── Poisson Regression.Rmd
│   ├── Poisson_Regression.md
│   └── figures
│       ├── corr-unnamed-chunk-5-1.png
│       ├── factor-unnamed-chunk-5-1.png
│       ├── factor-unnamed-chunk-6-1.png
│       ├── multipleLin-unnamed-chunk-4-1.png
│       ├── multipleLin-unnamed-chunk-5-1.png
│       ├── poisson-unnamed-chunk-10-1.png
│       ├── poisson-unnamed-chunk-5-1.png
│       └── poisson-unnamed-chunk-9-1.png
├── requirements.txt
└── scripts
    ├── Multiple linear regression with interaction terms.py
    ├── Poisson Regression.py
    ├── helper
    │   ├── psis.py
    │   └── stan_utility.py
    ├── linearRegression.py
    ├── logisticRegression.py
    ├── multinomialLogisticRegression.py
    ├── multipleLinearRegression.py
    ├── onewayANOVA.py
    ├── orderedLogisticRegression.py
    ├── robustRegression.py
    └── twowayANOVA.py
/.gitattributes:
--------------------------------------------------------------------------------
1 | ci/* linguist-vendored
2 | data/* linguist-vendored
3 | *.ipynb linguist-language=R
4 | *.py linguist-language=R
5 | *.Rmd linguist-language=R
6 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
106 | # vscode
107 | .vscode/
108 | notebooks/.RData
109 | notebooks/.Rhistory
110 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | os:
3 | - "linux"
4 | python:
5 | - "3.6"
6 | install:
7 | - pip install -r requirements.txt
8 | before_script:
9 | - "export MPLBACKEND=Agg"
10 | - "export DISPLAY=:99.0"
11 | - "sh -e /etc/init.d/xvfb start"
12 | - sleep 3
13 | script:
14 | - sh ci/scripts/runAllModels.sh
15 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 | Preamble
9 |
10 | The GNU General Public License is a free, copyleft license for
11 | software and other kinds of works.
12 |
13 | The licenses for most software and other practical works are designed
14 | to take away your freedom to share and change the works. By contrast,
15 | the GNU General Public License is intended to guarantee your freedom to
16 | share and change all versions of a program--to make sure it remains free
17 | software for all its users. We, the Free Software Foundation, use the
18 | GNU General Public License for most of our software; it applies also to
19 | any other work released this way by its authors. You can apply it to
20 | your programs, too.
21 |
22 | When we speak of free software, we are referring to freedom, not
23 | price. Our General Public Licenses are designed to make sure that you
24 | have the freedom to distribute copies of free software (and charge for
25 | them if you wish), that you receive source code or can get it if you
26 | want it, that you can change the software or use pieces of it in new
27 | free programs, and that you know you can do these things.
28 |
29 | To protect your rights, we need to prevent others from denying you
30 | these rights or asking you to surrender the rights. Therefore, you have
31 | certain responsibilities if you distribute copies of the software, or if
32 | you modify it: responsibilities to respect the freedom of others.
33 |
34 | For example, if you distribute copies of such a program, whether
35 | gratis or for a fee, you must pass on to the recipients the same
36 | freedoms that you received. You must make sure that they, too, receive
37 | or can get the source code. And you must show them these terms so they
38 | know their rights.
39 |
40 | Developers that use the GNU GPL protect your rights with two steps:
41 | (1) assert copyright on the software, and (2) offer you this License
42 | giving you legal permission to copy, distribute and/or modify it.
43 |
44 | For the developers' and authors' protection, the GPL clearly explains
45 | that there is no warranty for this free software. For both users' and
46 | authors' sake, the GPL requires that modified versions be marked as
47 | changed, so that their problems will not be attributed erroneously to
48 | authors of previous versions.
49 |
50 | Some devices are designed to deny users access to install or run
51 | modified versions of the software inside them, although the manufacturer
52 | can do so. This is fundamentally incompatible with the aim of
53 | protecting users' freedom to change the software. The systematic
54 | pattern of such abuse occurs in the area of products for individuals to
55 | use, which is precisely where it is most unacceptable. Therefore, we
56 | have designed this version of the GPL to prohibit the practice for those
57 | products. If such problems arise substantially in other domains, we
58 | stand ready to extend this provision to those domains in future versions
59 | of the GPL, as needed to protect the freedom of users.
60 |
61 | Finally, every program is threatened constantly by software patents.
62 | States should not allow patents to restrict development and use of
63 | software on general-purpose computers, but in those that do, we wish to
64 | avoid the special danger that patents applied to a free program could
65 | make it effectively proprietary. To prevent this, the GPL assures that
66 | patents cannot be used to render the program non-free.
67 |
68 | The precise terms and conditions for copying, distribution and
69 | modification follow.
70 |
71 | TERMS AND CONDITIONS
72 |
73 | 0. Definitions.
74 |
75 | "This License" refers to version 3 of the GNU General Public License.
76 |
77 | "Copyright" also means copyright-like laws that apply to other kinds of
78 | works, such as semiconductor masks.
79 |
80 | "The Program" refers to any copyrightable work licensed under this
81 | License. Each licensee is addressed as "you". "Licensees" and
82 | "recipients" may be individuals or organizations.
83 |
84 | To "modify" a work means to copy from or adapt all or part of the work
85 | in a fashion requiring copyright permission, other than the making of an
86 | exact copy. The resulting work is called a "modified version" of the
87 | earlier work or a work "based on" the earlier work.
88 |
89 | A "covered work" means either the unmodified Program or a work based
90 | on the Program.
91 |
92 | To "propagate" a work means to do anything with it that, without
93 | permission, would make you directly or secondarily liable for
94 | infringement under applicable copyright law, except executing it on a
95 | computer or modifying a private copy. Propagation includes copying,
96 | distribution (with or without modification), making available to the
97 | public, and in some countries other activities as well.
98 |
99 | To "convey" a work means any kind of propagation that enables other
100 | parties to make or receive copies. Mere interaction with a user through
101 | a computer network, with no transfer of a copy, is not conveying.
102 |
103 | An interactive user interface displays "Appropriate Legal Notices"
104 | to the extent that it includes a convenient and prominently visible
105 | feature that (1) displays an appropriate copyright notice, and (2)
106 | tells the user that there is no warranty for the work (except to the
107 | extent that warranties are provided), that licensees may convey the
108 | work under this License, and how to view a copy of this License. If
109 | the interface presents a list of user commands or options, such as a
110 | menu, a prominent item in the list meets this criterion.
111 |
112 | 1. Source Code.
113 |
114 | The "source code" for a work means the preferred form of the work
115 | for making modifications to it. "Object code" means any non-source
116 | form of a work.
117 |
118 | A "Standard Interface" means an interface that either is an official
119 | standard defined by a recognized standards body, or, in the case of
120 | interfaces specified for a particular programming language, one that
121 | is widely used among developers working in that language.
122 |
123 | The "System Libraries" of an executable work include anything, other
124 | than the work as a whole, that (a) is included in the normal form of
125 | packaging a Major Component, but which is not part of that Major
126 | Component, and (b) serves only to enable use of the work with that
127 | Major Component, or to implement a Standard Interface for which an
128 | implementation is available to the public in source code form. A
129 | "Major Component", in this context, means a major essential component
130 | (kernel, window system, and so on) of the specific operating system
131 | (if any) on which the executable work runs, or a compiler used to
132 | produce the work, or an object code interpreter used to run it.
133 |
134 | The "Corresponding Source" for a work in object code form means all
135 | the source code needed to generate, install, and (for an executable
136 | work) run the object code and to modify the work, including scripts to
137 | control those activities. However, it does not include the work's
138 | System Libraries, or general-purpose tools or generally available free
139 | programs which are used unmodified in performing those activities but
140 | which are not part of the work. For example, Corresponding Source
141 | includes interface definition files associated with source files for
142 | the work, and the source code for shared libraries and dynamically
143 | linked subprograms that the work is specifically designed to require,
144 | such as by intimate data communication or control flow between those
145 | subprograms and other parts of the work.
146 |
147 | The Corresponding Source need not include anything that users
148 | can regenerate automatically from other parts of the Corresponding
149 | Source.
150 |
151 | The Corresponding Source for a work in source code form is that
152 | same work.
153 |
154 | 2. Basic Permissions.
155 |
156 | All rights granted under this License are granted for the term of
157 | copyright on the Program, and are irrevocable provided the stated
158 | conditions are met. This License explicitly affirms your unlimited
159 | permission to run the unmodified Program. The output from running a
160 | covered work is covered by this License only if the output, given its
161 | content, constitutes a covered work. This License acknowledges your
162 | rights of fair use or other equivalent, as provided by copyright law.
163 |
164 | You may make, run and propagate covered works that you do not
165 | convey, without conditions so long as your license otherwise remains
166 | in force. You may convey covered works to others for the sole purpose
167 | of having them make modifications exclusively for you, or provide you
168 | with facilities for running those works, provided that you comply with
169 | the terms of this License in conveying all material for which you do
170 | not control copyright. Those thus making or running the covered works
171 | for you must do so exclusively on your behalf, under your direction
172 | and control, on terms that prohibit them from making any copies of
173 | your copyrighted material outside their relationship with you.
174 |
175 | Conveying under any other circumstances is permitted solely under
176 | the conditions stated below. Sublicensing is not allowed; section 10
177 | makes it unnecessary.
178 |
179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
180 |
181 | No covered work shall be deemed part of an effective technological
182 | measure under any applicable law fulfilling obligations under article
183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
184 | similar laws prohibiting or restricting circumvention of such
185 | measures.
186 |
187 | When you convey a covered work, you waive any legal power to forbid
188 | circumvention of technological measures to the extent such circumvention
189 | is effected by exercising rights under this License with respect to
190 | the covered work, and you disclaim any intention to limit operation or
191 | modification of the work as a means of enforcing, against the work's
192 | users, your or third parties' legal rights to forbid circumvention of
193 | technological measures.
194 |
195 | 4. Conveying Verbatim Copies.
196 |
197 | You may convey verbatim copies of the Program's source code as you
198 | receive it, in any medium, provided that you conspicuously and
199 | appropriately publish on each copy an appropriate copyright notice;
200 | keep intact all notices stating that this License and any
201 | non-permissive terms added in accord with section 7 apply to the code;
202 | keep intact all notices of the absence of any warranty; and give all
203 | recipients a copy of this License along with the Program.
204 |
205 | You may charge any price or no price for each copy that you convey,
206 | and you may offer support or warranty protection for a fee.
207 |
208 | 5. Conveying Modified Source Versions.
209 |
210 | You may convey a work based on the Program, or the modifications to
211 | produce it from the Program, in the form of source code under the
212 | terms of section 4, provided that you also meet all of these conditions:
213 |
214 | a) The work must carry prominent notices stating that you modified
215 | it, and giving a relevant date.
216 |
217 | b) The work must carry prominent notices stating that it is
218 | released under this License and any conditions added under section
219 | 7. This requirement modifies the requirement in section 4 to
220 | "keep intact all notices".
221 |
222 | c) You must license the entire work, as a whole, under this
223 | License to anyone who comes into possession of a copy. This
224 | License will therefore apply, along with any applicable section 7
225 | additional terms, to the whole of the work, and all its parts,
226 | regardless of how they are packaged. This License gives no
227 | permission to license the work in any other way, but it does not
228 | invalidate such permission if you have separately received it.
229 |
230 | d) If the work has interactive user interfaces, each must display
231 | Appropriate Legal Notices; however, if the Program has interactive
232 | interfaces that do not display Appropriate Legal Notices, your
233 | work need not make them do so.
234 |
235 | A compilation of a covered work with other separate and independent
236 | works, which are not by their nature extensions of the covered work,
237 | and which are not combined with it such as to form a larger program,
238 | in or on a volume of a storage or distribution medium, is called an
239 | "aggregate" if the compilation and its resulting copyright are not
240 | used to limit the access or legal rights of the compilation's users
241 | beyond what the individual works permit. Inclusion of a covered work
242 | in an aggregate does not cause this License to apply to the other
243 | parts of the aggregate.
244 |
245 | 6. Conveying Non-Source Forms.
246 |
247 | You may convey a covered work in object code form under the terms
248 | of sections 4 and 5, provided that you also convey the
249 | machine-readable Corresponding Source under the terms of this License,
250 | in one of these ways:
251 |
252 | a) Convey the object code in, or embodied in, a physical product
253 | (including a physical distribution medium), accompanied by the
254 | Corresponding Source fixed on a durable physical medium
255 | customarily used for software interchange.
256 |
257 | b) Convey the object code in, or embodied in, a physical product
258 | (including a physical distribution medium), accompanied by a
259 | written offer, valid for at least three years and valid for as
260 | long as you offer spare parts or customer support for that product
261 | model, to give anyone who possesses the object code either (1) a
262 | copy of the Corresponding Source for all the software in the
263 | product that is covered by this License, on a durable physical
264 | medium customarily used for software interchange, for a price no
265 | more than your reasonable cost of physically performing this
266 | conveying of source, or (2) access to copy the
267 | Corresponding Source from a network server at no charge.
268 |
269 | c) Convey individual copies of the object code with a copy of the
270 | written offer to provide the Corresponding Source. This
271 | alternative is allowed only occasionally and noncommercially, and
272 | only if you received the object code with such an offer, in accord
273 | with subsection 6b.
274 |
275 | d) Convey the object code by offering access from a designated
276 | place (gratis or for a charge), and offer equivalent access to the
277 | Corresponding Source in the same way through the same place at no
278 | further charge. You need not require recipients to copy the
279 | Corresponding Source along with the object code. If the place to
280 | copy the object code is a network server, the Corresponding Source
281 | may be on a different server (operated by you or a third party)
282 | that supports equivalent copying facilities, provided you maintain
283 | clear directions next to the object code saying where to find the
284 | Corresponding Source. Regardless of what server hosts the
285 | Corresponding Source, you remain obligated to ensure that it is
286 | available for as long as needed to satisfy these requirements.
287 |
288 | e) Convey the object code using peer-to-peer transmission, provided
289 | you inform other peers where the object code and Corresponding
290 | Source of the work are being offered to the general public at no
291 | charge under subsection 6d.
292 |
293 | A separable portion of the object code, whose source code is excluded
294 | from the Corresponding Source as a System Library, need not be
295 | included in conveying the object code work.
296 |
297 | A "User Product" is either (1) a "consumer product", which means any
298 | tangible personal property which is normally used for personal, family,
299 | or household purposes, or (2) anything designed or sold for incorporation
300 | into a dwelling. In determining whether a product is a consumer product,
301 | doubtful cases shall be resolved in favor of coverage. For a particular
302 | product received by a particular user, "normally used" refers to a
303 | typical or common use of that class of product, regardless of the status
304 | of the particular user or of the way in which the particular user
305 | actually uses, or expects or is expected to use, the product. A product
306 | is a consumer product regardless of whether the product has substantial
307 | commercial, industrial or non-consumer uses, unless such uses represent
308 | the only significant mode of use of the product.
309 |
310 | "Installation Information" for a User Product means any methods,
311 | procedures, authorization keys, or other information required to install
312 | and execute modified versions of a covered work in that User Product from
313 | a modified version of its Corresponding Source. The information must
314 | suffice to ensure that the continued functioning of the modified object
315 | code is in no case prevented or interfered with solely because
316 | modification has been made.
317 |
318 | If you convey an object code work under this section in, or with, or
319 | specifically for use in, a User Product, and the conveying occurs as
320 | part of a transaction in which the right of possession and use of the
321 | User Product is transferred to the recipient in perpetuity or for a
322 | fixed term (regardless of how the transaction is characterized), the
323 | Corresponding Source conveyed under this section must be accompanied
324 | by the Installation Information. But this requirement does not apply
325 | if neither you nor any third party retains the ability to install
326 | modified object code on the User Product (for example, the work has
327 | been installed in ROM).
328 |
329 | The requirement to provide Installation Information does not include a
330 | requirement to continue to provide support service, warranty, or updates
331 | for a work that has been modified or installed by the recipient, or for
332 | the User Product in which it has been modified or installed. Access to a
333 | network may be denied when the modification itself materially and
334 | adversely affects the operation of the network or violates the rules and
335 | protocols for communication across the network.
336 |
337 | Corresponding Source conveyed, and Installation Information provided,
338 | in accord with this section must be in a format that is publicly
339 | documented (and with an implementation available to the public in
340 | source code form), and must require no special password or key for
341 | unpacking, reading or copying.
342 |
343 | 7. Additional Terms.
344 |
345 | "Additional permissions" are terms that supplement the terms of this
346 | License by making exceptions from one or more of its conditions.
347 | Additional permissions that are applicable to the entire Program shall
348 | be treated as though they were included in this License, to the extent
349 | that they are valid under applicable law. If additional permissions
350 | apply only to part of the Program, that part may be used separately
351 | under those permissions, but the entire Program remains governed by
352 | this License without regard to the additional permissions.
353 |
354 | When you convey a copy of a covered work, you may at your option
355 | remove any additional permissions from that copy, or from any part of
356 | it. (Additional permissions may be written to require their own
357 | removal in certain cases when you modify the work.) You may place
358 | additional permissions on material, added by you to a covered work,
359 | for which you have or can give appropriate copyright permission.
360 |
361 | Notwithstanding any other provision of this License, for material you
362 | add to a covered work, you may (if authorized by the copyright holders of
363 | that material) supplement the terms of this License with terms:
364 |
365 | a) Disclaiming warranty or limiting liability differently from the
366 | terms of sections 15 and 16 of this License; or
367 |
368 | b) Requiring preservation of specified reasonable legal notices or
369 | author attributions in that material or in the Appropriate Legal
370 | Notices displayed by works containing it; or
371 |
372 | c) Prohibiting misrepresentation of the origin of that material, or
373 | requiring that modified versions of such material be marked in
374 | reasonable ways as different from the original version; or
375 |
376 | d) Limiting the use for publicity purposes of names of licensors or
377 | authors of the material; or
378 |
379 | e) Declining to grant rights under trademark law for use of some
380 | trade names, trademarks, or service marks; or
381 |
382 | f) Requiring indemnification of licensors and authors of that
383 | material by anyone who conveys the material (or modified versions of
384 | it) with contractual assumptions of liability to the recipient, for
385 | any liability that these contractual assumptions directly impose on
386 | those licensors and authors.
387 |
388 | All other non-permissive additional terms are considered "further
389 | restrictions" within the meaning of section 10. If the Program as you
390 | received it, or any part of it, contains a notice stating that it is
391 | governed by this License along with a term that is a further
392 | restriction, you may remove that term. If a license document contains
393 | a further restriction but permits relicensing or conveying under this
394 | License, you may add to a covered work material governed by the terms
395 | of that license document, provided that the further restriction does
396 | not survive such relicensing or conveying.
397 |
398 | If you add terms to a covered work in accord with this section, you
399 | must place, in the relevant source files, a statement of the
400 | additional terms that apply to those files, or a notice indicating
401 | where to find the applicable terms.
402 |
403 | Additional terms, permissive or non-permissive, may be stated in the
404 | form of a separately written license, or stated as exceptions;
405 | the above requirements apply either way.
406 |
407 | 8. Termination.
408 |
409 | You may not propagate or modify a covered work except as expressly
410 | provided under this License. Any attempt otherwise to propagate or
411 | modify it is void, and will automatically terminate your rights under
412 | this License (including any patent licenses granted under the third
413 | paragraph of section 11).
414 |
415 | However, if you cease all violation of this License, then your
416 | license from a particular copyright holder is reinstated (a)
417 | provisionally, unless and until the copyright holder explicitly and
418 | finally terminates your license, and (b) permanently, if the copyright
419 | holder fails to notify you of the violation by some reasonable means
420 | prior to 60 days after the cessation.
421 |
422 | Moreover, your license from a particular copyright holder is
423 | reinstated permanently if the copyright holder notifies you of the
424 | violation by some reasonable means, this is the first time you have
425 | received notice of violation of this License (for any work) from that
426 | copyright holder, and you cure the violation prior to 30 days after
427 | your receipt of the notice.
428 |
429 | Termination of your rights under this section does not terminate the
430 | licenses of parties who have received copies or rights from you under
431 | this License. If your rights have been terminated and not permanently
432 | reinstated, you do not qualify to receive new licenses for the same
433 | material under section 10.
434 |
435 | 9. Acceptance Not Required for Having Copies.
436 |
437 | You are not required to accept this License in order to receive or
438 | run a copy of the Program. Ancillary propagation of a covered work
439 | occurring solely as a consequence of using peer-to-peer transmission
440 | to receive a copy likewise does not require acceptance. However,
441 | nothing other than this License grants you permission to propagate or
442 | modify any covered work. These actions infringe copyright if you do
443 | not accept this License. Therefore, by modifying or propagating a
444 | covered work, you indicate your acceptance of this License to do so.
445 |
446 | 10. Automatic Licensing of Downstream Recipients.
447 |
448 | Each time you convey a covered work, the recipient automatically
449 | receives a license from the original licensors, to run, modify and
450 | propagate that work, subject to this License. You are not responsible
451 | for enforcing compliance by third parties with this License.
452 |
453 | An "entity transaction" is a transaction transferring control of an
454 | organization, or substantially all assets of one, or subdividing an
455 | organization, or merging organizations. If propagation of a covered
456 | work results from an entity transaction, each party to that
457 | transaction who receives a copy of the work also receives whatever
458 | licenses to the work the party's predecessor in interest had or could
459 | give under the previous paragraph, plus a right to possession of the
460 | Corresponding Source of the work from the predecessor in interest, if
461 | the predecessor has it or can get it with reasonable efforts.
462 |
463 | You may not impose any further restrictions on the exercise of the
464 | rights granted or affirmed under this License. For example, you may
465 | not impose a license fee, royalty, or other charge for exercise of
466 | rights granted under this License, and you may not initiate litigation
467 | (including a cross-claim or counterclaim in a lawsuit) alleging that
468 | any patent claim is infringed by making, using, selling, offering for
469 | sale, or importing the Program or any portion of it.
470 |
471 | 11. Patents.
472 |
473 | A "contributor" is a copyright holder who authorizes use under this
474 | License of the Program or a work on which the Program is based. The
475 | work thus licensed is called the contributor's "contributor version".
476 |
477 | A contributor's "essential patent claims" are all patent claims
478 | owned or controlled by the contributor, whether already acquired or
479 | hereafter acquired, that would be infringed by some manner, permitted
480 | by this License, of making, using, or selling its contributor version,
481 | but do not include claims that would be infringed only as a
482 | consequence of further modification of the contributor version. For
483 | purposes of this definition, "control" includes the right to grant
484 | patent sublicenses in a manner consistent with the requirements of
485 | this License.
486 |
487 | Each contributor grants you a non-exclusive, worldwide, royalty-free
488 | patent license under the contributor's essential patent claims, to
489 | make, use, sell, offer for sale, import and otherwise run, modify and
490 | propagate the contents of its contributor version.
491 |
492 | In the following three paragraphs, a "patent license" is any express
493 | agreement or commitment, however denominated, not to enforce a patent
494 | (such as an express permission to practice a patent or covenant not to
495 | sue for patent infringement). To "grant" such a patent license to a
496 | party means to make such an agreement or commitment not to enforce a
497 | patent against the party.
498 |
499 | If you convey a covered work, knowingly relying on a patent license,
500 | and the Corresponding Source of the work is not available for anyone
501 | to copy, free of charge and under the terms of this License, through a
502 | publicly available network server or other readily accessible means,
503 | then you must either (1) cause the Corresponding Source to be so
504 | available, or (2) arrange to deprive yourself of the benefit of the
505 | patent license for this particular work, or (3) arrange, in a manner
506 | consistent with the requirements of this License, to extend the patent
507 | license to downstream recipients. "Knowingly relying" means you have
508 | actual knowledge that, but for the patent license, your conveying the
509 | covered work in a country, or your recipient's use of the covered work
510 | in a country, would infringe one or more identifiable patents in that
511 | country that you have reason to believe are valid.
512 |
513 | If, pursuant to or in connection with a single transaction or
514 | arrangement, you convey, or propagate by procuring conveyance of, a
515 | covered work, and grant a patent license to some of the parties
516 | receiving the covered work authorizing them to use, propagate, modify
517 | or convey a specific copy of the covered work, then the patent license
518 | you grant is automatically extended to all recipients of the covered
519 | work and works based on it.
520 |
521 | A patent license is "discriminatory" if it does not include within
522 | the scope of its coverage, prohibits the exercise of, or is
523 | conditioned on the non-exercise of one or more of the rights that are
524 | specifically granted under this License. You may not convey a covered
525 | work if you are a party to an arrangement with a third party that is
526 | in the business of distributing software, under which you make payment
527 | to the third party based on the extent of your activity of conveying
528 | the work, and under which the third party grants, to any of the
529 | parties who would receive the covered work from you, a discriminatory
530 | patent license (a) in connection with copies of the covered work
531 | conveyed by you (or copies made from those copies), or (b) primarily
532 | for and in connection with specific products or compilations that
533 | contain the covered work, unless you entered into that arrangement,
534 | or that patent license was granted, prior to 28 March 2007.
535 |
536 | Nothing in this License shall be construed as excluding or limiting
537 | any implied license or other defenses to infringement that may
538 | otherwise be available to you under applicable patent law.
539 |
540 | 12. No Surrender of Others' Freedom.
541 |
542 | If conditions are imposed on you (whether by court order, agreement or
543 | otherwise) that contradict the conditions of this License, they do not
544 | excuse you from the conditions of this License. If you cannot convey a
545 | covered work so as to satisfy simultaneously your obligations under this
546 | License and any other pertinent obligations, then as a consequence you may
547 | not convey it at all. For example, if you agree to terms that obligate you
548 | to collect a royalty for further conveying from those to whom you convey
549 | the Program, the only way you could satisfy both those terms and this
550 | License would be to refrain entirely from conveying the Program.
551 |
552 | 13. Use with the GNU Affero General Public License.
553 |
554 | Notwithstanding any other provision of this License, you have
555 | permission to link or combine any covered work with a work licensed
556 | under version 3 of the GNU Affero General Public License into a single
557 | combined work, and to convey the resulting work. The terms of this
558 | License will continue to apply to the part which is the covered work,
559 | but the special requirements of the GNU Affero General Public License,
560 | section 13, concerning interaction through a network will apply to the
561 | combination as such.
562 |
563 | 14. Revised Versions of this License.
564 |
565 | The Free Software Foundation may publish revised and/or new versions of
566 | the GNU General Public License from time to time. Such new versions will
567 | be similar in spirit to the present version, but may differ in detail to
568 | address new problems or concerns.
569 |
570 | Each version is given a distinguishing version number. If the
571 | Program specifies that a certain numbered version of the GNU General
572 | Public License "or any later version" applies to it, you have the
573 | option of following the terms and conditions either of that numbered
574 | version or of any later version published by the Free Software
575 | Foundation. If the Program does not specify a version number of the
576 | GNU General Public License, you may choose any version ever published
577 | by the Free Software Foundation.
578 |
579 | If the Program specifies that a proxy can decide which future
580 | versions of the GNU General Public License can be used, that proxy's
581 | public statement of acceptance of a version permanently authorizes you
582 | to choose that version for the Program.
583 |
584 | Later license versions may give you additional or different
585 | permissions. However, no additional obligations are imposed on any
586 | author or copyright holder as a result of your choosing to follow a
587 | later version.
588 |
589 | 15. Disclaimer of Warranty.
590 |
591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
599 |
600 | 16. Limitation of Liability.
601 |
602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
610 | SUCH DAMAGES.
611 |
612 | 17. Interpretation of Sections 15 and 16.
613 |
614 | If the disclaimer of warranty and limitation of liability provided
615 | above cannot be given local legal effect according to their terms,
616 | reviewing courts shall apply local law that most closely approximates
617 | an absolute waiver of all civil liability in connection with the
618 | Program, unless a warranty or assumption of liability accompanies a
619 | copy of the Program in return for a fee.
620 |
621 | END OF TERMS AND CONDITIONS
622 |
623 | How to Apply These Terms to Your New Programs
624 |
625 | If you develop a new program, and you want it to be of the greatest
626 | possible use to the public, the best way to achieve this is to make it
627 | free software which everyone can redistribute and change under these terms.
628 |
629 | To do so, attach the following notices to the program. It is safest
630 | to attach them to the start of each source file to most effectively
631 | state the exclusion of warranty; and each file should have at least
632 | the "copyright" line and a pointer to where the full notice is found.
633 |
634 | <one line to give the program's name and a brief idea of what it does.>
635 | Copyright (C) <year>  <name of author>
636 |
637 | This program is free software: you can redistribute it and/or modify
638 | it under the terms of the GNU General Public License as published by
639 | the Free Software Foundation, either version 3 of the License, or
640 | (at your option) any later version.
641 |
642 | This program is distributed in the hope that it will be useful,
643 | but WITHOUT ANY WARRANTY; without even the implied warranty of
644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
645 | GNU General Public License for more details.
646 |
647 | You should have received a copy of the GNU General Public License
648 | along with this program. If not, see <https://www.gnu.org/licenses/>.
649 |
650 | Also add information on how to contact you by electronic and paper mail.
651 |
652 | If the program does terminal interaction, make it output a short
653 | notice like this when it starts in an interactive mode:
654 |
655 | <program>  Copyright (C) <year>  <name of author>
656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657 | This is free software, and you are welcome to redistribute it
658 | under certain conditions; type `show c' for details.
659 |
660 | The hypothetical commands `show w' and `show c' should show the appropriate
661 | parts of the General Public License. Of course, your program's commands
662 | might be different; for a GUI interface, you would use an "about box".
663 |
664 | You should also get your employer (if you work as a programmer) or school,
665 | if any, to sign a "copyright disclaimer" for the program, if necessary.
666 | For more information on this, and how to apply and follow the GNU GPL, see
667 | <https://www.gnu.org/licenses/>.
668 |
669 | The GNU General Public License does not permit incorporating your program
670 | into proprietary programs. If your program is a subroutine library, you
671 | may consider it more useful to permit linking proprietary applications with
672 | the library. If this is what you want to do, use the GNU Lesser General
673 | Public License instead of this License. But first, please read
674 | <https://www.gnu.org/licenses/why-not-lgpl.html>.
675 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Statistical Modeling Examples
2 |
3 | All we need is just Bayes' theorem:
4 |
5 | ![Bayes' theorem](bayes.png)
6 | [License: GPL v3](https://github.com/mrtkp9993/Statistical-Modelling-Examples/blob/master/LICENSE)
7 | [DOI](https://zenodo.org/badge/latestdoi/143592387)
8 |
9 | ---
10 |
11 | ## Case Studies
12 |
13 | * Diabetes dataset: [Dataset info](http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/diabetes.html), [Markdown](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/case_studies/Case_Study_1_-_Diabetes_dataset.md).
14 |
15 | ## Examples
16 |
17 | The PyStan examples include these methods:
18 |
19 | * Linear Regression [Model](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/models/linearRegression.stan), [Script](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/scripts/linearRegression.py).
20 | * Multiple Linear Regression [Model](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/models/multipleLinearRegression.stan), [Script](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/scripts/multipleLinearRegression.py).
21 | * Robust Regression [Model](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/models/robustRegression.stan), [Script](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/scripts/robustRegression.py).
22 | * Logistic Regression [Model](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/models/logisticRegression.stan), [Script](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/scripts/logisticRegression.py).
23 | * Multinomial Logistic Regression [Model](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/models/multinomialLogisticRegression.stan), [Script](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/scripts/multinomialLogisticRegression.py).
24 | * Ordered Logistic Regression [Model](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/models/orderedLogisticRegression.stan), [Script](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/scripts/orderedLogisticRegression.py).
25 | * One-way ANOVA [Model](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/models/onewayANOVA.stan), [Script](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/scripts/onewayANOVA.py).
26 | * Two-way ANOVA [Model](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/models/twowayANOVA.stan), [Script](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/scripts/twowayANOVA.py).
27 |
28 | The R examples include these methods:
29 |
30 | * Factor analysis [Markdown](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/notebooks/Factor_Analysis.md).
31 | * Correlation analysis [Markdown](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/notebooks/Correlation_Analysis.md).
32 | * Multiple Linear Regression with interaction terms [Markdown](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/notebooks/Multiple_Linear_Regression_with_interaction_terms.md).
33 | * Poisson Regression [Markdown](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/notebooks/Poisson_Regression.md).
34 | * Bayes Factors [Markdown](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/notebooks/Bayes_Factor.md).
35 |
36 | ## Useful Resources
37 |
38 | ### General
39 |
40 | * Glossary of statistical terms [Link](https://www.stat.berkeley.edu/~stark/SticiGui/Text/gloss.htm).
41 | * Statistical tests with Python [Link](https://machinelearningmastery.com/statistical-hypothesis-tests-in-python-cheat-sheet/).
42 | * Michael Betancourt: “A Conceptual Introduction to Hamiltonian Monte Carlo”, 2017; arXiv:1701.02434.
43 | * Hamiltonian Monte Carlo explained, [Link](http://arogozhnikov.github.io/2016/12/19/markov_chain_monte_carlo.html).
44 |
45 | ### Stan
46 |
47 | * Stan Reference Manual [Link](https://github.com/stan-dev/stan/releases/download/v2.17.0/stan-reference-2.17.0.pdf).
48 | * PyStan Getting Started [Link](https://pystan.readthedocs.io/en/latest/getting_started.html).
49 | * Stan example models [Link](https://github.com/stan-dev/example-models/tree/master/misc).
50 | * Prior choices [Link](https://github.com/stan-dev/stan/wiki/Prior-Choice-Recommendations).
51 |
52 | ### R
53 |
54 | * R-bloggers [Link](https://www.r-bloggers.com/).
55 | * Quick-R [Link](https://www.statmethods.net/index.html).
56 |
57 |
58 | ## Datasets
59 |
60 | * R datasets [Link](https://vincentarelbundock.github.io/Rdatasets/datasets.html).
61 | * Datasets for teaching [Link](https://www.sheffield.ac.uk/mash/data).
62 |
63 | ## Books
64 |
65 | * Korner-Nievergelt, F., Korner-Nievergelt, P., Roth, T., Almasi, B., Felten, S. V., & Guélat, J. (2016). Bayesian data analysis in ecology using linear models with R, BUGS and Stan. Amsterdam: Elsevier/Academic Press.
66 |
--------------------------------------------------------------------------------
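
A note on the shared workflow: each PyStan example above pairs a Stan model in `models/` with a driver script in `scripts/` that fits it and inspects the posterior. For readers working from R, here is a minimal sketch of the same idea using rstanarm (the package the case study below relies on) and R's built-in `mtcars` data; it is illustrative only, not one of the repository's scripts.

```r
# Minimal Bayesian linear regression sketch (illustrative; not a repo script).
library(rstanarm)

fit <- stan_glm(
  mpg ~ wt + hp,            # default weakly informative priors
  data = mtcars,
  family = gaussian(),
  chains = 2, iter = 1000,  # deliberately short run, for illustration
  refresh = 0               # suppress sampler progress output
)

print(fit)                            # posterior medians and MAD-SDs
posterior_interval(fit, prob = 0.9)   # 90% credible intervals
```
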
/bayes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/bayes.png
--------------------------------------------------------------------------------
/case_studies/Case Study 1 - Diabetes dataset.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Case Study 1 - Diabetes dataset"
3 | author: "Murat Koptur"
4 | date: "`r format(Sys.time(), '%d %B %Y')`"
5 | output:
6 | rmarkdown::github_document: default
7 | ---
8 |
9 | ```{r echo=FALSE}
10 | knitr::opts_chunk$set(fig.path='figures/cs1-')
11 | ```
12 |
13 | ```{r}
14 | library(dplyr)
15 | library(fastDummies)
16 | library(GGally)
17 | library(lavaan)
18 | library(loo)
19 | library(magrittr)
20 | library(mice)
21 | library(psych)
22 | library(rstanarm)
23 | library(semPlot)
24 | ```
25 |
26 | ```{r}
27 | # http://biostat.mc.vanderbilt.edu/wiki/Main/DataSets
28 | load("./data/diabetes.sav")
29 | ```
30 |
31 | ```{r}
32 | str(diabetes)
33 | ```
34 |
35 | ```{r}
36 | # I won't use location or id in this analysis
37 | diabetes <- select(diabetes, -location, -id)
38 | ```
39 |
40 | ```{r}
41 | # Let's look at a summary of the data
42 | summary(diabetes)
43 | ```
44 |
45 | ```{r}
46 | # Investigate NA counts
47 | colSums(is.na(diabetes))
48 | ```
49 |
50 | ```{r}
51 | # The bp.2s and bp.2d variables have too many missing values
52 |
53 | # The glycosylated hemoglobin (glyhb) column has 13 NAs;
54 | # I'll drop those observations
55 | diabetes <- filter(diabetes, !is.na(glyhb))
56 | ```
57 |
58 | ```{r}
59 | # impute
60 | md.pattern(diabetes)
61 | ```
62 |
63 | ```{r results='hide'}
64 | diabetes_imp <-
65 | mice(
66 | data = diabetes,
67 | m = 5,
68 | maxit = 50,
69 | method = "pmm"
70 | )
71 | ```
72 |
73 | ```{r}
74 | # Take first imputed dataset (we have 5 imputed datasets, m=5)
75 | diabetes_completed <- complete(diabetes_imp, 1)
76 | # Investigate NA counts again
77 | colSums(is.na(diabetes_completed))
78 | ```
79 |
80 | ```{r}
81 | # correlation analysis
82 | ggcorr(diabetes_completed, label = TRUE, label_alpha = .7)
83 | ```
84 |
85 | ```{r}
86 | corr_table <-
87 | cor(diabetes_completed[, sapply(diabetes_completed, is.numeric)])
88 | subset(as.data.frame(as.table(corr_table)), abs(Freq) > 0.5)
89 | ```
90 |
91 | ```{r}
92 | # since bp.2d and bp.2s seems highly correlated with bp.1d and bp.1s and
93 | # they have a lot of missing values, I decided to discard them from analysis
94 |
95 | # also, I'll create two new variables,
96 | # BMI (body mass index) and waist-to-hip ratio
97 |
98 | diabetes_completed$bmi <-
99 | (diabetes_completed$weight / (diabetes_completed$height ** 2) * 703)
100 | diabetes_completed$waist_to_hip_rat <-
101 | diabetes_completed$waist / diabetes_completed$hip
102 |
103 | # take a subset of uncorrelated variables
104 | diabetes_completed_subset <- select(
105 | diabetes_completed,
106 | chol,
107 | ratio,
108 | glyhb,
109 | age,
110 | gender,
111 | bmi,
112 | waist_to_hip_rat,
113 | frame,
114 | bp.1s,
115 | bp.1d,
116 | time.ppn
117 | )
118 | head(diabetes_completed_subset)
119 | ```
120 |
121 | ```{r}
122 | # pairs plot
123 | ggpairs(diabetes_completed_subset)
124 | ```
125 |
126 | ```{r}
127 | # standardize all numeric variables
128 | diabetes_completed_subset %<>%
129 | mutate_at(
130 | funs(scale),
131 | .vars = c(
132 | "chol",
133 | "ratio",
134 | "glyhb",
135 | "age",
136 | "bmi",
137 | "waist_to_hip_rat",
138 | "bp.1s",
139 | "bp.1d",
140 | "time.ppn"
141 | )
142 | )
143 | ```
144 |
145 | ```{r}
146 | # Create dummy variables for gender and frame
147 | library(fastDummies)
148 | diabetes_completed_subset <-
149 | dummy_cols(diabetes_completed_subset, remove_first_dummy = TRUE)
150 | diabetes_completed_subset <-
151 | select(diabetes_completed_subset,-gender,-frame)
152 | head(diabetes_completed_subset)
153 | ```
154 |
155 | ```{r}
156 | # Exploratory factor analysis: parallel analysis to choose the number of factors
157 | fa.parallel(select(diabetes_completed_subset,-glyhb))
158 | ```
159 |
160 | ```{r}
161 | diabetes_completed_subset_fi <-
162 | fa(
163 | select(diabetes_completed_subset,-glyhb),
164 | nfactors = 6,
165 | fm = "pa",
166 | max.iter = 200
167 | )
168 | fa.diagram(diabetes_completed_subset_fi)
169 | ```
170 |
171 | ```{r}
172 | fl <- round(unclass(diabetes_completed_subset_fi$loadings), 2)
173 | fl
174 | ```
175 |
176 | ```{r}
177 | # Let's start building models: Bayesian GLMs via rstanarm::stan_glm
178 | model1 <- stan_glm('glyhb ~ .', data = diabetes_completed_subset)
179 | model1
180 | summary(model1)
181 | ```
182 |
183 | ```{r}
184 | par(mfrow = c(2, 2), mar = c(3, 5, 3, 3))
185 | plot(model1)
186 | ```
187 |
188 | ```{r}
189 | model2 <-
190 | stan_glm('glyhb ~ ratio + age', data = diabetes_completed_subset)
191 | model2
192 | summary(model2)
193 |
194 | par(mfrow = c(2, 2), mar = c(3, 5, 3, 3))
195 | plot(model2)
196 | ```
197 |
198 | ```{r}
199 | model3 <-
200 | stan_glm('glyhb ~ bmi + waist_to_hip_rat', data = diabetes_completed_subset)
201 | model3
202 | summary(model3)
203 |
204 | par(mfrow = c(2, 2), mar = c(3, 5, 3, 3))
205 | plot(model3)
206 | ```
207 |
208 | ```{r}
209 | model4 <-
210 | stan_glm('glyhb ~ ratio + age + bmi + waist_to_hip_rat', data = diabetes_completed_subset)
211 | model4
212 | summary(model4)
213 |
214 | par(mfrow = c(2, 2), mar = c(3, 5, 3, 3))
215 | plot(model4)
216 | ```
217 |
218 | ```{r}
219 | model5 <-
220 | stan_glm('glyhb ~ ratio + age + bmi', data = diabetes_completed_subset)
221 | model5
222 | summary(model5)
223 |
224 | par(mfrow = c(2, 2), mar = c(3, 5, 3, 3))
225 | plot(model5)
226 | ```
227 |
228 | ```{r}
229 | ic <- data.frame(
230 | Model = c("model1", "model2", "model3", "model4", "model5"),
231 | WAIC = c(waic(model1)$estimates[3,1], waic(model2)$estimates[3,1], waic(model3)$estimates[3,1], waic(model4)$estimates[3,1], waic(model5)$estimates[3,1]),
232 | stringsAsFactors = FALSE
233 | )
234 | ic
235 | ```
236 |
237 | ```{r}
238 | # Let's build a SEM model
239 | library(lavaan)
240 | semModel1 <- '
241 | pa1 =~ age
242 | pa2 =~ bp.1d + bp.1s
243 | pa3 =~ bmi + frame_large + frame_small
244 | pa4 =~ gender_male + waist_to_hip_rat
245 | pa5 =~ ratio + chol
246 | pa6 =~ time.ppn
247 |
248 | glyhb ~ pa1 + pa2 + pa3 + pa4 + pa5 + pa6
249 | '
250 | fit1 <- sem(semModel1,
251 | data = diabetes_completed_subset)
252 | fit1
253 | ```
254 |
255 | ```{r}
256 | semPaths(fit1)
257 | ```
258 |
259 | ```{r}
260 | summary(fit1, standardized = TRUE, fit.measures = TRUE)
261 | ```
262 |
263 | ```{r}
264 | parameterEstimates(fit1)
265 | ```
266 |
267 | ```{r}
268 | # Second SEM model
269 | semModel2 <- '
270 | pa1 =~ age
271 | pa5 =~ ratio + chol
272 |
273 | glyhb ~ pa1 + pa5
274 | '
275 | fit2 <- sem(semModel2,
276 | data = diabetes_completed_subset)
277 | fit2
278 | ```
279 |
280 | ```{r}
281 | semPaths(fit2)
282 | ```
283 |
284 | ```{r}
285 | summary(fit2, standardized = TRUE, fit.measures = TRUE)
286 | ```
287 |
288 | ```{r}
289 | parameterEstimates(fit2)
290 | ```
--------------------------------------------------------------------------------
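
A caveat on the imputation step in the case study above: `complete(diabetes_imp, 1)` keeps only the first of the five imputed datasets, which ignores between-imputation uncertainty. mice can instead refit a model on every imputed dataset and pool the estimates with Rubin's rules. A minimal sketch using the `nhanes` data bundled with mice (illustrative; not part of the case study):

```r
# Fit on all imputations and pool, rather than analyzing one completed dataset.
library(mice)

imp  <- mice(nhanes, m = 5, method = "pmm", seed = 123, printFlag = FALSE)
fits <- with(imp, lm(chl ~ age + bmi))  # refit the same model on each imputed dataset
summary(pool(fits))                     # combine estimates via Rubin's rules
```
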
/case_studies/Case_Study_1_-_Diabetes_dataset.md:
--------------------------------------------------------------------------------
1 | Case Study 1 - Diabetes dataset
2 | ================
3 | Murat Koptur
4 | 26 August 2018
5 |
6 | ``` r
7 | library(dplyr)
8 | library(fastDummies)
9 | library(GGally)
10 | library(lavaan)
11 | library(loo)
12 | library(magrittr)
13 | library(mice)
14 | library(psych)
15 | library(rstanarm)
16 | library(semPlot)
17 | ```
18 |
19 | ``` r
20 | # http://biostat.mc.vanderbilt.edu/wiki/Main/DataSets
21 | load("./data/diabetes.sav")
22 | ```
23 |
24 | ``` r
25 | str(diabetes)
26 | ```
27 |
28 | ## 'data.frame': 403 obs. of 19 variables:
29 | ## $ id : 'labelled' int 1000 1001 1002 1003 1005 1008 1011 1015 1016 1022 ...
30 | ## ..- attr(*, "label")= chr "Subject ID"
31 | ## $ chol : 'labelled' int 203 165 228 78 249 248 195 227 177 263 ...
32 | ## ..- attr(*, "label")= chr "Total Cholesterol"
33 | ## $ stab.glu: 'labelled' int 82 97 92 93 90 94 92 75 87 89 ...
34 | ## ..- attr(*, "label")= chr "Stabilized Glucose"
35 | ## $ hdl : 'labelled' int 56 24 37 12 28 69 41 44 49 40 ...
36 | ## ..- attr(*, "label")= chr "High Density Lipoprotein"
37 | ## $ ratio : 'labelled' num 3.6 6.9 6.2 6.5 8.9 ...
38 | ## ..- attr(*, "label")= chr "Cholesterol/HDL Ratio"
39 | ## $ glyhb : 'labelled' num 4.31 4.44 4.64 4.63 7.72 ...
40 | ## ..- attr(*, "label")= chr "Glycosolated Hemoglobin"
41 | ## $ location: Factor w/ 2 levels "Buckingham","Louisa": 1 1 1 1 1 1 1 1 1 1 ...
42 | ## $ age : int 46 29 58 67 64 34 30 37 45 55 ...
43 | ## ..- attr(*, "units")= chr "years"
44 | ## $ gender : Factor w/ 2 levels "male","female": 2 2 2 1 1 1 1 1 1 2 ...
45 | ## $ height : int 62 64 61 67 68 71 69 59 69 63 ...
46 | ## ..- attr(*, "units")= chr "inches"
47 | ## $ weight : int 121 218 256 119 183 190 191 170 166 202 ...
48 | ## ..- attr(*, "units")= chr "pounds"
49 | ## $ frame : Factor w/ 3 levels "small","medium",..: 2 3 3 3 2 3 2 2 3 1 ...
50 | ## $ bp.1s : 'labelled' int 118 112 190 110 138 132 161 NA 160 108 ...
51 | ## ..- attr(*, "label")= chr "First Systolic Blood Pressure"
52 | ## $ bp.1d : 'labelled' int 59 68 92 50 80 86 112 NA 80 72 ...
53 | ## ..- attr(*, "label")= chr "First Diastolic Blood Pressure"
54 | ## $ bp.2s : 'labelled' int NA NA 185 NA NA NA 161 NA 128 NA ...
55 | ## ..- attr(*, "label")= chr "Second Systolic Blood Pressure"
56 | ## ..- attr(*, "comment")= chr "equals first measurement if it was not high"
57 | ## $ bp.2d : 'labelled' int NA NA 92 NA NA NA 112 NA 86 NA ...
58 | ## ..- attr(*, "comment")= chr "equals first measurement if it was not high"
59 | ## ..- attr(*, "label")= chr "Second Diastolic Blood Pressure"
60 | ## $ waist : int 29 46 49 33 44 36 46 34 34 45 ...
61 | ## ..- attr(*, "units")= chr "inches"
62 | ## $ hip : int 38 48 57 38 41 42 49 39 40 50 ...
63 | ## ..- attr(*, "units")= chr "inches"
64 | ## $ time.ppn: 'labelled' int 720 360 180 480 300 195 720 1020 300 240 ...
65 | ## ..- attr(*, "label")= chr "Postprandial Time when Labs were Drawn"
66 | ## ..- attr(*, "units")= chr "minutes"
67 |
68 | ``` r
69 | # I won't use location or id in this analysis
70 | diabetes <- select(diabetes, -location, -id)
71 | ```
72 |
73 | ``` r
74 | # Let's look at summary of data
75 | summary(diabetes)
76 | ```
77 |
78 | ## chol stab.glu hdl ratio
79 | ## Min. : 78.0 Min. : 48.0 Min. : 12.00 Min. : 1.500
80 | ## 1st Qu.:179.0 1st Qu.: 81.0 1st Qu.: 38.00 1st Qu.: 3.200
81 | ## Median :204.0 Median : 89.0 Median : 46.00 Median : 4.200
82 | ## Mean :207.8 Mean :106.7 Mean : 50.45 Mean : 4.522
83 | ## 3rd Qu.:230.0 3rd Qu.:106.0 3rd Qu.: 59.00 3rd Qu.: 5.400
84 | ## Max. :443.0 Max. :385.0 Max. :120.00 Max. :19.300
85 | ## NA's :1 NA's :1 NA's :1
86 | ## glyhb age gender height
87 | ## Min. : 2.68 Min. :19.00 male :169 Min. :52.00
88 | ## 1st Qu.: 4.38 1st Qu.:34.00 female:234 1st Qu.:63.00
89 | ## Median : 4.84 Median :45.00 Median :66.00
90 | ## Mean : 5.59 Mean :46.85 Mean :66.02
91 | ## 3rd Qu.: 5.60 3rd Qu.:60.00 3rd Qu.:69.00
92 | ## Max. :16.11 Max. :92.00 Max. :76.00
93 | ## NA's :13 NA's :5
94 | ## weight frame bp.1s bp.1d
95 | ## Min. : 99.0 small :104 Min. : 90.0 Min. : 48.00
96 | ## 1st Qu.:151.0 medium:184 1st Qu.:121.2 1st Qu.: 75.00
97 | ## Median :172.5 large :103 Median :136.0 Median : 82.00
98 | ## Mean :177.6 NA's : 12 Mean :136.9 Mean : 83.32
99 | ## 3rd Qu.:200.0 3rd Qu.:146.8 3rd Qu.: 90.00
100 | ## Max. :325.0 Max. :250.0 Max. :124.00
101 | ## NA's :1 NA's :5 NA's :5
102 | ## bp.2s bp.2d waist hip
103 | ## Min. :110.0 Min. : 60.00 Min. :26.0 Min. :30.00
104 | ## 1st Qu.:138.0 1st Qu.: 84.00 1st Qu.:33.0 1st Qu.:39.00
105 | ## Median :149.0 Median : 92.00 Median :37.0 Median :42.00
106 | ## Mean :152.4 Mean : 92.52 Mean :37.9 Mean :43.04
107 | ## 3rd Qu.:161.0 3rd Qu.:100.00 3rd Qu.:41.0 3rd Qu.:46.00
108 | ## Max. :238.0 Max. :124.00 Max. :56.0 Max. :64.00
109 | ## NA's :262 NA's :262 NA's :2 NA's :2
110 | ## time.ppn
111 | ## Min. : 5.0
112 | ## 1st Qu.: 90.0
113 | ## Median : 240.0
114 | ## Mean : 341.2
115 | ## 3rd Qu.: 517.5
116 | ## Max. :1560.0
117 | ## NA's :3
118 |
119 | ``` r
120 | # Investigate NA counts
121 | colSums(is.na(diabetes))
122 | ```
123 |
124 | ## chol stab.glu hdl ratio glyhb age gender height
125 | ## 1 0 1 1 13 0 0 5
126 | ## weight frame bp.1s bp.1d bp.2s bp.2d waist hip
127 | ## 1 12 5 5 262 262 2 2
128 | ## time.ppn
129 | ## 3
130 |
131 | ``` r
132 | # bp.2s and bp.2d have too many missing values
133 |
134 | # The glycosylated hemoglobin (glyhb) column has 13 NAs;
135 | # I'll drop these observations
136 | diabetes <- filter(diabetes, !is.na(glyhb))
137 | ```
138 |
139 | ``` r
140 | # Inspect the missing-data pattern before imputing
141 | md.pattern(diabetes)
142 | ```
143 |
144 | 
145 |
146 | ## stab.glu glyhb age gender chol hdl ratio weight waist hip time.ppn
147 | ## 130 1 1 1 1 1 1 1 1 1 1 1
148 | ## 236 1 1 1 1 1 1 1 1 1 1 1
149 | ## 6 1 1 1 1 1 1 1 1 1 1 1
150 | ## 3 1 1 1 1 1 1 1 1 1 1 1
151 | ## 3 1 1 1 1 1 1 1 1 1 1 1
152 | ## 4 1 1 1 1 1 1 1 1 1 1 1
153 | ## 1 1 1 1 1 1 1 1 1 1 1 1
154 | ## 1 1 1 1 1 1 1 1 1 1 1 0
155 | ## 1 1 1 1 1 1 1 1 1 1 1 0
156 | ## 1 1 1 1 1 1 1 1 1 1 1 0
157 | ## 1 1 1 1 1 1 1 1 1 0 0 1
158 | ## 1 1 1 1 1 1 1 1 1 0 0 1
159 | ## 1 1 1 1 1 1 1 1 0 1 1 1
160 | ## 1 1 1 1 1 0 0 0 1 1 1 1
161 | ## 0 0 0 0 1 1 1 1 2 2 3
162 | ## height bp.1s bp.1d frame bp.2s bp.2d
163 | ## 130 1 1 1 1 1 1 0
164 | ## 236 1 1 1 1 0 0 2
165 | ## 6 1 1 1 0 1 1 1
166 | ## 3 1 1 1 0 0 0 3
167 | ## 3 1 0 0 1 0 0 4
168 | ## 4 0 1 1 1 0 0 3
169 | ## 1 0 0 0 0 0 0 6
170 | ## 1 1 1 1 1 1 1 1
171 | ## 1 1 1 1 0 0 0 4
172 | ## 1 1 0 0 1 0 0 5
173 | ## 1 1 1 1 1 1 1 2
174 | ## 1 1 1 1 1 0 0 4
175 | ## 1 1 1 1 1 0 0 3
176 | ## 1 1 1 1 1 0 0 5
177 | ## 5 5 5 11 252 252 541
178 |
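Each row of the pattern matrix is one missingness pattern (1 = observed, 0 = missing), with its frequency on the left; the bottom row counts missing values per variable. It confirms that, even after dropping the glyhb NAs, bp.2s and bp.2d account for most of the remaining missingness (252 missing values each, out of 541 in total).
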
179 | ``` r
180 | diabetes_imp <-
181 | mice(
182 | data = diabetes,
183 | m = 5,
184 | maxit = 50,
185 | method = "pmm"
186 | )
187 | ```
188 |
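Only the first completed dataset is carried forward below. For reference, mice can also fit the same model on each of the five imputations and pool the results with Rubin's rules; a minimal sketch (not run here, with an illustrative formula):

``` r
# Fit one regression per completed dataset, then pool the
# coefficient estimates across imputations
fits <- with(diabetes_imp, lm(glyhb ~ age + ratio))
summary(pool(fits))
```
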
189 | ``` r
190 | # Take the first of the five imputed datasets (m = 5)
191 | diabetes_completed <- complete(diabetes_imp, 1)
192 | ```
193 |
194 | ``` r
195 | # Investigate NA counts again
196 | colSums(is.na(diabetes_completed))
197 | ```
198 |
199 | ## chol stab.glu hdl ratio glyhb age gender height
200 | ## 0 0 0 0 0 0 0 0
201 | ## weight frame bp.1s bp.1d bp.2s bp.2d waist hip
202 | ## 0 0 0 0 0 0 0 0
203 | ## time.ppn
204 | ## 0
205 |
206 | ``` r
207 | # correlation analysis
208 | ggcorr(diabetes_completed, label = TRUE, label_alpha = .7)
209 | ```
210 |
211 | 
212 |
213 | ``` r
214 | corr_table <-
215 | cor(diabetes_completed[, sapply(diabetes_completed, is.numeric)])
216 | subset(as.data.frame(as.table(corr_table)), abs(Freq) > 0.5)
217 | ```
218 |
219 | ## Var1 Var2 Freq
220 | ## 1 chol chol 1.0000000
221 | ## 17 stab.glu stab.glu 1.0000000
222 | ## 20 glyhb stab.glu 0.7492355
223 | ## 33 hdl hdl 1.0000000
224 | ## 34 ratio hdl -0.6826599
225 | ## 48 hdl ratio -0.6826599
226 | ## 49 ratio ratio 1.0000000
227 | ## 62 stab.glu glyhb 0.7492355
228 | ## 65 glyhb glyhb 1.0000000
229 | ## 81 age age 1.0000000
230 | ## 97 height height 1.0000000
231 | ## 113 weight weight 1.0000000
232 | ## 118 waist weight 0.8522011
233 | ## 119 hip weight 0.8307025
234 | ## 129 bp.1s bp.1s 1.0000000
235 | ## 130 bp.1d bp.1s 0.6054981
236 | ## 131 bp.2s bp.1s 0.8778776
237 | ## 132 bp.2d bp.1s 0.5162788
238 | ## 144 bp.1s bp.1d 0.6054981
239 | ## 145 bp.1d bp.1d 1.0000000
240 | ## 146 bp.2s bp.1d 0.5814284
241 | ## 147 bp.2d bp.1d 0.8272843
242 | ## 159 bp.1s bp.2s 0.8778776
243 | ## 160 bp.1d bp.2s 0.5814284
244 | ## 161 bp.2s bp.2s 1.0000000
245 | ## 162 bp.2d bp.2s 0.5746704
246 | ## 174 bp.1s bp.2d 0.5162788
247 | ## 175 bp.1d bp.2d 0.8272843
248 | ## 176 bp.2s bp.2d 0.5746704
249 | ## 177 bp.2d bp.2d 1.0000000
250 | ## 188 weight waist 0.8522011
251 | ## 193 waist waist 1.0000000
252 | ## 194 hip waist 0.8341216
253 | ## 203 weight hip 0.8307025
254 | ## 208 waist hip 0.8341216
255 | ## 209 hip hip 1.0000000
256 | ## 225 time.ppn time.ppn 1.0000000
257 |
258 | ``` r
259 | # Since bp.2s and bp.2d are highly correlated with bp.1s and bp.1d and
260 | # have many missing values, I decided to discard them from the analysis.
261 |
262 | # I'll also create two new variables:
263 | # BMI (body mass index) and waist-to-hip ratio
264 |
265 | diabetes_completed$bmi <-
266 | (diabetes_completed$weight / (diabetes_completed$height ** 2) * 703)
267 | diabetes_completed$waist_to_hip_rat <-
268 | diabetes_completed$waist / diabetes_completed$hip
269 |
270 | # Take a subset of variables that are not strongly correlated
271 | diabetes_completed_subset <- select(
272 | diabetes_completed,
273 | chol,
274 | ratio,
275 | glyhb,
276 | age,
277 | gender,
278 | bmi,
279 | waist_to_hip_rat,
280 | frame,
281 | bp.1s,
282 | bp.1d,
283 | time.ppn
284 | )
285 | head(diabetes_completed_subset)
286 | ```
287 |
288 | ## chol ratio glyhb age gender bmi waist_to_hip_rat frame bp.1s bp.1d
289 | ## 1 203 3.6 4.31 46 female 22.12877 0.7631579 medium 118 59
290 | ## 2 165 6.9 4.44 29 female 37.41553 0.9583333 large 112 68
291 | ## 3 228 6.2 4.64 58 female 48.36549 0.8596491 large 190 92
292 | ## 4 78 6.5 4.63 67 male 18.63600 0.8684211 large 110 50
293 | ## 5 249 8.9 7.72 64 male 27.82202 1.0731707 medium 138 80
294 | ## 6 248 3.6 4.81 34 male 26.49673 0.8571429 large 132 86
295 | ## time.ppn
296 | ## 1 720
297 | ## 2 360
298 | ## 3 180
299 | ## 4 480
300 | ## 5 300
301 | ## 6 195
302 |
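As a quick sanity check on the BMI formula (703 × weight in pounds / height in inches squared), the first row above can be reproduced by hand:

``` r
# Subject 1: weight 121 lb, height 62 in
703 * 121 / 62^2  # 22.12877, matching bmi in row 1 above
```
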
303 | ``` r
304 | # pairs plot
305 | ggpairs(diabetes_completed_subset)
306 | ```
307 |
308 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
309 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
310 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
311 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
312 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
313 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
314 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
315 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
316 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
317 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
318 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
319 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
320 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
321 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
322 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
323 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
324 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
325 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
326 |
327 | 
328 |
329 | ``` r
330 | # Standardize all numeric variables
331 | diabetes_completed_subset %<>%
332 | mutate_at(
333 | funs(scale),
334 | .vars = c(
335 | "chol",
336 | "ratio",
337 | "glyhb",
338 | "age",
339 | "bmi",
340 | "waist_to_hip_rat",
341 | "bp.1s",
342 | "bp.1d",
343 | "time.ppn"
344 | )
345 | )
346 | ```
347 |
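Here `scale()` simply centers each column and divides by its sample standard deviation; a minimal equivalence check on one column:

``` r
z <- (diabetes_completed$age - mean(diabetes_completed$age)) /
  sd(diabetes_completed$age)
all.equal(z, as.numeric(scale(diabetes_completed$age)))  # TRUE
```
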
348 | ``` r
349 | # Create dummy variables for gender and frame
350 | library(fastDummies)
351 | diabetes_completed_subset <-
352 | dummy_cols(diabetes_completed_subset, remove_first_dummy = TRUE)
353 | diabetes_completed_subset <-
354 | select(diabetes_completed_subset,-gender,-frame)
355 | head(diabetes_completed_subset)
356 | ```
357 |
358 | ## chol ratio glyhb age bmi
359 | ## 1 -0.09319585 -0.5301616 -0.5706645 -0.04711384 -0.9973448
360 | ## 2 -0.94314197 1.3678022 -0.5126959 -1.08143433 1.3055456
361 | ## 3 0.46597923 0.9652036 -0.4235136 0.68299474 2.9551156
362 | ## 4 -2.88907124 1.1377459 -0.4279726 1.23057617 -1.5235175
363 | ## 5 0.93568630 2.5180829 0.9498954 1.04804903 -0.1396797
364 | ## 6 0.91331929 -0.5301616 -0.3477085 -0.77722242 -0.3393293
365 | ## waist_to_hip_rat bp.1s bp.1d time.ppn gender_male
366 | ## 1 -1.6083402 -0.82988906 -1.7821860 1.25031434 0
367 | ## 2 1.0550300 -1.09181790 -1.1181935 0.08200624 0
368 | ## 3 -0.2916179 2.31325699 0.6524530 -0.50214781 0
369 | ## 4 -0.1719158 -1.17912751 -2.4461784 0.47144227 1
370 | ## 5 2.6221047 0.04320706 -0.2328703 -0.11271177 1
371 | ## 6 -0.3258185 -0.21872177 0.2097913 -0.45346830 1
372 | ## frame_large frame_small
373 | ## 1 0 0
374 | ## 2 1 0
375 | ## 3 1 0
376 | ## 4 1 0
377 | ## 5 0 0
378 | ## 6 1 0
379 |
380 | ``` r
381 | # Exploratory factor analysis
382 | fa.parallel(select(diabetes_completed_subset,-glyhb))
383 | ```
384 |
385 | 
386 |
387 | ## Parallel analysis suggests that the number of factors = 6 and the number of components = 4
388 |
389 | ``` r
390 | diabetes_completed_subset_fi <-
391 | fa(
392 | select(diabetes_completed_subset,-glyhb),
393 | nfactors = 6,
394 | fm = "pa",
395 | max.iter = 200
396 | )
397 | ```
398 |
399 | ``` r
400 | fa.diagram(diabetes_completed_subset_fi)
401 | ```
402 |
403 | 
404 |
405 | ``` r
406 | fl <- round(unclass(diabetes_completed_subset_fi$loadings), 2)
407 | fl
408 | ```
409 |
410 | ## PA2 PA3 PA1 PA5 PA4 PA6
411 | ## chol 0.07 -0.10 0.05 0.75 -0.12 0.09
412 | ## ratio -0.08 0.17 -0.01 0.67 0.19 -0.12
413 | ## age -0.02 -0.02 0.99 0.02 -0.01 0.00
414 | ## bmi 0.06 0.84 -0.06 0.03 -0.15 -0.04
415 | ## waist_to_hip_rat 0.01 0.19 0.18 0.08 0.47 -0.05
416 | ## bp.1s 0.58 0.05 0.38 0.02 -0.02 0.00
417 | ## bp.1d 0.98 0.01 -0.07 0.00 0.03 0.00
418 | ## time.ppn -0.09 -0.04 -0.10 0.04 -0.03 0.36
419 | ## gender_male 0.06 -0.15 -0.04 0.00 0.79 0.04
420 | ## frame_large -0.07 0.49 0.15 -0.09 0.31 0.18
421 | ## frame_small -0.05 -0.42 -0.03 -0.14 -0.13 -0.29
422 |
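The six factors line up with interpretable clusters: blood pressure on PA2, body size and frame on PA3, the two cholesterol measures on PA5, gender and waist-to-hip ratio on PA4, while age (PA1) and time.ppn (PA6) each get a factor of their own. This grouping is reused as the measurement model for the SEM below.
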
423 | ``` r
424 | # Let's start building models
425 | model1 <- stan_glm('glyhb ~ .', data = diabetes_completed_subset)
426 | ```
427 |
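All models here use stan_glm's default weakly informative priors, and since the predictors were standardized above, the coefficients are on a comparable scale. The priors actually used can be inspected with rstanarm's `prior_summary()`; a minimal sketch:

``` r
prior_summary(model1)
```
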
428 | ``` r
429 | model1
430 | ```
431 |
432 | ## stan_glm
433 | ## family: gaussian [identity]
434 | ## formula: "glyhb ~ ."
435 | ## observations: 390
436 | ## predictors: 12
437 | ## ------
438 | ## Median MAD_SD
439 | ## (Intercept) 0.0 0.1
440 | ## chol 0.1 0.1
441 | ## ratio 0.2 0.1
442 | ## age 0.3 0.1
443 | ## bmi 0.1 0.1
444 | ## waist_to_hip_rat 0.0 0.1
445 | ## bp.1s 0.1 0.1
446 | ## bp.1d 0.0 0.1
447 | ## time.ppn 0.1 0.0
448 | ## gender_male 0.0 0.1
449 | ## frame_large 0.0 0.1
450 | ## frame_small 0.0 0.1
451 | ## sigma 0.9 0.0
452 | ##
453 | ## Sample avg. posterior predictive distribution of y:
454 | ## Median MAD_SD
455 | ## mean_PPD 0.0 0.1
456 | ##
457 | ## ------
458 | ## For info on the priors used see help('prior_summary.stanreg').
459 |
460 | ``` r
461 | summary(model1)
462 | ```
463 |
464 | ##
465 | ## Model Info:
466 | ##
467 | ## function: stan_glm
468 | ## family: gaussian [identity]
469 | ## formula: "glyhb ~ ."
470 | ## algorithm: sampling
471 | ## priors: see help('prior_summary')
472 | ## sample: 4000 (posterior sample size)
473 | ## observations: 390
474 | ## predictors: 12
475 | ##
476 | ## Estimates:
477 | ## mean sd 2.5% 25% 50% 75% 97.5%
478 | ## (Intercept) 0.0 0.1 -0.2 -0.1 0.0 0.1 0.2
479 | ## chol 0.1 0.1 -0.1 0.0 0.1 0.1 0.2
480 | ## ratio 0.2 0.1 0.1 0.2 0.2 0.3 0.3
481 | ## age 0.3 0.1 0.1 0.2 0.3 0.3 0.4
482 | ## bmi 0.1 0.1 0.0 0.0 0.1 0.1 0.2
483 | ## waist_to_hip_rat 0.0 0.1 -0.1 0.0 0.0 0.1 0.2
484 | ## bp.1s 0.1 0.1 -0.1 0.0 0.1 0.1 0.2
485 | ## bp.1d 0.0 0.1 -0.2 -0.1 0.0 0.0 0.1
486 | ## time.ppn 0.1 0.0 0.0 0.0 0.1 0.1 0.1
487 | ## gender_male 0.0 0.1 -0.2 0.0 0.0 0.1 0.2
488 | ## frame_large 0.0 0.1 -0.3 -0.1 0.0 0.0 0.2
489 | ## frame_small 0.0 0.1 -0.2 -0.1 0.0 0.1 0.2
490 | ## sigma 0.9 0.0 0.8 0.9 0.9 0.9 1.0
491 | ## mean_PPD 0.0 0.1 -0.1 0.0 0.0 0.0 0.1
492 | ## log-posterior -529.0 2.6 -534.8 -530.6 -528.6 -527.1 -524.9
493 | ##
494 | ## Diagnostics:
495 | ## mcse Rhat n_eff
496 | ## (Intercept) 0.0 1.0 4000
497 | ## chol 0.0 1.0 4000
498 | ## ratio 0.0 1.0 4000
499 | ## age 0.0 1.0 4000
500 | ## bmi 0.0 1.0 4000
501 | ## waist_to_hip_rat 0.0 1.0 4000
502 | ## bp.1s 0.0 1.0 3638
503 | ## bp.1d 0.0 1.0 3939
504 | ## time.ppn 0.0 1.0 4000
505 | ## gender_male 0.0 1.0 4000
506 | ## frame_large 0.0 1.0 4000
507 | ## frame_small 0.0 1.0 4000
508 | ## sigma 0.0 1.0 4000
509 | ## mean_PPD 0.0 1.0 4000
510 | ## log-posterior 0.1 1.0 1764
511 | ##
512 | ## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
513 |
514 | ``` r
515 | par(mfrow = c(2, 2), mar = c(3, 5, 3, 3))
516 | plot(model1)
517 | ```
518 |
519 | 
520 |
521 | ``` r
522 | model2 <-
523 | stan_glm('glyhb ~ ratio + age', data = diabetes_completed_subset)
524 | ```
525 |
526 | ``` r
527 | model2
528 | ```
529 |
530 | ## stan_glm
531 | ## family: gaussian [identity]
532 | ## formula: "glyhb ~ ratio + age"
533 | ## observations: 390
534 | ## predictors: 3
535 | ## ------
536 | ## Median MAD_SD
537 | ## (Intercept) 0.0 0.0
538 | ## ratio 0.3 0.0
539 | ## age 0.3 0.0
540 | ## sigma 0.9 0.0
541 | ##
542 | ## Sample avg. posterior predictive distribution of y:
543 | ## Median MAD_SD
544 | ## mean_PPD 0.0 0.1
545 | ##
546 | ## ------
547 | ## For info on the priors used see help('prior_summary.stanreg').
548 |
549 | ``` r
550 | summary(model2)
551 | ```
552 |
553 | ##
554 | ## Model Info:
555 | ##
556 | ## function: stan_glm
557 | ## family: gaussian [identity]
558 | ## formula: "glyhb ~ ratio + age"
559 | ## algorithm: sampling
560 | ## priors: see help('prior_summary')
561 | ## sample: 4000 (posterior sample size)
562 | ## observations: 390
563 | ## predictors: 3
564 | ##
565 | ## Estimates:
566 | ## mean sd 2.5% 25% 50% 75% 97.5%
567 | ## (Intercept) 0.0 0.0 -0.1 0.0 0.0 0.0 0.1
568 | ## ratio 0.3 0.0 0.2 0.3 0.3 0.3 0.4
569 | ## age 0.3 0.0 0.2 0.3 0.3 0.3 0.4
570 | ## sigma 0.9 0.0 0.8 0.9 0.9 0.9 1.0
571 | ## mean_PPD 0.0 0.1 -0.1 0.0 0.0 0.0 0.1
572 | ## log-posterior -519.3 1.4 -522.8 -520.1 -519.0 -518.3 -517.6
573 | ##
574 | ## Diagnostics:
575 | ## mcse Rhat n_eff
576 | ## (Intercept) 0.0 1.0 4000
577 | ## ratio 0.0 1.0 4000
578 | ## age 0.0 1.0 4000
579 | ## sigma 0.0 1.0 4000
580 | ## mean_PPD 0.0 1.0 4000
581 | ## log-posterior 0.0 1.0 1855
582 | ##
583 | ## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
584 |
585 | ``` r
586 | par(mfrow = c(2, 2), mar = c(3, 5, 3, 3))
587 | plot(model2)
588 | ```
589 |
590 | 
591 |
592 | ``` r
593 | model3 <-
594 | stan_glm('glyhb ~ bmi + waist_to_hip_rat', data = diabetes_completed_subset)
595 | ```
596 |
597 | ``` r
598 | model3
599 | ```
600 |
601 | ## stan_glm
602 | ## family: gaussian [identity]
603 | ## formula: "glyhb ~ bmi + waist_to_hip_rat"
604 | ## observations: 390
605 | ## predictors: 3
606 | ## ------
607 | ## Median MAD_SD
608 | ## (Intercept) 0.0 0.0
609 | ## bmi 0.1 0.0
610 | ## waist_to_hip_rat 0.2 0.1
611 | ## sigma 1.0 0.0
612 | ##
613 | ## Sample avg. posterior predictive distribution of y:
614 | ## Median MAD_SD
615 | ## mean_PPD 0.0 0.1
616 | ##
617 | ## ------
618 | ## For info on the priors used see help('prior_summary.stanreg').
619 |
620 | ``` r
621 | summary(model3)
622 | ```
623 |
624 | ##
625 | ## Model Info:
626 | ##
627 | ## function: stan_glm
628 | ## family: gaussian [identity]
629 | ## formula: "glyhb ~ bmi + waist_to_hip_rat"
630 | ## algorithm: sampling
631 | ## priors: see help('prior_summary')
632 | ## sample: 4000 (posterior sample size)
633 | ## observations: 390
634 | ## predictors: 3
635 | ##
636 | ## Estimates:
637 | ## mean sd 2.5% 25% 50% 75% 97.5%
638 | ## (Intercept) 0.0 0.0 -0.1 0.0 0.0 0.0 0.1
639 | ## bmi 0.1 0.1 0.0 0.1 0.1 0.1 0.2
640 | ## waist_to_hip_rat 0.2 0.1 0.1 0.1 0.2 0.2 0.3
641 | ## sigma 1.0 0.0 0.9 1.0 1.0 1.0 1.1
642 | ## mean_PPD 0.0 0.1 -0.1 0.0 0.0 0.0 0.1
643 | ## log-posterior -551.6 1.5 -555.3 -552.3 -551.2 -550.5 -549.8
644 | ##
645 | ## Diagnostics:
646 | ## mcse Rhat n_eff
647 | ## (Intercept) 0.0 1.0 4000
648 | ## bmi 0.0 1.0 4000
649 | ## waist_to_hip_rat 0.0 1.0 4000
650 | ## sigma 0.0 1.0 4000
651 | ## mean_PPD 0.0 1.0 4000
652 | ## log-posterior 0.0 1.0 1792
653 | ##
654 | ## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
655 |
656 | ``` r
657 | par(mfrow = c(2, 2), mar = c(3, 5, 3, 3))
658 | plot(model3)
659 | ```
660 |
661 | 
662 |
663 | ``` r
664 | model4 <-
665 | stan_glm('glyhb ~ ratio + age + bmi + waist_to_hip_rat', data = diabetes_completed_subset)
666 | ```
667 |
668 | ``` r
669 | model4
670 | ```
671 |
672 | ## stan_glm
673 | ## family: gaussian [identity]
674 | ## formula: "glyhb ~ ratio + age + bmi + waist_to_hip_rat"
675 | ## observations: 390
676 | ## predictors: 5
677 | ## ------
678 | ## Median MAD_SD
679 | ## (Intercept) 0.0 0.0
680 | ## ratio 0.3 0.0
681 | ## age 0.3 0.0
682 | ## bmi 0.1 0.0
683 | ## waist_to_hip_rat 0.0 0.1
684 | ## sigma 0.9 0.0
685 | ##
686 | ## Sample avg. posterior predictive distribution of y:
687 | ## Median MAD_SD
688 | ## mean_PPD 0.0 0.1
689 | ##
690 | ## ------
691 | ## For info on the priors used see help('prior_summary.stanreg').
692 |
693 | ``` r
694 | summary(model4)
695 | ```
696 |
697 | ##
698 | ## Model Info:
699 | ##
700 | ## function: stan_glm
701 | ## family: gaussian [identity]
702 | ## formula: "glyhb ~ ratio + age + bmi + waist_to_hip_rat"
703 | ## algorithm: sampling
704 | ## priors: see help('prior_summary')
705 | ## sample: 4000 (posterior sample size)
706 | ## observations: 390
707 | ## predictors: 5
708 | ##
709 | ## Estimates:
710 | ## mean sd 2.5% 25% 50% 75% 97.5%
711 | ## (Intercept) 0.0 0.0 -0.1 0.0 0.0 0.0 0.1
712 | ## ratio 0.3 0.0 0.2 0.2 0.3 0.3 0.4
713 | ## age 0.3 0.0 0.2 0.3 0.3 0.3 0.4
714 | ## bmi 0.1 0.0 0.0 0.0 0.1 0.1 0.2
715 | ## waist_to_hip_rat 0.0 0.0 -0.1 0.0 0.0 0.1 0.1
716 | ## sigma 0.9 0.0 0.8 0.9 0.9 0.9 1.0
717 | ## mean_PPD 0.0 0.1 -0.1 0.0 0.0 0.0 0.1
718 | ## log-posterior -521.0 1.7 -525.2 -521.9 -520.6 -519.7 -518.6
719 | ##
720 | ## Diagnostics:
721 | ## mcse Rhat n_eff
722 | ## (Intercept) 0.0 1.0 4000
723 | ## ratio 0.0 1.0 4000
724 | ## age 0.0 1.0 4000
725 | ## bmi 0.0 1.0 4000
726 | ## waist_to_hip_rat 0.0 1.0 4000
727 | ## sigma 0.0 1.0 4000
728 | ## mean_PPD 0.0 1.0 4000
729 | ## log-posterior 0.0 1.0 1911
730 | ##
731 | ## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
732 |
733 | ``` r
734 | par(mfrow = c(2, 2), mar = c(3, 5, 3, 3))
735 | plot(model4)
736 | ```
737 |
738 | 
739 |
740 | ``` r
741 | model5 <-
742 | stan_glm('glyhb ~ ratio + age + bmi', data = diabetes_completed_subset)
743 | ```
744 |
745 | ``` r
746 | model5
747 | ```
748 |
749 | ## stan_glm
750 | ## family: gaussian [identity]
751 | ## formula: "glyhb ~ ratio + age + bmi"
752 | ## observations: 390
753 | ## predictors: 4
754 | ## ------
755 | ## Median MAD_SD
756 | ## (Intercept) 0.0 0.0
757 | ## ratio 0.3 0.0
758 | ## age 0.3 0.0
759 | ## bmi 0.1 0.0
760 | ## sigma 0.9 0.0
761 | ##
762 | ## Sample avg. posterior predictive distribution of y:
763 | ## Median MAD_SD
764 | ## mean_PPD 0.0 0.1
765 | ##
766 | ## ------
767 | ## For info on the priors used see help('prior_summary.stanreg').
768 |
769 | ``` r
770 | summary(model5)
771 | ```
772 |
773 | ##
774 | ## Model Info:
775 | ##
776 | ## function: stan_glm
777 | ## family: gaussian [identity]
778 | ## formula: "glyhb ~ ratio + age + bmi"
779 | ## algorithm: sampling
780 | ## priors: see help('prior_summary')
781 | ## sample: 4000 (posterior sample size)
782 | ## observations: 390
783 | ## predictors: 4
784 | ##
785 | ## Estimates:
786 | ## mean sd 2.5% 25% 50% 75% 97.5%
787 | ## (Intercept) 0.0 0.0 -0.1 0.0 0.0 0.0 0.1
788 | ## ratio 0.3 0.0 0.2 0.2 0.3 0.3 0.4
789 | ## age 0.3 0.0 0.2 0.3 0.3 0.3 0.4
790 | ## bmi 0.1 0.0 0.0 0.0 0.1 0.1 0.2
791 | ## sigma 0.9 0.0 0.8 0.9 0.9 0.9 1.0
792 | ## mean_PPD 0.0 0.1 -0.1 0.0 0.0 0.0 0.1
793 | ## log-posterior -519.8 1.5 -523.4 -520.6 -519.5 -518.7 -517.8
794 | ##
795 | ## Diagnostics:
796 | ## mcse Rhat n_eff
797 | ## (Intercept) 0.0 1.0 4000
798 | ## ratio 0.0 1.0 4000
799 | ## age 0.0 1.0 4000
800 | ## bmi 0.0 1.0 4000
801 | ## sigma 0.0 1.0 4000
802 | ## mean_PPD 0.0 1.0 4000
803 | ## log-posterior 0.0 1.0 1941
804 | ##
805 | ## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
806 |
807 | ``` r
808 | par(mfrow = c(2, 2), mar = c(3, 5, 3, 3))
809 | plot(model5)
810 | ```
811 |
812 | 
813 |
814 | ``` r
815 | ic <- data.frame(
816 | Model = c("model1", "model2", "model3", "model4", "model5"),
817 | WAIC = c(waic(model1)$estimates[3,1], waic(model2)$estimates[3,1], waic(model3)$estimates[3,1], waic(model4)$estimates[3,1], waic(model5)$estimates[3,1]),
818 | stringsAsFactors = FALSE
819 | )
820 | ```
821 |
822 | ``` r
823 | ic
824 | ```
825 |
826 | ## Model WAIC
827 | ## 1 model1 1045.760
828 | ## 2 model2 1033.905
829 | ## 3 model3 1097.760
830 | ## 4 model4 1035.492
831 | ## 5 model5 1034.094
832 |
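Lower WAIC is better, so model2 (ratio + age) comes out ahead, with model5 close behind. Raw WAIC differences are easier to judge with a standard error attached; a sketch of a pairwise comparison, assuming a loo release that provides `loo_compare()`:

``` r
# Compare the two leading models on expected log predictive density
loo_compare(loo(model2), loo(model5))
```
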
833 | ``` r
834 | # Let's build a SEM model
835 | library(lavaan)
836 | semModel1 <- '
837 | pa1 =~ age
838 | pa2 =~ bp.1d + bp.1s
839 | pa3 =~ bmi + frame_large + frame_small
840 | pa4 =~ gender_male + waist_to_hip_rat
841 | pa5 =~ ratio + chol
842 | pa6 =~ time.ppn
843 |
844 | glyhb ~ pa1 + pa2 + pa3 + pa4 + pa5 + pa6
845 | '
846 | fit1 <- sem(semModel1,
847 | data = diabetes_completed_subset)
848 | ```
849 |
850 | ## Warning in lav_object_post_check(object): lavaan WARNING: some estimated ov
851 | ## variances are negative
852 |
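This warning flags an improper ("Heywood") solution: in the estimates below, the residual variances of bp.1s and waist_to_hip_rat come out negative, so the loadings on pa2 and pa4 should be read with caution. The offending residual variances can be pulled out directly; a sketch using lavaan's inspector:

``` r
# Residual variances of the observed variables; negative entries
# correspond to the improper solution lavaan warned about
diag(lavInspect(fit1, "est")$theta)
```
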
853 | ``` r
854 | fit1
855 | ```
856 |
857 | ## lavaan 0.6-2 ended normally after 144 iterations
858 | ##
859 | ## Optimization method NLMINB
860 | ## Number of free parameters 42
861 | ##
862 | ## Number of observations 390
863 | ##
864 | ## Estimator ML
865 | ## Model Fit Test Statistic 178.781
866 | ## Degrees of freedom 36
867 | ## P-value (Chi-square) 0.000
868 |
869 | ``` r
870 | semPaths(fit1)
871 | ```
872 |
873 | 
874 |
875 | ``` r
876 | summary(fit1, standardized = TRUE, fit.measures = TRUE)
877 | ```
878 |
879 | ## lavaan 0.6-2 ended normally after 144 iterations
880 | ##
881 | ## Optimization method NLMINB
882 | ## Number of free parameters 42
883 | ##
884 | ## Number of observations 390
885 | ##
886 | ## Estimator ML
887 | ## Model Fit Test Statistic 178.781
888 | ## Degrees of freedom 36
889 | ## P-value (Chi-square) 0.000
890 | ##
891 | ## Model test baseline model:
892 | ##
893 | ## Minimum Function Test Statistic 974.533
894 | ## Degrees of freedom 66
895 | ## P-value 0.000
896 | ##
897 | ## User model versus baseline model:
898 | ##
899 | ## Comparative Fit Index (CFI) 0.843
900 | ## Tucker-Lewis Index (TLI) 0.712
901 | ##
902 | ## Loglikelihood and Information Criteria:
903 | ##
904 | ## Loglikelihood user model (H0) -5323.322
905 | ## Loglikelihood unrestricted model (H1) -5233.931
906 | ##
907 | ## Number of free parameters 42
908 | ## Akaike (AIC) 10730.643
909 | ## Bayesian (BIC) 10897.222
910 | ## Sample-size adjusted Bayesian (BIC) 10763.958
911 | ##
912 | ## Root Mean Square Error of Approximation:
913 | ##
914 | ## RMSEA 0.101
915 | ## 90 Percent Confidence Interval 0.086 0.116
916 | ## P-value RMSEA <= 0.05 0.000
917 | ##
918 | ## Standardized Root Mean Square Residual:
919 | ##
920 | ## SRMR 0.064
921 | ##
922 | ## Parameter Estimates:
923 | ##
924 | ## Information Expected
925 | ## Information saturated (h1) model Structured
926 | ## Standard Errors Standard
927 | ##
928 | ## Latent Variables:
929 | ## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
930 | ## pa1 =~
931 | ## age 1.000 0.999 1.000
932 | ## pa2 =~
933 | ## bp.1d 1.000 0.340 0.340
934 | ## bp.1s 5.235 2.648 1.977 0.048 1.778 1.780
935 | ## pa3 =~
936 | ## bmi 1.000 0.532 0.533
937 | ## frame_large 0.483 0.072 6.705 0.000 0.257 0.586
938 | ## frame_small -0.543 0.080 -6.793 0.000 -0.289 -0.652
939 | ## pa4 =~
940 | ## gender_male 1.000 0.157 0.320
941 | ## waist_to_hp_rt 6.946 2.881 2.411 0.016 1.094 1.095
942 | ## pa5 =~
943 | ## ratio 1.000 0.882 0.883
944 | ## chol 0.612 0.106 5.760 0.000 0.539 0.540
945 | ## pa6 =~
946 | ## time.ppn 1.000 0.999 1.000
947 | ##
948 | ## Regressions:
949 | ## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
950 | ## glyhb ~
951 | ## pa1 0.257 0.049 5.254 0.000 0.256 0.257
952 | ## pa2 0.055 0.061 0.892 0.372 0.019 0.019
953 | ## pa3 0.063 0.134 0.469 0.639 0.033 0.033
954 | ## pa4 0.132 0.290 0.456 0.648 0.021 0.021
955 | ## pa5 0.351 0.090 3.916 0.000 0.310 0.310
956 | ## pa6 0.056 0.046 1.222 0.222 0.056 0.056
957 | ##
958 | ## Covariances:
959 | ## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
960 | ## pa1 ~~
961 | ## pa2 0.088 0.050 1.766 0.077 0.259 0.259
962 | ## pa3 0.130 0.037 3.510 0.000 0.244 0.244
963 | ## pa4 0.040 0.019 2.163 0.031 0.256 0.256
964 | ## pa5 0.187 0.051 3.681 0.000 0.213 0.213
965 | ## pa6 -0.039 0.051 -0.778 0.437 -0.039 -0.039
966 | ## pa2 ~~
967 | ## pa3 0.019 0.012 1.564 0.118 0.108 0.108
968 | ## pa4 0.003 0.003 1.249 0.212 0.059 0.059
969 | ## pa5 0.023 0.015 1.499 0.134 0.077 0.077
970 | ## pa6 -0.008 0.010 -0.840 0.401 -0.024 -0.024
971 | ## pa3 ~~
972 | ## pa4 0.027 0.013 2.113 0.035 0.322 0.322
973 | ## pa5 0.182 0.039 4.618 0.000 0.388 0.388
974 | ## pa6 0.036 0.034 1.062 0.288 0.069 0.069
975 | ## pa4 ~~
976 | ## pa5 0.034 0.016 2.109 0.035 0.248 0.248
977 | ## pa6 0.000 0.007 0.003 0.998 0.000 0.000
978 | ## pa5 ~~
979 | ## pa6 -0.037 0.050 -0.733 0.464 -0.042 -0.042
980 | ##
981 | ## Variances:
982 | ## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
983 | ## .age 0.000 0.000 0.000
984 | ## .bp.1d 0.882 0.084 10.560 0.000 0.882 0.884
985 | ## .bp.1s -2.164 1.506 -1.437 0.151 -2.164 -2.170
986 | ## .bmi 0.714 0.065 10.975 0.000 0.714 0.716
987 | ## .frame_large 0.126 0.013 9.944 0.000 0.126 0.656
988 | ## .frame_small 0.113 0.014 8.366 0.000 0.113 0.575
989 | ## .gender_male 0.218 0.018 11.936 0.000 0.218 0.898
990 | ## .waist_to_hp_rt -0.199 0.458 -0.435 0.664 -0.199 -0.200
991 | ## .ratio 0.219 0.124 1.771 0.077 0.219 0.220
992 | ## .chol 0.706 0.068 10.337 0.000 0.706 0.708
993 | ## .time.ppn 0.000 0.000 0.000
994 | ## .glyhb 0.777 0.059 13.243 0.000 0.777 0.779
995 | ## pa1 0.997 0.071 13.964 0.000 1.000 1.000
996 | ## pa2 0.115 0.064 1.802 0.072 1.000 1.000
997 | ## pa3 0.283 0.064 4.423 0.000 1.000 1.000
998 | ## pa4 0.025 0.012 2.035 0.042 1.000 1.000
999 | ## pa5 0.778 0.141 5.508 0.000 1.000 1.000
1000 | ## pa6 0.997 0.071 13.964 0.000 1.000 1.000
1001 |
1002 | ``` r
1003 | parameterEstimates(fit1)
1004 | ```
1005 |
1006 | ## lhs op rhs est se z pvalue
1007 | ## 1 pa1 =~ age 1.000 0.000 NA NA
1008 | ## 2 pa2 =~ bp.1d 1.000 0.000 NA NA
1009 | ## 3 pa2 =~ bp.1s 5.235 2.648 1.977 0.048
1010 | ## 4 pa3 =~ bmi 1.000 0.000 NA NA
1011 | ## 5 pa3 =~ frame_large 0.483 0.072 6.705 0.000
1012 | ## 6 pa3 =~ frame_small -0.543 0.080 -6.793 0.000
1013 | ## 7 pa4 =~ gender_male 1.000 0.000 NA NA
1014 | ## 8 pa4 =~ waist_to_hip_rat 6.946 2.881 2.411 0.016
1015 | ## 9 pa5 =~ ratio 1.000 0.000 NA NA
1016 | ## 10 pa5 =~ chol 0.612 0.106 5.760 0.000
1017 | ## 11 pa6 =~ time.ppn 1.000 0.000 NA NA
1018 | ## 12 glyhb ~ pa1 0.257 0.049 5.254 0.000
1019 | ## 13 glyhb ~ pa2 0.055 0.061 0.892 0.372
1020 | ## 14 glyhb ~ pa3 0.063 0.134 0.469 0.639
1021 | ## 15 glyhb ~ pa4 0.132 0.290 0.456 0.648
1022 | ## 16 glyhb ~ pa5 0.351 0.090 3.916 0.000
1023 | ## 17 glyhb ~ pa6 0.056 0.046 1.222 0.222
1024 | ## 18 age ~~ age 0.000 0.000 NA NA
1025 | ## 19 bp.1d ~~ bp.1d 0.882 0.084 10.560 0.000
1026 | ## 20 bp.1s ~~ bp.1s -2.164 1.506 -1.437 0.151
1027 | ## 21 bmi ~~ bmi 0.714 0.065 10.975 0.000
1028 | ## 22 frame_large ~~ frame_large 0.126 0.013 9.944 0.000
1029 | ## 23 frame_small ~~ frame_small 0.113 0.014 8.366 0.000
1030 | ## 24 gender_male ~~ gender_male 0.218 0.018 11.936 0.000
1031 | ## 25 waist_to_hip_rat ~~ waist_to_hip_rat -0.199 0.458 -0.435 0.664
1032 | ## 26 ratio ~~ ratio 0.219 0.124 1.771 0.077
1033 | ## 27 chol ~~ chol 0.706 0.068 10.337 0.000
1034 | ## 28 time.ppn ~~ time.ppn 0.000 0.000 NA NA
1035 | ## 29 glyhb ~~ glyhb 0.777 0.059 13.243 0.000
1036 | ## 30 pa1 ~~ pa1 0.997 0.071 13.964 0.000
1037 | ## 31 pa2 ~~ pa2 0.115 0.064 1.802 0.072
1038 | ## 32 pa3 ~~ pa3 0.283 0.064 4.423 0.000
1039 | ## 33 pa4 ~~ pa4 0.025 0.012 2.035 0.042
1040 | ## 34 pa5 ~~ pa5 0.778 0.141 5.508 0.000
1041 | ## 35 pa6 ~~ pa6 0.997 0.071 13.964 0.000
1042 | ## 36 pa1 ~~ pa2 0.088 0.050 1.766 0.077
1043 | ## 37 pa1 ~~ pa3 0.130 0.037 3.510 0.000
1044 | ## 38 pa1 ~~ pa4 0.040 0.019 2.163 0.031
1045 | ## 39 pa1 ~~ pa5 0.187 0.051 3.681 0.000
1046 | ## 40 pa1 ~~ pa6 -0.039 0.051 -0.778 0.437
1047 | ## 41 pa2 ~~ pa3 0.019 0.012 1.564 0.118
1048 | ## 42 pa2 ~~ pa4 0.003 0.003 1.249 0.212
1049 | ## 43 pa2 ~~ pa5 0.023 0.015 1.499 0.134
1050 | ## 44 pa2 ~~ pa6 -0.008 0.010 -0.840 0.401
1051 | ## 45 pa3 ~~ pa4 0.027 0.013 2.113 0.035
1052 | ## 46 pa3 ~~ pa5 0.182 0.039 4.618 0.000
1053 | ## 47 pa3 ~~ pa6 0.036 0.034 1.062 0.288
1054 | ## 48 pa4 ~~ pa5 0.034 0.016 2.109 0.035
1055 | ## 49 pa4 ~~ pa6 0.000 0.007 0.003 0.998
1056 | ## 50 pa5 ~~ pa6 -0.037 0.050 -0.733 0.464
1057 | ## ci.lower ci.upper
1058 | ## 1 1.000 1.000
1059 | ## 2 1.000 1.000
1060 | ## 3 0.046 10.425
1061 | ## 4 1.000 1.000
1062 | ## 5 0.341 0.624
1063 | ## 6 -0.700 -0.386
1064 | ## 7 1.000 1.000
1065 | ## 8 1.300 12.592
1066 | ## 9 1.000 1.000
1067 | ## 10 0.403 0.820
1068 | ## 11 1.000 1.000
1069 | ## 12 0.161 0.353
1070 | ## 13 -0.065 0.174
1071 | ## 14 -0.199 0.325
1072 | ## 15 -0.436 0.700
1073 | ## 16 0.175 0.527
1074 | ## 17 -0.034 0.146
1075 | ## 18 0.000 0.000
1076 | ## 19 0.718 1.046
1077 | ## 20 -5.116 0.787
1078 | ## 21 0.587 0.842
1079 | ## 22 0.101 0.151
1080 | ## 23 0.087 0.140
1081 | ## 24 0.182 0.254
1082 | ## 25 -1.096 0.698
1083 | ## 26 -0.023 0.462
1084 | ## 27 0.573 0.840
1085 | ## 28 0.000 0.000
1086 | ## 29 0.662 0.892
1087 | ## 30 0.857 1.137
1088 | ## 31 -0.010 0.241
1089 | ## 32 0.158 0.409
1090 | ## 33 0.001 0.049
1091 | ## 34 0.501 1.055
1092 | ## 35 0.857 1.137
1093 | ## 36 -0.010 0.186
1094 | ## 37 0.057 0.203
1095 | ## 38 0.004 0.077
1096 | ## 39 0.088 0.287
1097 | ## 40 -0.138 0.060
1098 | ## 41 -0.005 0.044
1099 | ## 42 -0.002 0.008
1100 | ## 43 -0.007 0.053
1101 | ## 44 -0.027 0.011
1102 | ## 45 0.002 0.052
1103 | ## 46 0.105 0.260
1104 | ## 47 -0.031 0.104
1105 | ## 48 0.002 0.067
1106 | ## 49 -0.014 0.014
1107 | ## 50 -0.135 0.061
1108 |
1109 | ``` r
1110 | # Second SEM model
1111 | semModel2 <- '
1112 | pa1 =~ age
1113 | pa5 =~ ratio + chol
1114 |
1115 | glyhb ~ pa1 + pa5
1116 | '
1117 | fit2 <- sem(semModel2,
1118 | data = diabetes_completed_subset)
1119 | fit2
1120 | ```
1121 |
1122 | ## lavaan 0.6-2 ended normally after 21 iterations
1123 | ##
1124 | ## Optimization method NLMINB
1125 | ## Number of free parameters 9
1126 | ##
1127 | ## Number of observations 390
1128 | ##
1129 | ## Estimator ML
1130 | ## Model Fit Test Statistic 7.350
1131 | ## Degrees of freedom 1
1132 | ## P-value (Chi-square) 0.007
1133 |
1134 | ``` r
1135 | semPaths(fit2)
1136 | ```
1137 |
1138 | 
1139 |
1140 | ``` r
1141 | summary(fit2, standardized = TRUE, fit.measures = TRUE)
1142 | ```
1143 |
1144 | ## lavaan 0.6-2 ended normally after 21 iterations
1145 | ##
1146 | ## Optimization method NLMINB
1147 | ## Number of free parameters 9
1148 | ##
1149 | ## Number of observations 390
1150 | ##
1151 | ## Estimator ML
1152 | ## Model Fit Test Statistic 7.350
1153 | ## Degrees of freedom 1
1154 | ## P-value (Chi-square) 0.007
1155 | ##
1156 | ## Model test baseline model:
1157 | ##
1158 | ## Minimum Function Test Statistic 210.710
1159 | ## Degrees of freedom 6
1160 | ## P-value 0.000
1161 | ##
1162 | ## User model versus baseline model:
1163 | ##
1164 | ## Comparative Fit Index (CFI) 0.969
1165 | ## Tucker-Lewis Index (TLI) 0.814
1166 | ##
1167 | ## Loglikelihood and Information Criteria:
1168 | ##
1169 | ## Loglikelihood user model (H0) -2109.862
1170 | ## Loglikelihood unrestricted model (H1) -2106.186
1171 | ##
1172 | ## Number of free parameters 9
1173 | ## Akaike (AIC) 4237.723
1174 | ## Bayesian (BIC) 4273.418
1175 | ## Sample-size adjusted Bayesian (BIC) 4244.862
1176 | ##
1177 | ## Root Mean Square Error of Approximation:
1178 | ##
1179 | ## RMSEA 0.128
1180 | ## 90 Percent Confidence Interval 0.054 0.221
1181 | ## P-value RMSEA <= 0.05 0.042
1182 | ##
1183 | ## Standardized Root Mean Square Residual:
1184 | ##
1185 | ## SRMR 0.027
1186 | ##
1187 | ## Parameter Estimates:
1188 | ##
1189 | ## Information Expected
1190 | ## Information saturated (h1) model Structured
1191 | ## Standard Errors Standard
1192 | ##
1193 | ## Latent Variables:
1194 | ## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
1195 | ## pa1 =~
1196 | ## age 1.000 0.999 1.000
1197 | ## pa5 =~
1198 | ## ratio 1.000 0.733 0.734
1199 | ## chol 0.885 0.149 5.938 0.000 0.649 0.650
1200 | ##
1201 | ## Regressions:
1202 | ## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
1203 | ## glyhb ~
1204 | ## pa1 0.238 0.050 4.789 0.000 0.238 0.238
1205 | ## pa5 0.485 0.099 4.903 0.000 0.355 0.356
1206 | ##
1207 | ## Covariances:
1208 | ## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
1209 | ## pa1 ~~
1210 | ## pa5 0.207 0.049 4.237 0.000 0.283 0.283
1211 | ##
1212 | ## Variances:
1213 | ## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
1214 | ## .age 0.000 0.000 0.000
1215 | ## .ratio 0.460 0.092 4.991 0.000 0.460 0.461
1216 | ## .chol 0.576 0.079 7.283 0.000 0.576 0.578
1217 | ## .glyhb 0.767 0.060 12.771 0.000 0.767 0.769
1218 | ## pa1 0.997 0.071 13.964 0.000 1.000 1.000
1219 | ## pa5 0.537 0.107 5.027 0.000 1.000 1.000
1220 |
--------------------------------------------------------------------------------
/case_studies/data/diabetes.sav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/data/diabetes.sav
--------------------------------------------------------------------------------
/case_studies/figures/cs1-unnamed-chunk-12-1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-12-1.pdf
--------------------------------------------------------------------------------
/case_studies/figures/cs1-unnamed-chunk-12-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-12-1.png
--------------------------------------------------------------------------------
/case_studies/figures/cs1-unnamed-chunk-15-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-15-1.png
--------------------------------------------------------------------------------
/case_studies/figures/cs1-unnamed-chunk-18-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-18-1.png
--------------------------------------------------------------------------------
/case_studies/figures/cs1-unnamed-chunk-19-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-19-1.png
--------------------------------------------------------------------------------
/case_studies/figures/cs1-unnamed-chunk-22-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-22-1.png
--------------------------------------------------------------------------------
/case_studies/figures/cs1-unnamed-chunk-23-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-23-1.png
--------------------------------------------------------------------------------
/case_studies/figures/cs1-unnamed-chunk-24-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-24-1.png
--------------------------------------------------------------------------------
/case_studies/figures/cs1-unnamed-chunk-25-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-25-1.png
--------------------------------------------------------------------------------
/case_studies/figures/cs1-unnamed-chunk-26-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-26-1.png
--------------------------------------------------------------------------------
/case_studies/figures/cs1-unnamed-chunk-29-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-29-1.png
--------------------------------------------------------------------------------
/case_studies/figures/cs1-unnamed-chunk-33-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-33-1.png
--------------------------------------------------------------------------------
/case_studies/figures/cs1-unnamed-chunk-9-1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-9-1.pdf
--------------------------------------------------------------------------------
/case_studies/figures/cs1-unnamed-chunk-9-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-9-1.png
--------------------------------------------------------------------------------
/ci/scripts/runAllModels.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Fail the CI run if any model script errors out
3 | set -euo pipefail
4 | cd scripts
5 | for f in *.py; do python "$f"; done
--------------------------------------------------------------------------------
/data/aircraft.csv:
--------------------------------------------------------------------------------
1 | "","X1","X2","X3","X4","Y"
2 | "1",6.3,1.7,8176,4500,2.76
3 | "2",6,1.9,6699,3120,4.76
4 | "3",5.9,1.5,9663,6300,8.75
5 | "4",3,1.2,12837,9800,7.78
6 | "5",5,1.8,10205,4900,6.18
7 | "6",6.3,2,14890,6500,9.5
8 | "7",5.6,1.6,13836,8920,5.14
9 | "8",3.6,1.2,11628,14500,4.76
10 | "9",2,1.4,15225,14800,16.7
11 | "10",2.9,2.3,18691,10900,27.68
12 | "11",2.2,1.9,19350,16000,26.64
13 | "12",3.9,2.6,20638,16000,13.71
14 | "13",4.5,2,12843,7800,12.31
15 | "14",4.3,9.7,13384,17900,15.73
16 | "15",4,2.9,13307,10500,13.59
17 | "16",3.2,4.3,29855,24500,51.9
18 | "17",4.3,4.3,29277,30000,20.78
19 | "18",2.4,2.6,24651,24500,29.82
20 | "19",2.8,3.7,28539,34000,32.78
21 | "20",3.9,3.3,8085,8160,10.12
22 | "21",2.8,3.9,30328,35800,27.84
23 | "22",1.6,4.1,46172,37000,107.1
24 | "23",3.4,2.5,17836,19600,11.19
25 |
--------------------------------------------------------------------------------
/data/awards.csv:
--------------------------------------------------------------------------------
1 | id,num_awards,prog,math
2 | 45,1,3,41
3 | 108,1,1,41
4 | 15,1,3,44
5 | 67,1,3,42
6 | 153,1,3,40
7 | 51,1,1,42
8 | 164,1,3,46
9 | 133,1,3,40
10 | 2,1,3,33
11 | 53,1,3,46
12 | 1,1,3,40
13 | 128,0,2,38
14 | 16,1,3,44
15 | 106,1,3,37
16 | 89,1,3,40
17 | 134,1,1,39
18 | 19,1,1,43
19 | 145,0,3,38
20 | 11,1,2,45
21 | 117,0,3,39
22 | 109,1,1,42
23 | 12,1,3,45
24 | 37,1,3,40
25 | 69,0,3,40
26 | 43,1,2,43
27 | 196,1,2,49
28 | 36,1,1,44
29 | 155,1,1,46
30 | 6,0,2,46
31 | 4,1,2,41
32 | 25,0,1,42
33 | 107,0,3,47
34 | 5,1,2,43
35 | 47,1,2,49
36 | 140,1,3,40
37 | 22,1,3,39
38 | 18,1,3,49
39 | 30,0,2,42
40 | 40,0,1,43
41 | 176,0,2,41
42 | 126,0,1,57
43 | 197,0,2,50
44 | 46,0,2,44
45 | 49,0,3,39
46 | 8,0,2,52
47 | 124,1,3,41
48 | 13,0,3,39
49 | 111,0,1,39
50 | 142,0,3,52
51 | 193,1,2,48
52 | 105,3,2,45
53 | 58,2,3,40
54 | 129,3,1,46
55 | 38,3,2,50
56 | 182,0,2,43
57 | 115,0,1,43
58 | 14,1,2,54
59 | 175,1,1,42
60 | 44,2,3,45
61 | 86,2,1,54
62 | 72,3,3,47
63 | 41,1,2,45
64 | 191,0,2,43
65 | 138,1,3,40
66 | 9,0,3,52
67 | 151,1,3,52
68 | 119,0,1,45
69 | 55,1,2,49
70 | 73,1,2,53
71 | 28,0,1,54
72 | 90,2,2,50
73 | 17,0,2,48
74 | 102,0,2,51
75 | 70,0,1,41
76 | 148,1,3,51
77 | 54,0,1,46
78 | 42,0,3,55
79 | 87,0,1,46
80 | 21,2,1,61
81 | 181,1,2,45
82 | 165,1,3,54
83 | 78,1,2,54
84 | 76,1,2,51
85 | 29,0,1,49
86 | 91,1,3,56
87 | 52,2,2,53
88 | 10,1,1,49
89 | 85,3,1,57
90 | 50,0,1,42
91 | 56,1,3,46
92 | 64,1,3,45
93 | 130,1,1,55
94 | 141,1,3,47
95 | 74,0,2,50
96 | 83,1,3,41
97 | 31,0,1,52
98 | 172,1,2,57
99 | 184,1,3,53
100 | 75,1,3,51
101 | 187,1,1,57
102 | 113,1,2,51
103 | 162,0,3,40
104 | 110,2,3,50
105 | 150,2,3,57
106 | 167,0,1,35
107 | 77,1,2,49
108 | 35,0,1,50
109 | 158,1,1,55
110 | 112,0,2,48
111 | 48,0,2,52
112 | 147,1,2,53
113 | 7,1,2,59
114 | 65,2,2,66
115 | 168,0,2,57
116 | 190,1,2,54
117 | 178,0,3,57
118 | 159,1,2,54
119 | 120,0,2,54
120 | 116,0,2,54
121 | 79,2,2,49
122 | 98,1,3,51
123 | 122,3,2,58
124 | 179,1,2,60
125 | 198,1,2,51
126 | 189,1,2,63
127 | 199,1,2,50
128 | 156,1,2,53
129 | 166,0,2,53
130 | 160,0,2,55
131 | 152,1,2,56
132 | 183,0,2,49
133 | 94,1,2,61
134 | 149,0,1,49
135 | 131,0,2,57
136 | 24,0,2,66
137 | 99,0,1,56
138 | 171,3,2,60
139 | 104,1,2,57
140 | 81,1,2,59
141 | 97,1,2,58
142 | 20,0,2,57
143 | 163,3,2,64
144 | 195,0,1,60
145 | 84,0,1,54
146 | 27,1,2,61
147 | 118,1,1,58
148 | 71,0,1,56
149 | 63,0,1,60
150 | 185,0,2,55
151 | 127,3,2,57
152 | 177,0,2,62
153 | 188,0,2,56
154 | 60,0,2,51
155 | 66,2,3,56
156 | 173,0,1,61
157 | 186,1,2,63
158 | 96,5,2,61
159 | 101,0,2,67
160 | 3,0,2,48
161 | 170,1,2,61
162 | 92,0,1,57
163 | 62,0,1,48
164 | 135,2,2,65
165 | 26,4,2,62
166 | 139,1,2,61
167 | 121,0,3,53
168 | 144,1,1,58
169 | 146,1,2,64
170 | 137,3,2,65
171 | 123,1,1,56
172 | 169,1,1,63
173 | 34,3,2,57
174 | 33,2,2,72
175 | 32,0,3,66
176 | 114,0,2,62
177 | 125,1,2,58
178 | 59,1,2,63
179 | 23,3,2,64
180 | 161,2,2,72
181 | 103,0,2,64
182 | 194,6,2,69
183 | 136,4,2,70
184 | 154,1,2,66
185 | 157,0,1,58
186 | 93,2,2,62
187 | 39,2,2,67
188 | 88,1,2,64
189 | 192,2,2,63
190 | 80,1,2,68
191 | 200,1,2,75
192 | 180,0,2,69
193 | 82,1,2,65
194 | 174,2,2,71
195 | 95,5,2,71
196 | 61,1,2,60
197 | 100,2,2,71
198 | 143,2,3,75
199 | 68,1,2,71
200 | 57,0,2,72
201 | 132,3,2,73
202 |
--------------------------------------------------------------------------------
/data/binary.dta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/data/binary.dta
--------------------------------------------------------------------------------
/data/cereals.txt:
--------------------------------------------------------------------------------
1 | name mfr type calories protein fat sodium fiber carbo sugars potass vitamins shelf weight cups rating
2 | 100%_Bran N C 70 4 1 130 10 5 6 280 25 3 1 0.33 68.402973
3 | 100%_Natural_Bran Q C 120 3 5 15 2 8 8 135 0 3 1 1 33.983679
4 | All-Bran K C 70 4 1 260 9 7 5 320 25 3 1 0.33 59.425505
5 | All-Bran_with_Extra_Fiber K C 50 4 0 140 14 8 0 330 25 3 1 0.5 93.704912
6 | Almond_Delight R C 110 2 2 200 1 14 8 -1 25 3 1 0.75 34.384843
7 | Apple_Cinnamon_Cheerios G C 110 2 2 180 1.5 10.5 10 70 25 1 1 0.75 29.509541
8 | Apple_Jacks K C 110 2 0 125 1 11 14 30 25 2 1 1 33.174094
9 | Basic_4 G C 130 3 2 210 2 18 8 100 25 3 1.33 0.75 37.038562
10 | Bran_Chex R C 90 2 1 200 4 15 6 125 25 1 1 0.67 49.120253
11 | Bran_Flakes P C 90 3 0 210 5 13 5 190 25 3 1 0.67 53.313813
12 | Cap'n'Crunch Q C 120 1 2 220 0 12 12 35 25 2 1 0.75 18.042851
13 | Cheerios G C 110 6 2 290 2 17 1 105 25 1 1 1.25 50.764999
14 | Cinnamon_Toast_Crunch G C 120 1 3 210 0 13 9 45 25 2 1 0.75 19.823573
15 | Clusters G C 110 3 2 140 2 13 7 105 25 3 1 0.5 40.400208
16 | Cocoa_Puffs G C 110 1 1 180 0 12 13 55 25 2 1 1 22.736446
17 | Corn_Chex R C 110 2 0 280 0 22 3 25 25 1 1 1 41.445019
18 | Corn_Flakes K C 100 2 0 290 1 21 2 35 25 1 1 1 45.863324
19 | Corn_Pops K C 110 1 0 90 1 13 12 20 25 2 1 1 35.782791
20 | Count_Chocula G C 110 1 1 180 0 12 13 65 25 2 1 1 22.396513
21 | Cracklin'_Oat_Bran K C 110 3 3 140 4 10 7 160 25 3 1 0.5 40.448772
22 | Cream_of_Wheat_(Quick) N H 100 3 0 80 1 21 0 -1 0 2 1 1 64.533816
23 | Crispix K C 110 2 0 220 1 21 3 30 25 3 1 1 46.895644
24 | Crispy_Wheat_&_Raisins G C 100 2 1 140 2 11 10 120 25 3 1 0.75 36.176196
25 | Double_Chex R C 100 2 0 190 1 18 5 80 25 3 1 0.75 44.330856
26 | Froot_Loops K C 110 2 1 125 1 11 13 30 25 2 1 1 32.207582
27 | Frosted_Flakes K C 110 1 0 200 1 14 11 25 25 1 1 0.75 31.435973
28 | Frosted_Mini-Wheats K C 100 3 0 0 3 14 7 100 25 2 1 0.8 58.345141
29 | Fruit_&_Fibre_Dates,_Walnuts,_and_Oats P C 120 3 2 160 5 12 10 200 25 3 1.25 0.67 40.917047
30 | Fruitful_Bran K C 120 3 0 240 5 14 12 190 25 3 1.33 0.67 41.015492
31 | Fruity_Pebbles P C 110 1 1 135 0 13 12 25 25 2 1 0.75 28.025765
32 | Golden_Crisp P C 100 2 0 45 0 11 15 40 25 1 1 0.88 35.252444
33 | Golden_Grahams G C 110 1 1 280 0 15 9 45 25 2 1 0.75 23.804043
34 | Grape_Nuts_Flakes P C 100 3 1 140 3 15 5 85 25 3 1 0.88 52.076897
35 | Grape-Nuts P C 110 3 0 170 3 17 3 90 25 3 1 0.25 53.371007
36 | Great_Grains_Pecan P C 120 3 3 75 3 13 4 100 25 3 1 0.33 45.811716
37 | Honey_Graham_Ohs Q C 120 1 2 220 1 12 11 45 25 2 1 1 21.871292
38 | Honey_Nut_Cheerios G C 110 3 1 250 1.5 11.5 10 90 25 1 1 0.75 31.072217
39 | Honey-comb P C 110 1 0 180 0 14 11 35 25 1 1 1.33 28.742414
40 | Just_Right_Crunchy__Nuggets K C 110 2 1 170 1 17 6 60 100 3 1 1 36.523683
41 | Just_Right_Fruit_&_Nut K C 140 3 1 170 2 20 9 95 100 3 1.3 0.75 36.471512
42 | Kix G C 110 2 1 260 0 21 3 40 25 2 1 1.5 39.241114
43 | Life Q C 100 4 2 150 2 12 6 95 25 2 1 0.67 45.328074
44 | Lucky_Charms G C 110 2 1 180 0 12 12 55 25 2 1 1 26.734515
45 | Maypo A H 100 4 1 0 0 16 3 95 25 2 1 1 54.850917
46 | Muesli_Raisins,_Dates,_&_Almonds R C 150 4 3 95 3 16 11 170 25 3 1 1 37.136863
47 | Muesli_Raisins,_Peaches,_&_Pecans R C 150 4 3 150 3 16 11 170 25 3 1 1 34.139765
48 | Mueslix_Crispy_Blend K C 160 3 2 150 3 17 13 160 25 3 1.5 0.67 30.313351
49 | Multi-Grain_Cheerios G C 100 2 1 220 2 15 6 90 25 1 1 1 40.105965
50 | Nut&Honey_Crunch K C 120 2 1 190 0 15 9 40 25 2 1 0.67 29.924285
51 | Nutri-Grain_Almond-Raisin K C 140 3 2 220 3 21 7 130 25 3 1.33 0.67 40.692320
52 | Nutri-grain_Wheat K C 90 3 0 170 3 18 2 90 25 3 1 1 59.642837
53 | Oatmeal_Raisin_Crisp G C 130 3 2 170 1.5 13.5 10 120 25 3 1.25 0.5 30.450843
54 | Post_Nat._Raisin_Bran P C 120 3 1 200 6 11 14 260 25 3 1.33 0.67 37.840594
55 | Product_19 K C 100 3 0 320 1 20 3 45 100 3 1 1 41.503540
56 | Puffed_Rice Q C 50 1 0 0 0 13 0 15 0 3 0.5 1 60.756112
57 | Puffed_Wheat Q C 50 2 0 0 1 10 0 50 0 3 0.5 1 63.005645
58 | Quaker_Oat_Squares Q C 100 4 1 135 2 14 6 110 25 3 1 0.5 49.511874
59 | Quaker_Oatmeal Q H 100 5 2 0 2.7 -1 -1 110 0 1 1 0.67 50.828392
60 | Raisin_Bran K C 120 3 1 210 5 14 12 240 25 2 1.33 0.75 39.259197
61 | Raisin_Nut_Bran G C 100 3 2 140 2.5 10.5 8 140 25 3 1 0.5 39.703400
62 | Raisin_Squares K C 90 2 0 0 2 15 6 110 25 3 1 0.5 55.333142
63 | Rice_Chex R C 110 1 0 240 0 23 2 30 25 1 1 1.13 41.998933
64 | Rice_Krispies K C 110 2 0 290 0 22 3 35 25 1 1 1 40.560159
65 | Shredded_Wheat N C 80 2 0 0 3 16 0 95 0 1 0.83 1 68.235885
66 | Shredded_Wheat_'n'Bran N C 90 3 0 0 4 19 0 140 0 1 1 0.67 74.472949
67 | Shredded_Wheat_spoon_size N C 90 3 0 0 3 20 0 120 0 1 1 0.67 72.801787
68 | Smacks K C 110 2 1 70 1 9 15 40 25 2 1 0.75 31.230054
69 | Special_K K C 110 6 0 230 1 16 3 55 25 1 1 1 53.131324
70 | Strawberry_Fruit_Wheats N C 90 2 0 15 3 15 5 90 25 2 1 1 59.363993
71 | Total_Corn_Flakes G C 110 2 1 200 0 21 3 35 100 3 1 1 38.839746
72 | Total_Raisin_Bran G C 140 3 1 190 4 15 14 230 100 3 1.5 1 28.592785
73 | Total_Whole_Grain G C 100 3 1 200 3 16 3 110 100 3 1 1 46.658844
74 | Triples G C 110 2 1 250 0 21 3 60 25 3 1 0.75 39.106174
75 | Trix G C 110 1 1 140 0 13 12 25 25 2 1 1 27.753301
76 | Wheat_Chex R C 100 3 1 230 3 17 3 115 25 1 1 0.67 49.787445
77 | Wheaties G C 100 3 1 200 3 17 3 110 25 1 1 1 51.592193
78 | Wheaties_Honey_Gold G C 110 2 1 200 1 16 8 60 25 1 1 0.75 36.187559
79 |
--------------------------------------------------------------------------------
/data/child_data.csv:
--------------------------------------------------------------------------------
1 | age,mem_span,iq,read_ab
2 | 6.7,4.4,95,7.2
3 | 5.9,4,90,6
4 | 5.5,4.1,105,6
5 | 6.2,4.8,98,6.6
6 | 6.4,5,106,7
7 | 7.3,5.5,100,7.2
8 | 5.7,3.6,88,5.3
9 | 6.15,5,95,6.4
10 | 7.5,5.4,96,6.6
11 | 6.9,5,104,7.3
12 | 4.1,3.9,108,5
13 | 5.5,4.2,90,5.8
14 | 6.9,4.5,91,6.6
15 | 7.2,5,92,6.8
16 | 4,4.2,101,5.6
17 | 7.3,5.5,100,7.2
18 | 5.9,4,90,6
19 | 5.5,4.2,90,5.8
20 | 4,4.2,101,5.6
21 | 5.9,4,90,6
--------------------------------------------------------------------------------
/data/drugtrial.csv:
--------------------------------------------------------------------------------
1 | subject,gender,dose,score
2 | 1,1,1,6
3 | 2,1,1,6
4 | 3,1,1,3
5 | 4,1,1,5
6 | 5,1,1,6
7 | 6,1,1,4
8 | 7,1,1,5
9 | 8,1,1,4
10 | 9,1,1,4
11 | 10,1,1,5
12 | 11,1,1,4
13 | 12,1,1,3
14 | 13,1,2,6
15 | 14,1,2,8
16 | 15,1,2,7
17 | 16,1,2,8
18 | 17,1,2,6
19 | 18,1,2,8
20 | 19,1,2,8
21 | 20,1,2,6
22 | 21,1,2,7
23 | 22,1,2,8
24 | 23,1,2,6
25 | 24,1,2,7
26 | 25,2,1,2
27 | 26,2,1,5
28 | 27,2,1,2
29 | 28,2,1,4
30 | 29,2,1,5
31 | 30,2,1,7
32 | 31,2,1,4
33 | 32,2,1,1
34 | 33,2,1,2
35 | 34,2,1,7
36 | 35,2,1,4
37 | 36,2,1,0
38 | 37,2,2,2
39 | 38,2,2,3
40 | 39,2,2,4
41 | 40,2,2,0
42 | 41,2,2,0
43 | 42,2,2,1
44 | 43,2,2,2
45 | 44,2,2,2
46 | 45,2,2,4
47 | 46,2,2,3
48 | 47,2,2,6
49 | 48,2,2,3
--------------------------------------------------------------------------------
/data/hsbdemo.dta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/data/hsbdemo.dta
--------------------------------------------------------------------------------
/data/iqdata.csv:
--------------------------------------------------------------------------------
1 | group,iq
2 | 1,44
3 | 1,40
4 | 1,44
5 | 1,39
6 | 1,25
7 | 1,37
8 | 1,31
9 | 1,40
10 | 1,22
11 | 1,34
12 | 1,39
13 | 1,20
14 | 1,39
15 | 1,42
16 | 1,41
17 | 2,36
18 | 2,40
19 | 2,37
20 | 2,35
21 | 2,39
22 | 2,40
23 | 2,36
24 | 2,38
25 | 2,24
26 | 2,27
27 | 2,29
28 | 2,24
29 | 2,45
30 | 2,44
31 | 2,44
32 | 3,52
33 | 3,50
34 | 3,51
35 | 3,52
36 | 3,45
37 | 3,49
38 | 3,47
39 | 3,46
40 | 3,47
41 | 3,47
42 | 3,46
43 | 3,45
44 | 3,50
45 |
--------------------------------------------------------------------------------
/data/ologit.dta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/data/ologit.dta
--------------------------------------------------------------------------------
/data/scents.sav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/data/scents.sav
--------------------------------------------------------------------------------
/data/temprate.sav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/data/temprate.sav
--------------------------------------------------------------------------------
/models/linearRegression.stan:
--------------------------------------------------------------------------------
1 | /*
2 | * Linear Regression
3 | * -----------------------------------------------------
4 | * Copyright: Murat Koptur
5 | * Date: 05/08/2018
6 | * License: GPLv3
7 | */
8 |
9 | data {
10 | int N; // number of observations
11 | vector[N] x; // day x_i
12 | vector[N] y; // weight in grams on day x_i
13 | }
14 |
15 | parameters {
16 | real alpha; // intercept
17 | real beta; // slope
18 | real<lower=0> sigma; // std deviation, constrained positive
19 | }
20 |
21 | model {
22 | alpha ~ normal(0, 100);
23 | beta ~ normal(0, 100);
24 | sigma ~ cauchy(0, 10);
25 | y ~ normal(alpha + beta * x, sigma);
26 | }
27 |
28 | generated quantities {
29 | // http://mc-stan.org/loo/reference/extract_log_lik.html
30 | vector[N] log_lik;
31 | for (n in 1:N)
32 | log_lik[n] = normal_lpdf(y[n] | alpha + beta * x[n], sigma);
33 | }
--------------------------------------------------------------------------------
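A minimal sketch of driving this model from Python with PyStan 2.x (the version pinned in requirements.txt). The data below are simulated stand-ins, not the repository's, and the repo's own `scripts/linearRegression.py` may differ:

```python
# Sketch: compile the model, sample, and extract log_lik for PSIS-LOO.
# Simulated toy data; path and values are illustrative assumptions.
import numpy as np
import pystan

sm = pystan.StanModel(file='models/linearRegression.stan')
x = np.arange(1.0, 31.0)                           # days 1..30
y = 100 + 5 * x + np.random.normal(0, 10, x.size)  # toy weights in grams
fit = sm.sampling(data={'N': x.size, 'x': x, 'y': y}, iter=2000, chains=4)
log_lik = fit.extract()['log_lik']                 # shape: draws x N
```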
/models/logisticRegression.stan:
--------------------------------------------------------------------------------
1 | /*
2 | * Logistic Regression
3 | * -----------------------------------------------------
4 | * Copyright: Murat Koptur
5 | * Date: 09/08/2018
6 | * License: GPLv3
7 | */
8 |
9 | data {
10 | int N_train;
11 | int N_test;
12 | int D;
13 | row_vector[D] x_train[N_train];
14 | row_vector[D] x_test[N_test];
15 | int<lower=0, upper=1> y_train[N_train]; // binary outcomes
16 | }
17 |
18 | parameters {
19 | real alpha;
20 | vector[D] beta;
21 | }
22 |
23 | model {
24 | alpha ~ normal(0, 10);
25 | beta ~ student_t(1, 0, 2.5); // weakly informative Cauchy(0, 2.5) prior
26 | for (n in 1:N_train)
27 | y_train[n] ~ bernoulli_logit(x_train[n] * beta + alpha);
28 | }
29 |
30 | generated quantities {
31 | int y_pred[N_test];
32 | for (n in 1:N_test) {
33 | y_pred[n] = bernoulli_logit_rng(x_test[n] * beta + alpha);
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
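Because the generated quantities block draws a fresh `y_pred` for every posterior iteration, averaging those draws over iterations gives a Monte Carlo estimate of P(y = 1 | x) for each test row. A sketch, under the assumption that `fit` holds a PyStan fit of this model:

```python
# Sketch: posterior predictive class probabilities from the y_pred draws.
# Assumes `fit` is a PyStan fit of logisticRegression.stan.
y_pred = fit.extract()['y_pred']      # draws x N_test, entries in {0, 1}
p_test = y_pred.mean(axis=0)          # estimated P(y = 1) per test point
labels = (p_test > 0.5).astype(int)   # hard labels at a 0.5 cutoff
```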
/models/multinomialLogisticRegression.stan:
--------------------------------------------------------------------------------
1 | /*
2 | * Multinomial Logistic Regression
3 | * -----------------------------------------------------
4 | * Copyright: Murat Koptur
5 | * Date: 11/08/2018
6 | * License: GPLv3
7 | */
8 |
9 | data {
10 | int N; // number of observations
11 | int K; // number of possible outcomes
12 | int D; // D is dimension of x_n vectors
13 | vector[D] x[N];
14 | int y[N];
15 | }
16 |
17 | parameters {
18 | matrix[K, D] beta;
19 | }
20 |
21 | model {
22 | for (k in 1:K)
23 | beta[k] ~ normal(0, 1);
24 | for (n in 1:N)
25 | y[n] ~ categorical_logit(beta * x[n]);
26 | }
27 |
--------------------------------------------------------------------------------
/models/multipleLinearRegression.stan:
--------------------------------------------------------------------------------
1 | /*
2 | * Multiple Linear Regression
3 | * -----------------------------------------------------
4 | * Copyright: Murat Koptur
5 | * Date: 06/08/2018
6 | * License: GPLv3
7 | */
8 |
9 | data {
10 | int N; // number of observations
11 | vector[N] fat; // grams of fat
12 | vector[N] weight; // weight in ounces of one serving
13 | vector[N] cups; // number of cups in one serving
14 | vector[N] rating; // a rating of the cereals
15 | }
16 |
17 | parameters {
18 | real b_fat; // coefficients
19 | real b_weight;
20 | real b_cups;
21 | real beta; // intercept
22 | real<lower=0> sigma; // std deviation
23 | }
24 |
25 | model {
26 | b_fat ~ normal(0, 10);
27 | b_weight ~ normal(0, 10);
28 | b_cups ~ normal(0, 10);
29 | beta ~ normal(0, 10);
30 | sigma ~ cauchy(0, 5);
31 | rating ~ normal(beta + b_fat * fat + b_weight * weight +
32 | b_cups * cups, sigma);
33 | }
34 |
35 | generated quantities {
36 | real rating_pred[N]; // predictions
37 | real log_lik[N];
38 | for (n in 1:N)
39 | rating_pred[n] = normal_rng(beta + b_fat * fat[n] + b_weight * weight[n] +
40 | b_cups * cups[n], sigma);
41 | for (n in 1:N)
42 | log_lik[n] = normal_lpdf(rating[n] | beta + b_fat * fat[n] + b_weight * weight[n] +
43 | b_cups * cups[n], sigma);
44 | }
45 |
--------------------------------------------------------------------------------
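The four vectors in the data block map directly onto columns of `data/cereals.txt`. A sketch of assembling the Stan data dictionary, assuming the file is whitespace-separated with the standard 77-cereals header (`name mfr type calories ... weight cups rating`; the `-1` entries mark missing values):

```python
# Sketch (assumption: whitespace-separated cereals file with the
# standard column names; -1 entries mark missing values).
import pandas as pd

df = pd.read_csv('data/cereals.txt', delim_whitespace=True)
data = {
    'N': len(df),
    'fat': df['fat'].values,
    'weight': df['weight'].values,
    'cups': df['cups'].values,
    'rating': df['rating'].values,
}
```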
/models/onewayANOVA.stan:
--------------------------------------------------------------------------------
1 | /*
2 | * One-way ANOVA
3 | * -----------------------------------------------------
4 | * Copyright: Murat Koptur
5 | * Date: 17/08/2018
6 | * License: GPLv3
7 | */
8 |
9 | data {
10 | int N;
11 | int x1[N];
12 | int x2[N];
13 | int y[N];
14 | }
15 |
16 | parameters {
17 | real alpha;
18 | real beta_x1;
19 | real beta_x2;
20 | real<lower=0> sigma;
21 | }
22 |
23 | model {
24 | alpha ~ normal(0, 10);
25 | beta_x1 ~ normal(0, 10);
26 | beta_x2 ~ normal(0, 10);
27 | sigma ~ normal(0, 5);
28 | for (i in 1:N)
29 | y[i] ~ normal(alpha + beta_x1 * x1[i] + beta_x2 * x2[i], sigma);
30 | }
31 |
--------------------------------------------------------------------------------
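Read as a one-way layout, `x1` and `x2` are most naturally dummy codes for a single factor with three levels and level 1 as the reference, which matches the `group` column of `data/iqdata.csv`. Under that assumption, the data dictionary could be built like so:

```python
# Sketch (assumption: x1/x2 dummy-code groups 2 and 3 of iqdata.csv,
# with group 1 as the reference level).
import pandas as pd

df = pd.read_csv('data/iqdata.csv')
data = {
    'N': len(df),
    'x1': (df['group'] == 2).astype(int).values,
    'x2': (df['group'] == 3).astype(int).values,
    'y': df['iq'].values,
}
```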
/models/orderedLogisticRegression.stan:
--------------------------------------------------------------------------------
1 | /*
2 | * Ordered Logistic Regression
3 | * -----------------------------------------------------
4 | * Copyright: Murat Koptur
5 | * Date: 13/08/2018
6 | * License: GPLv3
7 | */
8 |
9 | data {
10 | int N;
11 | int D;
12 | int K;
13 | row_vector[D] x[N];
14 | int y[N];
15 | }
16 |
17 | parameters {
18 | vector[D] beta;
19 | ordered[K-1] c;
20 | }
21 |
22 | model {
23 | for (n in 1:N)
24 | y[n] ~ ordered_logistic(x[n] * beta, c);
25 | }
26 |
--------------------------------------------------------------------------------
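`ordered_logistic` requires the outcome coded 1..K. A sketch of preparing `data/ologit.dta`, under the assumption that it is the UCLA example with a categorical outcome `apply` and predictors `pared`, `public`, and `gpa` (the column names are not verified here):

```python
# Sketch (assumption: UCLA ologit data; column names are assumed,
# and the categorical outcome is recoded from 0..K-1 to Stan's 1..K).
import pandas as pd

df = pd.read_stata('data/ologit.dta')
y = df['apply'].cat.codes.values + 1   # recode categories to 1..K
x = df[['pared', 'public', 'gpa']].values
data = {'N': len(df), 'D': x.shape[1], 'K': int(y.max()), 'x': x, 'y': y}
```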
/models/robustRegression.stan:
--------------------------------------------------------------------------------
1 | /*
2 | * Robust Regression
3 | * -----------------------------------------------------
4 | * Copyright: Murat Koptur
5 | * Date: 08/08/2018
6 | * License: GPLv3
7 | */
8 |
9 | data {
10 | int N; // number of observations
11 | vector[N] X1; // Aspect Ratio
12 | vector[N] X2; // Lift-to-Drag Ratio
13 | vector[N] X3; // Weight
14 | vector[N] X4; // Thrust
15 | vector[N] Y; // Cost
16 | }
17 |
18 | parameters {
19 | real b_X1;
20 | real b_X2;
21 | real b_X3;
22 | real b_X4;
23 | real beta;
24 | real<lower=0> sigma;
25 | real<lower=0> nu; // student-t degrees of freedom
26 | }
27 |
28 | model {
29 | b_X1 ~ normal(0, 1e6);
30 | b_X2 ~ normal(0, 1e6);
31 | b_X3 ~ normal(0, 1e6);
32 | b_X4 ~ normal(0, 1e6);
33 | beta ~ normal(0, 1e3);
34 | sigma ~ normal(0, 5);
35 | nu ~ gamma(2, 0.1);
36 | Y ~ student_t(nu,
37 | beta + b_X1 * X1 + b_X2 * X2 + b_X3 * X3 + b_X4 * X4,
38 | sigma);
39 | }
40 |
41 | generated quantities {
42 | real Y_pred[N]; // predictions
43 | for (n in 1:N) {
44 | Y_pred[n] = student_t_rng(nu,
45 | beta + b_X1 * X1[n] +
46 | b_X2 * X2[n] + b_X3 * X3[n] +
47 | b_X4 * X4[n],
48 | sigma);
49 | }
50 | }
51 |
--------------------------------------------------------------------------------
/models/twowayANOVA.stan:
--------------------------------------------------------------------------------
1 | /*
2 | * Two-way ANOVA
3 | * -----------------------------------------------------
4 | * Copyright: Murat Koptur
5 | * Date: 17/08/2018
6 | * License: GPLv3
7 | */
8 |
9 | data {
10 | int N;
11 | int x1[N];
12 | int x2[N];
13 | int y[N];
14 | }
15 |
16 | parameters {
17 | real alpha;
18 | real beta_x1;
19 | real beta_x2;
20 | real beta_x3; // interaction coefficient
21 | real<lower=0> sigma;
22 | }
23 |
24 | model {
25 | alpha ~ normal(0, 10);
26 | beta_x1 ~ normal(0, 10);
27 | beta_x2 ~ normal(0, 10);
28 | beta_x3 ~ normal(0, 10);
29 | sigma ~ normal(0, 5);
30 | for (i in 1:N)
31 | y[i] ~ normal(alpha + beta_x1 * x1[i] + beta_x2 * x2[i] + beta_x3 * x1[i] * x2[i], sigma);
32 | }
33 |
--------------------------------------------------------------------------------
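With two binary factors, `beta_x3` multiplies `x1[i] * x2[i]`, so it contributes only in the cell where both factors are at their second level; this is the usual interaction coding. A sketch pairing the model with `data/drugtrial.csv` (an assumption, since the repo does not state the pairing), recoding gender and dose from {1, 2} to {0, 1}:

```python
# Sketch (assumption: this model pairs with drugtrial.csv; gender and
# dose recoded to 0/1 so betas are effects relative to the (1, 1) cell).
import pandas as pd

df = pd.read_csv('data/drugtrial.csv')
data = {
    'N': len(df),
    'x1': (df['gender'] - 1).values,
    'x2': (df['dose'] - 1).values,
    'y': df['score'].values,
}
```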
/notebooks/Bayes Factor.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Bayes Factors"
3 | author: "Murat Koptur"
4 | date: "`r format(Sys.time(), '%d %B %Y')`"
5 | output: rmarkdown::github_document
6 | ---
7 |
8 | ```{r echo=FALSE}
9 | knitr::opts_chunk$set(fig.path='figures/bf-')
10 | ```
11 |
12 | ```{r}
13 | library(haven)
14 | library(BayesFactor)
15 | ```
16 |
17 | ```{r}
18 | scents <- read_spss("../data/scents.sav")
19 | head(scents)
20 | ```
21 |
22 | ```{r}
23 | scents$diffs <- scents$noscent - scents$scent
24 | head(scents)
25 | ```
26 |
27 | ```{r}
28 | bf <- ttestBF(scents$diffs)
29 | bf
30 | ```
31 |
32 | ```{r}
33 | sprintf("Bayes factor: %f", exp(bf@bayesFactor$bf))
34 | ```
35 |
36 |
--------------------------------------------------------------------------------
/notebooks/Bayes_Factor.md:
--------------------------------------------------------------------------------
1 | Bayes Factors
2 | ================
3 | Murat Koptur
4 | 25 August 2018
5 |
6 | ``` r
7 | library(haven)
8 | library(BayesFactor)
9 | ```
10 |
11 | ## Loading required package: coda
12 |
13 | ## Loading required package: Matrix
14 |
15 | ## ************
16 | ## Welcome to BayesFactor 0.9.12-4.2. If you have questions, please contact Richard Morey (richarddmorey@gmail.com).
17 | ##
18 | ## Type BFManual() to open the manual.
19 | ## ************
20 |
21 | ``` r
22 | scents <- read_spss("../data/scents.sav")
23 | head(scents)
24 | ```
25 |
26 | ## # A tibble: 6 x 4
27 | ## part sex noscent scent
28 | ##    <dbl> <dbl>   <dbl> <dbl>
29 | ## 1 1 1 27.7 30.6
30 | ## 2 2 2 57.2 43.3
31 | ## 3 3 1 57.9 53.4
32 | ## 4 4 1 38 37.4
33 | ## 5 5 1 57.9 48.6
34 | ## 6 6 2 32 35.5
35 |
36 | ``` r
37 | scents$diffs <- scents$noscent - scents$scent
38 | head(scents)
39 | ```
40 |
41 | ## # A tibble: 6 x 5
42 | ## part sex noscent scent diffs
43 | ##    <dbl> <dbl>   <dbl> <dbl> <dbl>
44 | ## 1 1 1 27.7 30.6 -2.9
45 | ## 2 2 2 57.2 43.3 13.9
46 | ## 3 3 1 57.9 53.4 4.5
47 | ## 4 4 1 38 37.4 0.6
48 | ## 5 5 1 57.9 48.6 9.30
49 | ## 6 6 2 32 35.5 -3.5
50 |
51 | ``` r
52 | bf <- ttestBF(scents$diffs)
53 | bf
54 | ```
55 |
56 | ## Bayes factor analysis
57 | ## --------------
58 | ## [1] Alt., r=0.707 : 0.2294321 ±0.03%
59 | ##
60 | ## Against denominator:
61 | ## Null, mu = 0
62 | ## ---
63 | ## Bayes factor type: BFoneSample, JZS
64 |
65 | ``` r
66 | sprintf("Bayes factor: %f", exp(bf@bayesFactor$bf))
67 | ```
68 |
69 | ## [1] "Bayes factor: 0.229432"
70 |
--------------------------------------------------------------------------------
/notebooks/Correlation Analysis.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Correlation Analysis"
3 | author: "Murat Koptur"
4 | date: "`r format(Sys.time(), '%d %B %Y')`"
5 | output: rmarkdown::github_document
6 | ---
7 |
8 | ```{r echo=FALSE}
9 | knitr::opts_chunk$set(fig.path='figures/corr-')
10 | ```
11 |
12 | ```{r}
13 | library(ggpubr)
14 | library(haven)
15 | ```
16 |
17 | ```{r}
18 | temprate <- read_sav("../data/temprate.sav")
19 | head(temprate)
20 | ```
21 |
22 | ```{r}
23 | cor.test(temprate$temp, temprate$hrtrate, method = "pearson")
24 | ```
25 |
26 | ```{r}
27 | ggscatter(
28 | data = temprate,
29 | x = "temp",
30 | y = "hrtrate",
31 | add = "reg.line",
32 | conf.int = TRUE,
33 | cor.coef = TRUE,
34 | cor.method = "pearson",
35 | xlab = "Temperature",
36 | ylab = "Heart Rate"
37 | )
38 | ```
39 |
40 |
--------------------------------------------------------------------------------
/notebooks/Correlation_Analysis.md:
--------------------------------------------------------------------------------
1 | Correlation Analysis
2 | ================
3 | Murat Koptur
4 | 24 August 2018
5 |
6 | ``` r
7 | library(ggpubr)
8 | ```
9 |
10 | ## Loading required package: ggplot2
11 |
12 | ## Loading required package: magrittr
13 |
14 | ``` r
15 | library(haven)
16 | ```
17 |
18 | ``` r
19 | temprate <- read_sav("../data/temprate.sav")
20 | head(temprate)
21 | ```
22 |
23 | ## # A tibble: 6 x 2
24 | ## temp hrtrate
25 | ##    <dbl>   <dbl>
26 | ## 1 35.7 70
27 | ## 2 35.9 71
28 | ## 3 36.1 74
29 | ## 4 36.1 80
30 | ## 5 36.2 73
31 | ## 6 36.2 75
32 |
33 | ``` r
34 | cor.test(temprate$temp, temprate$hrtrate, method = "pearson")
35 | ```
36 |
37 | ##
38 | ## Pearson's product-moment correlation
39 | ##
40 | ## data: temprate$temp and temprate$hrtrate
41 | ## t = 2.9668, df = 128, p-value = 0.003591
42 | ## alternative hypothesis: true correlation is not equal to 0
43 | ## 95 percent confidence interval:
44 | ## 0.08519113 0.40802170
45 | ## sample estimates:
46 | ## cor
47 | ## 0.2536564
48 |
49 | ``` r
50 | ggscatter(
51 | data = temprate,
52 | x = "temp",
53 | y = "hrtrate",
54 | add = "reg.line",
55 | conf.int = TRUE,
56 | cor.coef = TRUE,
57 | cor.method = "pearson",
58 | xlab = "Temperature",
59 | ylab = "Heart Rate"
60 | )
61 | ```
62 |
63 | 
64 |
--------------------------------------------------------------------------------
/notebooks/Factor Analysis.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Factor Analysis"
3 | author: "Murat Koptur"
4 | date: "`r format(Sys.time(), '%d %B %Y')`"
5 | output: rmarkdown::github_document
6 | ---
7 |
8 | ```{r echo=FALSE}
9 | knitr::opts_chunk$set(fig.path='figures/factor-')
10 | ```
11 |
12 | ```{r}
13 | library(readr)
14 | library(knitr)
15 | library(psych)
16 | ```
17 |
18 | ```{r results='asis'}
19 | bfi <- read_csv("../data/bfi.csv",
20 | col_types = cols(X1 = col_skip(), age = col_skip(),
21 | education = col_skip(), gender = col_skip()))
22 | kable(head(bfi))
23 | ```
24 | ```{r}
25 | KMO(bfi)
26 | ```
27 | ```{r}
28 | fa.parallel(bfi)
29 | ```
30 | ```{r}
31 | bfi.fa <- fa(bfi, nfactors = 6, fm="pa", max.iter = 100)
32 | fa.diagram(bfi.fa)
33 | ```
34 |
35 |
--------------------------------------------------------------------------------
/notebooks/Factor_Analysis.md:
--------------------------------------------------------------------------------
1 | Factor Analysis
2 | ================
3 | Murat Koptur
4 | 24 August 2018
5 |
6 | ``` r
7 | library(readr)
8 | library(knitr)
9 | library(psych)
10 | ```
11 |
12 | ``` r
13 | bfi <- read_csv("../data/bfi.csv",
14 | col_types = cols(X1 = col_skip(), age = col_skip(),
15 | education = col_skip(), gender = col_skip()))
16 | ```
17 |
18 | ## Warning: Missing column names filled in: 'X1' [1]
19 |
20 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, :
21 | ## length of NULL cannot be changed
22 |
23 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, :
24 | ## length of NULL cannot be changed
25 |
26 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, :
27 | ## length of NULL cannot be changed
28 |
29 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, :
30 | ## length of NULL cannot be changed
31 |
32 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, :
33 | ## length of NULL cannot be changed
34 |
35 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, :
36 | ## length of NULL cannot be changed
37 |
38 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, :
39 | ## length of NULL cannot be changed
40 |
41 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, :
42 | ## length of NULL cannot be changed
43 |
44 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, :
45 | ## length of NULL cannot be changed
46 |
47 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, :
48 | ## length of NULL cannot be changed
49 |
50 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, :
51 | ## length of NULL cannot be changed
52 |
53 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, :
54 | ## length of NULL cannot be changed
55 |
56 | ``` r
57 | kable(head(bfi))
58 | ```
59 |
60 | | A1| A2| A3| A4| A5| C1| C2| C3| C4| C5| E1| E2| E3| E4| E5| N1| N2| N3| N4| N5| O1| O2| O3| O4| O5|
61 | |----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|
62 | | 2| 4| 3| 4| 4| 2| 3| 3| 4| 4| 3| 3| 3| 4| 4| 3| 4| 2| 2| 3| 3| 6| 3| 4| 3|
63 | | 2| 4| 5| 2| 5| 5| 4| 4| 3| 4| 1| 1| 6| 4| 3| 3| 3| 3| 5| 5| 4| 2| 4| 3| 3|
64 | | 5| 4| 5| 4| 4| 4| 5| 4| 2| 5| 2| 4| 4| 4| 5| 4| 5| 4| 2| 3| 4| 2| 5| 5| 2|
65 | | 4| 4| 6| 5| 5| 4| 4| 3| 5| 5| 5| 3| 4| 4| 4| 2| 5| 2| 4| 1| 3| 3| 4| 3| 5|
66 | | 2| 3| 3| 4| 5| 4| 4| 5| 3| 2| 2| 2| 5| 4| 5| 2| 3| 4| 4| 3| 3| 3| 4| 3| 3|
67 | | 6| 6| 5| 6| 5| 6| 6| 6| 1| 3| 2| 1| 6| 5| 6| 3| 5| 2| 2| 3| 4| 3| 5| 6| 1|
68 |
69 | ``` r
70 | KMO(bfi)
71 | ```
72 |
73 | ## Kaiser-Meyer-Olkin factor adequacy
74 | ## Call: KMO(r = bfi)
75 | ## Overall MSA = 0.85
76 | ## MSA for each item =
77 | ## A1 A2 A3 A4 A5 C1 C2 C3 C4 C5 E1 E2 E3 E4 E5
78 | ## 0.74 0.84 0.87 0.87 0.90 0.83 0.79 0.85 0.82 0.86 0.83 0.88 0.89 0.87 0.89
79 | ## N1 N2 N3 N4 N5 O1 O2 O3 O4 O5
80 | ## 0.78 0.78 0.86 0.88 0.86 0.85 0.78 0.84 0.76 0.76
81 |
82 | ``` r
83 | fa.parallel(bfi)
84 | ```
85 |
86 | 
87 |
88 | ## Parallel analysis suggests that the number of factors = 6 and the number of components = 6
89 |
90 | ``` r
91 | bfi.fa <- fa(bfi, nfactors = 6, fm="pa", max.iter = 100)
92 | ```
93 |
94 | ## Loading required namespace: GPArotation
95 |
96 | ``` r
97 | fa.diagram(bfi.fa)
98 | ```
99 |
100 | 
101 |
--------------------------------------------------------------------------------
/notebooks/Multiple Linear Regression with interaction terms.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Multiple Linear Regression with interaction terms"
3 | author: "Murat Koptur"
4 | date: "`r format(Sys.time(), '%d %B %Y')`"
5 | output: rmarkdown::github_document
6 | ---
7 |
8 | ```{r echo=FALSE}
9 | knitr::opts_chunk$set(fig.path='figures/multipleLin-')
10 | ```
11 |
12 | ```{r}
13 | library(GGally)
14 | library(ggplot2)
15 | library(readr)
16 | library(reshape2)
17 | ```
18 |
19 | ```{r}
20 | child_data <- read_csv("../data/child_data.csv")
21 | head(child_data)
22 | ```
23 |
24 | ```{r}
25 | child_data_melted <- melt(child_data)
26 | head(child_data_melted)
27 |
28 | ggplot(data = child_data_melted, aes(x = value)) +
29 | geom_histogram(aes(y = ..ncount..)) +
30 | geom_density(aes(y = ..scaled..)) +
31 | facet_wrap(~variable, scales = "free") +
32 | labs(x = "Values", y = "Frequencies", title = "Histograms")
33 | ```
34 |
35 | ```{r}
36 | ggpairs(child_data)
37 | ```
38 |
39 | ```{r}
40 | child_data_scaled <- scale(child_data)
41 | head(child_data_scaled)
42 |
43 | model1 <- lm(read_ab ~ age + iq, data = as.data.frame(child_data_scaled))
44 | summary(model1)
45 |
46 | model2 <- lm(read_ab ~ age + mem_span, data = as.data.frame(child_data_scaled))
47 | summary(model2)
48 |
49 | model3 <- lm(read_ab ~ age + iq + age:iq, data = as.data.frame(child_data_scaled))
50 | summary(model3)
51 | ```
52 |
53 |
--------------------------------------------------------------------------------
/notebooks/Multiple_Linear_Regression_with_interaction_terms.md:
--------------------------------------------------------------------------------
1 | Multiple Linear Regression with interaction terms
2 | ================
3 | Murat Koptur
4 | 24 August 2018
5 |
6 | ``` r
7 | library(GGally)
8 | ```
9 |
10 | ## Loading required package: ggplot2
11 |
12 | ``` r
13 | library(ggplot2)
14 | library(readr)
15 | library(reshape2)
16 | ```
17 |
18 | ``` r
19 | child_data <- read_csv("../data/child_data.csv")
20 | ```
21 |
22 | ## Parsed with column specification:
23 | ## cols(
24 | ## age = col_double(),
25 | ## mem_span = col_double(),
26 | ## iq = col_integer(),
27 | ## read_ab = col_double()
28 | ## )
29 |
30 | ``` r
31 | head(child_data)
32 | ```
33 |
34 | ## # A tibble: 6 x 4
35 | ## age mem_span iq read_ab
36 | ##   <dbl>    <dbl> <int>   <dbl>
37 | ## 1 6.7 4.4 95 7.2
38 | ## 2 5.9 4 90 6
39 | ## 3 5.5 4.1 105 6
40 | ## 4 6.2 4.8 98 6.6
41 | ## 5 6.4 5 106 7
42 | ## 6 7.3 5.5 100 7.2
43 |
44 | ``` r
45 | child_data_melted <- melt(child_data)
46 | ```
47 |
48 | ## No id variables; using all as measure variables
49 |
50 | ``` r
51 | head(child_data_melted)
52 | ```
53 |
54 | ## variable value
55 | ## 1 age 6.7
56 | ## 2 age 5.9
57 | ## 3 age 5.5
58 | ## 4 age 6.2
59 | ## 5 age 6.4
60 | ## 6 age 7.3
61 |
62 | ``` r
63 | ggplot(data = child_data_melted, aes(x = value)) +
64 | geom_histogram(aes(y = ..ncount..)) +
65 | geom_density(aes(y = ..scaled..)) +
66 | facet_wrap(~variable, scales = "free") +
67 | labs(x = "Values", y = "Frequencies", title = "Histograms")
68 | ```
69 |
70 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
71 |
72 | 
73 |
74 | ``` r
75 | ggpairs(child_data)
76 | ```
77 |
78 | 
79 |
80 | ``` r
81 | child_data_scaled <- scale(child_data)
82 | head(child_data_scaled)
83 | ```
84 |
85 | ## age mem_span iq read_ab
86 | ## [1,] 0.6268603 -0.2164352 -0.2376403 1.309125
87 | ## [2,] -0.1188471 -0.9090277 -1.0297747 -0.436375
88 | ## [3,] -0.4917008 -0.7358796 1.3466285 -0.436375
89 | ## [4,] 0.1607932 0.4761574 0.2376403 0.436375
90 | ## [5,] 0.3472200 0.8224536 1.5050553 1.018208
91 | ## [6,] 1.1861409 1.6881943 0.5544941 1.309125
92 |
93 | ``` r
94 | model1 <- lm(read_ab ~ age + iq, data = as.data.frame(child_data_scaled))
95 | summary(model1)
96 | ```
97 |
98 | ##
99 | ## Call:
100 | ## lm(formula = read_ab ~ age + iq, data = as.data.frame(child_data_scaled))
101 | ##
102 | ## Residuals:
103 | ## Min 1Q Median 3Q Max
104 | ## -0.85644 -0.02059 0.04402 0.20506 0.81633
105 | ##
106 | ## Coefficients:
107 | ## Estimate Std. Error t value Pr(>|t|)
108 | ## (Intercept) -2.302e-16 9.998e-02 0.000 1.00000
109 | ## age 9.117e-01 1.047e-01 8.711 1.12e-07 ***
110 | ## iq 3.313e-01 1.047e-01 3.165 0.00565 **
111 | ## ---
112 | ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
113 | ##
114 | ## Residual standard error: 0.4471 on 17 degrees of freedom
115 | ## Multiple R-squared: 0.8211, Adjusted R-squared: 0.8001
116 | ## F-statistic: 39.02 on 2 and 17 DF, p-value: 4.434e-07
117 |
118 | ``` r
119 | model2 <- lm(read_ab ~ age + mem_span, data = as.data.frame(child_data_scaled))
120 | summary(model2)
121 | ```
122 |
123 | ##
124 | ## Call:
125 | ## lm(formula = read_ab ~ age + mem_span, data = as.data.frame(child_data_scaled))
126 | ##
127 | ## Residuals:
128 | ## Min 1Q Median 3Q Max
129 | ## -0.9536 -0.2206 0.0244 0.1668 1.0719
130 | ##
131 | ## Coefficients:
132 | ## Estimate Std. Error t value Pr(>|t|)
133 | ## (Intercept) 1.363e-16 1.038e-01 0.000 1.00000
134 | ## age 5.296e-01 1.542e-01 3.435 0.00316 **
135 | ## mem_span 4.377e-01 1.542e-01 2.839 0.01135 *
136 | ## ---
137 | ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
138 | ##
139 | ## Residual standard error: 0.4643 on 17 degrees of freedom
140 | ## Multiple R-squared: 0.8071, Adjusted R-squared: 0.7844
141 | ## F-statistic: 35.57 on 2 and 17 DF, p-value: 8.414e-07
142 |
143 | ``` r
144 | model3 <- lm(read_ab ~ age + iq + age:iq, data = as.data.frame(child_data_scaled))
145 | summary(model3)
146 | ```
147 |
148 | ##
149 | ## Call:
150 | ## lm(formula = read_ab ~ age + iq + age:iq, data = as.data.frame(child_data_scaled))
151 | ##
152 | ## Residuals:
153 | ## Min 1Q Median 3Q Max
154 | ## -0.82042 -0.08630 -0.01172 0.18550 0.89331
155 | ##
156 | ## Coefficients:
157 | ## Estimate Std. Error t value Pr(>|t|)
158 | ## (Intercept) 0.03942 0.09964 0.396 0.69764
159 | ## age 0.79560 0.12613 6.308 1.04e-05 ***
160 | ## iq 0.38369 0.10642 3.605 0.00237 **
161 | ## age:iq 0.20914 0.13667 1.530 0.14549
162 | ## ---
163 | ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
164 | ##
165 | ## Residual standard error: 0.4305 on 16 degrees of freedom
166 | ## Multiple R-squared: 0.844, Adjusted R-squared: 0.8147
167 | ## F-statistic: 28.85 on 3 and 16 DF, p-value: 1.089e-06
168 |
--------------------------------------------------------------------------------
/notebooks/Poisson Regression.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Poisson Regression"
3 | author: "Murat Koptur"
4 | date: "`r format(Sys.time(), '%d %B %Y')`"
5 | output: rmarkdown::github_document
6 | ---
7 |
8 | ```{r echo=FALSE}
9 | knitr::opts_chunk$set(fig.path='figures/poisson-')
10 | ```
11 |
12 |
13 | ```{r}
14 | library(bayesplot)
15 | library(ggplot2)
16 | library(readr)
17 | library(reshape2)
18 | library(rstanarm)
19 | ```
20 |
21 | ```{r}
22 | awards <- read_csv("../data/awards.csv",
23 | col_types = cols(id = col_skip(), prog = col_factor(levels = c("1", "2", "3"))))
24 | head(awards)
25 | ```
26 |
27 | ```{r}
28 | awards_melted <- melt(awards)
29 | head(awards_melted)
30 | ```
31 |
32 | ```{r}
33 | ggplot(data = awards_melted, aes(x = value)) +
34 | geom_histogram(aes(y = ..ncount..)) +
35 | geom_density(aes(y = ..scaled..)) +
36 | facet_wrap(~variable, scales = "free") +
37 | labs(x = "Values", y = "Frequencies", title = "Histograms")
38 | ```
39 |
40 | ```{r}
41 | awards$math <- scale(awards$math)
42 | ```
43 |
44 | ```{r}
45 | model1 <- glm(num_awards ~ math + prog, data = awards, family = poisson)
46 | summary(model1)
47 | ```
48 |
49 | ```{r}
50 | model2 <- stan_glm(num_awards ~ math + prog, data = awards, family = poisson,
51 | prior = normal(0, 10), prior_intercept = normal(0, 10))
52 | summary(model2)
53 | ```
54 |
55 | ```{r}
56 | posterior_interval(model2, prob = 0.95)
57 | plot(model2, plotfun = "areas", prob = 0.95)
58 | ```
59 |
60 | ```{r}
61 | pp_check(model2)
62 | ```
63 |
64 |
--------------------------------------------------------------------------------
/notebooks/Poisson_Regression.md:
--------------------------------------------------------------------------------
1 | Poisson Regression
2 | ================
3 | Murat Koptur
4 | 24 August 2018
5 |
6 | ``` r
7 | library(bayesplot)
8 | ```
9 |
10 | ## This is bayesplot version 1.6.0
11 |
12 | ## - Online documentation and vignettes at mc-stan.org/bayesplot
13 |
14 | ## - bayesplot theme set to bayesplot::theme_default()
15 |
16 | ## * Does _not_ affect other ggplot2 plots
17 |
18 | ## * See ?bayesplot_theme_set for details on theme setting
19 |
20 | ``` r
21 | library(ggplot2)
22 | library(readr)
23 | library(reshape2)
24 | library(rstanarm)
25 | ```
26 |
27 | ## Loading required package: Rcpp
28 |
29 | ## rstanarm (Version 2.17.4, packaged: 2018-04-13 01:51:52 UTC)
30 |
31 | ## - Do not expect the default priors to remain the same in future rstanarm versions.
32 |
33 | ## Thus, R scripts should specify priors explicitly, even if they are just the defaults.
34 |
35 | ## - For execution on a local, multicore CPU with excess RAM we recommend calling
36 |
37 | ## options(mc.cores = parallel::detectCores())
38 |
39 | ## - Plotting theme set to bayesplot::theme_default().
40 |
41 | ``` r
42 | awards <- read_csv("../data/awards.csv",
43 | col_types = cols(id = col_skip(), prog = col_factor(levels = c("1", "2", "3"))))
44 | ```
45 |
46 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, :
47 | ## length of NULL cannot be changed
48 |
49 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, :
50 | ## length of NULL cannot be changed
51 |
52 | ``` r
53 | head(awards)
54 | ```
55 |
56 | ## # A tibble: 6 x 3
57 | ## num_awards prog math
58 | ##        <int> <fct> <int>
59 | ## 1 1 3 41
60 | ## 2 1 1 41
61 | ## 3 1 3 44
62 | ## 4 1 3 42
63 | ## 5 1 3 40
64 | ## 6 1 1 42
65 |
66 | ``` r
67 | awards_melted <- melt(awards)
68 | ```
69 |
70 | ## Using prog as id variables
71 |
72 | ``` r
73 | head(awards_melted)
74 | ```
75 |
76 | ## prog variable value
77 | ## 1 3 num_awards 1
78 | ## 2 1 num_awards 1
79 | ## 3 3 num_awards 1
80 | ## 4 3 num_awards 1
81 | ## 5 3 num_awards 1
82 | ## 6 1 num_awards 1
83 |
84 | ``` r
85 | ggplot(data = awards_melted, aes(x = value)) +
86 | geom_histogram(aes(y = ..ncount..)) +
87 | geom_density(aes(y = ..scaled..)) +
88 | facet_wrap(~variable, scales = "free") +
89 | labs(x = "Values", y = "Frequencies", title = "Histograms")
90 | ```
91 |
92 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
93 |
94 | 
95 |
96 | ``` r
97 | awards$math <- scale(awards$math)
98 | ```
99 |
100 | ``` r
101 | model1 <- glm(num_awards ~ math + prog, data = awards, family = poisson)
102 | summary(model1)
103 | ```
104 |
105 | ##
106 | ## Call:
107 | ## glm(formula = num_awards ~ math + prog, family = poisson, data = awards)
108 | ##
109 | ## Deviance Residuals:
110 | ## Min 1Q Median 3Q Max
111 | ## -1.96335 -1.14818 -0.01392 0.35710 2.52541
112 | ##
113 | ## Coefficients:
114 | ## Estimate Std. Error z value Pr(>|z|)
115 | ## (Intercept) -0.48897 0.19620 -2.492 0.0127 *
116 | ## math 0.33520 0.07817 4.288 1.8e-05 ***
117 | ## prog2 0.45262 0.22475 2.014 0.0440 *
118 | ## prog3 0.56172 0.24748 2.270 0.0232 *
119 | ## ---
120 | ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
121 | ##
122 | ## (Dispersion parameter for poisson family taken to be 1)
123 | ##
124 | ## Null deviance: 228.83 on 199 degrees of freedom
125 | ## Residual deviance: 198.05 on 196 degrees of freedom
126 | ## AIC: 496.36
127 | ##
128 | ## Number of Fisher Scoring iterations: 5
129 |
130 | ``` r
131 | model2 <- stan_glm(num_awards ~ math + prog, data = awards, family = poisson,
132 | prior = normal(0, 10), prior_intercept = normal(0, 10))
133 | ```
134 |
135 | ##
136 | ## SAMPLING FOR MODEL 'count' NOW (CHAIN 1).
137 | ##
138 | ## Gradient evaluation took 0.000117 seconds
139 | ## 1000 transitions using 10 leapfrog steps per transition would take 1.17 seconds.
140 | ## Adjust your expectations accordingly!
141 | ##
142 | ##
143 | ## Iteration: 1 / 2000 [ 0%] (Warmup)
144 | ## Iteration: 200 / 2000 [ 10%] (Warmup)
145 | ## Iteration: 400 / 2000 [ 20%] (Warmup)
146 | ## Iteration: 600 / 2000 [ 30%] (Warmup)
147 | ## Iteration: 800 / 2000 [ 40%] (Warmup)
148 | ## Iteration: 1000 / 2000 [ 50%] (Warmup)
149 | ## Iteration: 1001 / 2000 [ 50%] (Sampling)
150 | ## Iteration: 1200 / 2000 [ 60%] (Sampling)
151 | ## Iteration: 1400 / 2000 [ 70%] (Sampling)
152 | ## Iteration: 1600 / 2000 [ 80%] (Sampling)
153 | ## Iteration: 1800 / 2000 [ 90%] (Sampling)
154 | ## Iteration: 2000 / 2000 [100%] (Sampling)
155 | ##
156 | ## Elapsed Time: 0.289811 seconds (Warm-up)
157 | ## 0.270276 seconds (Sampling)
158 | ## 0.560087 seconds (Total)
159 | ##
160 | ##
161 | ## SAMPLING FOR MODEL 'count' NOW (CHAIN 2).
162 | ##
163 | ## Gradient evaluation took 3.1e-05 seconds
164 | ## 1000 transitions using 10 leapfrog steps per transition would take 0.31 seconds.
165 | ## Adjust your expectations accordingly!
166 | ##
167 | ##
168 | ## Iteration: 1 / 2000 [ 0%] (Warmup)
169 | ## Iteration: 200 / 2000 [ 10%] (Warmup)
170 | ## Iteration: 400 / 2000 [ 20%] (Warmup)
171 | ## Iteration: 600 / 2000 [ 30%] (Warmup)
172 | ## Iteration: 800 / 2000 [ 40%] (Warmup)
173 | ## Iteration: 1000 / 2000 [ 50%] (Warmup)
174 | ## Iteration: 1001 / 2000 [ 50%] (Sampling)
175 | ## Iteration: 1200 / 2000 [ 60%] (Sampling)
176 | ## Iteration: 1400 / 2000 [ 70%] (Sampling)
177 | ## Iteration: 1600 / 2000 [ 80%] (Sampling)
178 | ## Iteration: 1800 / 2000 [ 90%] (Sampling)
179 | ## Iteration: 2000 / 2000 [100%] (Sampling)
180 | ##
181 | ## Elapsed Time: 0.281356 seconds (Warm-up)
182 | ## 0.258399 seconds (Sampling)
183 | ## 0.539755 seconds (Total)
184 | ##
185 | ##
186 | ## SAMPLING FOR MODEL 'count' NOW (CHAIN 3).
187 | ##
188 | ## Gradient evaluation took 3e-05 seconds
189 | ## 1000 transitions using 10 leapfrog steps per transition would take 0.3 seconds.
190 | ## Adjust your expectations accordingly!
191 | ##
192 | ##
193 | ## Iteration: 1 / 2000 [ 0%] (Warmup)
194 | ## Iteration: 200 / 2000 [ 10%] (Warmup)
195 | ## Iteration: 400 / 2000 [ 20%] (Warmup)
196 | ## Iteration: 600 / 2000 [ 30%] (Warmup)
197 | ## Iteration: 800 / 2000 [ 40%] (Warmup)
198 | ## Iteration: 1000 / 2000 [ 50%] (Warmup)
199 | ## Iteration: 1001 / 2000 [ 50%] (Sampling)
200 | ## Iteration: 1200 / 2000 [ 60%] (Sampling)
201 | ## Iteration: 1400 / 2000 [ 70%] (Sampling)
202 | ## Iteration: 1600 / 2000 [ 80%] (Sampling)
203 | ## Iteration: 1800 / 2000 [ 90%] (Sampling)
204 | ## Iteration: 2000 / 2000 [100%] (Sampling)
205 | ##
206 | ## Elapsed Time: 0.273531 seconds (Warm-up)
207 | ## 0.267135 seconds (Sampling)
208 | ## 0.540666 seconds (Total)
209 | ##
210 | ##
211 | ## SAMPLING FOR MODEL 'count' NOW (CHAIN 4).
212 | ##
213 | ## Gradient evaluation took 3.1e-05 seconds
214 | ## 1000 transitions using 10 leapfrog steps per transition would take 0.31 seconds.
215 | ## Adjust your expectations accordingly!
216 | ##
217 | ##
218 | ## Iteration: 1 / 2000 [ 0%] (Warmup)
219 | ## Iteration: 200 / 2000 [ 10%] (Warmup)
220 | ## Iteration: 400 / 2000 [ 20%] (Warmup)
221 | ## Iteration: 600 / 2000 [ 30%] (Warmup)
222 | ## Iteration: 800 / 2000 [ 40%] (Warmup)
223 | ## Iteration: 1000 / 2000 [ 50%] (Warmup)
224 | ## Iteration: 1001 / 2000 [ 50%] (Sampling)
225 | ## Iteration: 1200 / 2000 [ 60%] (Sampling)
226 | ## Iteration: 1400 / 2000 [ 70%] (Sampling)
227 | ## Iteration: 1600 / 2000 [ 80%] (Sampling)
228 | ## Iteration: 1800 / 2000 [ 90%] (Sampling)
229 | ## Iteration: 2000 / 2000 [100%] (Sampling)
230 | ##
231 | ## Elapsed Time: 0.248926 seconds (Warm-up)
232 | ## 0.250404 seconds (Sampling)
233 | ## 0.49933 seconds (Total)
234 |
235 | ``` r
236 | summary(model2)
237 | ```
238 |
239 | ##
240 | ## Model Info:
241 | ##
242 | ## function: stan_glm
243 | ## family: poisson [log]
244 | ## formula: num_awards ~ math + prog
245 | ## algorithm: sampling
246 | ## priors: see help('prior_summary')
247 | ## sample: 4000 (posterior sample size)
248 | ## observations: 200
249 | ## predictors: 4
250 | ##
251 | ## Estimates:
252 | ## mean sd 2.5% 25% 50% 75% 97.5%
253 | ## (Intercept) -0.5 0.2 -0.9 -0.6 -0.5 -0.4 -0.1
254 | ## math 0.3 0.1 0.2 0.3 0.3 0.4 0.5
255 | ## prog2 0.5 0.2 0.0 0.3 0.5 0.6 0.9
256 | ## prog3 0.6 0.3 0.1 0.4 0.6 0.7 1.0
257 | ## mean_PPD 1.0 0.1 0.8 0.9 1.0 1.0 1.2
258 | ## log-posterior -252.2 1.4 -255.8 -252.9 -251.9 -251.1 -250.4
259 | ##
260 | ## Diagnostics:
261 | ## mcse Rhat n_eff
262 | ## (Intercept) 0.0 1.0 1997
263 | ## math 0.0 1.0 2485
264 | ## prog2 0.0 1.0 2291
265 | ## prog3 0.0 1.0 2054
266 | ## mean_PPD 0.0 1.0 3751
267 | ## log-posterior 0.0 1.0 1624
268 | ##
269 | ## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
270 |
271 | ``` r
272 | posterior_interval(model2, prob = 0.95)
273 | ```
274 |
275 | ## 2.5% 97.5%
276 | ## (Intercept) -0.89457959 -0.1447066
277 | ## math 0.18111692 0.4915252
278 | ## prog2 0.03168288 0.9214785
279 | ## prog3 0.07135645 1.0449510
280 |
281 | ``` r
282 | plot(model2, plotfun = "areas", prob = 0.95)
283 | ```
284 |
285 | 
286 |
287 | ``` r
288 | pp_check(model2)
289 | ```
290 |
291 | 
292 |
--------------------------------------------------------------------------------
/notebooks/figures/corr-unnamed-chunk-5-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/notebooks/figures/corr-unnamed-chunk-5-1.png
--------------------------------------------------------------------------------
/notebooks/figures/factor-unnamed-chunk-5-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/notebooks/figures/factor-unnamed-chunk-5-1.png
--------------------------------------------------------------------------------
/notebooks/figures/factor-unnamed-chunk-6-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/notebooks/figures/factor-unnamed-chunk-6-1.png
--------------------------------------------------------------------------------
/notebooks/figures/multipleLin-unnamed-chunk-4-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/notebooks/figures/multipleLin-unnamed-chunk-4-1.png
--------------------------------------------------------------------------------
/notebooks/figures/multipleLin-unnamed-chunk-5-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/notebooks/figures/multipleLin-unnamed-chunk-5-1.png
--------------------------------------------------------------------------------
/notebooks/figures/poisson-unnamed-chunk-10-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/notebooks/figures/poisson-unnamed-chunk-10-1.png
--------------------------------------------------------------------------------
/notebooks/figures/poisson-unnamed-chunk-5-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/notebooks/figures/poisson-unnamed-chunk-5-1.png
--------------------------------------------------------------------------------
/notebooks/figures/poisson-unnamed-chunk-9-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/notebooks/figures/poisson-unnamed-chunk-9-1.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | seaborn==0.9.0
2 | pandas==0.23.4
3 | pystan==2.17.1.0
4 | matplotlib==2.2.2
5 | numpy==1.13.3
6 | scikit_learn==0.19.2
7 | statsmodels==0.9.0
--------------------------------------------------------------------------------
/scripts/Multiple linear regression with interaction terms.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | import pandas as pd
4 | import seaborn as sns
5 | import statsmodels.api as sm
6 |
7 | # http://staff.bath.ac.uk/pssiw/stats2/page16/page16.html
8 | df = pd.read_csv("../data/child_data.csv")
9 | print(df.head())
10 |
11 | sns.set(style="white", palette="muted", color_codes=True)
12 |
13 | f, axes = plt.subplots(2, 2, figsize=(7, 7))
14 |
15 | sns.distplot(df.age, ax=axes[0, 0])
16 | sns.distplot(df.mem_span, ax=axes[0, 1])
17 | sns.distplot(df.iq, ax=axes[1, 0])
18 | sns.distplot(df.read_ab, ax=axes[1, 1])
19 | plt.show()
20 |
21 | sns.pairplot(df, vars=['age', 'mem_span', 'iq'])
22 | plt.show()
23 |
24 | # Rescale all variables
25 | for col in df.columns.values:
26 | df[col] = (df[col] - np.mean(df[col]))/(2 * np.std(df[col]))
27 |
28 | print(df.head())
29 |
30 | # Ordinary multiple linear regression
31 | # Mem_span and age seem correlated, so I'll use only one of them
32 | mod1 = sm.formula.ols('read_ab ~ age + iq', data=df).fit()
33 | print(mod1.summary())
34 |
35 | mod2 = sm.formula.ols('read_ab ~ age + mem_span', data=df).fit()
36 | print(mod2.summary())
37 |
38 | # Now, add interaction term (separate name so the first model isn't shadowed)
39 | mod3 = sm.formula.ols('read_ab ~ age + iq + age:iq', data=df).fit()
40 | print(mod3.summary())
41 |
--------------------------------------------------------------------------------
/scripts/Poisson Regression.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | import pandas as pd
4 | import seaborn as sns
5 | import statsmodels.api as sm
6 |
7 | # http://staff.bath.ac.uk/pssiw/stats2/page16/page16.html
8 | df = pd.read_csv("../data/awards.csv", index_col=0)
9 | print(df.head())
10 |
11 | print(df.describe())
12 |
13 | df = pd.get_dummies(df, columns=["prog"])
14 | del df['prog_1']
15 | print(df.head())
16 |
17 | df['math'] = (df['math'] - np.mean(df['math']))/(2 * np.std(df['math']))
18 | print(df.head())
19 |
20 | X = np.column_stack(
21 | (np.ones((df.shape[0], 1)), df[['math', 'prog_2', 'prog_3']]))
22 | y = df['num_awards']
23 |
24 | mod = sm.formula.GLM(y, X, family=sm.families.Poisson()).fit()
25 | print(mod.summary())
26 |
27 | model_fitted_y = mod.fittedvalues
28 | model_residuals = mod.resid_deviance  # deviance residuals of the GLM fit
29 | model_abs_resid = np.abs(model_residuals)
30 |
31 | # https://medium.com/@emredjan/emulating-r-regression-plots-in-python-43741952c034
32 | plot_lm_1 = plt.figure(1)
33 | plot_lm_1.set_figheight(8)
34 | plot_lm_1.set_figwidth(12)
35 |
36 | plot_lm_1.axes[0] = sns.residplot(model_fitted_y, 'num_awards', data=df,
37 | lowess=True,
38 | scatter_kws={'alpha': 0.5},
39 | line_kws={'color': 'red', 'lw': 2, 'alpha': 0.8})
40 |
41 | plot_lm_1.axes[0].set_title('Residuals vs Fitted')
42 | plot_lm_1.axes[0].set_xlabel('Fitted values')
43 | plot_lm_1.axes[0].set_ylabel('Residuals')
44 | plt.show()
45 |
--------------------------------------------------------------------------------
/scripts/helper/psis.py:
--------------------------------------------------------------------------------
1 | """Pareto smoothed importance sampling (PSIS)
2 |
3 | This module implements Pareto smoothed importance sampling (PSIS) and PSIS
4 | leave-one-out (LOO) cross-validation for Python (Numpy).
5 |
6 | Included functions
7 | ------------------
8 | psisloo
9 | Pareto smoothed importance sampling leave-one-out log predictive densities.
10 |
11 | psislw
12 | Pareto smoothed importance sampling.
13 |
14 | gpdfitnew
15 | Estimate the parameters for the Generalized Pareto Distribution (GPD).
16 |
17 | gpinv
18 | Inverse Generalised Pareto distribution function.
19 |
20 | sumlogs
21 | Sum of vector where numbers are represented by their logarithms.
22 |
23 | References
24 | ----------
25 | Aki Vehtari, Andrew Gelman and Jonah Gabry (2017). Practical
26 | Bayesian model evaluation using leave-one-out cross-validation
27 | and WAIC. Statistics and Computing, 27(5):1413–1432.
28 | doi:10.1007/s11222-016-9696-4. https://arxiv.org/abs/1507.04544
29 |
30 | Aki Vehtari, Andrew Gelman and Jonah Gabry (2017). Pareto
31 | smoothed importance sampling. https://arxiv.org/abs/arXiv:1507.02646v5
32 |
33 | """
34 |
35 | from __future__ import division # For Python 2 compatibility
36 | import numpy as np
37 |
38 | # 3-Clause BSD License
39 | """
40 | Copyright 2017 Aki Vehtari, Tuomas Sivula
41 |
42 | Redistribution and use in source and binary forms, with or without modification,
43 | are permitted provided that the following conditions are met:
44 |
45 | 1. Redistributions of source code must retain the above copyright notice, this
46 | list of conditions and the following disclaimer.
47 |
48 | 2. Redistributions in binary form must reproduce the above copyright notice,
49 | this list of conditions and the following disclaimer in the documentation and/or
50 | other materials provided with the distribution.
51 |
52 | 3. Neither the name of the copyright holder nor the names of its contributors
53 | may be used to endorse or promote products derived from this software without
54 | specific prior written permission.
55 |
56 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
57 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
58 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
59 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
60 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
61 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
62 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
63 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
64 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
65 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """
66 |
67 |
68 | def psisloo(log_lik, **kwargs):
69 | r"""PSIS leave-one-out log predictive densities.
70 |
71 | Computes the log predictive densities given posterior samples of the log
72 | likelihood terms :math:`p(y_i|\theta^s)` in input parameter `log_lik`.
73 | Returns a sum of the leave-one-out log predictive densities `loo`,
74 | individual leave-one-out log predictive density terms `loos` and an estimate
75 | of Pareto tail indices ``ks``. The estimates are unreliable if tail index
76 | ``k > 0.7`` (see more in the references listed in the module docstring).
77 |
78 | Additional keyword arguments are passed to the :meth:`psislw()` function
79 | (see the corresponding documentation).
80 |
81 | Parameters
82 | ----------
83 | log_lik : ndarray
84 | Array of size n x m containing n posterior samples of the log likelihood
85 | terms :math:`p(y_i|\theta^s)`.
86 |
87 | Returns
88 | -------
89 | loo : scalar
90 | sum of the leave-one-out log predictive densities
91 |
92 | loos : ndarray
93 | individual leave-one-out log predictive density terms
94 |
95 | ks : ndarray
96 | estimated Pareto tail indices
97 |
98 | """
99 | # ensure overwrite flag in passed arguments
100 | kwargs['overwrite_lw'] = True
101 | # log raw weights from log_lik
102 | lw = -log_lik
103 | # compute Pareto smoothed log weights given raw log weights
104 | lw, ks = psislw(lw, **kwargs)
105 | # compute the individual LOO log predictive densities
106 | lw += log_lik
107 | loos = sumlogs(lw, axis=0)
108 | loo = loos.sum()
109 | return loo, loos, ks
110 |
111 |
112 | def psislw(lw, Reff=1.0, overwrite_lw=False):
113 | """Pareto smoothed importance sampling (PSIS).
114 |
115 | Parameters
116 | ----------
117 | lw : ndarray
118 | Array of size n x m containing m sets of n log weights. It is also
119 | possible to provide one dimensional array of length n.
120 |
121 | Reff : scalar, optional
122 | relative MCMC efficiency ``N_eff / N``
123 |
124 | overwrite_lw : bool, optional
125 | If True, the input array `lw` is smoothed in-place, assuming the array
126 | is F-contiguous. By default, a new array is allocated.
127 |
128 | Returns
129 | -------
130 | lw_out : ndarray
131 | smoothed log weights
132 | kss : ndarray
133 | Pareto tail indices
134 |
135 | """
136 | if lw.ndim == 2:
137 | n, m = lw.shape
138 | elif lw.ndim == 1:
139 | n = len(lw)
140 | m = 1
141 | else:
142 | raise ValueError("Argument `lw` must be 1 or 2 dimensional.")
143 | if n <= 1:
144 | raise ValueError("More than one log-weight needed.")
145 |
146 | if overwrite_lw and lw.flags.f_contiguous:
147 | # in-place operation
148 | lw_out = lw
149 | else:
150 | # allocate new array for output
151 | lw_out = np.copy(lw, order='F')
152 |
153 | # allocate output array for kss
154 | kss = np.empty(m)
155 |
156 | # precalculate constants
157 | cutoff_ind = - int(np.ceil(min(0.2 * n, 3 * np.sqrt(n / Reff)))) - 1
158 | cutoffmin = np.log(np.finfo(float).tiny)
159 | logn = np.log(n)
160 | k_min = 1/3
161 |
162 | # loop over sets of log weights
163 | for i, x in enumerate(lw_out.T if lw_out.ndim == 2 else lw_out[None, :]):
164 | # improve numerical accuracy
165 | x -= np.max(x)
166 | # sort the array
167 | x_sort_ind = np.argsort(x)
168 | # divide log weights into body and right tail
169 | xcutoff = max(
170 | x[x_sort_ind[cutoff_ind]],
171 | cutoffmin
172 | )
173 | expxcutoff = np.exp(xcutoff)
174 | tailinds, = np.where(x > xcutoff)
175 | x2 = x[tailinds]
176 | n2 = len(x2)
177 | if n2 <= 4:
178 | # not enough tail samples for gpdfitnew
179 | k = np.inf
180 | else:
181 | # order of tail samples
182 | x2si = np.argsort(x2)
183 | # fit generalized Pareto distribution to the right tail samples
184 | np.exp(x2, out=x2)
185 | x2 -= expxcutoff
186 | k, sigma = gpdfitnew(x2, sort=x2si)
187 | if k >= k_min and not np.isinf(k):
188 | # no smoothing if short tail or GPD fit failed
189 | # compute ordered statistic for the fit
190 | sti = np.arange(0.5, n2)
191 | sti /= n2
192 | qq = gpinv(sti, k, sigma)
193 | qq += expxcutoff
194 | np.log(qq, out=qq)
195 | # place the smoothed tail into the output array
196 | x[tailinds[x2si]] = qq
197 | # truncate smoothed values to the largest raw weight 0
198 | x[x > 0] = 0
199 | # renormalize weights
200 | x -= sumlogs(x)
201 | # store tail index k
202 | kss[i] = k
203 |
204 | # If the provided input array is one dimensional, return kss as scalar.
205 | if lw_out.ndim == 1:
206 | kss = kss[0]
207 |
208 | return lw_out, kss
209 |
210 |
211 | def gpdfitnew(x, sort=True, sort_in_place=False, return_quadrature=False):
212 | """Estimate the paramaters for the Generalized Pareto Distribution (GPD)
213 |
214 | Returns empirical Bayes estimate for the parameters of the two-parameter
215 | generalized Pareto distribution given the data.
216 |
217 | Parameters
218 | ----------
219 | x : ndarray
220 | One dimensional data array
221 |
222 | sort : bool or ndarray, optional
223 | If known in advance, one can provide an array of indices that would
224 | sort the input array `x`. If the input array is already sorted, provide
225 | False. If True (default behaviour), the array is sorted internally.
226 |
227 | sort_in_place : bool, optional
228 | If `sort` is True and `sort_in_place` is True, the array is sorted
229 | in-place (False by default).
230 |
231 | return_quadrature : bool, optional
232 | If True, quadrature points and weights `ks` and `w` of the marginal posterior
233 | distribution of k are also calculated and returned. False by default.
234 |
235 | Returns
236 | -------
237 | k, sigma : float
238 | estimated parameter values
239 |
240 | ks, w : ndarray
241 | Quadrature points and weights of the marginal posterior distribution
242 | of `k`. Returned only if `return_quadrature` is True.
243 |
244 | Notes
245 | -----
246 | This function returns a negative of Zhang and Stephens's k, because it is
247 | the more common parameterisation.
248 |
249 | """
250 | if x.ndim != 1 or len(x) <= 1:
251 | raise ValueError("Invalid input array.")
252 |
253 | # check if x should be sorted
254 | if sort is True:
255 | if sort_in_place:
256 | x.sort()
257 | xsorted = True
258 | else:
259 | sort = np.argsort(x)
260 | xsorted = False
261 | elif sort is False:
262 | xsorted = True
263 | else:
264 | xsorted = False
265 |
266 | n = len(x)
267 | PRIOR = 3
268 | m = 30 + int(np.sqrt(n))
269 |
270 | bs = np.arange(1, m + 1, dtype=float)
271 | bs -= 0.5
272 | np.divide(m, bs, out=bs)
273 | np.sqrt(bs, out=bs)
274 | np.subtract(1, bs, out=bs)
275 | if xsorted:
276 | bs /= PRIOR * x[int(n/4 + 0.5) - 1]
277 | bs += 1 / x[-1]
278 | else:
279 | bs /= PRIOR * x[sort[int(n/4 + 0.5) - 1]]
280 | bs += 1 / x[sort[-1]]
281 |
282 | ks = np.negative(bs)
283 | temp = ks[:,None] * x
284 | np.log1p(temp, out=temp)
285 | np.mean(temp, axis=1, out=ks)
286 |
287 | L = bs / ks
288 | np.negative(L, out=L)
289 | np.log(L, out=L)
290 | L -= ks
291 | L -= 1
292 | L *= n
293 |
294 | temp = L - L[:,None]
295 | np.exp(temp, out=temp)
296 | w = np.sum(temp, axis=1)
297 | np.divide(1, w, out=w)
298 |
299 | # remove negligible weights
300 | dii = w >= 10 * np.finfo(float).eps
301 | if not np.all(dii):
302 | w = w[dii]
303 | bs = bs[dii]
304 | # normalise w
305 | w /= w.sum()
306 |
307 | # posterior mean for b
308 | b = np.sum(bs * w)
309 | # Estimate for k, note that we return a negative of Zhang and
310 | # Stephens's k, because it is the more common parameterisation.
311 | temp = (-b) * x
312 | np.log1p(temp, out=temp)
313 | k = np.mean(temp)
314 | if return_quadrature:
315 | np.negative(x, out=temp)
316 | temp = bs[:, None] * temp
317 | np.log1p(temp, out=temp)
318 | ks = np.mean(temp, axis=1)
319 | # estimate for sigma
320 | sigma = -k / b * n / (n - 0)
321 | # weakly informative prior for k
322 | a = 10
323 | k = k * n / (n+a) + a * 0.5 / (n+a)
324 | if return_quadrature:
325 | ks *= n / (n+a)
326 | ks += a * 0.5 / (n+a)
327 |
328 | if return_quadrature:
329 | return k, sigma, ks, w
330 | else:
331 | return k, sigma
332 |
333 |
334 | def gpinv(p, k, sigma):
335 | """Inverse Generalised Pareto distribution function."""
336 | x = np.empty(p.shape)
337 | x.fill(np.nan)
338 | if sigma <= 0:
339 | return x
340 | ok = (p > 0) & (p < 1)
341 | if np.all(ok):
342 | if np.abs(k) < np.finfo(float).eps:
343 | np.negative(p, out=x)
344 | np.log1p(x, out=x)
345 | np.negative(x, out=x)
346 | else:
347 | np.negative(p, out=x)
348 | np.log1p(x, out=x)
349 | x *= -k
350 | np.expm1(x, out=x)
351 | x /= k
352 | x *= sigma
353 | else:
354 | if np.abs(k) < np.finfo(float).eps:
355 | # x[ok] = - np.log1p(-p[ok])
356 | temp = p[ok]
357 | np.negative(temp, out=temp)
358 | np.log1p(temp, out=temp)
359 | np.negative(temp, out=temp)
360 | x[ok] = temp
361 | else:
362 | # x[ok] = np.expm1(-k * np.log1p(-p[ok])) / k
363 | temp = p[ok]
364 | np.negative(temp, out=temp)
365 | np.log1p(temp, out=temp)
366 | temp *= -k
367 | np.expm1(temp, out=temp)
368 | temp /= k
369 | x[ok] = temp
370 | x *= sigma
371 | x[p == 0] = 0
372 | if k >= 0:
373 | x[p == 1] = np.inf
374 | else:
375 | x[p == 1] = -sigma / k
376 | return x
377 |
378 |
379 | def sumlogs(x, axis=None, out=None):
380 | """Sum of vector where numbers are represented by their logarithms.
381 |
382 | Calculates ``np.log(np.sum(np.exp(x), axis=axis))`` in such a fashion that
383 | it works even when elements have large magnitude.
384 |
385 | """
386 | maxx = x.max(axis=axis, keepdims=True)
387 | xnorm = x - maxx
388 | np.exp(xnorm, out=xnorm)
389 | out = np.sum(xnorm, axis=axis, out=out)
390 | if isinstance(out, np.ndarray):
391 | np.log(out, out=out)
392 | else:
393 | out = np.log(out)
394 | out += np.squeeze(maxx)
395 | return out
396 |
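397 | 
398 | if __name__ == "__main__":
399 |     # Minimal usage sketch (an illustration, not part of the library):
400 |     # draw Generalised Pareto samples via gpinv, then check that
401 |     # gpdfitnew approximately recovers the true (k, sigma).
402 |     rng = np.random.RandomState(0)
403 |     x = gpinv(rng.uniform(size=10000), 0.5, 1.0)
404 |     k, sigma = gpdfitnew(x, sort=True)
405 |     print(k, sigma)  # should be close to 0.5 and 1.0
406 |     # sumlogs evaluates log(sum(exp(x))) stably; the naive expression
407 |     # overflows to inf at this magnitude.
408 |     print(sumlogs(np.array([1000.0, 1000.0])))  # 1000 + log(2) ~ 1000.693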
--------------------------------------------------------------------------------
/scripts/helper/stan_utility.py:
--------------------------------------------------------------------------------
1 | import pystan
2 | import pickle
3 | import numpy
4 |
5 | def check_div(fit):
6 | """Check transitions that ended with a divergence"""
7 | sampler_params = fit.get_sampler_params(inc_warmup=False)
8 | divergent = [x for y in sampler_params for x in y['divergent__']]
9 | n = sum(divergent)
10 | N = len(divergent)
11 | print('{} of {} iterations ended with a divergence ({}%)'.format(n, N,
12 | 100 * n / N))
13 | if n > 0:
14 | print(' Try running with larger adapt_delta to remove the divergences')
15 |
16 | def check_treedepth(fit, max_depth = 10):
17 | """Check transitions that ended prematurely due to maximum tree depth limit"""
18 | sampler_params = fit.get_sampler_params(inc_warmup=False)
19 | depths = [x for y in sampler_params for x in y['treedepth__']]
20 | n = sum(1 for x in depths if x == max_depth)
21 | N = len(depths)
22 | print(('{} of {} iterations saturated the maximum tree depth of {}'
23 | + ' ({}%)').format(n, N, max_depth, 100 * n / N))
24 | if n > 0:
25 | print(' Run again with max_depth set to a larger value to avoid saturation')
26 |
27 | def check_energy(fit):
28 | """Checks the energy Bayesian fraction of missing information (E-BFMI)"""
29 | sampler_params = fit.get_sampler_params(inc_warmup=False)
30 | no_warning = True
31 | for chain_num, s in enumerate(sampler_params):
32 | energies = s['energy__']
33 | numer = sum((energies[i] - energies[i - 1])**2 for i in range(1, len(energies))) / len(energies)
34 | denom = numpy.var(energies)
35 | if numer / denom < 0.2:
36 | print('Chain {}: E-BFMI = {}'.format(chain_num, numer / denom))
37 | no_warning = False
38 | if no_warning:
39 | print('E-BFMI indicated no pathological behavior')
40 | else:
41 | print(' E-BFMI below 0.2 indicates you may need to reparameterize your model')
42 |
43 | def check_n_eff(fit):
44 | """Checks the effective sample size per iteration"""
45 | fit_summary = fit.summary(probs=[0.5])
46 | n_effs = [x[4] for x in fit_summary['summary']]
47 | names = fit_summary['summary_rownames']
48 | n_iter = len(fit.extract()['lp__'])
49 |
50 | no_warning = True
51 | for n_eff, name in zip(n_effs, names):
52 | ratio = n_eff / n_iter
53 |         if ratio < 0.001:
54 |             print('n_eff / iter for parameter {} is {}!'.format(
55 |                 name, ratio))
56 |             no_warning = False
57 | if no_warning:
58 | print('n_eff / iter looks reasonable for all parameters')
59 | else:
60 | print(' n_eff / iter below 0.001 indicates that the effective sample size has likely been overestimated')
61 |
62 | def check_rhat(fit):
63 | """Checks the potential scale reduction factors"""
64 | from math import isnan
65 | from math import isinf
66 |
67 | fit_summary = fit.summary(probs=[0.5])
68 | rhats = [x[5] for x in fit_summary['summary']]
69 | names = fit_summary['summary_rownames']
70 |
71 | no_warning = True
72 | for rhat, name in zip(rhats, names):
73 | if (rhat > 1.1 or isnan(rhat) or isinf(rhat)):
74 | print('Rhat for parameter {} is {}!'.format(name, rhat))
75 | no_warning = False
76 | if no_warning:
77 | print('Rhat looks reasonable for all parameters')
78 | else:
79 | print(' Rhat above 1.1 indicates that the chains very likely have not mixed')
80 |
81 | def check_all_diagnostics(fit):
82 | """Checks all MCMC diagnostics"""
83 | check_n_eff(fit)
84 | check_rhat(fit)
85 | check_div(fit)
86 | check_treedepth(fit)
87 | check_energy(fit)
88 |
89 | def _by_chain(unpermuted_extraction):
90 | num_chains = len(unpermuted_extraction[0])
91 | result = [[] for _ in range(num_chains)]
92 | for c in range(num_chains):
93 | for i in range(len(unpermuted_extraction)):
94 | result[c].append(unpermuted_extraction[i][c])
95 | return numpy.array(result)
96 |
97 | def _shaped_ordered_params(fit):
98 | ef = fit.extract(permuted=False, inc_warmup=False) # flattened, unpermuted, by (iteration, chain)
99 | ef = _by_chain(ef)
100 | ef = ef.reshape(-1, len(ef[0][0]))
101 | ef = ef[:, 0:len(fit.flatnames)] # drop lp__
102 | shaped = {}
103 | idx = 0
104 | for dim, param_name in zip(fit.par_dims, fit.extract().keys()):
105 | length = int(numpy.prod(dim))
106 | shaped[param_name] = ef[:,idx:idx + length]
107 |         shaped[param_name] = shaped[param_name].reshape(*([-1] + dim))  # reshape returns a new array
108 | idx += length
109 | return shaped
110 |
111 | def partition_div(fit):
112 | """ Returns parameter arrays separated into divergent and non-divergent transitions"""
113 | sampler_params = fit.get_sampler_params(inc_warmup=False)
114 | div = numpy.concatenate([x['divergent__'] for x in sampler_params]).astype('int')
115 | params = _shaped_ordered_params(fit)
116 | nondiv_params = dict((key, params[key][div == 0]) for key in params)
117 | div_params = dict((key, params[key][div == 1]) for key in params)
118 | return nondiv_params, div_params
119 |
120 | def compile_model(filename, model_name=None, **kwargs):
121 | """This will automatically cache models - great if you're just running a
122 | script on the command line.
123 |
124 | See http://pystan.readthedocs.io/en/latest/avoiding_recompilation.html"""
125 | from hashlib import md5
126 |
127 | with open(filename) as f:
128 | model_code = f.read()
129 | code_hash = md5(model_code.encode('ascii')).hexdigest()
130 | if model_name is None:
131 | cache_fn = 'cached-model-{}.pkl'.format(code_hash)
132 | else:
133 | cache_fn = 'cached-{}-{}.pkl'.format(model_name, code_hash)
134 | try:
135 |         with open(cache_fn, 'rb') as f:
136 |             sm = pickle.load(f)
137 |     except Exception:  # no usable cache; compile and cache the model
138 |         sm = pystan.StanModel(model_code=model_code)
139 |         with open(cache_fn, 'wb') as f:
140 |             pickle.dump(sm, f)
141 |     else:
142 |         print("Using cached StanModel")
143 |     return sm
144 | 
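145 | 
146 | # Usage sketch (illustrative; the .stan path and `data` dict are placeholders):
147 | #
148 | #     sm = compile_model('../models/linearRegression.stan')
149 | #     fit = sm.sampling(data=data)
150 | #     check_all_diagnostics(fit)  # n_eff, Rhat, divergences, treedepth, E-BFMI
151 | #     nondiv, div = partition_div(fit)  # split draws by divergence status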
--------------------------------------------------------------------------------
/scripts/linearRegression.py:
--------------------------------------------------------------------------------
1 | import pickle
2 |
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 | import pystan
6 |
7 | from helper import psis, stan_utility
8 |
9 | model_file = "../models/linearRegression.stan"
10 | # Data from http://www.openbugs.net/Examples/Ratsdata.html
11 | data = {'N': 5,
12 | 'x': [8.0, 15.0, 22.0, 29.0, 36.0],
13 | 'y': [160, 207, 248, 288, 324]
14 | }
15 |
16 | sm = pystan.StanModel(file=model_file)
17 | fit = sm.sampling(data=data, control=dict(adapt_delta=0.95))
18 | print(fit)
19 | fit.plot(['alpha', 'beta', 'sigma'])
20 | plt.show()
21 |
22 | # model diagnostics
23 | # https://github.com/betanalpha/jupyter_case_studies/blob/master/pystan_workflow/stan_utility.py
24 | stan_utility.check_all_diagnostics(fit)
25 |
26 | # visualize model
27 | fit_dict = fit.extract()
28 | m_alpha = np.mean(fit_dict['alpha'])
29 | m_beta = np.mean(fit_dict['beta'])
30 | x = np.linspace(min(data['x']), max(data['x']))
31 | y = m_alpha + m_beta * x
32 | plt.scatter(data['x'], data['y'], c="#1f77b4", label="Observed Data")
33 | plt.plot(x, y, c='#7f7f7f', label="Our Model")
34 | plt.title("Rat weights")
35 | plt.xlabel("Days")
36 | plt.ylabel("Weigths in grams")
37 | plt.legend()
38 | plt.show()
39 |
40 | # Log-likelihood
41 | log_lik = fit.extract()['log_lik']
42 | print(psis.psisloo(log_lik)[0])
43 |
44 | # Save model for later use
45 | with open('../models/saved/linearRegression.pkl', 'wb') as f:
46 | pickle.dump(sm, f)
47 |
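48 | # To reuse the saved model later without recompiling (illustrative sketch):
49 | #
50 | #     with open('../models/saved/linearRegression.pkl', 'rb') as f:
51 | #         sm = pickle.load(f)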
--------------------------------------------------------------------------------
/scripts/logisticRegression.py:
--------------------------------------------------------------------------------
1 | import pickle
2 |
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 | import pandas as pd
6 | import pystan
7 | import seaborn as sns
8 | from sklearn.metrics import confusion_matrix
9 | from sklearn.model_selection import train_test_split
10 |
11 | from helper import stan_utility
12 |
13 | model_file = "../models/logisticRegression.stan"
14 | # https://stats.idre.ucla.edu/stata/dae/logistic-regression/
15 | data_file = "../data/binary.dta"
16 |
17 | data = pd.read_stata(data_file)
18 |
19 | # Data preprocessing
20 | # Convert the categorical rank variable to dummy variables
21 | data = pd.get_dummies(data=data, columns=['rank'])
22 | del data['rank_1.0']  # drop one level to avoid the dummy variable trap
23 |
24 | # Rescale gpa and gre variables
25 | data['gre'] = (data['gre'] - np.mean(data['gre'])) / np.std(data['gre'])
26 | data['gpa'] = (data['gpa'] - np.mean(data['gpa'])) / np.std(data['gpa'])
27 |
28 | # Split data as train/test
29 | data_train, data_test = train_test_split(data, test_size=0.2)
30 |
31 | model_data = {'N_train': 320,
32 | 'N_test': 80,
33 | 'D': 5,
34 |               'x_train': data_train[['gre', 'gpa', 'rank_2.0',
35 |                                      'rank_3.0', 'rank_4.0']].astype(np.float64),
36 |               'x_test': data_test[['gre', 'gpa', 'rank_2.0',
37 |                                    'rank_3.0', 'rank_4.0']].astype(np.float64),
38 | 'y_train': data_train['admit'].astype(np.int32)}
39 |
40 | sm = pystan.StanModel(file=model_file)
41 | fit = sm.sampling(data=model_data, control=dict(adapt_delta=0.95))
42 | print(fit)
43 | fit.plot(['alpha', 'beta'])
44 | plt.show()
45 |
46 | sns.pairplot(pd.DataFrame(fit.extract()['beta']))
47 | plt.show()
48 |
49 | # model diagnostics
50 | # https://github.com/betanalpha/jupyter_case_studies/blob/master/pystan_workflow/stan_utility.py
51 | stan_utility.check_all_diagnostics(fit)
52 |
53 | # Confusion matrix
54 | y_pred = fit.extract()['y_pred']
55 | y_pred = (np.mean(y_pred, axis=0) >= 0.5).astype(int)  # majority vote across posterior draws
56 | print(confusion_matrix(data_test['admit'], y_pred))
57 |
58 | # Save model for later use
59 | with open('../models/saved/logisticRegression.pkl', 'wb') as f:
60 | pickle.dump(sm, f)
61 |
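62 | # Illustrative follow-up (hypothetical, not part of the original script):
63 | # overall predictive accuracy from the confusion matrix.
64 | #
65 | #     cm = confusion_matrix(data_test['admit'], y_pred)
66 | #     print(cm.trace() / cm.sum())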
--------------------------------------------------------------------------------
/scripts/multinomialLogisticRegression.py:
--------------------------------------------------------------------------------
1 | import pickle
2 |
3 | import numpy as np
4 | import pandas as pd
5 | import pystan
6 |
7 | from helper import stan_utility
8 |
9 | model_file = "../models/multinomialLogisticRegression.stan"
10 | # https://stats.idre.ucla.edu/stata/dae/multinomiallogistic-regression/
11 | data_file = "../data/hsbdemo.dta"
12 |
13 | data = pd.read_stata(data_file)
14 |
15 | data = pd.get_dummies(data=data, columns=['ses', 'schtyp', 'honors'])
16 |
17 | map_prog = {'general': 1,
18 | 'academic': 2,
19 | 'vocation': 3}
20 | data['prog'] = data['prog'].map(map_prog)
21 |
22 | data['read'] = (data['read'] - np.mean(data['read'])) / np.std(data['read'])
23 | data['write'] = (data['write'] - np.mean(data['write'])) / \
24 |     np.std(data['write'])
25 |
26 | data = {'N': 200,
27 | 'K': 3,
28 | 'D': 6,
29 | 'x': data[['ses_low', 'ses_middle', 'schtyp_public',
30 | 'honors_enrolled', 'read', 'write']],
31 | 'y': data['prog']
32 | }
33 |
34 | sm = pystan.StanModel(file=model_file)
35 | fit = sm.sampling(data=data, control=dict(adapt_delta=0.95))
36 | print(fit)
37 |
38 | # model diagnostics
39 | # https://github.com/betanalpha/jupyter_case_studies/blob/master/pystan_workflow/stan_utility.py
40 | stan_utility.check_all_diagnostics(fit)
41 |
42 | # Save model for later use
43 | with open('../models/saved/multinomialLogisticRegression.pkl', 'wb') as f:
44 | pickle.dump(sm, f)
45 |
--------------------------------------------------------------------------------
/scripts/multipleLinearRegression.py:
--------------------------------------------------------------------------------
1 | import pickle
2 |
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 | import pandas as pd
6 | import pystan
7 | import seaborn as sns
8 |
9 | from helper import psis, stan_utility
10 |
11 | model_file = "../models/multipleLinearRegression.stan"
12 | # http://lib.stat.cmu.edu/DASL/Datafiles/Cereals.html
13 | data_file = "../data/cereals.txt"
14 | data = pd.read_table(data_file)
15 |
16 | data = data[['fat', 'weight', 'cups', 'rating']]
17 | data = {'N': 77,
18 | 'fat': data['fat'],
19 | 'weight': data['weight'],
20 | 'cups': data['cups'],
21 | 'rating': data['rating']}
22 |
23 | sm = pystan.StanModel(file=model_file)
24 | fit = sm.sampling(data=data, control=dict(adapt_delta=0.95))
25 | print(fit)
26 | fit.plot(['b_fat', 'b_weight', 'b_cups'])
27 | plt.show()
28 |
29 | # model diagnostics
30 | # https://github.com/betanalpha/jupyter_case_studies/blob/master/pystan_workflow/stan_utility.py
31 | stan_utility.check_all_diagnostics(fit)
32 |
33 | # visualize model
34 | # we'll plot a histogram of the absolute errors
35 | rating_pred = fit.extract()['rating_pred'].mean(axis=0)
36 | rating = data['rating'].values
37 | abs_err = np.abs(rating - rating_pred)
38 | sns.distplot(abs_err)
39 | plt.title("Histogram of absolute errors")
40 | plt.xlabel("Errors")
41 | plt.ylabel("Frequency")
42 | plt.show()
43 |
44 | # Log-likelihood
45 | log_lik = fit.extract()['log_lik']
46 | print(psis.psisloo(log_lik)[0])
47 |
48 | # Save model for later use
49 | with open('../models/saved/multipleLinearRegression.pkl', 'wb') as f:
50 | pickle.dump(sm, f)
51 |
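52 | # Illustrative follow-up: psisloo also returns the pointwise values and the
53 | # Pareto k diagnostics (see helper/psis.py), which flag unreliable estimates.
54 | #
55 | #     loo, loos, ks = psis.psisloo(log_lik)
56 | #     print(np.sum(ks > 0.7))  # observations with unreliable PSIS estimates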
--------------------------------------------------------------------------------
/scripts/onewayANOVA.py:
--------------------------------------------------------------------------------
1 | import pickle
2 |
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 | import pandas as pd
6 | import pystan
7 |
8 | from helper import stan_utility
9 |
10 | model_file = "../models/onewayANOVA.stan"
11 | # http://staff.bath.ac.uk/pssiw/stats2/page16/page16.html
12 | data_file = "../data/iqdata.csv"
13 |
14 | data = pd.read_csv(data_file)
15 |
16 | data = pd.get_dummies(data, columns=['group'])
17 | del data['group_1']
18 |
19 | data = {'N': 43,
20 | 'x1': data['group_2'],
21 | 'x2': data['group_3'],
22 | 'y': data['iq']}
23 |
24 | sm = pystan.StanModel(file=model_file)
25 | fit = sm.sampling(data=data, control=dict(adapt_delta=0.95))
26 | print(fit)
27 | fit.plot()
28 | plt.show()
29 |
30 | # model diagnostics
31 | # https://github.com/betanalpha/jupyter_case_studies/blob/master/pystan_workflow/stan_utility.py
32 | stan_utility.check_all_diagnostics(fit)
33 |
34 | # extract coefficients
35 | fit_dict = fit.extract()
36 | alpha = fit_dict['alpha']
37 | beta_x1 = fit_dict['beta_x1']
38 | beta_x2 = fit_dict['beta_x2']
39 | # calculate group means from coefficients
40 | mean_1 = alpha.mean(axis=0)
41 | mean_2 = alpha.mean(axis=0) + beta_x1.mean(axis=0)
42 | mean_3 = alpha.mean(axis=0) + beta_x2.mean(axis=0)
43 | print(
44 | f'Mean of group 1: {mean_1},\nMean of group 2: {mean_2},\nMean of group 3: {mean_3}\n')
45 | # calculate the posterior distribution of the difference between the means of group 1 and 3
46 | diffs13 = alpha - (alpha + beta_x2)
47 | # 95% credible intervals
48 | diffs13_ci = np.percentile(diffs13, [2.5, 97.5], axis=0)
49 | print(
50 | f"Estimated difference between the means of group 1 and 3: {diffs13.mean(axis=0)}\n")
51 | print(f"\t95% credible interval: ({diffs13_ci[0]}, {diffs13_ci[1]})\n")
52 | # How strongly do the data support the hypothesis that the mean of group 3 is larger than the mean of group 1?
53 | print(f"{np.sum(alpha + beta_x2 > alpha) / np.size(alpha)}")
54 | # Because probabilities are never exactly 1, we write >0.999
55 |
56 | # Save model for later use
57 | with open('../models/saved/onewayANOVA.pkl', 'wb') as f:
58 | pickle.dump(sm, f)
59 |
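60 | # Analogous illustrative check (hypothetical addition) for groups 2 vs 3,
61 | # using the draws already extracted above:
62 | #
63 | #     print(np.mean(beta_x2 > beta_x1))  # P(mean of group 3 > mean of group 2)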
--------------------------------------------------------------------------------
/scripts/orderedLogisticRegression.py:
--------------------------------------------------------------------------------
1 | import pickle
2 |
3 | import matplotlib.pyplot as plt
4 | import pandas as pd
5 | import pystan
6 |
7 | from helper import stan_utility
8 |
9 | model_file = "../models/orderedLogisticRegression.stan"
10 | # https://stats.idre.ucla.edu/stata/dae/ordered-logistic-regression/
11 | data_file = "../data/ologit.dta"
12 |
13 | data = pd.read_stata(data_file)
14 |
15 | x = data[['pared', 'public', 'gpa']]
16 | y = data['apply'].map({'unlikely': 1, 'somewhat likely': 2, 'very likely': 3})
17 |
18 | data = {'N': 400,
19 | 'D': 3,
20 | 'K': 3,
21 | 'x': x,
22 | 'y': y}
23 |
24 | sm = pystan.StanModel(file=model_file)
25 | fit = sm.sampling(data=data, control=dict(adapt_delta=0.95))
26 | print(fit)
27 |
28 | fit.plot()
29 | plt.show()
30 |
31 | # model diagnostics
32 | # https://github.com/betanalpha/jupyter_case_studies/blob/master/pystan_workflow/stan_utility.py
33 | stan_utility.check_all_diagnostics(fit)
34 |
35 | # Save model for later use
36 | with open('../models/saved/orderedLogisticRegression.pkl', 'wb') as f:
37 | pickle.dump(sm, f)
38 |
--------------------------------------------------------------------------------
/scripts/robustRegression.py:
--------------------------------------------------------------------------------
1 | import pickle
2 |
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 | import pandas as pd
6 | import pystan
7 | import seaborn as sns
8 |
9 | from helper import stan_utility
10 |
11 | model_file = "../models/robustRegression.stan"
12 | # http://vincentarelbundock.github.io/Rdatasets/datasets.html
13 | data_file = "../data/aircraft.csv"
14 |
15 | data = pd.read_csv(data_file)
16 |
17 | data = {'N': 23,
18 | 'X1': data['X1'],
19 | 'X2': data['X2'],
20 | 'X3': (data['X3'] - np.mean(data['X3'])) / np.std(data['X3']),
21 | 'X4': (data['X4'] - np.mean(data['X4'])) / np.std(data['X4']),
22 | 'Y': data['Y'],
23 | }
24 |
25 | sm = pystan.StanModel(file=model_file)
26 | fit = sm.sampling(data=data, control=dict(adapt_delta=0.95))
27 | print(fit)
28 | fit.plot(['b_X1', 'b_X2', 'b_X3', 'b_X4'])
29 | plt.show()
30 |
31 | # model diagnostics
32 | # https://github.com/betanalpha/jupyter_case_studies/blob/master/pystan_workflow/stan_utility.py
33 | stan_utility.check_all_diagnostics(fit)
34 |
35 | # visualize model
36 | # we'll plot a histogram of the absolute errors
37 | Y_pred = fit.extract()['Y_pred'].mean(axis=0)
38 | Y = data['Y'].values
39 | abs_err = np.abs(Y - Y_pred)
40 | sns.distplot(abs_err)
41 | plt.title("Histogram of absolute errors")
42 | plt.xlabel("Errors")
43 | plt.ylabel("Frequency")
44 | plt.show()
45 |
46 | # Save model for later use
47 | with open('../models/saved/robustRegression.pkl', 'wb') as f:
48 | pickle.dump(sm, f)
49 |
--------------------------------------------------------------------------------
/scripts/twowayANOVA.py:
--------------------------------------------------------------------------------
1 | import pickle
2 |
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 | import pandas as pd
6 | import pystan
7 |
8 | from helper import stan_utility
9 |
10 | model_file = "../models/twowayANOVA.stan"
11 | # http://staff.bath.ac.uk/pssiw/stats2/page16/page16.html
12 | data_file = "../data/drugtrial.csv"
13 |
14 | data = pd.read_csv(data_file, index_col=0)
15 |
16 | data = pd.get_dummies(data, columns=['gender', 'dose'])
17 | data.drop(columns=['gender_1', 'dose_1'], inplace=True)
18 |
19 | data = {'N': 48,
20 | 'x1': data['gender_2'],
21 | 'x2': data['dose_2'],
22 | 'y': data['score']}
23 |
24 | sm = pystan.StanModel(file=model_file)
25 | fit = sm.sampling(data=data, control=dict(adapt_delta=0.95))
26 | print(fit)
27 | fit.plot()
28 | plt.show()
29 |
30 | # model diagnostics
31 | # https://github.com/betanalpha/jupyter_case_studies/blob/master/pystan_workflow/stan_utility.py
32 | stan_utility.check_all_diagnostics(fit)
33 |
34 | # extract coefficients
35 | fit_dict = fit.extract()
36 | alpha = fit_dict['alpha']
37 | beta_x1 = fit_dict['beta_x1']
38 | beta_x2 = fit_dict['beta_x2']
39 | beta_x3 = fit_dict['beta_x3']
40 | # calculate group (cell) means from coefficients; beta_x3 is the gender x dose interaction
41 | mean_11 = alpha.mean(axis=0)
42 | mean_12 = alpha.mean(axis=0) + beta_x1.mean(axis=0)
43 | mean_21 = alpha.mean(axis=0) + beta_x2.mean(axis=0)
44 | mean_22 = alpha.mean(axis=0) + beta_x1.mean(axis=0) + \
45 | beta_x2.mean(axis=0) + beta_x3.mean(axis=0)
46 | print(
47 | f'Mean of gender=1, dose=1: {mean_11},\n'
48 | f'Mean of gender=1, dose=2: {mean_12},\n'
49 | f'Mean of gender=2, dose=1: {mean_21},\n'
50 | f'Mean of gender=2, dose=2: {mean_22}\n')
51 |
52 | # Save model for later use
53 | with open('../models/saved/twowayANOVA.pkl', 'wb') as f:
54 | pickle.dump(sm, f)
55 |
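56 | # Illustrative follow-up (hypothetical addition): posterior probability that
57 | # the gender x dose interaction is positive.
58 | #
59 | #     print(np.mean(beta_x3 > 0))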
--------------------------------------------------------------------------------