├── .gitattributes ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── bayes.png ├── case_studies ├── Case Study 1 - Diabetes dataset.Rmd ├── Case_Study_1_-_Diabetes_dataset.md ├── data │ └── diabetes.sav └── figures │ ├── cs1-unnamed-chunk-12-1.pdf │ ├── cs1-unnamed-chunk-12-1.png │ ├── cs1-unnamed-chunk-15-1.png │ ├── cs1-unnamed-chunk-18-1.png │ ├── cs1-unnamed-chunk-19-1.png │ ├── cs1-unnamed-chunk-22-1.png │ ├── cs1-unnamed-chunk-23-1.png │ ├── cs1-unnamed-chunk-24-1.png │ ├── cs1-unnamed-chunk-25-1.png │ ├── cs1-unnamed-chunk-26-1.png │ ├── cs1-unnamed-chunk-29-1.png │ ├── cs1-unnamed-chunk-33-1.png │ ├── cs1-unnamed-chunk-9-1.pdf │ └── cs1-unnamed-chunk-9-1.png ├── ci └── scripts │ └── runAllModels.sh ├── data ├── aircraft.csv ├── awards.csv ├── bfi.csv ├── binary.dta ├── cereals.txt ├── child_data.csv ├── drugtrial.csv ├── hsbdemo.dta ├── iqdata.csv ├── ologit.dta ├── scents.sav └── temprate.sav ├── models ├── linearRegression.stan ├── logisticRegression.stan ├── multinomialLogisticRegression.stan ├── multipleLinearRegression.stan ├── onewayANOVA.stan ├── orderedLogisticRegression.stan ├── robustRegression.stan └── twowayANOVA.stan ├── notebooks ├── Bayes Factor.Rmd ├── Bayes_Factor.md ├── Correlation Analysis.Rmd ├── Correlation_Analysis.md ├── Factor Analysis.Rmd ├── Factor_Analysis.md ├── Multiple Linear Regression with interaction terms.Rmd ├── Multiple_Linear_Regression_with_interaction_terms.md ├── Poisson Regression.Rmd ├── Poisson_Regression.md └── figures │ ├── corr-unnamed-chunk-5-1.png │ ├── factor-unnamed-chunk-5-1.png │ ├── factor-unnamed-chunk-6-1.png │ ├── multipleLin-unnamed-chunk-4-1.png │ ├── multipleLin-unnamed-chunk-5-1.png │ ├── poisson-unnamed-chunk-10-1.png │ ├── poisson-unnamed-chunk-5-1.png │ └── poisson-unnamed-chunk-9-1.png ├── requirements.txt └── scripts ├── Multiple linear regression with interaction terms.py ├── Poisson Regression.py ├── helper ├── psis.py └── stan_utility.py ├── linearRegression.py ├── logisticRegression.py ├── multinomialLogisticRegression.py ├── multipleLinearRegression.py ├── onewayANOVA.py ├── orderedLogisticRegression.py ├── robustRegression.py └── twowayANOVA.py /.gitattributes: -------------------------------------------------------------------------------- 1 | ci/* linguist-vendored 2 | data/* linguist-vendored 3 | *.ipynb linguist-language=R 4 | *.py linguist-language=R 5 | *.rmd linguist-language=R 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # vscode 107 | .vscode/ 108 | notebooks/.RData 109 | notebooks/.Rhistory 110 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | os: 3 | - "linux" 4 | python: 5 | - "3.6" 6 | install: 7 | - pip install -r requirements.txt 8 | before_script: 9 | - "export MPLBACKEND=Agg" 10 | - "export DISPLAY=:99.0" 11 | - "sh -e /etc/init.d/xvfb start" 12 | - sleep 3 13 | script: 14 | - sh ci/scripts/runAllModels.sh 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 
33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. 
Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. 
This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. 
This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. 
Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 
336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 
397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. 
If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 
512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. 
If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 
633 | 634 | <one line to give the program's name and a brief idea of what it does.> 635 | Copyright (C) <year> <name of author> 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see <https://www.gnu.org/licenses/>. 649 | 650 | Also add information on how to contact you by electronic and paper mail. 651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | <program> Copyright (C) <year> <name of author> 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | <https://www.gnu.org/licenses/>. 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | <https://www.gnu.org/licenses/why-not-lgpl.html>. 675 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Statistical Modeling Examples 2 | 3 |

All we need is just ![Bayes' theorem](bayes.png)

4 | 5 | 6 | [![GitHub license](https://img.shields.io/github/license/mrtkp9993/Statistical-Modelling-Examples.svg)](https://github.com/mrtkp9993/Statistical-Modelling-Examples/blob/master/LICENSE) 7 | [![DOI](https://zenodo.org/badge/143592387.svg)](https://zenodo.org/badge/latestdoi/143592387) 8 | 9 | --- 10 | 11 | ## Case Studies 12 | 13 | * Diabetes dataset: [Dataset info](http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/diabetes.html), [Markdown](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/case_studies/Case_Study_1_-_Diabetes_dataset.md). 14 | 15 | ## Examples 16 | 17 | The PyStan examples include these methods: 18 | 19 | * Linear Regression [Model](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/models/linearRegression.stan), [Script](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/scripts/linearRegression.py). 20 | * Multiple Linear Regression [Model](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/models/multipleLinearRegression.stan), [Script](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/scripts/multipleLinearRegression.py). 21 | * Robust Regression [Model](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/models/robustRegression.stan), [Script](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/scripts/robustRegression.py). 22 | * Logistic Regression [Model](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/models/logisticRegression.stan), [Script](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/scripts/logisticRegression.py). 23 | * Multinomial Logistic Regression [Model](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/models/multinomialLogisticRegression.stan), [Script](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/scripts/multinomialLogisticRegression.py). 24 | * Ordered Logistic Regression [Model](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/models/orderedLogisticRegression.stan), [Script](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/scripts/orderedLogisticRegression.py). 25 | * One-way ANOVA [Model](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/models/onewayANOVA.stan), [Script](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/scripts/onewayANOVA.py). 26 | * Two-way ANOVA [Model](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/models/twowayANOVA.stan), [Script](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/scripts/twowayANOVA.py). 27 | 28 | The R examples include these methods: 29 | 30 | * Factor analysis [Markdown](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/notebooks/Factor_Analysis.md). 31 | * Correlation analysis [Markdown](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/notebooks/Correlation_Analysis.md). 32 | * Multiple Linear Regression with interaction terms [Markdown](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/notebooks/Multiple_Linear_Regression_with_interaction_terms.md). 33 | * Poisson Regression [Markdown](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/notebooks/Poisson_Regression.md). 34 | * Bayes Factors [Markdown](https://github.com/mrtkp9993/Statistical-Modeling-Examples/blob/master/notebooks/Bayes_Factor.md).
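Each example follows the same basic workflow: fit one or more candidate models, check the posterior summaries and diagnostic plots, and compare the candidates with an information criterion. A minimal sketch of that workflow in R (with made-up data, and assuming the `rstanarm` and `loo` packages are installed):

```r
library(rstanarm)
library(loo)

# Hypothetical data, for illustration only
df <- data.frame(y = rnorm(100), x1 = rnorm(100), x2 = rnorm(100))

# Fit two candidate Bayesian regressions (MCMC via Stan under the hood)
fit1 <- stan_glm(y ~ x1, data = df)
fit2 <- stan_glm(y ~ x1 + x2, data = df)

# Compare them by WAIC (lower is better), as the diabetes case study does;
# row 3, column 1 of the estimates matrix is the WAIC point estimate
waic(fit1)$estimates[3, 1]
waic(fit2)$estimates[3, 1]
```

The PyStan scripts pair each Stan program under `models/` with a driver script of the same name under `scripts/`.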
35 | 36 | ## Useful Resources 37 | 38 | ### General 39 | 40 | * Glossary of statistical terms [Link](https://www.stat.berkeley.edu/~stark/SticiGui/Text/gloss.htm). 41 | * Statistical tests with Python [Link](https://machinelearningmastery.com/statistical-hypothesis-tests-in-python-cheat-sheet/). 42 | * Michael Betancourt: “A Conceptual Introduction to Hamiltonian Monte Carlo”, 2017; arXiv:1701.02434. 43 | * Hamiltonian Monte Carlo explained [Link](http://arogozhnikov.github.io/2016/12/19/markov_chain_monte_carlo.html). 44 | 45 | ### Stan 46 | 47 | * Stan Reference Manual [Link](https://github.com/stan-dev/stan/releases/download/v2.17.0/stan-reference-2.17.0.pdf). 48 | * PyStan Getting Started [Link](https://pystan.readthedocs.io/en/latest/getting_started.html). 49 | * Stan example models [Link](https://github.com/stan-dev/example-models/tree/master/misc). 50 | * Prior choices [Link](https://github.com/stan-dev/stan/wiki/Prior-Choice-Recommendations). 51 | 52 | ### R 53 | 54 | * R-bloggers [Link](https://www.r-bloggers.com/). 55 | * Quick-R [Link](https://www.statmethods.net/index.html). 56 | 57 | 58 | ## Datasets 59 | 60 | * R datasets [Link](https://vincentarelbundock.github.io/Rdatasets/datasets.html). 61 | * Datasets for teaching [Link](https://www.sheffield.ac.uk/mash/data). 62 | 63 | ## Books 64 | 65 | * Korner-Nievergelt, F., Korner-Nievergelt, P., Roth, T., Almasi, B., Felten, S. V., & Guélat, J. (2016). Bayesian data analysis in ecology using linear models with R, BUGS and Stan. Amsterdam: Elsevier/Academic Press. 66 | -------------------------------------------------------------------------------- /bayes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/bayes.png -------------------------------------------------------------------------------- /case_studies/Case Study 1 - Diabetes dataset.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Case Study 1 - Diabetes dataset" 3 | author: "Murat Koptur" 4 | date: "`r format(Sys.time(), '%d %B %Y')`" 5 | output: 6 | rmarkdown::github_document: default 7 | --- 8 | 9 | ```{r echo=FALSE} 10 | knitr::opts_chunk$set(fig.path='figures/cs1-') 11 | ``` 12 | 13 | ```{r} 14 | library(dplyr) 15 | library(fastDummies) 16 | library(GGally) 17 | library(lavaan) 18 | library(loo) 19 | library(magrittr) 20 | library(mice) 21 | library(psych) 22 | library(rstanarm) 23 | library(semPlot) 24 | ``` 25 | 26 | ```{r} 27 | # http://biostat.mc.vanderbilt.edu/wiki/Main/DataSets 28 | load("./data/diabetes.sav") 29 | ``` 30 | 31 | ```{r} 32 | str(diabetes) 33 | ``` 34 | 35 | ```{r} 36 | # I'll not use location or id in this analysis 37 | diabetes <- select(diabetes, -location, -id) 38 | ``` 39 | 40 | ```{r} 41 | # Let's look at a summary of the data 42 | summary(diabetes) 43 | ``` 44 | 45 | ```{r} 46 | # Investigate NA counts 47 | colSums(is.na(diabetes)) 48 | ``` 49 | 50 | ```{r} 51 | # bp.2s and bp.2d variables have too many missing values 52 | 53 | # Glycosolated hemoglobin (glyhb) column has 13 NAs 54 | # I'll drop these observations 55 | diabetes <- filter(diabetes, !is.na(glyhb)) 56 | ``` 57 | 58 | ```{r} 59 | # inspect the missing-data pattern before imputing 60 | md.pattern(diabetes) 61 | ``` 62 | 63 | ```{r results='hide'} 64 | diabetes_imp <- 65 | mice( 66 | data = diabetes, 67 | m = 5, 68 | maxit = 50, 69 | method = "pmm" 70 | ) 71 | ``` 72 | 73 | ```{r} 74 | # Take the first imputed dataset (we have 5
imputed datasets, m=5) 75 | diabetes_completed <- complete(diabetes_imp, 1) 76 | # Investigate NA counts again 77 | colSums(is.na(diabetes_completed)) 78 | ``` 79 | 80 | ```{r} 81 | # correlation analysis 82 | ggcorr(diabetes_completed, label = TRUE, label_alpha = .7) 83 | ``` 84 | 85 | ```{r} 86 | corr_table <- 87 | cor(diabetes_completed[, sapply(diabetes_completed, is.numeric)]) 88 | subset(as.data.frame(as.table(corr_table)), abs(Freq) > 0.5) 89 | ``` 90 | 91 | ```{r} 92 | # since bp.2d and bp.2s seem highly correlated with bp.1d and bp.1s and 93 | # they have a lot of missing values, I decided to discard them from the analysis 94 | 95 | # also, I'll create two new variables, 96 | # BMI (body mass index) and waist-to-hip ratio 97 | 98 | diabetes_completed$bmi <- 99 | (diabetes_completed$weight / (diabetes_completed$height ** 2) * 703) 100 | diabetes_completed$waist_to_hip_rat <- 101 | diabetes_completed$waist / diabetes_completed$hip 102 | 103 | # take a subset of relatively uncorrelated variables 104 | diabetes_completed_subset <- select( 105 | diabetes_completed, 106 | chol, 107 | ratio, 108 | glyhb, 109 | age, 110 | gender, 111 | bmi, 112 | waist_to_hip_rat, 113 | frame, 114 | bp.1s, 115 | bp.1d, 116 | time.ppn 117 | ) 118 | head(diabetes_completed_subset) 119 | ``` 120 | 121 | ```{r} 122 | # pairs plot 123 | ggpairs(diabetes_completed_subset) 124 | ``` 125 | 126 | ```{r} 127 | # standardize all numeric variables 128 | diabetes_completed_subset %<>% 129 | mutate_at( 130 | funs(scale), 131 | .vars = c( 132 | "chol", 133 | "ratio", 134 | "glyhb", 135 | "age", 136 | "bmi", 137 | "waist_to_hip_rat", 138 | "bp.1s", 139 | "bp.1d", 140 | "time.ppn" 141 | ) 142 | ) 143 | ``` 144 | 145 | ```{r} 146 | # Create dummy variables for gender and frame 147 | library(fastDummies) 148 | diabetes_completed_subset <- 149 | dummy_cols(diabetes_completed_subset, remove_first_dummy = TRUE) 150 | diabetes_completed_subset <- 151 | select(diabetes_completed_subset, -gender, -frame) 152 | head(diabetes_completed_subset) 153 | ``` 154 | 155 | ```{r} 156 | # Exploratory factor analysis 157 | fa.parallel(select(diabetes_completed_subset, -glyhb)) 158 | ``` 159 | 160 | ```{r} 161 | diabetes_completed_subset_fi <- 162 | fa( 163 | select(diabetes_completed_subset, -glyhb), 164 | nfactors = 6, 165 | fm = "pa", 166 | max.iter = 200 167 | ) 168 | fa.diagram(diabetes_completed_subset_fi) 169 | ``` 170 | 171 | ```{r} 172 | fl <- round(unclass(diabetes_completed_subset_fi$loadings), 2) 173 | fl 174 | ``` 175 | 176 | ```{r} 177 | # Let's start building models 178 | model1 <- stan_glm('glyhb ~ .', data = diabetes_completed_subset) 179 | model1 180 | summary(model1) 181 | ``` 182 | 183 | ```{r} 184 | par(mfrow = c(2, 2), mar = c(3, 5, 3, 3)) 185 | plot(model1) 186 | ``` 187 | 188 | ```{r} 189 | model2 <- 190 | stan_glm('glyhb ~ ratio + age', data = diabetes_completed_subset) 191 | model2 192 | summary(model2) 193 | 194 | par(mfrow = c(2, 2), mar = c(3, 5, 3, 3)) 195 | plot(model2) 196 | ``` 197 | 198 | ```{r} 199 | model3 <- 200 | stan_glm('glyhb ~ bmi + waist_to_hip_rat', data = diabetes_completed_subset) 201 | model3 202 | summary(model3) 203 | 204 | par(mfrow = c(2, 2), mar = c(3, 5, 3, 3)) 205 | plot(model3) 206 | ``` 207 | 208 | ```{r} 209 | model4 <- 210 | stan_glm('glyhb ~ ratio + age + bmi + waist_to_hip_rat', data = diabetes_completed_subset) 211 | model4 212 | summary(model4) 213 | 214 | par(mfrow = c(2, 2), mar = c(3, 5, 3, 3)) 215 | plot(model4) 216 | ``` 217 | 218 | ```{r} 219 | model5 <- 220 | stan_glm('glyhb ~ ratio + age + bmi',
data = diabetes_completed_subset) 221 | model5 222 | summary(model5) 223 | 224 | par(mfrow = c(2, 2), mar = c(3, 5, 3, 3)) 225 | plot(model5) 226 | ``` 227 | 228 | ```{r} 229 | ic <- data.frame( 230 | Model = c("model1", "model2", "model3", "model4", "model5"), 231 | WAIC = c(waic(model1)$estimates[3,1], waic(model2)$estimates[3,1], waic(model3)$estimates[3,1], waic(model4)$estimates[3,1], waic(model5)$estimates[3,1]), 232 | stringsAsFactors = FALSE 233 | ) 234 | ic 235 | ``` 236 | 237 | ```{r} 238 | # Let's build a SEM model 239 | library(lavaan) 240 | semModel1 <- ' 241 | pa1 =~ age 242 | pa2 =~ bp.1d + bp.1s 243 | pa3 =~ bmi + frame_large + frame_small 244 | pa4 =~ gender_male + waist_to_hip_rat 245 | pa5 =~ ratio + chol 246 | pa6 =~ time.ppn 247 | 248 | glyhb ~ pa1 + pa2 + pa3 + pa4 + pa5 + pa6 249 | ' 250 | fit1 <- sem(semModel1, 251 | data = diabetes_completed_subset) 252 | fit1 253 | ``` 254 | 255 | ```{r} 256 | semPaths(fit1) 257 | ``` 258 | 259 | ```{r} 260 | summary(fit1, standardized = TRUE, fit.measures = TRUE) 261 | ``` 262 | 263 | ```{r} 264 | parameterEstimates(fit1) 265 | ``` 266 | 267 | ```{r} 268 | # Second SEM model 269 | semModel2 <- ' 270 | pa1 =~ age 271 | pa5 =~ ratio + chol 272 | 273 | glyhb ~ pa1 + pa5 274 | ' 275 | fit2 <- sem(semModel2, 276 | data = diabetes_completed_subset) 277 | fit2 278 | ``` 279 | 280 | ```{r} 281 | semPaths(fit2) 282 | ``` 283 | 284 | ```{r} 285 | summary(fit2, standardized = TRUE, fit.measures = TRUE) 286 | ``` 287 | 288 | ```{r} 289 | parameterEstimates(fit2) 290 | ``` -------------------------------------------------------------------------------- /case_studies/Case_Study_1_-_Diabetes_dataset.md: -------------------------------------------------------------------------------- 1 | Case Study 1 - Diabetes dataset 2 | ================ 3 | Murat Koptur 4 | 26 August 2018 5 | 6 | ``` r 7 | library(dplyr) 8 | library(fastDummies) 9 | library(GGally) 10 | library(lavaan) 11 | library(loo) 12 | library(magrittr) 13 | library(mice) 14 | library(psych) 15 | library(rstanarm) 16 | library(semPlot) 17 | ``` 18 | 19 | ``` r 20 | # http://biostat.mc.vanderbilt.edu/wiki/Main/DataSets 21 | load("./data/diabetes.sav") 22 | ``` 23 | 24 | ``` r 25 | str(diabetes) 26 | ``` 27 | 28 | ## 'data.frame': 403 obs. of 19 variables: 29 | ## $ id : 'labelled' int 1000 1001 1002 1003 1005 1008 1011 1015 1016 1022 ... 30 | ## ..- attr(*, "label")= chr "Subject ID" 31 | ## $ chol : 'labelled' int 203 165 228 78 249 248 195 227 177 263 ... 32 | ## ..- attr(*, "label")= chr "Total Cholesterol" 33 | ## $ stab.glu: 'labelled' int 82 97 92 93 90 94 92 75 87 89 ... 34 | ## ..- attr(*, "label")= chr "Stabilized Glucose" 35 | ## $ hdl : 'labelled' int 56 24 37 12 28 69 41 44 49 40 ... 36 | ## ..- attr(*, "label")= chr "High Density Lipoprotein" 37 | ## $ ratio : 'labelled' num 3.6 6.9 6.2 6.5 8.9 ... 38 | ## ..- attr(*, "label")= chr "Cholesterol/HDL Ratio" 39 | ## $ glyhb : 'labelled' num 4.31 4.44 4.64 4.63 7.72 ... 40 | ## ..- attr(*, "label")= chr "Glycosolated Hemoglobin" 41 | ## $ location: Factor w/ 2 levels "Buckingham","Louisa": 1 1 1 1 1 1 1 1 1 1 ... 42 | ## $ age : int 46 29 58 67 64 34 30 37 45 55 ... 43 | ## ..- attr(*, "units")= chr "years" 44 | ## $ gender : Factor w/ 2 levels "male","female": 2 2 2 1 1 1 1 1 1 2 ... 45 | ## $ height : int 62 64 61 67 68 71 69 59 69 63 ... 46 | ## ..- attr(*, "units")= chr "inches" 47 | ## $ weight : int 121 218 256 119 183 190 191 170 166 202 ...
48 | ## ..- attr(*, "units")= chr "pounds" 49 | ## $ frame : Factor w/ 3 levels "small","medium",..: 2 3 3 3 2 3 2 2 3 1 ... 50 | ## $ bp.1s : 'labelled' int 118 112 190 110 138 132 161 NA 160 108 ... 51 | ## ..- attr(*, "label")= chr "First Systolic Blood Pressure" 52 | ## $ bp.1d : 'labelled' int 59 68 92 50 80 86 112 NA 80 72 ... 53 | ## ..- attr(*, "label")= chr "First Diastolic Blood Pressure" 54 | ## $ bp.2s : 'labelled' int NA NA 185 NA NA NA 161 NA 128 NA ... 55 | ## ..- attr(*, "label")= chr "Second Systolic Blood Pressure" 56 | ## ..- attr(*, "comment")= chr "equals first measurement if it was not high" 57 | ## $ bp.2d : 'labelled' int NA NA 92 NA NA NA 112 NA 86 NA ... 58 | ## ..- attr(*, "comment")= chr "equals first measurement if it was not high" 59 | ## ..- attr(*, "label")= chr "Second Diastolic Blood Pressure" 60 | ## $ waist : int 29 46 49 33 44 36 46 34 34 45 ... 61 | ## ..- attr(*, "units")= chr "inches" 62 | ## $ hip : int 38 48 57 38 41 42 49 39 40 50 ... 63 | ## ..- attr(*, "units")= chr "inches" 64 | ## $ time.ppn: 'labelled' int 720 360 180 480 300 195 720 1020 300 240 ... 65 | ## ..- attr(*, "label")= chr "Postprandial Time when Labs were Drawn" 66 | ## ..- attr(*, "units")= chr "minutes" 67 | 68 | ``` r 69 | # I'll not use location in this analysis 70 | diabetes <- select(diabetes, -location, -id) 71 | ``` 72 | 73 | ``` r 74 | # Let's look at summary of data 75 | summary(diabetes) 76 | ``` 77 | 78 | ## chol stab.glu hdl ratio 79 | ## Min. : 78.0 Min. : 48.0 Min. : 12.00 Min. : 1.500 80 | ## 1st Qu.:179.0 1st Qu.: 81.0 1st Qu.: 38.00 1st Qu.: 3.200 81 | ## Median :204.0 Median : 89.0 Median : 46.00 Median : 4.200 82 | ## Mean :207.8 Mean :106.7 Mean : 50.45 Mean : 4.522 83 | ## 3rd Qu.:230.0 3rd Qu.:106.0 3rd Qu.: 59.00 3rd Qu.: 5.400 84 | ## Max. :443.0 Max. :385.0 Max. :120.00 Max. :19.300 85 | ## NA's :1 NA's :1 NA's :1 86 | ## glyhb age gender height 87 | ## Min. : 2.68 Min. :19.00 male :169 Min. :52.00 88 | ## 1st Qu.: 4.38 1st Qu.:34.00 female:234 1st Qu.:63.00 89 | ## Median : 4.84 Median :45.00 Median :66.00 90 | ## Mean : 5.59 Mean :46.85 Mean :66.02 91 | ## 3rd Qu.: 5.60 3rd Qu.:60.00 3rd Qu.:69.00 92 | ## Max. :16.11 Max. :92.00 Max. :76.00 93 | ## NA's :13 NA's :5 94 | ## weight frame bp.1s bp.1d 95 | ## Min. : 99.0 small :104 Min. : 90.0 Min. : 48.00 96 | ## 1st Qu.:151.0 medium:184 1st Qu.:121.2 1st Qu.: 75.00 97 | ## Median :172.5 large :103 Median :136.0 Median : 82.00 98 | ## Mean :177.6 NA's : 12 Mean :136.9 Mean : 83.32 99 | ## 3rd Qu.:200.0 3rd Qu.:146.8 3rd Qu.: 90.00 100 | ## Max. :325.0 Max. :250.0 Max. :124.00 101 | ## NA's :1 NA's :5 NA's :5 102 | ## bp.2s bp.2d waist hip 103 | ## Min. :110.0 Min. : 60.00 Min. :26.0 Min. :30.00 104 | ## 1st Qu.:138.0 1st Qu.: 84.00 1st Qu.:33.0 1st Qu.:39.00 105 | ## Median :149.0 Median : 92.00 Median :37.0 Median :42.00 106 | ## Mean :152.4 Mean : 92.52 Mean :37.9 Mean :43.04 107 | ## 3rd Qu.:161.0 3rd Qu.:100.00 3rd Qu.:41.0 3rd Qu.:46.00 108 | ## Max. :238.0 Max. :124.00 Max. :56.0 Max. :64.00 109 | ## NA's :262 NA's :262 NA's :2 NA's :2 110 | ## time.ppn 111 | ## Min. : 5.0 112 | ## 1st Qu.: 90.0 113 | ## Median : 240.0 114 | ## Mean : 341.2 115 | ## 3rd Qu.: 517.5 116 | ## Max. 
:1560.0 117 | ## NA's :3 118 | 119 | ``` r 120 | # Investigate NA counts 121 | colSums(is.na(diabetes)) 122 | ``` 123 | 124 | ## chol stab.glu hdl ratio glyhb age gender height 125 | ## 1 0 1 1 13 0 0 5 126 | ## weight frame bp.1s bp.1d bp.2s bp.2d waist hip 127 | ## 1 12 5 5 262 262 2 2 128 | ## time.ppn 129 | ## 3 130 | 131 | ``` r 132 | # bp.2s and bp.2d variables has too much missing values 133 | 134 | # Glycosolated hemoglobin (glyhb) column has 13 NAs 135 | # I'll drop these observations 136 | diabetes <- filter(diabetes, !is.na(glyhb)) 137 | ``` 138 | 139 | ``` r 140 | # impute 141 | md.pattern(diabetes) 142 | ``` 143 | 144 | ![](figures/cs1-unnamed-chunk-9-1.png) 145 | 146 | ## stab.glu glyhb age gender chol hdl ratio weight waist hip time.ppn 147 | ## 130 1 1 1 1 1 1 1 1 1 1 1 148 | ## 236 1 1 1 1 1 1 1 1 1 1 1 149 | ## 6 1 1 1 1 1 1 1 1 1 1 1 150 | ## 3 1 1 1 1 1 1 1 1 1 1 1 151 | ## 3 1 1 1 1 1 1 1 1 1 1 1 152 | ## 4 1 1 1 1 1 1 1 1 1 1 1 153 | ## 1 1 1 1 1 1 1 1 1 1 1 1 154 | ## 1 1 1 1 1 1 1 1 1 1 1 0 155 | ## 1 1 1 1 1 1 1 1 1 1 1 0 156 | ## 1 1 1 1 1 1 1 1 1 1 1 0 157 | ## 1 1 1 1 1 1 1 1 1 0 0 1 158 | ## 1 1 1 1 1 1 1 1 1 0 0 1 159 | ## 1 1 1 1 1 1 1 1 0 1 1 1 160 | ## 1 1 1 1 1 0 0 0 1 1 1 1 161 | ## 0 0 0 0 1 1 1 1 2 2 3 162 | ## height bp.1s bp.1d frame bp.2s bp.2d 163 | ## 130 1 1 1 1 1 1 0 164 | ## 236 1 1 1 1 0 0 2 165 | ## 6 1 1 1 0 1 1 1 166 | ## 3 1 1 1 0 0 0 3 167 | ## 3 1 0 0 1 0 0 4 168 | ## 4 0 1 1 1 0 0 3 169 | ## 1 0 0 0 0 0 0 6 170 | ## 1 1 1 1 1 1 1 1 171 | ## 1 1 1 1 0 0 0 4 172 | ## 1 1 0 0 1 0 0 5 173 | ## 1 1 1 1 1 1 1 2 174 | ## 1 1 1 1 1 0 0 4 175 | ## 1 1 1 1 1 0 0 3 176 | ## 1 1 1 1 1 0 0 5 177 | ## 5 5 5 11 252 252 541 178 | 179 | ``` r 180 | diabetes_imp <- 181 | mice( 182 | data = diabetes, 183 | m = 5, 184 | maxit = 50, 185 | method = "pmm" 186 | ) 187 | ``` 188 | 189 | ``` r 190 | # Take first imputed dataset (we have 5 imputed datasets, m=5) 191 | diabetes_completed <- complete(diabetes_imp, 1) 192 | ``` 193 | 194 | ``` r 195 | # Investigate NA counts again 196 | colSums(is.na(diabetes_completed)) 197 | ``` 198 | 199 | ## chol stab.glu hdl ratio glyhb age gender height 200 | ## 0 0 0 0 0 0 0 0 201 | ## weight frame bp.1s bp.1d bp.2s bp.2d waist hip 202 | ## 0 0 0 0 0 0 0 0 203 | ## time.ppn 204 | ## 0 205 | 206 | ``` r 207 | # correlation analysis 208 | ggcorr(diabetes_completed, label = TRUE, label_alpha = .7) 209 | ``` 210 | 211 | ![](figures/cs1-unnamed-chunk-12-1.png) 212 | 213 | ``` r 214 | corr_table <- 215 | cor(diabetes_completed[, sapply(diabetes_completed, is.numeric)]) 216 | subset(as.data.frame(as.table(corr_table)), abs(Freq) > 0.5) 217 | ``` 218 | 219 | ## Var1 Var2 Freq 220 | ## 1 chol chol 1.0000000 221 | ## 17 stab.glu stab.glu 1.0000000 222 | ## 20 glyhb stab.glu 0.7492355 223 | ## 33 hdl hdl 1.0000000 224 | ## 34 ratio hdl -0.6826599 225 | ## 48 hdl ratio -0.6826599 226 | ## 49 ratio ratio 1.0000000 227 | ## 62 stab.glu glyhb 0.7492355 228 | ## 65 glyhb glyhb 1.0000000 229 | ## 81 age age 1.0000000 230 | ## 97 height height 1.0000000 231 | ## 113 weight weight 1.0000000 232 | ## 118 waist weight 0.8522011 233 | ## 119 hip weight 0.8307025 234 | ## 129 bp.1s bp.1s 1.0000000 235 | ## 130 bp.1d bp.1s 0.6054981 236 | ## 131 bp.2s bp.1s 0.8778776 237 | ## 132 bp.2d bp.1s 0.5162788 238 | ## 144 bp.1s bp.1d 0.6054981 239 | ## 145 bp.1d bp.1d 1.0000000 240 | ## 146 bp.2s bp.1d 0.5814284 241 | ## 147 bp.2d bp.1d 0.8272843 242 | ## 159 bp.1s bp.2s 0.8778776 243 | ## 160 bp.1d bp.2s 0.5814284 244 | ## 161 bp.2s bp.2s 1.0000000 245 | ## 
162 bp.2d bp.2s 0.5746704 246 | ## 174 bp.1s bp.2d 0.5162788 247 | ## 175 bp.1d bp.2d 0.8272843 248 | ## 176 bp.2s bp.2d 0.5746704 249 | ## 177 bp.2d bp.2d 1.0000000 250 | ## 188 weight waist 0.8522011 251 | ## 193 waist waist 1.0000000 252 | ## 194 hip waist 0.8341216 253 | ## 203 weight hip 0.8307025 254 | ## 208 waist hip 0.8341216 255 | ## 209 hip hip 1.0000000 256 | ## 225 time.ppn time.ppn 1.0000000 257 | 258 | ``` r 259 | # since bp.2d and bp.2s seems highly correlated with bp.1d and bp.1s and 260 | # they have a lot of missing values, I decided to discard them from analysis 261 | 262 | # also, I'll create two new variables, 263 | # BMI (body mass index) and waist-to-hip ratio 264 | 265 | diabetes_completed$bmi <- 266 | (diabetes_completed$weight / (diabetes_completed$height ** 2) * 703) 267 | diabetes_completed$waist_to_hip_rat <- 268 | diabetes_completed$waist / diabetes_completed$hip 269 | 270 | # take a subset of uncorrelated variables 271 | diabetes_completed_subset <- select( 272 | diabetes_completed, 273 | chol, 274 | ratio, 275 | glyhb, 276 | age, 277 | gender, 278 | bmi, 279 | waist_to_hip_rat, 280 | frame, 281 | bp.1s, 282 | bp.1d, 283 | time.ppn 284 | ) 285 | head(diabetes_completed_subset) 286 | ``` 287 | 288 | ## chol ratio glyhb age gender bmi waist_to_hip_rat frame bp.1s bp.1d 289 | ## 1 203 3.6 4.31 46 female 22.12877 0.7631579 medium 118 59 290 | ## 2 165 6.9 4.44 29 female 37.41553 0.9583333 large 112 68 291 | ## 3 228 6.2 4.64 58 female 48.36549 0.8596491 large 190 92 292 | ## 4 78 6.5 4.63 67 male 18.63600 0.8684211 large 110 50 293 | ## 5 249 8.9 7.72 64 male 27.82202 1.0731707 medium 138 80 294 | ## 6 248 3.6 4.81 34 male 26.49673 0.8571429 large 132 86 295 | ## time.ppn 296 | ## 1 720 297 | ## 2 360 298 | ## 3 180 299 | ## 4 480 300 | ## 5 300 301 | ## 6 195 302 | 303 | ``` r 304 | # pairs plot 305 | ggpairs(diabetes_completed_subset) 306 | ``` 307 | 308 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 309 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 310 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 311 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 312 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 313 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 314 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 315 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 316 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 317 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 318 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 319 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 320 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 321 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 322 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 323 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 324 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 325 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 
326 | 327 | ![](figures/cs1-unnamed-chunk-15-1.png) 328 | 329 | ``` r 330 | # standardize all variables 331 | diabetes_completed_subset %<>% 332 | mutate_at( 333 | funs(scale), 334 | .vars = c( 335 | "chol", 336 | "ratio", 337 | "glyhb", 338 | "age", 339 | "bmi", 340 | "waist_to_hip_rat", 341 | "bp.1s", 342 | "bp.1d", 343 | "time.ppn" 344 | ) 345 | ) 346 | ``` 347 | 348 | ``` r 349 | # Create dummy variables for gender and frame 350 | library(fastDummies) 351 | diabetes_completed_subset <- 352 | dummy_cols(diabetes_completed_subset, remove_first_dummy = TRUE) 353 | diabetes_completed_subset <- 354 | select(diabetes_completed_subset,-gender,-frame) 355 | head(diabetes_completed_subset) 356 | ``` 357 | 358 | ## chol ratio glyhb age bmi 359 | ## 1 -0.09319585 -0.5301616 -0.5706645 -0.04711384 -0.9973448 360 | ## 2 -0.94314197 1.3678022 -0.5126959 -1.08143433 1.3055456 361 | ## 3 0.46597923 0.9652036 -0.4235136 0.68299474 2.9551156 362 | ## 4 -2.88907124 1.1377459 -0.4279726 1.23057617 -1.5235175 363 | ## 5 0.93568630 2.5180829 0.9498954 1.04804903 -0.1396797 364 | ## 6 0.91331929 -0.5301616 -0.3477085 -0.77722242 -0.3393293 365 | ## waist_to_hip_rat bp.1s bp.1d time.ppn gender_male 366 | ## 1 -1.6083402 -0.82988906 -1.7821860 1.25031434 0 367 | ## 2 1.0550300 -1.09181790 -1.1181935 0.08200624 0 368 | ## 3 -0.2916179 2.31325699 0.6524530 -0.50214781 0 369 | ## 4 -0.1719158 -1.17912751 -2.4461784 0.47144227 1 370 | ## 5 2.6221047 0.04320706 -0.2328703 -0.11271177 1 371 | ## 6 -0.3258185 -0.21872177 0.2097913 -0.45346830 1 372 | ## frame_large frame_small 373 | ## 1 0 0 374 | ## 2 1 0 375 | ## 3 1 0 376 | ## 4 1 0 377 | ## 5 0 0 378 | ## 6 1 0 379 | 380 | ``` r 381 | # Explonatory Factor analysis 382 | fa.parallel(select(diabetes_completed_subset,-glyhb)) 383 | ``` 384 | 385 | ![](figures/cs1-unnamed-chunk-18-1.png) 386 | 387 | ## Parallel analysis suggests that the number of factors = 6 and the number of components = 4 388 | 389 | ``` r 390 | diabetes_completed_subset_fi <- 391 | fa( 392 | select(diabetes_completed_subset,-glyhb), 393 | nfactors = 6, 394 | fm = "pa", 395 | max.iter = 200 396 | ) 397 | ``` 398 | 399 | ``` r 400 | fa.diagram(diabetes_completed_subset_fi) 401 | ``` 402 | 403 | ![](figures/cs1-unnamed-chunk-19-1.png) 404 | 405 | ``` r 406 | fl <- round(unclass(diabetes_completed_subset_fi$loadings), 2) 407 | fl 408 | ``` 409 | 410 | ## PA2 PA3 PA1 PA5 PA4 PA6 411 | ## chol 0.07 -0.10 0.05 0.75 -0.12 0.09 412 | ## ratio -0.08 0.17 -0.01 0.67 0.19 -0.12 413 | ## age -0.02 -0.02 0.99 0.02 -0.01 0.00 414 | ## bmi 0.06 0.84 -0.06 0.03 -0.15 -0.04 415 | ## waist_to_hip_rat 0.01 0.19 0.18 0.08 0.47 -0.05 416 | ## bp.1s 0.58 0.05 0.38 0.02 -0.02 0.00 417 | ## bp.1d 0.98 0.01 -0.07 0.00 0.03 0.00 418 | ## time.ppn -0.09 -0.04 -0.10 0.04 -0.03 0.36 419 | ## gender_male 0.06 -0.15 -0.04 0.00 0.79 0.04 420 | ## frame_large -0.07 0.49 0.15 -0.09 0.31 0.18 421 | ## frame_small -0.05 -0.42 -0.03 -0.14 -0.13 -0.29 422 | 423 | ``` r 424 | # Let's start to build models 425 | model1 <- stan_glm('glyhb ~ .', data = diabetes_completed_subset) 426 | ``` 427 | 428 | ``` r 429 | model1 430 | ``` 431 | 432 | ## stan_glm 433 | ## family: gaussian [identity] 434 | ## formula: "glyhb ~ ." 
435 | ## observations: 390 436 | ## predictors: 12 437 | ## ------ 438 | ## Median MAD_SD 439 | ## (Intercept) 0.0 0.1 440 | ## chol 0.1 0.1 441 | ## ratio 0.2 0.1 442 | ## age 0.3 0.1 443 | ## bmi 0.1 0.1 444 | ## waist_to_hip_rat 0.0 0.1 445 | ## bp.1s 0.1 0.1 446 | ## bp.1d 0.0 0.1 447 | ## time.ppn 0.1 0.0 448 | ## gender_male 0.0 0.1 449 | ## frame_large 0.0 0.1 450 | ## frame_small 0.0 0.1 451 | ## sigma 0.9 0.0 452 | ## 453 | ## Sample avg. posterior predictive distribution of y: 454 | ## Median MAD_SD 455 | ## mean_PPD 0.0 0.1 456 | ## 457 | ## ------ 458 | ## For info on the priors used see help('prior_summary.stanreg'). 459 | 460 | ``` r 461 | summary(model1) 462 | ``` 463 | 464 | ## 465 | ## Model Info: 466 | ## 467 | ## function: stan_glm 468 | ## family: gaussian [identity] 469 | ## formula: "glyhb ~ ." 470 | ## algorithm: sampling 471 | ## priors: see help('prior_summary') 472 | ## sample: 4000 (posterior sample size) 473 | ## observations: 390 474 | ## predictors: 12 475 | ## 476 | ## Estimates: 477 | ## mean sd 2.5% 25% 50% 75% 97.5% 478 | ## (Intercept) 0.0 0.1 -0.2 -0.1 0.0 0.1 0.2 479 | ## chol 0.1 0.1 -0.1 0.0 0.1 0.1 0.2 480 | ## ratio 0.2 0.1 0.1 0.2 0.2 0.3 0.3 481 | ## age 0.3 0.1 0.1 0.2 0.3 0.3 0.4 482 | ## bmi 0.1 0.1 0.0 0.0 0.1 0.1 0.2 483 | ## waist_to_hip_rat 0.0 0.1 -0.1 0.0 0.0 0.1 0.2 484 | ## bp.1s 0.1 0.1 -0.1 0.0 0.1 0.1 0.2 485 | ## bp.1d 0.0 0.1 -0.2 -0.1 0.0 0.0 0.1 486 | ## time.ppn 0.1 0.0 0.0 0.0 0.1 0.1 0.1 487 | ## gender_male 0.0 0.1 -0.2 0.0 0.0 0.1 0.2 488 | ## frame_large 0.0 0.1 -0.3 -0.1 0.0 0.0 0.2 489 | ## frame_small 0.0 0.1 -0.2 -0.1 0.0 0.1 0.2 490 | ## sigma 0.9 0.0 0.8 0.9 0.9 0.9 1.0 491 | ## mean_PPD 0.0 0.1 -0.1 0.0 0.0 0.0 0.1 492 | ## log-posterior -529.0 2.6 -534.8 -530.6 -528.6 -527.1 -524.9 493 | ## 494 | ## Diagnostics: 495 | ## mcse Rhat n_eff 496 | ## (Intercept) 0.0 1.0 4000 497 | ## chol 0.0 1.0 4000 498 | ## ratio 0.0 1.0 4000 499 | ## age 0.0 1.0 4000 500 | ## bmi 0.0 1.0 4000 501 | ## waist_to_hip_rat 0.0 1.0 4000 502 | ## bp.1s 0.0 1.0 3638 503 | ## bp.1d 0.0 1.0 3939 504 | ## time.ppn 0.0 1.0 4000 505 | ## gender_male 0.0 1.0 4000 506 | ## frame_large 0.0 1.0 4000 507 | ## frame_small 0.0 1.0 4000 508 | ## sigma 0.0 1.0 4000 509 | ## mean_PPD 0.0 1.0 4000 510 | ## log-posterior 0.1 1.0 1764 511 | ## 512 | ## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1). 513 | 514 | ``` r 515 | par(mfrow = c(2, 2), mar = c(3, 5, 3, 3)) 516 | plot(model1) 517 | ``` 518 | 519 | ![](figures/cs1-unnamed-chunk-22-1.png) 520 | 521 | ``` r 522 | model2 <- 523 | stan_glm('glyhb ~ ratio + age', data = diabetes_completed_subset) 524 | ``` 525 | 526 | ``` r 527 | model2 528 | ``` 529 | 530 | ## stan_glm 531 | ## family: gaussian [identity] 532 | ## formula: "glyhb ~ ratio + age" 533 | ## observations: 390 534 | ## predictors: 3 535 | ## ------ 536 | ## Median MAD_SD 537 | ## (Intercept) 0.0 0.0 538 | ## ratio 0.3 0.0 539 | ## age 0.3 0.0 540 | ## sigma 0.9 0.0 541 | ## 542 | ## Sample avg. posterior predictive distribution of y: 543 | ## Median MAD_SD 544 | ## mean_PPD 0.0 0.1 545 | ## 546 | ## ------ 547 | ## For info on the priors used see help('prior_summary.stanreg'). 
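Before reading the numerical summary, a graphical posterior predictive check is a quick sanity test. A minimal sketch, assuming the `model2` fit above (rstanarm's `pp_check()` overlays densities of replicated outcomes on the observed `glyhb`):

``` r
# Sketch (not run in the original): does model2 reproduce the shape of glyhb?
pp_check(model2, plotfun = "dens_overlay", nreps = 50)
```

A systematic mismatch between the replicated and observed densities would argue against relying on the information-criterion comparison alone.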
548 | 549 | ``` r 550 | summary(model2) 551 | ``` 552 | 553 | ## 554 | ## Model Info: 555 | ## 556 | ## function: stan_glm 557 | ## family: gaussian [identity] 558 | ## formula: "glyhb ~ ratio + age" 559 | ## algorithm: sampling 560 | ## priors: see help('prior_summary') 561 | ## sample: 4000 (posterior sample size) 562 | ## observations: 390 563 | ## predictors: 3 564 | ## 565 | ## Estimates: 566 | ## mean sd 2.5% 25% 50% 75% 97.5% 567 | ## (Intercept) 0.0 0.0 -0.1 0.0 0.0 0.0 0.1 568 | ## ratio 0.3 0.0 0.2 0.3 0.3 0.3 0.4 569 | ## age 0.3 0.0 0.2 0.3 0.3 0.3 0.4 570 | ## sigma 0.9 0.0 0.8 0.9 0.9 0.9 1.0 571 | ## mean_PPD 0.0 0.1 -0.1 0.0 0.0 0.0 0.1 572 | ## log-posterior -519.3 1.4 -522.8 -520.1 -519.0 -518.3 -517.6 573 | ## 574 | ## Diagnostics: 575 | ## mcse Rhat n_eff 576 | ## (Intercept) 0.0 1.0 4000 577 | ## ratio 0.0 1.0 4000 578 | ## age 0.0 1.0 4000 579 | ## sigma 0.0 1.0 4000 580 | ## mean_PPD 0.0 1.0 4000 581 | ## log-posterior 0.0 1.0 1855 582 | ## 583 | ## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1). 584 | 585 | ``` r 586 | par(mfrow = c(2, 2), mar = c(3, 5, 3, 3)) 587 | plot(model2) 588 | ``` 589 | 590 | ![](figures/cs1-unnamed-chunk-23-1.png) 591 | 592 | ``` r 593 | model3 <- 594 | stan_glm('glyhb ~ bmi + waist_to_hip_rat', data = diabetes_completed_subset) 595 | ``` 596 | 597 | ``` r 598 | model3 599 | ``` 600 | 601 | ## stan_glm 602 | ## family: gaussian [identity] 603 | ## formula: "glyhb ~ bmi + waist_to_hip_rat" 604 | ## observations: 390 605 | ## predictors: 3 606 | ## ------ 607 | ## Median MAD_SD 608 | ## (Intercept) 0.0 0.0 609 | ## bmi 0.1 0.0 610 | ## waist_to_hip_rat 0.2 0.1 611 | ## sigma 1.0 0.0 612 | ## 613 | ## Sample avg. posterior predictive distribution of y: 614 | ## Median MAD_SD 615 | ## mean_PPD 0.0 0.1 616 | ## 617 | ## ------ 618 | ## For info on the priors used see help('prior_summary.stanreg'). 619 | 620 | ``` r 621 | summary(model3) 622 | ``` 623 | 624 | ## 625 | ## Model Info: 626 | ## 627 | ## function: stan_glm 628 | ## family: gaussian [identity] 629 | ## formula: "glyhb ~ bmi + waist_to_hip_rat" 630 | ## algorithm: sampling 631 | ## priors: see help('prior_summary') 632 | ## sample: 4000 (posterior sample size) 633 | ## observations: 390 634 | ## predictors: 3 635 | ## 636 | ## Estimates: 637 | ## mean sd 2.5% 25% 50% 75% 97.5% 638 | ## (Intercept) 0.0 0.0 -0.1 0.0 0.0 0.0 0.1 639 | ## bmi 0.1 0.1 0.0 0.1 0.1 0.1 0.2 640 | ## waist_to_hip_rat 0.2 0.1 0.1 0.1 0.2 0.2 0.3 641 | ## sigma 1.0 0.0 0.9 1.0 1.0 1.0 1.1 642 | ## mean_PPD 0.0 0.1 -0.1 0.0 0.0 0.0 0.1 643 | ## log-posterior -551.6 1.5 -555.3 -552.3 -551.2 -550.5 -549.8 644 | ## 645 | ## Diagnostics: 646 | ## mcse Rhat n_eff 647 | ## (Intercept) 0.0 1.0 4000 648 | ## bmi 0.0 1.0 4000 649 | ## waist_to_hip_rat 0.0 1.0 4000 650 | ## sigma 0.0 1.0 4000 651 | ## mean_PPD 0.0 1.0 4000 652 | ## log-posterior 0.0 1.0 1792 653 | ## 654 | ## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1). 
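The 2.5%–97.5% columns in the summary above can also be extracted directly rather than read off the printout. A minimal sketch, assuming the `model3` object from the previous chunk:

``` r
# Sketch: 90% central posterior intervals for the model3 coefficients,
# the programmatic counterpart of the quantile columns printed above.
posterior_interval(model3, prob = 0.9, pars = c("bmi", "waist_to_hip_rat"))
```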
655 | 656 | ``` r 657 | par(mfrow = c(2, 2), mar = c(3, 5, 3, 3)) 658 | plot(model3) 659 | ``` 660 | 661 | ![](figures/cs1-unnamed-chunk-24-1.png) 662 | 663 | ``` r 664 | model4 <- 665 | stan_glm('glyhb ~ ratio + age + bmi + waist_to_hip_rat', data = diabetes_completed_subset) 666 | ``` 667 | 668 | ``` r 669 | model4 670 | ``` 671 | 672 | ## stan_glm 673 | ## family: gaussian [identity] 674 | ## formula: "glyhb ~ ratio + age + bmi + waist_to_hip_rat" 675 | ## observations: 390 676 | ## predictors: 5 677 | ## ------ 678 | ## Median MAD_SD 679 | ## (Intercept) 0.0 0.0 680 | ## ratio 0.3 0.0 681 | ## age 0.3 0.0 682 | ## bmi 0.1 0.0 683 | ## waist_to_hip_rat 0.0 0.1 684 | ## sigma 0.9 0.0 685 | ## 686 | ## Sample avg. posterior predictive distribution of y: 687 | ## Median MAD_SD 688 | ## mean_PPD 0.0 0.1 689 | ## 690 | ## ------ 691 | ## For info on the priors used see help('prior_summary.stanreg'). 692 | 693 | ``` r 694 | summary(model4) 695 | ``` 696 | 697 | ## 698 | ## Model Info: 699 | ## 700 | ## function: stan_glm 701 | ## family: gaussian [identity] 702 | ## formula: "glyhb ~ ratio + age + bmi + waist_to_hip_rat" 703 | ## algorithm: sampling 704 | ## priors: see help('prior_summary') 705 | ## sample: 4000 (posterior sample size) 706 | ## observations: 390 707 | ## predictors: 5 708 | ## 709 | ## Estimates: 710 | ## mean sd 2.5% 25% 50% 75% 97.5% 711 | ## (Intercept) 0.0 0.0 -0.1 0.0 0.0 0.0 0.1 712 | ## ratio 0.3 0.0 0.2 0.2 0.3 0.3 0.4 713 | ## age 0.3 0.0 0.2 0.3 0.3 0.3 0.4 714 | ## bmi 0.1 0.0 0.0 0.0 0.1 0.1 0.2 715 | ## waist_to_hip_rat 0.0 0.0 -0.1 0.0 0.0 0.1 0.1 716 | ## sigma 0.9 0.0 0.8 0.9 0.9 0.9 1.0 717 | ## mean_PPD 0.0 0.1 -0.1 0.0 0.0 0.0 0.1 718 | ## log-posterior -521.0 1.7 -525.2 -521.9 -520.6 -519.7 -518.6 719 | ## 720 | ## Diagnostics: 721 | ## mcse Rhat n_eff 722 | ## (Intercept) 0.0 1.0 4000 723 | ## ratio 0.0 1.0 4000 724 | ## age 0.0 1.0 4000 725 | ## bmi 0.0 1.0 4000 726 | ## waist_to_hip_rat 0.0 1.0 4000 727 | ## sigma 0.0 1.0 4000 728 | ## mean_PPD 0.0 1.0 4000 729 | ## log-posterior 0.0 1.0 1911 730 | ## 731 | ## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1). 732 | 733 | ``` r 734 | par(mfrow = c(2, 2), mar = c(3, 5, 3, 3)) 735 | plot(model4) 736 | ``` 737 | 738 | ![](figures/cs1-unnamed-chunk-25-1.png) 739 | 740 | ``` r 741 | model5 <- 742 | stan_glm('glyhb ~ ratio + age + bmi', data = diabetes_completed_subset) 743 | ``` 744 | 745 | ``` r 746 | model5 747 | ``` 748 | 749 | ## stan_glm 750 | ## family: gaussian [identity] 751 | ## formula: "glyhb ~ ratio + age + bmi" 752 | ## observations: 390 753 | ## predictors: 4 754 | ## ------ 755 | ## Median MAD_SD 756 | ## (Intercept) 0.0 0.0 757 | ## ratio 0.3 0.0 758 | ## age 0.3 0.0 759 | ## bmi 0.1 0.0 760 | ## sigma 0.9 0.0 761 | ## 762 | ## Sample avg. posterior predictive distribution of y: 763 | ## Median MAD_SD 764 | ## mean_PPD 0.0 0.1 765 | ## 766 | ## ------ 767 | ## For info on the priors used see help('prior_summary.stanreg'). 
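Because every variable (including `glyhb`) was standardized earlier, predictions from these fits live on the standardized scale. A minimal sketch with a hypothetical profile (`ratio` one standard deviation above the mean, average `age` and `bmi`), assuming the `model5` fit above:

``` r
# Sketch (hypothetical newdata on the standardized scale; not in the original):
new_obs <- data.frame(ratio = 1, age = 0, bmi = 0)
draws <- posterior_predict(model5, newdata = new_obs)
quantile(draws, probs = c(0.05, 0.5, 0.95)) # 90% predictive interval for glyhb
```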
768 | 769 | ``` r 770 | summary(model5) 771 | ``` 772 | 773 | ## 774 | ## Model Info: 775 | ## 776 | ## function: stan_glm 777 | ## family: gaussian [identity] 778 | ## formula: "glyhb ~ ratio + age + bmi" 779 | ## algorithm: sampling 780 | ## priors: see help('prior_summary') 781 | ## sample: 4000 (posterior sample size) 782 | ## observations: 390 783 | ## predictors: 4 784 | ## 785 | ## Estimates: 786 | ## mean sd 2.5% 25% 50% 75% 97.5% 787 | ## (Intercept) 0.0 0.0 -0.1 0.0 0.0 0.0 0.1 788 | ## ratio 0.3 0.0 0.2 0.2 0.3 0.3 0.4 789 | ## age 0.3 0.0 0.2 0.3 0.3 0.3 0.4 790 | ## bmi 0.1 0.0 0.0 0.0 0.1 0.1 0.2 791 | ## sigma 0.9 0.0 0.8 0.9 0.9 0.9 1.0 792 | ## mean_PPD 0.0 0.1 -0.1 0.0 0.0 0.0 0.1 793 | ## log-posterior -519.8 1.5 -523.4 -520.6 -519.5 -518.7 -517.8 794 | ## 795 | ## Diagnostics: 796 | ## mcse Rhat n_eff 797 | ## (Intercept) 0.0 1.0 4000 798 | ## ratio 0.0 1.0 4000 799 | ## age 0.0 1.0 4000 800 | ## bmi 0.0 1.0 4000 801 | ## sigma 0.0 1.0 4000 802 | ## mean_PPD 0.0 1.0 4000 803 | ## log-posterior 0.0 1.0 1941 804 | ## 805 | ## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1). 806 | 807 | ``` r 808 | par(mfrow = c(2, 2), mar = c(3, 5, 3, 3)) 809 | plot(model5) 810 | ``` 811 | 812 | ![](figures/cs1-unnamed-chunk-26-1.png) 813 | 814 | ``` r 815 | ic <- data.frame( 816 | Model = c("model1", "model2", "model3", "model4", "model5"), 817 | WAIC = c(waic(model1)$estimates[3,1], waic(model2)$estimates[3,1], waic(model3)$estimates[3,1], waic(model4)$estimates[3,1], waic(model5)$estimates[3,1]), 818 | stringsAsFactors = FALSE 819 | ) 820 | ``` 821 | 822 | ``` r 823 | ic 824 | ``` 825 | 826 | ## Model WAIC 827 | ## 1 model1 1045.760 828 | ## 2 model2 1033.905 829 | ## 3 model3 1097.760 830 | ## 4 model4 1035.492 831 | ## 5 model5 1034.094 832 | 833 | ``` r 834 | # Let's build a SEM model 835 | library(lavaan) 836 | semModel1 <- ' 837 | pa1 =~ age 838 | pa2 =~ bp.1d + bp.1s 839 | pa3 =~ bmi + frame_large + frame_small 840 | pa4 =~ gender_male + waist_to_hip_rat 841 | pa5 =~ ratio + chol 842 | pa6 =~ time.ppn 843 | 844 | glyhb ~ pa1 + pa2 + pa3 + pa4 + pa5 + pa6 845 | ' 846 | fit1 <- sem(semModel1, 847 | data = diabetes_completed_subset) 848 | ``` 849 | 850 | ## Warning in lav_object_post_check(object): lavaan WARNING: some estimated ov 851 | ## variances are negative 852 | 853 | ``` r 854 | fit1 855 | ``` 856 | 857 | ## lavaan 0.6-2 ended normally after 144 iterations 858 | ## 859 | ## Optimization method NLMINB 860 | ## Number of free parameters 42 861 | ## 862 | ## Number of observations 390 863 | ## 864 | ## Estimator ML 865 | ## Model Fit Test Statistic 178.781 866 | ## Degrees of freedom 36 867 | ## P-value (Chi-square) 0.000 868 | 869 | ``` r 870 | semPaths(fit1) 871 | ``` 872 | 873 | ![](figures/cs1-unnamed-chunk-29-1.png) 874 | 875 | ``` r 876 | summary(fit1, standardized = TRUE, fit.measures = TRUE) 877 | ``` 878 | 879 | ## lavaan 0.6-2 ended normally after 144 iterations 880 | ## 881 | ## Optimization method NLMINB 882 | ## Number of free parameters 42 883 | ## 884 | ## Number of observations 390 885 | ## 886 | ## Estimator ML 887 | ## Model Fit Test Statistic 178.781 888 | ## Degrees of freedom 36 889 | ## P-value (Chi-square) 0.000 890 | ## 891 | ## Model test baseline model: 892 | ## 893 | ## Minimum Function Test Statistic 974.533 894 | ## Degrees of freedom 66 895 | ## P-value 0.000 896 | ## 897 | ## User model versus baseline 
model: 898 | ## 899 | ## Comparative Fit Index (CFI) 0.843 900 | ## Tucker-Lewis Index (TLI) 0.712 901 | ## 902 | ## Loglikelihood and Information Criteria: 903 | ## 904 | ## Loglikelihood user model (H0) -5323.322 905 | ## Loglikelihood unrestricted model (H1) -5233.931 906 | ## 907 | ## Number of free parameters 42 908 | ## Akaike (AIC) 10730.643 909 | ## Bayesian (BIC) 10897.222 910 | ## Sample-size adjusted Bayesian (BIC) 10763.958 911 | ## 912 | ## Root Mean Square Error of Approximation: 913 | ## 914 | ## RMSEA 0.101 915 | ## 90 Percent Confidence Interval 0.086 0.116 916 | ## P-value RMSEA <= 0.05 0.000 917 | ## 918 | ## Standardized Root Mean Square Residual: 919 | ## 920 | ## SRMR 0.064 921 | ## 922 | ## Parameter Estimates: 923 | ## 924 | ## Information Expected 925 | ## Information saturated (h1) model Structured 926 | ## Standard Errors Standard 927 | ## 928 | ## Latent Variables: 929 | ## Estimate Std.Err z-value P(>|z|) Std.lv Std.all 930 | ## pa1 =~ 931 | ## age 1.000 0.999 1.000 932 | ## pa2 =~ 933 | ## bp.1d 1.000 0.340 0.340 934 | ## bp.1s 5.235 2.648 1.977 0.048 1.778 1.780 935 | ## pa3 =~ 936 | ## bmi 1.000 0.532 0.533 937 | ## frame_large 0.483 0.072 6.705 0.000 0.257 0.586 938 | ## frame_small -0.543 0.080 -6.793 0.000 -0.289 -0.652 939 | ## pa4 =~ 940 | ## gender_male 1.000 0.157 0.320 941 | ## waist_to_hp_rt 6.946 2.881 2.411 0.016 1.094 1.095 942 | ## pa5 =~ 943 | ## ratio 1.000 0.882 0.883 944 | ## chol 0.612 0.106 5.760 0.000 0.539 0.540 945 | ## pa6 =~ 946 | ## time.ppn 1.000 0.999 1.000 947 | ## 948 | ## Regressions: 949 | ## Estimate Std.Err z-value P(>|z|) Std.lv Std.all 950 | ## glyhb ~ 951 | ## pa1 0.257 0.049 5.254 0.000 0.256 0.257 952 | ## pa2 0.055 0.061 0.892 0.372 0.019 0.019 953 | ## pa3 0.063 0.134 0.469 0.639 0.033 0.033 954 | ## pa4 0.132 0.290 0.456 0.648 0.021 0.021 955 | ## pa5 0.351 0.090 3.916 0.000 0.310 0.310 956 | ## pa6 0.056 0.046 1.222 0.222 0.056 0.056 957 | ## 958 | ## Covariances: 959 | ## Estimate Std.Err z-value P(>|z|) Std.lv Std.all 960 | ## pa1 ~~ 961 | ## pa2 0.088 0.050 1.766 0.077 0.259 0.259 962 | ## pa3 0.130 0.037 3.510 0.000 0.244 0.244 963 | ## pa4 0.040 0.019 2.163 0.031 0.256 0.256 964 | ## pa5 0.187 0.051 3.681 0.000 0.213 0.213 965 | ## pa6 -0.039 0.051 -0.778 0.437 -0.039 -0.039 966 | ## pa2 ~~ 967 | ## pa3 0.019 0.012 1.564 0.118 0.108 0.108 968 | ## pa4 0.003 0.003 1.249 0.212 0.059 0.059 969 | ## pa5 0.023 0.015 1.499 0.134 0.077 0.077 970 | ## pa6 -0.008 0.010 -0.840 0.401 -0.024 -0.024 971 | ## pa3 ~~ 972 | ## pa4 0.027 0.013 2.113 0.035 0.322 0.322 973 | ## pa5 0.182 0.039 4.618 0.000 0.388 0.388 974 | ## pa6 0.036 0.034 1.062 0.288 0.069 0.069 975 | ## pa4 ~~ 976 | ## pa5 0.034 0.016 2.109 0.035 0.248 0.248 977 | ## pa6 0.000 0.007 0.003 0.998 0.000 0.000 978 | ## pa5 ~~ 979 | ## pa6 -0.037 0.050 -0.733 0.464 -0.042 -0.042 980 | ## 981 | ## Variances: 982 | ## Estimate Std.Err z-value P(>|z|) Std.lv Std.all 983 | ## .age 0.000 0.000 0.000 984 | ## .bp.1d 0.882 0.084 10.560 0.000 0.882 0.884 985 | ## .bp.1s -2.164 1.506 -1.437 0.151 -2.164 -2.170 986 | ## .bmi 0.714 0.065 10.975 0.000 0.714 0.716 987 | ## .frame_large 0.126 0.013 9.944 0.000 0.126 0.656 988 | ## .frame_small 0.113 0.014 8.366 0.000 0.113 0.575 989 | ## .gender_male 0.218 0.018 11.936 0.000 0.218 0.898 990 | ## .waist_to_hp_rt -0.199 0.458 -0.435 0.664 -0.199 -0.200 991 | ## .ratio 0.219 0.124 1.771 0.077 0.219 0.220 992 | ## .chol 0.706 0.068 10.337 0.000 0.706 0.708 993 | ## .time.ppn 0.000 0.000 0.000 994 | ## .glyhb 0.777 0.059 13.243 0.000 
0.777 0.779 995 | ## pa1 0.997 0.071 13.964 0.000 1.000 1.000 996 | ## pa2 0.115 0.064 1.802 0.072 1.000 1.000 997 | ## pa3 0.283 0.064 4.423 0.000 1.000 1.000 998 | ## pa4 0.025 0.012 2.035 0.042 1.000 1.000 999 | ## pa5 0.778 0.141 5.508 0.000 1.000 1.000 1000 | ## pa6 0.997 0.071 13.964 0.000 1.000 1.000 1001 | 1002 | ``` r 1003 | parameterEstimates(fit1) 1004 | ``` 1005 | 1006 | ## lhs op rhs est se z pvalue 1007 | ## 1 pa1 =~ age 1.000 0.000 NA NA 1008 | ## 2 pa2 =~ bp.1d 1.000 0.000 NA NA 1009 | ## 3 pa2 =~ bp.1s 5.235 2.648 1.977 0.048 1010 | ## 4 pa3 =~ bmi 1.000 0.000 NA NA 1011 | ## 5 pa3 =~ frame_large 0.483 0.072 6.705 0.000 1012 | ## 6 pa3 =~ frame_small -0.543 0.080 -6.793 0.000 1013 | ## 7 pa4 =~ gender_male 1.000 0.000 NA NA 1014 | ## 8 pa4 =~ waist_to_hip_rat 6.946 2.881 2.411 0.016 1015 | ## 9 pa5 =~ ratio 1.000 0.000 NA NA 1016 | ## 10 pa5 =~ chol 0.612 0.106 5.760 0.000 1017 | ## 11 pa6 =~ time.ppn 1.000 0.000 NA NA 1018 | ## 12 glyhb ~ pa1 0.257 0.049 5.254 0.000 1019 | ## 13 glyhb ~ pa2 0.055 0.061 0.892 0.372 1020 | ## 14 glyhb ~ pa3 0.063 0.134 0.469 0.639 1021 | ## 15 glyhb ~ pa4 0.132 0.290 0.456 0.648 1022 | ## 16 glyhb ~ pa5 0.351 0.090 3.916 0.000 1023 | ## 17 glyhb ~ pa6 0.056 0.046 1.222 0.222 1024 | ## 18 age ~~ age 0.000 0.000 NA NA 1025 | ## 19 bp.1d ~~ bp.1d 0.882 0.084 10.560 0.000 1026 | ## 20 bp.1s ~~ bp.1s -2.164 1.506 -1.437 0.151 1027 | ## 21 bmi ~~ bmi 0.714 0.065 10.975 0.000 1028 | ## 22 frame_large ~~ frame_large 0.126 0.013 9.944 0.000 1029 | ## 23 frame_small ~~ frame_small 0.113 0.014 8.366 0.000 1030 | ## 24 gender_male ~~ gender_male 0.218 0.018 11.936 0.000 1031 | ## 25 waist_to_hip_rat ~~ waist_to_hip_rat -0.199 0.458 -0.435 0.664 1032 | ## 26 ratio ~~ ratio 0.219 0.124 1.771 0.077 1033 | ## 27 chol ~~ chol 0.706 0.068 10.337 0.000 1034 | ## 28 time.ppn ~~ time.ppn 0.000 0.000 NA NA 1035 | ## 29 glyhb ~~ glyhb 0.777 0.059 13.243 0.000 1036 | ## 30 pa1 ~~ pa1 0.997 0.071 13.964 0.000 1037 | ## 31 pa2 ~~ pa2 0.115 0.064 1.802 0.072 1038 | ## 32 pa3 ~~ pa3 0.283 0.064 4.423 0.000 1039 | ## 33 pa4 ~~ pa4 0.025 0.012 2.035 0.042 1040 | ## 34 pa5 ~~ pa5 0.778 0.141 5.508 0.000 1041 | ## 35 pa6 ~~ pa6 0.997 0.071 13.964 0.000 1042 | ## 36 pa1 ~~ pa2 0.088 0.050 1.766 0.077 1043 | ## 37 pa1 ~~ pa3 0.130 0.037 3.510 0.000 1044 | ## 38 pa1 ~~ pa4 0.040 0.019 2.163 0.031 1045 | ## 39 pa1 ~~ pa5 0.187 0.051 3.681 0.000 1046 | ## 40 pa1 ~~ pa6 -0.039 0.051 -0.778 0.437 1047 | ## 41 pa2 ~~ pa3 0.019 0.012 1.564 0.118 1048 | ## 42 pa2 ~~ pa4 0.003 0.003 1.249 0.212 1049 | ## 43 pa2 ~~ pa5 0.023 0.015 1.499 0.134 1050 | ## 44 pa2 ~~ pa6 -0.008 0.010 -0.840 0.401 1051 | ## 45 pa3 ~~ pa4 0.027 0.013 2.113 0.035 1052 | ## 46 pa3 ~~ pa5 0.182 0.039 4.618 0.000 1053 | ## 47 pa3 ~~ pa6 0.036 0.034 1.062 0.288 1054 | ## 48 pa4 ~~ pa5 0.034 0.016 2.109 0.035 1055 | ## 49 pa4 ~~ pa6 0.000 0.007 0.003 0.998 1056 | ## 50 pa5 ~~ pa6 -0.037 0.050 -0.733 0.464 1057 | ## ci.lower ci.upper 1058 | ## 1 1.000 1.000 1059 | ## 2 1.000 1.000 1060 | ## 3 0.046 10.425 1061 | ## 4 1.000 1.000 1062 | ## 5 0.341 0.624 1063 | ## 6 -0.700 -0.386 1064 | ## 7 1.000 1.000 1065 | ## 8 1.300 12.592 1066 | ## 9 1.000 1.000 1067 | ## 10 0.403 0.820 1068 | ## 11 1.000 1.000 1069 | ## 12 0.161 0.353 1070 | ## 13 -0.065 0.174 1071 | ## 14 -0.199 0.325 1072 | ## 15 -0.436 0.700 1073 | ## 16 0.175 0.527 1074 | ## 17 -0.034 0.146 1075 | ## 18 0.000 0.000 1076 | ## 19 0.718 1.046 1077 | ## 20 -5.116 0.787 1078 | ## 21 0.587 0.842 1079 | ## 22 0.101 0.151 1080 | ## 23 0.087 0.140 1081 | ## 24 
0.182 0.254 1082 | ## 25 -1.096 0.698 1083 | ## 26 -0.023 0.462 1084 | ## 27 0.573 0.840 1085 | ## 28 0.000 0.000 1086 | ## 29 0.662 0.892 1087 | ## 30 0.857 1.137 1088 | ## 31 -0.010 0.241 1089 | ## 32 0.158 0.409 1090 | ## 33 0.001 0.049 1091 | ## 34 0.501 1.055 1092 | ## 35 0.857 1.137 1093 | ## 36 -0.010 0.186 1094 | ## 37 0.057 0.203 1095 | ## 38 0.004 0.077 1096 | ## 39 0.088 0.287 1097 | ## 40 -0.138 0.060 1098 | ## 41 -0.005 0.044 1099 | ## 42 -0.002 0.008 1100 | ## 43 -0.007 0.053 1101 | ## 44 -0.027 0.011 1102 | ## 45 0.002 0.052 1103 | ## 46 0.105 0.260 1104 | ## 47 -0.031 0.104 1105 | ## 48 0.002 0.067 1106 | ## 49 -0.014 0.014 1107 | ## 50 -0.135 0.061 1108 | 1109 | ``` r 1110 | # Second SEM model 1111 | semModel2 <- ' 1112 | pa1 =~ age 1113 | pa5 =~ ratio + chol 1114 | 1115 | glyhb ~ pa1 + pa5 1116 | ' 1117 | fit2 <- sem(semModel2, 1118 | data = diabetes_completed_subset) 1119 | fit2 1120 | ``` 1121 | 1122 | ## lavaan 0.6-2 ended normally after 21 iterations 1123 | ## 1124 | ## Optimization method NLMINB 1125 | ## Number of free parameters 9 1126 | ## 1127 | ## Number of observations 390 1128 | ## 1129 | ## Estimator ML 1130 | ## Model Fit Test Statistic 7.350 1131 | ## Degrees of freedom 1 1132 | ## P-value (Chi-square) 0.007 1133 | 1134 | ``` r 1135 | semPaths(fit2) 1136 | ``` 1137 | 1138 | ![](figures/cs1-unnamed-chunk-33-1.png) 1139 | 1140 | ``` r 1141 | summary(fit2, standardized = TRUE, fit.measures = TRUE) 1142 | ``` 1143 | 1144 | ## lavaan 0.6-2 ended normally after 21 iterations 1145 | ## 1146 | ## Optimization method NLMINB 1147 | ## Number of free parameters 9 1148 | ## 1149 | ## Number of observations 390 1150 | ## 1151 | ## Estimator ML 1152 | ## Model Fit Test Statistic 7.350 1153 | ## Degrees of freedom 1 1154 | ## P-value (Chi-square) 0.007 1155 | ## 1156 | ## Model test baseline model: 1157 | ## 1158 | ## Minimum Function Test Statistic 210.710 1159 | ## Degrees of freedom 6 1160 | ## P-value 0.000 1161 | ## 1162 | ## User model versus baseline model: 1163 | ## 1164 | ## Comparative Fit Index (CFI) 0.969 1165 | ## Tucker-Lewis Index (TLI) 0.814 1166 | ## 1167 | ## Loglikelihood and Information Criteria: 1168 | ## 1169 | ## Loglikelihood user model (H0) -2109.862 1170 | ## Loglikelihood unrestricted model (H1) -2106.186 1171 | ## 1172 | ## Number of free parameters 9 1173 | ## Akaike (AIC) 4237.723 1174 | ## Bayesian (BIC) 4273.418 1175 | ## Sample-size adjusted Bayesian (BIC) 4244.862 1176 | ## 1177 | ## Root Mean Square Error of Approximation: 1178 | ## 1179 | ## RMSEA 0.128 1180 | ## 90 Percent Confidence Interval 0.054 0.221 1181 | ## P-value RMSEA <= 0.05 0.042 1182 | ## 1183 | ## Standardized Root Mean Square Residual: 1184 | ## 1185 | ## SRMR 0.027 1186 | ## 1187 | ## Parameter Estimates: 1188 | ## 1189 | ## Information Expected 1190 | ## Information saturated (h1) model Structured 1191 | ## Standard Errors Standard 1192 | ## 1193 | ## Latent Variables: 1194 | ## Estimate Std.Err z-value P(>|z|) Std.lv Std.all 1195 | ## pa1 =~ 1196 | ## age 1.000 0.999 1.000 1197 | ## pa5 =~ 1198 | ## ratio 1.000 0.733 0.734 1199 | ## chol 0.885 0.149 5.938 0.000 0.649 0.650 1200 | ## 1201 | ## Regressions: 1202 | ## Estimate Std.Err z-value P(>|z|) Std.lv Std.all 1203 | ## glyhb ~ 1204 | ## pa1 0.238 0.050 4.789 0.000 0.238 0.238 1205 | ## pa5 0.485 0.099 4.903 0.000 0.355 0.356 1206 | ## 1207 | ## Covariances: 1208 | ## Estimate Std.Err z-value P(>|z|) Std.lv Std.all 1209 | ## pa1 ~~ 1210 | ## pa5 0.207 0.049 4.237 0.000 0.283 0.283 1211 | ## 1212 | ## Variances: 1213 
| ## Estimate Std.Err z-value P(>|z|) Std.lv Std.all 1214 | ## .age 0.000 0.000 0.000 1215 | ## .ratio 0.460 0.092 4.991 0.000 0.460 0.461 1216 | ## .chol 0.576 0.079 7.283 0.000 0.576 0.578 1217 | ## .glyhb 0.767 0.060 12.771 0.000 0.767 0.769 1218 | ## pa1 0.997 0.071 13.964 0.000 1.000 1.000 1219 | ## pa5 0.537 0.107 5.027 0.000 1.000 1.000 1220 | 1221 | ``` r 1222 | parameterEstimates(fit1) 1223 | ``` 1224 | 1225 | ## lhs op rhs est se z pvalue 1226 | ## 1 pa1 =~ age 1.000 0.000 NA NA 1227 | ## 2 pa2 =~ bp.1d 1.000 0.000 NA NA 1228 | ## 3 pa2 =~ bp.1s 5.235 2.648 1.977 0.048 1229 | ## 4 pa3 =~ bmi 1.000 0.000 NA NA 1230 | ## 5 pa3 =~ frame_large 0.483 0.072 6.705 0.000 1231 | ## 6 pa3 =~ frame_small -0.543 0.080 -6.793 0.000 1232 | ## 7 pa4 =~ gender_male 1.000 0.000 NA NA 1233 | ## 8 pa4 =~ waist_to_hip_rat 6.946 2.881 2.411 0.016 1234 | ## 9 pa5 =~ ratio 1.000 0.000 NA NA 1235 | ## 10 pa5 =~ chol 0.612 0.106 5.760 0.000 1236 | ## 11 pa6 =~ time.ppn 1.000 0.000 NA NA 1237 | ## 12 glyhb ~ pa1 0.257 0.049 5.254 0.000 1238 | ## 13 glyhb ~ pa2 0.055 0.061 0.892 0.372 1239 | ## 14 glyhb ~ pa3 0.063 0.134 0.469 0.639 1240 | ## 15 glyhb ~ pa4 0.132 0.290 0.456 0.648 1241 | ## 16 glyhb ~ pa5 0.351 0.090 3.916 0.000 1242 | ## 17 glyhb ~ pa6 0.056 0.046 1.222 0.222 1243 | ## 18 age ~~ age 0.000 0.000 NA NA 1244 | ## 19 bp.1d ~~ bp.1d 0.882 0.084 10.560 0.000 1245 | ## 20 bp.1s ~~ bp.1s -2.164 1.506 -1.437 0.151 1246 | ## 21 bmi ~~ bmi 0.714 0.065 10.975 0.000 1247 | ## 22 frame_large ~~ frame_large 0.126 0.013 9.944 0.000 1248 | ## 23 frame_small ~~ frame_small 0.113 0.014 8.366 0.000 1249 | ## 24 gender_male ~~ gender_male 0.218 0.018 11.936 0.000 1250 | ## 25 waist_to_hip_rat ~~ waist_to_hip_rat -0.199 0.458 -0.435 0.664 1251 | ## 26 ratio ~~ ratio 0.219 0.124 1.771 0.077 1252 | ## 27 chol ~~ chol 0.706 0.068 10.337 0.000 1253 | ## 28 time.ppn ~~ time.ppn 0.000 0.000 NA NA 1254 | ## 29 glyhb ~~ glyhb 0.777 0.059 13.243 0.000 1255 | ## 30 pa1 ~~ pa1 0.997 0.071 13.964 0.000 1256 | ## 31 pa2 ~~ pa2 0.115 0.064 1.802 0.072 1257 | ## 32 pa3 ~~ pa3 0.283 0.064 4.423 0.000 1258 | ## 33 pa4 ~~ pa4 0.025 0.012 2.035 0.042 1259 | ## 34 pa5 ~~ pa5 0.778 0.141 5.508 0.000 1260 | ## 35 pa6 ~~ pa6 0.997 0.071 13.964 0.000 1261 | ## 36 pa1 ~~ pa2 0.088 0.050 1.766 0.077 1262 | ## 37 pa1 ~~ pa3 0.130 0.037 3.510 0.000 1263 | ## 38 pa1 ~~ pa4 0.040 0.019 2.163 0.031 1264 | ## 39 pa1 ~~ pa5 0.187 0.051 3.681 0.000 1265 | ## 40 pa1 ~~ pa6 -0.039 0.051 -0.778 0.437 1266 | ## 41 pa2 ~~ pa3 0.019 0.012 1.564 0.118 1267 | ## 42 pa2 ~~ pa4 0.003 0.003 1.249 0.212 1268 | ## 43 pa2 ~~ pa5 0.023 0.015 1.499 0.134 1269 | ## 44 pa2 ~~ pa6 -0.008 0.010 -0.840 0.401 1270 | ## 45 pa3 ~~ pa4 0.027 0.013 2.113 0.035 1271 | ## 46 pa3 ~~ pa5 0.182 0.039 4.618 0.000 1272 | ## 47 pa3 ~~ pa6 0.036 0.034 1.062 0.288 1273 | ## 48 pa4 ~~ pa5 0.034 0.016 2.109 0.035 1274 | ## 49 pa4 ~~ pa6 0.000 0.007 0.003 0.998 1275 | ## 50 pa5 ~~ pa6 -0.037 0.050 -0.733 0.464 1276 | ## ci.lower ci.upper 1277 | ## 1 1.000 1.000 1278 | ## 2 1.000 1.000 1279 | ## 3 0.046 10.425 1280 | ## 4 1.000 1.000 1281 | ## 5 0.341 0.624 1282 | ## 6 -0.700 -0.386 1283 | ## 7 1.000 1.000 1284 | ## 8 1.300 12.592 1285 | ## 9 1.000 1.000 1286 | ## 10 0.403 0.820 1287 | ## 11 1.000 1.000 1288 | ## 12 0.161 0.353 1289 | ## 13 -0.065 0.174 1290 | ## 14 -0.199 0.325 1291 | ## 15 -0.436 0.700 1292 | ## 16 0.175 0.527 1293 | ## 17 -0.034 0.146 1294 | ## 18 0.000 0.000 1295 | ## 19 0.718 1.046 1296 | ## 20 -5.116 0.787 1297 | ## 21 0.587 0.842 1298 | ## 22 0.101 0.151 1299 
| ## 23 0.087 0.140 1300 | ## 24 0.182 0.254 1301 | ## 25 -1.096 0.698 1302 | ## 26 -0.023 0.462 1303 | ## 27 0.573 0.840 1304 | ## 28 0.000 0.000 1305 | ## 29 0.662 0.892 1306 | ## 30 0.857 1.137 1307 | ## 31 -0.010 0.241 1308 | ## 32 0.158 0.409 1309 | ## 33 0.001 0.049 1310 | ## 34 0.501 1.055 1311 | ## 35 0.857 1.137 1312 | ## 36 -0.010 0.186 1313 | ## 37 0.057 0.203 1314 | ## 38 0.004 0.077 1315 | ## 39 0.088 0.287 1316 | ## 40 -0.138 0.060 1317 | ## 41 -0.005 0.044 1318 | ## 42 -0.002 0.008 1319 | ## 43 -0.007 0.053 1320 | ## 44 -0.027 0.011 1321 | ## 45 0.002 0.052 1322 | ## 46 0.105 0.260 1323 | ## 47 -0.031 0.104 1324 | ## 48 0.002 0.067 1325 | ## 49 -0.014 0.014 1326 | ## 50 -0.135 0.061 1327 | -------------------------------------------------------------------------------- /case_studies/data/diabetes.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/data/diabetes.sav -------------------------------------------------------------------------------- /case_studies/figures/cs1-unnamed-chunk-12-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-12-1.pdf -------------------------------------------------------------------------------- /case_studies/figures/cs1-unnamed-chunk-12-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-12-1.png -------------------------------------------------------------------------------- /case_studies/figures/cs1-unnamed-chunk-15-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-15-1.png -------------------------------------------------------------------------------- /case_studies/figures/cs1-unnamed-chunk-18-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-18-1.png -------------------------------------------------------------------------------- /case_studies/figures/cs1-unnamed-chunk-19-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-19-1.png -------------------------------------------------------------------------------- /case_studies/figures/cs1-unnamed-chunk-22-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-22-1.png -------------------------------------------------------------------------------- /case_studies/figures/cs1-unnamed-chunk-23-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-23-1.png -------------------------------------------------------------------------------- /case_studies/figures/cs1-unnamed-chunk-24-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-24-1.png -------------------------------------------------------------------------------- /case_studies/figures/cs1-unnamed-chunk-25-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-25-1.png -------------------------------------------------------------------------------- /case_studies/figures/cs1-unnamed-chunk-26-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-26-1.png -------------------------------------------------------------------------------- /case_studies/figures/cs1-unnamed-chunk-29-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-29-1.png -------------------------------------------------------------------------------- /case_studies/figures/cs1-unnamed-chunk-33-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-33-1.png -------------------------------------------------------------------------------- /case_studies/figures/cs1-unnamed-chunk-9-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-9-1.pdf -------------------------------------------------------------------------------- /case_studies/figures/cs1-unnamed-chunk-9-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/case_studies/figures/cs1-unnamed-chunk-9-1.png -------------------------------------------------------------------------------- /ci/scripts/runAllModels.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | cd scripts 3 | for f in *.py; do python "$f"; done -------------------------------------------------------------------------------- /data/aircraft.csv: -------------------------------------------------------------------------------- 1 | "","X1","X2","X3","X4","Y" 2 | "1",6.3,1.7,8176,4500,2.76 3 | "2",6,1.9,6699,3120,4.76 4 | "3",5.9,1.5,9663,6300,8.75 5 | "4",3,1.2,12837,9800,7.78 6 | "5",5,1.8,10205,4900,6.18 7 | "6",6.3,2,14890,6500,9.5 8 | "7",5.6,1.6,13836,8920,5.14 9 | "8",3.6,1.2,11628,14500,4.76 10 | "9",2,1.4,15225,14800,16.7 11 | 
"10",2.9,2.3,18691,10900,27.68 12 | "11",2.2,1.9,19350,16000,26.64 13 | "12",3.9,2.6,20638,16000,13.71 14 | "13",4.5,2,12843,7800,12.31 15 | "14",4.3,9.7,13384,17900,15.73 16 | "15",4,2.9,13307,10500,13.59 17 | "16",3.2,4.3,29855,24500,51.9 18 | "17",4.3,4.3,29277,30000,20.78 19 | "18",2.4,2.6,24651,24500,29.82 20 | "19",2.8,3.7,28539,34000,32.78 21 | "20",3.9,3.3,8085,8160,10.12 22 | "21",2.8,3.9,30328,35800,27.84 23 | "22",1.6,4.1,46172,37000,107.1 24 | "23",3.4,2.5,17836,19600,11.19 25 | -------------------------------------------------------------------------------- /data/awards.csv: -------------------------------------------------------------------------------- 1 | id,num_awards,prog,math 2 | 45,1,3,41 3 | 108,1,1,41 4 | 15,1,3,44 5 | 67,1,3,42 6 | 153,1,3,40 7 | 51,1,1,42 8 | 164,1,3,46 9 | 133,1,3,40 10 | 2,1,3,33 11 | 53,1,3,46 12 | 1,1,3,40 13 | 128,0,2,38 14 | 16,1,3,44 15 | 106,1,3,37 16 | 89,1,3,40 17 | 134,1,1,39 18 | 19,1,1,43 19 | 145,0,3,38 20 | 11,1,2,45 21 | 117,0,3,39 22 | 109,1,1,42 23 | 12,1,3,45 24 | 37,1,3,40 25 | 69,0,3,40 26 | 43,1,2,43 27 | 196,1,2,49 28 | 36,1,1,44 29 | 155,1,1,46 30 | 6,0,2,46 31 | 4,1,2,41 32 | 25,0,1,42 33 | 107,0,3,47 34 | 5,1,2,43 35 | 47,1,2,49 36 | 140,1,3,40 37 | 22,1,3,39 38 | 18,1,3,49 39 | 30,0,2,42 40 | 40,0,1,43 41 | 176,0,2,41 42 | 126,0,1,57 43 | 197,0,2,50 44 | 46,0,2,44 45 | 49,0,3,39 46 | 8,0,2,52 47 | 124,1,3,41 48 | 13,0,3,39 49 | 111,0,1,39 50 | 142,0,3,52 51 | 193,1,2,48 52 | 105,3,2,45 53 | 58,2,3,40 54 | 129,3,1,46 55 | 38,3,2,50 56 | 182,0,2,43 57 | 115,0,1,43 58 | 14,1,2,54 59 | 175,1,1,42 60 | 44,2,3,45 61 | 86,2,1,54 62 | 72,3,3,47 63 | 41,1,2,45 64 | 191,0,2,43 65 | 138,1,3,40 66 | 9,0,3,52 67 | 151,1,3,52 68 | 119,0,1,45 69 | 55,1,2,49 70 | 73,1,2,53 71 | 28,0,1,54 72 | 90,2,2,50 73 | 17,0,2,48 74 | 102,0,2,51 75 | 70,0,1,41 76 | 148,1,3,51 77 | 54,0,1,46 78 | 42,0,3,55 79 | 87,0,1,46 80 | 21,2,1,61 81 | 181,1,2,45 82 | 165,1,3,54 83 | 78,1,2,54 84 | 76,1,2,51 85 | 29,0,1,49 86 | 91,1,3,56 87 | 52,2,2,53 88 | 10,1,1,49 89 | 85,3,1,57 90 | 50,0,1,42 91 | 56,1,3,46 92 | 64,1,3,45 93 | 130,1,1,55 94 | 141,1,3,47 95 | 74,0,2,50 96 | 83,1,3,41 97 | 31,0,1,52 98 | 172,1,2,57 99 | 184,1,3,53 100 | 75,1,3,51 101 | 187,1,1,57 102 | 113,1,2,51 103 | 162,0,3,40 104 | 110,2,3,50 105 | 150,2,3,57 106 | 167,0,1,35 107 | 77,1,2,49 108 | 35,0,1,50 109 | 158,1,1,55 110 | 112,0,2,48 111 | 48,0,2,52 112 | 147,1,2,53 113 | 7,1,2,59 114 | 65,2,2,66 115 | 168,0,2,57 116 | 190,1,2,54 117 | 178,0,3,57 118 | 159,1,2,54 119 | 120,0,2,54 120 | 116,0,2,54 121 | 79,2,2,49 122 | 98,1,3,51 123 | 122,3,2,58 124 | 179,1,2,60 125 | 198,1,2,51 126 | 189,1,2,63 127 | 199,1,2,50 128 | 156,1,2,53 129 | 166,0,2,53 130 | 160,0,2,55 131 | 152,1,2,56 132 | 183,0,2,49 133 | 94,1,2,61 134 | 149,0,1,49 135 | 131,0,2,57 136 | 24,0,2,66 137 | 99,0,1,56 138 | 171,3,2,60 139 | 104,1,2,57 140 | 81,1,2,59 141 | 97,1,2,58 142 | 20,0,2,57 143 | 163,3,2,64 144 | 195,0,1,60 145 | 84,0,1,54 146 | 27,1,2,61 147 | 118,1,1,58 148 | 71,0,1,56 149 | 63,0,1,60 150 | 185,0,2,55 151 | 127,3,2,57 152 | 177,0,2,62 153 | 188,0,2,56 154 | 60,0,2,51 155 | 66,2,3,56 156 | 173,0,1,61 157 | 186,1,2,63 158 | 96,5,2,61 159 | 101,0,2,67 160 | 3,0,2,48 161 | 170,1,2,61 162 | 92,0,1,57 163 | 62,0,1,48 164 | 135,2,2,65 165 | 26,4,2,62 166 | 139,1,2,61 167 | 121,0,3,53 168 | 144,1,1,58 169 | 146,1,2,64 170 | 137,3,2,65 171 | 123,1,1,56 172 | 169,1,1,63 173 | 34,3,2,57 174 | 33,2,2,72 175 | 32,0,3,66 176 | 114,0,2,62 177 | 125,1,2,58 178 | 59,1,2,63 179 | 23,3,2,64 180 | 161,2,2,72 181 | 
103,0,2,64 182 | 194,6,2,69 183 | 136,4,2,70 184 | 154,1,2,66 185 | 157,0,1,58 186 | 93,2,2,62 187 | 39,2,2,67 188 | 88,1,2,64 189 | 192,2,2,63 190 | 80,1,2,68 191 | 200,1,2,75 192 | 180,0,2,69 193 | 82,1,2,65 194 | 174,2,2,71 195 | 95,5,2,71 196 | 61,1,2,60 197 | 100,2,2,71 198 | 143,2,3,75 199 | 68,1,2,71 200 | 57,0,2,72 201 | 132,3,2,73 202 | -------------------------------------------------------------------------------- /data/binary.dta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/data/binary.dta -------------------------------------------------------------------------------- /data/cereals.txt: -------------------------------------------------------------------------------- 1 | name mfr type calories protein fat sodium fiber carbo sugars potass vitamins shelf weight cups rating 2 | 100%_Bran N C 70 4 1 130 10 5 6 280 25 3 1 0.33 68.402973 3 | 100%_Natural_Bran Q C 120 3 5 15 2 8 8 135 0 3 1 1 33.983679 4 | All-Bran K C 70 4 1 260 9 7 5 320 25 3 1 0.33 59.425505 5 | All-Bran_with_Extra_Fiber K C 50 4 0 140 14 8 0 330 25 3 1 0.5 93.704912 6 | Almond_Delight R C 110 2 2 200 1 14 8 -1 25 3 1 0.75 34.384843 7 | Apple_Cinnamon_Cheerios G C 110 2 2 180 1.5 10.5 10 70 25 1 1 0.75 29.509541 8 | Apple_Jacks K C 110 2 0 125 1 11 14 30 25 2 1 1 33.174094 9 | Basic_4 G C 130 3 2 210 2 18 8 100 25 3 1.33 0.75 37.038562 10 | Bran_Chex R C 90 2 1 200 4 15 6 125 25 1 1 0.67 49.120253 11 | Bran_Flakes P C 90 3 0 210 5 13 5 190 25 3 1 0.67 53.313813 12 | Cap'n'Crunch Q C 120 1 2 220 0 12 12 35 25 2 1 0.75 18.042851 13 | Cheerios G C 110 6 2 290 2 17 1 105 25 1 1 1.25 50.764999 14 | Cinnamon_Toast_Crunch G C 120 1 3 210 0 13 9 45 25 2 1 0.75 19.823573 15 | Clusters G C 110 3 2 140 2 13 7 105 25 3 1 0.5 40.400208 16 | Cocoa_Puffs G C 110 1 1 180 0 12 13 55 25 2 1 1 22.736446 17 | Corn_Chex R C 110 2 0 280 0 22 3 25 25 1 1 1 41.445019 18 | Corn_Flakes K C 100 2 0 290 1 21 2 35 25 1 1 1 45.863324 19 | Corn_Pops K C 110 1 0 90 1 13 12 20 25 2 1 1 35.782791 20 | Count_Chocula G C 110 1 1 180 0 12 13 65 25 2 1 1 22.396513 21 | Cracklin'_Oat_Bran K C 110 3 3 140 4 10 7 160 25 3 1 0.5 40.448772 22 | Cream_of_Wheat_(Quick) N H 100 3 0 80 1 21 0 -1 0 2 1 1 64.533816 23 | Crispix K C 110 2 0 220 1 21 3 30 25 3 1 1 46.895644 24 | Crispy_Wheat_&_Raisins G C 100 2 1 140 2 11 10 120 25 3 1 0.75 36.176196 25 | Double_Chex R C 100 2 0 190 1 18 5 80 25 3 1 0.75 44.330856 26 | Froot_Loops K C 110 2 1 125 1 11 13 30 25 2 1 1 32.207582 27 | Frosted_Flakes K C 110 1 0 200 1 14 11 25 25 1 1 0.75 31.435973 28 | Frosted_Mini-Wheats K C 100 3 0 0 3 14 7 100 25 2 1 0.8 58.345141 29 | Fruit_&_Fibre_Dates,_Walnuts,_and_Oats P C 120 3 2 160 5 12 10 200 25 3 1.25 0.67 40.917047 30 | Fruitful_Bran K C 120 3 0 240 5 14 12 190 25 3 1.33 0.67 41.015492 31 | Fruity_Pebbles P C 110 1 1 135 0 13 12 25 25 2 1 0.75 28.025765 32 | Golden_Crisp P C 100 2 0 45 0 11 15 40 25 1 1 0.88 35.252444 33 | Golden_Grahams G C 110 1 1 280 0 15 9 45 25 2 1 0.75 23.804043 34 | Grape_Nuts_Flakes P C 100 3 1 140 3 15 5 85 25 3 1 0.88 52.076897 35 | Grape-Nuts P C 110 3 0 170 3 17 3 90 25 3 1 0.25 53.371007 36 | Great_Grains_Pecan P C 120 3 3 75 3 13 4 100 25 3 1 0.33 45.811716 37 | Honey_Graham_Ohs Q C 120 1 2 220 1 12 11 45 25 2 1 1 21.871292 38 | Honey_Nut_Cheerios G C 110 3 1 250 1.5 11.5 10 90 25 1 1 0.75 31.072217 39 | Honey-comb P C 110 1 0 180 0 14 11 35 25 1 1 1.33 28.742414 40 | 
Just_Right_Crunchy__Nuggets K C 110 2 1 170 1 17 6 60 100 3 1 1 36.523683 41 | Just_Right_Fruit_&_Nut K C 140 3 1 170 2 20 9 95 100 3 1.3 0.75 36.471512 42 | Kix G C 110 2 1 260 0 21 3 40 25 2 1 1.5 39.241114 43 | Life Q C 100 4 2 150 2 12 6 95 25 2 1 0.67 45.328074 44 | Lucky_Charms G C 110 2 1 180 0 12 12 55 25 2 1 1 26.734515 45 | Maypo A H 100 4 1 0 0 16 3 95 25 2 1 1 54.850917 46 | Muesli_Raisins,_Dates,_&_Almonds R C 150 4 3 95 3 16 11 170 25 3 1 1 37.136863 47 | Muesli_Raisins,_Peaches,_&_Pecans R C 150 4 3 150 3 16 11 170 25 3 1 1 34.139765 48 | Mueslix_Crispy_Blend K C 160 3 2 150 3 17 13 160 25 3 1.5 0.67 30.313351 49 | Multi-Grain_Cheerios G C 100 2 1 220 2 15 6 90 25 1 1 1 40.105965 50 | Nut&Honey_Crunch K C 120 2 1 190 0 15 9 40 25 2 1 0.67 29.924285 51 | Nutri-Grain_Almond-Raisin K C 140 3 2 220 3 21 7 130 25 3 1.33 0.67 40.692320 52 | Nutri-grain_Wheat K C 90 3 0 170 3 18 2 90 25 3 1 1 59.642837 53 | Oatmeal_Raisin_Crisp G C 130 3 2 170 1.5 13.5 10 120 25 3 1.25 0.5 30.450843 54 | Post_Nat._Raisin_Bran P C 120 3 1 200 6 11 14 260 25 3 1.33 0.67 37.840594 55 | Product_19 K C 100 3 0 320 1 20 3 45 100 3 1 1 41.503540 56 | Puffed_Rice Q C 50 1 0 0 0 13 0 15 0 3 0.5 1 60.756112 57 | Puffed_Wheat Q C 50 2 0 0 1 10 0 50 0 3 0.5 1 63.005645 58 | Quaker_Oat_Squares Q C 100 4 1 135 2 14 6 110 25 3 1 0.5 49.511874 59 | Quaker_Oatmeal Q H 100 5 2 0 2.7 -1 -1 110 0 1 1 0.67 50.828392 60 | Raisin_Bran K C 120 3 1 210 5 14 12 240 25 2 1.33 0.75 39.259197 61 | Raisin_Nut_Bran G C 100 3 2 140 2.5 10.5 8 140 25 3 1 0.5 39.703400 62 | Raisin_Squares K C 90 2 0 0 2 15 6 110 25 3 1 0.5 55.333142 63 | Rice_Chex R C 110 1 0 240 0 23 2 30 25 1 1 1.13 41.998933 64 | Rice_Krispies K C 110 2 0 290 0 22 3 35 25 1 1 1 40.560159 65 | Shredded_Wheat N C 80 2 0 0 3 16 0 95 0 1 0.83 1 68.235885 66 | Shredded_Wheat_'n'Bran N C 90 3 0 0 4 19 0 140 0 1 1 0.67 74.472949 67 | Shredded_Wheat_spoon_size N C 90 3 0 0 3 20 0 120 0 1 1 0.67 72.801787 68 | Smacks K C 110 2 1 70 1 9 15 40 25 2 1 0.75 31.230054 69 | Special_K K C 110 6 0 230 1 16 3 55 25 1 1 1 53.131324 70 | Strawberry_Fruit_Wheats N C 90 2 0 15 3 15 5 90 25 2 1 1 59.363993 71 | Total_Corn_Flakes G C 110 2 1 200 0 21 3 35 100 3 1 1 38.839746 72 | Total_Raisin_Bran G C 140 3 1 190 4 15 14 230 100 3 1.5 1 28.592785 73 | Total_Whole_Grain G C 100 3 1 200 3 16 3 110 100 3 1 1 46.658844 74 | Triples G C 110 2 1 250 0 21 3 60 25 3 1 0.75 39.106174 75 | Trix G C 110 1 1 140 0 13 12 25 25 2 1 1 27.753301 76 | Wheat_Chex R C 100 3 1 230 3 17 3 115 25 1 1 0.67 49.787445 77 | Wheaties G C 100 3 1 200 3 17 3 110 25 1 1 1 51.592193 78 | Wheaties_Honey_Gold G C 110 2 1 200 1 16 8 60 25 1 1 0.75 36.187559 79 | -------------------------------------------------------------------------------- /data/child_data.csv: -------------------------------------------------------------------------------- 1 | age,mem_span,iq,read_ab 2 | 6.7,4.4,95,7.2 3 | 5.9,4,90,6 4 | 5.5,4.1,105,6 5 | 6.2,4.8,98,6.6 6 | 6.4,5,106,7 7 | 7.3,5.5,100,7.2 8 | 5.7,3.6,88,5.3 9 | 6.15,5,95,6.4 10 | 7.5,5.4,96,6.6 11 | 6.9,5,104,7.3 12 | 4.1,3.9,108,5 13 | 5.5,4.2,90,5.8 14 | 6.9,4.5,91,6.6 15 | 7.2,5,92,6.8 16 | 4,4.2,101,5.6 17 | 7.3,5.5,100,7.2 18 | 5.9,4,90,6 19 | 5.5,4.2,90,5.8 20 | 4,4.2,101,5.6 21 | 5.9,4,90,6 -------------------------------------------------------------------------------- /data/drugtrial.csv: -------------------------------------------------------------------------------- 1 | subject,gender,dose,score 2 | 1,1,1,6 3 | 2,1,1,6 4 | 3,1,1,3 5 | 4,1,1,5 6 | 5,1,1,6 7 | 6,1,1,4 8 | 
7,1,1,5 9 | 8,1,1,4 10 | 9,1,1,4 11 | 10,1,1,5 12 | 11,1,1,4 13 | 12,1,1,3 14 | 13,1,2,6 15 | 14,1,2,8 16 | 15,1,2,7 17 | 16,1,2,8 18 | 17,1,2,6 19 | 18,1,2,8 20 | 19,1,2,8 21 | 20,1,2,6 22 | 21,1,2,7 23 | 22,1,2,8 24 | 23,1,2,6 25 | 24,1,2,7 26 | 25,2,1,2 27 | 26,2,1,5 28 | 27,2,1,2 29 | 28,2,1,4 30 | 29,2,1,5 31 | 30,2,1,7 32 | 31,2,1,4 33 | 32,2,1,1 34 | 33,2,1,2 35 | 34,2,1,7 36 | 35,2,1,4 37 | 36,2,1,0 38 | 37,2,2,2 39 | 38,2,2,3 40 | 39,2,2,4 41 | 40,2,2,0 42 | 41,2,2,0 43 | 42,2,2,1 44 | 43,2,2,2 45 | 44,2,2,2 46 | 45,2,2,4 47 | 46,2,2,3 48 | 47,2,2,6 49 | 48,2,2,3 -------------------------------------------------------------------------------- /data/hsbdemo.dta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/data/hsbdemo.dta -------------------------------------------------------------------------------- /data/iqdata.csv: -------------------------------------------------------------------------------- 1 | group,iq 2 | 1,44 3 | 1,40 4 | 1,44 5 | 1,39 6 | 1,25 7 | 1,37 8 | 1,31 9 | 1,40 10 | 1,22 11 | 1,34 12 | 1,39 13 | 1,20 14 | 1,39 15 | 1,42 16 | 1,41 17 | 2,36 18 | 2,40 19 | 2,37 20 | 2,35 21 | 2,39 22 | 2,40 23 | 2,36 24 | 2,38 25 | 2,24 26 | 2,27 27 | 2,29 28 | 2,24 29 | 2,45 30 | 2,44 31 | 2,44 32 | 3,52 33 | 3,50 34 | 3,51 35 | 3,52 36 | 3,45 37 | 3,49 38 | 3,47 39 | 3,46 40 | 3,47 41 | 3,47 42 | 3,46 43 | 3,45 44 | 3,50 45 | -------------------------------------------------------------------------------- /data/ologit.dta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/data/ologit.dta -------------------------------------------------------------------------------- /data/scents.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/data/scents.sav -------------------------------------------------------------------------------- /data/temprate.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/data/temprate.sav -------------------------------------------------------------------------------- /models/linearRegression.stan: -------------------------------------------------------------------------------- 1 | /* 2 | * Linear Regression 3 | * ----------------------------------------------------- 4 | * Copyright: Murat Koptur 5 | * Date: 05/08/2018 6 | * License: GPLv3 7 | */ 8 | 9 | data { 10 | int N; // number of observations 11 | vector[N] x; // day x_i 12 | vector[N] y; // weight in grams on day x_i 13 | } 14 | 15 | parameters { 16 | real alpha; // intercept 17 | real beta; // slope 18 | real<lower=0> sigma; // std deviation, constrained positive 19 | } 20 | 21 | model { 22 | alpha ~ normal(0, 100); 23 | beta ~ normal(0, 100); 24 | sigma ~ cauchy(0, 10); 25 | y ~ normal(alpha + beta * x, sigma); 26 | } 27 | 28 | generated quantities { 29 | // http://mc-stan.org/loo/reference/extract_log_lik.html 30 | vector[N] log_lik; 31 | for (n in 1:N) 32 | log_lik[n] = normal_lpdf(y[n] | alpha + beta * x[n], sigma); 33 | } --------------------------------------------------------------------------------
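The normal(0, 100) and cauchy(0, 10) priors above are very diffuse relative to the rat-growth data this model is fitted to in scripts/linearRegression.py. A minimal prior-predictive sketch (not a repo file; plain numpy, with the day values taken from that script) shows what the priors imply before any fitting:

```python
# Hypothetical prior-predictive check for linearRegression.stan (illustrative).
import numpy as np

rng = np.random.RandomState(1)
days = np.array([8.0, 15.0, 22.0, 29.0, 36.0])   # x from scripts/linearRegression.py

alpha = rng.normal(0, 100, size=1000)            # intercept prior
beta = rng.normal(0, 100, size=1000)             # slope prior
sigma = np.abs(rng.standard_cauchy(1000)) * 10   # half-Cauchy(0, 10) noise scale

y_sim = alpha[:, None] + beta[:, None] * days + rng.normal(0.0, sigma[:, None], size=(1000, 5))
print(np.percentile(y_sim, [5, 50, 95], axis=0))
```

Simulated weights routinely span several thousand grams, positive or negative, against observed values of 160-324 g; that is acceptable for weakly informative priors on five observations, but worth knowing when interpreting the posterior.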
/models/logisticRegression.stan: -------------------------------------------------------------------------------- 1 | /* 2 | * Logistic Regression 3 | * ----------------------------------------------------- 4 | * Copyright: Murat Koptur 5 | * Date: 09/08/2018 6 | * License: GPLv3 7 | */ 8 | 9 | data { 10 | int N_train; 11 | int N_test; 12 | int D; 13 | row_vector[D] x_train[N_train]; 14 | row_vector[D] x_test[N_test]; 15 | int y_train[N_train]; 16 | } 17 | 18 | parameters { 19 | real alpha; 20 | vector[D] beta; 21 | } 22 | 23 | model { 24 | alpha ~ normal(0, 10); 25 | beta ~ student_t(1, 0, 2.5); // weakly informative priors 26 | for (n in 1:N_train) 27 | y_train[n] ~ bernoulli_logit(x_train[n] * beta + alpha); 28 | } 29 | 30 | generated quantities { 31 | int y_pred[N_test]; 32 | for (n in 1:N_test) { 33 | y_pred[n] = bernoulli_logit_rng(x_test[n] * beta + alpha); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /models/multinomialLogisticRegression.stan: -------------------------------------------------------------------------------- 1 | /* 2 | * Multinomial Logistic Regression 3 | * ----------------------------------------------------- 4 | * Copyright: Murat Koptur 5 | * Date: 11/08/2018 6 | * License: GPLv3 7 | */ 8 | 9 | data { 10 | int N; // number of observations 11 | int K; // number of possible outcomes 12 | int D; // D is dimension of x_n vectors 13 | vector[D] x[N]; 14 | int y[N]; 15 | } 16 | 17 | parameters { 18 | matrix[K, D] beta; 19 | } 20 | 21 | model { 22 | for (k in 1:K) 23 | beta[k] ~ normal(0, 1); 24 | for (n in 1:N) 25 | y[n] ~ categorical_logit(beta * x[n]); 26 | } 27 | -------------------------------------------------------------------------------- /models/multipleLinearRegression.stan: -------------------------------------------------------------------------------- 1 | /* 2 | * Multiple Linear Regression 3 | * ----------------------------------------------------- 4 | * Copyright: Murat Koptur 5 | * Date: 06/08/2018 6 | * License: GPLv3 7 | */ 8 | 9 | data { 10 | int N; // number of observations 11 | vector[N] fat; // grams of fat 12 | vector[N] weight; // weight in ounces of one serving 13 | vector[N] cups; // number of cups in one serving 14 | vector[N] rating; // a rating of the cereals 15 | } 16 | 17 | parameters { 18 | real b_fat; // coefficients 19 | real b_weight; 20 | real b_cups; 21 | real beta; 22 | real<lower=0> sigma; // std deviation, constrained positive 23 | } 24 | 25 | model { 26 | b_fat ~ normal(0, 10); 27 | b_weight ~ normal(0, 10); 28 | b_cups ~ normal(0, 10); 29 | beta ~ normal(0, 10); 30 | sigma ~ cauchy(0, 5); 31 | rating ~ normal(beta + b_fat * fat + b_weight * weight + 32 | b_cups * cups, sigma); 33 | } 34 | 35 | generated quantities { 36 | real rating_pred[N]; // predictions 37 | real log_lik[N]; 38 | for (n in 1:N) 39 | rating_pred[n] = normal_rng(beta + b_fat * fat[n] + b_weight * weight[n] + 40 | b_cups * cups[n], sigma); 41 | for (n in 1:N) 42 | log_lik[n] = normal_lpdf(rating[n] | beta + b_fat * fat[n] + b_weight * weight[n] + 43 | b_cups * cups[n], sigma); 44 | } 45 | --------------------------------------------------------------------------------
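This model's data block expects four parallel vectors taken from the cereal ratings table shown earlier. A minimal, illustrative sketch of one way to assemble and pass them (column names follow the header of data/cereals.txt; not a repo file):

```python
# Hypothetical data assembly for multipleLinearRegression.stan (illustrative).
import pandas as pd
import pystan

cereals = pd.read_csv("../data/cereals.txt", sep=r"\s+")  # whitespace-delimited table
stan_data = {
    'N': len(cereals),
    'fat': cereals['fat'].values,
    'weight': cereals['weight'].values,
    'cups': cereals['cups'].values,
    'rating': cereals['rating'].values,
}
sm = pystan.StanModel(file="../models/multipleLinearRegression.stan")
fit = sm.sampling(data=stan_data)
```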
/models/onewayANOVA.stan: -------------------------------------------------------------------------------- 1 | /* 2 | * One-way ANOVA 3 | * ----------------------------------------------------- 4 | * Copyright: Murat Koptur 5 | * Date: 17/08/2018 6 | * License: GPLv3 7 | */ 8 | 9 | data { 10 | int N; 11 | int x1[N]; 12 | int x2[N]; 13 | int y[N]; 14 | } 15 | 16 | parameters { 17 | real alpha; 18 | real beta_x1; 19 | real beta_x2; 20 | real<lower=0> sigma; 21 | } 22 | 23 | model { 24 | alpha ~ normal(0, 10); 25 | beta_x1 ~ normal(0, 10); 26 | beta_x2 ~ normal(0, 10); 27 | sigma ~ normal(0, 5); 28 | for (i in 1:N) 29 | y[i] ~ normal(alpha + beta_x1 * x1[i] + beta_x2 * x2[i], sigma); 30 | } 31 | -------------------------------------------------------------------------------- /models/orderedLogisticRegression.stan: -------------------------------------------------------------------------------- 1 | /* 2 | * Ordered Logistic Regression 3 | * ----------------------------------------------------- 4 | * Copyright: Murat Koptur 5 | * Date: 13/08/2018 6 | * License: GPLv3 7 | */ 8 | 9 | data { 10 | int N; 11 | int D; 12 | int K; 13 | row_vector[D] x[N]; 14 | int y[N]; 15 | } 16 | 17 | parameters { 18 | vector[D] beta; 19 | ordered[K-1] c; 20 | } 21 | 22 | model { 23 | for (n in 1:N) 24 | y[n] ~ ordered_logistic(x[n] * beta, c); 25 | } 26 | -------------------------------------------------------------------------------- /models/robustRegression.stan: -------------------------------------------------------------------------------- 1 | /* 2 | * Robust Regression 3 | * ----------------------------------------------------- 4 | * Copyright: Murat Koptur 5 | * Date: 08/08/2018 6 | * License: GPLv3 7 | */ 8 | 9 | data { 10 | int N; // number of observations 11 | vector[N] X1; // Aspect Ratio 12 | vector[N] X2; // Lift-to-Drag Ratio 13 | vector[N] X3; // Weight 14 | vector[N] X4; // Thrust 15 | vector[N] Y; // Cost 16 | } 17 | 18 | parameters { 19 | real b_X1; 20 | real b_X2; 21 | real b_X3; 22 | real b_X4; 23 | real beta; 24 | real<lower=0> sigma; 25 | real<lower=0> nu; 26 | } 27 | 28 | model { 29 | b_X1 ~ normal(0, 1e6); 30 | b_X2 ~ normal(0, 1e6); 31 | b_X3 ~ normal(0, 1e6); 32 | b_X4 ~ normal(0, 1e6); 33 | beta ~ normal(0, 1e3); 34 | sigma ~ normal(0, 5); 35 | nu ~ gamma(2, 0.1); 36 | Y ~ student_t(nu, 37 | beta + b_X1 * X1 + b_X2 * X2 + b_X3 * X3 + b_X4 * X4, 38 | sigma); 39 | } 40 | 41 | generated quantities { 42 | real Y_pred[N]; // predictions 43 | for (n in 1:N) { 44 | Y_pred[n] = student_t_rng(nu, 45 | beta + b_X1 * X1[n] + 46 | b_X2 * X2[n] + b_X3 * X3[n] + 47 | b_X4 * X4[n], 48 | sigma); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /models/twowayANOVA.stan: -------------------------------------------------------------------------------- 1 | /* 2 | * Two-way ANOVA 3 | * ----------------------------------------------------- 4 | * Copyright: Murat Koptur 5 | * Date: 17/08/2018 6 | * License: GPLv3 7 | */ 8 | 9 | data { 10 | int N; 11 | int x1[N]; 12 | int x2[N]; 13 | int y[N]; 14 | } 15 | 16 | parameters { 17 | real alpha; 18 | real beta_x1; 19 | real beta_x2; 20 | real beta_x3; 21 | real<lower=0> sigma; 22 | } 23 | 24 | model { 25 | alpha ~ normal(0, 10); 26 | beta_x1 ~ normal(0, 10); 27 | beta_x2 ~ normal(0, 10); 28 | beta_x3 ~ normal(0, 10); 29 | sigma ~ normal(0, 5); 30 | for (i in 1:N) 31 | y[i] ~ normal(alpha + beta_x1 * x1[i] + beta_x2 * x2[i] + beta_x3 * x1[i] * x2[i], sigma); 32 | } 33 | -------------------------------------------------------------------------------- /notebooks/Bayes Factor.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Bayes Factors" 3 | author: "Murat Koptur" 4 | date: "`r format(Sys.time(), '%d %B %Y')`" 5 | output: rmarkdown::github_document 6 | --- 7 | 8 | ```{r echo=FALSE} 9 | knitr::opts_chunk$set(fig.path='figures/bf-') 10 | ``` 11 | 12 | ```{r} 13 | library(haven) 14 |
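# haven reads the SPSS file below; BayesFactor (loaded next) supplies ttestBF() for the JZS one-sample test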
library(BayesFactor) 15 | ``` 16 | 17 | ```{r} 18 | scents <- read_spss("../data/scents.sav") 19 | head(scents) 20 | ``` 21 | 22 | ```{r} 23 | scents$diffs <- scents$noscent - scents$scent 24 | head(scents) 25 | ``` 26 | 27 | ```{r} 28 | bf <- ttestBF(scents$diffs) 29 | bf 30 | ``` 31 | 32 | ```{r} 33 | sprintf("Bayes factor: %f", exp(bf@bayesFactor$bf)) 34 | ``` 35 | 36 | -------------------------------------------------------------------------------- /notebooks/Bayes_Factor.md: -------------------------------------------------------------------------------- 1 | Bayes Factors 2 | ================ 3 | Murat Koptur 4 | 25 Ağustos 2018 5 | 6 | ``` r 7 | library(haven) 8 | library(BayesFactor) 9 | ``` 10 | 11 | ## Loading required package: coda 12 | 13 | ## Loading required package: Matrix 14 | 15 | ## ************ 16 | ## Welcome to BayesFactor 0.9.12-4.2. If you have questions, please contact Richard Morey (richarddmorey@gmail.com). 17 | ## 18 | ## Type BFManual() to open the manual. 19 | ## ************ 20 | 21 | ``` r 22 | scents <- read_spss("../data/scents.sav") 23 | head(scents) 24 | ``` 25 | 26 | ## # A tibble: 6 x 4 27 | ## part sex noscent scent 28 | ## 29 | ## 1 1 1 27.7 30.6 30 | ## 2 2 2 57.2 43.3 31 | ## 3 3 1 57.9 53.4 32 | ## 4 4 1 38 37.4 33 | ## 5 5 1 57.9 48.6 34 | ## 6 6 2 32 35.5 35 | 36 | ``` r 37 | scents$diffs <- scents$noscent - scents$scent 38 | head(scents) 39 | ``` 40 | 41 | ## # A tibble: 6 x 5 42 | ## part sex noscent scent diffs 43 | ## 44 | ## 1 1 1 27.7 30.6 -2.9 45 | ## 2 2 2 57.2 43.3 13.9 46 | ## 3 3 1 57.9 53.4 4.5 47 | ## 4 4 1 38 37.4 0.6 48 | ## 5 5 1 57.9 48.6 9.30 49 | ## 6 6 2 32 35.5 -3.5 50 | 51 | ``` r 52 | bf <- ttestBF(scents$diffs) 53 | bf 54 | ``` 55 | 56 | ## Bayes factor analysis 57 | ## -------------- 58 | ## [1] Alt., r=0.707 : 0.2294321 ±0.03% 59 | ## 60 | ## Against denominator: 61 | ## Null, mu = 0 62 | ## --- 63 | ## Bayes factor type: BFoneSample, JZS 64 | 65 | ``` r 66 | sprintf("Bayes factor: %f", exp(bf@bayesFactor$bf)) 67 | ``` 68 | 69 | ## [1] "Bayes factor: 0.229432" 70 | -------------------------------------------------------------------------------- /notebooks/Correlation Analysis.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Correlation Analysis" 3 | author: "Murat Koptur" 4 | date: "`r format(Sys.time(), '%d %B %Y')`" 5 | output: rmarkdown::github_document 6 | --- 7 | 8 | ```{r echo=FALSE} 9 | knitr::opts_chunk$set(fig.path='figures/corr-') 10 | ``` 11 | 12 | ```{r} 13 | library(ggpubr) 14 | library(haven) 15 | ``` 16 | 17 | ```{r} 18 | temprate <- read_sav("../data/temprate.sav") 19 | head(temprate) 20 | ``` 21 | 22 | ```{r} 23 | cor.test(temprate$temp, temprate$hrtrate, method = "pearson") 24 | ``` 25 | 26 | ```{r} 27 | ggscatter( 28 | data = temprate, 29 | x = "temp", 30 | y = "hrtrate", 31 | add = "reg.line", 32 | conf.int = TRUE, 33 | cor.coef = TRUE, 34 | cor.method = "pearson", 35 | xlab = "Temperature", 36 | ylab = "Heart Rate" 37 | ) 38 | ``` 39 | 40 | -------------------------------------------------------------------------------- /notebooks/Correlation_Analysis.md: -------------------------------------------------------------------------------- 1 | Correlation Analysis 2 | ================ 3 | Murat Koptur 4 | 24 Ağustos 2018 5 | 6 | ``` r 7 | library(ggpubr) 8 | ``` 9 | 10 | ## Loading required package: ggplot2 11 | 12 | ## Loading required package: magrittr 13 | 14 | ``` r 15 | library(haven) 16 | ``` 17 | 18 | ``` r 19 | temprate <- 
read_sav("../data/temprate.sav") 20 | head(temprate) 21 | ``` 22 | 23 | ## # A tibble: 6 x 2 24 | ## temp hrtrate 25 | ## 26 | ## 1 35.7 70 27 | ## 2 35.9 71 28 | ## 3 36.1 74 29 | ## 4 36.1 80 30 | ## 5 36.2 73 31 | ## 6 36.2 75 32 | 33 | ``` r 34 | cor.test(temprate$temp, temprate$hrtrate, method = "pearson") 35 | ``` 36 | 37 | ## 38 | ## Pearson's product-moment correlation 39 | ## 40 | ## data: temprate$temp and temprate$hrtrate 41 | ## t = 2.9668, df = 128, p-value = 0.003591 42 | ## alternative hypothesis: true correlation is not equal to 0 43 | ## 95 percent confidence interval: 44 | ## 0.08519113 0.40802170 45 | ## sample estimates: 46 | ## cor 47 | ## 0.2536564 48 | 49 | ``` r 50 | ggscatter( 51 | data = temprate, 52 | x = "temp", 53 | y = "hrtrate", 54 | add = "reg.line", 55 | conf.int = TRUE, 56 | cor.coef = TRUE, 57 | cor.method = "pearson", 58 | xlab = "Temperature", 59 | ylab = "Heart Rate" 60 | ) 61 | ``` 62 | 63 | ![](figures/corr-unnamed-chunk-5-1.png) 64 | -------------------------------------------------------------------------------- /notebooks/Factor Analysis.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Factor Analysis" 3 | author: "Murat Koptur" 4 | date: "`r format(Sys.time(), '%d %B %Y')`" 5 | output: rmarkdown::github_document 6 | --- 7 | 8 | ```{r echo=FALSE} 9 | knitr::opts_chunk$set(fig.path='figures/factor-') 10 | ``` 11 | 12 | ```{r} 13 | library(readr) 14 | library(knitr) 15 | library(psych) 16 | ``` 17 | 18 | ```{r results='asis'} 19 | bfi <- read_csv("../data/bfi.csv", 20 | col_types = cols(X1 = col_skip(), age = col_skip(), 21 | education = col_skip(), gender = col_skip())) 22 | kable(head(bfi)) 23 | ``` 24 | ```{r} 25 | KMO(bfi) 26 | ``` 27 | ```{r} 28 | fa.parallel(bfi) 29 | ``` 30 | ```{r} 31 | bfi.fa <- fa(bfi, nfactors = 6, fm="pa", max.iter = 100) 32 | fa.diagram(bfi.fa) 33 | ``` 34 | 35 | -------------------------------------------------------------------------------- /notebooks/Factor_Analysis.md: -------------------------------------------------------------------------------- 1 | Factor Analysis 2 | ================ 3 | Murat Koptur 4 | 24 Ağustos 2018 5 | 6 | ``` r 7 | library(readr) 8 | library(knitr) 9 | library(psych) 10 | ``` 11 | 12 | ``` r 13 | bfi <- read_csv("../data/bfi.csv", 14 | col_types = cols(X1 = col_skip(), age = col_skip(), 15 | education = col_skip(), gender = col_skip())) 16 | ``` 17 | 18 | ## Warning: Missing column names filled in: 'X1' [1] 19 | 20 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, : 21 | ## length of NULL cannot be changed 22 | 23 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, : 24 | ## length of NULL cannot be changed 25 | 26 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, : 27 | ## length of NULL cannot be changed 28 | 29 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, : 30 | ## length of NULL cannot be changed 31 | 32 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, : 33 | ## length of NULL cannot be changed 34 | 35 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, : 36 | ## length of NULL cannot be changed 37 | 38 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, : 39 | ## length of NULL cannot be changed 40 | 41 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, : 42 | ## length of NULL cannot be changed 43 | 44 | ## Warning 
in read_tokens_(data, tokenizer, col_specs, col_names, locale_, : 45 | ## length of NULL cannot be changed 46 | 47 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, : 48 | ## length of NULL cannot be changed 49 | 50 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, : 51 | ## length of NULL cannot be changed 52 | 53 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, : 54 | ## length of NULL cannot be changed 55 | 56 | ``` r 57 | kable(head(bfi)) 58 | ``` 59 | 60 | | A1| A2| A3| A4| A5| C1| C2| C3| C4| C5| E1| E2| E3| E4| E5| N1| N2| N3| N4| N5| O1| O2| O3| O4| O5| 61 | |----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:|----:| 62 | | 2| 4| 3| 4| 4| 2| 3| 3| 4| 4| 3| 3| 3| 4| 4| 3| 4| 2| 2| 3| 3| 6| 3| 4| 3| 63 | | 2| 4| 5| 2| 5| 5| 4| 4| 3| 4| 1| 1| 6| 4| 3| 3| 3| 3| 5| 5| 4| 2| 4| 3| 3| 64 | | 5| 4| 5| 4| 4| 4| 5| 4| 2| 5| 2| 4| 4| 4| 5| 4| 5| 4| 2| 3| 4| 2| 5| 5| 2| 65 | | 4| 4| 6| 5| 5| 4| 4| 3| 5| 5| 5| 3| 4| 4| 4| 2| 5| 2| 4| 1| 3| 3| 4| 3| 5| 66 | | 2| 3| 3| 4| 5| 4| 4| 5| 3| 2| 2| 2| 5| 4| 5| 2| 3| 4| 4| 3| 3| 3| 4| 3| 3| 67 | | 6| 6| 5| 6| 5| 6| 6| 6| 1| 3| 2| 1| 6| 5| 6| 3| 5| 2| 2| 3| 4| 3| 5| 6| 1| 68 | 69 | ``` r 70 | KMO(bfi) 71 | ``` 72 | 73 | ## Kaiser-Meyer-Olkin factor adequacy 74 | ## Call: KMO(r = bfi) 75 | ## Overall MSA = 0.85 76 | ## MSA for each item = 77 | ## A1 A2 A3 A4 A5 C1 C2 C3 C4 C5 E1 E2 E3 E4 E5 78 | ## 0.74 0.84 0.87 0.87 0.90 0.83 0.79 0.85 0.82 0.86 0.83 0.88 0.89 0.87 0.89 79 | ## N1 N2 N3 N4 N5 O1 O2 O3 O4 O5 80 | ## 0.78 0.78 0.86 0.88 0.86 0.85 0.78 0.84 0.76 0.76 81 | 82 | ``` r 83 | fa.parallel(bfi) 84 | ``` 85 | 86 | ![](figures/factor-unnamed-chunk-5-1.png) 87 | 88 | ## Parallel analysis suggests that the number of factors = 6 and the number of components = 6 89 | 90 | ``` r 91 | bfi.fa <- fa(bfi, nfactors = 6, fm="pa", max.iter = 100) 92 | ``` 93 | 94 | ## Loading required namespace: GPArotation 95 | 96 | ``` r 97 | fa.diagram(bfi.fa) 98 | ``` 99 | 100 | ![](figures/factor-unnamed-chunk-6-1.png) 101 | -------------------------------------------------------------------------------- /notebooks/Multiple Linear Regression with interaction terms.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Multiple Linear Regression with interaction terms" 3 | author: "Murat Koptur" 4 | date: "`r format(Sys.time(), '%d %B %Y')`" 5 | output: rmarkdown::github_document 6 | --- 7 | 8 | ```{r echo=FALSE} 9 | knitr::opts_chunk$set(fig.path='figures/multipleLin-') 10 | ``` 11 | 12 | ```{r} 13 | library(GGally) 14 | library(ggplot2) 15 | library(readr) 16 | library(reshape2) 17 | ``` 18 | 19 | ```{r} 20 | child_data <- read_csv("../data/child_data.csv") 21 | head(child_data) 22 | ``` 23 | 24 | ```{r} 25 | child_data_melted <- melt(child_data) 26 | head(child_data_melted) 27 | 28 | ggplot(data = child_data_melted, aes(x = value)) + 29 | geom_histogram(aes(y = ..ncount..)) + 30 | geom_density(aes(y = ..scaled..)) + 31 | facet_wrap(~variable, scales = "free") + 32 | labs(x = "Values", y = "Frequencies", title = "Histograms") 33 | ``` 34 | 35 | ```{r} 36 | ggpairs(child_data) 37 | ``` 38 | 39 | ```{r} 40 | child_data_scaled <- scale(child_data) 41 | head(child_data_scaled) 42 | 43 | model1 <- lm(read_ab ~ age + iq, data = as.data.frame(child_data_scaled)) 44 | summary(model1) 45 | 46 | model2 <- lm(read_ab ~ age + mem_span, data = 
as.data.frame(child_data_scaled)) 47 | summary(model2) 48 | 49 | model3 <- lm(read_ab ~ age + iq + age:iq, data = as.data.frame(child_data_scaled)) 50 | summary(model3) 51 | ``` 52 | 53 | -------------------------------------------------------------------------------- /notebooks/Multiple_Linear_Regression_with_interaction_terms.md: -------------------------------------------------------------------------------- 1 | Multiple Linear Regression with interaction terms 2 | ================ 3 | Murat Koptur 4 | 24 Ağustos 2018 5 | 6 | ``` r 7 | library(GGally) 8 | ``` 9 | 10 | ## Loading required package: ggplot2 11 | 12 | ``` r 13 | library(ggplot2) 14 | library(readr) 15 | library(reshape2) 16 | ``` 17 | 18 | ``` r 19 | child_data <- read_csv("../data/child_data.csv") 20 | ``` 21 | 22 | ## Parsed with column specification: 23 | ## cols( 24 | ## age = col_double(), 25 | ## mem_span = col_double(), 26 | ## iq = col_integer(), 27 | ## read_ab = col_double() 28 | ## ) 29 | 30 | ``` r 31 | head(child_data) 32 | ``` 33 | 34 | ## # A tibble: 6 x 4 35 | ## age mem_span iq read_ab 36 | ## 37 | ## 1 6.7 4.4 95 7.2 38 | ## 2 5.9 4 90 6 39 | ## 3 5.5 4.1 105 6 40 | ## 4 6.2 4.8 98 6.6 41 | ## 5 6.4 5 106 7 42 | ## 6 7.3 5.5 100 7.2 43 | 44 | ``` r 45 | child_data_melted <- melt(child_data) 46 | ``` 47 | 48 | ## No id variables; using all as measure variables 49 | 50 | ``` r 51 | head(child_data_melted) 52 | ``` 53 | 54 | ## variable value 55 | ## 1 age 6.7 56 | ## 2 age 5.9 57 | ## 3 age 5.5 58 | ## 4 age 6.2 59 | ## 5 age 6.4 60 | ## 6 age 7.3 61 | 62 | ``` r 63 | ggplot(data = child_data_melted, aes(x = value)) + 64 | geom_histogram(aes(y = ..ncount..)) + 65 | geom_density(aes(y = ..scaled..)) + 66 | facet_wrap(~variable, scales = "free") + 67 | labs(x = "Values", y = "Frequencies", title = "Histograms") 68 | ``` 69 | 70 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 71 | 72 | ![](figures/multipleLin-unnamed-chunk-4-1.png) 73 | 74 | ``` r 75 | ggpairs(child_data) 76 | ``` 77 | 78 | ![](figures/multipleLin-unnamed-chunk-5-1.png) 79 | 80 | ``` r 81 | child_data_scaled <- scale(child_data) 82 | head(child_data_scaled) 83 | ``` 84 | 85 | ## age mem_span iq read_ab 86 | ## [1,] 0.6268603 -0.2164352 -0.2376403 1.309125 87 | ## [2,] -0.1188471 -0.9090277 -1.0297747 -0.436375 88 | ## [3,] -0.4917008 -0.7358796 1.3466285 -0.436375 89 | ## [4,] 0.1607932 0.4761574 0.2376403 0.436375 90 | ## [5,] 0.3472200 0.8224536 1.5050553 1.018208 91 | ## [6,] 1.1861409 1.6881943 0.5544941 1.309125 92 | 93 | ``` r 94 | model1 <- lm(read_ab ~ age + iq, data = as.data.frame(child_data_scaled)) 95 | summary(model1) 96 | ``` 97 | 98 | ## 99 | ## Call: 100 | ## lm(formula = read_ab ~ age + iq, data = as.data.frame(child_data_scaled)) 101 | ## 102 | ## Residuals: 103 | ## Min 1Q Median 3Q Max 104 | ## -0.85644 -0.02059 0.04402 0.20506 0.81633 105 | ## 106 | ## Coefficients: 107 | ## Estimate Std. Error t value Pr(>|t|) 108 | ## (Intercept) -2.302e-16 9.998e-02 0.000 1.00000 109 | ## age 9.117e-01 1.047e-01 8.711 1.12e-07 *** 110 | ## iq 3.313e-01 1.047e-01 3.165 0.00565 ** 111 | ## --- 112 | ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1 113 | ## 114 | ## Residual standard error: 0.4471 on 17 degrees of freedom 115 | ## Multiple R-squared: 0.8211, Adjusted R-squared: 0.8001 116 | ## F-statistic: 39.02 on 2 and 17 DF, p-value: 4.434e-07 117 | 118 | ``` r 119 | model2 <- lm(read_ab ~ age + mem_span, data = as.data.frame(child_data_scaled)) 120 | summary(model2) 121 | ``` 122 | 123 | ## 124 | ## Call: 125 | ## lm(formula = read_ab ~ age + mem_span, data = as.data.frame(child_data_scaled)) 126 | ## 127 | ## Residuals: 128 | ## Min 1Q Median 3Q Max 129 | ## -0.9536 -0.2206 0.0244 0.1668 1.0719 130 | ## 131 | ## Coefficients: 132 | ## Estimate Std. Error t value Pr(>|t|) 133 | ## (Intercept) 1.363e-16 1.038e-01 0.000 1.00000 134 | ## age 5.296e-01 1.542e-01 3.435 0.00316 ** 135 | ## mem_span 4.377e-01 1.542e-01 2.839 0.01135 * 136 | ## --- 137 | ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 138 | ## 139 | ## Residual standard error: 0.4643 on 17 degrees of freedom 140 | ## Multiple R-squared: 0.8071, Adjusted R-squared: 0.7844 141 | ## F-statistic: 35.57 on 2 and 17 DF, p-value: 8.414e-07 142 | 143 | ``` r 144 | model3 <- lm(read_ab ~ age + iq + age:iq, data = as.data.frame(child_data_scaled)) 145 | summary(model3) 146 | ``` 147 | 148 | ## 149 | ## Call: 150 | ## lm(formula = read_ab ~ age + iq + age:iq, data = as.data.frame(child_data_scaled)) 151 | ## 152 | ## Residuals: 153 | ## Min 1Q Median 3Q Max 154 | ## -0.82042 -0.08630 -0.01172 0.18550 0.89331 155 | ## 156 | ## Coefficients: 157 | ## Estimate Std. Error t value Pr(>|t|) 158 | ## (Intercept) 0.03942 0.09964 0.396 0.69764 159 | ## age 0.79560 0.12613 6.308 1.04e-05 *** 160 | ## iq 0.38369 0.10642 3.605 0.00237 ** 161 | ## age:iq 0.20914 0.13667 1.530 0.14549 162 | ## --- 163 | ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 
0.1 ' ' 1 164 | ## 165 | ## Residual standard error: 0.4305 on 16 degrees of freedom 166 | ## Multiple R-squared: 0.844, Adjusted R-squared: 0.8147 167 | ## F-statistic: 28.85 on 3 and 16 DF, p-value: 1.089e-06 168 | -------------------------------------------------------------------------------- /notebooks/Poisson Regression.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Poisson Regression" 3 | author: "Murat Koptur" 4 | date: "`r format(Sys.time(), '%d %B %Y')`" 5 | output: rmarkdown::github_document 6 | --- 7 | 8 | ```{r echo=FALSE} 9 | knitr::opts_chunk$set(fig.path='figures/poisson-') 10 | ``` 11 | 12 | 13 | ```{r} 14 | library(bayesplot) 15 | library(ggplot2) 16 | library(readr) 17 | library(reshape2) 18 | library(rstanarm) 19 | ``` 20 | 21 | ```{r} 22 | awards <- read_csv("../data/awards.csv", 23 | col_types = cols(id = col_skip(), prog = col_factor(levels = c("1", "2", "3")))) 24 | head(awards) 25 | ``` 26 | 27 | ```{r} 28 | awards_melted <- melt(awards) 29 | head(awards_melted) 30 | ``` 31 | 32 | ```{r} 33 | ggplot(data = awards_melted, aes(x = value)) + 34 | geom_histogram(aes(y = ..ncount..)) + 35 | geom_density(aes(y = ..scaled..)) + 36 | facet_wrap(~variable, scales = "free") + 37 | labs(x = "Values", y = "Frequencies", title = "Histograms") 38 | ``` 39 | 40 | ```{r} 41 | awards$math <- scale(awards$math) 42 | ``` 43 | 44 | ```{r} 45 | model1 <- glm(num_awards ~ math + prog, data = awards, family = poisson) 46 | summary(model1) 47 | ``` 48 | 49 | ```{r} 50 | model2 <- stan_glm(num_awards ~ math + prog, data = awards, family = poisson, 51 | prior = normal(0, 10), prior_intercept = normal(0, 10)) 52 | summary(model2) 53 | ``` 54 | 55 | ```{r} 56 | posterior_interval(model2, prob = 0.95) 57 | plot(model2, plotfun = "areas", prob = 0.95) 58 | ``` 59 | 60 | ```{r} 61 | pp_check(model2) 62 | ``` 63 | 64 | -------------------------------------------------------------------------------- /notebooks/Poisson_Regression.md: -------------------------------------------------------------------------------- 1 | Poisson Regression 2 | ================ 3 | Murat Koptur 4 | 24 Ağustos 2018 5 | 6 | ``` r 7 | library(bayesplot) 8 | ``` 9 | 10 | ## This is bayesplot version 1.6.0 11 | 12 | ## - Online documentation and vignettes at mc-stan.org/bayesplot 13 | 14 | ## - bayesplot theme set to bayesplot::theme_default() 15 | 16 | ## * Does _not_ affect other ggplot2 plots 17 | 18 | ## * See ?bayesplot_theme_set for details on theme setting 19 | 20 | ``` r 21 | library(ggplot2) 22 | library(readr) 23 | library(reshape2) 24 | library(rstanarm) 25 | ``` 26 | 27 | ## Loading required package: Rcpp 28 | 29 | ## rstanarm (Version 2.17.4, packaged: 2018-04-13 01:51:52 UTC) 30 | 31 | ## - Do not expect the default priors to remain the same in future rstanarm versions. 32 | 33 | ## Thus, R scripts should specify priors explicitly, even if they are just the defaults. 34 | 35 | ## - For execution on a local, multicore CPU with excess RAM we recommend calling 36 | 37 | ## options(mc.cores = parallel::detectCores()) 38 | 39 | ## - Plotting theme set to bayesplot::theme_default(). 
40 | 41 | ``` r 42 | awards <- read_csv("../data/awards.csv", 43 | col_types = cols(id = col_skip(), prog = col_factor(levels = c("1", "2", "3")))) 44 | ``` 45 | 46 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, : 47 | ## length of NULL cannot be changed 48 | 49 | ## Warning in read_tokens_(data, tokenizer, col_specs, col_names, locale_, : 50 | ## length of NULL cannot be changed 51 | 52 | ``` r 53 | head(awards) 54 | ``` 55 | 56 | ## # A tibble: 6 x 3 57 | ## num_awards prog math 58 | ## 59 | ## 1 1 3 41 60 | ## 2 1 1 41 61 | ## 3 1 3 44 62 | ## 4 1 3 42 63 | ## 5 1 3 40 64 | ## 6 1 1 42 65 | 66 | ``` r 67 | awards_melted <- melt(awards) 68 | ``` 69 | 70 | ## Using prog as id variables 71 | 72 | ``` r 73 | head(awards_melted) 74 | ``` 75 | 76 | ## prog variable value 77 | ## 1 3 num_awards 1 78 | ## 2 1 num_awards 1 79 | ## 3 3 num_awards 1 80 | ## 4 3 num_awards 1 81 | ## 5 3 num_awards 1 82 | ## 6 1 num_awards 1 83 | 84 | ``` r 85 | ggplot(data = awards_melted, aes(x = value)) + 86 | geom_histogram(aes(y = ..ncount..)) + 87 | geom_density(aes(y = ..scaled..)) + 88 | facet_wrap(~variable, scales = "free") + 89 | labs(x = "Values", y = "Frequencies", title = "Histograms") 90 | ``` 91 | 92 | ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. 93 | 94 | ![](figures/poisson-unnamed-chunk-5-1.png) 95 | 96 | ``` r 97 | awards$math <- scale(awards$math) 98 | ``` 99 | 100 | ``` r 101 | model1 <- glm(num_awards ~ math + prog, data = awards, family = poisson) 102 | summary(model1) 103 | ``` 104 | 105 | ## 106 | ## Call: 107 | ## glm(formula = num_awards ~ math + prog, family = poisson, data = awards) 108 | ## 109 | ## Deviance Residuals: 110 | ## Min 1Q Median 3Q Max 111 | ## -1.96335 -1.14818 -0.01392 0.35710 2.52541 112 | ## 113 | ## Coefficients: 114 | ## Estimate Std. Error z value Pr(>|z|) 115 | ## (Intercept) -0.48897 0.19620 -2.492 0.0127 * 116 | ## math 0.33520 0.07817 4.288 1.8e-05 *** 117 | ## prog2 0.45262 0.22475 2.014 0.0440 * 118 | ## prog3 0.56172 0.24748 2.270 0.0232 * 119 | ## --- 120 | ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 121 | ## 122 | ## (Dispersion parameter for poisson family taken to be 1) 123 | ## 124 | ## Null deviance: 228.83 on 199 degrees of freedom 125 | ## Residual deviance: 198.05 on 196 degrees of freedom 126 | ## AIC: 496.36 127 | ## 128 | ## Number of Fisher Scoring iterations: 5 129 | 130 | ``` r 131 | model2 <- stan_glm(num_awards ~ math + prog, data = awards, family = poisson, 132 | prior = normal(0, 10), prior_intercept = normal(0, 10)) 133 | ``` 134 | 135 | ## 136 | ## SAMPLING FOR MODEL 'count' NOW (CHAIN 1). 137 | ## 138 | ## Gradient evaluation took 0.000117 seconds 139 | ## 1000 transitions using 10 leapfrog steps per transition would take 1.17 seconds. 140 | ## Adjust your expectations accordingly! 
141 | ## 142 | ## 143 | ## Iteration: 1 / 2000 [ 0%] (Warmup) 144 | ## Iteration: 200 / 2000 [ 10%] (Warmup) 145 | ## Iteration: 400 / 2000 [ 20%] (Warmup) 146 | ## Iteration: 600 / 2000 [ 30%] (Warmup) 147 | ## Iteration: 800 / 2000 [ 40%] (Warmup) 148 | ## Iteration: 1000 / 2000 [ 50%] (Warmup) 149 | ## Iteration: 1001 / 2000 [ 50%] (Sampling) 150 | ## Iteration: 1200 / 2000 [ 60%] (Sampling) 151 | ## Iteration: 1400 / 2000 [ 70%] (Sampling) 152 | ## Iteration: 1600 / 2000 [ 80%] (Sampling) 153 | ## Iteration: 1800 / 2000 [ 90%] (Sampling) 154 | ## Iteration: 2000 / 2000 [100%] (Sampling) 155 | ## 156 | ## Elapsed Time: 0.289811 seconds (Warm-up) 157 | ## 0.270276 seconds (Sampling) 158 | ## 0.560087 seconds (Total) 159 | ## 160 | ## 161 | ## SAMPLING FOR MODEL 'count' NOW (CHAIN 2). 162 | ## 163 | ## Gradient evaluation took 3.1e-05 seconds 164 | ## 1000 transitions using 10 leapfrog steps per transition would take 0.31 seconds. 165 | ## Adjust your expectations accordingly! 166 | ## 167 | ## 168 | ## Iteration: 1 / 2000 [ 0%] (Warmup) 169 | ## Iteration: 200 / 2000 [ 10%] (Warmup) 170 | ## Iteration: 400 / 2000 [ 20%] (Warmup) 171 | ## Iteration: 600 / 2000 [ 30%] (Warmup) 172 | ## Iteration: 800 / 2000 [ 40%] (Warmup) 173 | ## Iteration: 1000 / 2000 [ 50%] (Warmup) 174 | ## Iteration: 1001 / 2000 [ 50%] (Sampling) 175 | ## Iteration: 1200 / 2000 [ 60%] (Sampling) 176 | ## Iteration: 1400 / 2000 [ 70%] (Sampling) 177 | ## Iteration: 1600 / 2000 [ 80%] (Sampling) 178 | ## Iteration: 1800 / 2000 [ 90%] (Sampling) 179 | ## Iteration: 2000 / 2000 [100%] (Sampling) 180 | ## 181 | ## Elapsed Time: 0.281356 seconds (Warm-up) 182 | ## 0.258399 seconds (Sampling) 183 | ## 0.539755 seconds (Total) 184 | ## 185 | ## 186 | ## SAMPLING FOR MODEL 'count' NOW (CHAIN 3). 187 | ## 188 | ## Gradient evaluation took 3e-05 seconds 189 | ## 1000 transitions using 10 leapfrog steps per transition would take 0.3 seconds. 190 | ## Adjust your expectations accordingly! 191 | ## 192 | ## 193 | ## Iteration: 1 / 2000 [ 0%] (Warmup) 194 | ## Iteration: 200 / 2000 [ 10%] (Warmup) 195 | ## Iteration: 400 / 2000 [ 20%] (Warmup) 196 | ## Iteration: 600 / 2000 [ 30%] (Warmup) 197 | ## Iteration: 800 / 2000 [ 40%] (Warmup) 198 | ## Iteration: 1000 / 2000 [ 50%] (Warmup) 199 | ## Iteration: 1001 / 2000 [ 50%] (Sampling) 200 | ## Iteration: 1200 / 2000 [ 60%] (Sampling) 201 | ## Iteration: 1400 / 2000 [ 70%] (Sampling) 202 | ## Iteration: 1600 / 2000 [ 80%] (Sampling) 203 | ## Iteration: 1800 / 2000 [ 90%] (Sampling) 204 | ## Iteration: 2000 / 2000 [100%] (Sampling) 205 | ## 206 | ## Elapsed Time: 0.273531 seconds (Warm-up) 207 | ## 0.267135 seconds (Sampling) 208 | ## 0.540666 seconds (Total) 209 | ## 210 | ## 211 | ## SAMPLING FOR MODEL 'count' NOW (CHAIN 4). 212 | ## 213 | ## Gradient evaluation took 3.1e-05 seconds 214 | ## 1000 transitions using 10 leapfrog steps per transition would take 0.31 seconds. 215 | ## Adjust your expectations accordingly! 
216 | ## 217 | ## 218 | ## Iteration: 1 / 2000 [ 0%] (Warmup) 219 | ## Iteration: 200 / 2000 [ 10%] (Warmup) 220 | ## Iteration: 400 / 2000 [ 20%] (Warmup) 221 | ## Iteration: 600 / 2000 [ 30%] (Warmup) 222 | ## Iteration: 800 / 2000 [ 40%] (Warmup) 223 | ## Iteration: 1000 / 2000 [ 50%] (Warmup) 224 | ## Iteration: 1001 / 2000 [ 50%] (Sampling) 225 | ## Iteration: 1200 / 2000 [ 60%] (Sampling) 226 | ## Iteration: 1400 / 2000 [ 70%] (Sampling) 227 | ## Iteration: 1600 / 2000 [ 80%] (Sampling) 228 | ## Iteration: 1800 / 2000 [ 90%] (Sampling) 229 | ## Iteration: 2000 / 2000 [100%] (Sampling) 230 | ## 231 | ## Elapsed Time: 0.248926 seconds (Warm-up) 232 | ## 0.250404 seconds (Sampling) 233 | ## 0.49933 seconds (Total) 234 | 235 | ``` r 236 | summary(model2) 237 | ``` 238 | 239 | ## 240 | ## Model Info: 241 | ## 242 | ## function: stan_glm 243 | ## family: poisson [log] 244 | ## formula: num_awards ~ math + prog 245 | ## algorithm: sampling 246 | ## priors: see help('prior_summary') 247 | ## sample: 4000 (posterior sample size) 248 | ## observations: 200 249 | ## predictors: 4 250 | ## 251 | ## Estimates: 252 | ## mean sd 2.5% 25% 50% 75% 97.5% 253 | ## (Intercept) -0.5 0.2 -0.9 -0.6 -0.5 -0.4 -0.1 254 | ## math 0.3 0.1 0.2 0.3 0.3 0.4 0.5 255 | ## prog2 0.5 0.2 0.0 0.3 0.5 0.6 0.9 256 | ## prog3 0.6 0.3 0.1 0.4 0.6 0.7 1.0 257 | ## mean_PPD 1.0 0.1 0.8 0.9 1.0 1.0 1.2 258 | ## log-posterior -252.2 1.4 -255.8 -252.9 -251.9 -251.1 -250.4 259 | ## 260 | ## Diagnostics: 261 | ## mcse Rhat n_eff 262 | ## (Intercept) 0.0 1.0 1997 263 | ## math 0.0 1.0 2485 264 | ## prog2 0.0 1.0 2291 265 | ## prog3 0.0 1.0 2054 266 | ## mean_PPD 0.0 1.0 3751 267 | ## log-posterior 0.0 1.0 1624 268 | ## 269 | ## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1). 
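Because the model uses a log link, exponentiating these estimates turns them into multiplicative effects on the expected number of awards. Using the rounded posterior means above (illustrative arithmetic; shown in Python since the repo carries parallel Python scripts):

```python
import numpy as np
# posterior means for math, prog2, prog3 from the summary above
print(np.exp([0.3, 0.5, 0.6]))  # ~ [1.35, 1.65, 1.82]
# e.g. a one-SD increase in math multiplies the expected award count by ~1.35
```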
270 | 271 | ``` r 272 | posterior_interval(model2, prob = 0.95) 273 | ``` 274 | 275 | ## 2.5% 97.5% 276 | ## (Intercept) -0.89457959 -0.1447066 277 | ## math 0.18111692 0.4915252 278 | ## prog2 0.03168288 0.9214785 279 | ## prog3 0.07135645 1.0449510 280 | 281 | ``` r 282 | plot(model2, plotfun = "areas", prob = 0.95) 283 | ``` 284 | 285 | ![](figures/poisson-unnamed-chunk-9-1.png) 286 | 287 | ``` r 288 | pp_check(model2) 289 | ``` 290 | 291 | ![](figures/poisson-unnamed-chunk-10-1.png) 292 | -------------------------------------------------------------------------------- /notebooks/figures/corr-unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/notebooks/figures/corr-unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /notebooks/figures/factor-unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/notebooks/figures/factor-unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /notebooks/figures/factor-unnamed-chunk-6-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/notebooks/figures/factor-unnamed-chunk-6-1.png -------------------------------------------------------------------------------- /notebooks/figures/multipleLin-unnamed-chunk-4-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/notebooks/figures/multipleLin-unnamed-chunk-4-1.png -------------------------------------------------------------------------------- /notebooks/figures/multipleLin-unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/notebooks/figures/multipleLin-unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /notebooks/figures/poisson-unnamed-chunk-10-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/notebooks/figures/poisson-unnamed-chunk-10-1.png -------------------------------------------------------------------------------- /notebooks/figures/poisson-unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/notebooks/figures/poisson-unnamed-chunk-5-1.png -------------------------------------------------------------------------------- /notebooks/figures/poisson-unnamed-chunk-9-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrtkp9993/Statistical-Modeling-Examples/2701ab970ed3c66d20125f853f77f8380d58ed4f/notebooks/figures/poisson-unnamed-chunk-9-1.png 
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | seaborn==0.9.0 2 | pandas==0.23.4 3 | pystan==2.17.1.0 4 | matplotlib==2.2.2 5 | numpy==1.13.3 6 | scikit_learn==0.19.2 7 | statsmodels==0.9.0 -------------------------------------------------------------------------------- /scripts/Multiple linear regression with interaction terms.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import pandas as pd 4 | import seaborn as sns 5 | import statsmodels.api as sm 6 | 7 | # http://staff.bath.ac.uk/pssiw/stats2/page16/page16.html 8 | df = pd.read_csv("../data/child_data.csv") 9 | print(df.head()) 10 | 11 | sns.set(style="white", palette="muted", color_codes=True) 12 | 13 | f, axes = plt.subplots(2, 2, figsize=(7, 7)) 14 | 15 | sns.distplot(df.age, ax=axes[0, 0]) 16 | sns.distplot(df.mem_span, ax=axes[0, 1]) 17 | sns.distplot(df.iq, ax=axes[1, 0]) 18 | sns.distplot(df.read_ab, ax=axes[1, 1]) 19 | plt.show() 20 | 21 | sns.pairplot(df, vars=['age', 'mem_span', 'iq']) 22 | plt.show() 23 | 24 | # Rescale all variables 25 | for col in df.columns.values: 26 | df[col] = (df[col] - np.mean(df[col]))/(2 * np.std(df[col])) 27 | 28 | print(df.head()) 29 | 30 | # Ordinary multiple linear regression 31 | # Mem_span and age seem correlated, so I'll use one of them 32 | mod1 = sm.formula.ols('read_ab ~ age + iq', data=df).fit() 33 | print(mod1.summary()) 34 | 35 | mod2 = sm.formula.ols('read_ab ~ age + mem_span', data=df).fit() 36 | print(mod2.summary()) 37 | 38 | # Now, add interaction term 39 | mod1 = sm.formula.ols('read_ab ~ age + iq + age:iq', data=df).fit() 40 | print(mod1.summary()) 41 | -------------------------------------------------------------------------------- /scripts/Poisson Regression.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import pandas as pd 4 | import seaborn as sns 5 | import statsmodels.api as sm 6 | 7 | # http://staff.bath.ac.uk/pssiw/stats2/page16/page16.html 8 | df = pd.read_csv("../data/awards.csv", index_col=0) 9 | print(df.head()) 10 | 11 | print(df.describe()) 12 | 13 | df = pd.get_dummies(df, columns=["prog"]) 14 | del df['prog_1'] 15 | print(df.head()) 16 | 17 | df['math'] = (df['math'] - np.mean(df['math']))/(2 * np.std(df['math'])) 18 | print(df.head()) 19 | 20 | X = np.column_stack( 21 | (np.ones((df.shape[0], 1)), df[['math', 'prog_2', 'prog_3']])) 22 | y = df['num_awards'] 23 | 24 | mod = sm.formula.GLM(y, X, family=sm.families.Poisson()).fit() 25 | print(mod.summary()) 26 | 27 | model_fitted_y = mod.fittedvalues 28 | model_residuals = mod.resid_deviance  # deviance residuals of the GLM fit 29 | model_abs_resid = np.abs(model_residuals) 30 | 31 | # https://medium.com/@emredjan/emulating-r-regression-plots-in-python-43741952c034 32 | plot_lm_1 = plt.figure(1) 33 | plot_lm_1.set_figheight(8) 34 | plot_lm_1.set_figwidth(12) 35 | 36 | plot_lm_1.axes[0] = sns.residplot(model_fitted_y, 'num_awards', data=df, 37 | lowess=True, 38 | scatter_kws={'alpha': 0.5}, 39 | line_kws={'color': 'red', 'lw': 2, 'alpha': 0.8}) 40 | 41 | plot_lm_1.axes[0].set_title('Residuals vs Fitted') 42 | plot_lm_1.axes[0].set_xlabel('Fitted values') 43 | plot_lm_1.axes[0].set_ylabel('Residuals') 44 | plt.show() 45 | --------------------------------------------------------------------------------
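The two helper modules that follow are what the model scripts lean on: stan_utility for MCMC diagnostics and psis for PSIS-LOO scoring of the log_lik matrices the Stan models emit. A minimal sketch of the intended call pattern, including the Pareto-k screening recommended in the psis docstring (the fit argument is any pystan fit whose model defines log_lik; this wrapper itself is hypothetical, not a repo file):

```python
# Hypothetical convenience wrapper around the helper modules (illustrative).
import numpy as np
from helper import psis, stan_utility

def diagnose_and_score(fit):
    """Run all MCMC diagnostics, then PSIS-LOO; flag observations with k > 0.7."""
    stan_utility.check_all_diagnostics(fit)   # n_eff, Rhat, divergences, tree depth, E-BFMI
    loo, loos, ks = psis.psisloo(fit.extract()['log_lik'])
    unstable = np.where(ks > 0.7)[0]          # threshold from the references in psis.py
    if unstable.size:
        print('unreliable PSIS-LOO estimates for observations:', unstable)
    return loo
```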
/scripts/helper/psis.py: -------------------------------------------------------------------------------- 1 | """Pareto smoothed importance sampling (PSIS) 2 | 3 | This module implements Pareto smoothed importance sampling (PSIS) and PSIS 4 | leave-one-out (LOO) cross-validation for Python (Numpy). 5 | 6 | Included functions 7 | ------------------ 8 | psisloo 9 | Pareto smoothed importance sampling leave-one-out log predictive densities. 10 | 11 | psislw 12 | Pareto smoothed importance sampling. 13 | 14 | gpdfitnew 15 | Estimate the paramaters for the Generalized Pareto Distribution (GPD). 16 | 17 | gpinv 18 | Inverse Generalised Pareto distribution function. 19 | 20 | sumlogs 21 | Sum of vector where numbers are represented by their logarithms. 22 | 23 | References 24 | ---------- 25 | Aki Vehtari, Andrew Gelman and Jonah Gabry (2017). Practical 26 | Bayesian model evaluation using leave-one-out cross-validation 27 | and WAIC. Statistics and Computing, 27(5):1413–1432. 28 | doi:10.1007/s11222-016-9696-4. https://arxiv.org/abs/1507.04544 29 | 30 | Aki Vehtari, Andrew Gelman and Jonah Gabry (2017). Pareto 31 | smoothed importance sampling. https://arxiv.org/abs/arXiv:1507.02646v5 32 | 33 | """ 34 | 35 | from __future__ import division # For Python 2 compatibility 36 | import numpy as np 37 | 38 | # 3-Clause BSD License 39 | """ 40 | Copyright 2017 Aki Vehtari, Tuomas Sivula 41 | 42 | Redistribution and use in source and binary forms, with or without modification, 43 | are permitted provided that the following conditions are met: 44 | 45 | 1. Redistributions of source code must retain the above copyright notice, this 46 | list of conditions and the following disclaimer. 47 | 48 | 2. Redistributions in binary form must reproduce the above copyright notice, 49 | this list of conditions and the following disclaimer in the documentation and/or 50 | other materials provided with the distribution. 51 | 52 | 3. Neither the name of the copyright holder nor the names of its contributors 53 | may be used to endorse or promote products derived from this software without 54 | specific prior written permission. 55 | 56 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 57 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 58 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 59 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 60 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 61 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 62 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 63 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 64 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 65 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """ 66 | 67 | 68 | def psisloo(log_lik, **kwargs): 69 | r"""PSIS leave-one-out log predictive densities. 70 | 71 | Computes the log predictive densities given posterior samples of the log 72 | likelihood terms :math:`p(y_i|\theta^s)` in input parameter `log_lik`. 73 | Returns a sum of the leave-one-out log predictive densities `loo`, 74 | individual leave-one-out log predictive density terms `loos` and an estimate 75 | of Pareto tail indeces `ks`. The estimates are unreliable if tail index 76 | ``k > 0.7`` (see more in the references listed in the module docstring). 
77 | 78 | Additional keyword arguments are passed to the :meth:`psislw()` function 79 | (see the corresponding documentation). 80 | 81 | Parameters 82 | ---------- 83 | log_lik : ndarray 84 | Array of size n x m containing n posterior samples of the log likelihood 85 | terms :math:`p(y_i|\theta^s)`. 86 | 87 | Returns 88 | ------- 89 | loo : scalar 90 | sum of the leave-one-out log predictive densities 91 | 92 | loos : ndarray 93 | individual leave-one-out log predictive density terms 94 | 95 | ks : ndarray 96 | estimated Pareto tail indeces 97 | 98 | """ 99 | # ensure overwrite flag in passed arguments 100 | kwargs['overwrite_lw'] = True 101 | # log raw weights from log_lik 102 | lw = -log_lik 103 | # compute Pareto smoothed log weights given raw log weights 104 | lw, ks = psislw(lw, **kwargs) 105 | # compute 106 | lw += log_lik 107 | loos = sumlogs(lw, axis=0) 108 | loo = loos.sum() 109 | return loo, loos, ks 110 | 111 | 112 | def psislw(lw, Reff=1.0, overwrite_lw=False): 113 | """Pareto smoothed importance sampling (PSIS). 114 | 115 | Parameters 116 | ---------- 117 | lw : ndarray 118 | Array of size n x m containing m sets of n log weights. It is also 119 | possible to provide one dimensional array of length n. 120 | 121 | Reff : scalar, optional 122 | relative MCMC efficiency ``N_eff / N`` 123 | 124 | overwrite_lw : bool, optional 125 | If True, the input array `lw` is smoothed in-place, assuming the array 126 | is F-contiguous. By default, a new array is allocated. 127 | 128 | Returns 129 | ------- 130 | lw_out : ndarray 131 | smoothed log weights 132 | kss : ndarray 133 | Pareto tail indices 134 | 135 | """ 136 | if lw.ndim == 2: 137 | n, m = lw.shape 138 | elif lw.ndim == 1: 139 | n = len(lw) 140 | m = 1 141 | else: 142 | raise ValueError("Argument `lw` must be 1 or 2 dimensional.") 143 | if n <= 1: 144 | raise ValueError("More than one log-weight needed.") 145 | 146 | if overwrite_lw and lw.flags.f_contiguous: 147 | # in-place operation 148 | lw_out = lw 149 | else: 150 | # allocate new array for output 151 | lw_out = np.copy(lw, order='F') 152 | 153 | # allocate output array for kss 154 | kss = np.empty(m) 155 | 156 | # precalculate constants 157 | cutoff_ind = - int(np.ceil(min(0.2 * n, 3 * np.sqrt(n / Reff)))) - 1 158 | cutoffmin = np.log(np.finfo(float).tiny) 159 | logn = np.log(n) 160 | k_min = 1/3 161 | 162 | # loop over sets of log weights 163 | for i, x in enumerate(lw_out.T if lw_out.ndim == 2 else lw_out[None, :]): 164 | # improve numerical accuracy 165 | x -= np.max(x) 166 | # sort the array 167 | x_sort_ind = np.argsort(x) 168 | # divide log weights into body and right tail 169 | xcutoff = max( 170 | x[x_sort_ind[cutoff_ind]], 171 | cutoffmin 172 | ) 173 | expxcutoff = np.exp(xcutoff) 174 | tailinds, = np.where(x > xcutoff) 175 | x2 = x[tailinds] 176 | n2 = len(x2) 177 | if n2 <= 4: 178 | # not enough tail samples for gpdfitnew 179 | k = np.inf 180 | else: 181 | # order of tail samples 182 | x2si = np.argsort(x2) 183 | # fit generalized Pareto distribution to the right tail samples 184 | np.exp(x2, out=x2) 185 | x2 -= expxcutoff 186 | k, sigma = gpdfitnew(x2, sort=x2si) 187 | if k >= k_min and not np.isinf(k): 188 | # no smoothing if short tail or GPD fit failed 189 | # compute ordered statistic for the fit 190 | sti = np.arange(0.5, n2) 191 | sti /= n2 192 | qq = gpinv(sti, k, sigma) 193 | qq += expxcutoff 194 | np.log(qq, out=qq) 195 | # place the smoothed tail into the output array 196 | x[tailinds[x2si]] = qq 197 | # truncate smoothed values to the largest raw 
weight 0 198 | x[x > 0] = 0 199 | # renormalize weights 200 | x -= sumlogs(x) 201 | # store tail index k 202 | kss[i] = k 203 | 204 | # If the provided input array is one dimensional, return kss as scalar. 205 | if lw_out.ndim == 1: 206 | kss = kss[0] 207 | 208 | return lw_out, kss 209 | 210 | 211 | def gpdfitnew(x, sort=True, sort_in_place=False, return_quadrature=False): 212 | """Estimate the paramaters for the Generalized Pareto Distribution (GPD) 213 | 214 | Returns empirical Bayes estimate for the parameters of the two-parameter 215 | generalized Parato distribution given the data. 216 | 217 | Parameters 218 | ---------- 219 | x : ndarray 220 | One dimensional data array 221 | 222 | sort : bool or ndarray, optional 223 | If known in advance, one can provide an array of indices that would 224 | sort the input array `x`. If the input array is already sorted, provide 225 | False. If True (default behaviour), the array is sorted internally. 226 | 227 | sort_in_place : bool, optional 228 | If `sort` is True and `sort_in_place` is True, the array is sorted 229 | in-place (False by default). 230 | 231 | return_quadrature : bool, optional 232 | If True, quadrature points and weight `ks` and `w` of the marginal posterior distribution of k are also calculated and returned. False by 233 | default. 234 | 235 | Returns 236 | ------- 237 | k, sigma : float 238 | estimated parameter values 239 | 240 | ks, w : ndarray 241 | Quadrature points and weights of the marginal posterior distribution 242 | of `k`. Returned only if `return_quadrature` is True. 243 | 244 | Notes 245 | ----- 246 | This function returns a negative of Zhang and Stephens's k, because it is 247 | more common parameterisation. 248 | 249 | """ 250 | if x.ndim != 1 or len(x) <= 1: 251 | raise ValueError("Invalid input array.") 252 | 253 | # check if x should be sorted 254 | if sort is True: 255 | if sort_in_place: 256 | x.sort() 257 | xsorted = True 258 | else: 259 | sort = np.argsort(x) 260 | xsorted = False 261 | elif sort is False: 262 | xsorted = True 263 | else: 264 | xsorted = False 265 | 266 | n = len(x) 267 | PRIOR = 3 268 | m = 30 + int(np.sqrt(n)) 269 | 270 | bs = np.arange(1, m + 1, dtype=float) 271 | bs -= 0.5 272 | np.divide(m, bs, out=bs) 273 | np.sqrt(bs, out=bs) 274 | np.subtract(1, bs, out=bs) 275 | if xsorted: 276 | bs /= PRIOR * x[int(n/4 + 0.5) - 1] 277 | bs += 1 / x[-1] 278 | else: 279 | bs /= PRIOR * x[sort[int(n/4 + 0.5) - 1]] 280 | bs += 1 / x[sort[-1]] 281 | 282 | ks = np.negative(bs) 283 | temp = ks[:,None] * x 284 | np.log1p(temp, out=temp) 285 | np.mean(temp, axis=1, out=ks) 286 | 287 | L = bs / ks 288 | np.negative(L, out=L) 289 | np.log(L, out=L) 290 | L -= ks 291 | L -= 1 292 | L *= n 293 | 294 | temp = L - L[:,None] 295 | np.exp(temp, out=temp) 296 | w = np.sum(temp, axis=1) 297 | np.divide(1, w, out=w) 298 | 299 | # remove negligible weights 300 | dii = w >= 10 * np.finfo(float).eps 301 | if not np.all(dii): 302 | w = w[dii] 303 | bs = bs[dii] 304 | # normalise w 305 | w /= w.sum() 306 | 307 | # posterior mean for b 308 | b = np.sum(bs * w) 309 | # Estimate for k, note that we return a negative of Zhang and 310 | # Stephens's k, because it is more common parameterisation. 
311 | temp = (-b) * x 312 | np.log1p(temp, out=temp) 313 | k = np.mean(temp) 314 | if return_quadrature: 315 | np.negative(x, out=temp) 316 | temp = bs[:, None] * temp 317 | np.log1p(temp, out=temp) 318 | ks = np.mean(temp, axis=1) 319 | # estimate for sigma 320 | sigma = -k / b * n / (n - 0) 321 | # weakly informative prior for k 322 | a = 10 323 | k = k * n / (n+a) + a * 0.5 / (n+a) 324 | if return_quadrature: 325 | ks *= n / (n+a) 326 | ks += a * 0.5 / (n+a) 327 | 328 | if return_quadrature: 329 | return k, sigma, ks, w 330 | else: 331 | return k, sigma 332 | 333 | 334 | def gpinv(p, k, sigma): 335 | """Inverse Generalised Pareto distribution function.""" 336 | x = np.empty(p.shape) 337 | x.fill(np.nan) 338 | if sigma <= 0: 339 | return x 340 | ok = (p > 0) & (p < 1) 341 | if np.all(ok): 342 | if np.abs(k) < np.finfo(float).eps: 343 | np.negative(p, out=x) 344 | np.log1p(x, out=x) 345 | np.negative(x, out=x) 346 | else: 347 | np.negative(p, out=x) 348 | np.log1p(x, out=x) 349 | x *= -k 350 | np.expm1(x, out=x) 351 | x /= k 352 | x *= sigma 353 | else: 354 | if np.abs(k) < np.finfo(float).eps: 355 | # x[ok] = - np.log1p(-p[ok]) 356 | temp = p[ok] 357 | np.negative(temp, out=temp) 358 | np.log1p(temp, out=temp) 359 | np.negative(temp, out=temp) 360 | x[ok] = temp 361 | else: 362 | # x[ok] = np.expm1(-k * np.log1p(-p[ok])) / k 363 | temp = p[ok] 364 | np.negative(temp, out=temp) 365 | np.log1p(temp, out=temp) 366 | temp *= -k 367 | np.expm1(temp, out=temp) 368 | temp /= k 369 | x[ok] = temp 370 | x *= sigma 371 | x[p == 0] = 0 372 | if k >= 0: 373 | x[p == 1] = np.inf 374 | else: 375 | x[p == 1] = -sigma / k 376 | return x 377 | 378 | 379 | def sumlogs(x, axis=None, out=None): 380 | """Sum of vector where numbers are represented by their logarithms. 381 | 382 | Calculates ``np.log(np.sum(np.exp(x), axis=axis))`` in such a fashion that 383 | it works even when elements have large magnitude. 
384 | 385 | """ 386 | maxx = x.max(axis=axis, keepdims=True) 387 | xnorm = x - maxx 388 | np.exp(xnorm, out=xnorm) 389 | out = np.sum(xnorm, axis=axis, out=out) 390 | if isinstance(out, np.ndarray): 391 | np.log(out, out=out) 392 | else: 393 | out = np.log(out) 394 | out += np.squeeze(maxx) 395 | return out 396 | -------------------------------------------------------------------------------- /scripts/helper/stan_utility.py: -------------------------------------------------------------------------------- 1 | import pystan 2 | import pickle 3 | import numpy 4 | 5 | def check_div(fit): 6 | """Check transitions that ended with a divergence""" 7 | sampler_params = fit.get_sampler_params(inc_warmup=False) 8 | divergent = [x for y in sampler_params for x in y['divergent__']] 9 | n = sum(divergent) 10 | N = len(divergent) 11 | print('{} of {} iterations ended with a divergence ({}%)'.format(n, N, 12 | 100 * n / N)) 13 | if n > 0: 14 | print(' Try running with larger adapt_delta to remove the divergences') 15 | 16 | def check_treedepth(fit, max_depth = 10): 17 | """Check transitions that ended prematurely due to maximum tree depth limit""" 18 | sampler_params = fit.get_sampler_params(inc_warmup=False) 19 | depths = [x for y in sampler_params for x in y['treedepth__']] 20 | n = sum(1 for x in depths if x == max_depth) 21 | N = len(depths) 22 | print(('{} of {} iterations saturated the maximum tree depth of {}' 23 | + ' ({}%)').format(n, N, max_depth, 100 * n / N)) 24 | if n > 0: 25 | print(' Run again with max_depth set to a larger value to avoid saturation') 26 | 27 | def check_energy(fit): 28 | """Checks the energy Bayesian fraction of missing information (E-BFMI)""" 29 | sampler_params = fit.get_sampler_params(inc_warmup=False) 30 | no_warning = True 31 | for chain_num, s in enumerate(sampler_params): 32 | energies = s['energy__'] 33 | numer = sum((energies[i] - energies[i - 1])**2 for i in range(1, len(energies))) / len(energies) 34 | denom = numpy.var(energies) 35 | if numer / denom < 0.2: 36 | print('Chain {}: E-BFMI = {}'.format(chain_num, numer / denom)) 37 | no_warning = False 38 | if no_warning: 39 | print('E-BFMI indicated no pathological behavior') 40 | else: 41 | print(' E-BFMI below 0.2 indicates you may need to reparameterize your model') 42 | 43 | def check_n_eff(fit): 44 | """Checks the effective sample size per iteration""" 45 | fit_summary = fit.summary(probs=[0.5]) 46 | n_effs = [x[4] for x in fit_summary['summary']] 47 | names = fit_summary['summary_rownames'] 48 | n_iter = len(fit.extract()['lp__']) 49 | 50 | no_warning = True 51 | for n_eff, name in zip(n_effs, names): 52 | ratio = n_eff / n_iter 53 | if (ratio < 0.001): 54 | print('n_eff / iter for parameter {} is {}!'.format(name, ratio)) 55 | print(' n_eff / iter below 0.001 indicates that the effective sample size has likely been overestimated') 56 | no_warning = False 57 | if no_warning: 58 | print('n_eff / iter looks reasonable for all parameters') 59 | else: 60 | print(' n_eff / iter below 0.001 indicates that the effective sample size has likely been overestimated') 61 | 62 | def check_rhat(fit): 63 | """Checks the potential scale reduction factors""" 64 | from math import isnan 65 | from math import isinf 66 | 67 | fit_summary = fit.summary(probs=[0.5]) 68 | rhats = [x[5] for x in fit_summary['summary']] 69 | names = fit_summary['summary_rownames'] 70 | 71 | no_warning = True 72 | for rhat, name in zip(rhats, names): 73 | if (rhat > 1.1 or isnan(rhat) or isinf(rhat)): 74 | print('Rhat for parameter {} is {}!'.format(name, rhat)) 75 | no_warning
402 | -------------------------------------------------------------------------------- /scripts/helper/stan_utility.py: --------------------------------------------------------------------------------
1 | import pystan
2 | import pickle
3 | import numpy
4 | 
5 | def check_div(fit):
6 |     """Check transitions that ended with a divergence"""
7 |     sampler_params = fit.get_sampler_params(inc_warmup=False)
8 |     divergent = [x for y in sampler_params for x in y['divergent__']]
9 |     n = sum(divergent)
10 |     N = len(divergent)
11 |     print('{} of {} iterations ended with a divergence ({}%)'.format(n, N,
12 |             100 * n / N))
13 |     if n > 0:
14 |         print('  Try running with larger adapt_delta to remove the divergences')
15 | 
16 | def check_treedepth(fit, max_depth = 10):
17 |     """Check transitions that ended prematurely due to maximum tree depth limit"""
18 |     sampler_params = fit.get_sampler_params(inc_warmup=False)
19 |     depths = [x for y in sampler_params for x in y['treedepth__']]
20 |     n = sum(1 for x in depths if x == max_depth)
21 |     N = len(depths)
22 |     print(('{} of {} iterations saturated the maximum tree depth of {}'
23 |             + ' ({}%)').format(n, N, max_depth, 100 * n / N))
24 |     if n > 0:
25 |         print('  Run again with max_depth set to a larger value to avoid saturation')
26 | 
27 | def check_energy(fit):
28 |     """Checks the energy Bayesian fraction of missing information (E-BFMI)"""
29 |     sampler_params = fit.get_sampler_params(inc_warmup=False)
30 |     no_warning = True
31 |     for chain_num, s in enumerate(sampler_params):
32 |         energies = s['energy__']
33 |         numer = sum((energies[i] - energies[i - 1])**2 for i in range(1, len(energies))) / len(energies)
34 |         denom = numpy.var(energies)
35 |         if numer / denom < 0.2:
36 |             print('Chain {}: E-BFMI = {}'.format(chain_num, numer / denom))
37 |             no_warning = False
38 |     if no_warning:
39 |         print('E-BFMI indicated no pathological behavior')
40 |     else:
41 |         print('  E-BFMI below 0.2 indicates you may need to reparameterize your model')
42 | 
43 | def check_n_eff(fit):
44 |     """Checks the effective sample size per iteration"""
45 |     fit_summary = fit.summary(probs=[0.5])
46 |     n_effs = [x[4] for x in fit_summary['summary']]
47 |     names = fit_summary['summary_rownames']
48 |     n_iter = len(fit.extract()['lp__'])
49 | 
50 |     no_warning = True
51 |     for n_eff, name in zip(n_effs, names):
52 |         ratio = n_eff / n_iter
53 |         if (ratio < 0.001):
54 |             print('n_eff / iter for parameter {} is {}!'.format(name, ratio))
55 |             no_warning = False
56 |     if no_warning:
57 |         print('n_eff / iter looks reasonable for all parameters')
58 |     else:
59 |         print('  n_eff / iter below 0.001 indicates that the effective sample size has likely been overestimated')
60 | 
61 | def check_rhat(fit):
62 |     """Checks the potential scale reduction factors"""
63 |     from math import isnan
64 |     from math import isinf
65 | 
66 |     fit_summary = fit.summary(probs=[0.5])
67 |     rhats = [x[5] for x in fit_summary['summary']]
68 |     names = fit_summary['summary_rownames']
69 | 
70 |     no_warning = True
71 |     for rhat, name in zip(rhats, names):
72 |         if (rhat > 1.1 or isnan(rhat) or isinf(rhat)):
73 |             print('Rhat for parameter {} is {}!'.format(name, rhat))
74 |             no_warning = False
75 |     if no_warning:
76 |         print('Rhat looks reasonable for all parameters')
77 |     else:
78 |         print('  Rhat above 1.1 indicates that the chains very likely have not mixed')
79 | 
80 | def check_all_diagnostics(fit):
81 |     """Checks all MCMC diagnostics"""
82 |     check_n_eff(fit)
83 |     check_rhat(fit)
84 |     check_div(fit)
85 |     check_treedepth(fit)
86 |     check_energy(fit)
87 | 
88 | def _by_chain(unpermuted_extraction):
89 |     num_chains = len(unpermuted_extraction[0])
90 |     result = [[] for _ in range(num_chains)]
91 |     for c in range(num_chains):
92 |         for i in range(len(unpermuted_extraction)):
93 |             result[c].append(unpermuted_extraction[i][c])
94 |     return numpy.array(result)
95 | 
96 | def _shaped_ordered_params(fit):
97 |     ef = fit.extract(permuted=False, inc_warmup=False)  # flattened, unpermuted, by (iteration, chain)
98 |     ef = _by_chain(ef)
99 |     ef = ef.reshape(-1, len(ef[0][0]))
100 |     ef = ef[:, 0:len(fit.flatnames)]  # drop lp__
101 |     shaped = {}
102 |     idx = 0
103 |     for dim, param_name in zip(fit.par_dims, fit.extract().keys()):
104 |         length = int(numpy.prod(dim))
105 |         # reshape returns a new array, so the result must be assigned back
106 |         shaped[param_name] = ef[:, idx:idx + length].reshape(*([-1] + dim))
107 |         idx += length
108 |     return shaped
109 | 
110 | def partition_div(fit):
111 |     """Returns parameter arrays separated into divergent and non-divergent transitions"""
112 |     sampler_params = fit.get_sampler_params(inc_warmup=False)
113 |     div = numpy.concatenate([x['divergent__'] for x in sampler_params]).astype('int')
114 |     params = _shaped_ordered_params(fit)
115 |     nondiv_params = dict((key, params[key][div == 0]) for key in params)
116 |     div_params = dict((key, params[key][div == 1]) for key in params)
117 |     return nondiv_params, div_params
118 | 
119 | def compile_model(filename, model_name=None, **kwargs):
120 |     """This will automatically cache models - great if you're just running a
121 |     script on the command line.
122 | 
123 |     See http://pystan.readthedocs.io/en/latest/avoiding_recompilation.html"""
124 |     from hashlib import md5
125 | 
126 |     with open(filename) as f:
127 |         model_code = f.read()
128 |     code_hash = md5(model_code.encode('ascii')).hexdigest()
129 |     if model_name is None:
130 |         cache_fn = 'cached-model-{}.pkl'.format(code_hash)
131 |     else:
132 |         cache_fn = 'cached-{}-{}.pkl'.format(model_name, code_hash)
133 |     try:
134 |         with open(cache_fn, 'rb') as f:
135 |             sm = pickle.load(f)
136 |     except FileNotFoundError:
137 |         sm = pystan.StanModel(model_code=model_code, **kwargs)
138 |         with open(cache_fn, 'wb') as f:
139 |             pickle.dump(sm, f)
140 |     else:
141 |         print("Using cached StanModel")
142 |     return sm
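143 | 
144 | # Usage sketch (hedged; mirrors how the scripts below use this module):
145 | #   sm = compile_model('../models/linearRegression.stan')  # compile once, then cache
146 | #   fit = sm.sampling(data=data, control=dict(adapt_delta=0.95))
147 | #   check_all_diagnostics(fit)  # n_eff, Rhat, divergences, tree depth, E-BFMI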
148 | -------------------------------------------------------------------------------- /scripts/linearRegression.py: --------------------------------------------------------------------------------
1 | import pickle
2 | 
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 | import pystan
6 | 
7 | from helper import psis, stan_utility
8 | 
9 | model_file = "../models/linearRegression.stan"
10 | # Data from http://www.openbugs.net/Examples/Ratsdata.html
11 | data = {'N': 5,
12 |         'x': [8.0, 15.0, 22.0, 29.0, 36.0],
13 |         'y': [160, 207, 248, 288, 324]
14 |         }
15 | 
16 | sm = pystan.StanModel(file=model_file)
17 | fit = sm.sampling(data=data, control=dict(adapt_delta=0.95))
18 | print(fit)
19 | fit.plot(['alpha', 'beta', 'sigma'])
20 | plt.show()
21 | 
22 | # model diagnostics
23 | # https://github.com/betanalpha/jupyter_case_studies/blob/master/pystan_workflow/stan_utility.py
24 | stan_utility.check_all_diagnostics(fit)
25 | 
26 | # visualize model
27 | fit_dict = fit.extract()
28 | m_alpha = np.mean(fit_dict['alpha'])
29 | m_beta = np.mean(fit_dict['beta'])
30 | x = np.linspace(min(data['x']), max(data['x']))
31 | y = m_alpha + m_beta * x
32 | plt.scatter(data['x'], data['y'], c="#1f77b4", label="Observed Data")
33 | plt.plot(x, y, c='#7f7f7f', label="Our Model")
34 | plt.title("Rat weights")
35 | plt.xlabel("Days")
36 | plt.ylabel("Weights in grams")
37 | plt.legend()
38 | plt.show()
39 | 
40 | # Log-likelihood
41 | log_lik = fit.extract()['log_lik']
42 | print(psis.psisloo(log_lik)[0])
43 | 
44 | # Save model for later use
45 | with open('../models/saved/linearRegression.pkl', 'wb') as f:
46 |     pickle.dump(sm, f)
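47 | 
48 | # Sketch (hedged): the mean line above hides posterior uncertainty; a 95%
49 | # credible band can be drawn from the draws already extracted:
50 | #   ys = fit_dict['alpha'][:, None] + fit_dict['beta'][:, None] * x
51 | #   lo, hi = np.percentile(ys, [2.5, 97.5], axis=0)
52 | #   plt.fill_between(x, lo, hi, color='#7f7f7f', alpha=0.3)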
53 | -------------------------------------------------------------------------------- /scripts/logisticRegression.py: --------------------------------------------------------------------------------
1 | import pickle
2 | 
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 | import pandas as pd
6 | import pystan
7 | import seaborn as sns
8 | from sklearn.metrics import confusion_matrix
9 | from sklearn.model_selection import train_test_split
10 | 
11 | from helper import stan_utility
12 | 
13 | model_file = "../models/logisticRegression.stan"
14 | # https://stats.idre.ucla.edu/stata/dae/logistic-regression/
15 | data_file = "../data/binary.dta"
16 | 
17 | data = pd.read_stata(data_file)
18 | 
19 | # Data preprocessing
20 | # Convert the rank categorical variable to dummies
21 | data = pd.get_dummies(data=data, columns=['rank'])
22 | del data['rank_1.0']  # avoid dummy variable trap
23 | 
24 | # Rescale gpa and gre variables
25 | data['gre'] = (data['gre'] - np.mean(data['gre'])) / np.std(data['gre'])
26 | data['gpa'] = (data['gpa'] - np.mean(data['gpa'])) / np.std(data['gpa'])
27 | 
28 | # Split data as train/test
29 | data_train, data_test = train_test_split(data, test_size=0.2)
30 | 
31 | model_data = {'N_train': 320,  # 80/20 split of the 400 applications
32 |               'N_test': 80,
33 |               'D': 5,
34 |               # keep the predictors as floats; casting the standardized
35 |               # gre/gpa columns to int32 would truncate them to (mostly) zero
36 |               'x_train': data_train[['gre', 'gpa', 'rank_2.0',
37 |                                      'rank_3.0', 'rank_4.0']].astype(np.float64),
38 |               'x_test': data_test[['gre', 'gpa', 'rank_2.0',
39 |                                    'rank_3.0', 'rank_4.0']].astype(np.float64),
40 |               'y_train': data_train['admit'].astype(np.int32)}
41 | 
42 | sm = pystan.StanModel(file=model_file)
43 | fit = sm.sampling(data=model_data, control=dict(adapt_delta=0.95))
44 | print(fit)
45 | fit.plot(['alpha', 'beta'])
46 | plt.show()
47 | 
48 | sns.pairplot(pd.DataFrame(fit.extract()['beta']))
49 | plt.show()
50 | 
51 | # model diagnostics
52 | # https://github.com/betanalpha/jupyter_case_studies/blob/master/pystan_workflow/stan_utility.py
53 | stan_utility.check_all_diagnostics(fit)
54 | 
55 | # Confusion matrix
56 | y_pred = fit.extract()['y_pred']
57 | y_pred = np.median(y_pred, axis=0)
58 | print(confusion_matrix(data_test['admit'], y_pred))
59 | 
60 | # Save model for later use
61 | with open('../models/saved/logisticRegression.pkl', 'wb') as f:
62 |     pickle.dump(sm, f)
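63 | 
64 | # Sketch (hedged): overall accuracy from the same posterior predictive draws:
65 | #   acc = np.mean(y_pred == data_test['admit'].values)
66 | #   print('posterior predictive accuracy: {:.2f}'.format(acc))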
67 | -------------------------------------------------------------------------------- /scripts/multinomialLogisticRegression.py: --------------------------------------------------------------------------------
1 | import pickle
2 | 
3 | import numpy as np
4 | import pandas as pd
5 | import pystan
6 | 
7 | from helper import stan_utility
8 | 
9 | model_file = "../models/multinomialLogisticRegression.stan"
10 | # https://stats.idre.ucla.edu/stata/dae/multinomiallogistic-regression/
11 | data_file = "../data/hsbdemo.dta"
12 | 
13 | data = pd.read_stata(data_file)
14 | 
15 | data = pd.get_dummies(data=data, columns=['ses', 'schtyp', 'honors'])
16 | 
17 | map_prog = {'general': 1,
18 |             'academic': 2,
19 |             'vocation': 3}
20 | data['prog'] = data['prog'].map(map_prog)
21 | 
22 | # Standardize read and write (divide by the standard deviation, not the mean)
23 | data['read'] = (data['read'] - np.mean(data['read'])) / np.std(data['read'])
24 | data['write'] = (data['write'] - np.mean(data['write'])) / \
25 |     np.std(data['write'])
26 | 
27 | data = {'N': 200,
28 |         'K': 3,
29 |         'D': 6,
30 |         'x': data[['ses_low', 'ses_middle', 'schtyp_public',
31 |                    'honors_enrolled', 'read', 'write']],
32 |         'y': data['prog']
33 |         }
34 | 
35 | sm = pystan.StanModel(file=model_file)
36 | fit = sm.sampling(data=data, control=dict(adapt_delta=0.95))
37 | print(fit)
38 | 
39 | # model diagnostics
40 | # https://github.com/betanalpha/jupyter_case_studies/blob/master/pystan_workflow/stan_utility.py
41 | stan_utility.check_all_diagnostics(fit)
42 | 
43 | # Save model for later use
44 | with open('../models/saved/multinomialLogisticRegression.pkl', 'wb') as f:
45 |     pickle.dump(sm, f)
46 | -------------------------------------------------------------------------------- /scripts/multipleLinearRegression.py: --------------------------------------------------------------------------------
1 | import pickle
2 | 
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 | import pandas as pd
6 | import pystan
7 | import seaborn as sns
8 | 
9 | from helper import psis, stan_utility
10 | 
11 | model_file = "../models/multipleLinearRegression.stan"
12 | # http://lib.stat.cmu.edu/DASL/Datafiles/Cereals.html
13 | data_file = "../data/cereals.txt"
14 | data = pd.read_table(data_file)
15 | 
16 | data = data[['fat', 'weight', 'cups', 'rating']]
17 | data = {'N': 77,
18 |         'fat': data['fat'],
19 |         'weight': data['weight'],
20 |         'cups': data['cups'],
21 |         'rating': data['rating']}
22 | 
23 | sm = pystan.StanModel(file=model_file)
24 | fit = sm.sampling(data=data, control=dict(adapt_delta=0.95))
25 | print(fit)
26 | fit.plot(['b_fat', 'b_weight', 'b_cups'])
27 | plt.show()
28 | 
29 | # model diagnostics
30 | # https://github.com/betanalpha/jupyter_case_studies/blob/master/pystan_workflow/stan_utility.py
31 | stan_utility.check_all_diagnostics(fit)
32 | 
33 | # visualize model
34 | # we'll plot a histogram of our absolute errors
35 | rating_pred = fit.extract()['rating_pred'].mean(axis=0)
36 | rating = data['rating'].values
37 | abs_err = np.abs(rating - rating_pred)
38 | sns.distplot(abs_err)
39 | plt.title("Histogram of absolute errors")
40 | plt.xlabel("Errors")
41 | plt.ylabel("Frequency")
42 | plt.show()
43 | 
44 | # Log-likelihood
45 | log_lik = fit.extract()['log_lik']
46 | print(psis.psisloo(log_lik)[0])
47 | 
48 | # Save model for later use
49 | with open('../models/saved/multipleLinearRegression.pkl', 'wb') as f:
50 |     pickle.dump(sm, f)
51 | -------------------------------------------------------------------------------- /scripts/onewayANOVA.py: --------------------------------------------------------------------------------
1 | import pickle
2 | 
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 | import pandas as pd
6 | import pystan
7 | 
8 | from helper import stan_utility
9 | 
10 | model_file = "../models/onewayANOVA.stan"
11 | # http://staff.bath.ac.uk/pssiw/stats2/page16/page16.html
12 | data_file = "../data/iqdata.csv"
13 | 
14 | data = pd.read_csv(data_file)
15 | 
16 | data = pd.get_dummies(data, columns=['group'])
17 | del data['group_1']
18 | 
19 | data = {'N': 43,
20 |         'x1': data['group_2'],
21 |         'x2': data['group_3'],
22 |         'y': data['iq']}
23 | 
24 | sm = pystan.StanModel(file=model_file)
25 | fit = sm.sampling(data=data, control=dict(adapt_delta=0.95))
26 | print(fit)
27 | fit.plot()
28 | plt.show()
29 | 
30 | # model diagnostics
31 | # https://github.com/betanalpha/jupyter_case_studies/blob/master/pystan_workflow/stan_utility.py
32 | stan_utility.check_all_diagnostics(fit)
33 | 
34 | # extract coefficients
35 | fit_dict = fit.extract()
36 | alpha = fit_dict['alpha']
37 | beta_x1 = fit_dict['beta_x1']
38 | beta_x2 = fit_dict['beta_x2']
39 | # calculate group means from coefficients
40 | mean_1 = alpha.mean(axis=0)
41 | mean_2 = alpha.mean(axis=0) + beta_x1.mean(axis=0)
42 | mean_3 = alpha.mean(axis=0) + beta_x2.mean(axis=0)
43 | print(
44 |     f'Mean of group 1: {mean_1},\nMean of group 2: {mean_2},\nMean of group 3: {mean_3}\n')
45 | # calculate the posterior distribution of the difference between the means of group 1 and 3
46 | diffs13 = alpha - (alpha + beta_x2)
47 | # 95% credible intervals
48 | diffs13_ci = np.percentile(diffs13, [2.5, 97.5], axis=0)
49 | print(
50 |     f"Estimated difference between the means of group 1 and 3: {diffs13.mean(axis=0)}\n")
51 | print(f"\t95% credible interval: ({diffs13_ci[0]}, {diffs13_ci[1]})\n")
52 | # How strongly do the data support the hypothesis that the mean of group 3 is larger than the mean of group 1?
53 | print(f"{np.sum(alpha + beta_x2 > alpha) / np.size(alpha)}")
54 | # Because the posterior probability is never exactly 1, report it as >0.999
55 | 
56 | # Save model for later use
57 | with open('../models/saved/onewayANOVA.pkl', 'wb') as f:
58 |     pickle.dump(sm, f)
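59 | 
60 | # Sketch (hedged): the probability above simplifies because alpha cancels:
61 | #   print(np.mean(beta_x2 > 0))  # P(mean of group 3 > mean of group 1)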
62 | -------------------------------------------------------------------------------- /scripts/orderedLogisticRegression.py: --------------------------------------------------------------------------------
1 | import pickle
2 | 
3 | import matplotlib.pyplot as plt
4 | import pandas as pd
5 | import pystan
6 | 
7 | from helper import stan_utility
8 | 
9 | model_file = "../models/orderedLogisticRegression.stan"
10 | # https://stats.idre.ucla.edu/stata/dae/ordered-logistic-regression/
11 | data_file = "../data/ologit.dta"
12 | 
13 | data = pd.read_stata(data_file)
14 | 
15 | x = data[['pared', 'public', 'gpa']]
16 | y = data['apply'].map({'unlikely': 1, 'somewhat likely': 2, 'very likely': 3})
17 | 
18 | data = {'N': 400,
19 |         'D': 3,
20 |         'K': 3,
21 |         'x': x,
22 |         'y': y}
23 | 
24 | sm = pystan.StanModel(file=model_file)
25 | fit = sm.sampling(data=data, control=dict(adapt_delta=0.95))
26 | print(fit)
27 | 
28 | fit.plot()
29 | plt.show()
30 | 
31 | # model diagnostics
32 | # https://github.com/betanalpha/jupyter_case_studies/blob/master/pystan_workflow/stan_utility.py
33 | stan_utility.check_all_diagnostics(fit)
34 | 
35 | # Save model for later use
36 | with open('../models/saved/orderedLogisticRegression.pkl', 'wb') as f:
37 |     pickle.dump(sm, f)
38 | -------------------------------------------------------------------------------- /scripts/robustRegression.py: --------------------------------------------------------------------------------
1 | import pickle
2 | 
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 | import pandas as pd
6 | import pystan
7 | import seaborn as sns
8 | 
9 | from helper import stan_utility
10 | 
11 | model_file = "../models/robustRegression.stan"
12 | # http://vincentarelbundock.github.io/Rdatasets/datasets.html
13 | data_file = "../data/aircraft.csv"
14 | 
15 | data = pd.read_csv(data_file)
16 | 
17 | data = {'N': 23,
18 |         'X1': data['X1'],
19 |         'X2': data['X2'],
20 |         'X3': (data['X3'] - np.mean(data['X3'])) / np.std(data['X3']),
21 |         'X4': (data['X4'] - np.mean(data['X4'])) / np.std(data['X4']),
22 |         'Y': data['Y'],
23 |         }
24 | 
25 | sm = pystan.StanModel(file=model_file)
26 | fit = sm.sampling(data=data, control=dict(adapt_delta=0.95))
27 | print(fit)
28 | fit.plot(['b_X1', 'b_X2', 'b_X3', 'b_X4'])
29 | plt.show()
30 | 
31 | # model diagnostics
32 | # https://github.com/betanalpha/jupyter_case_studies/blob/master/pystan_workflow/stan_utility.py
33 | stan_utility.check_all_diagnostics(fit)
34 | 
35 | # visualize model
36 | # we'll plot a histogram of our absolute errors
37 | Y_pred = fit.extract()['Y_pred'].mean(axis=0)
38 | Y = data['Y'].values
39 | abs_err = np.abs(Y - Y_pred)
40 | sns.distplot(abs_err)
41 | plt.title("Histogram of absolute errors")
42 | plt.xlabel("Errors")
43 | plt.ylabel("Frequency")
44 | plt.show()
45 | 
46 | # Save model for later use
47 | with open('../models/saved/robustRegression.pkl', 'wb') as f:
48 |     pickle.dump(sm, f)
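49 | 
50 | # Sketch (hedged): if robustRegression.stan uses Student-t errors with a
51 | # degrees-of-freedom parameter (commonly named `nu`; an assumption here,
52 | # check the .stan file), small posterior values indicate heavy tails:
53 | #   print(np.mean(fit.extract()['nu']))  # hypothetical parameter name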
"../models/twowayANOVA.stan" 11 | # http://staff.bath.ac.uk/pssiw/stats2/page16/page16.html 12 | data_file = "../data/drugtrial.csv" 13 | 14 | data = pd.read_csv(data_file, index_col=0) 15 | 16 | data = pd.get_dummies(data, columns=['gender', 'dose']) 17 | data.drop(columns=['gender_1', 'dose_1'], inplace=True) 18 | 19 | data = {'N': 48, 20 | 'x1': data['gender_2'], 21 | 'x2': data['dose_2'], 22 | 'y': data['score']} 23 | 24 | sm = pystan.StanModel(file=model_file) 25 | fit = sm.sampling(data=data, control=dict(adapt_delta=0.95)) 26 | print(fit) 27 | fit.plot() 28 | plt.show() 29 | 30 | # model diagnostics 31 | # https://github.com/betanalpha/jupyter_case_studies/blob/master/pystan_workflow/stan_utility.py 32 | stan_utility.check_all_diagnostics(fit) 33 | 34 | # extract coefficents 35 | fit_dict = fit.extract() 36 | alpha = fit_dict['alpha'] 37 | beta_x1 = fit_dict['beta_x1'] 38 | beta_x2 = fit_dict['beta_x2'] 39 | beta_x3 = fit_dict['beta_x3'] 40 | # calculate group means from coefficents 41 | mean_11 = alpha.mean(axis=0) 42 | mean_12 = alpha.mean(axis=0) + beta_x1.mean(axis=0) 43 | mean_21 = alpha.mean(axis=0) + beta_x2.mean(axis=0) 44 | mean_22 = alpha.mean(axis=0) + beta_x1.mean(axis=0) + \ 45 | beta_x2.mean(axis=0) + beta_x3.mean(axis=0) 46 | print( 47 | f'Mean of gender=1, dose=1: {mean_11},\n' 48 | f'Mean of gender=1, dose=2: {mean_12},\n' 49 | f'Mean of gender=2, dose=1: {mean_21},\n' 50 | f'Mean of gender=2, dose=2: {mean_22}\n') 51 | 52 | # Save model for later use 53 | with open('../models/saved/twowayANOVA.pkl', 'wb') as f: 54 | pickle.dump(sm, f) 55 | --------------------------------------------------------------------------------