├── .gitattributes ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── PUBLICITY.md ├── README.md ├── data ├── cpds_excel_new.xlsx ├── cpds_stata.dta ├── dirty.csv ├── feedback.Rda ├── large.csv ├── merge_practice_1.csv ├── merge_practice_2.csv ├── merge_practice_3.csv ├── mydata.Rda ├── mydata.csv └── pew.sav ├── examples ├── save_console_output.R └── save_console_output.txt ├── instructor ├── day_four.R ├── day_four.Rmd ├── day_four.html ├── day_four.pdf ├── day_one.R ├── day_one.Rmd ├── day_one.html ├── day_one.pdf ├── day_three.R ├── day_three.Rmd ├── day_three.html ├── day_three.pdf ├── day_two.R ├── day_two.Rmd ├── day_two.html ├── day_two.pdf ├── overflow.R ├── overflow.Rmd ├── overflow.html └── overflow.pdf └── scripts ├── feedback_cleaner.R └── regenerate_files.R /.gitattributes: -------------------------------------------------------------------------------- 1 | *.Rmd linguist-language=R 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | 5 | # Example code in package build process 6 | *-Ex.R 7 | 8 | # RStudio files 9 | .Rproj.user/ 10 | *.Rproj 11 | .RData 12 | 13 | # produced vignettes 14 | vignettes/*.html 15 | vignettes/*.pdf 16 | .Rproj.user 17 | 18 | # other 19 | .DS_Store 20 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## How to contribute 2 | 3 | ### Report an issue 4 | 5 | Reporting a bug with our code is one way that you can help improve it. If you choose to report an issue, please follow these steps: 6 | 7 | 1. Give your issue a descriptive title (e.g. **not** "code doesn't work") 8 | 2. Describe the exact steps that reproduce the bug, with no extraneous steps 9 | 3. 
If a specific piece of code you wrote triggers the bug, submit that code 10 | 4. If you are getting an error message, submit the error message 11 | 5. Include version numbers for your OS, R, and IDE 12 | 13 | ### Submit a patch 14 | 15 | Submitting new materials, data, examples, and/or code is a great way to help improve teaching materials. If you choose to submit materials to this repository, please follow these steps: 16 | 17 | 1. Fork this repository 18 | 2. Make focused, directed, and clean changes to your fork 19 | 3. Run `R --quiet -f scripts/regenerate_files.R` from the base directory 20 | 4. Commit your changes to your fork 21 | - Make sure your commit is thematically focused 22 | - Small commits are better than large commits 23 | - Use informative commit messages 24 | - If your commit fixes an issue, include `closes #<issue number>` in your commit message 25 | 5. Push your commit to your fork 26 | 6. Create a pull request 27 | - Give your pull request a descriptive title (e.g. **not** "changes") 28 | - Explain the motivation for your changes 29 | - Explain what you have changed 30 | 31 | ##### And thanks! 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Creative Commons Attribution-NonCommercial 4.0 International Public License 3 | 4 | By exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this Creative Commons Attribution-NonCommercial 4.0 International Public License ("Public License"). To the extent this Public License may be interpreted as a contract, You are granted the Licensed Rights in consideration of Your acceptance of these terms and conditions, and the Licensor grants You such rights in consideration of benefits the Licensor receives from making the Licensed Material available under these terms and conditions. 5 | 6 | Section 1 – Definitions. 
7 | 8 | Adapted Material means material subject to Copyright and Similar Rights that is derived from or based upon the Licensed Material and in which the Licensed Material is translated, altered, arranged, transformed, or otherwise modified in a manner requiring permission under the Copyright and Similar Rights held by the Licensor. For purposes of this Public License, where the Licensed Material is a musical work, performance, or sound recording, Adapted Material is always produced where the Licensed Material is synched in timed relation with a moving image. 9 | Adapter's License means the license You apply to Your Copyright and Similar Rights in Your contributions to Adapted Material in accordance with the terms and conditions of this Public License. 10 | Copyright and Similar Rights means copyright and/or similar rights closely related to copyright including, without limitation, performance, broadcast, sound recording, and Sui Generis Database Rights, without regard to how the rights are labeled or categorized. For purposes of this Public License, the rights specified in Section 2(b)(1)-(2) are not Copyright and Similar Rights. 11 | Effective Technological Measures means those measures that, in the absence of proper authority, may not be circumvented under laws fulfilling obligations under Article 11 of the WIPO Copyright Treaty adopted on December 20, 1996, and/or similar international agreements. 12 | Exceptions and Limitations means fair use, fair dealing, and/or any other exception or limitation to Copyright and Similar Rights that applies to Your use of the Licensed Material. 13 | Licensed Material means the artistic or literary work, database, or other material to which the Licensor applied this Public License. 
14 | Licensed Rights means the rights granted to You subject to the terms and conditions of this Public License, which are limited to all Copyright and Similar Rights that apply to Your use of the Licensed Material and that the Licensor has authority to license. 15 | Licensor means the individual(s) or entity(ies) granting rights under this Public License. 16 | NonCommercial means not primarily intended for or directed towards commercial advantage or monetary compensation. For purposes of this Public License, the exchange of the Licensed Material for other material subject to Copyright and Similar Rights by digital file-sharing or similar means is NonCommercial provided there is no payment of monetary compensation in connection with the exchange. 17 | Share means to provide material to the public by any means or process that requires permission under the Licensed Rights, such as reproduction, public display, public performance, distribution, dissemination, communication, or importation, and to make material available to the public including in ways that members of the public may access the material from a place and at a time individually chosen by them. 18 | Sui Generis Database Rights means rights other than copyright resulting from Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, as amended and/or succeeded, as well as other essentially equivalent rights anywhere in the world. 19 | You means the individual or entity exercising the Licensed Rights under this Public License. Your has a corresponding meaning. 20 | Section 2 – Scope. 21 | 22 | License grant. 
23 | Subject to the terms and conditions of this Public License, the Licensor hereby grants You a worldwide, royalty-free, non-sublicensable, non-exclusive, irrevocable license to exercise the Licensed Rights in the Licensed Material to: 24 | reproduce and Share the Licensed Material, in whole or in part, for NonCommercial purposes only; and 25 | produce, reproduce, and Share Adapted Material for NonCommercial purposes only. 26 | Exceptions and Limitations. For the avoidance of doubt, where Exceptions and Limitations apply to Your use, this Public License does not apply, and You do not need to comply with its terms and conditions. 27 | Term. The term of this Public License is specified in Section 6(a). 28 | Media and formats; technical modifications allowed. The Licensor authorizes You to exercise the Licensed Rights in all media and formats whether now known or hereafter created, and to make technical modifications necessary to do so. The Licensor waives and/or agrees not to assert any right or authority to forbid You from making technical modifications necessary to exercise the Licensed Rights, including technical modifications necessary to circumvent Effective Technological Measures. For purposes of this Public License, simply making modifications authorized by this Section 2(a)(4) never produces Adapted Material. 29 | Downstream recipients. 30 | Offer from the Licensor – Licensed Material. Every recipient of the Licensed Material automatically receives an offer from the Licensor to exercise the Licensed Rights under the terms and conditions of this Public License. 31 | No downstream restrictions. You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, the Licensed Material if doing so restricts exercise of the Licensed Rights by any recipient of the Licensed Material. 32 | No endorsement. 
Nothing in this Public License constitutes or may be construed as permission to assert or imply that You are, or that Your use of the Licensed Material is, connected with, or sponsored, endorsed, or granted official status by, the Licensor or others designated to receive attribution as provided in Section 3(a)(1)(A)(i). 33 | Other rights. 34 | 35 | Moral rights, such as the right of integrity, are not licensed under this Public License, nor are publicity, privacy, and/or other similar personality rights; however, to the extent possible, the Licensor waives and/or agrees not to assert any such rights held by the Licensor to the limited extent necessary to allow You to exercise the Licensed Rights, but not otherwise. 36 | Patent and trademark rights are not licensed under this Public License. 37 | To the extent possible, the Licensor waives any right to collect royalties from You for the exercise of the Licensed Rights, whether directly or through a collecting society under any voluntary or waivable statutory or compulsory licensing scheme. In all other cases the Licensor expressly reserves any right to collect such royalties, including when the Licensed Material is used other than for NonCommercial purposes. 38 | Section 3 – License Conditions. 39 | 40 | Your exercise of the Licensed Rights is expressly made subject to the following conditions. 41 | 42 | Attribution. 
43 | 44 | If You Share the Licensed Material (including in modified form), You must: 45 | 46 | retain the following if it is supplied by the Licensor with the Licensed Material: 47 | identification of the creator(s) of the Licensed Material and any others designated to receive attribution, in any reasonable manner requested by the Licensor (including by pseudonym if designated); 48 | a copyright notice; 49 | a notice that refers to this Public License; 50 | a notice that refers to the disclaimer of warranties; 51 | a URI or hyperlink to the Licensed Material to the extent reasonably practicable; 52 | indicate if You modified the Licensed Material and retain an indication of any previous modifications; and 53 | indicate the Licensed Material is licensed under this Public License, and include the text of, or the URI or hyperlink to, this Public License. 54 | You may satisfy the conditions in Section 3(a)(1) in any reasonable manner based on the medium, means, and context in which You Share the Licensed Material. For example, it may be reasonable to satisfy the conditions by providing a URI or hyperlink to a resource that includes the required information. 55 | If requested by the Licensor, You must remove any of the information required by Section 3(a)(1)(A) to the extent reasonably practicable. 56 | If You Share Adapted Material You produce, the Adapter's License You apply must not prevent recipients of the Adapted Material from complying with this Public License. 57 | Section 4 – Sui Generis Database Rights. 
58 | 59 | Where the Licensed Rights include Sui Generis Database Rights that apply to Your use of the Licensed Material: 60 | 61 | for the avoidance of doubt, Section 2(a)(1) grants You the right to extract, reuse, reproduce, and Share all or a substantial portion of the contents of the database for NonCommercial purposes only; 62 | if You include all or a substantial portion of the database contents in a database in which You have Sui Generis Database Rights, then the database in which You have Sui Generis Database Rights (but not its individual contents) is Adapted Material; and 63 | You must comply with the conditions in Section 3(a) if You Share all or a substantial portion of the contents of the database. 64 | For the avoidance of doubt, this Section 4 supplements and does not replace Your obligations under this Public License where the Licensed Rights include other Copyright and Similar Rights. 65 | Section 5 – Disclaimer of Warranties and Limitation of Liability. 66 | 67 | Unless otherwise separately undertaken by the Licensor, to the extent possible, the Licensor offers the Licensed Material as-is and as-available, and makes no representations or warranties of any kind concerning the Licensed Material, whether express, implied, statutory, or other. This includes, without limitation, warranties of title, merchantability, fitness for a particular purpose, non-infringement, absence of latent or other defects, accuracy, or the presence or absence of errors, whether or not known or discoverable. Where disclaimers of warranties are not allowed in full or in part, this disclaimer may not apply to You. 
68 | To the extent possible, in no event will the Licensor be liable to You on any legal theory (including, without limitation, negligence) or otherwise for any direct, special, indirect, incidental, consequential, punitive, exemplary, or other losses, costs, expenses, or damages arising out of this Public License or use of the Licensed Material, even if the Licensor has been advised of the possibility of such losses, costs, expenses, or damages. Where a limitation of liability is not allowed in full or in part, this limitation may not apply to You. 69 | The disclaimer of warranties and limitation of liability provided above shall be interpreted in a manner that, to the extent possible, most closely approximates an absolute disclaimer and waiver of all liability. 70 | Section 6 – Term and Termination. 71 | 72 | This Public License applies for the term of the Copyright and Similar Rights licensed here. However, if You fail to comply with this Public License, then Your rights under this Public License terminate automatically. 73 | Where Your right to use the Licensed Material has terminated under Section 6(a), it reinstates: 74 | 75 | automatically as of the date the violation is cured, provided it is cured within 30 days of Your discovery of the violation; or 76 | upon express reinstatement by the Licensor. 77 | For the avoidance of doubt, this Section 6(b) does not affect any right the Licensor may have to seek remedies for Your violations of this Public License. 78 | For the avoidance of doubt, the Licensor may also offer the Licensed Material under separate terms or conditions or stop distributing the Licensed Material at any time; however, doing so will not terminate this Public License. 79 | Sections 1, 5, 6, 7, and 8 survive termination of this Public License. 80 | Section 7 – Other Terms and Conditions. 81 | 82 | The Licensor shall not be bound by any additional or different terms or conditions communicated by You unless expressly agreed. 
83 | Any arrangements, understandings, or agreements regarding the Licensed Material not stated herein are separate from and independent of the terms and conditions of this Public License. 84 | Section 8 – Interpretation. 85 | 86 | For the avoidance of doubt, this Public License does not, and shall not be interpreted to, reduce, limit, restrict, or impose conditions on any use of the Licensed Material that could lawfully be made without permission under this Public License. 87 | To the extent possible, if any provision of this Public License is deemed unenforceable, it shall be automatically reformed to the minimum extent necessary to make it enforceable. If the provision cannot be reformed, it shall be severed from this Public License without affecting the enforceability of the remaining terms and conditions. 88 | No term or condition of this Public License will be waived and no failure to comply consented to unless expressly agreed to by the Licensor. 89 | Nothing in this Public License constitutes or may be interpreted as a limitation upon, or waiver of, any privileges and immunities that apply to the Licensor or You, including from the legal processes of any jurisdiction or authority. 90 | -------------------------------------------------------------------------------- /PUBLICITY.md: -------------------------------------------------------------------------------- 1 | # Facetweet announcement 2 | 3 | Learn how to analyze your datasets in R! [insert link here](https://youtu.be/dQw4w9WgXcQ) 4 | 5 | # Information for calendar 6 | 7 | Each class in the workshop series lasts 3 hours. 8 | 9 | # Descriptions for website 10 | 11 | ## Header 12 | 13 | **title** : R for Data Science 14 | 15 | **description** : The R for Data Science workshop series is a four-part course designed to take novices in the R language for statistical computing and produce programmers who are competent in finding, displaying, analyzing, and publishing data in R. 
16 | 17 | ## Part 1 18 | 19 | **subtitle** : Basics of R 20 | 21 | **description** : Students will understand the motivation behind object orientation, and how that relates to computation. Students will be able to perform basic functions in R necessary to use the software on their computers and conduct basic arithmetic. Students will understand data types and data structures, and why and how they are different from each other. 22 | 23 | **knowledge requirements** : [Programming Fun!damentals](https://github.com/dlab-berkeley/programming-fundamentals), or equivalent prior knowledge 24 | 25 | **tech requirements** : Laptop required; please install R version 3.2 or greater in advance (University laptops will need to have R installed by an administrator); the RStudio IDE is recommended but not required 26 | 27 | ## Part 2 28 | 29 | **subtitle** : Clean and tidy data 30 | 31 | **description** : Students will be introduced to DRY principles and best practices for sanitizing and tidying data. Students will learn what missingness is, and how best to accommodate missing data in their research designs. Students will be able to read in files from disk or a database, clean the data found within them, select specific data from them, and merge them with other datasets. 32 | 33 | **knowledge requirements** : R-for-Data-Science Part 1 or equivalent prior knowledge 34 | 35 | **tech requirements** : Laptop required; please install R version 3.2 or greater in advance (University laptops will need to have R installed by an administrator); the RStudio IDE is recommended but not required 36 | 37 | ## Part 3 38 | 39 | **subtitle** : Analyzing data 40 | 41 | **description** : Students will be introduced to the principles behind the grammar of graphics and the general linear model. Students will understand the implementation of plotting in R. Students will be able to explore, summarize, and analyze data using R's implementation of exploratory and inferential data analysis. 
42 | 43 | **knowledge requirements** : R-for-Data-Science Part 2 or equivalent prior knowledge 44 | 45 | **tech requirements** : Laptop required; please install R version 3.2 or greater in advance (University laptops will need to have R installed by an administrator); the RStudio IDE is recommended but not required 46 | 47 | ## Part 4 48 | 49 | **subtitle** : Functions and packages 50 | 51 | **description** : Students will be introduced to the principles behind functional programming. Students will learn how to write and import functions, add looped and vectorized computation to their functions, and control the flow of data through a function. Students will understand the basics of namespaces, and how they relate to assigning values within functions. Students will see how to successfully package a function for CRAN. 52 | 53 | **knowledge requirements** : R-for-Data-Science Part 2 or equivalent prior knowledge 54 | 55 | **tech requirements** : Laptop required; please install R version 3.2 or greater in advance (University laptops will need to have R installed by an administrator); the RStudio IDE is recommended but not required 56 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Materials for D-Lab's R for Data Science 3 | author: Dillon Niederhut 4 | --- 5 | 6 | This repository contains the instructor materials for the D-Lab's R intensive. 7 | 8 | ## If you are a student: 9 | 10 | You can download the contents of this repository with: 11 | 12 | ``` 13 | git clone https://github.com/dlab-berkeley/r-for-data-science.git 14 | ``` 15 | 16 | or, by clicking the "Download Zip" button and then extracting the `.zip` file. 17 | 18 | The instructor of this workshop series will lead you through the activities for each day. 
19 | 20 | ## If you are a D-Lab instructor 21 | 22 | You'll see accumulated teaching notes and examples for each day's topics in the instructor folder. For your convenience, these are available as .Rmd, commented .R files, PDF documents, and HTML slides. The meta-document for this workshop series, which explains the logic behind the structure and topics, can be viewed [at the D-Lab guides repository](https://github.com/dlab-berkeley/guides/blob/master/r.pdf). 23 | 24 | For information on contributing to this repository, see `CONTRIBUTING.md`. 25 | 26 | ## If you are a D-Lab facilitator 27 | 28 | The standard Drupal workshop descriptions and facetweet postings for this workshop series are in `PUBLICITY.md`. 29 | 30 | ## Description 31 | 32 | * `data/` : data necessary for interactive coding examples 33 | * `examples/` 34 | * `save_console_output.R` : R code for saving console output to pdf 35 | * `instructor/` : teaching notes 36 | * `scripts/` 37 | * `feedback_cleaner.R` : used to clean data for use in Day 3 38 | * `regenerate_files.R` : for regenerating `.R` and `.pdf` files from `.Rmd` 39 | 40 | ## Topics: 41 | 42 | This workshop series covers: 43 | 44 | 1. Interacting with R 45 | 2. Data types 46 | 3. Data structures 47 | 4. Reading data 48 | 5. Sanitizing data 49 | 6. Missing data 50 | 7. Reshaping data 51 | 8. Summary statistics 52 | 9. Plotting 53 | 10. Linear models 54 | 11. Non-parametric models 55 | 12. Functions 56 | 13. Loops 57 | 14. Parallelization 58 | 15. 
Packages 59 | 60 | ## Libraries 61 | 62 | This workshop uses the following packages: 63 | 64 | * Amelia 65 | * devtools 66 | * dplyr 67 | * foreign 68 | * ggplot2 69 | * parallelMap 70 | * RCurl 71 | * roxygen2 72 | * stringr 73 | * tidyr 74 | * XML 75 | 76 | --- 77 | _D-Lab == Data Intensive Social Science, For All!_ 78 | -------------------------------------------------------------------------------- /data/cpds_excel_new.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/R-for-Data-Science/798d5b220d66afea5644a245c7327e4bb740039a/data/cpds_excel_new.xlsx -------------------------------------------------------------------------------- /data/cpds_stata.dta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/R-for-Data-Science/798d5b220d66afea5644a245c7327e4bb740039a/data/cpds_stata.dta -------------------------------------------------------------------------------- /data/dirty.csv: -------------------------------------------------------------------------------- 1 | Timestamp,How tall are you?,What department are you in?,Are you currently enrolled?,What is your birth order? 
2 | 7/25/2015 10:08:41,very,Geology ,Yes,1 3 | 7/25/2015 10:10:56,70,999,Yes,1 4 | 7/25/2015 10:11:20,5'9, geology,999,2 5 | 7/25/2015 10:11:25,2.1,goelogy,No,"9,000" 6 | 7/25/2015 10:11:29,156,anthro,999,2 7 | -------------------------------------------------------------------------------- /data/feedback.Rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/R-for-Data-Science/798d5b220d66afea5644a245c7327e4bb740039a/data/feedback.Rda -------------------------------------------------------------------------------- /data/large.csv: -------------------------------------------------------------------------------- 1 | "a","b","c" 2 | 1.3581505173944,501.358150517394,-249998.641849483 3 | 11.3577147595264,510.357714759526,-248989.64228524 4 | -8.95555545612576,489.044444543874,-248012.955555456 5 | 1.88114421744398,498.881144217444,-247007.118855783 6 | 0.543663168950277,496.54366316895,-246015.456336831 7 | 3.85425376063794,498.854253760638,-245021.145746239 8 | -11.6013517930778,482.398648206922,-244047.601351793 9 | 20.2670228311676,513.267022831168,-243028.732977169 10 | -7.18780058689817,484.812199413102,-242071.187800587 11 | 6.42867292162213,497.428672921622,-241074.571327078 12 | -4.26342063090416,485.736579369096,-240104.263420631 13 | -4.62510870047102,484.374891299529,-239125.6251087 14 | 21.8557020064228,509.855702006423,-238122.144297994 15 | -5.15416757671307,481.845832423287,-237174.154167577 16 | 2.78173677952288,488.781736779523,-236193.21826322 17 | -7.39782232223834,477.602177677762,-235232.397822322 18 | 1.37565518535506,485.375655185355,-234254.624344815 19 | -1.90923527346237,481.090764726538,-233290.909235273 20 | -8.94943767253883,473.050562327461,-232332.949437673 21 | -15.9737723743721,465.026227625628,-231376.973772374 22 | -7.51565031035233,472.484349689648,-230407.51565031 23 | 6.94587168391045,485.94587168391,-229434.054128316 24 | 
-1.0496465784606,476.950353421539,-228485.049646578 25 | 17.0179224970545,494.017922497054,-227511.982077503 26 | 0.987848544058621,476.987848544059,-226575.012151456 27 | -1.8855214434642,473.114478556536,NA 28 | -2.89112302138148,471.108876978619,-224678.891123021 29 | -8.88373788763536,464.116262112365,NA 30 | 17.8041175915493,489.804117591549,-222766.195882408 31 | -5.12198149627124,465.878018503729,-221846.121981496 32 | 2.0005109528654,472.000510952865,-220897.999489047 33 | 5.53018496290098,474.530184962901,-219955.469815037 34 | 3.56393288323932,471.563932883239,-219020.436067117 35 | 12.2533993964188,479.253399396419,-218076.746600604 36 | 4.9746350696378,470.974635069638,-217151.02536493 37 | 9.48818384641285,NA,-216215.511816154 38 | 16.1634219553809,480.163421955381,-215279.836578045 39 | 3.24632573187004,466.24632573187,-214365.753674268 40 | -3.86165289433562,458.138347105664,-213447.861652894 41 | -3.83930087880723,457.160699121193,-212524.839300879 42 | -1.46552787866293,458.534472121337,-211601.465527879 43 | 10.06325345765,469.06325345765,-210670.936746542 44 | 8.7969736722717,466.796973672272,-209755.203026328 45 | 3.04064737136917,460.040647371369,-208845.959352629 46 | -1.50043739837891,454.499562601621,-207937.500437398 47 | 13.5331566888627,468.533156688863,-207011.466843311 48 | -12.0613500909166,441.938649909083,-206128.061350091 49 | -11.5439925079878,441.456007492012,-205220.543992508 50 | -0.376125115071009,451.623874884929,-204304.376125115 51 | -6.06441822918584,444.935581770814,-203407.064418229 52 | -5.88246018867732,444.117539811323,-202505.882460189 53 | -24.5275887312936,424.472411268706,NA 54 | 20.4240181837923,468.424018183792,-200683.575981816 55 | 7.25011433950764,454.250114339508,-199801.74988566 56 | -3.73667212877388,442.263327871226,-198919.736672129 57 | 3.93420188883161,448.934201888832,-198021.065798111 58 | 8.12932300409276,452.129323004093,-197127.870676996 59 | 13.0478718418723,456.047871841872,-196235.952128158 60 | 
9.15726161508085,451.157261615081,-195354.842738385 61 | -2.47948906202933,438.520510937971,-194483.479489062 62 | 12.6907600620263,452.690760062026,-193587.309239938 63 | 15.7780182194041,454.778018219404,-192705.221981781 64 | -19.6285670734271,418.371432926573,-191863.628567073 65 | -5.55020458503775,431.449795414962,-190974.550204585 66 | -0.311818735355713,435.688181264644,-190096.311818735 67 | 8.56427052865354,443.564270528654,-189216.435729471 68 | 7.63675361892641,441.636753618926,-188348.363246381 69 | 6.94918984307691,439.949189843077,-187482.050810157 70 | 14.2088859180484,446.208885918048,-186609.791114082 71 | -24.9520663587553,406.047933641245,-185785.952066359 72 | 2.58929947743396,432.589299477434,NA 73 | -11.4241154982323,417.575884501768,NA 74 | 3.55993302857193,431.559933028572,-183180.440066971 75 | 10.9564846163327,437.956484616333,-182318.043515384 76 | 0.114643121028011,426.114643121028,-181475.885356879 77 | -10.162935627928,414.837064372072,-180635.162935628 78 | 8.07539286491372,432.075392864914,-179767.924607135 79 | -16.2439915645735,406.756008435427,-178945.243991565 80 | -9.49277709185509,412.507222908145,-178093.492777092 81 | 0.421844602169055,421.421844602169,-177240.578155398 82 | -4.2515981402115,415.748401859788,-176404.25159814 83 | -1.05328824784313,417.946711752157,-175562.053288248 84 | 1.16593338025192,419.165933380252,-174722.83406662 85 | 22.4791745416699,439.47917454167,-173866.520825458 86 | -6.034485578695,409.965514421305,-173062.034485579 87 | -5.82601358372921,409.173986416271,-172230.826013584 88 | -5.96213258127571,408.037867418724,-171401.962132581 89 | 3.94071976661435,416.940719766614,-170565.059280233 90 | -11.0222372863317,400.977762713668,-169755.022237286 91 | 1.64679675504341,412.646796755043,-168919.353203245 92 | 14.9567754592232,424.956775459223,-168085.043224541 93 | 13.8226435741203,422.82264357412,-167267.177356426 94 | 5.28340538463221,413.283405384632,-166458.716594615 95 | 
-8.78828990075244,398.211710099248,-165657.788289901 96 | 3.97265442085011,409.97265442085,-164832.027345579 97 | 11.9293646479669,416.929364647967,-164013.070635352 98 | -10.3330482666406,393.666951733359,-163226.333048267 99 | 12.8464360814843,415.846436081484,-162396.153563919 100 | 5.89441807362869,407.894418073629,-161598.105581926 101 | 3.65497768135184,404.654977681352,-160797.345022319 102 | 8.95499363626723,408.954993636267,-159991.045006364 103 | -21.7021552310344,377.297844768966,-159222.702155231 104 | -0.0465940891250309,397.953405910875,-158404.046594089 105 | -6.5291574157617,390.470842584238,-157615.529157416 106 | 10.5230033415697,406.52300334157,-156805.476996658 107 | -2.80025676469946,392.199743235301,-156027.800256765 108 | -8.48969709317775,385.510302906822,-155244.489697093 109 | 2.3950835264703,395.39508352647,-154446.604916474 110 | 1.84016651556131,393.840166515561,NA 111 | 5.94263722106124,396.942637221061,-152875.057362779 112 | 10.4433938638096,400.44339386381,-152089.556606136 113 | 0.744670766355569,389.744670766356,-151320.255329234 114 | 14.3721987516088,402.372198751609,-150529.627801248 115 | 7.10235444387405,394.102354443874,-149761.897645556 116 | 19.6421087313306,405.642108731331,NA 117 | 8.06146243489221,393.061462434892,-148216.938537565 118 | -11.3270947858309,372.672905214169,-147467.327094786 119 | 2.34007394825573,385.340073948256,-146686.659926052 120 | -13.1993586548931,368.800641345107,-145937.199358655 121 | 3.68431685168775,384.684316851688,-145157.315683148 122 | -17.6178564660341,362.382143533966,-144417.617856466 123 | 6.98194349085335,385.981943490853,-143634.018056509 124 | -6.97575154559194,371.024248454408,-142890.975751546 125 | 12.2225257479955,NA,-142116.777474252 126 | 10.2589828233097,386.25898282331,-141365.741017177 127 | -13.875818168771,361.124181831229,-140638.875818169 128 | 13.7507811760448,387.750781176045,-139862.249218824 129 | 0.624879444184843,373.624879444185,-139128.375120556 130 | 
-14.4649047957753,357.535095204225,-138398.464904796 131 | 12.9804428894361,383.980442889436,-137628.019557111 132 | -10.8382172957735,359.161782704227,-136910.838217296 133 | NA,378.041613293329,-136151.958386707 134 | -3.16270014690147,364.837299853099,-135427.162700147 135 | 1.0349482916108,368.034948291611,-134687.965051708 136 | 16.9455952677516,382.945595267752,-133939.054404732 137 | -18.7186518874281,346.281348112572,-133243.718651887 138 | -1.79158948488302,362.208410515117,-132497.791589485 139 | -12.377784917909,350.622215082091,-131781.377784918 140 | -12.5391519130401,349.46084808696,-131056.539151913 141 | 0.274063431279612,361.27406343128,-130320.725936569 142 | -19.2248201730858,340.775179826914,-129619.224820173 143 | 3.44442622462494,362.444426224625,-128877.555573775 144 | 12.2101150683734,370.210115068373,-128151.789884932 145 | -10.0779610901819,346.922038909818,-127459.07796109 146 | -0.985421629741494,355.014578370259,-126736.98542163 147 | 3.25122710133514,358.251227101335,-126021.748772899 148 | -25.637721335228,328.362278664772,-125341.637721335 149 | -10.818588628341,342.181411371659,-124619.818588628 150 | -1.48470852792634,350.515291472074,-123905.484708528 151 | -16.5626687750116,334.437331224988,-123217.562668775 152 | 3.19189092669998,353.1918909267,-122496.808109073 153 | 0.478583427057276,349.478583427057,-121800.521416573 154 | -3.80458463516452,NA,-121107.804584635 155 | -16.4373475020683,330.562652497932,-120425.437347502 156 | 9.05836957054321,355.058369570543,-119706.941630429 157 | 1.73534072733478,346.735340727335,-119023.264659273 158 | -15.9561062275161,328.043893772484,-118351.956106228 159 | -14.524254590909,328.475745409091,-117663.524254591 160 | -15.037525782204,326.962474217796,-116979.037525782 161 | 1.26624271455263,342.266242714553,-116279.733757285 162 | -11.6363827081937,328.363617291806,-115611.636382708 163 | -3.38212909727806,335.617870902722,-114924.382129097 164 | 
-23.1645260631891,314.835473936811,-114267.164526063 165 | -4.37974698789867,332.620253012101,-113573.379746988 166 | 7.24039490212025,343.24039490212,-112888.759605098 167 | 1.63626940133176,336.636269401332,-112223.363730599 168 | -0.432433557677967,333.567566442322,-111556.432433558 169 | 7.90485939830822,NA,-110881.095140602 170 | -0.734809945897447,331.265190054103,-110224.734809946 171 | -7.73139589131911,323.268604108681,-109568.731395891 172 | 5.61362209303005,335.61362209303,-108894.386377907 173 | 1.33569775999879,330.335697759999,-108239.66430224 174 | -1.64918391715711,326.350816082843,-107585.649183917 175 | 6.45429657809836,333.454296578098,-106922.545703422 176 | 19.4910265703386,345.491026570339,-106256.50897343 177 | 1.09316608428602,326.093166084286,-105623.906833916 178 | -3.21363459186323,NA,-104979.213634592 179 | -0.63513754055205,322.364862459448,-104329.635137541 180 | 16.0851609846775,338.085160984677,-103667.914839015 181 | -0.903742272269506,320.096257727731,-103041.903742272 182 | 1.78209411530552,321.782094115306,-102398.217905885 183 | NA,314.134678184772,-101765.865321815 184 | NA,328.386139150149,NA 185 | 4.29172055786855,321.291720557869,-100484.708279442 186 | 0.133336083210772,316.133336083211,-99855.8666639168 187 | 0.0873211432145336,315.087321143215,-99224.9126788568 188 | 4.34703393065037,318.34703393065,-98591.6529660693 189 | -6.54732353217941,306.452676467821,-97975.5473235322 190 | 5.44633583461175,317.446335834612,-97338.5536641654 191 | -3.66377027820612,307.336229721794,-96724.6637702782 192 | -8.61048934646851,301.389510653531,-96108.6104893465 193 | -3.31471289752472,305.685287102475,-95484.3147128975 194 | 6.21570957381675,314.215709573817,-94857.7842904262 195 | 5.18041185382813,312.180411853828,-94243.8195881462 196 | -9.49900695685376,296.500993043146,-93645.4990069569 197 | 15.279601807336,320.279601807336,-93009.7203981927 198 | 8.70443551911112,312.704435519111,-92407.2955644809 199 | 
10.3574561487484,313.357456148748,-91798.6425438513 200 | 2.6052659503092,304.605265950309,-91201.3947340497 201 | -5.1110467554415,295.888953244558,-90606.1110467554 202 | 6.5558265114815,306.555826511481,-89993.4441734885 203 | -3.43073719720164,295.569262802798,-89404.4307371972 204 | -8.93689975442135,289.063100245579,-88812.9368997544 205 | -4.24060225239137,292.759397747609,-88213.2406022524 206 | -3.11811108607603,292.881888913924,-87619.1181110861 207 | -8.29064935639284,286.709350643607,-87033.2906493564 208 | -23.758635412573,270.241364587427,-86459.7586354126 209 | -12.313523713606,280.686476286394,-85861.3135237136 210 | 3.38702450698695,295.387024506987,-85260.612975493 211 | 13.9360535868532,304.936053586853,-84667.0639464132 212 | 17.2538049595115,307.253804959512,-84082.7461950405 213 | 0.995896257700937,289.995896257701,-83520.0041037423 214 | 5.13345807824215,293.133458078242,-82938.8665419218 215 | 8.2487040867109,295.248704086711,NA 216 | 4.70727953512302,290.707279535123,-81791.2927204649 217 | -1.44598401603106,283.554015983969,-81226.445984016 218 | -22.7231102955257,261.276889704474,-80678.7231102955 219 | 0.859360245326562,283.859360245327,-80088.1406397547 220 | 0.802281751053773,282.802281751054,-79523.1977182489 221 | 15.5819596590036,296.581959659004,-78945.418040341 222 | -5.8000887662359,274.199911233764,-78405.8000887662 223 | -13.1392425826392,265.860757417361,-77854.1392425826 224 | 1.76570103090634,279.765701030906,-77282.2342989691 225 | -20.0934880832453,256.906511916755,NA 226 | 4.58734284097442,280.587342840974,-76171.412657159 227 | 2.84741016364773,277.847410163648,-75622.1525898363 228 | -18.8704820563166,255.129517943683,-75094.8704820563 229 | 3.66064975731091,276.660649757311,-74525.3393502427 230 | -5.34710873154519,266.652891268455,-73989.3471087315 231 | -12.6754149821461,258.324585017854,-73453.6754149821 232 | 4.55624068166475,274.556240681665,-72895.4437593183 233 | 
-4.65331263851466,264.346687361485,-72365.6533126385 234 | -5.27088777964937,262.729112220351,-71829.2708877797 235 | 0.528830506687434,267.528830506687,-71288.4711694933 236 | 7.03591033801163,273.035910338012,NA 237 | 12.8776181063941,277.877618106394,-70212.1223818936 238 | 11.3783555537835,275.378355553783,-69684.6216444462 239 | 3.79947253499103,266.799472534991,-69165.200527465 240 | 0.224453283618614,262.224453283619,-68643.7755467164 241 | -3.19817766025074,257.801822339749,-68124.1981776602 242 | 0.296041237584306,260.296041237584,-67599.7039587624 243 | -9.42430707567786,249.575692924322,-67090.4243070757 244 | 10.0490949471492,268.049094947149,-66553.9509050529 245 | 9.23150099378385,266.231500993784,NA 246 | -1.58419908270131,254.415800917299,-65537.5841990827 247 | -3.04195775027664,251.958042249723,-65028.0419577503 248 | -1.95449641955894,252.045503580441,-64517.9544964196 249 | -22.893747890277,230.106252109723,-64031.8937478903 250 | 5.43519268017744,257.435192680177,-63498.5648073198 251 | -5.00142003779859,245.998579962201,-63006.0014200378 252 | -11.9791437207358,238.020856279264,-62511.9791437207 253 | 23.3182556005606,272.318255600561,-61977.6817443994 254 | 7.03066620593532,255.030666205935,-61496.9693337941 255 | 6.47235569352978,253.47235569353,-61002.5276443065 256 | 5.66791131677032,251.66791131677,-60510.3320886832 257 | -4.62140329595731,240.378596704043,-60029.621403296 258 | 6.38505543821194,250.385055438212,-59529.6149445618 259 | -7.17921392946991,235.82078607053,-59056.1792139295 260 | 17.5012593363778,259.501259336378,-58546.4987406636 261 | 3.36646517152614,244.366465171526,-58077.6335348285 262 | 2.37209035036224,242.372090350362,-57597.6279096496 263 | -3.63559175149936,235.364408248501,-57124.6355917515 264 | -18.7297112949892,219.270288705011,-56662.729711295 265 | -1.40343970042959,235.59656029957,-56170.4034397004 266 | 3.67179586111382,239.671795861114,NA 267 | -15.1853327173846,219.814667282615,-55240.1853327174 268 | 
-1.60334987916399,232.396650120836,-54757.6033498792 269 | 0.029548906663531,233.029548906664,-54288.9704510933 270 | -1.11620163225631,230.883798367744,-53825.1162016323 271 | -0.326958730783566,230.673041269216,-53361.3269587308 272 | -1.06581810028232,228.934181899718,-52901.0658181003 273 | 14.3320700291467,243.332070029147,-52426.6679299709 274 | -8.44982787552532,219.550172124475,-51992.4498278755 275 | 9.59560490746185,236.595604907462,-51519.4043950925 276 | 20.4538062052741,246.453806205274,-51055.5461937947 277 | 10.2030650954982,235.203065095498,-50614.7969349045 278 | 1.47126327267434,225.471263272674,-50174.5287367273 279 | -17.0331031304331,205.966896869567,-49746.0331031304 280 | -15.5183030772447,NA,-49299.5183030772 281 | 31.1318937478317,252.131893747832,-48809.8681062522 282 | -3.65548932014268,216.344510679857,NA 283 | 2.40252172903538,221.402521729035,-47958.597478271 284 | 2.39534280827645,220.395342808276,-47521.6046571917 285 | 11.0029749563063,228.002974956306,-47077.9970250437 286 | 6.46609680545292,222.466096805453,-46649.5339031945 287 | -13.4011616297204,201.59883837028,-46238.4011616297 288 | 8.86814829360056,NA,-45787.1318517064 289 | NA,213.930286454016,-45368.069713546 290 | 20.4191875354118,232.419187535412,-44923.5808124646 291 | 2.75061098332672,NA,-44518.2493890167 292 | 8.56842484412152,NA,-44091.4315751559 293 | -20.9527390650258,188.047260934974,-43701.952739065 294 | 4.73427599901052,212.734275999011,-43259.265724001 295 | 4.02332142066545,211.023321420665,-42844.9766785793 296 | -9.22133296383933,196.778667036161,-42445.2213329638 297 | -14.9576383118771,190.042361688123,-42039.9576383119 298 | 8.43692906973503,212.436929069735,-41607.5630709303 299 | NA,214.03360904411,-41197.9663909559 300 | 16.8659874632013,218.865987463201,-40787.1340125368 301 | 17.7954314540345,NA,-40383.204568546 302 | 9.24622601053251,209.246226010533,NA 303 | -8.19781490118796,190.802185098812,-39609.1978149012 304 | 
-5.98068619156291,192.019313808437,-39209.9806861916 305 | -12.4864088577363,184.513591142264,-38821.4864088577 306 | 8.89014654703184,204.890146547032,-38407.109853453 307 | 5.27474725391051,200.274747253911,-38019.7252527461 308 | 10.1806699832168,204.180669983217,-37625.8193300168 309 | 1.48411543672474,194.484115436725,-37247.5158845633 310 | -13.6219503956211,178.378049604379,-36877.6219503956 311 | 8.88100039962013,199.88100039962,-36472.1189996004 312 | 4.6247137590008,194.624713759001,-36095.375286241 313 | NA,185.673559357793,-35724.3264406422 314 | -11.4420004002092,NA,-35355.4420004002 315 | 0.641291339366978,187.641291339367,-34968.3587086606 316 | -18.0576638578454,167.942336142155,-34614.0576638578 317 | NA,193.218456689476,-34216.7815433105 318 | 12.4543075843774,196.454307584377,-33843.5456924156 319 | 13.0025523046514,196.002552304651,-33475.9974476954 320 | 8.04287570242736,190.042875702427,-33115.9571242976 321 | -6.73429938840597,174.265700611594,-32767.7342993884 322 | -5.17704201004565,174.822957989954,-32405.17704201 323 | -3.19479066115539,175.805209338845,-32044.1947906612 324 | -8.18100502406904,169.818994975931,-31692.1810050241 325 | -25.6335505740385,151.366449425961,-31354.633550574 326 | -12.2500774036384,163.749922596362,-30988.2500774036 327 | 0.23338722026117,175.233387220261,-30624.7666127797 328 | 7.08553323356053,181.085533233561,-30268.9144667664 329 | 1.77768610026373,174.777686100264,-29927.2223138997 330 | -5.33249583799775,166.667504162002,NA 331 | -3.52194231506352,167.478057684936,-29244.5219423151 332 | -9.676389714038,160.323610285962,-28909.676389714 333 | 22.2106747102422,191.210674710242,-28538.7893252898 334 | 0.332343005686162,168.332343005686,-28223.6676569943 335 | -0.0560260543282312,166.943973945672,-27889.0560260543 336 | -18.0410281792819,147.958971820718,-27574.0410281793 337 | -17.601027250371,147.398972749629,-27242.6010272504 338 | 6.43005197028487,170.430051970285,-26889.5699480297 339 | 
14.1323386317504,177.13233863175,-26554.8676613683 340 | 21.2873949643941,183.287394964394,-26222.7126050356 341 | -6.83663368790321,154.163366312097,-25927.8366336879 342 | -1.1782097718109,158.821790228189,-25601.1782097718 343 | -0.967528004898314,158.032471995102,-25281.9675280049 344 | 10.7486764506202,168.74867645062,-24953.2513235494 345 | 1.45576595822401,158.455765958224,-24647.5442340418 346 | NA,159.13176899058,-24332.8682310094 347 | 6.35816569334053,161.358165693341,-24018.6418343067 348 | -13.6574513827807,140.342548617219,-23729.6574513828 349 | 4.14899890701915,157.148998907019,-23404.851001093 350 | 9.0120315351118,NA,-23094.9879684649 351 | 9.27812603090718,160.278126030907,-22791.7218739691 352 | 14.3990910293475,164.399091029347,-22485.6009089707 353 | -15.9784984708197,133.02150152918,-22216.9784984708 354 | -16.7792699850092,131.220730014991,-21920.779269985 355 | 6.98176781659272,153.981767816593,-21602.0182321834 356 | -8.30215566870221,137.697844331298,-21324.3021556687 357 | -4.02164670149323,140.978353298507,-21029.0216467015 358 | -6.50691833620289,137.493081663797,-20742.5069183362 359 | -8.02690938506061,134.973090614939,-20457.0269093851 360 | 4.64106689991784,146.641066899918,-20159.3589331001 361 | -0.718788040803492,140.281211959196,-19881.7187880408 362 | -8.86211612391727,131.137883876083,-19608.8621161239 363 | -18.5922988250833,120.407701174917,-19339.5922988251 364 | -1.63482796664884,136.365172033351,-19045.6348279666 365 | 5.1240184462262,142.124018446226,-18763.8759815538 366 | 11.168665627226,147.168665627226,-18484.8313343728 367 | 8.23963321753628,143.239633217536,-18216.7603667825 368 | 5.93899982091755,139.938999820918,-17950.0610001791 369 | -4.25037313302133,128.749626866979,-17693.250373133 370 | 1.81364633207338,133.813646332073,-17422.1863536679 371 | -2.75119927479544,128.248800725205,-17163.7511992748 372 | 0.36468447141091,130.364684471411,-16899.6353155286 373 | 
0.451347581596385,129.451347581596,-16640.5486524184 374 | -5.50722288782811,122.492777112172,-16389.5072228878 375 | 3.00868935392603,130.008689353926,-16125.9913106461 376 | 6.03597924680919,132.035979246809,-15869.9640207532 377 | 4.24359404217288,129.243594042173,-15620.7564059578 378 | NA,119.28087656221,-15380.7191234378 379 | 14.5361346215774,137.536134621577,-15114.4638653784 380 | 10.3467344217592,132.346734421759,-14873.6532655782 381 | 13.830327666598,134.830327666598,-14627.1696723334 382 | 7.2333729202237,127.233372920224,NA 383 | -5.93735656647783,113.062643433522,-14166.9373565665 384 | -9.87436084007066,108.125639159929,-13933.8743608401 385 | 8.8886476603226,125.888647660323,-13680.1113523397 386 | 4.60188231629073,120.601882316291,-13451.3981176837 387 | 7.72784218487801,122.727842184878,-13217.2721578151 388 | 13.5189328485284,127.518932848528,-12982.4810671515 389 | 1.38942330585812,114.389423305858,-12767.6105766941 390 | -3.97443899659605,108.025561003404,-12547.9744389966 391 | 20.6214498475297,131.62144984753,-12300.3785501525 392 | -17.5891438162142,92.4108561837858,-12117.5891438162 393 | 19.5065977801116,128.506597780112,-11861.4934022199 394 | -4.84964593084772,103.150354069152,-11668.8496459308 395 | -1.38744055675105,105.612559443249,-11450.3874405568 396 | -16.2906387836688,89.7093612163312,-11252.2906387837 397 | NA,NA,-11012.3907875452 398 | -14.1077439191535,89.8922560808465,-10830.1077439192 399 | -19.515255667366,83.484744332634,-10628.5152556674 400 | 5.22463500903376,107.224635009034,-10398.775364991 401 | 1.99549278023119,102.995492780231,-10199.0045072198 402 | 5.77574660328523,105.775746603285,-9994.22425339671 403 | NA,83.7973763071257,-9816.20262369287 404 | -6.16303165707752,91.8369683429225,-9610.16303165708 405 | 4.01609142850684,101.016091428507,-9404.98390857149 406 | -6.35258227266276,NA,-9222.35258227266 407 | -18.4550665848532,76.5449334151468,-9043.45506658485 408 | 
-2.02469673086054,91.9753032691395,-8838.02469673086 409 | -0.50630363624899,NA,-8649.50630363625 410 | 12.5306150732288,104.530615073229,-8451.46938492677 411 | -22.0974053309113,68.9025946690887,-8303.09740533091 412 | -19.820336271728,70.179663728272,-8119.82033627173 413 | 1.78896999200279,90.7889699920028,-7919.211030008 414 | -3.94451017471007,84.0554898252899,-7747.94451017471 415 | 4.89645642127055,91.8964564212706,-7564.10354357873 416 | 1.57748485101347,87.5774848510135,-7394.42251514899 417 | 1.84230346206637,86.8423034620664,-7223.15769653793 418 | 3.32933460843148,87.3293346084315,-7052.67066539157 419 | -24.1637096846804,58.8362903153196,-6913.16370968468 420 | -13.0541307351287,68.9458692648713,-6737.05413073513 421 | -3.35865946700822,77.6413405329918,-6564.35865946701 422 | 9.81587028777839,89.8158702877784,-6390.18412971222 423 | -7.61846407626734,71.3815359237327,-6248.61846407627 424 | 2.78964019452356,80.7896401945236,-6081.21035980548 425 | -5.27193577646954,71.7280642235305,-5934.27193577647 426 | 1.50240709409589,77.5024070940959,-5774.4975929059 427 | -16.0884527821159,58.9115472178841,-5641.08845278212 428 | 15.4292962206608,89.4292962206608,-5460.57070377934 429 | 3.25879534621418,76.2587953462142,-5325.74120465379 430 | 8.98841790279139,80.9884179027914,-5175.01158209721 431 | NA,56.8535045569069,-5055.14649544309 432 | 2.26963288868339,72.2696328886834,-4897.73036711132 433 | -0.789822202864098,68.2101777971359,-4761.78982220286 434 | -6.57361163315829,61.4263883668417,-4630.57361163316 435 | 3.72709697854662,70.7270969785466,-4485.27290302145 436 | 2.09503038825283,68.0950303882528,-4353.90496961175 437 | -8.98606592306685,56.0139340769332,-4233.98606592307 438 | -8.07371441570947,55.9262855842905,-4104.07371441571 439 | 0.154683117395073,63.1546831173951,-3968.8453168826 440 | -1.24350523171531,60.7564947682847,-3845.24350523172 441 | 1.19185092761792,62.1918509276179,-3719.80814907238 442 | 
12.6537474269021,72.6537474269021,-3587.3462525731 443 | 16.9815130472735,75.9815130472735,-3464.01848695273 444 | 7.22378362786968,65.2237836278697,-3356.77621637213 445 | -2.48117470865023,54.5188252913498,-3251.48117470865 446 | 2.9185185132886,58.9185185132886,-3133.08148148671 447 | 8.19481917158294,63.1948191715829,-3016.80518082842 448 | 9.79612364010528,63.7961236401053,NA 449 | 0.820578391637766,53.8205783916378,-2808.17942160836 450 | 6.1426646400082,58.1426646400082,-2697.85733535999 451 | 1.81055069510415,52.8105506951042,NA 452 | 5.81213366932509,55.8121336693251,-2494.18786633067 453 | -6.02443900462176,42.9755609953782,-2407.02443900462 454 | 8.19837900644015,56.1983790064401,-2295.80162099356 455 | 1.00736125979011,48.0073612597901,-2207.99263874021 456 | 8.04073460227165,54.0407346022716,-2107.95926539773 457 | -6.48337707879908,38.5166229212009,-2031.4833770788 458 | -18.3872929869521,25.6127070130479,-1954.38729298695 459 | -13.4201710950248,29.5798289049752,-1862.42017109502 460 | 10.3181608414707,52.3181608414707,-1753.68183915853 461 | NA,20.3856184922044,-1701.6143815078 462 | 4.74079566536518,44.7407956653652,-1595.25920433463 463 | -0.243945480665689,38.7560545193343,NA 464 | -12.0641747521876,25.9358252478124,-1456.06417475219 465 | 2.18232551113565,39.1823255111356,-1366.81767448886 466 | -1.86569911095333,34.1343008890467,-1297.86569911095 467 | -8.34763477742517,26.6523652225748,-1233.34763477743 468 | -6.70377022638064,27.2962297736194,-1162.70377022638 469 | -3.76473723047327,29.2352627695267,-1092.76473723047 470 | 10.286014795104,42.286014795104,NA 471 | 7.64640558209949,38.6464055820995,-953.3535944179 472 | -12.4223242142771,17.5776757857229,NA 473 | -15.3580600522366,13.6419399477634,-856.358060052237 474 | -2.7546286970986,25.2453713029014,-786.754628697099 475 | -0.816413970090352,26.1835860299096,-729.81641397009 476 | -14.4558884298015,11.5441115701985,-690.455888429801 477 | 
0.416813868853537,25.4168138688535,-624.583186131146 478 | -6.3038422883684,17.6961577116316,-582.303842288368 479 | 1.00385799829052,24.0038579982905,-527.996142001709 480 | -2.06206074115041,19.9379392588496,-486.06206074115 481 | 0.0775012076121474,21.0775012076121,-440.922498792388 482 | 9.17891125499711,29.1789112549971,-390.821088745003 483 | 9.91912355676178,28.9191235567618,-351.080876443238 484 | -14.5520523083471,3.44794769165293,NA 485 | 4.34636312334071,21.3463631233407,-284.653636876659 486 | -17.6050358691066,-1.60503586910659,-273.605035869107 487 | 16.4137338443373,31.4137338443373,-208.586266155663 488 | 3.2899079708406,17.2899079708406,-192.710092029159 489 | 8.67333785782265,21.6733378578227,-160.326662142177 490 | 4.38116307213271,16.3811630721327,NA 491 | -10.3668484444118,0.633151555588217,-131.366848444412 492 | 10.9882503272617,20.9882503272617,-89.0117496727383 493 | 3.80939939301041,12.8093993930104,-77.1906006069896 494 | -5.15484733031509,2.84515266968491,-69.1548473303151 495 | -1.52668212028725,5.47331787971275,-50.5266821202872 496 | 8.6644687139296,14.6644687139296,-27.3355312860704 497 | 8.2601550777275,13.2601550777275,-16.7398449222725 498 | 10.068234812964,14.068234812964,-5.93176518703597 499 | NA,1.86614273154914,-10.1338572684509 500 | 15.7706878161089,17.7706878161089,11.7706878161089 501 | -14.4009277915753,-13.4009277915753,-15.4009277915753 502 | -7.53976446840454,-7.53976446840454,-7.53976446840454 503 | -2.39549688382698,NA,-3.39549688382698 504 | 0.150538601456505,2.15053860145651,-3.84946139854349 505 | 13.8692553099038,16.8692553099038,4.86925530990378 506 | -10.6520900379986,-6.65209003799861,-26.6520900379986 507 | 2.64812009840244,7.64812009840244,-22.3518799015976 508 | -13.8078140169599,-7.80781401695988,NA 509 | 9.3274745751037,16.3274745751037,-39.6725254248963 510 | -10.2296710622364,-2.22967106223638,-74.2296710622364 511 | -6.74308147817495,2.25691852182505,-87.7430814781749 512 | 
-2.04816189964136,7.95183810035864,NA 513 | -4.04680527578107,6.95319472421893,-125.046805275781 514 | 6.88881952133604,18.888819521336,-137.111180478664 515 | NA,NA,-183.433059184529 516 | 2.25026164627264,16.2502616462726,-193.749738353727 517 | NA,15.0498977956706,-224.950102204329 518 | -17.7593042298037,-1.75930422980374,-273.759304229804 519 | -1.63460883196287,15.3653911680371,NA 520 | 27.7029727051096,45.7029727051096,-296.29702729489 521 | -10.7361182999644,8.26388170003561,-371.736118299964 522 | -5.21237791208784,14.7876220879122,-405.212377912088 523 | -6.44016542000442,14.5598345799956,-447.440165420004 524 | -8.79890556466638,13.2010944353336,-492.798905564666 525 | 15.0400170435458,38.0400170435458,-513.959982956454 526 | NA,43.3169597526548,-556.683040247345 527 | -1.30655624485332,23.6934437551467,-626.306556244853 528 | -17.7196362047579,8.28036379524212,NA 529 | -10.9606477924032,16.0393522075968,-739.960647792403 530 | NA,33.0862657830691,-778.913734216931 531 | -18.7977833173593,10.2022166826407,-859.797783317359 532 | 4.86281904133198,34.862819041332,-895.137180958668 533 | 11.1834814772347,42.1834814772347,-949.816518522765 534 | -13.6209563995055,18.3790436004945,-1037.62095639951 535 | -12.5468235629716,20.4531764370284,-1101.54682356297 536 | -3.26555483450762,NA,-1159.26555483451 537 | 4.88658046959534,39.8865804695953,-1220.1134195304 538 | -12.3929938030509,23.6070061969491,-1308.39299380305 539 | 8.03070370648856,NA,-1360.96929629351 540 | -6.74802966148667,NA,-1450.74802966149 541 | -1.47647267368283,37.5235273263172,-1522.47647267368 542 | 15.3877486640325,55.3877486640325,-1584.61225133597 543 | 10.9676724398033,51.9676724398033,-1670.0323275602 544 | 9.3147793545691,51.3147793545691,-1754.68522064543 545 | -7.94012618418086,35.0598738158191,-1856.94012618418 546 | -1.37319371997205,42.6268062800279,-1937.37319371997 547 | 12.2006659737446,57.2006659737446,-2012.79933402626 548 | -4.81172464883714,41.1882753511629,-2120.81172464884 
549 | 6.10401463892476,53.1040146389248,-2202.89598536108 550 | 14.4818894802253,62.4818894802253,-2289.51811051977 551 | 10.7788185272438,59.7788185272438,-2390.22118147276 552 | 17.3001548312883,67.3001548312883,-2482.69984516871 553 | NA,53.768232141344,-2598.23176785866 554 | -9.94007146083015,42.0599285391699,-2713.94007146083 555 | -5.22612630251312,47.7738736974869,-2814.22612630251 556 | 18.1189657624604,72.1189657624604,-2897.88103423754 557 | 15.0978703227409,70.0978703227409,-3009.90212967726 558 | 7.38668774718786,63.3866877471879,-3128.61331225281 559 | -18.581447020701,NA,-3267.5814470207 560 | -1.12375074724748,56.8762492527525,-3365.12375074725 561 | 5.85665444925622,64.8566544492562,-3475.14334555074 562 | -14.3652590542801,45.6347409457199,-3614.36525905428 563 | 24.5213430151443,85.5213430151443,-3696.47865698486 564 | NA,58.2150264953006,-3847.7849735047 565 | -10.9204059018046,52.0795940981954,-3979.9204059018 566 | -9.50010826824079,54.4998917317592,-4105.50010826824 567 | 3.12253733613454,68.1225373361345,-4221.87746266387 568 | -10.5594292110351,55.4405707889649,-4366.55942921104 569 | 4.48381692363331,NA,-4484.51618307637 570 | -4.65187836757334,63.3481216324267,-4628.65187836757 571 | -8.86793981753588,60.1320601824641,-4769.86793981754 572 | -5.05244292497743,64.9475570750226,-4905.05244292498 573 | 8.64081748211685,79.6408174821168,-5032.35918251788 574 | -1.15970869995478,NA,-5185.15970869995 575 | -1.99233864920552,71.0076613507945,-5330.99233864921 576 | 8.85784942400344,82.8578494240034,-5467.142150576 577 | 10.4202387975456,85.4202387975456,-5614.57976120245 578 | -1.66548024942545,74.3345197505746,-5777.66548024943 579 | -6.83100965702602,70.168990342974,-5935.83100965703 580 | -0.997843412322988,NA,-6084.99784341232 581 | -18.7083786583352,60.2916213416648,-6259.70837865834 582 | 7.92464418249362,87.9246441824936,-6392.07535581751 583 | 9.34324345730678,90.3432434573068,-6551.65675654269 584 | 
6.9439371015606,88.9439371015606,-6717.05606289844 585 | -0.684582273782629,82.3154177262174,-6889.68458227378 586 | -6.24008798954491,77.7599120104551,-7062.24008798954 587 | -2.96118115091889,82.0388188490811,-7227.96118115092 588 | -6.28125223452861,79.7187477654714,-7402.28125223453 589 | 7.61599025464523,NA,-7561.38400974535 590 | -11.4317964749455,76.5682035250545,-7755.43179647495 591 | -6.2244155769132,NA,-7927.22441557691 592 | 1.60975600364268,91.6097560036427,-8098.39024399636 593 | 5.70554210159641,96.7055421015964,-8275.2944578984 594 | 0.107760249353106,92.1077602493531,-8463.89223975065 595 | -9.92835630779587,83.0716436922041,-8658.9283563078 596 | 17.7062224004433,111.706222400443,-8818.29377759956 597 | 14.3444524995555,NA,-9010.65554750044 598 | 11.8992238727779,107.899223872778,-9204.10077612722 599 | 8.89310151924384,105.893101519244,-9400.10689848076 600 | NA,113.005247431322,-9588.99475256868 601 | 16.8429190819164,115.842919081916,-9784.15708091808 602 | 10.0863696600592,110.086369660059,-9989.91363033994 603 | 9.98779783587724,110.987797835877,-10191.0122021641 604 | 12.8563240453033,114.856324045303,-10391.1436759547 605 | 6.49892104696125,109.498921046961,-10602.501078953 606 | 5.24651452218954,109.24651452219,-10810.7534854778 607 | -9.21627499261352,95.7837250073865,-11034.2162749926 608 | 0.0738886246236505,106.073888624624,-11235.9261113754 609 | -0.281054611973225,106.718945388027,-11449.281054612 610 | 3.54880169170882,NA,-11660.4511983083 611 | -11.8801973858473,97.1198026141527,-11892.8801973858 612 | -7.57738103374405,102.422618966256,-12107.5773810337 613 | NA,96.4907539324426,-12335.5092460676 614 | 11.6026040384085,NA,-12532.3973959616 615 | -4.94527199778479,108.054728002215,-12773.9452719978 616 | -33.9842637672717,80.0157362327283,-13029.9842637673 617 | 24.1474946308353,139.147494630835,-13200.8525053692 618 | 9.58667847703962,125.58667847704,-13446.413321523 619 | 5.8833513123193,122.883351312319,-13683.1166486877 620 | 
4.46205794388402,122.462057943884,-13919.5379420561 621 | -2.67729219298474,116.322707807015,-14163.677292193 622 | -4.77668775187588,115.223312248124,-14404.7766877519 623 | -0.610736587471765,120.389263412528,-14641.6107365875 624 | -8.99033222660606,113.009667773394,-14892.9903322266 625 | 21.7737167035625,144.773716703563,-15107.2262832964 626 | 0.122944768239039,124.122944768239,-15375.8770552318 627 | -2.99935744496538,122.000642555035,-15627.999357445 628 | 0.645739133235881,126.645739133236,-15875.3542608668 629 | NA,120.628743143074,-16135.3712568569 630 | NA,125.467171689036,-16386.532828311 631 | -8.86163035859746,120.138369641403,-16649.8616303586 632 | -3.36925028475569,126.630749715244,-16903.3692502848 633 | 6.10943352488785,137.109433524888,-17154.8905664751 634 | -2.80755747300529,129.192442526995,-17426.807557473 635 | 0.233352656946647,133.233352656947,-17688.7666473431 636 | -10.4216628181219,123.578337181878,NA 637 | -19.8531563245835,115.146843675417,-18244.8531563246 638 | -1.59241388441367,134.407586115586,-18497.5924138844 639 | -16.0801538720373,120.919846127963,-18785.080153872 640 | 19.8136204336778,157.813620433678,-19024.1863795663 641 | -2.76268902185469,136.237310978145,-19323.7626890219 642 | 6.32867928723853,146.328679287239,-19593.6713207128 643 | -0.513952132335796,140.486047867664,-19881.5139521323 644 | 12.142441537474,154.142441537474,-20151.8575584625 645 | -8.88634282106792,NA,-20457.8863428211 646 | 6.80800277300723,150.808002773007,-20729.191997227 647 | -10.6659654759033,134.334034524097,-21035.6659654759 648 | 13.2429787415029,159.242978741503,NA 649 | -20.8510162168392,126.148983783161,-21629.8510162168 650 | -9.34950533937452,138.650494660625,-21913.3495053394 651 | 3.30576427185301,152.305764271853,-22197.6942357281 652 | 10.0016607196553,160.001660719655,NA 653 | 2.53128589715722,153.531285897157,-22798.4687141028 654 | NA,169.895018953569,-23086.1049810464 655 | 3.47197899027792,156.471978990278,-23405.5280210097 
656 | 1.18837147398423,155.188371473984,-23714.811628526 657 | 7.98438253399258,162.984382533993,-24017.015617466 658 | -16.374389860157,NA,-24352.3743898602 659 | 8.45530003363516,165.455300033635,-24640.5446999664 660 | -14.3862822932361,143.613717706764,NA 661 | 12.7267737849221,171.726773784922,-25268.2732262151 662 | NA,NA,-25600.0739069623 663 | -26.7297367203789,134.270263279621,-25947.7297367204 664 | NA,170.25019592393,-26235.7498040761 665 | -4.04417320720569,158.955826792794,-26573.0441732072 666 | -9.79667564403804,154.203324355962,-26905.796675644 667 | 4.56296519995819,169.562965199958,-27220.4370348 668 | -1.38036169780591,164.619638302194,-27557.3803616978 669 | -6.29758881902369,160.702411180976,-27895.297588819 670 | -3.10087613861085,164.899123861389,-28227.1008761386 671 | 10.2721819854403,179.27218198544,-28550.7278180146 672 | 6.43835335837489,NA,-28893.5616466416 673 | -24.6551758784006,146.344824121599,-29265.6551758784 674 | -6.53113155404074,165.468868445959,NA 675 | 21.8051418505153,194.805141850515,-29907.1948581495 676 | -16.0556115612376,157.944388438762,-30292.0556115612 677 | -14.1004222808724,160.899577719128,-30639.1004222809 678 | -12.2571868867353,NA,-30988.2571868867 679 | 7.41647118434432,184.416471184344,-31321.5835288157 680 | 16.4538615791345,194.453861579135,-31667.5461384209 681 | -2.94910909522574,176.050890904774,-32043.9491090952 682 | -8.36667793006495,171.633322069935,-32408.3666779301 683 | -1.5228787840537,179.477121215946,-32762.5228787841 684 | 2.49169804148079,184.491698041481,-33121.5083019585 685 | -7.38269093762674,175.617309062373,-33496.3826909376 686 | 12.3039925943835,196.303992594383,-33843.6960074056 687 | 4.66195756763859,189.661957567639,-34220.3380424324 688 | -3.07966186394432,182.920338136056,-34599.0796618639 689 | 6.95193574546771,193.951935745468,-34962.0480642545 690 | 0.572548334057169,188.572548334057,-35343.4274516659 691 | -1.0328032107061,187.967196789294,-35722.0328032107 692 | 
-12.2695886929943,177.730411307006,-36112.269588693 693 | -0.211269544563074,190.788730455437,-36481.2112695446 694 | -4.52417054153588,187.475829458464,-36868.5241705415 695 | -6.68824131702804,186.311758682972,-37255.688241317 696 | -5.58879955171365,188.411200448286,-37641.5887995517 697 | -3.08228372182559,191.917716278174,-38028.0822837218 698 | 4.39915616252833,200.399156162528,-38411.6008438375 699 | 2.97113044134424,199.971130441344,NA 700 | 2.51689314044674,200.516893140447,-39201.4831068596 701 | 6.50793823464231,205.507938234642,-39594.4920617654 702 | NA,196.383918725058,-40003.6160812749 703 | 12.30910412972,213.30910412972,-40388.6908958703 704 | NA,202.494664256963,-40803.505335743 705 | -8.26626370175504,194.733736298245,-41217.2662637018 706 | 7.09598473635784,211.095984736358,-41608.9040152636 707 | 4.20736145471901,209.207361454719,-42020.7926385453 708 | 7.20928511408149,213.209285114081,-42428.7907148859 709 | -14.9215300701712,192.078469929829,-42863.9215300702 710 | -10.6006475801135,197.399352419886,-43274.6006475801 711 | NA,210.337914204509,-43679.6620857955 712 | 2.79694162767306,212.796941627673,-44097.2030583723 713 | 9.79969236001963,NA,-44511.20030764 714 | -7.98328805064708,204.016711949353,-44951.9832880506 715 | 17.4287672641476,230.428767264148,-45351.5712327358 716 | -20.9711644513341,193.028835548666,-45816.9711644513 717 | -10.1622078104068,204.837792189593,-46235.1622078104 718 | 11.1183622922041,227.118362292204,-46644.8816377078 719 | NA,227.823152607413,-47078.1768473926 720 | -18.3723157233631,199.627684276637,-47542.3723157234 721 | 4.39951024419529,223.399510244195,NA 722 | -8.85273755503474,211.147262444965,NA 723 | 6.14117620467376,227.141176204674,-48834.8588237953 724 | -4.59419644405593,217.405803555944,-49288.5941964441 725 | -8.07740017685674,214.922599823143,-49737.0774001769 726 | NA,203.008549435616,-50196.9914505644 727 | -3.55856104486838,221.441438955132,-50628.5585610449 728 | 
0.962351091015784,226.962351091016,-51075.037648909 729 | 5.81955122618082,232.819551226181,-51523.1804487738 730 | -8.42979135854895,219.570208641451,-51992.4297913585 731 | -3.17572563345997,225.82427436654,NA 732 | 2.92695317388403,232.926953173884,-52897.0730468261 733 | 5.03750795944561,236.037507959446,-53355.9624920406 734 | -2.92296793442726,229.077032065573,-53826.9229679344 735 | -5.226490994492,227.773509005508,-54294.2264909945 736 | -0.339425686967397,233.660574313033,NA 737 | 11.9503413763604,246.95034137636,-55213.0496586236 738 | -2.99160073266251,233.008399267337,-55698.9916007327 739 | 2.92200591386977,239.92200591387,-56166.0779940861 740 | -10.1742630918038,227.825736908196,-56654.1742630918 741 | -21.9527512229075,217.047248777092,-57142.9527512229 742 | 18.997685301614,258.997685301614,-57581.0023146984 743 | 2.33524017701429,243.335240177014,-58078.664759823 744 | -2.43527518166968,239.56472481833,-58566.4352751817 745 | -6.43881515068881,236.561184849311,-59055.4388151507 746 | 7.82192853362503,251.821928533625,-59528.1780714664 747 | 11.8973084014001,256.8973084014,-60013.1026915986 748 | 14.476643305774,260.476643305774,-60501.5233566942 749 | 4.62220893460546,251.622208934605,-61004.3777910654 750 | 5.77963251004145,253.779632510041,-61498.22036749 751 | -2.71413796282027,246.28586203718,-62003.7141379628 752 | 19.7275221438101,269.72752214381,-62480.2724778562 753 | NA,237.366404782388,-63014.6335952176 754 | NA,229.214578246571,-63526.7854217534 755 | 7.21683167239703,260.216831672397,-64001.7831683276 756 | 10.7607037025968,NA,-64505.2392962974 757 | 4.96695941653927,259.966959416539,-65020.0330405835 758 | 1.59361033905506,257.593610339055,-65534.4063896609 759 | 16.5683974134533,273.568397413453,-66032.4316025865 760 | 13.6694365106732,271.669436510673,-66550.3305634893 761 | 2.10701340310128,261.107013403101,NA 762 | 1.98669710810489,261.986697108105,-67598.0133028919 763 | -5.99328805380289,255.006711946197,-68126.9932880538 764 | 
NA,261.678361002923,-68644.3216389971 765 | 12.5324533047917,275.532453304792,-69156.4675466952 766 | 10.2897497811132,274.289749781113,-69685.7102502189 767 | -5.10581207471904,259.894187925281,-70230.1058120747 768 | -19.1261177393958,246.873882260604,-70775.1261177394 769 | -7.22703146618972,259.77296853381,-71296.2270314662 770 | -7.49538908560765,260.504610914392,-71831.4953890856 771 | -13.9903345539331,255.009665446067,-72374.9903345539 772 | 12.4212102371258,282.421210237126,-72887.5787897629 773 | -6.91233050034148,264.087669499659,-73447.9123305003 774 | -10.0817992894704,261.91820071053,-73994.0817992895 775 | 2.67354870049896,NA,-74526.3264512995 776 | 0.244159559156272,274.244159559156,NA 777 | 6.10769536694611,281.107695366946,-75618.8923046331 778 | 12.7768376735555,288.776837673556,-76163.2231623264 779 | -0.213771668207007,276.786228331793,-76729.2137716682 780 | -10.3993141088909,267.600685891109,-77294.3993141089 781 | 7.48761143506866,286.487611435069,-77833.5123885649 782 | -11.9745614858962,268.025438514104,-78411.9745614859 783 | 6.82548217219321,287.825482172193,-78954.1745178278 784 | 0.323354565925003,282.323354565925,-79523.6766454341 785 | -23.1883552140509,259.811644785949,-80112.188355214 786 | -2.87575630104142,281.124243698959,-80658.875756301 787 | NA,278.547640198234,-81231.4523598018 788 | -3.87738747426764,NA,NA 789 | -12.096618063484,274.903381936516,-82381.0966180635 790 | NA,298.805416266303,-82933.1945837337 791 | 4.72970913814775,293.729709138148,-83516.2702908619 792 | 12.521333640377,302.521333640377,-84087.4786663596 793 | -18.2574762077492,272.742523792251,-84699.2574762077 794 | 6.02026036068589,298.020260360686,-85257.9797396393 795 | NA,296.541770884719,-85845.4582291153 796 | -3.81836419939036,290.18163580061,-86439.8183641994 797 | -14.3567190253763,280.643280974624,-87039.3567190254 798 | -6.5638675701017,289.436132429898,-87622.5638675701 799 | -1.47178669439696,295.528213305603,-88210.4717866944 800 | 
1.89516988684296,299.895169886843,-88802.1048301132 801 | NA,306.651240628037,-89393.348759372 802 | -3.70837353825334,296.291626461747,-90003.7083735382 803 | -4.84101338158333,296.158986618417,-90605.8410133816 804 | -4.48577359709881,297.514226402901,-91208.4857735971 805 | 7.26253252251694,310.262532522517,-91801.7374674775 806 | 12.5067832115923,316.506783211592,-92403.4932167884 807 | NA,297.487601852177,-93032.5123981478 808 | -2.64761694098151,303.352383059019,-93638.647616941 809 | -0.345512000049881,306.65448799995,-94249.345512 810 | 2.81216358061449,310.812163580614,-94861.1878364194 811 | -4.901620176634,304.098379823366,-95485.9016201766 812 | -1.89020765849988,308.1097923415,-96101.8902076585 813 | 3.96551910887607,314.965519108876,-96717.0344808911 814 | -2.81510163439772,309.184898365602,-97346.8151016344 815 | 2.03826705153913,315.038267051539,-97966.9617329485 816 | -19.9049048903275,294.095095109672,-98615.9049048903 817 | 3.79577063743909,318.795770637439,-99221.2042293626 818 | -1.84481321294895,314.155186787051,-99857.844813213 819 | -14.4808343484659,302.519165651534,-100503.480834348 820 | -12.3654245780611,305.634575421939,-101136.365424578 821 | -13.5745017509701,305.42549824903,-101774.574501751 822 | -4.88058958837628,315.119410411624,-102404.880589588 823 | 7.67619522784334,328.676195227843,-103033.323804772 824 | -2.63665843383552,NA,-103686.636658434 825 | -7.30416751842107,315.695832481579,-104336.304167518 826 | 1.92562967235124,325.925629672351,-104974.074370328 827 | NA,339.717481367442,-105610.282518633 828 | 2.23258111232975,328.23258111233,-106273.767418888 829 | 11.5195113488801,338.51951134888,-106917.480488651 830 | 4.02662169584606,332.026621695846,-107579.973378304 831 | 7.92042568207374,336.920425682074,-108233.079574318 832 | 18.7356538111309,348.735653811131,-108881.264346189 833 | -5.92017965418968,325.07982034581,-109566.920179654 834 | -26.894880930526,305.105119069474,-110250.894880931 835 | 
6.81938392108683,339.819383921087,-110882.180616079 836 | -3.60095735329387,330.399042646706,-111559.600957353 837 | 5.69045296542246,340.690452965422,-112219.309547035 838 | 20.6849954513645,356.684995451365,-112875.315004549 839 | 20.702735826414,357.702735826414,-113548.297264174 840 | -11.8567015297049,326.143298470295,-114255.85670153 841 | 12.1709085598216,351.170908559822,-114908.82909144 842 | 0.713620788406968,340.713620788407,-115599.286379212 843 | -8.99279807174457,332.007201928255,-116289.992798072 844 | -11.4838945989798,NA,-116975.483894599 845 | -12.7031813879043,330.296818612096,-117661.703181388 846 | 12.8365859168959,356.836585916896,-118323.163414083 847 | -12.1230145151778,332.876985484822,-119037.123014515 848 | 23.0290001467885,369.029000146788,-119692.970999853 849 | -4.56346702637965,342.43653297362,-120413.563467026 850 | -5.06442266506751,342.935577334932,-121109.064422665 851 | -19.8785822772327,329.121417722767,-121820.878582277 852 | -14.1817793915634,335.818220608437,-122514.181779392 853 | 10.5802105404704,361.58021054047,-123190.41978946 854 | 15.45681907697,367.45681907697,-123888.543180923 855 | 3.61588996894255,356.615889968943,-124605.384110031 856 | -0.379008241556369,353.620991758444,-125316.379008242 857 | 35.3330623386149,390.333062338615,-125989.666937661 858 | -9.64008745155252,346.359912548447,-126745.640087452 859 | 8.5060502052833,365.506050205283,-127440.493949795 860 | -1.21751958512581,356.782480414874,-128165.217519585 861 | 1.90862598249357,360.908625982494,NA 862 | -7.64274867433343,352.357251325667,-129607.642748674 863 | -18.6955059500918,342.304494049908,-130339.69550595 864 | 2.19827955027047,364.19827955027,-131041.80172045 865 | 1.46736092872906,364.467360928729,-131767.532639071 866 | -10.8376871707957,353.162312829204,-132506.837687171 867 | 18.7147542200113,383.714754220011,-133206.28524578 868 | 5.2694373547109,371.269437354711,-133950.730562645 869 | -0.56214583900979,366.43785416099,-134689.562145839 
870 | 12.4686452939199,380.46864529392,-135411.531354706 871 | -7.70520528425845,361.294794715742,-136168.705205284 872 | 8.53248013035957,378.53248013036,-136891.46751987 873 | -13.1469811176334,357.853018882367,-137654.146981118 874 | 4.28836597260642,376.288365972606,-138379.711634027 875 | -3.49319938905116,369.506800610949,-139132.493199389 876 | -2.39407900311311,371.605920996887,-139878.394079003 877 | -20.5511964573748,354.448803542625,-140645.551196457 878 | 8.30524439119723,384.305244391197,-141367.694755609 879 | 0.730924438670374,377.73092443867,NA 880 | 11.1746631470609,389.174663147061,-142872.825336853 881 | -8.28357820457808,370.716421795422,-143649.283578205 882 | 3.49653801389179,383.496538013892,NA 883 | NA,NA,-145164.41895134 884 | 4.030543463448,386.030543463448,-145919.969456537 885 | -11.3353842697382,371.664615730262,-146700.33538427 886 | -0.189415533963909,383.810584466036,-147456.189415534 887 | -5.45520158428216,379.544798415718,-148230.455201584 888 | -9.39570968775395,376.604290312246,-149005.395709688 889 | -9.80991742243168,377.190082577568,-149778.809917422 890 | 18.0737643063123,406.073764306312,-150525.926235694 891 | -14.597886555614,NA,-151335.597886556 892 | -8.2474392833107,381.752560716689,-152108.247439283 893 | -13.9142250964059,377.085774903594,-152894.914225096 894 | 3.64114152480662,395.641141524807,-153660.358858475 895 | -12.2538944084125,380.746105591587,-154461.253894408 896 | 8.61262111265418,402.612621112654,-155227.387378887 897 | 1.6646039274845,396.664603927485,-156023.335396073 898 | 3.62686247649461,399.626862476495,-156812.373137524 899 | -22.0944225280066,374.905577471993,-157631.094422528 900 | 26.4133175238047,424.413317523805,-158377.586682476 901 | -0.999906773485587,398.000093226514,-159201.999906773 902 | -7.29171790803853,392.708282091961,-160007.291717908 903 | 8.92827905043416,409.928279050434,-160792.07172095 904 | -7.05450685566596,394.945493144334,-161611.054506856 905 | 
8.58031162020582,411.580311620206,-162400.41968838 906 | 9.90817145852465,413.908171458525,-163206.091828541 907 | -2.25642127758782,402.743578722412,-164027.256421278 908 | 8.02402671991871,414.024026719919,-164827.97597328 909 | 3.34787820264946,410.347878202649,-165645.652121797 910 | 7.12442678704475,415.124426787045,-166456.875573213 911 | -16.0060291759219,392.993970824078,-167297.006029176 912 | 8.20242172928525,418.202421729285,-168091.797578271 913 | -6.17349506757299,404.826504932427,-168927.173495068 914 | -17.9347410946175,394.065258905383,-169761.934741095 915 | 6.74612335606387,419.746123356064,-170562.253876644 916 | -3.72305064843455,410.276949351565,-171399.723050648 917 | -18.3642681942493,396.635731805751,-172243.364268194 918 | 3.98659785658303,419.986597856583,-173052.013402143 919 | 2.46697214771214,419.466972147712,-173886.533027852 920 | 8.25314523898725,426.253145238987,-174715.746854761 921 | -16.3254604613521,402.674539538648,-175577.325460461 922 | 10.2466931139349,430.246693113935,-176389.753306886 923 | 11.2579773409734,432.257977340973,-177229.742022659 924 | 14.8868349806631,436.886834980663,-178069.113165019 925 | -9.91027079136712,413.089729208633,-178938.910270791 926 | 10.0846152523678,434.084615252368,-179765.915384748 927 | 10.033961132722,435.033961132722,-180614.966038867 928 | 19.4872792864784,445.487279286478,-181456.512720714 929 | -2.36498338307293,424.635016616927,-182331.364983383 930 | -18.0742703814286,409.925729618571,-183202.074270381 931 | 4.2854387665671,433.285438766567,-184036.714561233 932 | 12.3281289588236,442.328128958824,-184887.671871041 933 | -13.2408915898606,417.759108410139,-185774.24089159 934 | 0.890335743848767,432.890335743849,-186623.109664256 935 | -10.896519369116,422.103480630884,-187499.896519369 936 | -11.0894051156182,422.910594884382,-188367.089405116 937 | 15.2635136282408,450.263513628241,-189209.736486372 938 | 11.4550049494299,447.45500494943,-190084.544995051 939 | 
4.00249613895046,441.00249613895,-190964.997503861 940 | 4.47618776899719,442.476187768997,-191839.523812231 941 | -3.56514921510299,435.434850784897,-192724.565149215 942 | -1.32381393363285,438.676186066367,-193601.323813934 943 | 1.4656938664313,442.465693866431,-194479.534306134 944 | -1.45299991977582,440.547000080224,-195365.45299992 945 | 4.68641279070251,447.686412790702,-196244.313587209 946 | -11.027091123669,432.972908876331,-197147.027091124 947 | 1.89007624567715,446.890076245677,-198023.109923754 948 | 7.25506221307761,453.255062213078,-198908.744937787 949 | NA,434.21105784034,-199821.78894216 950 | 5.08629555738184,453.086295557382,-200698.913704443 951 | NA,456.473955620616,-201593.526044379 952 | 0.892291380965081,450.892291380965,-202499.107708619 953 | -8.24246944544852,442.757530554551,-203409.242469445 954 | 1.19060884252431,453.190608842524,-204302.809391157 955 | -13.9821316106556,NA,-205222.982131611 956 | 2.17974715467324,456.179747154673,-206113.820252845 957 | 4.7101061731545,459.710106173154,-207020.289893827 958 | -1.64052577919065,454.359474220809,-207937.640525779 959 | -5.76787086094377,451.232129139056,-208854.767870861 960 | -5.1053205264906,452.894679473509,-209769.105320526 961 | -1.04982179764579,457.950178202354,-210682.049821798 962 | 4.14831418862799,464.148314188628,-211595.851685811 963 | 5.73845459686489,NA,-212515.261545403 964 | -2.83011653497051,459.169883465029,-213446.830116535 965 | -11.6661176875241,451.333882312476,-214380.666117688 966 | -1.99697151026301,462.003028489737,-215297.99697151 967 | 10.4496193442997,475.4496193443,-216214.550380656 968 | -5.27166075885522,460.728339241145,-217161.271660759 969 | 6.62297910493918,473.622979104939,-218082.377020895 970 | 2.38962837370768,470.389628373708,-219021.610371626 971 | -13.2453446481541,455.754655351846,-219974.245344648 972 | 5.79351590528125,475.793515905281,-220894.206484095 973 | -11.1618460956336,459.838153904366,-221852.161846096 974 | 
13.8413127025685,485.841312702569,-222770.158687297 975 | 4.17027048654253,477.170270486543,-223724.829729513 976 | -5.90682738264821,468.093172617352,-224681.906827383 977 | 11.8295657143184,486.829565714318,-225613.170434286 978 | -0.923854101223184,475.076145898777,-226576.923854101 979 | 3.07007463924511,480.070074639245,-227525.929925361 980 | -9.10399340534712,468.896006594653,-228493.103993405 981 | -6.93069056295894,472.069309437041,-229447.930690563 982 | 24.0280355364959,504.028035536496,-230375.971964464 983 | NA,478.639679989278,-231363.360320011 984 | 1.91160603549883,483.911606035499,-232322.088393965 985 | -0.588654829691102,482.411345170309,-233289.58865483 986 | 11.6767619492437,495.676761949244,-234244.323238051 987 | 6.7078055585454,491.707805558545,-235218.292194441 988 | 6.93322115787572,NA,-236189.066778842 989 | 7.9970415128052,494.997041512805,-237161.002958487 990 | -10.6402636617322,477.359736338268,-238154.640263662 991 | -8.73412125325873,480.265878746741,-239129.734121253 992 | -7.6152057758142,482.384794224186,-240107.615205776 993 | 6.71614251666255,497.716142516663,NA 994 | -3.9622651415599,488.03773485844,-242067.962265142 995 | 2.88590283450715,495.885902834507,-243046.114097165 996 | -5.92764500774785,488.072354992252,-244041.927645008 997 | 18.263939089187,513.263939089187,-245006.736060911 998 | -4.38753536537589,491.612464634624,-246020.387535365 999 | -1.91323376473378,495.086766235266,-247010.913233765 1000 | -4.77687579005279,493.223124209947,-248008.77687579 1001 | 7.09218825602436,506.092188256024,-248993.907811744 1002 | -------------------------------------------------------------------------------- /data/merge_practice_1.csv: -------------------------------------------------------------------------------- 1 | id,name,job,location 2 | 1,Alice,communications,New York 3 | 2,Bob,communications,Cambridge 4 | 3,Chuck,hacker,New York 5 | 4,Dave,communications,Berkeley 6 | 5,Eve,spy,Cambridge 
-------------------------------------------------------------------------------- /data/merge_practice_2.csv: -------------------------------------------------------------------------------- 1 | id,name,job,location 2 | 1,Alice,hacker,cambridge 3 | 4,Dave,tree,palo alto 4 | 5,Eve,handler,new york 5 | 6,Faith,hacker,berkeley -------------------------------------------------------------------------------- /data/merge_practice_3.csv: -------------------------------------------------------------------------------- 1 | location,population 2 | Cambridge,107289 3 | New York,8406000 4 | Berkeley,116768 5 | Palo Alto,66642 6 | Reno,233294 -------------------------------------------------------------------------------- /data/mydata.Rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/R-for-Data-Science/798d5b220d66afea5644a245c7327e4bb740039a/data/mydata.Rda -------------------------------------------------------------------------------- /data/mydata.csv: -------------------------------------------------------------------------------- 1 | "n","c","b","d","really.long.and.complicated.variable.name" 2 | 1,"one",TRUE,2015-07-27,999 3 | 2,"two",TRUE,2015-08-03,999 4 | 3,"three",FALSE,2015-07-20,999 5 | -------------------------------------------------------------------------------- /data/pew.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/R-for-Data-Science/798d5b220d66afea5644a245c7327e4bb740039a/data/pew.sav -------------------------------------------------------------------------------- /examples/save_console_output.R: -------------------------------------------------------------------------------- 1 | sink('examples/save_console_output.txt') 2 | 3 | print("This is an example of redirecting console output to a pdf using sink()") 4 | 5 | t.test(c(1,2,3,4), c(5,6,7,8)) 6 | 7 | sink() 8 | 
-------------------------------------------------------------------------------- /examples/save_console_output.txt: -------------------------------------------------------------------------------- 1 | [1] "This is an example of redirecting console output to a pdf using sink()" 2 | 3 | Welch Two Sample t-test 4 | 5 | data: c(1, 2, 3, 4) and c(5, 6, 7, 8) 6 | t = -4.3818, df = 6, p-value = 0.004659 7 | alternative hypothesis: true difference in means is not equal to 0 8 | 95 percent confidence interval: 9 | -6.233715 -1.766285 10 | sample estimates: 11 | mean of x mean of y 12 | 2.5 6.5 13 | 14 | -------------------------------------------------------------------------------- /instructor/day_four.R: -------------------------------------------------------------------------------- 1 | ## ---- echo=FALSE--------------------------------------------------------- 2 | knitr::opts_knit$set(root.dir = '../') 3 | 4 | ## ------------------------------------------------------------------------ 5 | ## I want to create a matrix of 2 to the power of n where n is 1 to 10. 6 | 7 | mat <- c(rep(NA, 10)) # first create a null vector 8 | 9 | # There are many ways to do the same task 10 | mat <- c(rep(NA, 6)) 11 | for(i in 1:6){ # I want to create a matrix of 2 to the power of n where n is 5 to 10. 12 | mat[i] <- 2^(i+4) 13 | } # or 14 | 15 | mat <- c(rep(NA, 6)) 16 | for(i in 5:10){ 17 | mat[i-4] <- 2^i 18 | } # by setting sequence and statement accordingly 19 | 20 | ## ------------------------------------------------------------------------ 21 | a <- c("Berkeley", "SF", "Oakland") 22 | b <- c(20, 18, 22) 23 | city.temp <- data.frame(cbind(a, b)) 24 | 25 | for(city in c("Berkeley", "Walnut Creek", "Richmond")){ 26 | if(sum(city==city.temp$a)>0){ 27 | print(city.temp[which(city==city.temp$a),]) 28 | # if we have the city in our data, then print it's temperature and the name of the city 29 | } 30 | if(sum(city==city.temp$a)==0){ 31 | print(paste(city, "is NOT in the data. 
:(", sep=" ")) 32 | # if not, then just print the name of the city next to "is Not in the data. :(" 33 | } 34 | } # Loops can be as complicated and long as they could be. Often not so efficient. 35 | 36 | ## ---- eval=FALSE--------------------------------------------------------- 37 | ## system.time( 38 | ## for(i in 1:1000){ 39 | ## print(i) 40 | ## }) 41 | ## 42 | ## system.time( 43 | ## for(i in 1:1000){ 44 | ## print(i) 45 | ## if(i == 50) break 46 | ## }) 47 | 48 | ## ------------------------------------------------------------------------ 49 | x <- 7 50 | if(x > 10){ 51 | print(x) 52 | 53 | }else{ # "else" should not start its own line. 54 | # Always let it be preceded by a closing brace on the same line. 55 | print("NOT BIG ENOUGH!!") 56 | } 57 | 58 | ## ------------------------------------------------------------------------ 59 | # ifelse(test, yes, no) 60 | gender <- sample(c("male", "female"), 100, replace=TRUE) 61 | gender 62 | gender <- ifelse(gender=="male", 1, 0) 63 | gender 64 | 65 | ## ---- eval=FALSE--------------------------------------------------------- 66 | ## # if there are multiple statements, then use ; to separate each statement 67 | ## x <- 0 68 | ## while(x < 5) {print(x <- x+1)} 69 | ## x <- 1 70 | ## while(x < 5) {x <- x+1; if (x == 3) break; print(x)} # break the loop when x=3 71 | 72 | ## ------------------------------------------------------------------------ 73 | f <- function(x) x + 1 74 | class(f) 75 | 76 | ## ------------------------------------------------------------------------ 77 | formals(f) 78 | body(f) 79 | environment(f) 80 | 81 | ## ------------------------------------------------------------------------ 82 | f <- function(x) x + y 83 | y <- 1 84 | f(x = 1) 85 | 86 | ## ------------------------------------------------------------------------ 87 | y <- 9001 88 | f <- function(x) { 89 | y <- 1 90 | g <- function (x) { 91 | x + y 92 | } 93 | g(x) 94 | } 95 | f(1) 96 | 97 | ## 
------------------------------------------------------------------------ 98 | h <- function(){ 99 | if (!exists('a')) { 100 | a <- 1 101 | } 102 | else { 103 | a <- 9000 104 | } 105 | print(a) 106 | } 107 | h() 108 | h() 109 | 110 | ## ------------------------------------------------------------------------ 111 | in_to_cm <- function(x) x * 2.5 112 | in_to_cm(69) 113 | 114 | ## ------------------------------------------------------------------------ 115 | in_to_m <- function(x){ 116 | in_to_cm(x) / 100 117 | } 118 | in_to_m(69) 119 | 120 | ## ------------------------------------------------------------------------ 121 | in_to_cm <- function(x) x * 2.54 122 | in_to_m(69) 123 | 124 | ## ------------------------------------------------------------------------ 125 | 69 == c(69) 126 | 127 | ## ------------------------------------------------------------------------ 128 | heights <- c(69,54,73,82) 129 | in_to_m(heights) 130 | 131 | ## ---- eval=FALSE--------------------------------------------------------- 132 | ## heights <- list(69,54,73,82) 133 | ## in_to_m(heights) 134 | 135 | ## ------------------------------------------------------------------------ 136 | in_to_m(heights[[1]]) 137 | in_to_m(heights[[2]]) 138 | in_to_m(heights[[3]]) 139 | 140 | ## ------------------------------------------------------------------------ 141 | lapply(heights, in_to_m) 142 | 143 | ## ------------------------------------------------------------------------ 144 | lapply(heights, FUN = function(x) x %/% 12) 145 | 146 | ## ------------------------------------------------------------------------ 147 | dat <- read.csv('data/large.csv') 148 | str(dat) 149 | lapply(dat, mean) 150 | 151 | ## ---- eval=FALSE--------------------------------------------------------- 152 | ## lapply(dat, mean(na.rm = TRUE)) 153 | 154 | ## ---- eval=FALSE--------------------------------------------------------- 155 | ## Map(mean, dat, na.rm=TRUE) 156 | 157 | ## 
------------------------------------------------------------------------ 158 | install.packages('parallelMap') 159 | library(parallelMap) 160 | system.time(Map(median, dat, na.rm=TRUE)) 161 | system.time(parallelMap(median, dat, na.rm=TRUE)) 162 | 163 | ## ---- eval=FALSE--------------------------------------------------------- 164 | ## install.packages('devtools') 165 | 166 | ## ------------------------------------------------------------------------ 167 | library(devtools) 168 | # has_devel() # this is currently returning a clang compiler error 169 | 170 | ## ---- eval=FALSE--------------------------------------------------------- 171 | ## devtools::create("convertR") 172 | 173 | ## ---- eval=FALSE--------------------------------------------------------- 174 | ## install.packages('roxygen2') 175 | 176 | ## ------------------------------------------------------------------------ 177 | library(roxygen2) 178 | 179 | ## ---- eval=FALSE--------------------------------------------------------- 180 | ## devtools::document('convertR') 181 | 182 | ## ---- eval=FALSE--------------------------------------------------------- 183 | ## devtools::document('convertR') 184 | 185 | ## ---- eval=FALSE--------------------------------------------------------- 186 | ## devtools::use_build_ignore("Rproj", pkg = "convertR") 187 | 188 | ## ---- eval=FALSE--------------------------------------------------------- 189 | ## devtools::check("convertR") 190 | 191 | ## ---- eval=FALSE--------------------------------------------------------- 192 | ## devtools::build('convertR') 193 | 194 | ## ------------------------------------------------------------------------ 195 | RJ <- readLines("http://shakespeare.mit.edu/romeo_juliet/full.html") 196 | 197 | ## ------------------------------------------------------------------------ 198 | RJ[grep("", RJ, perl=TRUE)] 199 | RJ[grep("<H3>", RJ, perl=TRUE)] 200 | RJ[grep("<h3>", RJ, perl=TRUE)] 201 | 202 | ## 
------------------------------------------------------------------------ 203 | x <- list(NA) 204 | y <- grep("<H3>", RJ, perl=TRUE) 205 | for(i in 1:length(y)){ 206 | if(i < length(y)){ 207 | x[[i]] <- RJ[c(y[i]:(y[i+1]-1))] 208 | } 209 | if(i == length(y)){ 210 | x[[i]] <- RJ[c(y[i]:length(RJ))] 211 | } 212 | } 213 | 214 | ## ------------------------------------------------------------------------ 215 | countR <- function(z){ 216 | return(c(length(grep("Romeo", z, perl=T)), length(grep("Juliet", z, perl=T)))) 217 | } 218 | lapply(x, countR) 219 | 220 | ## ------------------------------------------------------------------------ 221 | # now count the lines in each scene 222 | countL <- function(z){ 223 | return(length(grep("</A><br>$", z, perl=T))) 224 | } 225 | lapply(x, countL) 226 | 227 | -------------------------------------------------------------------------------- /instructor/day_four.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Day Four: Functional Programming" 3 | author: ["Dillon Niederhut", "Shinhye Choi"] 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | output: 6 | - pdf_document 7 | - slidy_presentation 8 | --- 9 | 10 | ```{r, echo=FALSE} 11 | knitr::opts_knit$set(root.dir = '../') 12 | ``` 13 | 14 | ## Introduction 15 | 16 | Remember how we said that R started off as a functional language? This means that, underneath the hood, R is built on many small functions that can be grouped together in smart ways to do powerful things. It also means that, if you want to do more complicated things in R than run summary statistics and linear models, you'll need to learn how to create and use functions. 17 | 18 | In R, functions are first-class citizens. This means that you can do anything to a function that you can do to an object, including using functions to create other functions. 
19 | 20 | Structuring a computer language around functions and their methods makes it easily parallelizable in ways that object oriented languages are usually not (for many complicated reasons that we don't have time to talk about). Key components of function-oriented languages are functions that write functions and the ability to map functions to data structures. 21 | 22 | Finally, when we talk about the amazing extensibility of R, what we mean is that other people have written useful functions that you can find and download. If R is required in your field, it is likely because there are many functions specific to your field that have been developed in R. We'll close the intensive with a brief introduction to packaging and sharing R functions. 23 | 24 | # Looping 25 | 26 | Before we get to understand how functions work and learn how to create functions, let us go over how a for-loop works in R. Many functions often have for-loops embedded in them, hence it will be useful to understand looping first. The basic syntax looks like the following: 27 | 28 | > side note - while you can use loops in R, the community strongly discourages explicit looping in favor of implicit loop functionals like `Map` and `lapply` 29 | 30 | 31 | syntax: for (variable in sequence) {statement} 32 | 33 | ```{r} 34 | ## I want to create a matrix of 2 to the power of n where n is 1 to 10. 35 | 36 | mat <- c(rep(NA, 10)) # first create a null vector 37 | 38 | # There are many ways to do the same task 39 | mat <- c(rep(NA, 6)) 40 | for(i in 1:6){ # I want to create a matrix of 2 to the power of n where n is 5 to 10. 
41 | mat[i] <- 2^(i+4) 42 | } # or 43 | 44 | mat <- c(rep(NA, 6)) 45 | for(i in 5:10){ 46 | mat[i-4] <- 2^i 47 | } # by setting sequence and statement accordingly 48 | ``` 49 | 50 | You can also loop over a non-numeric vector 51 | 52 | ```{r} 53 | a <- c("Berkeley", "SF", "Oakland") 54 | b <- c(20, 18, 22) 55 | city.temp <- data.frame(cbind(a, b)) 56 | 57 | for(city in c("Berkeley", "Walnut Creek", "Richmond")){ 58 | if(sum(city==city.temp$a)>0){ 59 | print(city.temp[which(city==city.temp$a),]) 60 | # if we have the city in our data, then print it's temperature and the name of the city 61 | } 62 | if(sum(city==city.temp$a)==0){ 63 | print(paste(city, "is NOT in the data. :(", sep=" ")) 64 | # if not, then just print the name of the city next to "is Not in the data. :(" 65 | } 66 | } # Loops can be as complicated and long as they could be. Often not so efficient. 67 | ``` 68 | 69 | How can we make the running time shorter? The "break" command can be handy. We use the command when we want to "end the looping" once the variable reaches where we want it to stop. 70 | 71 | ```{r, eval=FALSE} 72 | system.time( 73 | for(i in 1:1000){ 74 | print(i) 75 | }) 76 | 77 | system.time( 78 | for(i in 1:1000){ 79 | print(i) 80 | if(i == 50) break 81 | }) 82 | ``` 83 | 84 | Next we move on to control structures, such as if statements. ``If" statements are very useful when you want to assign different tasks to different subsets of data using a single for-loop. The basic syntax looks like the following: if(condition){statement} else{other statement} 85 | 86 | > side note - there is no `elseif` or `elif` keyword in R, which will be confusing to folks coming from Matlab and Python 87 | 88 | ```{r} 89 | x <- 7 90 | if(x > 10){ 91 | print(x) 92 | 93 | }else{ # "else" should not start its own line. 94 | # Always let it be preceded by a closing brace on the same line. 
95 | print("NOT BIG ENOUGH!!") 96 | } 97 | ``` 98 | 99 | The ``ifelse" function can be handy as long as you have two conditions that are mutually exclusive 100 | 101 | ```{r} 102 | # ifelse(test, yes, no) 103 | gender <- sample(c("male", "female"), 100, replace=TRUE) 104 | gender 105 | gender <- ifelse(gender=="male", 1, 0) 106 | gender 107 | ``` 108 | 109 | 110 | ``While" loops are used less frequently. The basic syntax: while(condition) {statement} 111 | 112 | ```{r, eval=FALSE} 113 | # if there are multiple statements, then use ; to separate each statement 114 | x <- 0 115 | while(x < 5) {print(x <- x+1)} 116 | x <- 1 117 | while(x < 5) {x <- x+1; if (x == 3) break; print(x)} # break the loop when x=3 118 | ``` 119 | 120 | # Functions 121 | 122 | ## it's really easy to create functions in R 123 | 124 | ```{r} 125 | f <- function(x) x + 1 126 | class(f) 127 | ``` 128 | 129 | ## every function has three parts 130 | 131 | every function needs inputs - these are called `arguments` (or, here, formal arguments) 132 | 133 | every function has stuff it does, and this stuff is contained in the body 134 | 135 | every function has an `environment` that it executes in 136 | 137 | ```{r} 138 | formals(f) 139 | body(f) 140 | environment(f) 141 | ``` 142 | 143 | ## environments are where the function was defined 144 | 145 | see how our function has `R_GlobalEnv` as its environment? 
that's because we defined it in the global environment 146 | 147 | this means that if you tell a function to look for an `object`, it will look in the global namespace 148 | 149 | ```{r} 150 | f <- function(x) x + y 151 | y <- 1 152 | f(x = 1) 153 | ``` 154 | 155 | ## it's very common for functions to be declared within another function 156 | 157 | ```{r} 158 | y <- 9001 159 | f <- function(x) { 160 | y <- 1 161 | g <- function (x) { 162 | x + y 163 | } 164 | g(x) 165 | } 166 | f(1) 167 | ``` 168 | 169 | this is important because it means that functions can be separated from the state of your computer (which is what makes them easy to parallelize) 170 | 171 | ## functions don't modify your computer state (usually) 172 | 173 | an obvious exception is writing/reading from disk, but what we really mean here is that anything created inside the function environment doesn't show up in the global environment 174 | 175 | ```{r} 176 | h <- function(){ 177 | if (!exists('a')) { 178 | a <- 1 179 | } 180 | else { 181 | a <- 9000 182 | } 183 | print(a) 184 | } 185 | h() 186 | h() 187 | ``` 188 | 189 | there are ways to make functions modify global variables, but this is generally not a good idea - anything that needs to go into a function should be in the arguments, and anything that needs to come out of the function should be returned 190 | 191 | > side note - R automatically returns the value of the last expression, so there is no need for an explicit `return` statement unless you want to break the function early 192 | 193 | a couple of days ago, we were dealing with data that came in several different units of length - let's try writing a function that converts inches to centimeters 194 | 195 | ```{r} 196 | in_to_cm <- function(x) x * 2.5 197 | in_to_cm(69) 198 | ``` 199 | 200 | that's not juvenile humor - it's actually Dillon's height in inches 201 | 202 | what if we want to know how tall we are in meters? 
203 | 204 | you could do `function(x) x * 2.5 / 100 ` but this would be repeating yourself 205 | 206 | then, when you figure out that the conversion factor is really *2.54*, not 2.5, you might update one and forget to update the other 207 | 208 | ```{r} 209 | in_to_m <- function(x){ 210 | in_to_cm(x) / 100 211 | } 212 | in_to_m(69) 213 | ``` 214 | 215 | now, if we go back and update `in_to_cm`, those changes automatically get propagated to `in_to_m` 216 | 217 | ```{r} 218 | in_to_cm <- function(x) x * 2.54 219 | in_to_m(69) 220 | ``` 221 | 222 | if you were here for the intro to Unix, this idea of small functions combined together should sound awfully familiar 223 | 224 | R is a bit quirky in that there is no such thing as an uncontained value, e.g. `4` is really a vector with length of one, and a value of 4 in position 1 225 | 226 | ```{r} 227 | 69 == c(69) 228 | ``` 229 | 230 | this means that R automatically broadcasts functions across vectors of any length 231 | 232 | ```{r} 233 | heights <- c(69,54,73,82) 234 | in_to_m(heights) 235 | ``` 236 | 237 | ## this doesn't work with lists 238 | 239 | ```{r, eval=FALSE} 240 | heights <- list(69,54,73,82) 241 | in_to_m(heights) 242 | ``` 243 | 244 | # Functionals 245 | 246 | a functional is a function that takes functions as arguments 247 | 248 | ## the wrong way to be functional 249 | 250 | imagine you want to apply a function to the columns of a dataframe (which is a list!)
251 | 252 | you could do something like this: 253 | 254 | ```{r} 255 | in_to_m(heights[[1]]) 256 | in_to_m(heights[[2]]) 257 | in_to_m(heights[[3]]) 258 | ``` 259 | 260 | but this is clunky, prone to errors, and can't accommodate changes in your list - if you added another item in the list, you would need to find every place you tried to do this and `ctrl-c` `ctrl-v` a whole bunch of crap 261 | 262 | ## the right way to be functional 263 | 264 | ```{r} 265 | lapply(heights, in_to_m) 266 | ``` 267 | 268 | ## it's not always smart to name functions 269 | 270 | these are called anonymous functions - they aren't actually any different, but you should know they exist 271 | 272 | ```{r} 273 | lapply(heights, FUN = function(x) x %/% 12) 274 | ``` 275 | 276 | ## lapply has limits 277 | 278 | ```{r} 279 | dat <- read.csv('data/large.csv') 280 | str(dat) 281 | lapply(dat, mean) 282 | ``` 283 | 284 | we *know* there are numbers there - why are the means all missing? 285 | 286 | a. we didn't use amelia 287 | b. 
`mean` has an argument named `na.rm` that ignores missingness 288 | 289 | *and for Hadley knows what reason, the default is FALSE* 290 | 291 | but we can't do this 292 | 293 | ```{r, eval=FALSE} 294 | lapply(dat, mean(na.rm = TRUE)) 295 | ``` 296 | 297 | ## so we use Map 298 | 299 | Map is a function similar to those found in other functional languages 300 | 301 | ```{r, eval=FALSE} 302 | Map(mean, dat, na.rm=TRUE) 303 | ``` 304 | 305 | ## this can be parallelized 306 | 307 | > side note - previous versions of these materials imported the `parallel` library, which is no longer supported as of R versions >= 3.2 308 | 309 | ```{r} 310 | install.packages('parallelMap') 311 | library(parallelMap) 312 | system.time(Map(median, dat, na.rm=TRUE)) 313 | system.time(parallelMap(median, dat, na.rm=TRUE)) 314 | ``` 315 | 316 | parallel processing incurs time costs from memory management and message passing that can make small jobs take longer in parallel than in serial 317 | 318 | # Packages 319 | 320 | **Real artists ship - Steve Jobs** 321 | 322 | ## why to package 323 | 324 | 1. embrace your inner sloth 325 | 2. Linus's law 326 | 3. R packages get cited 327 | 328 | ## how to package 329 | 330 | 1. document first 331 | 2. avoid feature creep 332 | 3. release early, release often 333 | 334 | we're going to be using devtools, an R package which makes it easier to build, install, and share packages 335 | 336 | ```{r, eval=FALSE} 337 | install.packages('devtools') 338 | ``` 339 | 340 | ```{r} 341 | library(devtools) 342 | # has_devel() # this is currently returning a clang compiler error 343 | ``` 344 | 345 | ## basic components of a package 346 | 347 | 1. the only thing a package needs to be a package is `DESCRIPTION` - this should tell you something about the importance of documentation (namely, that code without documentation is worthless) 348 | 349 | 2. your package will also need a name - keep it simple but unique (i.e. easily Googleable) 350 | 351 | 3. 
*the code* 352 | 353 | 4. namespace 354 | 355 | ## getting started 356 | 357 | we'll use devtools to create the boilerplate for us 358 | 359 | ```{r, eval=FALSE} 360 | devtools::create("convertR") 361 | ``` 362 | 363 | ## editing the DESCRIPTION 364 | 365 | the `DESCRIPTION` is a plaintext file in DCF format (similar to YAML). if you open it up, you should see something like this: 366 | 367 | ``` 368 | Package: convertR 369 | Title: What the Package Does (one line, title case) 370 | Version: 0.0.0.9000 371 | Authors@R: person("First", "Last", email = "first.last@example.com", role = c("aut", "cre")) 372 | Description: What the package does (one paragraph) 373 | Depends: R (>= 3.2.1) 374 | License: What license is it under? 375 | LazyData: true 376 | ``` 377 | 378 | it's your job to edit this to contain the correct information 379 | 380 | ## adding dependencies 381 | 382 | devtools automatically adds in that your code depends on the ability to run R, and at a version number equal to or greater than the one you are currently using 383 | 384 | what if there are other packages that your package uses? like ggplot2? do 385 | 386 | ``` 387 | Imports: ggplot2 388 | ``` 389 | 390 | and if you want to list optional packages, you can do so like this (entries are separated by commas): 391 | 392 | ``` 393 | Suggests: 394 | reshape2 (>=1.4.1), 395 | plyr (>=1.8.3) 396 | ``` 397 | 398 | > side note - moving the packages a line below is a stylistic choice so that they line up if there is more than one - be sure to indent! 
399 | 400 | ## adding your code 401 | 402 | generally speaking, your package should only contain definitions of objects, most of which are functions 403 | 404 | these are placed in `.R` files in the `/R` directory 405 | 406 | we can take the functions that we defined above: 407 | 408 | ``` 409 | in_to_cm <- function(x) x * 2.54 410 | 411 | in_to_m <- function(x){ 412 | in_to_cm(x) / 100 413 | } 414 | ``` 415 | 416 | put them in a file in `/R`, and save it with an informative name like `lengths.R` 417 | 418 | ## creating man pages 419 | 420 | these are descriptions of each object in your .R script 421 | 422 | you could write these in R's semi-LaTeX format yourself, but that's time-consuming 423 | 424 | as Raymond Hettinger would say "there must be a better way" 425 | 426 | ```{r, eval=FALSE} 427 | install.packages('roxygen2') 428 | ``` 429 | 430 | ```{r} 431 | library(roxygen2) 432 | ``` 433 | 434 | now we're going to add specialized comments to our `lengths.R` file 435 | 436 | ``` 437 | #' Converts inches to centimeters 438 | #' 439 | #' @param x A numeric 440 | #' @return Converted numeric 441 | #' @examples 442 | #' in_to_cm(1) 443 | #' in_to_cm(c(1,2,3)) 444 | in_to_cm <- function(x) x * 2.54 445 | 446 | #' Converts inches to meters 447 | #' 448 | #' @param x A numeric 449 | #' @return Converted numeric 450 | #' @examples 451 | #' in_to_m(1) 452 | #' in_to_m(c(1,2,3)) 453 | in_to_m <- function(x){ 454 | in_to_cm(x) / 100 455 | } 456 | ``` 457 | 458 | and now create the documentation with 459 | 460 | ```{r, eval=FALSE} 461 | devtools::document('convertR') 462 | ``` 463 | 464 | ## `NAMESPACE` 465 | 466 | this is a file that tells R what `names` from the `environment` your package calls, and what `names` your package is going to put into the `global environment` for the user 467 | 468 | again, you can write this yourself in `NAMESPACE`: 469 | 470 | ``` 471 | export(in_to_cm) 472 | export(in_to_m) 473 | ``` 474 | 475 | or you can have roxygen2 handle it for you by 
adding `#' @export` in the function blocks you want to have exported 476 | 477 | ``` 478 | #' Converts inches to centimeters 479 | #' 480 | #' @param x A numeric 481 | #' @return Converted numeric 482 | #' @examples 483 | #' in_to_cm(1) 484 | #' in_to_cm(c(1,2,3)) 485 | #' @export 486 | in_to_cm <- function(x) x * 2.54 487 | 488 | #' Converts inches to meters 489 | #' 490 | #' @param x A numeric 491 | #' @return Converted numeric 492 | #' @examples 493 | #' in_to_m(1) 494 | #' in_to_m(c(1,2,3)) 495 | #' @export 496 | in_to_m <- function(x){ 497 | in_to_cm(x) / 100 498 | } 499 | ``` 500 | 501 | then running: 502 | 503 | ```{r, eval=FALSE} 504 | devtools::document('convertR') 505 | ``` 506 | 507 | roxygen is careful in that it will only write files if they: 508 | 509 | a. do not exist yet 510 | b. were created by roxygen 511 | 512 | you can see this in the header: 513 | 514 | ``` 515 | # Generated by roxygen2 (4.1.1): do not edit by hand 516 | ``` 517 | 518 | ## data 519 | 520 | lastly, if you are shipping data with your code, it goes in the `/data` directory 521 | 522 | CRAN expects this to contain a single `.Rdata` file created by `save()` 523 | 524 | the other option is to use `devtools::use_data()` 525 | 526 | ## source packages are not bundled packages 527 | 528 | now you have a package! 529 | 530 | but no one else can use it 531 | 532 | this might be what you want, but you may want to share it 533 | 534 | first, we'll have to tell R what things are not a part of our shipped package 535 | 536 | ## adding .Rbuildignore 537 | 538 | CRAN is very fussy about what they allow to be uploaded, and in what format 539 | 540 | remember .gitignore? 
there's a similar file for R packages 541 | 542 | you can add regex to .Rbuildignore via devtools 543 | 544 | ```{r, eval=FALSE} 545 | devtools::use_build_ignore("Rproj", pkg = "convertR") 546 | ``` 547 | 548 | devtools automatically initiates this as a project file for RStudio, which no one else wants to see 549 | 550 | if you have a README, NEWS, or UPDATES file, you should add them to .Rbuildignore 551 | 552 | ## checking 553 | 554 | before you ship your code anywhere, you should check it to make sure it works the way it's supposed to 555 | 556 | ideally, you will have been doing this with unit testing all along, but that's beyond the scope of this class 557 | 558 | ```{r, eval=FALSE} 559 | devtools::check("convertR") 560 | ``` 561 | 562 | this checks over 50 compatibility issues, and takes a bit of time, even with our two tiny functions 563 | 564 | ## shipping to GitHub 565 | 566 | if you're planning on shipping to GitHub, you're pretty much set 567 | 568 | initialize git in your package, add the contents, and push to a repo 569 | 570 | ``` 571 | git init 572 | git add * 573 | git commit -m "initial commit" 574 | git remote add origin git@github.com:deniederhut/convertR 575 | git push -u origin master 576 | ``` 577 | 578 | ## shipping to CRAN 579 | 580 | this is a bit more involved 581 | 582 | first, you'll need to build your package 583 | 584 | ```{r, eval=FALSE} 585 | devtools::build('convertR') 586 | ``` 587 | 588 | this creates a compressed tarball, which should be called something like `convertR_0.0.0.9000.tar.gz` 589 | 590 | which you'll submit via [https://cran.r-project.org/submit.html](https://cran.r-project.org/submit.html) 591 | 592 | in the comments, you should include: 593 | 594 | 1. the environments you checked on 595 | 2. the results of `devtools::check()`, with an explanation for any errors 596 | 597 | 598 | # Practice 599 | 600 | ## Assignment 601 | 602 | Remember how we read lines from an html document of Romeo and Juliet on day two? 
603 | 604 | ```{r} 605 | RJ <- readLines("http://shakespeare.mit.edu/romeo_juliet/full.html") 606 | ``` 607 | 608 | Write functions to parse this document into acts, then count the number of times the words "Romeo" and "Juliet" appear in each act. Then, package these functions. 609 | 610 | ## Example code 611 | 612 | Take a look at the data and look for some pattern. 613 | 614 | ```{r} 615 | RJ[grep("<title>", RJ, perl=TRUE)] 616 | RJ[grep("<H3>", RJ, perl=TRUE)] 617 | RJ[grep("<h3>", RJ, perl=TRUE)] 618 | ``` 619 | 620 | Now that we know that the first line of each act begins with the string ``<H3>", we will create an empty list called x with one slot per act, and then assign all the lines in each act to each component in x. 621 | 622 | ```{r} 623 | y <- grep("<H3>", RJ, perl=TRUE) # line numbers where each act starts 624 | x <- vector("list", length(y)) # preallocate one slot per act 625 | for(i in seq_along(y)){ # seq_along() does nothing when y is empty; 1:length(y) would iterate over c(1, 0) 626 | if(i < length(y)){ 627 | x[[i]] <- RJ[c(y[i]:(y[i+1]-1))] 628 | } 629 | if(i == length(y)){ 630 | x[[i]] <- RJ[c(y[i]:length(RJ))] 631 | } 632 | } 633 | ``` 634 | 635 | How should we count the number of times the words appear in each act? Create a wrapper function that counts the words and returns the counts. 
636 | 637 | ```{r} 638 | countR <- function(z){ # z: character vector of lines; returns c(lines matching "Romeo", lines matching "Juliet") 639 | return(c(length(grep("Romeo", z, perl=TRUE)), length(grep("Juliet", z, perl=TRUE)))) 640 | } 641 | lapply(x, countR) 642 | ``` 643 | 644 | Now count the lines in each act 645 | 646 | ```{r} 647 | # now count the lines in each act (lines of z that end in "</A><br>") 648 | countL <- function(z){ 649 | return(length(grep("</A><br>$", z, perl=TRUE))) 650 | } 651 | lapply(x, countL) 652 | ``` 653 | 654 | # Acknowledgements 655 | 656 | ## Materials taken from: 657 | 658 | [Software Carpentry](https://swcarpentry.github.io/) 659 | 660 | [Hadley Wickham](http://adv-r.had.co.nz/) 661 | 662 | [more Hadley Wickham](http://r-pkgs.had.co.nz/) 663 | -------------------------------------------------------------------------------- /instructor/day_four.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/R-for-Data-Science/798d5b220d66afea5644a245c7327e4bb740039a/instructor/day_four.pdf -------------------------------------------------------------------------------- /instructor/day_one.R: -------------------------------------------------------------------------------- 1 | ## ---- echo=FALSE--------------------------------------------------------- 2 | knitr::opts_knit$set(root.dir = '../') 3 | 4 | ## ------------------------------------------------------------------------ 5 | ls 6 | 7 | ## ------------------------------------------------------------------------ 8 | my.name <- dir 9 | my.name 10 | 11 | ## ------------------------------------------------------------------------ 12 | my.name <- dir() 13 | my.name 14 | 15 | ## ------------------------------------------------------------------------ 16 | class(dir) 17 | 18 | ## ------------------------------------------------------------------------ 19 | sum(1,2,3) 20 | 21 | ## ------------------------------------------------------------------------ 22 | is.object(sum) 23 | 24 | ## 
------------------------------------------------------------------------ 25 | getwd() 26 | 27 | ## ---- eval=FALSE--------------------------------------------------------- 28 | ## setwd("/Users/dillonniederhut/Dropbox/dlab/R-for-Data-Science") 29 | 30 | ## ------------------------------------------------------------------------ 31 | dir() 32 | 33 | ## ------------------------------------------------------------------------ 34 | ls() 35 | 36 | ## ------------------------------------------------------------------------ 37 | test <- "I have no idea what I'm doing" 38 | ls() 39 | 40 | ## ---- eval=FALSE--------------------------------------------------------- 41 | ## rm(list = ls()) 42 | ## exists(test) 43 | 44 | ## ------------------------------------------------------------------------ 45 | ?exists 46 | 47 | ## ------------------------------------------------------------------------ 48 | ??exists 49 | 50 | ## ------------------------------------------------------------------------ 51 | example(exists) 52 | 53 | ## ------------------------------------------------------------------------ 54 | apropos('lm') 55 | 56 | ## ---- eval=FALSE--------------------------------------------------------- 57 | ## install.packages("Amelia") 58 | 59 | ## ------------------------------------------------------------------------ 60 | library(Amelia) 61 | 62 | ## ---- eval=FALSE--------------------------------------------------------- 63 | ## library(supercalifragilisticexpialedocious) 64 | 65 | ## ------------------------------------------------------------------------ 66 | 2 + 2 67 | 2 - 2 68 | 2 * 2 69 | 2 %% 2 70 | 2 %/% 2 71 | 2 / 2 72 | 2 ** 2 73 | 2 ** .5 74 | 2 ** -1 75 | 76 | 77 | ## ------------------------------------------------------------------------ 78 | abs(-2) 79 | pi 80 | round(pi,digits = 2) 81 | sign(-2) 82 | log(2) 83 | log10(2) 84 | cos(pi) 85 | 86 | ## ------------------------------------------------------------------------ 87 | TRUE & TRUE 88 | TRUE | FALSE 89 | 
xor(TRUE,FALSE) 90 | ! FALSE 91 | 1 & 1 92 | 1 & 0 93 | !0 94 | 95 | ## ------------------------------------------------------------------------ 96 | class(TRUE) 97 | class(1) 98 | 99 | ## ------------------------------------------------------------------------ 100 | class(FALSE) 101 | class(pi) 102 | class("Look mama I'm letters") 103 | class(as.Date("2015-07-27")) 104 | class(factor(c('undergraduate','graduate','professor','staff'))) 105 | 106 | ## ------------------------------------------------------------------------ 107 | TRUE + TRUE 108 | 2 & 1 109 | TRUE * TRUE 110 | 2 & -1 111 | 112 | ## ------------------------------------------------------------------------ 113 | if (9001) print('This is evaluated as a boolean value') 114 | 115 | ## ------------------------------------------------------------------------ 116 | my.character <- paste("Hey", "momma", "I'm", "a", "string") 117 | my.character 118 | 119 | ## ------------------------------------------------------------------------ 120 | substr(my.character,1,4) 121 | 122 | ## ------------------------------------------------------------------------ 123 | substr(my.character,1,4) <- "Yes " 124 | my.character 125 | 126 | ## ------------------------------------------------------------------------ 127 | strsplit(my.character, ' ') 128 | 129 | ## ------------------------------------------------------------------------ 130 | gsub('.', 'X', my.character) 131 | 132 | ## ------------------------------------------------------------------------ 133 | gsub('[.]', 'X', my.character) 134 | gsub('[g]', 'X', my.character) 135 | 136 | ## ------------------------------------------------------------------------ 137 | my.date <- as.Date("2015-07-27") 138 | my.date + 7 139 | weekdays(my.date + 7) 140 | my.date - 365 141 | weekdays(my.date - 365) 142 | 143 | ## ------------------------------------------------------------------------ 144 | my.factor <- factor(c('undergraduate','graduate','professor','staff')) 145 | levels(my.factor) 
146 | 147 | ## ------------------------------------------------------------------------ 148 | my.factor <- factor(c(1,2,3,4), 149 | levels=c(1,2,3,4), 150 | labels=c('undergraduate','graduate','professor','staff')) 151 | levels(my.factor) 152 | 153 | ## ------------------------------------------------------------------------ 154 | is.character(my.character) 155 | is.numeric(my.character) 156 | 157 | ## ------------------------------------------------------------------------ 158 | as.character(9) 159 | as.numeric(my.character) 160 | 161 | ## ------------------------------------------------------------------------ 162 | my.vector <- c(TRUE, TRUE, FALSE, FALSE, TRUE) 163 | my.vector 164 | 165 | ## ------------------------------------------------------------------------ 166 | your.vector <- c(1,2,3,4,5) 167 | my.vector * your.vector 168 | 169 | ## ------------------------------------------------------------------------ 170 | seq(from=0,to=length(my.vector),by=2) 171 | 172 | ## ------------------------------------------------------------------------ 173 | 1:length(my.vector) 174 | 175 | ## ------------------------------------------------------------------------ 176 | c(1,2,3) * c(TRUE, FALSE) 177 | 178 | ## ------------------------------------------------------------------------ 179 | my.vector[1] 180 | your.vector[1:2] 181 | my.vector[c(1,3)] 182 | 183 | ## ------------------------------------------------------------------------ 184 | my.list <- list(TRUE, 'two', 3) 185 | my.list 186 | 187 | ## ------------------------------------------------------------------------ 188 | str(my.list) 189 | 190 | ## ---- eval=FALSE--------------------------------------------------------- 191 | ## my.list * list(1, 'two', FALSE) 192 | 193 | ## ------------------------------------------------------------------------ 194 | my.list[1] 195 | 196 | ## ------------------------------------------------------------------------ 197 | my.list[[1]] 198 | 199 | ## 
------------------------------------------------------------------------ 200 | my.data <- data.frame(n = c(1,2,3),c=c('one','two','three'),b=c(TRUE,TRUE,FALSE)) 201 | my.data 202 | 203 | ## ------------------------------------------------------------------------ 204 | dim(my.data) #this gives you nrow() and ncol() 205 | colnames(my.data) 206 | rownames(my.data) 207 | 208 | ## ------------------------------------------------------------------------ 209 | my.data[1:2,3] 210 | 211 | ## ------------------------------------------------------------------------ 212 | str(my.data) 213 | 214 | ## ------------------------------------------------------------------------ 215 | my.data$b 216 | my.data$d <- c(my.date, my.date+7, my.date-7) 217 | my.data 218 | 219 | ## ------------------------------------------------------------------------ 220 | my.data$really.long.and.complicated.variable.name <- 999 221 | my.data$r 222 | 223 | ## ------------------------------------------------------------------------ 224 | rbind(my.data, my.data) 225 | cbind(my.data, my.data) 226 | 227 | -------------------------------------------------------------------------------- /instructor/day_one.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Day One: R Basics" 3 | author: "Dillon Niederhut" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | output: 6 | - pdf_document 7 | - slidy_presentation 8 | --- 9 | 10 | ```{r, echo=FALSE} 11 | knitr::opts_knit$set(root.dir = '../') 12 | ``` 13 | 14 | ## Pre-introduction 15 | 16 | You should start by having your class go to our github page at [github.com/dlab-berkeley/R-for-Data-Science](github.com/dlab-berkeley/R-for-Data-Science) to get the course materials either via: 17 | 18 | 1. `git clone https://github.com/dlab-berkeley/R-for-Data-Science.git`; or 19 | 2. 
clicking the `download zip` button on the right hand side of the screen 20 | 21 | The students won't need these materials today, but they will for the rest of the workshop. While everything is downloading, you can go on to: 22 | 23 | ## Introduction to the class 24 | 25 | These materials are meant to be guides for you, the instructor. Your students will retain more of this content if they type these commands themselves than if they read them off of the slide deck. That being said, at any time, you can create a slide deck by changing `output:` to be `slidy_presentation` instead of `pdf_document`. 26 | 27 | **If you are a student and you are looking at this PDF**, be sure not to copy the code examples directly from the PDF. The typesetter uses character encoding that might not match the local settings on your computer, and the examples won't work. If you have fallen behind or can't see the screen very well, and want to copy the code examples, do so from the `.R` files in this directory. 28 | 29 | It is a good idea to start off the class by asking folks why they want to learn R. Common responses include: 30 | 31 | 1. Stata/SPSS/Matlab is too expensive 32 | 2. I saw a pretty graph someone made in R 33 | 3. My field uses analytical packages written for R 34 | 4. I have a deep and burning desire for open and reproducible research 35 | 36 | The outline below is designed to give each of these kinds of students the tools they need to get what they want out of R while avoiding common pitfalls. As the instructor, you should draw on your own experience to include further examples and advice, especially for students who do not fall into one of the four categories above. 37 | 38 | ## Introduction to R 39 | 40 | It may also be helpful to start off with a little bit of background knowledge about R. I find that explicitly informing students about the design principles of a language is a quick way to bootstrap their intuitions about how to use that language. 
R, for example, is a very old language whose objective was to allow scientists to quickly and interactively conduct statistical tests when the only other options at the time were: 41 | 42 | 1. Compile a whole program in C or FORTRAN; or, 43 | 2. Do the math yourself with a pencil and a piece of paper 44 | 45 | Obviously, neither of these is optimal, but what might not be obvious is that they both share the same problems: they require lots of human time, and those humans have to be very knowledgeable about the mathematical principles underlying statistical computation (e.g. that even simple functions have multiple implementations to balance accuracy/efficiency for different input values). 46 | 47 | The good news is that very complicated processes like logistic regression are a single command in R. The bad news is that R is typically not concerned with being logical or consistent. If you find yourself wanting to tear your hair out, this is **normal**. 48 | 49 | # Object Oriented Programming 50 | 51 | In the grand scheme of computer software, object orientation is a way of organizing code such that it is easy to update without breaking. This means grouping functions that serve a similar purpose into hierarchies. However, stating it this way is confusing and abstract. 52 | 53 | You can think about it this way: a soccer ball is an object. So is a basketball. They share a lot of things in common. It's simpler to know that balls generally bounce than to explicitly declare for every ball I ever see in my entire life whether it bounces or not. I can't bounce you, for example, but you didn't need to tell me that when I met you. If I came to believe that people were bounce-able, I would update my idea of people generally, not every person specifically. 54 | 55 | We call things like you and basketball `objects`, and they are in `classes` like human and ball. 
If I want to create a new object, like a football, I don't have to declare every single thing there is to know about footballs. I can say it `inherits` `attributes` from the `class` ball, except that it's an oblate spheroid instead of a sphere. Easy. 56 | 57 | > side note - if you are coming from C++ or Java, be warned that objects in R do not have methods that are accessible with dot notation (in fact, the `.` is used just like `_`) 58 | 59 | ## everything in R is an object 60 | 61 | yes, even the commands, just watch 62 | 63 | ```{r} 64 | ls 65 | ``` 66 | 67 | `ls`, like basketball, is a specific thing with a `name` and stuff inside it that makes it `ls` and not dillon niederhut. in this particular instance, we are looking at the function that tells you what `objects` are in your `environment` 68 | 69 | until we get to functional programming, your `environment` is just R plus whatever you put in R 70 | 71 | ## in R, you store objects with names with the `<-` operator 72 | 73 | just like you need names to tell things apart, R does too 74 | 75 | ```{r} 76 | my.name <- dir 77 | my.name 78 | ``` 79 | 80 | ## names must be unique 81 | 82 | everytime you give an `object` a `name`, it removes anything that already had that `name` from your environment 83 | 84 | ```{r} 85 | my.name <- dir() 86 | my.name 87 | ``` 88 | 89 | you see those parentheses? that means you are calling an object (here, it's a function evaluator) on `dir`. 
90 | 91 | ## classes in R 92 | 93 | because it is code to be evaluated, `dir` belongs in a class called `function` 94 | 95 | ```{r} 96 | class(dir) 97 | ``` 98 | 99 | functions all have the same basic structure 100 | 101 | `function(arguments)`, where the arguments are other objects, like 102 | 103 | ```{r} 104 | sum(1,2,3) 105 | ``` 106 | 107 | `1,2,3` are also objects, with a class of their own 108 | 109 | when you call a function, it looks at the classes of the things you are calling it on to figure out how to behave 110 | 111 | in much the same way, if my function is to move things from point A to point B, the way I might do that to a basketball is different from the way I might do that to you 112 | 113 | what kind of class do you think `1` is? 114 | 115 | ## more bad news 116 | 117 | R started out as a functional programming language (more on this later), to which object orientation was later added 118 | 119 | this means that R doesn't know that some things are objects, because they predate the addition of class systems 120 | 121 | ```{r} 122 | is.object(sum) 123 | ``` 124 | 125 | most of R uses what are called S3 methods, which have no rules except be easy to use. this can make them wildly inconsistent, even to the point where a single function will have multiple sets of rules for how it can be called (you'll see this in day 3). 
126 | 127 | > as a side note, there is also no agreement about how to name things, so you'll likely see a mixture of snake_case and CamelCase, based on the preferences of the person who originally wrote the function 128 | 129 | # living in R 130 | 131 | ## figure out where you are with 132 | 133 | ```{r} 134 | getwd() 135 | ``` 136 | 137 | like in Unix, in R you are always in a directory 138 | 139 | your actions are all relative to that directory 140 | 141 | ## tell R where you would like it to be with 142 | 143 | ```{r, eval=FALSE} 144 | setwd("/Users/dillonniederhut/Dropbox/dlab/R-for-Data-Science") 145 | ``` 146 | 147 | ## find out what's in your directory with 148 | 149 | ```{r} 150 | dir() 151 | ``` 152 | 153 | ## find out what's in your environment with 154 | 155 | in R, you are always in an environment (more on scoping in day 4) 156 | 157 | ```{r} 158 | ls() 159 | ``` 160 | 161 | our environment is currently empty 162 | 163 | ```{r} 164 | test <- "I have no idea what I'm doing" 165 | ls() 166 | ``` 167 | 168 | ## we can clean our environment with 169 | 170 | ```{r, eval=FALSE} 171 | rm(list = ls()) 172 | exists(test) 173 | ``` 174 | 175 | ## you can pull documentation with `?` 176 | 177 | ```{r} 178 | ?exists 179 | ``` 180 | 181 | ## and search the help pages with `??` 182 | 183 | ```{r} 184 | ??exists 185 | ``` 186 | 187 | ## you can get a quick example with 188 | 189 | ```{r} 190 | example(exists) 191 | ``` 192 | 193 | when you kind of remember what you are looking for, try 194 | 195 | ```{r} 196 | apropos('lm') 197 | ``` 198 | 199 | # The power of R is its extensibility 200 | 201 | many people write clever software that makes R smarter/better/faster/stronger 202 | 203 | ## you can install these packages with 204 | 205 | ```{r, eval=FALSE} 206 | install.packages("Amelia") 207 | ``` 208 | 209 | > side note - by default, R tries to install packages to a write-protected directory on Windows machines. 
There are two ways around this: 1, say 'yes' to the option of installing the packages in a different location; or 2, use the `Tools / Install Packages` drop-down menu item in RStudio 210 | 211 | ## and include them in your environment with 212 | 213 | ```{r} 214 | library(Amelia) 215 | ``` 216 | 217 | note that when you are installing something, you give R a bunch of letters to search CRAN for, which is why it's in quotes 218 | 219 | but when you pull it into your environment, you are calling a function on a `name`, which is why it isn't in quotes 220 | 221 | ## if you try to call `library` on a package that you haven't downloaded, R will fuss at you 222 | 223 | ```{r, eval=FALSE} 224 | library(supercalifragilisticexpialedocious) 225 | ``` 226 | 227 | # Math in R 228 | 229 | ## R can be a calculator 230 | 231 | ```{r} 232 | 2 + 2 233 | 2 - 2 234 | 2 * 2 235 | 2 %% 2 236 | 2 %/% 2 237 | 2 / 2 238 | 2 ** 2 239 | 2 ** .5 240 | 2 ** -1 241 | 242 | ``` 243 | 244 | ## R does a few more complicated things 245 | 246 | ```{r} 247 | abs(-2) 248 | pi 249 | round(pi,digits = 2) 250 | sign(-2) 251 | log(2) 252 | log10(2) 253 | cos(pi) 254 | ``` 255 | 256 | ## R also handles logic tables and testing 257 | 258 | ```{r} 259 | TRUE & TRUE 260 | TRUE | FALSE 261 | xor(TRUE,FALSE) 262 | ! 
FALSE 263 | 1 & 1 264 | 1 & 0 265 | !0 266 | ``` 267 | 268 | # Data Types 269 | 270 | ## R differentiates between different types of data 271 | 272 | for example, the boolean and numeric values above 273 | 274 | ```{r} 275 | class(TRUE) 276 | class(1) 277 | ``` 278 | 279 | you could also use `mode` to get the type of an object 280 | 281 | this will mean later, when you try to call `mode` to get the most frequently occurring level of a variable, you will be frustrated and sad 282 | 283 | don't dislike the messenger 284 | 285 | you will likely only ever deal with five flavors of data in R, which are stored as 286 | 287 | ## three data types 288 | 289 | ```{r} 290 | class(FALSE) 291 | class(pi) 292 | class("Look mama I'm letters") 293 | class(as.Date("2015-07-27")) 294 | class(factor(c('undergraduate','graduate','professor','staff'))) 295 | ``` 296 | 297 | > side note - by default, R stores numbers as doubles (64 bit floating point numbers) which makes R very memory hungry. You can force it to use an integer type with the `L` suffix, like: `class(1L) ==` `r class(1L)` 298 | 299 | we've already dealt a lot with numerics above, so let's talk about 300 | 301 | # Boolean data 302 | 303 | ## logical values pretty much act like numerics 304 | 305 | ```{r} 306 | TRUE + TRUE 307 | 2 & 1 308 | TRUE * TRUE 309 | 2 & -1 310 | ``` 311 | 312 | this can make it easy to use if/then statements, as `if x` evaluates to `TRUE` if it is anything other than zero 313 | 314 | ```{r} 315 | if (9001) print('This is evaluated as a boolean value') 316 | ``` 317 | 318 | also, any vector (we'll talk about these below) multiplied by a boolean vector has all of its false values set to zero, which can be helpful for summing and averaging only specific cases 319 | 320 | # Character data 321 | 322 | ## character handling in R is fairly close to character handling in a Unix terminal 323 | 324 | ```{r} 325 | my.character <- paste("Hey", "momma", "I'm", "a", "string") 326 | my.character 327 | ``` 328 | 
329 | ## whitespace is the default separator in the paste function; if you don't want this, use `paste0()` 330 | 331 | ```{r} 332 | substr(my.character,1,4) 333 | ``` 334 | 335 | ## note here that R is not a zero-indexed language 336 | 337 | ```{r} 338 | substr(my.character,1,4) <- "Yes " 339 | my.character 340 | ``` 341 | 342 | ## you can separate characters with 343 | 344 | ```{r} 345 | strsplit(my.character, ' ') 346 | ``` 347 | 348 | ## you can substitute with 349 | 350 | ```{r} 351 | gsub('.', 'X', my.character) 352 | ``` 353 | 354 | R here treats the pattern as a regular expression (POSIX extended by default; pass `perl = TRUE` for Perl-compatible syntax), where `.` is a special shorthand for "anything" 355 | 356 | ## to be safe, put it in brackets 357 | 358 | ```{r} 359 | gsub('[.]', 'X', my.character) 360 | gsub('[g]', 'X', my.character) 361 | ``` 362 | 363 | # Datetime data 364 | 365 | ## R stores dates internally as the number of days since the epoch (1 Jan 1970) 366 | 367 | ```{r} 368 | my.date <- as.Date("2015-07-27") 369 | my.date + 7 370 | weekdays(my.date + 7) 371 | my.date - 365 372 | weekdays(my.date - 365) 373 | ``` 374 | 375 | ## the epoch is common to (most) Unix systems 376 | 377 | makes it easy to add and subtract days 378 | 379 | however, most other languages use seconds since the epoch, not days 380 | 381 | these can both cause interoperability issues 382 | 383 | # Factor data 384 | 385 | ## R stores factors internally as integers, and uses the character strings as labels 386 | 387 | ```{r} 388 | my.factor <- factor(c('undergraduate','graduate','professor','staff')) 389 | levels(my.factor) 390 | ``` 391 | 392 | notice how it sorts those levels alphabetically? 
393 | 394 | this can cause issues when making plots or trying to display in a particular order - if sort order is critical 395 | 396 | ## try giving your factor explicitly numeric levels and character labels 397 | 398 | ```{r} 399 | my.factor <- factor(c(1,2,3,4), 400 | levels=c(1,2,3,4), 401 | labels=c('undergraduate','graduate','professor','staff')) 402 | levels(my.factor) 403 | ``` 404 | 405 | # Testing and changing data types 406 | 407 | ## you can test types with `is.type`, e.g. 408 | 409 | ```{r} 410 | is.character(my.character) 411 | is.numeric(my.character) 412 | ``` 413 | 414 | ## you can change datatypes with `as.type`, e.g. 415 | 416 | ```{r} 417 | as.character(9) 418 | as.numeric(my.character) 419 | ``` 420 | 421 | trying to coerce types can lead to weird behavior 422 | 423 | # Data Structures 424 | 425 | there are five kinds of data structures in R, but you will probably only ever use three of these 426 | 427 | 1. vector 428 | 2. list 429 | 3. dataframe 430 | 431 | ## a vector is an ordered group of the same kind of data, e.g. 432 | 433 | ```{r} 434 | my.vector <- c(TRUE, TRUE, FALSE, FALSE, TRUE) 435 | my.vector 436 | ``` 437 | 438 | ## it doesn't matter what the datatype is, as long as it is all the same 439 | 440 | ```{r} 441 | your.vector <- c(1,2,3,4,5) 442 | my.vector * your.vector 443 | ``` 444 | 445 | ## you will frequently need to create vectors that are sequences of numbers 446 | 447 | ```{r} 448 | seq(from=0,to=length(my.vector),by=2) 449 | ``` 450 | 451 | ## R also gives you a shorthand operator for creating sequences where `by=1` 452 | 453 | ```{r} 454 | 1:length(my.vector) 455 | ``` 456 | 457 | remember what we said about multiplying logical vectors? 
458 | 459 | ## you can add and multiply vectors, but they need to be the same length 460 | 461 | ```{r} 462 | c(1,2,3) * c(TRUE, FALSE) 463 | ``` 464 | 465 | you will run into this issue a bunch dealing with dataframes and logical vectors 466 | 467 | ## you can pull elements out of a vector by 468 | 469 | ```{r} 470 | my.vector[1] 471 | your.vector[1:2] 472 | my.vector[c(1,3)] 473 | ``` 474 | 475 | ## a list is an ordered group of things that are not of the same type 476 | 477 | ```{r} 478 | my.list <- list(TRUE, 'two', 3) 479 | my.list 480 | ``` 481 | 482 | ## you can find out the attributes for and types of data in a list with 483 | 484 | ```{r} 485 | str(my.list) 486 | ``` 487 | 488 | ## lists are simple containers, and are not additive or multiplicative 489 | 490 | ```{r, eval=FALSE} 491 | my.list * list(1, 'two', FALSE) 492 | ``` 493 | 494 | ## subsetting a list with brackets pulls out the element along with its attribute 495 | 496 | this will be annoying when you try to pull values out of objects like summary(lm()) 497 | 498 | ```{r} 499 | my.list[1] 500 | ``` 501 | 502 | ## if you want only the element, use double brackets 503 | 504 | ```{r} 505 | my.list[[1]] 506 | ``` 507 | 508 | # Data frames 509 | 510 | ## inside R, a dataframe is just a list of equal-length vectors 511 | 512 | much like in SQL where a table is a tuple of attributes 513 | 514 | ```{r} 515 | my.data <- data.frame(n = c(1,2,3),c=c('one','two','three'),b=c(TRUE,TRUE,FALSE)) 516 | my.data 517 | ``` 518 | 519 | see how this is just a list of vectors? 
520 | 521 | ## you can learn some things about data frames 522 | 523 | ```{r} 524 | dim(my.data) #this gives you nrow() and ncol() 525 | colnames(my.data) 526 | rownames(my.data) 527 | ``` 528 | 529 | ## dataframes have some special operators they share with matrices - subset with brackets 530 | 531 | ```{r} 532 | my.data[1:2,3] 533 | ``` 534 | 535 | ## dataframes also have special operators that they inherit from lists 536 | 537 | ```{r} 538 | str(my.data) 539 | ``` 540 | 541 | ```{r} 542 | my.data$b 543 | my.data$d <- c(my.date, my.date+7, my.date-7) 544 | my.data 545 | ``` 546 | 547 | ## the dollar operator also does partial matching 548 | 549 | ```{r} 550 | my.data$really.long.and.complicated.variable.name <- 999 551 | my.data$r 552 | ``` 553 | 554 | since the number of rows in the dataframe (3) is a multiple of the length of the assignment (1), the assigned vector gets recycled (repeated) three times to fill the column 555 | 556 | ## you can combine data frames with 557 | 558 | ```{r} 559 | rbind(my.data, my.data) 560 | cbind(my.data, my.data) 561 | ``` 562 | 563 | you'll learn tomorrow about better ways to merge data, especially heterogeneous data 564 | 565 | # saving console output 566 | 567 | ## introduction 568 | 569 | at the end of the day, it's likely that one or two students will want to know how to "save what we did". the commands are of course already in the .R file that the students have been typing their notes into. If they want to save the console output, they basically have three options: 570 | 571 | 1. copy all the output and paste it into a separate text file; or, 572 | 2. use a sink; or, 573 | 3. write their notes as .Rmd 574 | 575 | ## sinks 576 | 577 | to use a sink, have the student put `sink('filename')` as the very first line in their notes, and `sink()` as the very last. then, when they re-run their entire .R file, the output will go to a text file called "filename" instead of the R console. for an example, see `save_console_output.*` in the examples directory. 
578 | 579 | ## .Rmd 580 | 581 | See [Dynamic documents in R Markdown](https://github.com/deniederhut/workshop_Rmd) 582 | 583 | # Acknowledgements 584 | 585 | ## Materials taken from: 586 | 587 | [Hadley Wickham](http://adv-r.had.co.nz/) 588 | -------------------------------------------------------------------------------- /instructor/day_one.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/R-for-Data-Science/798d5b220d66afea5644a245c7327e4bb740039a/instructor/day_one.pdf -------------------------------------------------------------------------------- /instructor/day_three.R: -------------------------------------------------------------------------------- 1 | ## ---- echo=FALSE--------------------------------------------------------- 2 | knitr::opts_knit$set(root.dir = '../') 3 | 4 | ## ------------------------------------------------------------------------ 5 | load('data/feedback.Rda') 6 | str(dat) 7 | 8 | ## ------------------------------------------------------------------------ 9 | summary(dat) 10 | table(dat$department) 11 | 12 | ## ------------------------------------------------------------------------ 13 | library(psych) 14 | describe(dat) 15 | 16 | ## ------------------------------------------------------------------------ 17 | library(dplyr) 18 | dat %>% group_by(gender) %>% summarize(n()) 19 | 20 | ## ------------------------------------------------------------------------ 21 | library(tidyr) 22 | dat %>% filter(!is.na(gender)) %>% group_by(gender, department) %>% 23 | summarize(n=n()) %>% spread(gender, n) 24 | 25 | ## ------------------------------------------------------------------------ 26 | install.packages('ggplot2') 27 | library(ggplot2) 28 | 29 | ## ------------------------------------------------------------------------ 30 | dat$wday <- factor(weekdays(dat$timestamp, abbreviate = TRUE), 31 | levels = c('Mon','Tue','Wed','Thu','Fri','Sat','Sun') 32 | ) 33 | 
summary(dat$wday) 34 | 35 | ## ------------------------------------------------------------------------ 36 | qplot(instructor.communicated, data = dat) 37 | qplot(wday, course.delivered, data = dat) 38 | 39 | ## ------------------------------------------------------------------------ 40 | ggplot(data=dat, aes(x=wday)) + geom_bar() 41 | 42 | ## ------------------------------------------------------------------------ 43 | ggplot(data=dat, aes(x=course.delivered)) + 44 | geom_histogram(binwidth=1) 45 | 46 | ## ------------------------------------------------------------------------ 47 | ggplot(data=dat, aes(x=course.delivered)) + 48 | geom_histogram(binwidth=1, fill = 'gold', colour= 'blue') 49 | 50 | ## ------------------------------------------------------------------------ 51 | ggplot(data=dat, aes(x=gender,y=interest)) + geom_boxplot() 52 | 53 | ## ------------------------------------------------------------------------ 54 | ggplot(data=dat, aes(x=instructor.communicated, y=course.delivered)) + geom_point() 55 | 56 | ## ------------------------------------------------------------------------ 57 | ggplot(data=dat, aes(x=instructor.communicated, y=course.delivered)) + 58 | geom_jitter() 59 | 60 | ## ------------------------------------------------------------------------ 61 | ggplot(data=dat, aes(x=instructor.communicated, y=course.delivered)) + 62 | geom_jitter(aes(colour = wday)) 63 | 64 | ## ------------------------------------------------------------------------ 65 | ggplot(data=dat, aes(x=wday, y=course.delivered)) + 66 | geom_boxplot(colour = 'gold') + 67 | geom_jitter(colour = 'blue') 68 | 69 | ## ------------------------------------------------------------------------ 70 | ggplot(data=dat, aes(x=instructor.communicated, y=course.delivered)) + 71 | geom_jitter() + 72 | stat_smooth(method = 'lm') 73 | 74 | ## ------------------------------------------------------------------------ 75 | ggplot(data=dat, aes(x=instructor.communicated, y=course.delivered, colour 
= wday)) + 76 | geom_jitter() + 77 | stat_smooth(method = 'lm', se = FALSE) 78 | 79 | ## ------------------------------------------------------------------------ 80 | ggplot(data=dat, aes(x=instructor.communicated, y=course.delivered)) + 81 | geom_jitter() + 82 | stat_smooth(method = 'lm', colour = 'black') + 83 | xlab('How well the instructor communicated (1-7)') + 84 | ylab('How well the course delivered advertised content (1-7)') + 85 | ggtitle("I have no idea what I'm doing") 86 | 87 | ## ------------------------------------------------------------------------ 88 | ggplot(data=dat, aes(x=instructor.communicated, y=course.delivered)) + 89 | geom_jitter() + 90 | stat_smooth(method = 'lm') + 91 | facet_grid(. ~ useful) 92 | 93 | ## ------------------------------------------------------------------------ 94 | t.test(dat$inside.barriers, dat$outside.barriers) 95 | 96 | ## ------------------------------------------------------------------------ 97 | t.test(dat$outside.barriers[dat$gender == "Male/Man"], dat$outside.barriers[dat$gender == "Female/Woman"]) 98 | 99 | ## ------------------------------------------------------------------------ 100 | t.test(outside.barriers ~ gender, data = dat, subset = dat$gender %in% c("Male/Man", "Female/Woman")) 101 | 102 | ## ------------------------------------------------------------------------ 103 | aov(outside.barriers ~ gender, data = dat) 104 | 105 | ## ------------------------------------------------------------------------ 106 | model.1 <- aov(outside.barriers ~ gender, data = dat) 107 | summary(model.1) 108 | 109 | ## ------------------------------------------------------------------------ 110 | TukeyHSD(model.1) 111 | 112 | ## ------------------------------------------------------------------------ 113 | cor.test(dat$outside.barriers, dat$inside.barriers) 114 | 115 | ## ------------------------------------------------------------------------ 116 | model.1 <- lm(inside.barriers ~ outside.barriers, data = dat) 117 | 
summary(model.1) 118 | 119 | ## ------------------------------------------------------------------------ 120 | model.2 <- lm(inside.barriers ~ outside.barriers + department, data = dat) 121 | summary(model.2) 122 | 123 | ## ------------------------------------------------------------------------ 124 | model.3 <- lm(inside.barriers ~ outside.barriers + department + outside.barriers*department, data = dat) 125 | summary(model.3) 126 | 127 | ## ------------------------------------------------------------------------ 128 | model.1$coefficients 129 | model.1$coefficients[[2]] 130 | 131 | ## ---- eval=FALSE--------------------------------------------------------- 132 | ## dat$residuals <- model.1$residuals 133 | 134 | ## ------------------------------------------------------------------------ 135 | dat.listwise <- dat[!is.na(dat$inside.barriers) & !is.na(dat$outside.barriers), ] 136 | dat.listwise$resid <- model.1$residuals 137 | 138 | ## ------------------------------------------------------------------------ 139 | ggplot(data = dat.listwise, aes(x=gender,y=resid)) + 140 | geom_boxplot() 141 | 142 | ## ------------------------------------------------------------------------ 143 | wilcox.test(dat$outside.barriers, dat$inside.barriers, alternative = "two.sided", paired = FALSE, mu = 0, conf.level = 0.95) 144 | 145 | ## ------------------------------------------------------------------------ 146 | cor.test(dat$outside.barriers, dat$inside.barriers, method = 'spearman') 147 | 148 | ## ------------------------------------------------------------------------ 149 | chisq.test(dat$gender, dat$department) 150 | 151 | ## ------------------------------------------------------------------------ 152 | names(data) 153 | 154 | -------------------------------------------------------------------------------- /instructor/day_three.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Day Three: Data Analysis" 3 | author: "Dillon 
Niederhut" 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | output: 6 | - pdf_document 7 | - slidy_presentation 8 | --- 9 | 10 | ```{r, echo=FALSE} 11 | knitr::opts_knit$set(root.dir = '../') 12 | ``` 13 | 14 | ## Pre-introduction 15 | 16 | While everyone is getting situated and/or cloning the course materials, pull up `feedback_cleaner.R`. As a review of day 2, walk through the different parts of the script, and ask the students to describe to you what each piece does. For example, 17 | 18 | ``` 19 | dat$timestamp <- sub(' [0-9]+:[0-9]+:[0-9]+', '', dat$timestamp) 20 | dat$timestamp <- as.Date(dat$timestamp, "%m/%d/%Y") 21 | ``` 22 | 23 | is code that strips the time of day from month/day/year timestamps so that R can read them as date-type values. 24 | 25 | ## Introduction 26 | 27 | analysis generally proceeds in two steps: 28 | 29 | 1. exploratory data analysis 30 | 2. statistical inference 31 | 32 | our treatment of graphing owes a lot to the Grammar of Graphics 33 | 34 | # Summarizing 35 | 36 | ## let's load in some data about D-Lab feedback 37 | 38 | ```{r} 39 | load('data/feedback.Rda') 40 | str(dat) 41 | ``` 42 | 43 | ## R provides two easy/simple summary functions in the base package 44 | 45 | ```{r} 46 | summary(dat) 47 | table(dat$department) 48 | ``` 49 | 50 | ## the `psych` package provides trimmed means, skew, kurtosis, and missingness 51 | 52 | ```{r} 53 | library(psych) 54 | describe(dat) 55 | ``` 56 | 57 | ## you can use dplyr::group_by to generate summaries 58 | 59 | ```{r} 60 | library(dplyr) 61 | dat %>% group_by(gender) %>% summarize(n()) 62 | ``` 63 | 64 | ## and you can combine dplyr with tidyr::spread to generate crosstabs 65 | 66 | > side note - we are filtering out missing values of gender, because `tidyr` doesn't allow `NA` as a column name 67 | 68 | ```{r} 69 | library(tidyr) 70 | dat %>% filter(!is.na(gender)) %>% group_by(gender, department) %>% 71 | summarize(n=n()) %>% spread(gender, n) 72 | ``` 73 | 74 | # Plotting 75 | 76 | ## every time you use 
`base::plot`, [Edward Tufte does something unkind to a cute animal](http://markandrewgoetz.com/blog/2009/11/my-new-wallpaper/) 77 | 78 | - we'll be using ggplot, R's implementation of the **grammar of graphics** 79 | 80 | - in this grammar, you use 'aesthetics' to define how data is mapped to objects in the graph space 81 | 82 | - each graph space has at least three layers: 83 | - theme/background/annotations 84 | - axes 85 | - objects 86 | 87 | - most objects are geometric shapes 88 | 89 | - some objects are statistics built on those shapes 90 | 91 | - you can stack as many layers as you like 92 | 93 | ```{r} 94 | install.packages('ggplot2') 95 | library(ggplot2) 96 | ``` 97 | 98 | ## getting weekdays 99 | 100 | let's imagine that we are interested in looking at differences in feedback based on the day of the week -- how would we do this in R? 101 | 102 | > side note - `weekdays` is locale aware, so students who have their laptop language set to something other than English will get their weekday names in the other language 103 | 104 | ```{r} 105 | dat$wday <- factor(weekdays(dat$timestamp, abbreviate = TRUE), 106 | levels = c('Mon','Tue','Wed','Thu','Fri','Sat','Sun') 107 | ) 108 | summary(dat$wday) 109 | ``` 110 | 111 | ## use qplot for initial poking around 112 | 113 | it has very strong intuitions about what you want to see, and is not particularly customizable 114 | 115 | ```{r} 116 | qplot(instructor.communicated, data = dat) 117 | qplot(wday, course.delivered, data = dat) 118 | ``` 119 | 120 | ## for 1D categorical, use bar 121 | 122 | ```{r} 123 | ggplot(data=dat, aes(x=wday)) + geom_bar() 124 | ``` 125 | 126 | ## for 1D continuous, use hist 127 | 128 | this is really just convenience for `geom_bar(stat = 'bin')`, as opposed to bar plots, whose `stat` is `'count'` 129 | 130 | ```{r} 131 | ggplot(data=dat, aes(x=course.delivered)) + 132 | geom_histogram(binwidth=1) 133 | ``` 134 | 135 | you can add color to this plot 136 | 137 | ```{r} 138 | ggplot(data=dat, 
aes(x=course.delivered)) + 139 | geom_histogram(binwidth=1, fill = 'gold', colour= 'blue') 140 | ``` 141 | 142 | GO BEARS 143 | 144 | ## for many 1D variables, use a box plot 145 | 146 | these are handy for a whole bunch of reasons, and you should make them your close associates 147 | 148 | ```{r} 149 | ggplot(data=dat, aes(x=gender,y=interest)) + geom_boxplot() 150 | ``` 151 | 152 | ## to plot two continuous variables, use points 153 | 154 | ```{r} 155 | ggplot(data=dat, aes(x=instructor.communicated, y=course.delivered)) + geom_point() 156 | ``` 157 | 158 | all of these values are discrete, which makes them hard to see 159 | 160 | ## to scatter points randomly, use jitter 161 | 162 | this is really just convenience for `geom_point(position = 'jitter')` 163 | 164 | ```{r} 165 | ggplot(data=dat, aes(x=instructor.communicated, y=course.delivered)) + 166 | geom_jitter() 167 | ``` 168 | 169 | not only can you add color, you can make the color a mapping of other variables 170 | 171 | ```{r} 172 | ggplot(data=dat, aes(x=instructor.communicated, y=course.delivered)) + 173 | geom_jitter(aes(colour = wday)) 174 | ``` 175 | 176 | the last time we used `colour` it was not an aesthetic - why is it now? 
177 | 178 | ## you can stack layers until your eyes hurt 179 | 180 | ```{r} 181 | ggplot(data=dat, aes(x=wday, y=course.delivered)) + 182 | geom_boxplot(colour = 'gold') + 183 | geom_jitter(colour = 'blue') 184 | ``` 185 | 186 | ## add summary functions with smooth 187 | 188 | ```{r} 189 | ggplot(data=dat, aes(x=instructor.communicated, y=course.delivered)) + 190 | geom_jitter() + 191 | stat_smooth(method = 'lm') 192 | ``` 193 | 194 | if you are using colour as an aesthetic, you'll produce stats for each color 195 | 196 | ```{r} 197 | ggplot(data=dat, aes(x=instructor.communicated, y=course.delivered, colour = wday)) + 198 | geom_jitter() + 199 | stat_smooth(method = 'lm', se = FALSE) 200 | ``` 201 | 202 | ## good scientists put units on their axes 203 | 204 | ```{r} 205 | ggplot(data=dat, aes(x=instructor.communicated, y=course.delivered)) + 206 | geom_jitter() + 207 | stat_smooth(method = 'lm', colour = 'black') + 208 | xlab('How well the instructor communicated (1-7)') + 209 | ylab('How well the course delivered advertised content (1-7)') + 210 | ggtitle("I have no idea what I'm doing") 211 | ``` 212 | 213 | the general point here is that every single object on this graph is customizable 214 | 215 | frequent customizations are very simple to add 216 | 217 | infrequent customizations will take a lot of tinkering on your part 218 | 219 | ## facetting 220 | 221 | often useful for looking at relationships between three variables at the same time 222 | 223 | ```{r} 224 | ggplot(data=dat, aes(x=instructor.communicated, y=course.delivered)) + 225 | geom_jitter() + 226 | stat_smooth(method = 'lm') + 227 | facet_grid(. 
~ useful) 228 | ``` 229 | 230 | # Mean testing 231 | 232 | a picture is worth 1,000 words, but a p-value is worth a dissertation 233 | 234 | basically, inferential statistics is the application of probability theory to decide what is real and what isn't 235 | 236 | we'll start by trying to tell whether differences between group summaries are real 237 | 238 | ## t.test with two vectors (default method) 239 | 240 | ```{r} 241 | t.test(dat$inside.barriers, dat$outside.barriers) 242 | ``` 243 | 244 | note that R takes care of the defaults for you - what it is really computing is `t.test(dat$inside.barriers, dat$outside.barriers, alternative = "two.sided", paired = FALSE, var.equal = FALSE, mu = 0, conf.level = 0.95)` 245 | 246 | how would you find this out for yourself? 247 | 248 | ## t.test with subsets of one vector (default method) 249 | 250 | ```{r} 251 | t.test(dat$outside.barriers[dat$gender == "Male/Man"], dat$outside.barriers[dat$gender == "Female/Woman"]) 252 | ``` 253 | 254 | recall that we mentioned inconsistency on day one - here it is, and in a big way 255 | 256 | ## t.test with S3 method 257 | 258 | ```{r} 259 | t.test(outside.barriers ~ gender, data = dat, subset = dat$gender %in% c("Male/Man", "Female/Woman")) 260 | ``` 261 | 262 | ## aov 263 | 264 | first, you would think anova would be called by `anova`, but that's reserved for conducting F-tests on lm objects 265 | 266 | second, you really shouldn't be using anova, but if you must do it in R, the syntax looks like this 267 | 268 | > side note - ANOVA was invented by Ron Fisher to make it easy to do linear models with only a pencil and paper, and has been superseded by regression since the advent of computation in the 70s 269 | 270 | ```{r} 271 | aov(outside.barriers ~ gender, data = dat) 272 | ``` 273 | 274 | this isn't particularly helpful, but remember that it is an object, and we can call other, more helpful functions, on that object 275 | 276 | remember our old friend `summary`? 
it works on almost everything 277 | 278 | ```{r} 279 | model.1 <- aov(outside.barriers ~ gender, data = dat) 280 | summary(model.1) 281 | ``` 282 | 283 | that's a little better - but what about post-hoc testing? 284 | 285 | ```{r} 286 | TukeyHSD(model.1) 287 | ``` 288 | 289 | > side note - apparently Stata stores all of the models that you generate, whether you assign them names or not; in R, you must explicitly give your models names or they will disappear into the ether 290 | 291 | # linear models 292 | 293 | mean tests are really just a subset of linear models where your predictor is a category 294 | 295 | ## cor.test (Pearson) 296 | 297 | earlier, we were looking at differences between the means of two variables 298 | 299 | but those variables were both continuous, so we can ask whether they are related 300 | 301 | ```{r} 302 | cor.test(dat$outside.barriers, dat$inside.barriers) 303 | ``` 304 | 305 | okay, so they're related - now what? 306 | 307 | ## lm 308 | 309 | this is probably the closest you will get to building a linear model by hand 310 | 311 | this means lm is a powerful tool, but you have to know what you're doing 312 | 313 | the basic call is the S3 method 314 | 315 | ```{r} 316 | model.1 <- lm(inside.barriers ~ outside.barriers, data = dat) 317 | summary(model.1) 318 | ``` 319 | 320 | ## R automatically one-hot encodes your categories 321 | 322 | ```{r} 323 | model.2 <- lm(inside.barriers ~ outside.barriers + department, data = dat) 324 | summary(model.2) 325 | ``` 326 | 327 | ## R does not assume you want the full factorial model 328 | 329 | ```{r} 330 | model.3 <- lm(inside.barriers ~ outside.barriers + department + outside.barriers*department, data = dat) 331 | summary(model.3) 332 | ``` 333 | 334 | ## extract model parameters with `$` 335 | 336 | ```{r} 337 | model.1$coefficients 338 | model.1$coefficients[[2]] 339 | ``` 340 | 341 | ## this is useful if you want to plot residuals 342 | 343 | ```{r, eval=FALSE} 344 | dat$residuals <- 
model.1$residuals 345 | ``` 346 | 347 | oh boy golly gee gosh darn! remember how we talked about R having casewise deletion + bad indexing? this is one place where it makes your life difficult 348 | 349 | we have to do something like this: 350 | 351 | ```{r} 352 | dat.listwise <- dat[!is.na(dat$inside.barriers) & !is.na(dat$outside.barriers), ] 353 | dat.listwise$resid <- model.1$residuals 354 | ``` 355 | 356 | then we can do this 357 | 358 | ```{r} 359 | ggplot(data = dat.listwise, aes(x=gender,y=resid)) + 360 | geom_boxplot() 361 | ``` 362 | 363 | # Nonparametric 364 | 365 | parametric refers to using means, deviations, and other estimates of population parameters 366 | 367 | *BUT* what if you don't want to make assumptions about the structure of the population? 368 | 369 | or what if you **gasp** can't? 370 | 371 | ## ranked variables 372 | 373 | a simple case is where means don't have meaning 374 | 375 | above we were looking at correlations between Likert variables 376 | 377 | all Likerts are really rank variables, which means they don't act like actual number-y numbers 378 | 379 | in the real world, a 6 foot tall person is twice as tall as a 3 foot tall person 380 | 381 | but is a level '6' really twice as many barriers to access as a '3'? 382 | 383 | **NOPE** 384 | 385 | we know that 6 is more than 3, but can't really say how much - in that sense then, a scale of 1-7 is exactly the same thing as a scale of a-g. 386 | 387 | ## median testing ranks 388 | 389 | we use Mann-Whitney sums to test that the ranks are centered the same way 390 | 391 | ```{r} 392 | wilcox.test(dat$outside.barriers, dat$inside.barriers, alternative = "two.sided", paired = FALSE, mu = 0, conf.level = 0.95) 393 | ``` 394 | 395 | see how this setup looks exactly like a t-test? 
that's not an accident 396 | 397 | ## correlating ranks 398 | 399 | this is just like the `cor.test` you did above, but with `method` set to equal 'spearman' instead of pearson 400 | 401 | ```{r} 402 | cor.test(dat$outside.barriers, dat$inside.barriers, method = 'spearman') 403 | ``` 404 | 405 | rho is pretty close to the r from above 406 | 407 | ## chisq 408 | 409 | what if both of your variables are categories? we can test their counts with R's built in `chisq.test` function 410 | 411 | i.e. what if we want to know if gender is distributed evenly over departments? 412 | 413 | ```{r} 414 | chisq.test(dat$gender, dat$department) 415 | ``` 416 | 417 | # Practice 418 | 419 | ## Assignment 420 | 421 | There were a lot of variables in this dataset that we did not look at today: 422 | 423 | ```{r} 424 | names(dat) 425 | ``` 426 | 427 | Choose two of those variables, and explore their distribution and relationship to each other. Can you conclude anything about the D-Lab based on the feedback? 
428 | 429 | # Acknowledgements 430 | 431 | ## Materials taken from: 432 | 433 | [D-Lab's Feedback Analytics](https://github.com/dlab-berkeley/feedback-analytics) -------------------------------------------------------------------------------- /instructor/day_three.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/R-for-Data-Science/798d5b220d66afea5644a245c7327e4bb740039a/instructor/day_three.pdf -------------------------------------------------------------------------------- /instructor/day_two.R: -------------------------------------------------------------------------------- 1 | ## ---- echo=FALSE--------------------------------------------------------- 2 | knitr::opts_knit$set(root.dir = '../') 3 | 4 | ## ---- eval=FALSE--------------------------------------------------------- 5 | ## data(state) 6 | ## str(state.x77) 7 | 8 | ## ------------------------------------------------------------------------ 9 | state.division 10 | length(state.division) 11 | levels(state.division) 12 | 13 | ## ------------------------------------------------------------------------ 14 | state <- state.x77 15 | rm(state.x77) 16 | state <- as.data.frame(state) 17 | head(state) 18 | 19 | ## ------------------------------------------------------------------------ 20 | my.data <- data.frame(n = c(1, 2, 3), 21 | c=c('one', 'two', 'three'), 22 | b=c(TRUE, TRUE, FALSE), 23 | d=c(as.Date("2015-07-27"), 24 | as.Date("2015-07-27")+7, 25 | as.Date("2015-07-27")-7), 26 | really.long.and.complicated.variable.name=999) 27 | 28 | ## ------------------------------------------------------------------------ 29 | str(my.data) 30 | 31 | ## ------------------------------------------------------------------------ 32 | read.table("data/mydata.csv", sep=',', header = TRUE) 33 | 34 | ## ------------------------------------------------------------------------ 35 | read.csv("data/mydata.csv") 36 | 37 | ## 
------------------------------------------------------------------------ 38 | read.csv("data/mydata.csv", nrows=2) 39 | 40 | ## ------------------------------------------------------------------------ 41 | load("data/mydata.Rda") 42 | 43 | ## ---- eval=FALSE--------------------------------------------------------- 44 | ## # WARNING! xlsx package install crashed current version of RStudio 45 | ## install.packages("xlsx") 46 | ## library(xlsx) 47 | ## read.xlsx("data/cpds_excel_new.xlsx") 48 | 49 | ## ---- eval=FALSE--------------------------------------------------------- 50 | ## # examples of these? 51 | ## install.packages("foreign") 52 | ## library(foreign) 53 | ## read.dta("data/cpds_stata.dta") 54 | ## read.spss() 55 | ## read.octave() 56 | 57 | ## ------------------------------------------------------------------------ 58 | dirty <- read.csv('data/dirty.csv') 59 | str(dirty) 60 | 61 | ## ------------------------------------------------------------------------ 62 | dirty <- read.csv('data/dirty.csv',stringsAsFactors = FALSE) 63 | str(dirty) 64 | 65 | ## ------------------------------------------------------------------------ 66 | tail(dirty) 67 | dirty <- dirty[1:5,-6] 68 | dim(dirty) 69 | 70 | ## ------------------------------------------------------------------------ 71 | names(dirty) 72 | names(dirty) <- c("time", "height", "dept", "enroll", "birth.order") 73 | 74 | ## ------------------------------------------------------------------------ 75 | dirty$enroll 76 | 77 | ## ------------------------------------------------------------------------ 78 | table(dirty$enroll) 79 | dirty$enroll[dirty$enroll=="999"] <- NA 80 | table(dirty$enroll, useNA = "ifany") 81 | 82 | ## ------------------------------------------------------------------------ 83 | class(dirty$height) 84 | as.numeric(dirty$height) 85 | 86 | ## ------------------------------------------------------------------------ 87 | dirty$height[grep("’", dirty$height, perl=TRUE)] <- 5*30.48 + 9*2.54 88 | 
dirty$height[2] <- 70*2.54 89 | dirty$height[3] <- 2.1*100 90 | 91 | ## ------------------------------------------------------------------------ 92 | dirty$dept 93 | dirty$dept <- tolower(dirty$dept) 94 | dirty$dept <- gsub(' ', '', dirty$dept) # what did we just do? 95 | dirty$dept[4] <- "geology" 96 | dirty[dirty == "999"] <- NA 97 | 98 | ## ------------------------------------------------------------------------ 99 | dirty$time <- as.Date(dirty$time,'%m/%d/%Y') 100 | dirty$height <- as.numeric(dirty$height) 101 | dirty$dept <- as.factor(dirty$dept) 102 | dirty$enroll <- as.factor(dirty$enroll) 103 | dirty$birth.order <- as.numeric(dirty$birth.order) 104 | str(dirty) 105 | 106 | ## ------------------------------------------------------------------------ 107 | na.omit(dirty) 108 | 109 | ## ------------------------------------------------------------------------ 110 | nrow(dirty) 111 | sum(is.na(dirty$height)) 112 | sum(is.na(dirty$birth.order)) 113 | length(lm(height ~ birth.order,data=dirty)$fitted.values) 114 | 115 | ## ------------------------------------------------------------------------ 116 | library(Amelia) 117 | 118 | ## ------------------------------------------------------------------------ 119 | large <- read.csv('data/large.csv') 120 | summary(large) 121 | nrow(na.omit(large)) 122 | 123 | ## ------------------------------------------------------------------------ 124 | a <- amelia(large,m = 1) 125 | print(a) 126 | 127 | ## ------------------------------------------------------------------------ 128 | large.imputed <- a[[1]][[1]] 129 | summary(large.imputed) 130 | 131 | ## ------------------------------------------------------------------------ 132 | a <- amelia(large[990:1000,],m = 1) 133 | print(a) 134 | 135 | ## ------------------------------------------------------------------------ 136 | 1 == 2 137 | 1 != 1 138 | 1 >= 1 139 | 140 | ## ------------------------------------------------------------------------ 141 | 1 >= c(0,1,2) 142 | 143 | ## 
------------------------------------------------------------------------ 144 | c(1,2) >= c(1,2,3) 145 | c(1,2) >= c(1,2,3,4) # why no warning this time? R recycles! 146 | 147 | ## ------------------------------------------------------------------------ 148 | my.data$n == 2 149 | my.data[my.data$n == 2,] 150 | 151 | ## ------------------------------------------------------------------------ 152 | my.data[my.data$b,] 153 | 154 | ## ------------------------------------------------------------------------ 155 | my.data[,'d'] 156 | 157 | ## ------------------------------------------------------------------------ 158 | good.things <- c("three", "four", "five") 159 | my.data[my.data$c %in% good.things, ] 160 | 161 | ## ------------------------------------------------------------------------ 162 | str(my.data[!(my.data$c %in% good.things), ]) 163 | 164 | ## ------------------------------------------------------------------------ 165 | str(my.data$n) 166 | 167 | ## ---- eval=FALSE--------------------------------------------------------- 168 | ## install.packages('tidyr') 169 | ## install.packages('stringr') 170 | ## install.packages('dplyr') 171 | 172 | ## ------------------------------------------------------------------------ 173 | library(tidyr) 174 | library(stringr) 175 | library(dplyr) 176 | 177 | ## ------------------------------------------------------------------------ 178 | abnormal <- data.frame(name = c('Alice','Bob','Eve'), 179 | time1 = c(90,90,150), 180 | time2 = c(100,95,100)) 181 | 182 | ## ------------------------------------------------------------------------ 183 | normal <- gather(abnormal, "time", "score", time1, time2) 184 | normal 185 | 186 | ## ------------------------------------------------------------------------ 187 | normal$id <- seq(1:nrow(normal)) 188 | normal$time <- str_replace(normal$time,'time','') 189 | normal$time <- as.numeric(normal$time) 190 | 191 | ## 
------------------------------------------------------------------------ 192 | normal[normal$time == 1,] 193 | normal[normal$name == 'Alice',] 194 | 195 | ## ------------------------------------------------------------------------ 196 | t.test(score ~ time, data=normal) 197 | 198 | ## ------------------------------------------------------------------------ 199 | data.1 <- read.csv('data/merge_practice_1.csv') 200 | data.2 <- read.csv('data/merge_practice_2.csv') 201 | str(data.1) 202 | str(data.2) 203 | 204 | ## ------------------------------------------------------------------------ 205 | merge(data.1, data.2, by = 'id') 206 | 207 | ## ------------------------------------------------------------------------ 208 | merge(data.1, data.2, by = 'id', all = TRUE) 209 | 210 | ## ------------------------------------------------------------------------ 211 | lookup <- read.csv('data/merge_practice_3.csv') 212 | str(lookup) 213 | 214 | ## ------------------------------------------------------------------------ 215 | merge(data.1, lookup, by = "location") 216 | 217 | ## ------------------------------------------------------------------------ 218 | lookup[lookup$location == 'Reno', ] 219 | 220 | ## ------------------------------------------------------------------------ 221 | library(dplyr) 222 | 223 | ## ------------------------------------------------------------------------ 224 | normal 225 | arrange(normal, score) 226 | 227 | ## ------------------------------------------------------------------------ 228 | summarise(normal, mean(score), sd(score)) 229 | 230 | ## ------------------------------------------------------------------------ 231 | group_by(normal, time) 232 | summarize(group_by(normal, time), mean(score)) 233 | mutate(group_by(normal, time), diff=score-mean(score)) 234 | ungroup(mutate(group_by(normal, time), diff=score-mean(score))) 235 | 236 | ## ------------------------------------------------------------------------ 237 | normal %>% group_by(time) %>% 
mutate(diff=score-mean(score)) %>% ungroup() -> super 238 | 239 | ## ------------------------------------------------------------------------ 240 | library(foreign) 241 | pew <- as.data.frame(read.spss("data/pew.sav")) 242 | religion <- pew[c("q16", "reltrad", "income")] 243 | rm(pew) 244 | 245 | ## ------------------------------------------------------------------------ 246 | religion$reltrad <- as.character(religion$reltrad) 247 | religion$reltrad <- str_replace(religion$reltrad, " Churches", "") 248 | religion$reltrad <- str_replace(religion$reltrad, " Protestant", " Prot") 249 | religion$reltrad[religion$q16 == " Atheist (do not believe in God) "] <- "Atheist" 250 | religion$reltrad[religion$q16 == " Agnostic (not sure if there is a God) "] <- "Agnostic" 251 | religion$reltrad <- str_trim(religion$reltrad) 252 | religion$reltrad <- str_replace_all(religion$reltrad, " \\(.*?\\)", "") 253 | 254 | religion$income <- c("Less than $10,000" = "<$10k", 255 | "10 to under $20,000" = "$10-20k", 256 | "20 to under $30,000" = "$20-30k", 257 | "30 to under $40,000" = "$30-40k", 258 | "40 to under $50,000" = "$40-50k", 259 | "50 to under $75,000" = "$50-75k", 260 | "75 to under $100,000" = "$75-100k", 261 | "100 to under $150,000" = "$100-150k", 262 | "$150,000 or more" = ">150k", 263 | "Don't know/Refused (VOL)" = "Don't know/refused")[religion$income] 264 | 265 | religion$income <- factor(religion$income, levels = c("<$10k", "$10-20k", "$20-30k", "$30-40k", "$40-50k", "$50-75k", 266 | "$75-100k", "$100-150k", ">150k", "Don't know/refused")) 267 | 268 | ## ---- eval=FALSE--------------------------------------------------------- 269 | ## religion <- count(religion, reltrad, income) 270 | ## names(religion)[1] <- "religion" 271 | 272 | -------------------------------------------------------------------------------- /instructor/day_two.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Day Two: Data Cleaning' 3 | author: 
["Dillon Niederhut", "Shinhye Choi"] 4 | date: "`r format(Sys.time(), '%d %B, %Y')`" 5 | output: 6 | - pdf_document 7 | - slidy_presentation 8 | --- 9 | 10 | ```{r, echo=FALSE} 11 | knitr::opts_knit$set(root.dir = '../') 12 | ``` 13 | 14 | # Review 15 | 16 | ## Inspecting objects 17 | 18 | we'll start by using some data that is already in R 19 | 20 | ```{r, eval=FALSE} 21 | data(state) 22 | str(state.x77) 23 | ``` 24 | 25 | ## Inspecting variables 26 | 27 | We should see 50 values in this division variable (one per state), grouped into 9 levels 28 | 29 | ```{r} 30 | state.division 31 | length(state.division) 32 | levels(state.division) 33 | ``` 34 | 35 | ## Inspecting data frames 36 | 37 | recall, a dataframe is a list of vectors, where each vector is one variable with all of its measurements 38 | 39 | R expects dataframes to be rectangular 40 | 41 | ```{r} 42 | state <- state.x77 43 | rm(state.x77) 44 | state <- as.data.frame(state) 45 | head(state) 46 | ``` 47 | 48 | ## Introduction 49 | 50 | Today's class will essentially be split into two components: CRUD operations in R and TIDY data. For more on tidiness in data, see [Hadley Wickham's paper](http://www.jstatsoft.org/v59/i10/paper). We will also touch on missingness - for an accessible introduction, you can read [this very old and no longer state-of-the-art paper](http://psycnet.apa.org/journals/met/7/2/147/). 
51 | 52 | yesterday we saw how to create dataframes in R 53 | 54 | ```{r} 55 | my.data <- data.frame(n = c(1, 2, 3), 56 | c=c('one', 'two', 'three'), 57 | b=c(TRUE, TRUE, FALSE), 58 | d=c(as.Date("2015-07-27"), 59 | as.Date("2015-07-27")+7, 60 | as.Date("2015-07-27")-7), 61 | really.long.and.complicated.variable.name=999) 62 | ``` 63 | 64 | remember, you can learn about dataframes with 65 | 66 | ```{r} 67 | str(my.data) 68 | ``` 69 | 70 | in practice, you will only rarely create dataframes by hand, because creating tables in a text editor is both boring and prone to error 71 | 72 | ## Readability 73 | 74 | we've broken up the previous command across multiple lines to make it easier to read 75 | this is a stylistic choice, and one that should be encouraged: however, it won't be obvious to most of the students that it is necessary to either highlight the whole command and run, or hit run for every line, starting from the first one, in order 76 | 77 | often, students will just run the second line, and be confused when nothing runs correctly in the console anymore - the way to get out of this is by hitting `ESC` 78 | 79 | # Reading dataframes from file 80 | 81 | ## why read data from text files? 
82 | 83 | they are human-readable and highly interoperable 84 | 85 | ```{r} 86 | read.table("data/mydata.csv", sep=',', header = TRUE) 87 | ``` 88 | 89 | > side note - anyone who is 100% new to computing will have a hard time understanding the concept of a working directory, and will try to run this code from their home directory (spoiler alert - it doesn't work) 90 | 91 | ## R has convenience wrappers for reading in tables 92 | 93 | ```{r} 94 | read.csv("data/mydata.csv") 95 | ``` 96 | 97 | note that we are only reading the files by doing this 98 | 99 | ## R lets you read in part of a table 100 | 101 | you'll sometimes find that you want to work with a smaller part of a dataset - maybe because the data is too large to fit into memory, or maybe because you want to test out some code on a small piece of the data so it runs faster 102 | 103 | ```{r} 104 | read.csv("data/mydata.csv", nrows=2) 105 | ``` 106 | 107 | note that `nrows` is **not** equal to the number of lines in the file, because it does not include the file header 108 | 109 | ## R also has its own kind of data file 110 | 111 | ```{r} 112 | load("data/mydata.Rda") 113 | ``` 114 | 115 | the `load` function does actually put the file into memory, and with the name you originally gave it when you saved it 116 | 117 | this is typically a bad thing, and there is currently no easy workaround 118 | 119 | ## to read in tables from excel, use the `xlsx` package 120 | 121 | if you are exporting data from excel, be sure to export datetimes as strings, as excel does not store dates internally the same way Unix does 122 | 123 | ```{r, eval=FALSE} 124 | # WARNING! xlsx package install crashed current version of RStudio 125 | install.packages("xlsx") 126 | library(xlsx) 127 | read.xlsx("data/cpds_excel_new.xlsx") 128 | ``` 129 | But it may be better to save your .xlsx file as a csv. format in Excel first, and then read the csv file into R. 
130 | 131 | ## you can also use R to read in data from proprietary software 132 | 133 | ```{r, eval=FALSE} 134 | # examples of these? 135 | install.packages("foreign") 136 | library(foreign) 137 | read.dta("data/cpds_stata.dta") 138 | read.spss() 139 | read.octave() 140 | ``` 141 | 142 | # Cleaning data 143 | 144 | there are two major steps to data cleaning, which we will call 'sanitizing' and 'tidying' 145 | 146 | in sanitizing, our goal is to take each variable and force its values to be honest representations of its levels 147 | 148 | in tidying, we are arranging our data structurally such that each row contains exactly one observation, and each column contains exactly one kind of data about that observation (this is sometimes expressed in SQL terms as "An attribute must tell something about the key, the whole key, and nothing but the key, so help me Codd") 149 | 150 | ## exporting data from other software can do weird things to numbers and factors 151 | 152 | ```{r} 153 | dirty <- read.csv('data/dirty.csv') 154 | str(dirty) 155 | ``` 156 | 157 | ## it's usually better to DISABLE R's intuition about data types 158 | 159 | unless you already know the data is clean and has no non-factor strings in it (i.e. 
you are the one who created it) 160 | 161 | ```{r} 162 | dirty <- read.csv('data/dirty.csv',stringsAsFactors = FALSE) 163 | str(dirty) 164 | ``` 165 | 166 | ## let's start by removing the empty rows and columns 167 | 168 | ```{r} 169 | tail(dirty) 170 | dirty <- dirty[1:5,-6] 171 | dim(dirty) 172 | ``` 173 | 174 | ## you can replace variable names 175 | 176 | and you should, if they are uninformative or long 177 | 178 | ```{r} 179 | names(dirty) 180 | names(dirty) <- c("time", "height", "dept", "enroll", "birth.order") 181 | ``` 182 | 183 | ## it's common for hand-coded data to have a signifier for subject-missingness 184 | 185 | (to help differentiate it from your hand-coder forgetting to do something) 186 | 187 | ```{r} 188 | dirty$enroll 189 | ``` 190 | 191 | ## you should replace all of these values in your dataframe with R's missingness signifier, `NA` 192 | 193 | ```{r} 194 | table(dirty$enroll) 195 | dirty$enroll[dirty$enroll=="999"] <- NA 196 | table(dirty$enroll, useNA = "ifany") 197 | ``` 198 | 199 | > side note - read.table() has an option to specify field values as `NA` as soon as you import the data, but this is a BAAAAD idea because R automatically encodes blank fields as missing too, and thus you lose the ability to distinguish between user-missing and experimenter-missing 200 | 201 | ## the height variable is in four different units 202 | 203 | we can fix this with a somewhat complicated loop (since R started as a functional language, there are not easy ways to conditionally modify structures in place) 204 | 205 | OR 206 | 207 | we can do the same task line-by-line, since the number of observations is small 208 | 209 | ```{r} 210 | class(dirty$height) 211 | as.numeric(dirty$height) 212 | ``` 213 | 214 | because there are apostrophes and quotation marks, R thinks these are strings 215 | 216 | ```{r} 217 | dirty$height[grep("’", dirty$height, perl=TRUE)] <- 5*30.48 + 9*2.54 218 | dirty$height[2] <- 70*2.54 219 | dirty$height[3] <- 2.1*100 220 | ``` 
221 | 222 | ## let's fix some of those department spellings 223 | 224 | first, let's make this all lowercase 225 | 226 | ```{r} 227 | dirty$dept 228 | dirty$dept <- tolower(dirty$dept) 229 | dirty$dept <- gsub(' ', '', dirty$dept) # what did we just do? 230 | dirty$dept[4] <- "geology" 231 | dirty[dirty == "999"] <- NA 232 | ``` 233 | 234 | ## then, you can coerce the data into the types they should be 235 | 236 | ```{r} 237 | dirty$time <- as.Date(dirty$time,'%m/%d/%Y') 238 | dirty$height <- as.numeric(dirty$height) 239 | dirty$dept <- as.factor(dirty$dept) 240 | dirty$enroll <- as.factor(dirty$enroll) 241 | dirty$birth.order <- as.numeric(dirty$birth.order) 242 | str(dirty) 243 | ``` 244 | 245 | # Missingness 246 | 247 | there are many reasons why you might have missing data 248 | 249 | *AS LONG AS MISSINGNESS IS NOT CAUSED BY YOUR INDEPENDENT VARIABLE* this is fine 250 | 251 | deleting those observations is wasteful, but easy (listwise deletion) 252 | 253 | ignoring the individual missing data points is typical (casewise deletion) 254 | 255 | imputing mean values for missing data is possibly the worst thing you can do 256 | 257 | imputing via MI + error is currently the best option 258 | 259 | ## listwise deletion is wasteful 260 | 261 | ```{r} 262 | na.omit(dirty) 263 | ``` 264 | 265 | ## casewise deletion is what R does internally 266 | 267 | ```{r} 268 | nrow(dirty) 269 | sum(is.na(dirty$height)) 270 | sum(is.na(dirty$birth.order)) 271 | length(lm(height ~ birth.order,data=dirty)$fitted.values) 272 | ``` 273 | 274 | this is usually the default strategy 275 | 276 | ## remember how we talked about the extensibility of R? 
277 | 278 | amelia is a package that makes a complicated MI approach work without you knowing anything about its implementation 279 | 280 | ```{r} 281 | library(Amelia) 282 | ``` 283 | 284 | ## let's use this large dataset as an example 285 | 286 | ```{r} 287 | large <- read.csv('data/large.csv') 288 | summary(large) 289 | nrow(na.omit(large)) 290 | ``` 291 | 292 | ## for it to work you need low missingness and large N 293 | 294 | ```{r} 295 | a <- amelia(large,m = 1) 296 | print(a) 297 | ``` 298 | 299 | ## amelia returns a list, where the first item is a list of your imputations 300 | 301 | we only did one, so here it is 302 | 303 | ```{r} 304 | large.imputed <- a[[1]][[1]] 305 | summary(large.imputed) 306 | ``` 307 | 308 | ## if you give it a tiny dataset, it will fuss at you 309 | 310 | ```{r} 311 | a <- amelia(large[990:1000,],m = 1) 312 | print(a) 313 | ``` 314 | 315 | # Reshaping 316 | 317 | now that our data is clean, it's time to put it in a tidy format. this is a way of storing data that makes it easy to: 318 | 319 | 1. make graphs 320 | 2. run tests 321 | 3. summarize 322 | 4. transform into other formats 323 | 324 | we are basically trying to organize ourselves such that: 325 | 326 | 1. any grouping is made on rows 327 | 2. any testing is done between columns 328 | 329 | ## an aside on testing 330 | 331 | in R, you use double symbols for testing 332 | 333 | ```{r} 334 | 1 == 2 335 | 1 != 1 336 | 1 >= 1 337 | ``` 338 | 339 | (you've already seen a couple of these) 340 | 341 | ## tests return boolean vectors 342 | 343 | ```{r} 344 | 1 >= c(0,1,2) 345 | ``` 346 | 347 | ## recall that boolean vectors need to be the same length or a divisor 348 | 349 | if your vectors are not multiples of each other, R will fuss at you 350 | 351 | ```{r} 352 | c(1,2) >= c(1,2,3) 353 | c(1,2) >= c(1,2,3,4) # why no warning this time? R recycles! 
354 | ``` 355 | 356 | the combination of the length requirement, the lack of support in R for proper indexing, and missingness in your data will cause many headaches later on 357 | 358 | ## subsetting data frames 359 | 360 | subsetting your data is where you will use this regularly 361 | 362 | ```{r} 363 | my.data$n == 2 364 | my.data[my.data$n == 2,] 365 | ``` 366 | 367 | ## boolean variables can act as filters right out of the box 368 | 369 | ```{r} 370 | my.data[my.data$b,] 371 | ``` 372 | 373 | you see the empty space after the comma? that tells R to grab all the columns 374 | 375 | ## you can also select columns 376 | 377 | ```{r} 378 | my.data[,'d'] 379 | ``` 380 | 381 | that empty space **before** the comma? that tells R to grab all the rows 382 | 383 | ## you can also match elements from a vector 384 | 385 | ```{r} 386 | good.things <- c("three", "four", "five") 387 | my.data[my.data$c %in% good.things, ] 388 | ``` 389 | 390 | ## most subsetting operations on dataframes also return a dataframe 391 | 392 | ```{r} 393 | str(my.data[!(my.data$c %in% good.things), ]) 394 | ``` 395 | 396 | ## subsets that are a single column return a vector 397 | 398 | ```{r} 399 | str(my.data$n) 400 | ``` 401 | 402 | ## most tidying can be done with two R packages 403 | 404 | (plus a wrapper around the base string functions) 405 | 406 | ```{r, eval=FALSE} 407 | install.packages('tidyr') 408 | install.packages('stringr') 409 | install.packages('dplyr') 410 | ``` 411 | 412 | ```{r} 413 | library(tidyr) 414 | library(stringr) 415 | library(dplyr) 416 | ``` 417 | 418 | ## reshaping 419 | 420 | our goal here is to arrange our data such that each table is about one kind of thing: whether it is everything about a measurement, everything about a person, or everything about a group of people 421 | 422 | ```{r} 423 | abnormal <- data.frame(name = c('Alice','Bob','Eve'), 424 | time1 = c(90,90,150), 425 | time2 = c(100,95,100)) 426 | ``` 427 | 428 | this 
table is not tidy - why not? 429 | 430 | the table is about measurements, but each measurement does not have its own row, and each type of measurement value is represented by more than one column 431 | 432 | ```{r} 433 | normal <- gather(abnormal, "time", "score", time1, time2) 434 | normal 435 | ``` 436 | 437 | we can gather the two columns with time data into a column representing just time, and another representing just scores 438 | 439 | now that each row is a unique observation, we can clean up the dataframe a bit 440 | 441 | ```{r} 442 | normal$id <- seq(1:nrow(normal)) 443 | normal$time <- str_replace(normal$time,'time','') 444 | normal$time <- as.numeric(normal$time) 445 | ``` 446 | 447 | now that we are in a tidy format, see how easy it is to subset 448 | 449 | ```{r} 450 | normal[normal$time == 1,] 451 | normal[normal$name == 'Alice',] 452 | ``` 453 | 454 | and test 455 | 456 | > side note - don't worry about how this works yet - we'll talk about it tomorrow 457 | 458 | ```{r} 459 | t.test(score ~ time, data=normal) 460 | ``` 461 | 462 | it's easy to combine tidy tables to compare different levels of information simultaneously 463 | 464 | # Merging data frames 465 | 466 | ## flexibly join dataframes with `merge` 467 | 468 | imagine you have two datasets that you want to merge 469 | 470 | ```{r} 471 | data.1 <- read.csv('data/merge_practice_1.csv') 472 | data.2 <- read.csv('data/merge_practice_2.csv') 473 | str(data.1) 474 | str(data.2) 475 | ``` 476 | 477 | sometimes the same people have different jobs in different locations 478 | 479 | you can do an *inner* join using merge 480 | 481 | ```{r} 482 | merge(data.1, data.2, by = 'id') 483 | ``` 484 | 485 | that's no good - we lost half of our people! 
486 | 487 | inner joins are mostly used when you **only** want records that appear in both tables 488 | 489 | if you want the union, you can use an outer join 490 | 491 | ```{r} 492 | merge(data.1, data.2, by = 'id', all = TRUE) 493 | ``` 494 | 495 | this works basically the same as `join` in SQL 496 | 497 | running merges is particularly useful when: 498 | 499 | a. your data is tidy; and, 500 | b. you want to add information with a lookup table 501 | 502 | in this case, you can store your lookup table as a dataframe, then merge it 503 | 504 | ```{r} 505 | lookup <- read.csv('data/merge_practice_3.csv') 506 | str(lookup) 507 | ``` 508 | 509 | this lookup table gives us the population for each location 510 | 511 | we can add this to our people table with 512 | 513 | ```{r} 514 | merge(data.1, lookup, by = "location") 515 | ``` 516 | 517 | note that Reno was in our lookup table 518 | 519 | ```{r} 520 | lookup[lookup$location == 'Reno', ] 521 | ``` 522 | 523 | but doesn't show up when we merge - why do you think this is? 
524 | 525 | # Transforming data 526 | 527 | ## introduction 528 | 529 | because R started out as a functional language, it can be hard to modify data, especially in place 530 | 531 | in practice, if you want 100% control over how your frames are being modified, you'll be writing lots of `for` loops, which is messy 532 | 533 | luckily, there is a package that handles the common tasks for you 534 | 535 | ```{r} 536 | library(dplyr) 537 | ``` 538 | 539 | ## sort data with `arrange` 540 | 541 | base R syntax for sorting is a bit of a pain in that you have to create a sorting vector based on the values in a column, then subset the same dataframe and apply the sorting vector to the rows slice 542 | 543 | to demonstrate this, let's have another look at our 'normal' data frame 544 | 545 | ```{r} 546 | normal 547 | arrange(normal, score) 548 | ``` 549 | 550 | ## apply summary functions with `summarise` 551 | 552 | dplyr includes most of the base R summary statistics, along with: 553 | 554 | * `n()` 555 | * `n_distinct()` 556 | * `first()` 557 | * `last()` 558 | 559 | if we want to get the mean and sd for the scores, we can do 560 | 561 | ```{r} 562 | summarise(normal, mean(score), sd(score)) 563 | ``` 564 | 565 | ## dplyr allows you to apply functions to groups 566 | 567 | so far, these have taken base R functions and made them faster (with C++ calls behind the scenes), easier to use, or both 568 | 569 | dplyr's real utility is in its grouped dataframes, which apply dplyr functions groupwise 570 | 571 | let's say that we want to know how each score compares to the mean at each time -- we can `group_by` time and then do some variable transformation 572 | 573 | ```{r} 574 | group_by(normal, time) 575 | summarize(group_by(normal, time), mean(score)) 576 | mutate(group_by(normal, time), diff=score-mean(score)) 577 | ungroup(mutate(group_by(normal, time), diff=score-mean(score))) 578 | ``` 579 | 580 | you can add as many functions as you want in between, but wrapping function call around function call can 
be hard to read (and write!) 581 | 582 | ## you can pipe functions with the `%>%` operator 583 | 584 | pipes take the output of one function and give it as an input to the next function, without deep nesting of functions nor saving all of the intermediate steps 585 | 586 | this makes code a lot easier to read, and to understand 587 | 588 | ```{r} 589 | normal %>% group_by(time) %>% mutate(diff=score-mean(score)) %>% ungroup() -> super 590 | ``` 591 | 592 | # Practice 593 | 594 | ## Grab some data from Pew 595 | 596 | and sanitize/tidy it 597 | 598 | this will be hard 599 | 600 | ```{r} 601 | library(foreign) 602 | pew <- as.data.frame(read.spss("data/pew.sav")) 603 | religion <- pew[c("q16", "reltrad", "income")] 604 | rm(pew) 605 | ``` 606 | 607 | ## we'll start by cleaning up the factor variables 608 | 609 | ```{r} 610 | religion$reltrad <- as.character(religion$reltrad) 611 | religion$reltrad <- str_replace(religion$reltrad, " Churches", "") 612 | religion$reltrad <- str_replace(religion$reltrad, " Protestant", " Prot") 613 | religion$reltrad[religion$q16 == " Atheist (do not believe in God) "] <- "Atheist" 614 | religion$reltrad[religion$q16 == " Agnostic (not sure if there is a God) "] <- "Agnostic" 615 | religion$reltrad <- str_trim(religion$reltrad) 616 | religion$reltrad <- str_replace_all(religion$reltrad, " \\(.*?\\)", "") 617 | 618 | religion$income <- c("Less than $10,000" = "<$10k", 619 | "10 to under $20,000" = "$10-20k", 620 | "20 to under $30,000" = "$20-30k", 621 | "30 to under $40,000" = "$30-40k", 622 | "40 to under $50,000" = "$40-50k", 623 | "50 to under $75,000" = "$50-75k", 624 | "75 to under $100,000" = "$75-100k", 625 | "100 to under $150,000" = "$100-150k", 626 | "$150,000 or more" = ">150k", 627 | "Don't know/Refused (VOL)" = "Don't know/refused")[religion$income] 628 | 629 | religion$income <- factor(religion$income, levels = c("<$10k", "$10-20k", "$20-30k", "$30-40k", "$40-50k", "$50-75k", 630 | "$75-100k", "$100-150k", ">150k", "Don't 
know/refused")) 631 | ``` 632 | 633 | ## now we can reduce this down to three columns for three variables 634 | 635 | ```{r, eval=FALSE} 636 | religion <- count(religion, reltrad, income) 637 | names(religion)[1] <- "religion" 638 | ``` 639 | 640 | # Acknowledgements 641 | 642 | ## Materials taken from: 643 | 644 | [Chris Krogslund](https://github.com/ckrogs/r_useful_dlab) 645 | [Hadley Wickham](http://www.jstatsoft.org/v59/i10/paper) 646 | -------------------------------------------------------------------------------- /instructor/day_two.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/R-for-Data-Science/798d5b220d66afea5644a245c7327e4bb740039a/instructor/day_two.pdf -------------------------------------------------------------------------------- /instructor/overflow.R: -------------------------------------------------------------------------------- 1 | ## ---- echo=FALSE--------------------------------------------------------- 2 | knitr::opts_knit$set(root.dir = '../') 3 | 4 | ## ------------------------------------------------------------------------ 5 | #install.packages('RCurl') 6 | library(RCurl) 7 | #install.packages("XML") 8 | library(XML) 9 | 10 | ## ------------------------------------------------------------------------ 11 | RJ <- readLines("http://shakespeare.mit.edu/romeo_juliet/full.html") 12 | RJ[1:25] 13 | 14 | ## ------------------------------------------------------------------------ 15 | RJ[grep("<h3>", RJ, perl=T)] 16 | RJ[grep("<h3>", RJ, perl=TRUE)] 17 | 18 | ## ---- eval=FALSE--------------------------------------------------------- 19 | ## link <- "http://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml" 20 | ## page <- getURL(url = link) 21 | ## xmlParse(file = page) 22 | 23 | ## ------------------------------------------------------------------------ 24 | link<-"http://clerk.house.gov/evs/2014/ROLL_000.asp" 25 | readHTMLTable(doc=link, header=T, which=1, 
stringsAsFactors=F)[1:10, ] 26 | 27 | ## ---- eval=FALSE--------------------------------------------------------- 28 | ## #are there websites that allow you to connect to test servers? 29 | ## install.packages("RMySQL") 30 | ## library(RMySQL) 31 | ## con <- dbConnect(MySQL(), 32 | ## user="", password="", 33 | ## dbname="", host="localhost") 34 | ## data <- fetch(dbSendQuery(con, "select * from table"), n=10) 35 | ## con.exit(dbDisconnect(con)) 36 | 37 | ## ---- eval=FALSE--------------------------------------------------------- 38 | ## install.packages("RPostgreSQL") 39 | ## library(RPostgreSQL) 40 | ## con <- dbConnect(dbDriver("PostgreSQL"), 41 | ## dbname="", 42 | ## host="localhost", 43 | ## port=1234, 44 | ## user="", 45 | ## password="") 46 | ## data <- dbReadTable(con, c("column1","column2")) 47 | ## dbDisconnect(con) 48 | 49 | ## ---- eval=FALSE--------------------------------------------------------- 50 | ## install.packages("rmongodb") 51 | ## library(rmongodb) 52 | ## con <- mongo.create(host = localhost, 53 | ## name = "", 54 | ## username = "", 55 | ## password = "", 56 | ## db = "admin") 57 | ## if(mongo.is.connected(con) == TRUE) { 58 | ## data <- mongo.find.all(con, "collection", list("city" = list( "$exists" = "true"))) 59 | ## } 60 | ## mongo.destroy(con) 61 | 62 | ## ---- eval=FALSE--------------------------------------------------------- 63 | ## # plyr package 64 | ## mydata <- read.csv("http://www.ats.ucla.edu/stat/data/binary.csv") 65 | ## # Consider the case where we want to calculate descriptive statistics across admits and not-admits 66 | ## # from the dataset and return them as a data.frame 67 | ## ddata <- ddply(mydata, c("admit"), summarize, 68 | ## gpa.over3 = length(gpa[gpa>=3]), 69 | ## gpa.over3.5 = length(gpa[gpa>=3.5]), 70 | ## gpa.over3per = length(gpa[gpa>=3])/length(gpa), 71 | ## gpa.over3.5per = length(gpa[gpa>=3.5])/length(gpa)) 72 | ## ) 73 | 74 | ## ---- eval=FALSE--------------------------------------------------------- 
---
title: "Additional Course Materials"
author: "Dillon Niederhut"
date: "`r format(Sys.time(), '%d %B, %Y')`"
output:
  - pdf_document
  - slidy_presentation
---

```{r, echo=FALSE}
knitr::opts_knit$set(root.dir = '../')
```

## Introduction

The following are materials that do not fit into the course as currently taught, but that may be useful for students later on.

# Data does not need to be in the local filesystem

## R has an interface to curl called RCurl

```{r}
#install.packages('RCurl')
library(RCurl)
#install.packages("XML")
library(XML)
```

## you can use this to access remote data

you may just want to read text lines from a webpage

```{r}
RJ <- readLines("http://shakespeare.mit.edu/romeo_juliet/full.html")
# head() does not pad with NA when fewer than 25 lines come back
head(RJ, 25)
```

and use the kinds of string manipulation we learned yesterday to retrieve the first lines of an act or a scene

```{r}
RJ[grep("<h3>", RJ, perl=TRUE)]
```

or maybe pull information out of an RSS feed

```{r, eval=FALSE}
link <- "http://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml"
page <- getURL(url = link)
xmlParse(file = page)
```

## R also has libraries for pulling and parsing web pages

```{r}
link <- "http://clerk.house.gov/evs/2014/ROLL_000.asp"
# spell out TRUE/FALSE: T and F are ordinary variables that can be reassigned
readHTMLTable(doc=link, header=TRUE, which=1, stringsAsFactors=FALSE)[1:10, ]
```

# Connecting to a database

why read from a database? they use less memory, are faster, create their own backups, and offer optimized querying/joining

databases generally come in two flavors, relational and non-relational, which has to do with how important schemas are (and is a bit beyond the scope of an R intro)

two popular relational databases are MySQL (one of the many flavors of SQL)

```{r, eval=FALSE}
#are there websites that allow you to connect to test servers?
install.packages("RMySQL")
library(RMySQL)
con <- dbConnect(MySQL(),
                 user="", password="",
                 dbname="", host="localhost")
res <- dbSendQuery(con, "select * from table")
data <- fetch(res, n=10)
# release the result set, then close the connection
dbClearResult(res)
dbDisconnect(con)
```

and PostgreSQL

```{r, eval=FALSE}
install.packages("RPostgreSQL")
library(RPostgreSQL)
con <- dbConnect(dbDriver("PostgreSQL"),
                 dbname="",
                 host="localhost",
                 port=1234,
                 user="",
                 password="")
data <- dbReadTable(con, c("column1","column2"))
dbDisconnect(con)
```

a popular non-relational database is MongoDB

```{r, eval=FALSE}
install.packages("rmongodb")
library(rmongodb)
con <- mongo.create(host = "localhost",
                    name = "",
                    username = "",
                    password = "",
                    db = "admin")
if(mongo.is.connected(con) == TRUE) {
  data <- mongo.find.all(con, "collection", list("city" = list( "$exists" = "true")))
}
mongo.destroy(con)
```

one quirk about mongo is that your connection always authenticates to the authentication database, not the database you are querying - this db is usually called 'admin'

# Data tidying with plyr

## enter plyr

- *plyr* is the go-to package for all your splitting-applying-combining needs
- Among its many benefits (above base R capabilities):
    a) Don't have to worry about different name, argument, or output consistencies
    b) Easily parallelized
    c) Input from, and output to, data frames, matrices, and lists
    d) Progress bars for lengthy computation
    e) Informative error messages

## group-wise operations/plyr/selecting functions

- Two essential questions:
    1. What is the class of your input object?
    2. What is the class of your desired output object?
- If you want to split a **d**ata frame, and return results as a **d**ata frame, you use **dd**ply
- If you want to split a **d**ata frame, and return results as a **l**ist, you use **dl**ply
- If you want to split a **l**ist, and return results as a **d**ata frame, you use **ld**ply


```{r, eval=FALSE}
# plyr package
library(plyr)
mydata <- read.csv("http://www.ats.ucla.edu/stat/data/binary.csv")
# Consider the case where we want to calculate descriptive statistics across admits and not-admits
# from the dataset and return them as a data.frame
ddata <- ddply(mydata, c("admit"), summarize,
               gpa.over3 = length(gpa[gpa>=3]),
               gpa.over3.5 = length(gpa[gpa>=3.5]),
               gpa.over3per = length(gpa[gpa>=3])/length(gpa),
               gpa.over3.5per = length(gpa[gpa>=3.5])/length(gpa))
```

# Group-wise Operations/plyr/functions

- plyr can accommodate any user-defined function, but it also comes with some pre-defined functions that assist with the most common split-apply-combine tasks
- We've already seen **summarize**, which creates user-specified vectors and combines them into a data.frame. Here are some other helpful functions:

**transform**: applies a function to a data.frame and adds new vectors (columns) to it

# add a column containing the average gre score of students

```{r, eval=FALSE}

mydata <- ddply(mydata, c("admit"), transform,
                gre.ave = mean(x=gre, na.rm=TRUE),
                gre.sd = sd(x=gre, na.rm=TRUE))
head(mydata)
unique(mydata$gre.ave)
```

> side note: note that **transform** can't do transformations that involve the results of *other* transformations from the same call

Another very useful function is **arrange**, which orders a data frame on the basis of column contents

```{r, eval=FALSE}
# Another very useful function is arrange, which orders a data frame on the basis of column contents
# arrange by "rank"
mydata.rank <- plyr::arrange(mydata, rank)
# arrange by "rank", descending
mydata.rank <- plyr::arrange(mydata, desc(rank))
# arrange by "rank", then "gre", then "gpa"
mydata.comb <- plyr::arrange(mydata, rank, desc(gre), desc(gpa))
head(mydata.comb)
```
# creating day_three dataset
#
# Reads the raw feedback export, drops empty/identifying columns, renames the
# remaining columns, parses the timestamp, maps free-text department answers
# onto canonical department names, types the categorical columns, and saves
# the result as data/feedback.Rda.
library(stringr)

dat <- read.csv('../../feedback-analytics/feedback.csv', stringsAsFactors = FALSE)
dat[dat == ""] <- NA

# get rid of empty or identifying columns
dat <- subset(dat, select = -c(I.will.use.my.new.powers..., Instructor, Date.of.Training, Training.Title,
                               What.department..school..program..or.organization.at.Berkeley.are.you.associated.with..1))
dat <- Filter(function(x) !all(is.na(x)), dat)

# simplify column names
new.names <- c("timestamp", "course.delivered", "instructor.communicated", "hear",
               "interest", "department", "verbs", "useful", "gender", "ethnicity",
               "outside.barriers", "inside.barriers", "what.barriers", "position")
# fail loudly if the export gained or lost a column; a short names vector
# would otherwise leave NA column names behind
stopifnot(length(new.names) == ncol(dat))
names(dat) <- new.names

# fix timestamp
dat$timestamp <- sub(' [0-9]+:[0-9]+:[0-9]+', '', dat$timestamp)
dat$timestamp <- as.Date(dat$timestamp, "%m/%d/%Y")

# entity resolution on departments
#
# Each canonical department name maps to the regular expressions that
# identify it in the trimmed, lower-cased free-text answer. Rules are tried
# top to bottom and the first rule that matches a row wins.
# NOTE(review): several patterns are very short ('are', 'ast', 'erg', 'ib',
# 'ph') and the dots in 'g.s.e.' are regex wildcards; they look like they
# could over-match, but they are kept as-is pending a check against the data.
department.rules <- list(
  "African American Studies"         = c('afric', 'aas'),
  "Ag & Resource Econ & Pol"         = c('are'),
  "Anthropology"                     = c('anth'),
  "App Sci & Tech Grad Grp"          = c('applied', 'ast'),
  "Biostatistics Grad Grp"           = c('bio[ ]*stat'),
  "Business"                         = c('haas', 'business'),
  "City & Regional Planning"         = c('crp', 'region', 'planning'),
  "Demography"                       = c('demo'),
  "Economics"                        = c('econ'),
  "Education"                        = c('ed[.]+', 'edu', 'gse', 'g.s.e.', 'pome'),
  "Energy & Resources Group"         = c('erg', 'energy'),
  "Env Sci, Policy, & Mgmt"          = c('espm', 'epsm'),
  "Ethnic Studies Grad Grp"          = c('ethnic'),
  "Geography"                        = c('geo'),
  "History"                          = c('hist'),
  "Industrial Eng & Ops Rsch"        = c('ieor'),
  "Information"                      = c('i school', 'info'),
  "Integrative Biology"              = c('ib', 'integrative'),
  "JSP Grad Pgm"                     = c('jsp', 'jurisprudence'),
  "Law"                              = c('law$', 'law '),
  "Linguistics"                      = c('ling'),
  "Music"                            = c('music'),
  "Neuroscience"                     = c('hwni', 'neuro', 'helen wills'),
  "Political Science"                = c('pol[.]+', 'poli ', 'politic'),
  "Psychology"                       = c('psych'),
  "Public Health"                    = c('health', 'ph'),
  "Public Policy"                    = c('gspp', 'policy', 'goldman'),
  "Rhetoric"                         = c('rhet'),
  "Slavic Languages & Lit"           = c('iseees', 'slavic'),
  "South and Southeast Asian Studies" = c('asian'),
  "Social Welfare"                   = c('welfare'),
  "Sociology"                        = c('socio', 'soc$', 'soc ')
)

raw.department <- str_to_lower(str_trim(dat$department))
raw.department <- sub('school of ', '', raw.department)

# Match every rule against the ORIGINAL answers and write into a separate
# vector. Overwriting the column being searched lets later patterns hit
# already-resolved names: 'ph' matches "Geography" and "Demography" and would
# relabel both as "Public Health".
resolved <- rep(NA_character_, length(raw.department))
for (canonical in names(department.rules)) {
  pattern <- paste(department.rules[[canonical]], collapse = '|')
  # grepl() is FALSE for NA input, so missing answers stay NA
  hits <- is.na(resolved) & grepl(pattern, raw.department)
  resolved[hits] <- canonical
}

# answers that matched no rule become NA; levels are the names actually seen
department.levels <- sort(unique(resolved))
dat$department <- factor(resolved, levels = department.levels)

# type other columns
dat$hear <- as.factor(dat$hear)
dat$gender <- factor(dat$gender, levels = c("Female/Woman", "Male/Man", "Genderqueer/Gender non-conforming"))
dat$position <- as.factor(dat$position)

# output
save(dat, file='data/feedback.Rda')
#' This script regenerates the .pdf and .R files in the instructor
#' directory. It can be started from the repository root or from the
#' scripts directory.

#' function definitions

#' Install a package unless it is already installed.
#'
#' @param package Name of the package, as a single string.
install <- function(package){
  # installed.packages() returns a matrix with one row per package and the
  # package names as row names; compare against those, not every cell
  if ( !( package %in% rownames(installed.packages()) ) ) {
    install.packages(package, dependencies=TRUE)
  }
}

#' Regenerate the derived files for one R Markdown document.
#'
#' Tangles the document's code chunks into an .R script, then renders every
#' output format declared in the document.
#'
#' @param document Path to an .Rmd file.
write_document <- function(document){
  knitr::knit(document, tangle = TRUE)
  rmarkdown::render(document, output_format='all')
}

#' main call

# strsplit() returns a list, and %in% does not look inside list elements, so
# test the last component of the working directory directly
if ( basename(getwd()) == 'scripts' ) {
  setwd('../instructor')
} else {
  setwd('instructor')
}

for ( package in c('knitr', 'rmarkdown') ) {
  install(package)
  library(package, character.only=TRUE)
}

# `pattern` is a regular expression, not a shell glob: anchor on the extension
document_list <- list.files(pattern='[.]Rmd$')
# invisible(): the list of output paths is not worth echoing to the console
invisible(lapply(document_list, FUN=write_document))