├── second-R-programming-exercise-if-else-if-else-syntax-and-logicPDF.pdf ├── Fourth-R-programming-exercise-find-prime-integers-less-than-or-equal-N.pdf ├── Sixth-R-function-paths-listing-files-string-search-constructing-a-data-framePDF.pdf ├── first-R-programming-exercise-column-number-from-Excel-column-letter-markdown-UTF8.pdf ├── Fifth-article-Review-of-getting-subsets-of-a-data-frame-constructing-data-framesPDF.pdf ├── Seventh-R-function-systematic-debugging-composing-a-function-constructing-a-data-frame.pdf ├── Nineth-R-Practice-exercise-using-a-row-of-a-data-frame-the-unlist-function-composing-a-function.pdf ├── tiny-subset-of-GSE18885-gene-expression-data-9-genes-WG-5-samples-Normal-Control-4-samples.tab.txt ├── Tenth-R-Practice-exercise-merging-annotation-data-into-a-gene-expression-analysis-results-data-frame.pdf ├── Twelth-R-Practice-exercise-using-Monte-Carlo-simulation-to-investigate-a-statistical-quantity-March-6.pdf ├── third-R-programming-exercise-using-for-loops-and-if-tests-to-check-if-a-positive-integer-is-a-prime.pdf ├── Eighth-R-Practice-exercise-composing-a-function-and-constructing-a-data-frame-files-not-containing-search-strings.pdf ├── Eleventh-R-Practice-exercise-using-sapply-and-split-and-also-use-of-ellipsis-to-pass-in-additional-argumentsFeb11.pdf ├── GPL6104-Illumina-microarray-platform-annotation-from-GEO-repository-small-subset-edited-example-Feb12.tab.txt ├── README ├── first-R-programming-exercise-column-number-from-Excel-column-letter-markdown-UTF8.Rmd ├── first-R-programming-exercise-column-number-from-Excel-column-letter-markdown-UTF8.md ├── Fourth-R-programming-exercise-find-prime-integers-less-than-or-equal-N.Rmd ├── second-R-programming-exercise-if-else-if-else-syntax-and-logic.Rmd ├── Fourth-R-programming-exercise-find-prime-integers-less-than-or-equal-N.md ├── second-R-programming-exercise-if-else-if-else-syntax-and-logic.md ├── 
Tenth-R-Practice-exercise-merging-annotation-data-into-a-gene-expression-analysis-results-data-frame.Rmd ├── third-R-programming-exercise-using-for-loops-and-if-tests-to-check-if-a-positive-integer-is-a-prime.Rmd ├── Fifth-article-Review-of-getting-subsets-of-a-data-frame-constructing-data-frames.Rmd ├── Eighth-R-Practice-exercise-composing-a-function-and-constructing-a-data-frame-files-not-containing-search-strings.md ├── Eighth-R-Practice-exercise-composing-a-function-and-constructing-a-data-frame-files-not-containing-search-strings.Rmd └── Eleventh-R-Practice-exercise-using-sapply-and-split-and-also-use-of-ellipsis-to-pass-in-additional-argumentsFeb11.Rmd /second-R-programming-exercise-if-else-if-else-syntax-and-logicPDF.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanBerger/Practice-programming-exercises-for-R/HEAD/second-R-programming-exercise-if-else-if-else-syntax-and-logicPDF.pdf -------------------------------------------------------------------------------- /Fourth-R-programming-exercise-find-prime-integers-less-than-or-equal-N.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanBerger/Practice-programming-exercises-for-R/HEAD/Fourth-R-programming-exercise-find-prime-integers-less-than-or-equal-N.pdf -------------------------------------------------------------------------------- /Sixth-R-function-paths-listing-files-string-search-constructing-a-data-framePDF.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanBerger/Practice-programming-exercises-for-R/HEAD/Sixth-R-function-paths-listing-files-string-search-constructing-a-data-framePDF.pdf -------------------------------------------------------------------------------- /first-R-programming-exercise-column-number-from-Excel-column-letter-markdown-UTF8.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanBerger/Practice-programming-exercises-for-R/HEAD/first-R-programming-exercise-column-number-from-Excel-column-letter-markdown-UTF8.pdf -------------------------------------------------------------------------------- /Fifth-article-Review-of-getting-subsets-of-a-data-frame-constructing-data-framesPDF.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanBerger/Practice-programming-exercises-for-R/HEAD/Fifth-article-Review-of-getting-subsets-of-a-data-frame-constructing-data-framesPDF.pdf -------------------------------------------------------------------------------- /Seventh-R-function-systematic-debugging-composing-a-function-constructing-a-data-frame.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanBerger/Practice-programming-exercises-for-R/HEAD/Seventh-R-function-systematic-debugging-composing-a-function-constructing-a-data-frame.pdf -------------------------------------------------------------------------------- /Nineth-R-Practice-exercise-using-a-row-of-a-data-frame-the-unlist-function-composing-a-function.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanBerger/Practice-programming-exercises-for-R/HEAD/Nineth-R-Practice-exercise-using-a-row-of-a-data-frame-the-unlist-function-composing-a-function.pdf -------------------------------------------------------------------------------- /tiny-subset-of-GSE18885-gene-expression-data-9-genes-WG-5-samples-Normal-Control-4-samples.tab.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AlanBerger/Practice-programming-exercises-for-R/HEAD/tiny-subset-of-GSE18885-gene-expression-data-9-genes-WG-5-samples-Normal-Control-4-samples.tab.txt -------------------------------------------------------------------------------- /Tenth-R-Practice-exercise-merging-annotation-data-into-a-gene-expression-analysis-results-data-frame.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanBerger/Practice-programming-exercises-for-R/HEAD/Tenth-R-Practice-exercise-merging-annotation-data-into-a-gene-expression-analysis-results-data-frame.pdf -------------------------------------------------------------------------------- /Twelth-R-Practice-exercise-using-Monte-Carlo-simulation-to-investigate-a-statistical-quantity-March-6.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanBerger/Practice-programming-exercises-for-R/HEAD/Twelth-R-Practice-exercise-using-Monte-Carlo-simulation-to-investigate-a-statistical-quantity-March-6.pdf -------------------------------------------------------------------------------- /third-R-programming-exercise-using-for-loops-and-if-tests-to-check-if-a-positive-integer-is-a-prime.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanBerger/Practice-programming-exercises-for-R/HEAD/third-R-programming-exercise-using-for-loops-and-if-tests-to-check-if-a-positive-integer-is-a-prime.pdf -------------------------------------------------------------------------------- /Eighth-R-Practice-exercise-composing-a-function-and-constructing-a-data-frame-files-not-containing-search-strings.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AlanBerger/Practice-programming-exercises-for-R/HEAD/Eighth-R-Practice-exercise-composing-a-function-and-constructing-a-data-frame-files-not-containing-search-strings.pdf -------------------------------------------------------------------------------- /Eleventh-R-Practice-exercise-using-sapply-and-split-and-also-use-of-ellipsis-to-pass-in-additional-argumentsFeb11.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlanBerger/Practice-programming-exercises-for-R/HEAD/Eleventh-R-Practice-exercise-using-sapply-and-split-and-also-use-of-ellipsis-to-pass-in-additional-argumentsFeb11.pdf -------------------------------------------------------------------------------- /GPL6104-Illumina-microarray-platform-annotation-from-GEO-repository-small-subset-edited-example-Feb12.tab.txt: -------------------------------------------------------------------------------- 1 | Illumina Probe_ID ILMN_Gene Entrez_Gene_ID Chromosome Probe location in chromosome Protein coded by gene - very short description 2 | ILMN_1698220 PHTF2 57157 7 77424374-77424423 homeodomain transcription factor 2 3 | ILMN_1810835 SPRR3 6707 1 151242655-151242704 small proline-rich protein 3 4 | ILMN_1688580 CAMP 820 3 48241909-48241918:48241919-48241958 cathelicidin antimicrobial peptide 5 | ILMN_1802867 RNASE3 6037 14 20430090-20430139 "ribonuclease, RNase A family, 3" 6 | ILMN_1766736 BPI 671 20 36399055-36399104 bactericidal/permeability-increasing protein 7 | ILMN_1753347 DEFA4 1669 8 6781040-6781073:6781660-6781675 "defensin, alpha 4" 8 | ILMN_1749014 ACLY 47 17 37277254-37277303 ATP citrate lyase 9 | ILMN_1785926 ZNF621 285268 3 40555813-40555862 zinc finger protein 621 10 | ILMN_1796316 MMP9 4318 20 44078320-44078369 matrix metallopeptidase 9 11 | ILMN_1706635 ELA2 1991 19 807179-807228 elastase 2 12 | ILMN_1705183 MPO 4353 17 53702640-53702689 myeloperoxidase 13 | ILMN_1806056 CEACAM8 1088 19 
47776394-47776443 carcinoembryonic antigen-related cell adhesion 8 14 | ILMN_1730867 AZU1 566 19 781858-781907 azurocidin 1 antimicrobial protein 15 | ILMN_1813399 ATP2B1 490 12 88506745-88506794 "ATPase, Ca++ transporting variant 2" 16 | ILMN_1750599 ATP2B1 490 12 88516580-88516629 "ATPase, Ca++ transporting variant 1" 17 | 18 | 19 | small subset of annotation from the Gene Expression Omnibus for Illumina microarray platform GPL6104 20 | for use in an R programming exercise 21 | the full file is located at 22 | https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GPL6104 23 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | This repository will contain a sequence of programming exercises (currently 12) 2 | intended to fill the gap between learning the correct syntax of basic R commands and 3 | the programming assignments in the R Programming course in the Johns Hopkins University 4 | Data Science Specialization on Coursera. These exercises review basic R constructs 5 | and provide practice in "composing" an R function to carry out a particular task. 6 | The idea is to practice correct use of R constructs and built in functions 7 | (functions the "come with" the basic R installation), while "putting together" a correct 8 | sequence of groups of commands that in a logical sequence of steps will obtain the desired result. 9 | 10 | In these exercises, there will be a statement of what your function (or R code) should do - 11 | what are the input variables and what the function should return - and an outline or sequence of "hints". 12 | To get the most out of these exercises, try to write your function using as few hints as possible. A working 13 | code for each function is provided. 
If at first doing the programming is too hard, it still should be helpful 14 | to read the commentary on how the functions were "put together" and looking over the code and seeing how it works. 15 | 16 | Note there are often several ways to write a function that will obtain the correct result. 17 | For these exercises the directions and hints may point toward a particular approach intended 18 | to practice particular constructs in R and a particular line of reasoning. 19 | There well may be an existing R function or package that will do what is stated for a given 20 | practice exercise, but here (unlike other aspects of the R Programming course) the point 21 | is to practice "putting together" a logical sequence of steps, 22 | with each step a section of code, to obtain a working function, 23 | not to find an existing solution or a quick solution using a 24 | more powerful R construct that is better addressed later on. 25 | 26 | For each exercise, an .md file and or .pdf file is given, and the R markdown (.Rmd) file which generated it is also given. 27 | If you want to copy code into a file or into R, do so from either the .Rmd or the .md files, since copying R code from these pdf files does not always work (some lines seem to have extra encoding that disrupts use in R). 28 | 29 | A list of the exercises with their number; the R constructs they practice and / or R topics they address; and what the function(s) in that exercise do is given below. This listing is as of 10 March 2022 30 | 31 | 1. Use of the R letters character vector and the R paste and which and tolower functions; construct a function that, given an Excel column letter will return the corresponding integer column number, construct a function that will similarly deal with a vector of Excel column numbers 32 | 33 | 2. 
if, else if, else syntax and logic; construct a function that given a numeric value between 0 and 1 will return a corresponding character variable (a length 1 character vector) with 3 significant digits (practice using if constructs and the round function rather than R’s signif or format functions) 34 | 35 | 3. Practice using a for loop and if tests and the R mod function %%, return within an if test, comments on debugging code, return an integer vector whose entries have names; construct a function called isItPrime(n) that tests whether a given positive integer is a prime number 36 | 37 | 4. Accumulate entries in a vector using successive concatenation or by: starting with a sufficiently large initial vector and filling in the desired entries using a running index, and then trimming the vector to the appropriate size, use of the readline function to have interactive input from the user; construct a function called getPrimeNumbers(N) that given a positive integer N that is at least 2, returns in an integer vector all the prime numbers that are less than or equal N (uses isItPrime(n) from the previous exercise) 38 | 39 | 5. This file reviews topics in "dealing with" data frames (programming exercises using data frames are given in the next several files): Extracting a column of a data frame as a vector, review of the many ways to get a specified subset of a data frame, the which function for getting a vector of row numbers for which some condition is TRUE, the %in% function for determining the indices k of a vector V such that V[k] is an entry of some other vector W, creating a data frame by reading in a suitable text file using read.csv or read.table, creating a data frame using the data.frame function, concatenating suitable data frames using rbind and cbind 40 | 41 | 6. 
The path to a folder - full (absolute) paths and relative paths, the list.files function for listing names of files that are in a folder that have a given pattern in their file name, the grep function for finding which character strings have a given pattern in them, a brief introduction to regular expressions for pattern searching, the file.info function for obtaining information about a file, the colnames function for (re)naming the columns of a data frame; construct a function that finds all the file names in a given folder that have every pattern given in a character vector (called search.strings) in their file name, and construct a data frame containing these file names along with their most recent modification date and file size, do test runs for files in a folder set up to test these functions 42 | 43 | 7. More practice composing a function to carry out a prescribed task, a systematic way to debug a function by running its code line by line in the R console sub-window of RStudio and examining what happens with each line, the unique function; construct a modified version of the function in the previous exercise that outputs the same information but for the files containing any of the patterns in search.strings (rather than all the patterns in search.strings) 44 | 45 | 8. More practice composing a function to carry out a prescribed task, practice using the %in% function, the setdiff function; construct a function that returns the file names in a folder that do not contain any of the entries of search.strings in their file name 46 | 47 | 9. Extracting part of a row of a data frame as a vector, the unlist and unname functions, reading in a data file using the web url of the file, more practice constructing a data frame; R code to analyze a small gene expression data set - getting results for each gene where the data for each gene is in a row of the input data frame 48 | 49 | 10.
Merging a subset of rows in a data frame dfy to a data frame dfx where dfy may contain more rows than dfx and the relevant subset of rows of dfy need to be reordered to line up with those in dfx, doing this "by hand" to practice basic R constructs, doing this with the R merge function, the row.names function, the identical function, and another debugging "investigation"; R code to append (merge) annotation data to a gene expression analysis results data frame 50 | 51 | 11. Detailed information on using sapply and split, and also use of the ellipsis (...) functionality in sapply for passing in additional arguments to the function used in sapply. Most of this information is valid for lapply as well. Simple examples, and examples and exercises using the R iris data set 52 | 53 | 12. An example of using Monte Carlo simulation (using an appropriate random number generator) to investigate a statistical question. The statistical question is: given some independent random samples s_1,..., s_k without replacement from the integers 1 through N (so none of the integers in 1 through N can be chosen more than once), what is a good estimate for N? This is a well known question with a known best (frequentist) estimate for N (references are given). The point here is to describe in detail Monte Carlo simulation to "explore" a question, and to give example R code to implement it. 54 | 55 | Note the reader should not infer any endorsement or recommendation or approval for the material in these files from any of the sources or persons cited in this file or any of these files, or from any other entities mentioned in any of these files. 
56 | -------------------------------------------------------------------------------- /first-R-programming-exercise-column-number-from-Excel-column-letter-markdown-UTF8.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: md_document 3 | --- 4 | 5 | ## "First programming exercise" 6 | 7 | ### Alan Berger Aug 20, 2020 minor edits Jan 18, 2021 8 | 9 | ### additional R code, run the functions, and some text edits Aug 22, 2020 10 | 11 | ## Introduction 12 | 13 | This is the first in a sequence of programming exercises intended to fill the gap between learning the correct 14 | syntax of basic R commands and the programming assignments in the R Programming course in the Johns Hopkins University 15 | Data Science Specialization on Coursera. In this sequence of exercises in "composing" an R function to carry out a particular task, 16 | the idea is to practice correct use of R constructs and built in functions (functions that "come with" the basic R installation), while 17 | learning how to "put together" a correct sequence of blocks of commands that will obtain the desired result. 18 | 19 | In these exercises, there will be a statement of what your function should do (what are the input variables and what the function 20 | should return) and a sequence of "hints". To get the most out of these exercises, try to write your function using as few hints as possible. 21 | Note there are often several ways to write a function that will obtain the correct result. For these exercises the directions and 22 | hints may point toward a particular approach intended to practice particular constructs in R and a particular line of reasoning.
23 | There may be an existing R function or package that will do what is stated for a given practice exercise, but here 24 | (unlike other aspects of the R Programming course) the point is to practice formulating a logical sequence of steps, 25 | with each step a section of code, to obtain a working function, not to find an existing solution or a quick solution using a 26 | more powerful R construct that is better addressed later on. 27 | 28 | ## Motivation for this exercise 29 | 30 | The input data for many functions will be or include a tab delimited text file that comes 31 | from or is naturally viewed as an Excel spreadsheet. For example, the last programming assignment in the R Programming 32 | course uses a .csv (comma separated variables) text file appropriately read in using R's read.csv function. It has a total of 46 columns, 33 | only 5 of which are relevant to the assignment (Excel columns B, G, K, Q and W). For this and many other situations, 34 | it would be convenient to have an R function that would take as its input an Excel column name (as a character variable) 35 | and output the corresponding column number. 36 | 37 | ### Instruction for this function 38 | 39 | For the first version of your function, just have it work for a column letter between a 40 | and z and only take as input a lower case letter. 
41 | The skeleton of your function should "look like" 42 | 43 | ``` 44 | colnameToNumber <- function(colname) { 45 | # convert an Excel column name 46 | # to the corresponding column number; 47 | # colname should be a lower case letter in the 48 | # form of a character string, for example "a" or "r" 49 | # "between" a and z (including a and z) 50 | 51 | # coding lines 52 | 53 | return(colnumber) 54 | } 55 | ``` 56 | 57 | Directions: do NOT use a whole bunch of if statements (yes, one could do 26 if statements of the form 58 | `if(colname == "a") colnumber = 1` 59 | and so on, but, among other things, it would be easy to have typographic 60 | mistakes, and writing all those out would get pretty boring). 61 | 62 | Note that the built in R variable (R object) **letters** is 63 | the character vector with entries "a", "b", ... , "z" and note the built in R function **which** takes as input a logical 64 | vector (often defined by whether some condition is or is not true) and outputs the integer vector of the 65 | _indices_ of the input vector for which the logical value is **TRUE**, so for example `which(1:4 < 3)` is equal to `c(1, 2)`, 66 | and `which(sqrt(1:4) == 2)` is the single value 4 (since `sqrt(1:4)` is the vector c(1, $\sqrt 2$, $\sqrt 3$, 2) which is 67 | `c(1, 1.414214, 1.732051, 2)` (to the number of digits printed)). 68 | 69 | Your function, when applied to b, g, k, q and w (as character variables) should return 2, 7, 11, 17 and 23, 70 | respectively, and `colnameToNumber("zz")` should 71 | "throw an error" along the lines of 72 | 73 | Error in colnameToNumber("zz") : no match to colname 74 | 75 | This comes from the R statement: `stop("no match to colname")` which you should have occur in your function if it is the case that the input was not a lower case letter between a and z. 76 | 77 | Try writing your program now before going to the additional hints below (the more you do "on your own" the faster you will gain skill at programming).
78 | 79 | ### Further hints 80 | 81 | You can use the **which** function to "pick out" the index k (an integer) of the **letters** vector for which `letters[k]` equals colname, this is the number you want to return. 82 | 83 | How might you check for whether colname was a letter between "a" and "z"? What will the **which** function return if colname was not a "valid" value for 84 | the colnameToNumber function as currently constructed (colname is not equal any entry of **letters**)? (Try it out in an R session.) 85 | 86 | Things to think about: how would you "extend" your function to treat some additional Excel columns, say through column "dz" (check out the **paste** function, you could use 87 | it along with **letters** to construct vectors containing additional column names and concatenate them together). You could do this with several lines of 88 | code to get the vector of "a" through "dz" (a through z; aa through az, ba through bz; ca through cz; da through dz; then concatenate them into 1 vector). 89 | To be able to find the column number for many more Excel columns you would want to use a for loop to construct and concatenate blocks of column names. 90 | 91 | How would you extend your function to easily deal with upper case letters or a "mix" of upper and lower case letters, 92 | as in the column name "Cz" (some people like me are sloppy typists, or will forget about a lower case restriction and use capital letters as Excel does). 93 | Now is the time to make use of Google to search for a built in R function that will convert any upper case letters in a character variable to lower case (and leave other characters as is). 94 | 95 | How would you extend your function to treat a vector (of length > 1) of column names? 96 | 97 | A working version of this function is given below, but try to do your own function before looking. 
colnameToNumber <- function(colname){
    # Convert one Excel column name to the corresponding column number.
    #
    # Arguments:
    #   colname: a single column name such as "k" or "dz"; upper case,
    #            lower case and mixed case are all accepted because the
    #            name is converted to lower case before the lookup.
    #            Only the 130 names "a" through "dz" are recognized.
    # Returns:
    #   an integer vector of length 1 holding the column number
    #   ("a" is 1, "z" is 26, "aa" is 27, ... , "dz" is 130).
    # Errors:
    #   stops if colname has more than one entry, and stops with
    #   "no match to colname" if colname is not one of the recognized names.

    # Reject vector input up front: the == comparison below would recycle
    # a longer colname against the lookup vector, and an input such as
    # c("a", "a") would then silently return 1 instead of failing.
    if(length(colname) > 1) {
        stop("colname input to colnameToNumber must be a single column name")
    }

    colname <- tolower(colname)
    # so only need to deal with lower case

    # Build the lookup vector "a", ... , "z", "aa", ... , "az", ... , "dz":
    # each prefix ("" for the single letter names) is repeated once per
    # letter and pasted in front of the recycled letters vector, which
    # keeps the names in Excel column order.
    prefixes <- c("", "a", "b", "c", "d")
    valid.names <- paste0(rep(prefixes, each = length(letters)), letters)

    # get the number corresponding to the input column name
    # using the which function
    colnumber <- which(valid.names == colname)
    # test for having found a (unique) match
    if(length(colnumber) != 1) stop("no match to colname")
    return(colnumber)
}
colvecToNumbers <- function(colvec){
    # Translate a character vector of Excel column names into the
    # integer vector of their column numbers, one entry per name.
    # Each name is looked up with colnameToNumber, so every entry must
    # be a name that colnameToNumber accepts.
    #
    # Stops if colvec is empty or if colvec is not a character vector
    # (a single character string counts as a character vector of length 1).

    n.names <- length(colvec)
    if(n.names < 1) stop("colvec input to colvecToNumbers is empty")
    if(!is.character(colvec)) stop("colvec input to colvecToNumbers is not character")

    # preallocate the result, then fill it in one name at a time
    colnumbers <- vector("integer", n.names)
    position <- 0L
    for (colname in colvec) {
        position <- position + 1L
        colnumbers[position] <- colnameToNumber(colname)
    }

    colnumbers
}
colvecToNumbers(colvec) 199 | colvec <- c("a", "aa", "ba", "ca", "da") # start of groups of 26 columns 200 | colvecToNumbers(colvec) 201 | colvec <- c("z", "az", "bz", "cz", "dz") # end of groups of 26 columns 202 | colvecToNumbers(colvec) 203 | ``` 204 | 205 | Agrees with what it should be. 206 | 207 | Hope this programming exercise was informative and good practice with 208 | writing a function. 209 | 210 | Things to look forward to: later in the R Programming course one will learn 211 | the "apply family" of powerful built in R functions. Not counting the lines 212 | that check whether the input value of colvec is valid, one can write a version 213 | of colvecToNumbers using the one!! line 214 | 215 | sapply(colvec, colnameToNumber) 216 | 217 | Nice and concise and only one line needs to be checked for any mistakes - **sapply** does 218 | the for loop and creates the vector to be returned without any more lines of code needed. 219 | Fewer lines of code (as long as they are reasonably "readable" - not a birds nest of 220 | parentheses and brackets) mean fewer chances to make a mistake and fewer places for bugs to hide. 221 | -------------------------------------------------------------------------------- /first-R-programming-exercise-column-number-from-Excel-column-letter-markdown-UTF8.md: -------------------------------------------------------------------------------- 1 | "First programming exercise" 2 | ---------------------------- 3 | 4 | ### Alan Berger Aug 20, 2020 minor edits Jan 18, 2021 5 | 6 | ### additional R code, run the functions, and some text edits Aug 22, 2020 7 | 8 | Introduction 9 | ------------ 10 | 11 | This is the first in a sequence of programming exercises intended to 12 | fill the gap between learning the correct syntax of basic R commands and 13 | the programming assignments in the R Programming course in the Johns 14 | Hopkins University Data Science Specialization on Coursera. 
In this 15 | sequence of exercises in "composing" an R function to carry out a 16 | particular task, the idea is to practice correct use of R constructs and 17 | built in functions (functions the "come with" the basic R installation), 18 | while learning how to "put together" a correct sequence of blocks of 19 | commands that will obtain the desired result. 20 | 21 | In these exercises, there will be a statement of what your function 22 | should do (what are the input variables and what the function should 23 | return) and a sequence of "hints". To get the most out of these 24 | exercises, try to write your function using as few hints as possible. 25 | Note there are often several ways to write a function that will obtain 26 | the correct result. For these exercises the directions and hints may 27 | point toward a particular approach intended to practice particular 28 | constructs in R and a particular line of reasoning. 29 | There may be an existing R function or package that will do what is 30 | stated for a given practice exercise, but here (unlike other aspects of 31 | the R Programming course) the point is to practice formulating a logical 32 | sequence of steps, with each step a section of code, to obtain a working 33 | function, not to find an existing solution or a quick solution using a 34 | more powerful R construct that is better addressed later on. 35 | 36 | Motivation for this exercise 37 | ---------------------------- 38 | 39 | The input data for many functions will be or include a tab delimited 40 | text file that comes from or is naturally viewed as an Excel 41 | spreadsheet. For example, the last programming assignment in the R 42 | Programming course uses a .csv (comma separated variables) text file 43 | appropriately read in using R's read.csv function. It has a total of 46 44 | columns, only 5 of which are relevant to the assignment (Excel columns 45 | B, G, K, Q and W). 
For this and many other situations, it would be 46 | convenient to have an R function that would take as its input an Excel 47 | column name (as a character variable) and output the corresponding 48 | column number. 49 | 50 | ### Instruction for this function 51 | 52 | For the first version of your function, just have it work for a column 53 | letter between a and z and only take as input a lower case letter. The 54 | skeleton of your function should "look like" 55 | 56 | colnameToNumber <- function(colname) { 57 | # convert an Excel column name 58 | # to the corresponding column number; 59 | # colname should be a lower case letter in the 60 | # form of a character string, for example "a" or "r" 61 | # "between" a and z (including a and z) 62 | 63 | # coding lines 64 | 65 | return(colnumber) 66 | } 67 | 68 | Directions: do NOT use a whole bunch of if statements (yes, one could do 69 | 26 if statements of the form `if(colname == "a") colnumber = 1` and so 70 | on, but, among other things, it would be easy to have typographic 71 | mistakes, and writing all those out would get pretty boring). 72 | 73 | Note that the built in R variable (R object) **letters** is the 74 | character vector with entries "a", "b", ... , "z" and note the built in 75 | R function **which** takes as input a logical vector (often defined by 76 | whether some condition is or is not true) and outputs the integer vector 77 | of the *indices* of the input vector for which the logical value is 78 | **TRUE**, so for example `which(1:4 < 3)` is equal to `c(1, 2)`, and 79 | `which(sqrt(1:4) == 2)` is the single value 4 (since `sqrt(1:4)` is the 80 | vector c(1, $\\sqrt 2$, $\\sqrt 3$, 2) which is 81 | `c(1, 1.414214, 1.732051, 2)` (to the number of digits printed)).
82 | 83 | Your function, when applied to b, g, k, q and w (as character variables) 84 | should return 2, 7, 11, 17 and 23, respectively, and 85 | `colnameToNumber("zz")` should "throw an error" along the lines of 86 | 87 | Error in colnameToNumber("zz") : no match to colname 88 | 89 | This comes from the R statement: `stop("no match to colname")` which 90 | you should have occur in your function if it is the case that the input 91 | was not a lower case letter between a and z. 92 | 93 | Try writing your program now before going to the additional hints below 94 | (the more you do "on your own" the faster you will gain skill at 95 | programming). 96 | 97 | ### Further hints 98 | 99 | You can use the **which** function to "pick out" the index k (an 100 | integer) of the **letters** vector for which `letters[k]` equals 101 | colname, this is the number you want to return. 102 | 103 | How might you check for whether colname was a letter between "a" and 104 | "z"? What will the **which** function return if colname was not a 105 | "valid" value for the colnameToNumber function as currently constructed 106 | (colname is not equal to any entry of **letters**)? (Try it out in an R 107 | session.) 108 | 109 | Things to think about: how would you "extend" your function to treat 110 | some additional Excel columns, say through column "dz" (check out the 111 | **paste** function, you could use it along with **letters** to construct 112 | vectors containing additional column names and concatenate them 113 | together). You could do this with several lines of code to get the 114 | vector of "a" through "dz" (a through z; aa through az, ba through bz; 115 | ca through cz; da through dz; then concatenate them into 1 vector). To 116 | be able to find the column number for many more Excel columns you would 117 | want to use a for loop to construct and concatenate blocks of column 118 | names.
119 | 120 | How would you extend your function to easily deal with upper case 121 | letters or a "mix" of upper and lower case letters, as in the column 122 | name "Cz" (some people like me are sloppy typists, or will forget about 123 | a lower case restriction and use capital letters as Excel does). 124 | Now is the time to make use of Google to search for a built in R 125 | function that will convert any upper case letters in a character 126 | variable to lower case (and leave other characters as is). 127 | 128 | How would you extend your function to treat a vector (of length > 1) 129 | of column names? 130 | 131 | A working version of this function is given below, but try to do your 132 | own function before looking. 133 | 134 | ### colnameToNumber function 135 | 136 | colnameToNumber <- function(colname){ 137 | # convert an Excel column name to the 138 | # corresponding column number; 139 | # should not depend on whether the 140 | # letter(s) in colname are upper or 141 | # lower case or a mixture of upper and 142 | # lower case 143 | # colname should be a character string such as "k" or "dz" 144 | 145 | colname <- tolower(colname) 146 | # so only need to deal with lower case 147 | 148 | # arrange to treat column names from "a" up through "dz" 149 | az <- letters # c("a", ... , "z") 150 | aaz <- paste("a", az, sep = "") 151 | # c("aa", "ab", ... , "az") 152 | # additional column names 153 | baz <- paste("b", az, sep = "") 154 | caz <- paste("c", az, sep = "") 155 | daz <- paste("d", az, sep = "") 156 | 157 | # concatenate the column names 158 | colnames <- c(az, aaz, baz, caz, daz) 159 | 160 | # get the number corresponding to the input column name 161 | # using the which function 162 | colnumber <- which(colnames == colname) 163 | # test for having found a (unique) match 164 | if(length(colnumber) != 1) stop("no match to colname") 165 | return(colnumber) 166 | } 167 | 168 | Do check runs. 
169 | 170 | colnameToNumber("a") 171 | 172 | ## [1] 1 173 | 174 | colnameToNumber("z") 175 | 176 | ## [1] 26 177 | 178 | colnameToNumber("aa") 179 | 180 | ## [1] 27 181 | 182 | colnameToNumber("az") 183 | 184 | ## [1] 52 185 | 186 | colnameToNumber("ba") 187 | 188 | ## [1] 53 189 | 190 | colnameToNumber("bz") 191 | 192 | ## [1] 78 193 | 194 | colnameToNumber("ca") 195 | 196 | ## [1] 79 197 | 198 | colnameToNumber("cz") 199 | 200 | ## [1] 104 201 | 202 | colnameToNumber("da") 203 | 204 | ## [1] 105 205 | 206 | colnameToNumber("dz") 207 | 208 | ## [1] 130 209 | 210 | # colnameToNumber("zz") # should get error message 211 | 212 | Note because we used the built in **letters** character vector, the 213 | calls to colnameToNumber above should be sufficient to check that it has 214 | been coded correctly. Contrast that with the checks that would be needed 215 | if we had used if tests for each column name between a and dz. 216 | 217 | For the last part of this exercise, write a function that uses the 218 | colnameToNumber function to convert a character *vector* of column names 219 | to the corresponding vector of column numbers. One could modify the 220 | colnameToNumber function to do this using a for loop, but here write a 221 | separate function that uses the colnameToNumber function (this set up 222 | will be used as practice with the **sapply** function later on). 223 | 224 | Hints: The output of this function, call the function colvecToNumbers 225 | will be an integer vector, call it colnumbers, having the same 226 | **length** as the input character vector, call it colvec. One can 227 | initialize colnumbers using the **integer** 228 | function, and then "fill in" its entries in a for loop using the 229 | colnameToNumber function. 230 | 231 | A working version of this function is given below, but try to do your 232 | own function before looking. 
233 | 234 | ### colvecToNumbers function 235 | 236 | colvecToNumbers <- function(colvec){ 237 | # convert a character vector of Excel column names 238 | # to the integer vector of corresponding column numbers 239 | # each entry of colvec should be a column name between a and dz 240 | 241 | # check the input is a non-empty character vector 242 | # note a single character string is a character vector of length 1 243 | if(length(colvec) < 1) stop("colvec input to colvecToNumbers is empty") 244 | if(!is.character(colvec)) stop("colvec input to colvecToNumbers is not character") 245 | 246 | # create the integer vector to be output 247 | n <- length(colvec) 248 | colnumbers <- integer(n) 249 | 250 | # use colnameToNumber to get each entry of colnumbers 251 | for (i in 1:n) { 252 | colname <- colvec[i] 253 | colnumbers[i] <- colnameToNumber(colname) 254 | } 255 | 256 | return(colnumbers) 257 | } 258 | 259 | ### check runs for colvecToNumbers 260 | 261 | colvec <- c("b", "g", "k", "q", "w") # the hospital data relevant columns 262 | colvecToNumbers(colvec) 263 | 264 | ## [1] 2 7 11 17 23 265 | 266 | colvec <- c("a", "aa", "ba", "ca", "da") # start of groups of 26 columns 267 | colvecToNumbers(colvec) 268 | 269 | ## [1] 1 27 53 79 105 270 | 271 | colvec <- c("z", "az", "bz", "cz", "dz") # end of groups of 26 columns 272 | colvecToNumbers(colvec) 273 | 274 | ## [1] 26 52 78 104 130 275 | 276 | Agrees with what it should be. 277 | 278 | Hope this programming exercise was informative and good practice with 279 | writing a function. 280 | 281 | Things to look forward to: later in the R Programming course one will 282 | learn the "apply family" of powerful built in R functions. Not counting 283 | the lines that check whether the input value of colvec is valid, one can 284 | write a version of colvecToNumbers using the one!! 
line 285 | 286 | sapply(colvec, colnameToNumber) 287 | 288 | Nice and concise and only one line needs to be checked for any mistakes 289 | - **sapply** does the for loop and creates the vector to be returned 290 | without any more lines of code needed. Fewer lines of code (as long as 291 | they are reasonably "readable" - not a birds nest of parentheses and 292 | brackets) mean fewer chances to make a mistake and fewer places for bugs 293 | to hide. 294 | -------------------------------------------------------------------------------- /Fourth-R-programming-exercise-find-prime-integers-less-than-or-equal-N.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | md_document: 4 | variant: markdown_github 5 | # output: pdf_document 6 | --- 7 | 8 | 9 | ## "Fourth R programming exercise find prime integers less than or equal N" 10 | 11 | ### Alan E. Berger November 22, 2020 12 | 13 | ### version 1 14 | 15 | ### available at https://github.com/AlanBerger/Practice-programming-exercises-for-R 16 | 17 | ## Finish the construction of a function to return all the prime numbers between 1 and a positive integer N 18 | 19 | ## Introduction 20 | 21 | This is the fourth in a sequence of programming exercises in "composing" an R function 22 | to carry out a particular task. The idea is to practice correct use of R constructs and 23 | built in functions (functions that "come with" the basic R installation), while learning how 24 | to "put together" a correct sequence of blocks of commands that will obtain the desired result. 25 | Note these exercises are quite cumulative - one should do them in order. 26 | 27 | In these exercises, there will be a statement of what your function should do 28 | (what are the input variables and what the function should return) and a sequence of "hints". 29 | To get the most out of these exercises, try to write your function using as few hints as possible. 
30 | Note there are often several ways to write a function that will obtain the correct result. 31 | For these exercises the directions and hints may point toward a particular approach intended to 32 | practice particular constructs in R and a particular line of reasoning, 33 | even if there is a more efficient way to obtain the same result. 34 | There may also be an existing R function or package that will do what is stated for a given 35 | practice exercise, but here the point is to practice formulating a logical sequence of steps, 36 | with each step a section of code, to obtain a working function, not to find an existing 37 | solution or a quick solution using a more powerful R construct that is better addressed later on. 38 | 39 | ## Motivation for this exercise 40 | 41 | For this exercise, we will finish constructing the function getPrimeNumbers(N = 1000) which will 42 | return all the prime numbers between 1 and the positive integer N. We will use the isItPrime(n) function 43 | constructed in the previous exercise, which tests whether the positive integer n is a prime number. 44 | This illustrates construction of a function in several steps and in a modular fashion, allowing 45 | for flexibility and easier testing and debugging. 46 | 47 | ## Background 48 | 49 | Recall the definitions and results about prime numbers from the previous exercise: 50 | A positive integer q **evenly divides** a positive integer n if there is a positive 51 | integer k such that n = k * q, for example 3 evenly divides 15; 6 evenly divides 24; but 4 does not evenly 52 | divide 9 (in integer arithmetic, since 9 = 2 * 4 with a **remainder** of 1). 53 | R provides the **mod** function **%%** such that n %% q gives the remainder **r** from integer 54 | dividing n by q (also phrased as **n equals r mod q**).
So q evenly divides n is equivalent to n %% q = 0 55 | 56 | A positive integer p is called **prime** if p > 1 and the only positive integers that evenly divide p are 1 and p 57 | (so the first several prime numbers are 2, 3, 5, 7, 11, 13). In the previous exercise we used the mod function to 58 | construct the isItPrime(n) function. 59 | 60 | The function to be constructed is **getPrimeNumbers**, whose argument N is to be 61 | a positive integer greater than 1, and which should return, in a vector, call it for example primes_up_to_N, 62 | all the prime numbers between 2 and N (including 2, and if N is a prime number, N). 63 | 64 | ## Instructions for constructing **getPrimeNumbers** 65 | 66 | In the previous exercise we constructed **isItPrime(n)** whose argument is a positive integer n that is 67 | at most 1,000,000 (just to avoid accidentally starting an extremely time consuming calculation) 68 | which will return TRUE if n is a prime and FALSE otherwise. This is a copy of isItPrime, the same 69 | as in the previous exercise, except that here I have commented out the check for n being too large since that will 70 | be done in getPrimeNumbers: 71 | 72 | ``` {r} 73 | isItPrime <- function(n) { 74 | # determine whether the positive integer n is prime 75 | # using the mod function, Version 2 76 | 77 | # check that the function argument is "admissible" 78 | # test that n is a positive integer (or a real number that equals a positive integer) 79 | n.int <- as.integer(n) 80 | # if n was a real number such as 3.2 then n.int will be n truncated 81 | # to an integer (for this example, 3) 82 | 83 | if(!(n.int == n)) stop("n is not an integer") 84 | if(n < 1) stop("n is not positive") 85 | 86 | # stop if n is "too large" to avoid a very long calculation 87 | # if(n > 1000000) stop("n is > a million") 88 | 89 | # code to test if n is prime using R's mod function %% 90 | # return TRUE or FALSE 91 | 92 | if(n.int == 1) return(FALSE) 93 | if(n.int == 2) return(TRUE) 94 | #
if got to here, n is at least 3 95 | # test if an integer between 2 and sqrt(n) + 1 evenly divides n 96 | 97 | lastq <- as.integer(sqrt(n)) + 1L 98 | # the L in 1L "tells" R is treat 1 as an 99 | # integer value rather than a real (numeric) value 100 | # this could also have equivalently been done by 101 | # lastq <- as.integer(sqrt(n) + 1) 102 | for (q in 2:lastq) { 103 | if((n %% q) == 0) return(FALSE) 104 | } 105 | 106 | # if got to here, n is prime 107 | return(TRUE) 108 | } 109 | ``` 110 | 111 | Use a for loop and use isItPrime(n) to test each positive integer n between 2 and N to see if it is prime. 112 | Return the integers that are found to be prime in a vector called, for example, primes_up_to_N 113 | 114 | For the first version of getPrimeNumbers, use the following simple construction to obtain primes_up_to_N: 115 | initialize primes_up_to_N to be integer(0), then in a for loop whose index, call it n, runs from 2L to N, use 116 | isItPrime to test if n is a prime. If n is prime, append n to primes_up_to_N via the statement 117 | 118 | primes_up_to_N <- c(primes_up_to_N, n) 119 | 120 | Try writing getPrimeNumbers now. 121 | 122 | If you do getPrimeNumbers(N = 111) you should get 123 | 124 | ``` 125 | getPrimeNumbers(111) 126 | [1] 2 3 5 7 11 13 17 19 23 29 31 37 41 43 47 127 | [16] 53 59 61 67 71 73 79 83 89 97 101 103 107 109 128 | ``` 129 | 130 | The number of values printed on each line in an R session depends on the width of the R console window.
131 | 132 | A working version of getPrimeNumbers follows: 133 | 134 | ``` {r} 135 | getPrimeNumbers <- function(N) { 136 | # N should be a positive integer that is at least 2 137 | # return a vector containing all the prime numbers between 2 and N 138 | # (including 2 and including N if N is a prime) 139 | 140 | # check that the function argument is "admissible" 141 | # test that N is a positive integer (or a real number that equals a positive integer) 142 | N.int <- as.integer(N) 143 | # if N was a real number such as 3.2 then N.int will be N truncated 144 | # to an integer (for this example, 3) 145 | 146 | if(!(N.int == N)) stop("N is not an integer") 147 | if(N < 2) stop("N is not at least 2") 148 | 149 | # stop if N is "too large" to avoid a very long calculation 150 | if(N > 1000000) stop("N is > a million") 151 | 152 | # initialize primes_up_to_N 153 | primes_up_to_N <- integer(0) 154 | 155 | for (n in 2L:N.int) { 156 | if(isItPrime(n)) { 157 | primes_up_to_N <- c(primes_up_to_N, n) 158 | } 159 | } 160 | 161 | return(primes_up_to_N) 162 | } 163 | ``` 164 | 165 | ## Using a **running index** with a preset vector to obtain primes_up_to_N 166 | 167 | In the next version of getPrimeNumbers, instead of doing 168 | 169 | primes_up_to_N <- c(primes_up_to_N, n) 170 | 171 | to "accumulate" the prime numbers in a vector, you are to initialize the integer vector 172 | primes_up_to_N to be of length N to contain the prime numbers between 2 and N. Obviously this 173 | vector will generally be larger than needed, but we can place each prime number as it is found into successive 174 | entries of primes_up_to_N using a **running index**, call it k. 
How this works is one initializes k to 175 | 0 and then each time inside the for loop an integer n is found to be prime, one increases k by 1 and 176 | then sets primes_up_to_N[k] <- n. When the for loop is completed, k will be the number of primes that 177 | were found between 2 and N, and so one then "trims" primes_up_to_N by doing 178 | 179 | primes_up_to_N <- primes_up_to_N[1:k] 180 | 181 | This takes more initial storage space, but is "cleaner" than successively creating new vectors by 182 | doing primes_up_to_N <- c(primes_up_to_N, n) and is a technique one should be familiar with. 183 | 184 | Try writing a version of getPrimeNumbers that uses a predefined primes_up_to_N integer vector 185 | (of length N) and a running index to fill in its entries, and then trim it to the correct length 186 | before returning it. A working version is given below. 187 | 188 | ``` {r} 189 | getPrimeNumbers <- function(N) { 190 | # N should be a positive integer that is at least 2 191 | # return a vector containing all the prime numbers between 2 and N 192 | # (including 2 and including N if N is a prime) 193 | 194 | # for this version use a predefined integer vector primes_up_to_N of 195 | # length N and a running index k to fill in entries, and then trim it 196 | # after the for loop is completed 197 | 198 | # check that the function argument is "admissible" 199 | # test that N is a positive integer (or a real number that equals a positive integer) 200 | N.int <- as.integer(N) 201 | # if N was a real number such as 3.2 then N.int will be N truncated 202 | # to an integer (for this example, 3) 203 | 204 | if(!(N.int == N)) stop("N is not an integer") 205 | if(N < 2) stop("N is not at least 2") 206 | 207 | # if N is "too large" (> 1,000,000) then stop 208 | if(N > 1000000) { 209 | cat("N = ", N, "\n") # print N and also include going to a new output line 210 | stop("N is > a million") 211 | } 212 | 213 | # initialize primes_up_to_N 214 | primes_up_to_N <- integer(N) 215 | k <- 0 #
the running index 216 | 217 | for (n in 2L:N.int) { 218 | if(isItPrime(n)) { 219 | k <- k + 1 # get next location in primes_up_to_N 220 | primes_up_to_N[k] <- n 221 | } 222 | } 223 | 224 | primes_up_to_N <- primes_up_to_N[1:k] # trim to correct length 225 | return(primes_up_to_N) 226 | } 227 | 228 | # do a test run 229 | getPrimeNumbers(111) 230 | ``` 231 | 232 | On my computer the latter version of getPrimeNumbers runs a bit faster than the 233 | former version (for N = 1000000 the former takes about 18 seconds and the latter 12 seconds). 234 | 235 | 236 | ## Using the **readline** function to let the user decide whether to continue a run if N > a million 237 | 238 | 239 | The next version of getPrimeNumbers is the same as the one immediately above except that 240 | instead of stopping with an error if N is > 1,000,000 this version asks the user to decide whether or not to 241 | continue with running the function by replying "yes" or "no" using the **readline** function if N is 242 | > 1,000,000 as illustrated below. 
243 | 244 | ``` {r} 245 | getPrimeNumbers <- function(N) { 246 | # N should be a positive integer that is at least 2 247 | # return a vector containing all the prime numbers between 2 and N 248 | # (including 2 and including N if N is a prime) 249 | 250 | # for this version use a predefined integer vector primes_up_to_N of 251 | # length N and a running index k to fill in entries, and then trim it 252 | # after the for loop is completed 253 | 254 | # check that the function argument is "admissible" 255 | # test that N is a positive integer (or a real number that equals a positive integer) 256 | N.int <- as.integer(N) 257 | # if N was a real number such as 3.2 then N.int will be N truncated 258 | # to an integer (for this example, 3) 259 | 260 | if(!(N.int == N)) stop("N is not an integer") 261 | if(N < 2) stop("N is not at least 2") 262 | 263 | 264 | # if N is "large" (> 1,000,000) check with the user to see if the user wants to proceed 265 | if(N > 1000000) { 266 | cat("N = ", N, "\n") # print N and also include going to a new output line 267 | yes.or.no <- readline("this N is large, do you want to continue, type yes or no: ") 268 | if(yes.or.no != "yes") return("N was large so exited getPrimeNumbers") 269 | } 270 | 271 | 272 | # initialize primes_up_to_N 273 | primes_up_to_N <- integer(N) 274 | k <- 0 # the running index 275 | 276 | for (n in 2L:N.int) { 277 | if(isItPrime(n)) { 278 | k <- k + 1 # get next location in primes_up_to_N 279 | primes_up_to_N[k] <- n 280 | } 281 | } 282 | 283 | primes_up_to_N <- primes_up_to_N[1:k] # trim to correct length 284 | return(primes_up_to_N) 285 | } 286 | 287 | # do a test run 288 | getPrimeNumbers(111) 289 | 290 | # do a second test run 291 | primes.for.N.equal.a.million <- getPrimeNumbers(1000000) 292 | length(primes.for.N.equal.a.million) # should be 78498 293 | primes.for.N.equal.a.million[1000] # should be 7919 294 | primes.for.N.equal.a.million[10000] # should be 104729 295 | tail(primes.for.N.equal.a.million) # the 
last value should be 999983 296 | ``` 297 | 298 | Hope this programming exercise was informative and good practice. 299 | The next set of exercises will get into using data frames. 300 | 301 | = = = = = = = = = = = = = = = = = = = = = = = = 302 | 303 | This work is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. 304 | To view a copy of this license, visit https://creativecommons.org/licenses/by-nc-sa/4.0/ or send a letter 305 | to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA. There is a full version of this license at this web site: 306 | https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode 307 | -------------------------------------------------------------------------------- /second-R-programming-exercise-if-else-if-else-syntax-and-logic.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: md_document 3 | --- 4 | 5 | ## "Second programming exercise if, else if, else syntax and logic" 6 | 7 | ### Alan Berger Aug 25, 2020 minor edits Jan 18, 2021 8 | 9 | ### version 1 10 | 11 | ## Introduction 12 | 13 | This is the second in a sequence of programming exercises intended to fill the gap between learning the correct 14 | syntax of basic R commands and the programming assignments in the R Programming course in the Johns Hopkins University 15 | Data Science Specialization on Coursera. In this sequence of exercises in "composing" an R function to carry out a particular task, 16 | the idea is to practice correct use of R constructs and built in functions (functions that "come with" the basic R installation), while 17 | learning how to "put together" a correct sequence of blocks of commands that will obtain the desired result. 18 | Note these exercises are quite cumulative - one should do them in order.
19 | 20 | In these exercises, there will be a statement of what your function should do (what are the input variables and what the function 21 | should return) and a sequence of "hints". To get the most out of these exercises, 22 | try to write your function using as few hints as possible. 23 | Note there are often several ways to write a function that will obtain the correct result. For these exercises the directions and 24 | hints may point toward a particular approach intended to practice particular constructs in R and a particular line of reasoning. 25 | There may be an existing R function or package that will do what is stated for a given practice exercise, but here 26 | (unlike other aspects of the R Programming course) the point is to practice formulating a logical sequence of steps, 27 | with each step a section of code, to obtain a working function, not to find an existing solution or a quick solution using a 28 | more powerful R construct that is better addressed later on. 29 | 30 | ## Motivation for this exercise 31 | 32 | If statements are a basic construct for determining which commands or blocks of commands should be executed. 33 | The specific function for this exercise is described below. 34 | 35 | ## Some if, else if, else templates 36 | 37 | Below are templates for common if statement constructs. Note it is helpful to use indentation of lines of code and 38 | blank spaces between sections of code as illustrated below. The number of spaces of indentation is somewhat 39 | a personal choice, balancing making code easy to read by having separate blocks of code stand out by having 40 | more spaces in indentations, with not having too many R commands extend over more than 1 line 41 | (and in general, don't go over 80 characters in a line). 42 | The motivation is that code that is easier to read is easier to proofread and spot bugs in. 
43 | 44 | ``` 45 | if (condition1) short.code # short.code is a short R command 46 | # condition1 (and below, condition2, condition3) is a logical statement evaluating to TRUE or FALSE 47 | # when condition1 is true, execute the short.code statement 48 | 49 | 50 | 51 | if (condition1) { 52 | code1 # where code1 (and below, code2, code3, code4) stands for one or more lines of code 53 | } 54 | # when condition1 is true, execute code1 55 | 56 | 57 | 58 | if (condition1) { 59 | code1 60 | } else { 61 | code2 62 | } 63 | # when condition1 is TRUE, then the line(s) of code1 are executed, 64 | # otherwise the line(s) of code2 are executed 65 | 66 | 67 | 68 | if (condition1) { 69 | code1 70 | } else if (condition2) { 71 | code2 72 | } 73 | # when condition1 is TRUE, code1 is executed 74 | # when condition1 is FALSE, then code2 will be executed if condition2 is TRUE; 75 | # when neither condition is TRUE, neither code1 nor code2 get executed, and 76 | # R "proceeds" to the next line of code 77 | 78 | 79 | 80 | # The full range of possibilities within one if, else if, else block are 81 | # illustrated here 82 | if (condition1) { 83 | code1 84 | } else if (condition2) { 85 | code2 86 | } else if (condition3) { 87 | code3 88 | } else { 89 | code4 90 | } 91 | # there can be any (reasonable) number of "else ifs" 92 | 93 | 94 | # Another quick way to use a sequence of if tests when there are only 95 | # a few cases is shown in this example, of getting the rgb (red, green, blue) 96 | # color scale values when there are only a couple possible color names. 
97 | # This is, if nothing else, code that is easy to read/understand 98 | 99 | rgbvec <- 0 # when, after the if statements, rgbvec is still 0, throw an error 100 | if(color == "Magenta") rgbvec <- c(255, 0, 255) 101 | if(color == "ForestGreen") rgbvec <- c(34, 139, 34) 102 | if(color == "Cyan") rgbvec <- c(0, 255, 255) 103 | # test that color matched one of the above choices 104 | if(identical(rgbvec, 0)) stop("color did not match one of the choices") 105 | 106 | # This is just an example, for "real" use for getting rgb colors one would want to have a data frame 107 | # with this information (many color names and corresponding rgb values) 108 | # and extract the rgb values for a given color from it. 109 | # The color information above came from 110 | # https://en.wikipedia.org/wiki/Web_colors 111 | # side note: there are web sites for checking how a color figure would appear 112 | # to people with various types of color blindness, for example 113 | # https://www.color-blindness.com/coblis-color-blindness-simulator/ 114 | ``` 115 | 116 | In many cases in a block of if, else if, else statements the conditions will be 117 | mutually exclusive (at most one of the conditions will be TRUE), and (when there is an else statement) 118 | one of the code blocks will certainly be executed. 119 | The range of values for which the conditions are TRUE can also be increasing or decreasing 120 | sets ("nested" sets) if one orders the sequence of if tests appropriately as 121 | in the exercise program below. 122 | 123 | 124 | ### Instruction for the exercise function using an if, else if, else block 125 | 126 | Constructing the function specified below is an exercise in using an if, else if, else block. 127 | Yes, one could do this by simply using the **signif** or **format** function, but the point here 128 | is to practice if logic. 
129 | 130 | We are given a p-value, denoted by pval, (a number between 0 and 1) and we want to convert it 131 | to a value having 3 significant digits (specifically, we want the corresponding character string). 132 | For example each of these values have 3 significant digits (for a number < 1, "leading 0's" to the right of the 133 | decimal point before one encounters a non-zero digit "don't count" toward the number of significant digits): 134 | 0.123, 0.0123, 0.00123, 0.000123, 0.0000123, 0.00000123 135 | 136 | We would like to do this since for a p-value such as 0.012345678, in most circumstances digits beyond 137 | 0.0123 are just "clutter" since they would rarely matter, and also often wouldn't be justified 138 | due to limited accuracy of the data. 139 | An aside: one reason to keep many digits would be to check code by comparing results 140 | with an independent calculation or "known" value. 141 | 142 | Background note (but not necessary for writing this function): p-values often come from some statistical 143 | test, such as on whether measured values from (independent random) samples from two groups 144 | indicate that the two groups have different group means (for what was being measured). 145 | The pval is then (for this example) the probability that one 146 | would have seen, just by random chance, a difference in group means as large in magnitude or larger 147 | than the difference one actually observed, if the "truth" was that there was no difference between 148 | the two groups (in what was being measured). 149 | 150 | The bottom line is that for this exercise we want to convert pval to a character string having 3 significant digits. 151 | 152 | Again, one could do this by simply using the **signif** or **format** function, but the point here 153 | is to practice if logic. 
So you are to use the **round** function which takes as input a number (or vector) and 154 | "rounds" the input value(s) to have only a specified number of digits appearing to the right of the decimal point. 155 | 156 | The lines below illustrate the behavior of the **round** function, round(numeric.value, digits), which specifies 157 | the number of digits to the right of the decimal point 158 | in the returned value. 159 | 160 | 161 | ``` 162 | > round(1.123, digits = 3) 163 | [1] 1.123 164 | > round(0.123, digits = 3) 165 | [1] 0.123 166 | > round(0.666666667, digits = 3) 167 | [1] 0.667 168 | > round(0.0123, digits = 3) 169 | [1] 0.012 170 | > round(0.0123, digits = 4) 171 | [1] 0.0123 172 | > round(0.00123, digits = 5) 173 | [1] 0.00123 174 | > round(0.000123, digits = 6) 175 | [1] 0.000123 176 | > round(0.0000123, digits = 7) 177 | [1] 1.23e-05 178 | ``` 179 | 180 | Your function, call it pval_To_3Sig_Digits should have the 1 argument pval (a number between 0 and 1, including 0 and 1) 181 | and should return a character string corresponding to pval rounded to 3 significant digits, using one if, else if, else 182 | block of code (that will have multiple else if statements within it), and 183 | the round function. If pval is < 0.00001, then return the character string "p < 0.00001" 184 | The **as.character** function will convert a numeric value to the corresponding character string 185 | 186 | Some hints follow, try programming your function using as few hints as possible. 187 | 188 | Think about how to order the conditions in the if, else if, else statements. Start by treating smaller p-values first and 189 | end with larger p-values. Note more than 1 test within an if, else if, else block of code might be satisfied: in 190 | that case the "consequence" of the first test that is satisfied will be carried out and then control will pass 191 | to the first statement after the if, else if, else block of code.
192 | 193 | Larger hint: the beginning and end of your function should resemble 194 | 195 | ``` 196 | pval_To_3Sig_Digits <- function(pval) { 197 | # the input pval is to be a number between 0 and 1 198 | # use the round function and an if, else if, else block to 199 | # return a character string corresponding to pval rounded to 200 | # 3 significant digits 201 | 202 | # check pvalue is a number between 0 and 1 203 | if(!is.numeric(pval)) stop("pval is not numeric") 204 | # check pval is between 0 and 1 205 | if(pval < 0 || pval > 1) stop("pval not between 0 and 1") 206 | 207 | # the start of the if, else if, else block 208 | 209 | if (pval < 0.00001) { 210 | pval.string <- "p < 0.00001" 211 | 212 | # note if one has gotten to here, pval must be >= 0.00001 so 213 | # digits should be 7 to treat values like 0.0000123 214 | # How small does pval have to be to not get more than 215 | # 3 significant digits from round(pval, digits = 7): this 216 | # determines the next else if condition 217 | 218 | } else if (pval < 0.0001) { 219 | pval.string <- as.character(round(pval, digits = 7)) 220 | # if pval was >= 0.0001 then digits = 7 would in general get 221 | # 4 significant digits 222 | # Hint: the code in the next else if block will use digits = 6, 223 | # what should the condition be in the next else if so that 224 | # digits = 6 does not give more than 3 significant digits 225 | 226 | 227 | # MORE CODE needs to be provided 228 | 229 | 230 | } else { 231 | pval.string <- as.character(round(pval, digits = 3)) 232 | } 233 | 234 | return(pval.string) 235 | } 236 | ``` 237 | 238 | One could also start testing if pval >= 0.1 and "work downward" using successively 239 | smaller values of pval; doing it this way, the else if tests will involve testing whether 240 | pval >= some value 241 | 242 | ## A working version of pval_To_3Sig_Digits 243 | 244 | ```{r} 245 | pval_To_3Sig_Digits <- function(pval) { 246 | # the input pval is to be a number between 0 and 1 247 | # use the
round function and an if, else if, else block to 248 | # return a character string corresponding to pval rounded to 249 | # 3 significant digits 250 | 251 | # check pvalue is a number between 0 and 1 252 | if(!is.numeric(pval)) stop("pval is not numeric") 253 | # check pval is between 0 and 1 254 | if(pval < 0 || pval > 1) stop("pval not between 0 and 1") 255 | 256 | # the if, else if, else block 257 | 258 | if (pval < 0.00001) { 259 | pval.string <- "p < 0.00001" 260 | } else if (pval < 0.0001) { 261 | pval.string <- as.character(round(pval, digits = 7)) 262 | } else if (pval < 0.001) { 263 | pval.string <- as.character(round(pval, digits = 6)) 264 | } else if (pval < 0.01) { 265 | pval.string <- as.character(round(pval, digits = 5)) 266 | } else if (pval < 0.1) { 267 | pval.string <- as.character(round(pval, digits = 4)) 268 | } else { 269 | pval.string <- as.character(round(pval, digits = 3)) 270 | } 271 | 272 | return(pval.string) 273 | } 274 | ``` 275 | 276 | 277 | Do some test runs 278 | 279 | ```{r} 280 | pval_To_3Sig_Digits(1.0) 281 | pval_To_3Sig_Digits(0.123456) 282 | pval_To_3Sig_Digits(0.0123456) 283 | pval_To_3Sig_Digits(0.00123456) 284 | pval_To_3Sig_Digits(0.000123456) 285 | pval_To_3Sig_Digits(0.0000123456) 286 | pval_To_3Sig_Digits(0.00001) 287 | pval_To_3Sig_Digits(0.00000999) 288 | pval_To_3Sig_Digits(0) 289 | 290 | # a note just for future reference 291 | # if one doesn't want scientific (exponential) notation 292 | # one can use the options function to change 293 | # "scipen" which is an R system variable (here an integer) governing 294 | # when R will use scientific notation (for small or large numbers) 295 | # options can change various R options for the current R session 296 | getOption("scipen") # 0 so can change it back 297 | options("scipen" = 999) # don't do scientific (exponential) notation 298 | pval_To_3Sig_Digits(0.0000123456) 299 | pval_To_3Sig_Digits(0.00001) 300 | options("scipen" = 0) # reset to the default value we retrieved 
above 301 | getOption("scipen") # check it was reset 302 | ``` 303 | 304 | Hope this programming exercise was informative and good practice with 305 | writing a function with an if block. 306 | -------------------------------------------------------------------------------- /Fourth-R-programming-exercise-find-prime-integers-less-than-or-equal-N.md: -------------------------------------------------------------------------------- 1 | "Fourth R programming exercise find prime integers less than or equal N" 2 | ------------------------------------------------------------------------ 3 | 4 | ### Alan E. Berger November 22, 2020 5 | 6 | ### version 1 7 | 8 | ### available at https://github.com/AlanBerger/Practice-programming-exercises-for-R 9 | 10 | Finish the construction of a function to return all the prime numbers between 1 and a positive integer N 11 | -------------------------------------------------------------------------------------------------------- 12 | 13 | Introduction 14 | ------------ 15 | 16 | This is the fourth in a sequence of programming exercises in "composing" an R function to carry out a particular task. The idea is to practice correct use of R constructs and built in functions (functions that "come with" the basic R installation), while learning how to "put together" a correct sequence of blocks of commands that will obtain the desired result. 17 | Note these exercises are quite cumulative - one should do them in order. 18 | 19 | In these exercises, there will be a statement of what your function should do (what are the input variables and what the function should return) and a sequence of "hints". To get the most out of these exercises, try to write your function using as few hints as possible. 20 | Note there are often several ways to write a function that will obtain the correct result.
For these exercises the directions and hints may point toward a particular approach intended to practice particular constructs in R and a particular line of reasoning, even if there is a more efficient way to obtain the same result. 21 | There may also be an existing R function or package that will do what is stated for a given practice exercise, but here the point is to practice formulating a logical sequence of steps, with each step a section of code, to obtain a working function, not to find an existing solution or a quick solution using a more powerful R construct that is better addressed later on. 22 | 23 | Motivation for this exercise 24 | ---------------------------- 25 | 26 | For this exercise, we will finish constructing the function getPrimeNumbers(N = 1000) which will return all the prime numbers between 1 and the positive integer N. We will use the isItPrime(n) function constructed in the previous exercise, which tests whether the positive integer n is a prime number. This illustrates construction of a function in several steps and in a modular fashion, allowing for flexibility and easier testing and debugging. 27 | 28 | Background 29 | ---------- 30 | 31 | Recall the definitions and results about prime numbers from the previous exercise: 32 | A positive integer q **evenly divides** a positive integer n if there is a positive integer k such that n = k \* q, for example 3 evenly divides 15; 6 evenly divides 24; but 4 does not evenly divide 9 (in integer arithmetic, since 9 = 2 \* 4 with a **remainder** of 1). 33 | R provides the **mod** function **%%** such that n %% q gives the remainder **r** from integer dividing n by q (also phrased as **n equals r mod q**). So q evenly divides n is equivalent to n %% q = 0 34 | 35 | A positive integer p is called **prime** if p > 1 and the only positive integers that evenly divide p are 1 and p (so the first several prime numbers are 2, 3, 5, 7, 11, 13).
In the previous exercise we used the mod function to construct the isItPrime(n) function. 36 | 37 | The function to be constructed is **getPrimeNumbers**, whose argument N is to be a positive integer greater than 1, and which should return, in a vector, call it for example primes\_up\_to\_N, all the prime numbers between 2 and N (including 2, and if N is a prime number, N). 38 | 39 | Instructions for constructing **getPrimeNumbers** 40 | ------------------------------------------------- 41 | 42 | In the previous exercise we constructed **isItPrime(n)** whose argument is a positive integer n that is at most 1,000,000 (just to avoid accidentally starting an extremely time consuming calculation) which will return either TRUE if n is a prime and FALSE otherwise. This is a copy of isItPrime, the same as in the previous exercise, except that here I have commented out the check for N being too large since that will be done in getPrimeNumbers: 43 | 44 | ``` r 45 | isItPrime <- function(n) { 46 | # determine whether the positive integer n is prime 47 | # using the mod function, Version 2 48 | 49 | # check that the function argument is "admissible" 50 | # test that n is a positive integer (or a real number that equals a positive integer) 51 | n.int <- as.integer(n) 52 | # if n was a real number such as 3.2 then n.int will be n truncated 53 | # to an integer (for this example, 3) 54 | 55 | if(!(n.int == n)) stop("n is not an integer") 56 | if(n < 1) stop("n is not positive") 57 | 58 | # stop if n is "too large" to avoid a very long calculation 59 | # if(n > 1000000) stop("n is > a million") 60 | 61 | # code to test if n is prime using R's mod function %% 62 | # return TRUE or FALSE 63 | 64 | if(n.int == 1) return(FALSE) 65 | if(n.int == 2) return(TRUE) 66 | # if got to here, n is at least 3 67 | # test if an integer between 2 and sqrt(n) + 1 evenly divides n 68 | 69 | lastq <- as.integer(sqrt(n)) + 1L 70 | # the L in 1L "tells" R to treat 1 as an 71 | # integer value rather
than a real (numeric) value 72 | # this could also have equivalently been done by 73 | # lastq <- as.integer(sqrt(n) + 1) 74 | for (q in 2:lastq) { 75 | if((n %% q) == 0) return(FALSE) 76 | } 77 | 78 | # if got to here, n is prime 79 | return(TRUE) 80 | } 81 | ``` 82 | 83 | Use a for loop and use isItPrime(n) to test each positive integer n between 2 and N to see if it is prime. Return the integers that are found to be prime in a vector called, for example, primes\_up\_to\_N 84 | 85 | For the first version of getPrimeNumbers, use the following simple construction to obtain primes\_up\_to\_N: initialize primes\_up\_to\_N to be integer(0), then in a for loop whose index, call it n, runs from 2L to N, use isItPrime to test if n is a prime. If n is prime, append n to primes\_up\_to\_N via the statement 86 | 87 | primes\_up\_to\_N <- c(primes\_up\_to\_N, n) 88 | 89 | Try writing getPrimeNumbers now. 90 | 91 | If you do getPrimeNumbers(N = 111) you should get 92 | 93 | getPrimeNumbers(111) 94 | [1] 2 3 5 7 11 13 17 19 23 29 31 37 41 43 47 95 | [16] 53 59 61 67 71 73 79 83 89 97 101 103 107 109 96 | 97 | The number of values printed on each line in an R session depends on the width of the R console window.
98 | 99 | A working version of getPrimeNumbers follows: 100 | 101 | ``` r 102 | getPrimeNumbers <- function(N) { 103 | # N should be a positive integer that is at least 2 104 | # return a vector containing all the prime numbers between 2 and N 105 | # (including 2 and including N if N is a prime) 106 | 107 | # check that the function argument is "admissible" 108 | # test that N is a positive integer (or a real number that equals a positive integer) 109 | N.int <- as.integer(N) 110 | # if N was a real number such as 3.2 then N.int will be N truncated 111 | # to an integer (for this example, 3) 112 | 113 | if(!(N.int == N)) stop("N is not an integer") 114 | if(N < 2) stop("N is not at least 2") 115 | 116 | # stop if N is "too large" to avoid a very long calculation 117 | if(N > 1000000) stop("N is > a million") 118 | 119 | # initialize primes_up_to_N 120 | primes_up_to_N <- integer(0) 121 | 122 | for (n in 2L:N.int) { 123 | if(isItPrime(n)) { 124 | primes_up_to_N <- c(primes_up_to_N, n) 125 | } 126 | } 127 | 128 | return(primes_up_to_N) 129 | } 130 | ``` 131 | 132 | Using a **running index** with a preset vector to obtain primes\_up\_to\_N 133 | -------------------------------------------------------------------------- 134 | 135 | In the next version of getPrimeNumbers, instead of doing 136 | 137 | primes\_up\_to\_N <- c(primes\_up\_to\_N, n) 138 | 139 | to "accumulate" the prime numbers in a vector, you are to initialize the integer vector 140 | primes\_up\_to\_N to be of length N to contain the prime numbers between 2 and N. Obviously this vector will generally be larger than needed, but we can place each prime number as it is found into successive entries of primes\_up\_to\_N using a **running index**, call it k. 
How this works is one initializes k to 0 and then each time inside the for loop an integer n is found to be prime, one increases k by 1 and then sets primes\_up\_to\_N\[k\] <- n. When the for loop is completed, k will be the number of primes that were found between 2 and N, and so one then "trims" primes\_up\_to\_N by doing 141 | 142 | primes\_up\_to\_N <- primes\_up\_to\_N\[1:k\] 143 | 144 | This takes more initial storage space, but is "cleaner" than successively creating new vectors by doing primes\_up\_to\_N <- c(primes\_up\_to\_N, n) and is a technique one should be familiar with. 145 | 146 | Try writing a version of getPrimeNumbers that uses a predefined primes\_up\_to\_N integer vector (of length N) and a running index to fill in its entries, and then trim it to the correct length before returning it. A working version is given below. 147 | 148 | ``` r 149 | getPrimeNumbers <- function(N) { 150 | # N should be a positive integer that is at least 2 151 | # return a vector containing all the prime numbers between 2 and N 152 | # (including 2 and including N if N is a prime) 153 | 154 | # for this version use a predefined integer vector primes_up_to_N of 155 | # length N and a running index k to fill in entries, and then trim it 156 | # after the for loop is completed 157 | 158 | # check that the function argument is "admissible" 159 | # test that N is a positive integer (or a real number that equals a positive integer) 160 | N.int <- as.integer(N) 161 | # if N was a real number such as 3.2 then N.int will be N truncated 162 | # to an integer (for this example, 3) 163 | 164 | if(!(N.int == N)) stop("N is not an integer") 165 | if(N < 2) stop("N is not at least 2") 166 | 167 | # if N is "too large" (> 1,000,000) then stop 168 | if(N > 1000000) { 169 | cat("N = ", N, "\n") # print N and also include going to a new output line 170 | stop("N is > a million") 171 | } 172 | 173 | # initialize primes_up_to_N 174 | primes_up_to_N <- integer(N) 175 | k <- 0 # the running
index 176 | 177 | for (n in 2L:N.int) { 178 | if(isItPrime(n)) { 179 | k <- k + 1 # get next location in primes_up_to_N 180 | primes_up_to_N[k] <- n 181 | } 182 | } 183 | 184 | primes_up_to_N <- primes_up_to_N[1:k] # trim to correct length 185 | return(primes_up_to_N) 186 | } 187 | 188 | # do a test run 189 | getPrimeNumbers(111) 190 | ``` 191 | 192 | ## [1] 2 3 5 7 11 13 17 19 23 29 31 37 41 43 47 53 59 61 67 193 | ## [20] 71 73 79 83 89 97 101 103 107 109 194 | 195 | On my computer the latter version of getPrimeNumbers runs a bit faster than the former version (for N = 1000000 the former takes about 18 seconds and the latter 12 seconds). 196 | 197 | Using the **readline** function to let the user decide whether to continue a run if N > a million 198 | ---------------------------------------------------------------------------------------------------- 199 | 200 | The next version of getPrimeNumbers is the same as the one immediately above except that instead of stopping with an error if N is > 1,000,000 this version asks the user to decide whether or not to continue with running the function by replying "yes" or "no" using the **readline** function if N is > 1,000,000 as illustrated below. 
201 | 202 | ``` r 203 | getPrimeNumbers <- function(N) { 204 | # N should be a positive integer that is at least 2 205 | # return a vector containing all the prime numbers between 2 and N 206 | # (including 2 and including N if N is a prime) 207 | 208 | # for this version use a predefined integer vector primes_up_to_N of 209 | # length N and a running index k to fill in entries, and then trim it 210 | # after the for loop is completed 211 | 212 | # check that the function argument is "admissible" 213 | # test that N is a positive integer (or a real number that equals a positive integer) 214 | N.int <- as.integer(N) 215 | # if N was a real number such as 3.2 then N.int will be N truncated 216 | # to an integer (for this example, 3) 217 | 218 | if(!(N.int == N)) stop("N is not an integer") 219 | if(N < 2) stop("N is not at least 2") 220 | 221 | 222 | # if N is "large" (> 1,000,000) check with the user to see if the user wants to proceed 223 | if(N > 1000000) { 224 | cat("N = ", N, "\n") # print N and also include going to a new output line 225 | yes.or.no <- readline("this N is large, do you want to continue, type yes or no: ") 226 | if(yes.or.no != "yes") return("N was large so exited getPrimeNumbers") 227 | } 228 | 229 | 230 | # initialize primes_up_to_N 231 | primes_up_to_N <- integer(N) 232 | k <- 0 # the running index 233 | 234 | for (n in 2L:N.int) { 235 | if(isItPrime(n)) { 236 | k <- k + 1 # get next location in primes_up_to_N 237 | primes_up_to_N[k] <- n 238 | } 239 | } 240 | 241 | primes_up_to_N <- primes_up_to_N[1:k] # trim to correct length 242 | return(primes_up_to_N) 243 | } 244 | 245 | # do a test run 246 | getPrimeNumbers(111) 247 | ``` 248 | 249 | ## [1] 2 3 5 7 11 13 17 19 23 29 31 37 41 43 47 53 59 61 67 250 | ## [20] 71 73 79 83 89 97 101 103 107 109 251 | 252 | ``` r 253 | # do a second test run 254 | primes.for.N.equal.a.million <- getPrimeNumbers(1000000) 255 | length(primes.for.N.equal.a.million) # should be 78498 256 | ``` 257 | 258 | ## [1] 
78498 259 | 260 | ``` r 261 | primes.for.N.equal.a.million[1000] # should be 7919 262 | ``` 263 | 264 | ## [1] 7919 265 | 266 | ``` r 267 | primes.for.N.equal.a.million[10000] # should be 104729 268 | ``` 269 | 270 | ## [1] 104729 271 | 272 | ``` r 273 | tail(primes.for.N.equal.a.million) # the last value should be 999983 274 | ``` 275 | 276 | ## [1] 999931 999953 999959 999961 999979 999983 277 | 278 | Hope this programming exercise was informative and good practice. The next set of exercises will get into using data frames. 279 | 280 | = = = = = = = = = = = = = = = = = = = = = = = = 281 | 282 | This work is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. To view a copy of this license, visit https://creativecommons.org/licenses/by-nc-sa/4.0/ or send a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA. There is a full version of this license at this web site: https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode 283 | -------------------------------------------------------------------------------- /second-R-programming-exercise-if-else-if-else-syntax-and-logic.md: -------------------------------------------------------------------------------- 1 | "Second programming exercise if, else if, else syntax and logic" 2 | ---------------------------------------------------------------- 3 | 4 | ### Alan Berger Aug 25, 2020 minor edits Jan 18, 2021 5 | 6 | ### version 1 7 | 8 | Introduction 9 | ------------ 10 | 11 | This is the second in a sequence of programming exercises intended to 12 | fill the gap between learning the correct syntax of basic R commands and 13 | the programming assignments in the R Programming course in the Johns 14 | Hopkins University Data Science Specialization on Coursera.
In this 15 | sequence of exercises in "composing" an R function to carry out a 16 | particular task, the idea is to practice correct use of R constructs and 17 | built in functions (functions that "come with" the basic R installation), 18 | while learning how to "put together" a correct sequence of blocks of 19 | commands that will obtain the desired result. 20 | Note these exercises are quite cumulative - one should do them in order. 21 | 22 | In these exercises, there will be a statement of what your function 23 | should do (what are the input variables and what the function should 24 | return) and a sequence of "hints". To get the most out of these 25 | exercises, try to write your function using as few hints as possible. 26 | Note there are often several ways to write a function that will obtain 27 | the correct result. For these exercises the directions and hints may 28 | point toward a particular approach intended to practice particular 29 | constructs in R and a particular line of reasoning. 30 | There may be an existing R function or package that will do what is 31 | stated for a given practice exercise, but here (unlike other aspects of 32 | the R Programming course) the point is to practice formulating a logical 33 | sequence of steps, with each step a section of code, to obtain a working 34 | function, not to find an existing solution or a quick solution using a 35 | more powerful R construct that is better addressed later on. 36 | 37 | Motivation for this exercise 38 | ---------------------------- 39 | 40 | If statements are a basic construct for determining which commands or 41 | blocks of commands should be executed. The specific function for this 42 | exercise is described below. 43 | 44 | Some if, else if, else templates 45 | -------------------------------- 46 | 47 | Below are templates for common if statement constructs. Note it is 48 | helpful to use indentation of lines of code and blank spaces between 49 | sections of code as illustrated below.
The number of spaces of 50 | indentation is somewhat a personal choice, balancing making code easy to 51 | read by having separate blocks of code stand out by having more spaces 52 | in indentations, with not having too many R commands extend over more 53 | than 1 line (and in general, don't go over 80 characters in a line). The 54 | motivation is that code that is easier to read is easier to proofread 55 | and spot bugs in. 56 | 57 | if (condition1) short.code # short.code is a short R command 58 | # condition1 (and below, condition2, condition3) is a logical statement evaluating to TRUE or FALSE 59 | # when condition1 is true, execute the short.code statement 60 | 61 | 62 | 63 | if (condition1) { 64 | code1 # where code1 (and below, code2, code3, code4) stands for one or more lines of code 65 | } 66 | # when condition1 is true, execute code1 67 | 68 | 69 | 70 | if (condition1) { 71 | code1 72 | } else { 73 | code2 74 | } 75 | # when condition1 is TRUE, then the line(s) of code1 are executed, 76 | # otherwise the line(s) of code2 are executed 77 | 78 | 79 | 80 | if (condition1) { 81 | code1 82 | } else if (condition2) { 83 | code2 84 | } 85 | # when condition1 is TRUE, code1 is executed 86 | # when condition1 is FALSE, then code2 will be executed if condition2 is TRUE; 87 | # when neither condition is TRUE, neither code1 nor code2 get executed, and 88 | # R "proceeds" to the next line of code 89 | 90 | 91 | 92 | # The full range of possibilities within one if, else if, else block are 93 | # illustrated here 94 | if (condition1) { 95 | code1 96 | } else if (condition2) { 97 | code2 98 | } else if (condition3) { 99 | code3 100 | } else { 101 | code4 102 | } 103 | # there can be any (reasonable) number of "else ifs" 104 | 105 | 106 | # Another quick way to use a sequence of if tests when there are only 107 | # a few cases is shown in this example, of getting the rgb (red, green, blue) 108 | # color scale values when there are only a couple possible color names. 
109 | # This is, if nothing else, code that is easy to read/understand 110 | 111 | rgbvec <- 0 # when, after the if statements, rgbvec is still 0, throw an error 112 | if(color == "Magenta") rgbvec <- c(255, 0, 255) 113 | if(color == "ForestGreen") rgbvec <- c(34, 139, 34) 114 | if(color == "Cyan") rgbvec <- c(0, 255, 255) 115 | # test that color matched one of the above choices 116 | if(identical(rgbvec, 0)) stop("color did not match one of the choices") 117 | 118 | # This is just an example, for "real" use for getting rgb colors one would want to have a data frame 119 | # with this information (many color names and corresponding rgb values) 120 | # and extract the rgb values for a given color from it. 121 | # The color information above came from 122 | # https://en.wikipedia.org/wiki/Web_colors 123 | # side note: there are web sites for checking how a color figure would appear 124 | # to people with various types of color blindness, for example 125 | # https://www.color-blindness.com/coblis-color-blindness-simulator/ 126 | 127 | In many cases in a block of if, else if, else statements the conditions 128 | will be mutually exclusive (at most one of the conditions will be TRUE), 129 | and (when there is an else statement) one of the code blocks will 130 | certainly be executed. The range of values for which the conditions are 131 | TRUE can also be increasing or decreasing sets ("nested" sets) if one 132 | orders the sequence of if tests appropriately as in the exercise program 133 | below. 134 | 135 | ### Instruction for the exercise function using an if, else if, else block 136 | 137 | Constructing the function specified below is an exercise in using an if, 138 | else if, else block. Yes, one could do this by simply using the 139 | **signif** or **format** function, but the point here is to practice if 140 | logic. 
141 | 142 | We are given a p-value, denoted by pval, (a number between 0 and 1) and 143 | we want to convert it to a value having 3 significant digits 144 | (specifically, we want the corresponding character string). For example 145 | each of these values have 3 significant digits (for a number < 1, 146 | "leading 0's" to the right of the decimal point before one encounters a 147 | non-zero digit "don't count" toward the number of significant digits): 148 | 0.123, 0.0123, 0.00123, 0.000123, 0.0000123, 0.00000123 149 | 150 | We would like to do this since for a p-value such as 0.012345678, in 151 | most circumstances digits beyond 0.0123 are just "clutter" since they 152 | would rarely matter, and also often wouldn't be justified due to limited 153 | accuracy of the data. An aside: one reason to keep many digits would be 154 | to check code by comparing results 155 | with an independent calculation or "known" value. 156 | 157 | Background note (but not necessary for writing this function): p-values 158 | often come from some statistical test, such as on whether measured 159 | values from (independent random) samples from two groups indicate that 160 | the two groups have different group means (for what was being measured). 161 | The pval is then (for this example) the probability that one would have 162 | seen, just by random chance, a difference in group means as large in 163 | magnitude or larger than the difference one actually observed, if the 164 | "truth" was that there was no difference between the two groups (in what 165 | was being measured). 166 | 167 | The bottom line is that for this exercise we want to convert pval to a 168 | character string having 3 significant digits. 169 | 170 | Again, one could do this by simply using the **signif** or **format** 171 | function, but the point here is to practice if logic. 
So you are to use 172 | the **round** function which takes as input a number (or vector) and 173 | "rounds" the input value(s) to have only a specified number of digits 174 | appearing to the right of the decimal point. 175 | 176 | The lines below illustrate the behavior of the **round** function, 177 | round(numeric.value, digits), which specifies the number of digits to 178 | the right of the decimal point in the returned value. 179 | 180 | > round(1.123, digits = 3) 181 | [1] 1.123 182 | > round(0.123, digits = 3) 183 | [1] 0.123 184 | > round(0.666666667, digits = 3) 185 | [1] 0.667 186 | > round(0.0123, digits = 3) 187 | [1] 0.012 188 | > round(0.0123, digits = 4) 189 | [1] 0.0123 190 | > round(0.00123, digits = 5) 191 | [1] 0.00123 192 | > round(0.000123, digits = 6) 193 | [1] 0.000123 194 | > round(0.0000123, digits = 7) 195 | [1] 1.23e-05 196 | 197 | Your function, call it pval\_To\_3Sig\_Digits should have the 1 argument 198 | pval (a number between 0 and 1, including 0 and 1) and should return a 199 | character string corresponding to pval rounded to 3 significant digits, 200 | using one if, else if, else block of code (that will have multiple else 201 | if statements within it), and the round function. If pval is < 202 | 0.00001, then return the character string "p < 0.00001". The 203 | **as.character** function will convert a numeric value to the 204 | corresponding character string 205 | 206 | Some hints follow, try programming your function using as few hints as 207 | possible. 208 | 209 | Think about how to order the conditions in the if, else if, else 210 | statements. Start by treating smaller p-values first and end with larger 211 | p-values. Note more than 1 test within an if, else if, else block of 212 | code might be satisfied: in that case the "consequence" of the first 213 | test that is satisfied will be carried out and then control will pass to 214 | the first statement after the if, else if, else block of code.
215 | 216 | Larger hint: the beginning and end of your function should resemble 217 | 218 | pval_To_3Sig_Digits <- function(pval) { 219 | # the input pval is to be a number between 0 and 1 220 | # use the round function and an if, else if, else block to 221 | # return a character string corresponding to pval rounded to 222 | # 3 significant digits 223 | 224 | # check pvalue is a number between 0 and 1 225 | if(!is.numeric(pval)) stop("pval is not numeric") 226 | # check pval is between 0 and 1 227 | if(pval < 0 || pval > 1) stop("pval not between 0 and 1") 228 | 229 | # the start of the if, else if, else block 230 | 231 | if (pval < 0.00001) { 232 | pval.string <- "p < 0.00001" 233 | 234 | # note if one has gotten to here, pval must be >= 0.00001 so 235 | # digits should be 7 to treat values like 0.0000123 236 | # How small does pval have to be to not get more than 237 | # 3 significant digits from round(pval, digits = 7): this 238 | # determines the next else if condition 239 | 240 | } else if (pval < 0.0001) { 241 | pval.string <- as.character(round(pval, digits = 7)) 242 | # if pval was >= 0.0001 then digits = 7 would in general get 243 | # 4 significant digits 244 | # Hint: the code in the next else if block will use digits = 6, 245 | # what should the condition be in the next else if so that 246 | # digits = 6 does not give more than 3 significant digits 247 | 248 | 249 | # MORE CODE needs to be provided 250 | 251 | 252 | } else { 253 | pval.string <- as.character(round(pval, digits = 3)) 254 | } 255 | 256 | return(pval.string) 257 | } 258 | 259 | One could also start testing if pval >= 0.1 and "work downward" 260 | using successively smaller values of pval; doing it this way, the else 261 | if tests will involve testing whether pval >= some value 262 | 263 | A working version of pval\_To\_3Sig\_Digits 264 | ------------------------------------------- 265 | 266 | pval_To_3Sig_Digits <- function(pval) { 267 | # the input pval is to be a number between 0 and 1
268 | # use the round function and an if, else if, else block to 269 | # return a character string corresponding to pval rounded to 270 | # 3 significant digits 271 | 272 | # check pvalue is a number between 0 and 1 273 | if(!is.numeric(pval)) stop("pval is not numeric") 274 | # check pval is between 0 and 1 275 | if(pval < 0 || pval > 1) stop("pval not between 0 and 1") 276 | 277 | # the if, else if, else block 278 | 279 | if (pval < 0.00001) { 280 | pval.string <- "p < 0.00001" 281 | } else if (pval < 0.0001) { 282 | pval.string <- as.character(round(pval, digits = 7)) 283 | } else if (pval < 0.001) { 284 | pval.string <- as.character(round(pval, digits = 6)) 285 | } else if (pval < 0.01) { 286 | pval.string <- as.character(round(pval, digits = 5)) 287 | } else if (pval < 0.1) { 288 | pval.string <- as.character(round(pval, digits = 4)) 289 | } else { 290 | pval.string <- as.character(round(pval, digits = 3)) 291 | } 292 | 293 | return(pval.string) 294 | } 295 | 296 | Do some test runs 297 | 298 | pval_To_3Sig_Digits(1.0) 299 | 300 | ## [1] "1" 301 | 302 | pval_To_3Sig_Digits(0.123456) 303 | 304 | ## [1] "0.123" 305 | 306 | pval_To_3Sig_Digits(0.0123456) 307 | 308 | ## [1] "0.0123" 309 | 310 | pval_To_3Sig_Digits(0.00123456) 311 | 312 | ## [1] "0.00123" 313 | 314 | pval_To_3Sig_Digits(0.000123456) 315 | 316 | ## [1] "0.000123" 317 | 318 | pval_To_3Sig_Digits(0.0000123456) 319 | 320 | ## [1] "1.23e-05" 321 | 322 | pval_To_3Sig_Digits(0.00001) 323 | 324 | ## [1] "1e-05" 325 | 326 | pval_To_3Sig_Digits(0.00000999) 327 | 328 | ## [1] "p < 0.00001" 329 | 330 | pval_To_3Sig_Digits(0) 331 | 332 | ## [1] "p < 0.00001" 333 | 334 | # a note just for future reference 335 | # if one doesn't want scientific (exponential) notation 336 | # one can use the options function to change 337 | # "scipen" which is an R system variable (here an integer) governing 338 | # when R will use scientific notation (for small or large numbers) 339 | # options can change various R options for 
the current R session 340 | getOption("scipen") # 0 so can change it back 341 | 342 | ## [1] 0 343 | 344 | options("scipen" = 999) # don't do scientific (exponential) notation 345 | pval_To_3Sig_Digits(0.0000123456) 346 | 347 | ## [1] "0.0000123" 348 | 349 | pval_To_3Sig_Digits(0.00001) 350 | 351 | ## [1] "0.00001" 352 | 353 | options("scipen" = 0) # reset to the default value we retrieved above 354 | getOption("scipen") # check it was reset 355 | 356 | ## [1] 0 357 | 358 | Hope this programming exercise was informative and good practice with 359 | writing a function with an if block. 360 | -------------------------------------------------------------------------------- /Tenth-R-Practice-exercise-merging-annotation-data-into-a-gene-expression-analysis-results-data-frame.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | md_document: 4 | variant: markdown_github 5 | # output: pdf_document 6 | --- 7 | 8 | ## Tenth R Practice exercise merging annotation data into a gene expression analysis results data frame.Rmd 9 | 10 | ### Alan E. Berger Feb 17, 2020 11 | 12 | ### available at https://github.com/AlanBerger/Practice-programming-exercises-for-R 13 | 14 | ## Introduction 15 | 16 | This is the tenth in a sequence of programming exercises in "composing" an R function 17 | to carry out a particular task. Several of these "exercise files" likely 18 | will take several sessions to master the content. The material below practices composing a logical 19 | sequence of steps to program a function that will accomplish a specified task, and 20 | preparing a corresponding data frame. 21 | 22 | The idea of this set of exercises is to practice correct use of R constructs and 23 | built in functions (functions that "come with" the basic R installation), while learning how 24 | to "put together" a correct sequence of blocks of commands that will obtain the desired result. 
25 | Note these exercises are quite cumulative - one should do them in order. 26 | 27 | In these exercises, there will be a statement of what your function should do 28 | (what are the input variables and what the function should return) and a sequence of "hints". 29 | To get the most out of these exercises, try to write your function using as few hints as possible. 30 | Note there are often several ways to write a function that will obtain the correct result. 31 | For these exercises the directions and hints may point toward a particular approach intended to 32 | practice particular constructs in R and a particular line of reasoning, 33 | even if there is a more efficient way to obtain the same result. 34 | There may also be an existing R function or package that will do what is stated for a given 35 | practice exercise, but here the point is to practice formulating a logical sequence of steps, 36 | with each step a section of code, to obtain a working function, not to find an existing 37 | solution or a quick solution using a more powerful R construct that is better addressed later on. 38 | 39 | ## Motivation for this exercise 40 | 41 | In some cases, such as with a gene expression data set, one will want to combine analysis results as 42 | obtained in the previous exercise with annotation information on the probes and on the genes that is in a 43 | separate file that can be read in as a data frame. 44 | 45 | In the R code below we repeat the analysis, done in the previous exercise, of a small subset of gene expression 46 | data comparing expression levels in PBMC samples from patients with Wegener's granulomatosis (WG) with samples 47 | from normal controls (NC). We then also read in a small subset of the annotation file for the Illumina microarray 48 | platform used to measure these expression levels. 
The web site containing the full expression data set is: 49 | https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE18885 50 | and the web site containing the full annotation data for the microarray platform used in obtaining this data is: 51 | https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GPL6104 52 | 53 | What we will want to do is, conceptually, for each row **r** of the analysis results data frame, "find" the row **ra** 54 | of the annotation data frame that has the same Illumina Probe_ID and in effect append selected columns of row ra 55 | from the annotation data frame to row r of the analysis results data frame. R has a function **merge** that will 56 | do this, but for the first exercise we will practice using basic R constructs to compose R code that will do this - 57 | the second exercise here will use the merge function. First read through the code below, that provides the data frames 58 | one will use. 59 | 60 | ``` {r} 61 | ############## analyze the gene expression data 62 | 63 | # the url for reading the little gene expression data file into an R data frame using 64 | # read.delim (for reading in tab delimited text files) is given in the next 3 lines; paste0 joins the pieces so the URL contains no line breaks 65 | url.for.data.file <- paste0("https://raw.githubusercontent.com/AlanBerger/", 66 | "Practice-programming-exercises-for-R/master/tiny-subset-of-GSE18885-", 67 | "gene-expression-data-9-genes-WG-5-samples-Normal-Control-4-samples.tab.txt") 68 | 69 | # read in the data as a data frame 70 | ma <- read.delim(url.for.data.file, nrows = 9, check.names = FALSE, 71 | stringsAsFactors = FALSE) 72 | 73 | # display ma 74 | ma 75 | 76 | # now, in a for loop, get the p-values and fold changes 77 | num.genes <- nrow(ma) # the number of genes in this data frame 78 | 79 | gene <- ma$gene # the column of gene names 80 | probe.vec <- ma[[1]] # the column of Illumina Probe_IDs 81 | # get vectors to hold the p-value and fold change values 82 | p.value <- numeric(num.genes) 83 | fold.change <- numeric(num.genes) 84 | # seq_len(num.genes) is 1, 2, ..., num.genes (and is empty if num.genes is 0) 85 | for (i 
in seq_len(num.genes)) { 86 | # get the vector for the WG expression values and the vector 87 | # for the NC expression values for the ith gene 88 | NCvec <- unlist(ma[i, 3:6]) 89 | WGvec <- unlist(ma[i, 7:11]) 90 | 91 | # calculate the p-value and fold change 92 | pval <- t.test(NCvec, WGvec)$p.value # two-sided unequal variance (Welch) t-test 93 | p.value[i] <- pval 94 | WG.over.NC.fold.change <- 2^(mean(WGvec) - mean(NCvec)) 95 | fold.change[i] <- WG.over.NC.fold.change 96 | } 97 | 98 | # Construct the desired data frame. 99 | analysis.results <- data.frame(probe.vec, gene, p.value, fold.change, 100 | stringsAsFactors = FALSE, check.names = FALSE) 101 | colnames(analysis.results) <- c("Illumina PROBE_ID", "gene", "two-sided p-value", 102 | "WG/NC fold change") 103 | analysis.results 104 | 105 | ############## read in the annotation data file (a small subset of the full annotation) 106 | 107 | # read in the short edited Illumina microarray annotation data file called 108 | # GPL6104-Illumina-microarray-platform-annotation-from-GEO-repository- 109 | # small-subset-edited-example-Feb12.tab.txt 110 | # (paste0 joins the pieces so the URL contains no line breaks) 111 | url.for.annotation.file <- paste0( 112 | "https://raw.githubusercontent.com/AlanBerger/Practice-programming-exercises-for-R/", 113 | "master/GPL6104-Illumina-microarray-platform-annotation-from-GEO-repository-small-", 114 | "subset-edited-example-Feb12.tab.txt") 115 | 116 | annotation.df <- read.delim(url.for.annotation.file, nrows = 15, check.names = FALSE, 117 | stringsAsFactors = FALSE) 118 | 119 | # Note the use of nrows = 15 since there is information on the 120 | # source of this data in later rows that should not be read in as data. 121 | # The choice check.names = FALSE "tells" R to leave the column headers as is 122 | 123 | annotation.df 124 | 125 | # We see that this annotation data file has data for more Illumina probes than 126 | # are in the analysis results data frame, and that the probe IDs are not in the 127 | # same order as in the analysis results data frame. 
For the purpose of the 128 | # practice exercise below we will only append the columns containing the gene name, 129 | # Chromosome number (which chromosome the gene is located on) 130 | # and the short description of the protein encoded by the gene. 131 | # Repeating the gene name gives an indicator to use to double check that the "merge" 132 | # was done correctly. 133 | 134 | # Note this data frame has an example of more than 1 probe for a given 135 | # gene (ATP2B1), where different parts of the same gene are "queried". 136 | 137 | # To keep things simpler with this example, for each probe ID in the analysis results, 138 | # there is 1 row of the annotation data frame with the same probe ID. 139 | 140 | # The merge function can handle the case where there is no matching probe ID in 141 | # the annotation file for a probe ID in the analysis results data frame, in which case 142 | # we would want to append NA's indicating that information is not available in the 143 | # annotation file being used. 144 | 145 | columns.to.keep <- c(1, 2, 4, 6) # keep just these columns of the annotation data frame 146 | # to have print outs easy to see 147 | # we need to keep the probe IDs in column 1 to be able to match rows 148 | 149 | annotation.df <- annotation.df[, columns.to.keep] 150 | # from now on annotation.df will refer to this shortened version of the 151 | # annotation data frame 152 | 153 | annotation.df 154 | 155 | ``` 156 | 157 | ## Programming Exercise: Append to analysis.results information from the annotation data frame 158 | 159 | Approach: Form a vector of row numbers, call it annot.rows, such that for each row r of the analysis.results 160 | data frame; annot.rows[r] will contain the row of the annotation.df data frame that has the same 161 | Illumina Probe_ID as does row r of analysis.results 162 | Then column binding annotation.df[annot.rows, ] to analysis.results (using **cbind**) will yield the desired result. 
163 | 164 | Hint: Use a for loop, and use the **which** function to find, for each row r of analysis.results the 165 | row number ra of annotation.df that has the same probe ID as does row r of analysis.results 166 | 167 | A working version of R code which does this is given below. 168 | 169 | ``` {r} 170 | # use the analysis.results and annotation.df obtained in the R code above. 171 | nrows <- nrow(analysis.results) 172 | # create the integer vector annot.rows of length nrows to hold the 173 | # row numbers of annotation.df matching (with respect to the probe ID) the 174 | # analysis.results rows 175 | annot.rows <- vector(mode = "integer", length = nrows) 176 | 177 | # get the Illumina probe IDs vector from the annotation data frame 178 | annotation.df.probe.ids <- annotation.df[[1]] 179 | # seq_len(nrows) is 1, 2, ..., nrows (and is empty if nrows is 0) 180 | for (r in seq_len(nrows)) { 181 | probe.id <- analysis.results[r, 1] 182 | # find the row ra of annotation.df whose Illumina probe ID matches probe.id 183 | ra <- which(annotation.df.probe.ids == probe.id) 184 | if (length(ra) != 1) stop("did not find unique matching probe id row") 185 | annot.rows[r] <- ra 186 | } 187 | 188 | # append the correct rows (correctly lined up) of annotation.df to analysis.results 189 | analysis.results.with.annotation <- cbind(analysis.results, annotation.df[annot.rows, ]) 190 | analysis.results.with.annotation # display it 191 | 192 | # Note the row numbers are from the rows of annotation.df whose probe IDs 193 | # matched up with those in analysis.results 194 | ``` 195 | 196 | ## Second exercise: use the R **merge** function to append matching annotation lines to analysis.results 197 | 198 | The R merge function can combine two data frames in various ways. 
See for example the web page by Joachim Schork 199 | which is a page in https://statisticsglobe.com/ titled "Merge Data Frames by Column Names in R (3 Examples)": 200 | https://statisticsglobe.com/r-merging-data-frames-by-column-names-merge-function 201 | See also the R help on the merge function (via ? merge). 202 | 203 | While it is good practice to use basic R constructs until they are easy for you to use, using an available 204 | R function can greatly simplify code which then makes it easier to keep free of bugs. Code that uses the 205 | merge function is given below. The merge function is capable of a number of types of merging in addition to the 206 | example below. 207 | 208 | ``` {r} 209 | # Use the R merge function to append annotation to the analysis results 210 | 211 | # recall that annotation.df is referring to the shortened version of the annotation 212 | # merged.df <- merge(x = analysis.results, y = annotation.df, 213 | # by.x = "Illumina PROBE_ID", by.y = "Illumina Probe_ID", 214 | # all.x = TRUE, all.y = FALSE, sort = FALSE) 215 | 216 | # What the above call to merge will do (once the comment symbols # are removed) is: 217 | # use the by.x = "Illumina PROBE_ID" column of analysis.results as the "guide" 218 | # and for each row r of analysis.results, the merge function will in effect search 219 | # to find the row ra of annotation.df such that the entry of row ra in column 220 | # by.y = "Illumina Probe_ID" of annotation.df matches the entry of row r in 221 | # the column by.x = "Illumina PROBE_ID" of analysis.results 222 | # Note these 2 column names are not exactly the same so we need to specify 223 | # the column names in x and y to be used to do matching of rows, 224 | # using the arguments by.x and by.y 225 | # The merge function will, in effect, append row ra of annotation.df to row r 226 | # of analysis.results 227 | # The choice all.x = TRUE means: if there is no match for the entry of row r in 228 | # the column "Illumina PROBE_ID" of 
analysis.results anywhere in the column 229 | # "Illumina Probe_ID" of annotation.df, then a row of NA's is appended to row r 230 | # of analysis.results 231 | # The choice all.y = FALSE means don't include rows of annotation.df other than 232 | # those appended to analysis.results as described above. 233 | # The choice sort = FALSE means do not sort the resulting data frame 234 | # (any sorting would have been done for this call to merge using 235 | # the "Illumina PROBE_ID" column). 236 | 237 | merged.df <- merge(x = analysis.results, y = annotation.df, 238 | by.x = "Illumina PROBE_ID", by.y = "Illumina Probe_ID", 239 | all.x = TRUE, all.y = FALSE, sort = FALSE) 240 | # display it 241 | merged.df 242 | 243 | # Note the Illumina Probe_ID column of annotation.df is NOT included in merged.df 244 | # Let's check that merged.df is the same as analysis.results.with.annotation obtained 245 | # above. First we need to remove the Illumina Probe_ID column from annotation.df 246 | # that is included in analysis.results.with.annotation before we check. 247 | analysis.results.with.annotation <- analysis.results.with.annotation[, -5] 248 | 249 | # check if they are the same 250 | identical(analysis.results.with.annotation, merged.df) 251 | 252 | 253 | # What happened? they looked the same -- so now a little "adventure" 254 | # in finding out what happened -- this sort of thing "comes with the territory" 255 | # when programming in any language (they each have their own quirks). 256 | 257 | # Let's look closer 258 | attributes(analysis.results.with.annotation) 259 | 260 | attributes(merged.df) 261 | 262 | # So the row names were different 263 | 264 | # Looks like we can fix this by setting the row names of 265 | # analysis.results.with.annotation to be those for merged.df 266 | row.names(analysis.results.with.annotation) <- row.names(merged.df) 267 | identical(analysis.results.with.annotation, merged.df) 268 | 269 | 270 | # Now what ???? 
Let's look at the attributes again 271 | 272 | attributes(analysis.results.with.annotation) 273 | 274 | attributes(merged.df) 275 | 276 | # So the row names for analysis.results.with.annotation are 1:9 as characters 277 | # and the row names for merged.df are 1:9 as integers - 278 | # As I said, every language has its quirks 279 | row.names(analysis.results.with.annotation) <- 1:9 280 | attributes(analysis.results.with.annotation) 281 | 282 | # Now if they aren't identical we really do have problems 283 | identical(analysis.results.with.annotation, merged.df) 284 | 285 | # So some semblance of order is restored. The problem was 286 | # row.names(merged.df) returned a character vector 287 | str(row.names(merged.df)) 288 | 289 | # One final verification: I'm going to remove the row of the annotation 290 | # data frame corresponding to the probe ID for the BPI gene 291 | # and then use the merge function 292 | annotation.df <- annotation.df[-5, ] 293 | 294 | merged.df <- merge(x = analysis.results, y = annotation.df, 295 | by.x = "Illumina PROBE_ID", by.y = "Illumina Probe_ID", 296 | all.x = TRUE, all.y = FALSE, sort = FALSE) 297 | # display it 298 | merged.df 299 | 300 | # Note merge filled in NA's for the annotation columns for the row for the probe ID 301 | # corresponding to BPI as expected. 302 | # The merge function also placed the row for which there was no match for the 303 | # probe ID in the annotation file at the bottom of the merged data frame. 304 | 305 | # This illustrates the kind of "exploring" one should do when using a new R function, 306 | # particularly one that has a somewhat complex range of options and for which the 307 | # output has a range of possibilities, in order to be confident about what it will 308 | # do when called a certain way 309 | 310 | ``` 311 | 312 | Hope this was informative and good practice. 
313 | The next exercise will contain further practice in using data frames, and point out some 314 | types of logical mistakes that may result in actual output that, however, is incorrect, 315 | rather than an error message. This is the most dangerous type of mistake, in that if the 316 | incorrect output is not obviously wrong, the mistake might not be recognized until it 317 | causes serious consequences. That is why it is always wise to do, whenever possible, test 318 | runs for cases where one knows or can independently calculate the true result. 319 | 320 | = = = = = = = = = = = = = = = = = = = = = = = = 321 | 322 | This work is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. 323 | To view a copy of this license, visit https://creativecommons.org/licenses/by-nc-sa/4.0/ or send a letter 324 | to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA. There is a full version of this license at this web site: 325 | https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode 326 | 327 | Note the reader should not infer any endorsement or recommendation or approval for the material in this article from 328 | any of the sources or persons cited above or any other entities mentioned in this article. 329 | 330 | -------------------------------------------------------------------------------- /third-R-programming-exercise-using-for-loops-and-if-tests-to-check-if-a-positive-integer-is-a-prime.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | #output: 3 | # md_document: 4 | # variant: markdown_github 5 | output: pdf_document 6 | --- 7 | 8 | 9 | ## "Third programming exercise using for loops and if tests to check if a positive integer is a prime" 10 | 11 | ### Alan E. 
Berger Sept 24, 2020 minor edits 21 Nov 2020 12 | 13 | ### version 1 14 | 15 | ### available at https://github.com/AlanBerger/Practice-programming-exercises-for-R 16 | 17 | ## Introduction 18 | 19 | The construction of a function to return all the prime numbers between 1 and a positive integer N 20 | will be done through a sequence of exercises. 21 | 22 | This is the third in a sequence of programming exercises in "composing" an R function 23 | to carry out a particular task. The idea is to practice correct use of R constructs and 24 | built in functions (functions that "come with" the basic R installation), while learning how 25 | to "put together" a correct sequence of blocks of commands that will obtain the desired result. 26 | Note these exercises are quite cumulative - one should do them in order. 27 | 28 | In these exercises, there will be a statement of what your function should do 29 | (what are the input variables and what the function should return) and a sequence of "hints". 30 | To get the most out of these exercises, try to write your function using as few hints as possible. 31 | Note there are often several ways to write a function that will obtain the correct result. 32 | For these exercises the directions and hints may point toward a particular approach intended to 33 | practice particular constructs in R and a particular line of reasoning, 34 | even if there is a more efficient way to obtain the same result. 35 | There may also be an existing R function or package that will do what is stated for a given 36 | practice exercise, but here the point is to practice formulating a logical sequence of steps, 37 | with each step a section of code, to obtain a working function, not to find an existing 38 | solution or a quick solution using a more powerful R construct that is better addressed later on. 
39 | 40 | ## Motivation for this exercise 41 | 42 | If statements and for loops are basic constructs for constructing a function that will carry out a specified computation. 43 | This exercise practices writing a code containing if statements and a for loop 44 | that through a sequence of steps computes the desired result. 45 | The specific function for this exercise is described below. 46 | 47 | ## Background 48 | 49 | The function to be written, getPrimeNumbers(N = 1000), in several steps with increasing amount of 50 | efficiency and use of logic in writing the code, 51 | is to return all the prime numbers between 1 and the integer N. 52 | For this exercise one will write a function, isItPrime, to test if a given positive integer is a prime. 53 | The isItPrime function will then be used in the next exercise to construct getPrimeNumbers. 54 | 55 | Some basic definitions and results about prime numbers needed in writing these functions follow. 56 | A positive integer q **evenly divides** a positive integer n if there is a positive 57 | integer k such that n = k * q, for example 3 evenly divides 15; 6 evenly divides 24; but 4 does not evenly 58 | divide 9 (in integer arithmetic, since 9 = 2 * 4 with a **remainder** of 1). R provides the **mod** function **%%** such 59 | that n %% q gives the remainder **r** from integer dividing n by q (also phrased as **n equals r mod q**). 60 | So q evenly divides n is equivalent to n %% q = 0 61 | 62 | Here are a few sample results from using the mod function 63 | 64 | ``` {r} 65 | 9 %% 4 # 9 mod 4 equals 1 66 | 15 %% 3 # 3 evenly divides 15 67 | 24 %% 6 # 6 evenly divides 24 68 | 32 %% 15 # 15 does not evenly divide 32 69 | ``` 70 | 71 | A positive integer p is called **prime** if p > 1 and the only positive integers that evenly divide p are 1 and p 72 | (so the first several prime numbers are 2, 3, 5, 7, 11, 13). We can thus use the mod function to test whether a 73 | given positive integer is prime. 
There are more sophisticated approaches, see for example the Wikipedia article 74 | ["Primality Test"](https://en.wikipedia.org/wiki/Primality_test#:~:text=9%20External%20links-,Simple%20methods,composite%2C%20otherwise%20it%20is%20prime.) 75 | 76 | This is an example of it often being the case that one needs to learn something about the data and/or the science 77 | that is relevant to the program one is writing in order to properly do an analysis or correctly carry out a calculation. 78 | This can sometimes be essential to avoid serious mistakes, or to write a program that does not take 79 | an impractical amount of time to do the calculation. 80 | 81 | The function to be constructed by the end of this sequence of exercises is **getPrimeNumbers**, whose argument N is to be 82 | a positive integer greater than 1, and which should return, in a vector, call it for example primes_up_to_N, 83 | all the prime numbers between 2 and N (including 2, and if N is a prime number, N). 84 | 85 | This sequence of exercises will work through the construction of several functions leading up to getPrimeNumbers, 86 | the point of which is to practice basic R constructs and "putting together" code to get a working function. 87 | This is also an example of writing several functions that together give a modular construction of code to 88 | obtain a desired result. It is often easier to test and debug a sequence of functions than one large function, 89 | and in some cases the individual functions can be used or quickly modified for use for another purpose. 90 | 91 | 92 | ### Instruction for the first function in this exercise 93 | 94 | The first function in this sequence will be **isItPrime(n)** whose argument is a positive integer n that is 95 | at most 1,000,000 (just to avoid accidentally starting an extremely time consuming calculation) 96 | which will return TRUE if n is a prime and FALSE otherwise. 
97 | To do this, treat the two cases n equal 1 (not a prime) and n equal 2 (a prime) separately 98 | (at the beginning of the function). Then for n greater than 2 simply check whether any integer 99 | between 2 and (n-1) evenly divides n (we will make this more efficient in the next version 100 | of isItPrime). Note my usage of the phrase: values "between X and Y" includes the "endpoints" X and Y. 101 | A skeleton of this function is: 102 | 103 | ``` 104 | isItPrimeV1 <- function(n) { 105 | # determine whether the positive integer n is prime 106 | # using the mod function, return TRUE or FALSE accordingly 107 | 108 | # check that the function argument is "admissible" 109 | # test that n is a positive integer (or a real number that equals a positive integer) 110 | n.int <- as.integer(n) 111 | # if n was a real number such as 3.2 then n.int will be n truncated 112 | # to an integer (for this example, 3) 113 | 114 | if(!(n.int == n)) stop("n is not an integer") 115 | if(n < 1) stop("n is not positive") 116 | 117 | # stop if n is "too large" to avoid a very long calculation 118 | if(n > 1000000) stop("n is > a million") 119 | 120 | # code to test if n is prime using R's mod function %% 121 | 122 | # special cases 123 | if(n == 1) return(FALSE) 124 | if(n == 2) return(TRUE) 125 | 126 | ##### rest of code to test if n is prime when n is at least 3 127 | 128 | } 129 | ``` 130 | 131 | Try programming this now. 132 | 133 | 134 | 135 | First hint: 136 | 137 | positive integer q evenly divides positive integer n 138 | if and only if n mod q is 0 139 | 140 | 141 | 142 | 143 | Second hint: 144 | 145 | positive integer n greater than 2 is a prime if and only if 146 | no integer between 2 and (n-1) evenly divides n; test using a for loop 147 | 148 | 149 | note 2:(n-1) (what you want for the range of the for loop) 150 | is **not** the same as 2:n-1 which 151 | equals (2:n) - 1 and is 1,...,(n-1) 152 | 153 | 154 | A working code is given below. 
155 | 156 | 157 | 158 | ```{r} 159 | isItPrimeV1 <- function(n) { 160 | # determine whether the positive integer n is prime 161 | # using the mod function (Version 1: trial division by every integer from 2 to n-1) 162 | 163 | # check that the function argument is "admissible" 164 | # test that n is a positive integer (or a real number that equals a positive integer) 165 | n.int <- as.integer(n) 166 | # if n was a real number such as 3.2 then n.int will be n truncated 167 | # to an integer (for this example, 3) 168 | 169 | if(!(n.int == n)) stop("n is not an integer") 170 | if(n < 1) stop("n is not positive") 171 | 172 | # stop if n is "too large" to avoid a very long calculation 173 | if(n > 1000000) stop("n is > a million") 174 | 175 | # code to test if n is prime using R's mod function %% 176 | # return TRUE or FALSE 177 | 178 | if(n.int == 1) return(FALSE) 179 | if(n.int == 2) return(TRUE) 180 | # if got to here, n is at least 3 181 | # test if an integer between 2 and (n-1) evenly divides n 182 | 183 | for (q in 2:(n-1)) { 184 | if((n %% q) == 0) return(FALSE) 185 | } 186 | 187 | # if got to here, n is prime 188 | return(TRUE) 189 | } 190 | 191 | ##### 192 | # do a couple test runs 193 | isItPrimeV1(2) 194 | isItPrimeV1(3) 195 | isItPrimeV1(4) 196 | isItPrimeV1(5) 197 | isItPrimeV1(6) 198 | 199 | # test several Mersenne numbers 200 | # if you are curious, Google "Mersenne number" 201 | isItPrimeV1(2^17 - 1) # known to be prime 202 | isItPrimeV1(2^11 - 1) # known to be not prime 203 | isItPrimeV1(2^6 - 1) # known to be not prime 204 | ``` 205 | 206 | ## Instruction for the second function in this exercise 207 | 208 | Often one can improve the efficiency or accuracy of a program if one learns some more 209 | about the subject matter. Here, consider the possible positive integers q (q at least 2) 210 | that could evenly divide a positive integer n that is at least 3 211 | (recall we are treating n equal 1 and n equal 2 "by hand"). 
If q is between 2 and (n-1) and evenly 212 | divides n, then n = q * k for some positive integer k, and k must be between 2 and (n-1) (since k equal 213 | 1 would be too small because q is at most (n-1), and k equal n would be too large because q is at least 2). 214 | In fact, by similar reasoning, one of the values k and q must be at or below $\sqrt n$ since 215 | otherwise q * k would be > n. Hence, if n is not a prime, it must be evenly divisible by an integer 216 | between 2 and $\sqrt n$. So we only have to run the for loop in isItPrime 217 | from 2 through as.integer(sqrt(n)). While that doesn't make much practical difference in computation 218 | time for modest size n, in other circumstances a little analysis can make a substantial difference 219 | in run time and or accuracy. 220 | 221 | Now modify the isItPrimeV1 function to take advantage of this additional information, call it isItPrime 222 | 223 | Try writing it - and do the test runs. Did you get the correct answer that it is TRUE that 3 is prime? 224 | If not - What went wrong? 
225 | 226 | 227 | Here is a version **that fails for n = 3** (but works for other positive integers) 228 | 229 | ```{r} 230 | isItPrime <- function(n) { 231 | # determine whether the positive integer n is prime 232 | # using the mod function, Version 2 233 | 234 | # check that the function argument is "admissible" 235 | # test that n is a positive integer (or a real number that equals a positive integer) 236 | n.int <- as.integer(n) 237 | # if n was a real number such as 3.2 then n.int will be n truncated 238 | # to an integer (for this example, 3) 239 | 240 | if(!(n.int == n)) stop("n is not an integer") 241 | if(n < 1) stop("n is not positive") 242 | 243 | # stop if n is "too large" to avoid a very long calculation 244 | if(n > 1000000) stop("n is > a million") 245 | 246 | # code to test if n is prime using R's mod function %% 247 | # return TRUE or FALSE 248 | 249 | if(n.int == 1) return(FALSE) 250 | if(n.int == 2) return(TRUE) 251 | # if got to here, n is at least 3 252 | # test if an integer between 2 and as.integer(sqrt(n)) evenly divides n 253 | 254 | lastq <- as.integer(sqrt(n)) 255 | for (q in 2:lastq) { 256 | if((n %% q) == 0) return(FALSE) 257 | } 258 | 259 | # if got to here, n is prime 260 | return(TRUE) 261 | } 262 | 263 | 264 | # do a couple test runs 265 | isItPrime(2) 266 | isItPrime(3) # this should return TRUE, did it?, if not why not? 267 | isItPrime(4) 268 | isItPrime(5) 269 | isItPrime(6) 270 | 271 | # test several Mersenne numbers 272 | isItPrime(2^17 - 1) # known to be prime 273 | isItPrime(2^11 - 1) # known to be not prime 274 | isItPrime(2^6 - 1) # known to be not prime 275 | ``` 276 | 277 | Debugging syntax errors (such as forgetting a parenthesis or bracket or curly brace or typing one of these 278 | when another is required, or typing a left one when a right one is needed etc.) 
or 279 | using the $ form to extract a column from a data frame with a **variable** that contains a column 280 | name but not an actual name of a column (which doesn't even give an error message! - R just returns **NULL**), 281 | and debugging errors in the logical construction of the code as in this case, can be a frustrating part of programming, 282 | but is a necessary skill that one learns with practice (more on this in later exercises). 283 | 284 | In the test cases, this code failed for n = 3. So look at each line of the code (from the top) and think about 285 | (or, in general (but not needed here), print out, or for larger objects use head or tail or str (structure) to look at) 286 | what takes place in each line (that is, what was the new value of the variable that was created or modified in that line). 287 | Note the upper right RStudio sub-window (Global Environment) displays helpful information on existing R objects 288 | if you run the lines of a function in the R console (not within a function), which is often a useful way to 289 | do testing and debugging. Or, in this case, since the code worked before, look at the effect of what was changed, 290 | which was to run the for loop from 2 to lastq equal to (in the case that failed) as.integer(sqrt(3)). 291 | Well, sqrt(3) is 1.732 (to 4 significant digits) so lastq is 1 when n = 3, and the range of the for loop 292 | in this case is the two values {2, 1} and 1 evenly divides any integer, so that is why the code failed 293 | (returned FALSE when n was 3). This is an example of the type of reasoning used to track down a coding error. 294 | The failure only occurs with n equal 3 since for n larger than 3, as.integer(sqrt(n)) is at least 2. 
295 | One easy fix is to do the case n = 3 "by hand": add the if test: if(n.int == 3) return(TRUE) 296 | Another way is to increase lastq by 1: lastq <- as.integer(sqrt(n)) + 1L 297 | (With some algebra one can check that this value of lastq is < n when n is at least 3, so 298 | it is alright to use this value of lastq in isItPrime). 299 | Here is a version that works: 300 | 301 | ```{r} 302 | isItPrime <- function(n) { 303 | # determine whether the positive integer n is prime 304 | # using the mod function, Version 2 305 | 306 | # check that the function argument is "admissible" 307 | # test that n is a positive integer (or a real number that equals a positive integer) 308 | n.int <- as.integer(n) 309 | # if n was a real number such as 3.2 then n.int will be n truncated 310 | # to an integer (for this example, 3) 311 | 312 | if(!(n.int == n)) stop("n is not an integer") 313 | if(n < 1) stop("n is not positive") 314 | 315 | # stop if n is "too large" to avoid a very long calculation 316 | if(n > 1000000) stop("n is > a million") 317 | 318 | # code to test if n is prime using R's mod function %% 319 | # return TRUE or FALSE 320 | 321 | if(n.int == 1) return(FALSE) 322 | if(n.int == 2) return(TRUE) 323 | # if got to here, n is at least 3 324 | # test if an integer between 2 and sqrt(n) + 1 evenly divides n 325 | 326 | lastq <- as.integer(sqrt(n)) + 1L 327 | # the L in 1L "tells" R is treat 1 as an 328 | # integer value rather than a real (numeric) value 329 | # this could also have equivalently been done by 330 | # lastq <- as.integer(sqrt(n) + 1) 331 | for (q in 2:lastq) { 332 | if((n %% q) == 0) return(FALSE) 333 | } 334 | 335 | # if got to here, n is prime 336 | return(TRUE) 337 | } 338 | 339 | 340 | # do a couple test runs 341 | isItPrime(2) 342 | isItPrime(3) 343 | isItPrime(4) 344 | isItPrime(5) 345 | isItPrime(6) 346 | 347 | # test several Mersenne numbers 348 | isItPrime(2^17 - 1) # known to be prime 349 | isItPrime(2^11 - 1) # known to be not prime 350 | 
isItPrime(2^6 - 1) # known to be not prime 351 | ``` 352 | 353 | ## Instruction for the third function in this exercise 354 | 355 | Suppose we are curious to see, for values of n that were not prime, what was the first value of q 356 | in the for loop that evenly divided n. Modify isItPrime (call the modified function isItPrimeV2) so that it 357 | returns a vector with 3 named entries: 358 | 359 | c(is_n_prime = 1, n = n, firstq = n) when n is prime, and 360 | 361 | c(is_n_prime = 0, n = n, firstq = q) when n is not a prime 362 | 363 | Running these test cases should give the results that follow: 364 | 365 | ``` 366 | # do a couple test runs 367 | isItPrimeV2(2) 368 | isItPrimeV2(3) 369 | isItPrimeV2(4) 370 | isItPrimeV2(5) 371 | isItPrimeV2(6) 372 | 373 | # test several Mersenne numbers 374 | isItPrimeV2(2^17 - 1) # known to be prime 375 | isItPrimeV2(2^11 - 1) # known to be not prime 376 | isItPrimeV2(2^6 - 1) # known to be not prime 377 | 378 | ##### should get these results: 379 | 380 | isItPrimeV2(2) 381 | is_n_prime n firstq 382 | 1 2 2 383 | 384 | isItPrimeV2(3) 385 | is_n_prime n firstq 386 | 1 3 3 387 | 388 | isItPrimeV2(4) 389 | is_n_prime n firstq 390 | 0 4 2 391 | 392 | isItPrimeV2(5) 393 | is_n_prime n firstq 394 | 1 5 5 395 | 396 | isItPrimeV2(6) 397 | is_n_prime n firstq 398 | 0 6 2 399 | 400 | # test several Mersenne numbers 401 | isItPrimeV2(2^17 - 1) # known to be prime 402 | is_n_prime n firstq 403 | 1 131071 131071 404 | 405 | isItPrimeV2(2^11 - 1) # known to be not prime 406 | is_n_prime n firstq 407 | 0 2047 23 408 | 409 | isItPrimeV2(2^6 - 1) # known to be not prime 410 | is_n_prime n firstq 411 | 0 63 3 412 | ``` 413 | 414 | 415 | A working version of the code is: 416 | 417 | 418 | ```{r} 419 | isItPrimeV2 <- function(n) { 420 | # determine whether the positive integer n is prime 421 | # using the mod function, Version 2 422 | 423 | # check that the function argument is "admissible" 424 | # test that n is a positive integer (or a real number 
that equals a positive integer) 425 | n.int <- as.integer(n) 426 | # if n was a real number such as 3.2 then n.int will be n truncated 427 | # to an integer (for this example, 3) 428 | 429 | if(!(n.int == n)) stop("n is not an integer") 430 | if(n < 1) stop("n is not positive") 431 | 432 | # stop if n is "too large" to avoid a very long calculation 433 | if(n > 1000000) stop("n is > a million") 434 | 435 | # code to test if n is prime using R's mod function %% 436 | # return c(is_n_prime = 1, n = n, firstq = n) when n is prime 437 | # return c(is_n_prime = 0, n = n, firstq = q) when n is not a prime 438 | # where firstq is the first (smallest) integer 439 | # greater than 1 that evenly divides n (firstq is set to 1 if n is 1) 440 | # (q is the index of the for loop below) 441 | 442 | if(n.int == 1) return(c(is_n_prime = 0, n = 1, firstq = 1)) 443 | if(n.int == 2) return(c(is_n_prime = 1, n = 2, firstq = 2)) 444 | # if got to here, n is at least 3 445 | # test if an integer between 2 and sqrt(n) + 1 evenly divides n 446 | 447 | lastq <- as.integer(sqrt(n)) + 1L 448 | # the L in 1L "tells" R is treat 1 as an 449 | # integer value rather than a real (numeric) value 450 | # this could also have equivalently been done by 451 | # lastq <- as.integer(sqrt(n) + 1) 452 | for (q in 2:lastq) { 453 | if((n %% q) == 0) return(c(is_n_prime = 0, n = n.int, firstq = q)) 454 | } 455 | 456 | # if got to here, n is prime 457 | return(c(is_n_prime = 1, n = n.int, firstq = n.int)) 458 | } 459 | 460 | 461 | # do a couple test runs 462 | isItPrimeV2(2) 463 | isItPrimeV2(3) 464 | isItPrimeV2(4) 465 | isItPrimeV2(5) 466 | isItPrimeV2(6) 467 | 468 | # test several Mersenne numbers 469 | isItPrimeV2(2^17 - 1) # known to be prime 470 | isItPrimeV2(2^11 - 1) # known to be not prime 471 | isItPrimeV2(2^6 - 1) # known to be not prime 472 | ``` 473 | 474 | Hope this programming exercise was informative and good practice. 
475 | The next programming exercise will be to use your isItPrime function 476 | as the "engine" for writing getPrimeNumbers(N = 1000), which will return all the prime numbers 477 | between 1 and the positive integer N. 478 | 479 | = = = = = = = = = = = = = = = = = = = = = = = = 480 | 481 | This work is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. 482 | To view a copy of this license, visit https://creativecommons.org/licenses/by-nc-sa/4.0/ or send a letter 483 | to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA. There is a full version of this license at this web site: 484 | https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode 485 | -------------------------------------------------------------------------------- /Fifth-article-Review-of-getting-subsets-of-a-data-frame-constructing-data-frames.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | md_document: 4 | variant: markdown_github 5 | # output: pdf_document 6 | --- 7 | 8 | ## Fifth article: Review of getting subsets of a data frame, constructing data frames 9 | 10 | ### Alan E. Berger December 9, 2020 11 | 12 | ### version 1 13 | 14 | ### available at https://github.com/AlanBerger/Practice-programming-exercises-for-R 15 | 16 | ## Motivation 17 | 18 | Data frames, which are analogous to Excel spreadsheets for which the entries in each column are of the same 19 | "type" and each column has the same number of rows, are a fundamental way of handling data in R. 20 | 21 | I'll first review various ways of extracting subsets of a data frame, and then review several ways to 22 | construct a data frame from multiple vectors or from smaller data frames. This is a fairly long article - 23 | it is not intended to be read all at one time. 24 | 25 | R often has several equivalent ways of doing something. Perhaps?? 
this is from R having been developed in 26 | a collaborative fashion by several people with their own favorite ways of doing various programming constructs, so 27 | they all got included. One can choose one's favorite ways of doing things, but one needs to be familiar with all 28 | the commonly used constructs in order to be able to understand code written by others (and, importantly, 29 | to understand code in instructions and examples for R packages one wants to use). 30 | 31 | ## Review of how to get specified subsets of a data frame: A. getting a single column 32 | 33 | This material is taken/modified from a pinned post of mine 34 | "Examples of extracting as a vector a single column from a data frame" in the Week 2 Discussion Forum for 35 | the R Programming Course in the Johns Hopkins Data Science Specialization on Coursera. 36 | 37 | If you are taking this course, then note in the Week 1 pinned posts in the Discussion Forum, that Leonard Greski has 38 | written a very good more general article on getting row and column subsets from a data 39 | frame "Forms of the Extract Operator in R" (this article also contains some more advanced material covered later 40 | in the R course, so read what is relevant to where you are in the R Programming course and refer back later as you 41 | learn more R); and one is also well advised to read Al Warren's pinned post in the Week 2 Discussion 42 | Forum "Subsetting with bracket notation". 43 | 44 | Getting (often referred to as *extracting*) a single column from a data frame is a common step in an R function, 45 | and one usually will want to get the column in the form of a vector, not as a data frame with that one column. 
46 | 47 | Let's see how to get, as a vector, for example the sulfate column of a simple example data frame; 48 | 49 | ``` 50 | df <- data.frame(sulfate = c(4.79, 1.46, 4.28, NA), nitrate = c(0.299, NA, 4.280, 3.560)) 51 | df 52 | sulfate nitrate 53 | 1 4.79 0.299 54 | 2 1.46 NA 55 | 3 4.28 4.280 56 | 4 NA 3.560 57 | ``` 58 | 59 | To get the sulfate column as vector you can do either of the following 5 equivalent statements: 60 | 61 | ``` 62 | df[["sulfate"]] # double brackets 63 | [1] 4.79 1.46 4.28 NA 64 | # or 65 | df$sulfate # note sulfate does not need to be in quotes for the $ form of extraction 66 | # (but a text string with blanks in it would need to be) 67 | [1] 4.79 1.46 4.28 NA 68 | # or 69 | df[, "sulfate"] # single brackets but note the comma so we get all the rows in the sulfate column 70 | [1] 4.79 1.46 4.28 NA 71 | # or one could use the column number 72 | df[[1]] 73 | [1] 4.79 1.46 4.28 NA 74 | df[, 1] 75 | [1] 4.79 1.46 4.28 NA 76 | 77 | # Note that df["sulfate"] # single brackets, no comma, 78 | # is a 1 column data frame containing the sulfate column; if you are getting 79 | # 1 column from a data frame you will usually want it as a vector 80 | 81 | df["sulfate"] # single brackets gives a data frame 82 | sulfate 83 | 1 4.79 84 | 2 1.46 85 | 3 4.28 86 | 4 NA 87 | 88 | class(df["sulfate"]) 89 | [1] "data.frame" 90 | # Note for example the mean function "expects" a vector and 91 | # will return NA and give a not very informative message if you 92 | # "feed it" a data frame 93 | 94 | mean(df["sulfate"]) 95 | [1] NA 96 | Warning message: 97 | In mean.default(df["sulfate"]) : 98 | argument is not numeric or logical: returning NA 99 | 100 | # If pollutant is an R variable containing the text string "sulfate" 101 | # then these will work to extract the column as a vector 102 | pollutant <- "sulfate" 103 | df[[pollutant]] 104 | [1] 4.79 1.46 4.28 NA 105 | # or 106 | df[, pollutant] 107 | [1] 4.79 1.46 4.28 NA 108 | 109 | # BUT NOT 110 | df$pollutant 111 | 
NULL 112 | ``` 113 | 114 | `df$pollutant` does NOT work since pollutant is NOT an actual column name; it is a variable *containing* 115 | the text string sulfate which is not acceptable for the \$ form of getting/extracting a 116 | column from a data frame as a vector (those are the R "rules" and we have to live with them). 117 | And note R does NOT even warn you about this type of mistake - it just cheerfully gives back 118 | NULL which can lead to v e r y mysterious bugs. Similarly, mistyping the name of a column in the 119 | following example commands results in NULL (with NO warning): `df$sulfffate` and also `df[["sulffffate"]]`. 120 | Programming requires very careful attention to details - one might be tempted to think R should be 121 | able to "figure out" what you meant, but recall what type of mischief an "auto correct" in a word 122 | processor or message app can create - and in a programming language you wouldn't even get to view 123 | in real time what the "compiler" had done to your code. Better to know that if you program correctly 124 | exactly what you want, some "gremlin" won't be changing it! 125 | 126 | 127 | ## Review of how to get specified subsets of a data frame: B. 
subsetting rows and/or columns 128 | 129 | If v is a vector of row indices (that are in the range of the number of rows of the data frame df) 130 | one can get the rows of df corresponding to v 131 | 132 | ``` 133 | df <- data.frame(sulfate = c(4.79, 1.46, 4.28, NA), nitrate = c(0.299, NA, 4.280, 3.560)) 134 | df 135 | sulfate nitrate 136 | 1 4.79 0.299 137 | 2 1.46 NA 138 | 3 4.28 4.280 139 | 4 NA 3.560 140 | 141 | v <- c(1, 3, 2, 2, 2) 142 | df[v, ] 143 | sulfate nitrate 144 | 1 4.79 0.299 145 | 3 4.28 4.280 146 | 2 1.46 NA 147 | 2.1 1.46 NA 148 | 2.2 1.46 NA 149 | 150 | # note reordering and repeats are allowed 151 | # note the indication of repeats in the row numbers R generates (R "does not like" 152 | # duplicate row names and so does modifications to make them unique) 153 | ``` 154 | 155 | Note R does not issue warnings or errors for indices that are "out of range", 156 | it just fills in NA's 157 | 158 | ``` 159 | v <- c(1, 3, 2, 6, 2) # 6 is out of the range of the number of rows of df 160 | df[v, ] 161 | sulfate nitrate 162 | 1 4.79 0.299 163 | 3 4.28 4.280 164 | 2 1.46 NA 165 | NA NA NA 166 | 2.1 1.46 NA 167 | ``` 168 | 169 | One can use negative row indices to **exclude** those rows: 170 | 171 | ``` 172 | df 173 | sulfate nitrate 174 | 1 4.79 0.299 175 | 2 1.46 NA 176 | 3 4.28 4.280 177 | 4 NA 3.560 178 | 179 | v <- c(-1, -3) # exclude rows 1, 3 and keep the rest 180 | df[v, ] 181 | sulfate nitrate 182 | 2 1.46 NA 183 | 4 NA 3.56 184 | ``` 185 | 186 | One can also specify desired columns (and repeats of columns) 187 | 188 | ``` 189 | v <- c(1, 3, 2, 2, 2) 190 | # if we just want the first column (sulfate) with these rows 191 | # we can do 192 | df[v, 1] 193 | [1] 4.79 4.28 1.46 1.46 1.46 194 | # or 195 | df[v, "sulfate"] 196 | [1] 4.79 4.28 1.46 1.46 1.46 197 | # or 198 | df[v, ]$sulfate 199 | [1] 4.79 4.28 1.46 1.46 1.46 200 | 201 | # we can also specify several columns 202 | w <- c(1, 2, 1, 2) 203 | df[v, w] 204 | sulfate nitrate sulfate.1 nitrate.1 
205 | 1 4.79 0.299 4.79 0.299 206 | 3 4.28 4.280 4.28 4.280 207 | 2 1.46 NA 1.46 NA 208 | 2.1 1.46 NA 1.46 NA 209 | 2.2 1.46 NA 1.46 NA 210 | ``` 211 | 212 | Note R "does not like" duplicate column names and so modifies them to make them unique. 213 | 214 | Also one can use a logical vector V having one entry for each row of df; rows where the 215 | corresponding entry of V is TRUE are kept, rows where the corresponding entry of V is FALSE are 216 | not kept. 217 | 218 | ``` 219 | df 220 | sulfate nitrate 221 | 1 4.79 0.299 222 | 2 1.46 NA 223 | 3 4.28 4.280 224 | 4 NA 3.560 225 | 226 | logicalVector <- c(T, F, F, T) 227 | df[logicalVector, ] 228 | sulfate nitrate 229 | 1 4.79 0.299 230 | 4 NA 3.560 231 | ``` 232 | 233 | 234 | ## The **which** function 235 | 236 | If one has a vector V, one can ask for which entries of V is some logical condition TRUE; R's **which** function 237 | does this: the conceptual description is 238 | 239 | which(some logical condition on each entry of V) 240 | 241 | returns the vector of the **indices** of V for which the condition is TRUE. 242 | (Any entries of V that are NA will be considered to yield FALSE, so those indices of V will **not** be included 243 | in the result.) 
If there are no indices for which the condition is TRUE, the which function returns an empty 244 | integer vector (integer(0)). 245 | 246 | For example 247 | 248 | ``` 249 | df 250 | sulfate nitrate 251 | 1 4.79 0.299 252 | 2 1.46 NA 253 | 3 4.28 4.280 254 | 4 NA 3.560 255 | 256 | result <- which(df[["sulfate"]] > 2) 257 | result 258 | [1] 1 3 259 | # any entries of V that are NA are considered to yield FALSE 260 | 261 | # one can then use result to get the rows of df for which the condition was TRUE 262 | df[result, ] 263 | sulfate nitrate 264 | 1 4.79 0.299 265 | 3 4.28 4.280 266 | 267 | # Note the result of having an NA involved in the following: 268 | 269 | df # repeating what df is 270 | sulfate nitrate 271 | 1 4.79 0.299 272 | 2 1.46 NA 273 | 3 4.28 4.280 274 | 4 NA 3.560 275 | 276 | V <- df[["sulfate"]] > 2 # a logical vector with a value for each row of df 277 | V 278 | [1] TRUE FALSE TRUE NA 279 | 280 | df[V, ] # keeps rows of the data frame where V is TRUE, but note the effect of the NA 281 | sulfate nitrate 282 | 1 4.79 0.299 283 | 3 4.28 4.280 284 | NA NA NA 285 | ``` 286 | 287 | I like the conceptual viewpoint of the which function. Apparently it is rather universal in 288 | that, for example, **IDL** has the corresponding function called **where** and **MATLAB** has a 289 | corresponding function called **find**. 290 | 291 | 292 | ## The **%in%** function 293 | 294 | The **%in%** function addresses the question of whether or not each entry of some vector v occurs in another vector w. 295 | It returns a logical vector z with z[k] being TRUE if v[k] is equal to some entry in w, and z[k] FALSE if 296 | v[k] is not equal to any entry in w. 
This can be used to obtain a logical vector for use in selecting 297 | rows of a data frame 298 | 299 | An example of how %in% behaves: 300 | 301 | ``` 302 | ?"%in%" # look at the help on %in% Note because of the special 303 | # character % one needs to "protect" %in% by enclosing it in 304 | # either quotes or apostrophes when "asking for help" on it 305 | 306 | v <- c(1, 2, 3, 4, 5, NA) 307 | w <- c(12, 3, 8, 22, 4) 308 | v %in% w 309 | [1] FALSE FALSE TRUE TRUE FALSE FALSE 310 | 311 | v <- c(1, 2, 3, 4, 5, NA) 312 | w <- c(12, 3, 8, 22, 4, NA) 313 | v %in% w 314 | [1] FALSE FALSE TRUE TRUE FALSE TRUE 315 | 316 | # note %in% will declare a match for an NA in v if there is an NA in w 317 | ``` 318 | 319 | 320 | ## Creating data frames 321 | 322 | ### Reading in a data file as a data frame 323 | 324 | 325 | As noted above, data frames are a fundamental way that R handles data. Many data files that are text files (as opposed 326 | to binary files) are naturally suitable for reading in as a data frame using for example **read.csv** 327 | (where the column separator (delimiter) is a comma), or 328 | more generally **read.table** The options for read.table also apply for read.csv but note some of the important 329 | default choices are different, in particular the default for "telling R" whether there is a header line containing 330 | column names is **header = TRUE** for read.csv and **header = FALSE** for read.table, and the default column separator 331 | for read.csv is `sep = ","` while 332 | for read.table one should usually specify it since the default is "white space"; a common column separator other 333 | than comma is a tab, which is specified by `sep = "\t"` (the backslash "tells" R to interpret the t in a special way). 
334 | 335 | To start with, when learning R, there are 2 other options one should be aware of: **stringsAsFactors = FALSE** "tells" R to 336 | read in character columns as character data, not as factors (factors were the default before R 4.0.0; from R 4.0.0 on the default is already FALSE). Unless a column is to be used 337 | as a factor in a statistical analysis it is likely better to have it read in as character data. 338 | 339 | The second option one should be aware of when starting to learn read.csv and read.table is **na.strings**. This option lets 340 | one specify the character string (or several character strings) that should be interpreted as NA (the default is `"NA"`). 341 | For example `na.strings = c("NA", "data is missing", "not available", "the experimenter dropped the sample", "the experimenter was texting when 342 | the data should have been measured")` If there is character data other than NA signifying missing data in a column that 343 | should be read in as numeric data, and R is not "informed" about this, then R will read in that column as factor or 344 | character data, which can lead to "issues" that are best avoided by properly reading in the data. 345 | 346 | Another option to be aware of if one is dealing with a file that has "non-standard for R" column 347 | names is **check.names**: if it is set to TRUE (the default), R will modify the column names it reads in to conform with what 348 | R considers standard. That means blank spaces and many characters that are not letters will get replaced by a period. 349 | I find this rather annoying since I like to use long descriptive column headers in files I create, and as long as I am not 350 | having R use the column names other than to write them back out after I have done some analysis on the data, it is OK to 351 | instruct R to leave the column names alone (by setting check.names = FALSE). 352 | 353 | 354 | ## Creating a data frame from smaller data frames and/or vectors and matrices: A. 
the **data.frame** function 355 | 356 | Looking over the R help on **data.frame**, one sees that it can combine objects that are or can be converted to be 357 | data frames into one combined data frame. As with read.csv 358 | and read.table, one may well want to use the option **stringsAsFactors = FALSE** 359 | (unless one needs to have one or more factor columns). (The data frame function will do *recycling* on rows 360 | but I would recommend having the number of rows in objects being combined into a data frame all be the same.) 361 | Note the R **rep** (replicate) function can be used to replicate "patterns", for example 362 | 363 | ``` 364 | rep(c(1,2), times = 4) # repeat the pattern 4 times 365 | [1] 1 2 1 2 1 2 1 2 366 | 367 | # rep can also be used this way: 368 | rep(c(1,2), each = 4) 369 | [1] 1 1 1 1 2 2 2 2 370 | ``` 371 | 372 | ## Some examples with the data.frame function: 373 | 374 | ``` 375 | df1 <- data.frame(sulfate = c(4.79, 1.46, 4.28, NA), nitrate = c(0.299, NA, 4.280, 3.560)) 376 | df1 # our continuing example data frame 377 | sulfate nitrate 378 | 1 4.79 0.299 379 | 2 1.46 NA 380 | 3 4.28 4.280 381 | 4 NA 3.560 382 | 383 | v1 <- c(1,2,3,4) 384 | v2 <- c(TRUE, FALSE, TRUE, TRUE) 385 | v3 <- c("a", "b", "c", "d") 386 | 387 | m1 <- matrix(1:12, nrow = 4, ncol = 3) 388 | m1 389 | [,1] [,2] [,3] 390 | [1,] 1 5 9 391 | [2,] 2 6 10 392 | [3,] 3 7 11 393 | [4,] 4 8 12 394 | 395 | df <- data.frame(df1, v1, v2, m1, v3, stringsAsFactors = FALSE) 396 | df 397 | sulfate nitrate v1 v2 X1 X2 X3 v3 398 | 1 4.79 0.299 1 TRUE 1 5 9 a 399 | 2 1.46 NA 2 FALSE 2 6 10 b 400 | 3 4.28 4.280 3 TRUE 3 7 11 c 401 | 4 NA 3.560 4 TRUE 4 8 12 d 402 | 403 | # The row names of df are the row names of the first argument of data.frame, i.e., 404 | # the first of the objects being combined into one data frame 405 | 406 | # If one wants to change some column names one could do, for example, 407 | 408 | colnames(df)[c(5, 6, 7)] <- c("m1", "m2", "m3") 409 | df 410 | sulfate 
nitrate v1 v2 m1 m2 m3 v3 411 | 1 4.79 0.299 1 TRUE 1 5 9 a 412 | 2 1.46 NA 2 FALSE 2 6 10 b 413 | 3 4.28 4.280 3 TRUE 3 7 11 c 414 | 4 NA 3.560 4 TRUE 4 8 12 d 415 | 416 | # and similarly with row names 417 | rownames(df) <- c("r1", "r2", "r3", "r4") 418 | df 419 | sulfate nitrate v1 v2 m1 m2 m3 v3 420 | r1 4.79 0.299 1 TRUE 1 5 9 a 421 | r2 1.46 NA 2 FALSE 2 6 10 b 422 | r3 4.28 4.280 3 TRUE 3 7 11 c 423 | r4 NA 3.560 4 TRUE 4 8 12 d 424 | ``` 425 | 426 | ## Creating a data frame from smaller data frames and/or vectors and matrices: B. **rbind** 427 | 428 | If one has several data frames that have the same number of columns AND the same column names, 429 | then one can "stack them vertically" using the **rbind** (row bind) function. For example with 2 data frames 430 | df1 and df2 431 | 432 | ``` 433 | df1 <- data.frame(sulfate = c(4.79, 1.46, 4.28, NA), nitrate = c(0.299, NA, 4.280, 3.560)) 434 | df1 435 | sulfate nitrate 436 | 1 4.79 0.299 437 | 2 1.46 NA 438 | 3 4.28 4.280 439 | 4 NA 3.560 440 | 441 | df2 <- data.frame(sulfate = c(24.79, 21.46, 24.28, NA), nitrate = c(2.299, NA, 2.280, 2.560)) 442 | df2 443 | sulfate nitrate 444 | 1 24.79 2.299 445 | 2 21.46 NA 446 | 3 24.28 2.280 447 | 4 NA 2.560 448 | 449 | # then one can do 450 | df <- rbind(df1, df2) 451 | df 452 | sulfate nitrate 453 | 1 4.79 0.299 454 | 2 1.46 NA 455 | 3 4.28 4.280 456 | 4 NA 3.560 457 | 5 24.79 2.299 458 | 6 21.46 NA 459 | 7 24.28 2.280 460 | 8 NA 2.560 461 | 462 | # The column names for all the items being "rbinded" must be the same 463 | # (except for an important exception described below). 
464 | 465 | # will get an error message if the column names don't match: for example below I have 466 | # the column names in df2 not matching those in df1 467 | 468 | colnames(df2)[2] <- "another.name" 469 | df2 470 | sulfate another.name 471 | 1 24.79 2.299 472 | 2 21.46 NA 473 | 3 24.28 2.280 474 | 4 NA 2.560 475 | 476 | df <- rbind(df1, df2) # gets error message 477 | Error in match.names(clabs, names(xi)) : 478 | names do not match previous names 479 | ``` 480 | 481 | 482 | ## Creating a data frame from smaller data frames and/or vectors and matrices: C. Using rbind in a loop 483 | 484 | 485 | In some circumstances one might be reading in or constructing a succession of data frames, each with 486 | the same number of columns and the same column names, and want to combine them vertically. 487 | One can do this in a loop if one initializes an empty data frame via `df <- data.frame()` 488 | 489 | One can rbind any data.frame to this empty data frame; this is the exception to the rule 490 | on same number of columns and column names so then this "conceptual" for loop will work: 491 | 492 | ``` 493 | df <- data.frame() # initialize an empty data frame 494 | for (i in some.set) { 495 | read in or derive a data frame dfi corresponding to i (each dfi must have the same 496 | number of columns and the same column names) 497 | df <- rbind(df, dfi) 498 | } 499 | 500 | # after this loop the data frame df will consist of all the data frames dfi stacked vertically 501 | ``` 502 | 503 | 504 | ## Creating a data frame from smaller data frames and/or vectors and matrices: D. the **cbind** function 505 | 506 | 507 | The **cbind** (column bind) function can combine data frames or combine objects that are or can be converted to be 508 | data frames. cbind is the same as data.frame except that the default for cbind is check.names = FALSE 509 | 510 | 511 | Hope this review was informative. 512 | The next set of exercises will get into practicing using and creating data frames. 
513 | 514 | = = = = = = = = = = = = = = = = = = = = = = = = 515 | 516 | This work is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. 517 | To view a copy of this license, visit https://creativecommons.org/licenses/by-nc-sa/4.0/ or send a letter 518 | to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA. There is a full version of this license at this web site: 519 | https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode 520 | 521 | Some of the material above (Review of how to get specified subsets of a data frame: getting a single column) was 522 | taken/modified from a post of mine in the Discussion Forum for the R Programming Course in 523 | the Johns Hopkins Data Science Specialization on Coursera, as noted above. 524 | As such Coursera and Coursera authorized Partners retain additional rights to that material as described in 525 | their "Terms of Use" https://www.coursera.org/about/terms 526 | 527 | Note the reader should not infer any endorsement or recommendation or approval for the material in this article from 528 | any of the sources or persons cited above or any other entities mentioned in this article. 529 | 530 | 531 | -------------------------------------------------------------------------------- /Eighth-R-Practice-exercise-composing-a-function-and-constructing-a-data-frame-files-not-containing-search-strings.md: -------------------------------------------------------------------------------- 1 | Eighth R Practice exercise composing a function and constructing a data frame; files not containing search strings 2 | ------------------------------------------------------------------------------------------------------------------ 3 | 4 | ### Alan E. Berger Feb 2, 2020 5 | 6 | ### available at 7 | 8 | Introduction 9 | ------------ 10 | 11 | This is the eighth in a sequence of programming exercises in "composing" an R function to carry out a particular task. 
Several of these "exercise files" likely 12 | will take several sessions to master the content. The material below practices composing a logical sequence of steps to program a function that will accomplish a specified task, and preparing a corresponding data frame. 13 | 14 | The idea of this set of exercises is to practice correct use of R constructs and built in functions (functions that "come with" the basic R installation), while learning how to "put together" a correct sequence of blocks of commands that will obtain the desired result. 15 | Note these exercises are quite cumulative - one should do them in order. 16 | 17 | In these exercises, there will be a statement of what your function should do (what are the input variables and what the function should return) and a sequence of "hints". To get the most out of these exercises, try to write your function using as few hints as possible. 18 | Note there are often several ways to write a function that will obtain the correct result. For these exercises the directions and hints may point toward a particular approach intended to practice particular constructs in R and a particular line of reasoning, even if there is a more efficient way to obtain the same result. 19 | There may also be an existing R function or package that will do what is stated for a given practice exercise, but here the point is to practice formulating a logical sequence of steps, with each step a section of code, to obtain a working function, not to find an existing solution or a quick solution using a more powerful R construct that is better addressed later on. 20 | 21 | Motivation for this exercise 22 | ---------------------------- 23 | 24 | We will compose a function that returns a vector of the names of files in a folder that do NOT contain any of the entries of search.strings in their name. 
This will be an opportunity to practice use of the **%in%** function; and then use of the **setdiff** function applied to two vectors: `setdiff(V1, V2)` for vectors V1 and V2 of the same type (e.g., numeric or character) will give a vector consisting of the entries of V1 that are **not** equal to any entry of V2. 25 | 26 | In the previous (seventh) exercise file we prepared the function 27 | 28 | **search\_for\_filenames\_matching\_any\_of\_the\_patterns\_and\_output\_file\_info**(directory, search.strings) 29 | 30 | that returns a data frame containing information on the file names in directory that match **ANY entry** of search.strings (i.e., that have 1 or more of the entries of search.strings in their file name). The first column of the returned data frame has the names of the files. The final version is copied here: 31 | 32 | ``` r 33 | search_for_filenames_matching_any_of_the_patterns_and_output_file_info <- 34 | function(directory, search.strings){ 35 | 36 | # directory is an absolute path (full path) or a path relative to the R working directory to the 37 | # folder to be searched. If want to search the R working directory itself, 38 | # can set directory = "." with the line of code: directory <- "." 39 | # or could set directory to be the full path to the R working directory. 40 | 41 | # search.strings is a character string vector 42 | 43 | # Return a data frame of the file names (not including folders) in directory that contain 44 | # ANY of the entries of the search.strings vector somewhere in their file name, 45 | # (or at the beginning of the filename, or at the end of the filename, if so specified). 46 | # The search will be case insensitive (treats lower case and upper case letters as the same).
47 | 48 | # The first step is to initialize the filenames vector: filenames <- character(0) 49 | 50 | # In a for loop, use R's list.files function to list all the files (file names) matching the 51 | # entries of search.strings, one by one, 52 | # eliminating names of folders, and appending them to filenames 53 | 54 | # Use the unique function to eliminate duplicates (keep only 1 copy of each file name) 55 | 56 | # For the files whose names contain any of the character strings in search.strings, use 57 | # R's file.info function to get the file size and last modification time as in the previous function. 58 | 59 | # The output data frame will contain the file size and the last modification time 60 | # for each file that has any member of search.strings somewhere in its file name 61 | # (or at the beginning or at the end of the file name, if so specified) 62 | # 63 | # We will get the file names without the folder path leading to the files included in the name. 64 | 65 | # check that search.strings is a non-empty character vector 66 | 67 | ns <- length(search.strings) 68 | if(ns < 1) stop("no entries in search.strings") 69 | if(!is.character(search.strings)) stop("search.strings is not a character vector") 70 | 71 | # We will return a data frame whose first column contains the files in directory 72 | # that contain any character string in the search.strings vector somewhere in their name. 73 | # The second column will contain the last time (and date) the file was modified, 74 | # and the third column will be the file size (in bytes). 75 | 76 | # The first step is to initialize the filenames vector 77 | filenames <- character(0) 78 | 79 | # Then in a for loop, for each entry S of search strings, 80 | # use list.files to get the vector V the file names in directory that 81 | # contain S in their file name and append V to filenames (after eliminating any folder names). 82 | # do not include the path to the file in the file name. 
83 | 84 | # To eliminate names of any folders (directories) that are in V: 85 | # do paste(directory, "/", V, sep = "") to get the filenames including either 86 | # the relative path from the R working directory or the absolute path (depending on what 87 | # directory is); use these names in the R dir.exists function to check for folder 88 | # (directory) names in filenames 89 | 90 | for (k in 1:ns) { 91 | V <- list.files(directory, pattern = search.strings[k], 92 | full.names = FALSE, ignore.case = TRUE) 93 | # exclude directory names from V (we need to do this since we are "adding" 94 | # the file names in V to filenames and want only file names, not folder names 95 | # if V is empty (character(0)) then skip this 96 | if(length(V) > 0) V <- V[!dir.exists(paste(directory, "/", V, sep = ""))] 97 | filenames <- c(filenames, V) 98 | } 99 | 100 | ### It is important to note we could have also "run the for loop" in this fashion: 101 | ### for (S in search.strings) { 102 | ### V <- list.files(directory, pattern = S, 103 | ############## rest of the for loop 104 | ### 105 | 106 | nf <- length(filenames) 107 | if(nf == 0) { 108 | print("no files contain any of the search strings") 109 | return("no files contain any of the search strings") 110 | } 111 | 112 | filenames <- unique(filenames) 113 | nf <- length(filenames) # need to do this since may have eliminated some duplicate name(s) 114 | 115 | # If got to here, at least 1 file has a character string in search.strings 116 | # in its name, so get the information on these file(s) into a data frame. 
117 | 118 | ################################## get the data frame to be output 119 | 120 | # Get the desired output data frame using vectors 121 | 122 | dfcolnames <- c("file.name", "modif.date", "size.in.bytes") 123 | # initialize the 3 vectors that will hold this information on the files 124 | # whose names matched any of the members of search.strings 125 | 126 | fname <- character(0) 127 | fdate <- character(0) 128 | fsize <- numeric(0) 129 | 130 | for(k in 1:nf) { 131 | finfo <- file.info(paste(directory, "/", filenames[k], sep = "")) 132 | # needed to include the path to the file so file.info can locate it 133 | fname <- c(fname, filenames[k]) 134 | fdate <- c(fdate, as.character(finfo$mtime)) 135 | fsize <- c(fsize, finfo$size) 136 | } 137 | 138 | df <- data.frame(fname, fdate, fsize, stringsAsFactors = FALSE) 139 | colnames(df) <- dfcolnames 140 | 141 | ################################## finished getting the data frame to be output 142 | 143 | # Write the data frame out to a tab delimited text file called scrlisting.txt in directory 144 | # (i.e., in the folder specified by the argument directory this function was called with). 145 | 146 | outpfilename <- paste(directory, "/", "scrlisting.txt", sep = "") 147 | # One can rename this "scratch file" as desired after viewing it (best viewed in Excel or equivalent). 
148 | write.table(df, file = outpfilename, 149 | append = FALSE, quote = FALSE, sep = "\t", 150 | row.names = FALSE, col.names = TRUE) 151 | # This call to write.table will write out a data frame 152 | # as one would usually want; it specifies the column separator to be a tab 153 | 154 | return(df) 155 | } 156 | ``` 157 | 158 | Exercises 159 | --------- 160 | 161 | The function for this exercise will be to construct a modified version of the search function called 162 | 163 | search_for_filenames_matching_NONE_of_the_patterns_and_output_file_names <- 164 | function(directory, search.strings) 165 | 166 | that will return the vector of file names in directory that do **NOT** match **ANY entry** of search.strings. 167 | 168 | One way to view this is that the first function in this sequence obtained the **intersection** of the sets of file names matching each entry of search.strings, while the previous function (the function above) obtained the **union** U of the sets of file names matching each entry of search.strings. This programming exercise will be to construct a version that will return the vector of file names in directory that **DO NOT match ANY** entry of search.strings. This can be viewed as obtaining, relative to the set of all the files in directory, the **complement of U**. 169 | 170 | One could as in the previous functions return information on these files, but this is more a practice exercise in using the %in% function (and in the next exercise, using the setdiff function) so we are just going to concentrate on getting the vector of the file names. 171 | 172 | Hints: Use the 173 | 174 | **search\_for\_filenames\_matching\_any\_of\_the\_patterns\_and\_output\_file\_info** 175 | 176 | function to find the file names in directory that match one or more of the entries in search.strings; call this vector **Vany**. Then remove these file names from the vector of all the file names in directory.
One approach for this is to use **list.files** to get the vector of all the file and folder names in directory, then remove the folder names from this vector, resulting in the vector **Vall**. Then construct a logical vector Vlogical the same length as Vall such that: Vlogical\[k\] is TRUE if Vall\[k\] is **not** an entry of Vany (and Vlogical\[k\] is FALSE if Vall\[k\] is an entry of Vany). The `%in%` function is very convenient for this (and this is good practice using it). 177 | 178 | Recall from the fifth article in this series: The **%in%** function addresses the question of whether or not each entry of some vector v occurs in another vector w: `z <- v %in% w` obtains a logical vector z with z\[k\] being TRUE if v\[k\] is equal to some entry in w, and z\[k\] being FALSE if v\[k\] is not equal to any entry in w. Note then the logical vector `nz <- !z` has the property that nz\[k\] will be TRUE if v\[k\] is NOT an entry of w, and nz\[k\] will be FALSE if v\[k\] is an entry of w; hence v\[nz\] gives the entries of v that are NOT in w. 179 | 180 | An example of how %in% behaves: 181 | 182 | ?"%in%" # look at the help on %in% Note because of the special 183 | # character % one needs to "protect" %in% by enclosing it in 184 | # either quotes or apostrophes when "asking for help" on it 185 | 186 | v <- c(1, 2, 3, 4, 5, NA) 187 | w <- c(12, 3, 8, 22, 4) 188 | v %in% w 189 | [1] FALSE FALSE TRUE TRUE FALSE FALSE 190 | 191 | v <- c(1, 2, 3, 4, 5, NA) 192 | w <- c(12, 3, 8, 22, 4, NA) 193 | v %in% w 194 | [1] FALSE FALSE TRUE TRUE FALSE TRUE 195 | 196 | # note %in% will declare a match for an NA in v if there is an NA in w 197 | 198 | From the discussion above, Vall\[Vlogical\] is the desired vector of file names, and we can obtain Vlogical using the `%in%` function and logical negation (!). Try doing this - a working version is given below.
199 | 200 | ``` r 201 | search_for_filenames_matching_NONE_of_the_patterns_and_output_file_names <- 202 | function(directory, search.strings){ 203 | 204 | # directory is an absolute path (full path) or a path relative to the R working directory to the 205 | # folder to be searched. If want to search the R working directory itself, 206 | # can set directory = "." 207 | # or could set directory to be the full path to the R working directory. 208 | 209 | # search.strings is a text string vector 210 | 211 | # Return a vector of the file names (not including folders) in directory that contain 212 | # NONE of the entries of the search.strings vector in their file name, 213 | # the search will be case insensitive (treats lower case and upper case letters as the same). 214 | # Since this is mainly an exercise in using the %in% function, just return the vector of file 215 | # names (and don't bother with getting the last modification data or file size). 216 | 217 | # Use R's list.files function to list all the files (file names) in directory 218 | # then eliminate names of folders; put the result in Vall 219 | 220 | # Then use the previous function 221 | # search_for_filenames_matching_any_of_the_patterns_and_output_file_info 222 | # to find all the files in directory that match some entry in search.strings 223 | # and put the vector of these file names in Vany 224 | 225 | # Then eliminate from Vall any entries that are in Vany using the %in% function 226 | # and logical negation (!) 227 | 228 | # We will get the file names without the folder path leading to the files included in the name. 
229 | 230 | # check that search.strings is a non-empty character vector 231 | 232 | ns <- length(search.strings) 233 | if(ns < 1) stop("no entries in search.strings") 234 | if(!is.character(search.strings)) stop("search.strings is not a character vector") 235 | 236 | Vall <- list.files(directory, full.names = FALSE, ignore.case = TRUE) 237 | nf <- length(Vall) 238 | if(nf == 0) { 239 | print("no files in directory") 240 | return("no files in directory") 241 | } 242 | # exclude directory names from Vall 243 | Vall <- Vall[!dir.exists(paste(directory, "/", Vall, sep = ""))] 244 | 245 | nf <- length(Vall) 246 | if(nf == 0) { 247 | print("no files in directory") 248 | return("no files in directory") 249 | } 250 | 251 | # now get Vany 252 | df.Vany <- 253 | search_for_filenames_matching_any_of_the_patterns_and_output_file_info(directory, search.strings) 254 | 255 | # need to handle the case that NO file names were a match for any of the search strings 256 | # in which case df.Vany is the character string: "no files contain any of the search strings" 257 | # rather than a data frame 258 | if(class(df.Vany) == "character") { 259 | Vany <- character(0) 260 | } else { 261 | Vany <- df.Vany$file.name # the column of file names 262 | } 263 | # When programming, one should always be sure the function handles "extreme cases", 264 | # here that would be when NONE the file names in directory match some entry in search.strings, 265 | # and when ALL of the file names match search.strings 266 | 267 | # eliminate names in Vany from Vall 268 | 269 | Vlogical <- !(Vall %in% Vany) # we want entries in Vall that are NOT in Vany so need to use ! 
270 | filenames <- Vall[Vlogical] # the desired file names (names NOT in Vany) 271 | 272 | nf <- length(filenames) 273 | if(nf == 0) { 274 | print("all files contain contain an entry of search strings") 275 | return("all files contain contain an entry of search strings") 276 | } 277 | 278 | # If got to here, at least 1 file has no entry of search.strings in its name 279 | return(filenames) # here we are just returning the file names without other information on them 280 | } 281 | ``` 282 | 283 | ### Test runs on my computer 284 | 285 | Recalling from the previous two exercise files, I have constructed in my R working directory a folder called test\_dir containing a small number of files with names I picked to conveniently test the functions that search for file names satisfying various conditions. Here are all the file names (and the one folder name) in the test\_dir folder: 286 | 287 | directory <- "test_dir" 288 | list.files(directory) # 9 files and 1 folder 289 | [1] "001.csv" "002.csv" "003.txt" 290 | [4] "004csvfile.txt" "005txt2csv" "1.csv" 291 | [7] "308.csv" "folder001txtcsv" "scrlisting.txt" 292 | [10] "txt.csv" 293 | 294 | We will use this folder and files (9 filenames and 1 folder name) to test the search function written immediately above (you can construct a similar test\_dir folder and files with these filenames and also a folder in it called "folder001txtcsv" to run tests, the only difference will be the dates and sizes of the files). Or you could test the function using a suitable folder in your computer for which you can pick search.strings so that you will get a known reasonable length vector for the test run. 
295 | 296 | Some test runs 297 | -------------- 298 | 299 | directory <- "test_dir" 300 | list.files(directory) # 9 files and 1 folder 301 | [1] "001.csv" "002.csv" "003.txt" 302 | [4] "004csvfile.txt" "005txt2csv" "1.csv" 303 | [7] "308.csv" "folder001txtcsv" "scrlisting.txt" 304 | [10] "txt.csv" 305 | 306 | search.strings <- c("2", "3", "4", "5") 307 | search_for_filenames_matching_NONE_of_the_patterns_and_output_file_names(directory, search.strings) 308 | [1] "001.csv" "1.csv" "scrlisting.txt" "txt.csv" 309 | 310 | search.strings <- c("csv") 311 | search_for_filenames_matching_NONE_of_the_patterns_and_output_file_names(directory, search.strings) 312 | [1] "003.txt" "scrlisting.txt" 313 | 314 | # test case when all the file names match some member of search.strings 315 | search.strings <- c("csv", "txt") 316 | search_for_filenames_matching_NONE_of_the_patterns_and_output_file_names(directory, search.strings) 317 | [1] "all files contain contain an entry of search strings" 318 | [1] "all files contain contain an entry of search strings" 319 | # when there are no such files (no files in directory that don't have an entry of search.strings 320 | # in their file name), the search function both prints this message and returns it, so here it is 321 | # output twice 322 | 323 | search.strings <- c("00") 324 | search_for_filenames_matching_NONE_of_the_patterns_and_output_file_names(directory, search.strings) 325 | [1] "1.csv" "308.csv" "scrlisting.txt" "txt.csv" 326 | 327 | # test case when none of the file names match some member of search.strings, 328 | # one should get all the file names (but not the folder name) 329 | search.strings <- c("00987") 330 | search_for_filenames_matching_NONE_of_the_patterns_and_output_file_names(directory, search.strings) 331 | [1] "no files contain any of the search strings" 332 | [1] "001.csv" "002.csv" "003.txt" "004csvfile.txt" 333 | [5] "005txt2csv" "1.csv" "308.csv" "scrlisting.txt" 334 | [9] "txt.csv" 335 | 336 | # note [1] "no 
files contain any of the search strings" came from 337 | # "looking for" the files that matched ANY entry of search.strings - 338 | # in this case there were none, and so one obtained all the file names 339 | # in test_dir (and, correctly, not the folder name) 340 | 341 | The next exercise is to use the **setdiff** function in place of using the %in% function to get the file names that are in Vall but not in Vany. 342 | 343 | Hint: In the function above you simply need to replace the following 2 lines with 1 line using setdiff 344 | 345 | Vlogical <- !(Vall %in% Vany) # we want entries in Vall that are NOT in Vany so need to use ! 346 | filenames <- Vall[Vlogical] # the desired file names (names NOT in Vany) 347 | 348 | A correct line using setdiff is: `filenames <- setdiff(Vall, Vany)` 349 | 350 | One should do the test runs again to check this works correctly. 351 | 352 | This exercise was intended to practice the very useful **%in%** function, and also illustrate that finding an existing R function (here setdiff) that can do exactly what you want can result in very concise, easy-to-understand code, which leaves less room for bugs to occur and to hide. 353 | 354 | Hope this was informative and good practice. The next set of exercises will address working with subsets of an individual row of a data frame. = = = = = = = = = = = = = = = = = = = = = = = = 355 | 356 | This work is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. To view a copy of this license, visit https://creativecommons.org/licenses/by-nc-sa/4.0/ or send a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.
There is a full version of this license at this web site: https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode 357 | -------------------------------------------------------------------------------- /Eighth-R-Practice-exercise-composing-a-function-and-constructing-a-data-frame-files-not-containing-search-strings.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: 3 | md_document: 4 | variant: markdown_github 5 | # output: pdf_document 6 | --- 7 | 8 | 9 | ## Eighth R Practice exercise composing a function and constructing a data frame; files not containing search strings 10 | 11 | ### Alan E. Berger Feb 2, 2020 12 | 13 | ### available at https://github.com/AlanBerger/Practice-programming-exercises-for-R 14 | 15 | ## Introduction 16 | 17 | This is the eighth in a sequence of programming exercises in "composing" an R function 18 | to carry out a particular task. Several of these "exercise files" likely 19 | will take several sessions to master the content. The material below practices composing a logical 20 | sequence of steps to program a function that will accomplish a specified task, and 21 | preparing a corresponding data frame. 22 | 23 | The idea of this set of exercises is to practice correct use of R constructs and 24 | built-in functions (functions that "come with" the basic R installation), while learning how 25 | to "put together" a correct sequence of blocks of commands that will obtain the desired result. 26 | Note these exercises are quite cumulative - one should do them in order. 27 | 28 | In these exercises, there will be a statement of what your function should do 29 | (what are the input variables and what the function should return) and a sequence of "hints". 30 | To get the most out of these exercises, try to write your function using as few hints as possible. 31 | Note there are often several ways to write a function that will obtain the correct result.
32 | For these exercises the directions and hints may point toward a particular approach intended to 33 | practice particular constructs in R and a particular line of reasoning, 34 | even if there is a more efficient way to obtain the same result. 35 | There may also be an existing R function or package that will do what is stated for a given 36 | practice exercise, but here the point is to practice formulating a logical sequence of steps, 37 | with each step a section of code, to obtain a working function, not to find an existing 38 | solution or a quick solution using a more powerful R construct that is better addressed later on. 39 | 40 | ## Motivation for this exercise 41 | 42 | We will compose a function that returns a vector of the names of files in a folder that do NOT contain any of the 43 | entries of search.strings in their name. This will be an opportunity to practice use of the **%in%** function; 44 | and then use of the **setdiff** function applied to two vectors: `setdiff(V1, V2)` for vectors V1 and V2 of 45 | the same type (e.g., numeric or character) will give a vector consisting of the entries of V1 that 46 | are **not** equal to any entry of V2. 47 | 48 | 49 | In the previous (seventh) exercise file we prepared the function 50 | 51 | **search_for_filenames_matching_any_of_the_patterns_and_output_file_info**(directory, search.strings) 52 | 53 | that returns a data frame containing information on the file names in directory that match **ANY entry** of 54 | search.strings (i.e., that have 1 or more of the entries of search.strings in their file name). 55 | The first column of the returned data frame has the names of the files. The final version is copied here: 56 | 57 | ``` {r} 58 | search_for_filenames_matching_any_of_the_patterns_and_output_file_info <- 59 | function(directory, search.strings){ 60 | 61 | # directory is an absolute path (full path) or a path relative to the R working directory to the 62 | # folder to be searched.
If want to search the R working directory itself, 63 | # can set directory = "." with the line of code: directory <- "." 64 | # or could set directory to be the full path to the R working directory. 65 | 66 | # search.strings is a character string vector 67 | 68 | # Return a data frame of the file names (not including folders) in directory that contain 69 | # ANY of the entries of the search.strings vector somewhere in their file name, 70 | # (or at the beginning of the filename, or at the end of the filename, if so specified). 71 | # The search will be case insensitive (treats lower case and upper case letters as the same). 72 | 73 | # The first step is to initialize the filenames vector: filenames <- character(0) 74 | 75 | # In a for loop, use R's list.files function to list all the files (file names) matching the 76 | # entries of search.strings, one by one, 77 | # eliminating names of folders, and appending them to filenames 78 | 79 | # Use the unique function to eliminate duplicates (keep only 1 copy of each file name) 80 | 81 | # For the files whose names contain any of the character strings in search.strings, use 82 | # R's file.info function to get the file size and last modification time as in the previous function. 83 | 84 | # The output data frame will contain the file size and the last modification time 85 | # for each file that has any member of search.strings somewhere in its file name 86 | # (or at the beginning or at the end of the file name, if so specified) 87 | # 88 | # We will get the file names without the folder path leading to the files included in the name. 
89 | 90 | # check that search.strings is a non-empty character vector 91 | 92 | ns <- length(search.strings) 93 | if(ns < 1) stop("no entries in search.strings") 94 | if(!is.character(search.strings)) stop("search.strings is not a character vector") 95 | 96 | # We will return a data frame whose first column contains the files in directory 97 | # that contain any character string in the search.strings vector somewhere in their name. 98 | # The second column will contain the last time (and date) the file was modified, 99 | # and the third column will be the file size (in bytes). 100 | 101 | # The first step is to initialize the filenames vector 102 | filenames <- character(0) 103 | 104 | # Then in a for loop, for each entry S of search strings, 105 | # use list.files to get the vector V the file names in directory that 106 | # contain S in their file name and append V to filenames (after eliminating any folder names). 107 | # do not include the path to the file in the file name. 108 | 109 | # To eliminate names of any folders (directories) that are in V: 110 | # do paste(directory, "/", V, sep = "") to get the filenames including either 111 | # the relative path from the R working directory or the absolute path (depending on what 112 | # directory is); use these names in the R dir.exists function to check for folder 113 | # (directory) names in filenames 114 | 115 | for (k in 1:ns) { 116 | V <- list.files(directory, pattern = search.strings[k], 117 | full.names = FALSE, ignore.case = TRUE) 118 | # exclude directory names from V (we need to do this since we are "adding" 119 | # the file names in V to filenames and want only file names, not folder names 120 | # if V is empty (character(0)) then skip this 121 | if(length(V) > 0) V <- V[!dir.exists(paste(directory, "/", V, sep = ""))] 122 | filenames <- c(filenames, V) 123 | } 124 | 125 | ### It is important to note we could have also "run the for loop" in this fashion: 126 | ### for (S in search.strings) { 127 | ### V 
<- list.files(directory, pattern = S, 128 | ############## rest of the for loop 129 | ### 130 | 131 | nf <- length(filenames) 132 | if(nf == 0) { 133 | print("no files contain any of the search strings") 134 | return("no files contain any of the search strings") 135 | } 136 | 137 | filenames <- unique(filenames) 138 | nf <- length(filenames) # need to do this since may have eliminated some duplicate name(s) 139 | 140 | # If got to here, at least 1 file has a character string in search.strings 141 | # in its name, so get the information on these file(s) into a data frame. 142 | 143 | ################################## get the data frame to be output 144 | 145 | # Get the desired output data frame using vectors 146 | 147 | dfcolnames <- c("file.name", "modif.date", "size.in.bytes") 148 | # initialize the 3 vectors that will hold this information on the files 149 | # whose names matched any of the members of search.strings 150 | 151 | fname <- character(0) 152 | fdate <- character(0) 153 | fsize <- numeric(0) 154 | 155 | for(k in 1:nf) { 156 | finfo <- file.info(paste(directory, "/", filenames[k], sep = "")) 157 | # needed to include the path to the file so file.info can locate it 158 | fname <- c(fname, filenames[k]) 159 | fdate <- c(fdate, as.character(finfo$mtime)) 160 | fsize <- c(fsize, finfo$size) 161 | } 162 | 163 | df <- data.frame(fname, fdate, fsize, stringsAsFactors = FALSE) 164 | colnames(df) <- dfcolnames 165 | 166 | ################################## finished getting the data frame to be output 167 | 168 | # Write the data frame out to a tab delimited text file called scrlisting.txt in directory 169 | # (i.e., in the folder specified by the argument directory this function was called with). 170 | 171 | outpfilename <- paste(directory, "/", "scrlisting.txt", sep = "") 172 | # One can rename this "scratch file" as desired after viewing it (best viewed in Excel or equivalent). 
173 | write.table(df, file = outpfilename, 174 | append = FALSE, quote = FALSE, sep = "\t", 175 | row.names = FALSE, col.names = TRUE) 176 | # This call to write.table will write out a data frame 177 | # as one would usually want; it specifies the column separator to be a tab 178 | 179 | return(df) 180 | } 181 | 182 | ``` 183 | 184 | ## Exercises 185 | 186 | The function for this exercise will be to construct a modified version of the search function called 187 | 188 | ``` 189 | search_for_filenames_matching_NONE_of_the_patterns_and_output_file_names <- 190 | function(directory, search.strings) 191 | ``` 192 | 193 | that will return the vector of file names in directory that do **NOT** match **ANY entry** of search.strings. 194 | 195 | One way to view this is that the first function in this sequence obtained the **intersection** of the 196 | sets of file names matching each entry of search.strings, while the previous function (the function above) 197 | obtained the **union** U of the sets of file names matching each entry of search.strings. 198 | This programming exercise will be to construct a version that will return 199 | the vector of file names in directory that **DO NOT match ANY** entry of search.strings. 200 | This can be viewed as obtaining, relative to the set of all the files in directory, the **complement of U**. 201 | 202 | One could as in the previous functions return information on 203 | these files, but this is more a practice exercise in using the %in% function (and in the next exercise, using 204 | the setdiff function) so we are just going to concentrate on getting the vector of the file names. 205 | 206 | Hints: Use the 207 | 208 | **search_for_filenames_matching_any_of_the_patterns_and_output_file_info** 209 | 210 | function to find the file names in directory that match one or more of the entries 211 | in search.strings; call this vector **Vany**. Then remove these file names from the vector of all the file names 212 | in directory.
One approach for this is to use **list.files** to get the vector of all the file and folder names 213 | in directory, then remove the folder names from this vector, resulting in the vector **Vall**. 214 | Then construct a logical vector Vlogical the same length as Vall such that: Vlogical[k] is TRUE if Vall[k] is 215 | **not** an entry of Vany (and Vlogical[k] is FALSE if Vall[k] is an entry of Vany). The `%in%` function 216 | is very convenient for this (and this is good practice using it). 217 | 218 | Recall from the fifth article in this series: The **%in%** function addresses the question of whether or 219 | not each entry of some vector v occurs in another vector w: `z <- v %in% w` obtains a logical 220 | vector z with z[k] being TRUE if v[k] is equal to some entry in w, 221 | and z[k] being FALSE if v[k] is not equal to any entry in w. Note then the 222 | logical vector `nz <- !z` has the property that nz[k] will be TRUE if v[k] is NOT an entry of w, and nz[k] will 223 | be FALSE if v[k] is an entry of w; hence v[nz] gives the entries of v that are NOT in w. 224 | 225 | An example of how %in% behaves: 226 | 227 | ``` 228 | ?"%in%" # look at the help on %in% Note because of the special 229 | # character % one needs to "protect" %in% by enclosing it in 230 | # either quotes or apostrophes when "asking for help" on it 231 | 232 | v <- c(1, 2, 3, 4, 5, NA) 233 | w <- c(12, 3, 8, 22, 4) 234 | v %in% w 235 | [1] FALSE FALSE TRUE TRUE FALSE FALSE 236 | 237 | v <- c(1, 2, 3, 4, 5, NA) 238 | w <- c(12, 3, 8, 22, 4, NA) 239 | v %in% w 240 | [1] FALSE FALSE TRUE TRUE FALSE TRUE 241 | 242 | # note %in% will declare a match for an NA in v if there is an NA in w 243 | ``` 244 | 245 | From the discussion above, Vall[Vlogical] is the desired vector of file names, and we can obtain Vlogical 246 | using the `%in%` function and logical negation (!). 247 | Try doing this - a working version is given below.
248 | 249 | ``` {r} 250 | search_for_filenames_matching_NONE_of_the_patterns_and_output_file_names <- 251 | function(directory, search.strings){ 252 | 253 | # directory is an absolute path (full path) or a path relative to the R working directory to the 254 | # folder to be searched. If you want to search the R working directory itself, 255 | # can set directory = "." 256 | # or could set directory to be the full path to the R working directory. 257 | 258 | # search.strings is a non-empty character vector (otherwise the function stops with an error) 259 | 260 | # Return a vector of the file names (not including folders) in directory that contain 261 | # NONE of the entries of the search.strings vector in their file name, 262 | # the search will be case insensitive (treats lower case and upper case letters as the same). 263 | # Since this is mainly an exercise in using the %in% function, just return the vector of file 264 | # names (and don't bother with getting the last modification date or file size). If there are no such files, a message string is printed and returned instead. 265 | 266 | # Use R's list.files function to list all the files (file names) in directory 267 | # then eliminate names of folders; put the result in Vall 268 | 269 | # Then use the previous function 270 | # search_for_filenames_matching_any_of_the_patterns_and_output_file_info 271 | # to find all the files in directory that match some entry in search.strings 272 | # and put the vector of these file names in Vany 273 | 274 | # Then eliminate from Vall any entries that are in Vany using the %in% function 275 | # and logical negation (!) 276 | 277 | # We will get the file names without the folder path leading to the files included in the name.
278 | 279 | # check that search.strings is a non-empty character vector 280 | 281 | ns <- length(search.strings) 282 | if(ns < 1) stop("no entries in search.strings") 283 | if(!is.character(search.strings)) stop("search.strings is not a character vector") 284 | 285 | Vall <- list.files(directory, full.names = FALSE, ignore.case = TRUE) 286 | nf <- length(Vall) 287 | if(nf == 0) { 288 | print("no files in directory") 289 | return("no files in directory") 290 | } 291 | # exclude directory names from Vall (file.path builds the path to each entry of Vall) 292 | Vall <- Vall[!dir.exists(file.path(directory, Vall))] 293 | 294 | nf <- length(Vall) 295 | if(nf == 0) { 296 | print("no files in directory") 297 | return("no files in directory") 298 | } 299 | 300 | # now get Vany 301 | df.Vany <- 302 | search_for_filenames_matching_any_of_the_patterns_and_output_file_info(directory, search.strings) 303 | 304 | # need to handle the case that NO file names were a match for any of the search strings 305 | # in which case df.Vany is the character string: "no files contain any of the search strings" 306 | # rather than a data frame (is.character() is a safer test than class(x) == "character", since class() can return more than one string) 307 | if(is.character(df.Vany)) { 308 | Vany <- character(0) 309 | } else { 310 | Vany <- df.Vany$file.name # the column of file names 311 | } 312 | # When programming, one should always be sure the function handles "extreme cases", 313 | # here that would be when NONE of the file names in directory match some entry in search.strings, 314 | # and when ALL of the file names match search.strings 315 | 316 | # eliminate names in Vany from Vall 317 | 318 | Vlogical <- !(Vall %in% Vany) # we want entries in Vall that are NOT in Vany so need to use !
319 | filenames <- Vall[Vlogical] # the desired file names (names NOT in Vany) 320 | 321 | nf <- length(filenames) 322 | if(nf == 0) { 323 | print("all files contain contain an entry of search strings") 324 | return("all files contain contain an entry of search strings") 325 | } 326 | 327 | # If we got to here, at least 1 file has no entry of search.strings in its name 328 | return(filenames) # here we are just returning the file names without other information on them 329 | } 330 | 331 | ``` 332 | 333 | ### Test runs on my computer 334 | 335 | Recalling from the previous two exercise files, I have constructed in my R working directory a folder 336 | called test_dir containing a small number of files with names I picked to conveniently test 337 | the functions that search for file names satisfying various conditions. 338 | Here are all the file names (and the one folder name) in the test_dir folder: 339 | 340 | ``` 341 | directory <- "test_dir" 342 | list.files(directory) # 9 files and 1 folder 343 | [1] "001.csv" "002.csv" "003.txt" 344 | [4] "004csvfile.txt" "005txt2csv" "1.csv" 345 | [7] "308.csv" "folder001txtcsv" "scrlisting.txt" 346 | [10] "txt.csv" 347 | ``` 348 | 349 | We will use this folder and files (9 filenames and 1 folder name) to test the search function 350 | written immediately above (you can construct a similar test_dir folder and files with these filenames and also a 351 | folder in it called "folder001txtcsv" to run tests, the only difference will be the dates and sizes of the files). 352 | Or you could test the function using a suitable folder in your computer for which you can pick search.strings so that 353 | you will get a known reasonable length vector for the test run.
354 | 355 | 356 | ## Some test runs 357 | 358 | ``` 359 | directory <- "test_dir" 360 | list.files(directory) # 9 files and 1 folder 361 | [1] "001.csv" "002.csv" "003.txt" 362 | [4] "004csvfile.txt" "005txt2csv" "1.csv" 363 | [7] "308.csv" "folder001txtcsv" "scrlisting.txt" 364 | [10] "txt.csv" 365 | 366 | search.strings <- c("2", "3", "4", "5") 367 | search_for_filenames_matching_NONE_of_the_patterns_and_output_file_names(directory, search.strings) 368 | [1] "001.csv" "1.csv" "scrlisting.txt" "txt.csv" 369 | 370 | search.strings <- c("csv") 371 | search_for_filenames_matching_NONE_of_the_patterns_and_output_file_names(directory, search.strings) 372 | [1] "003.txt" "scrlisting.txt" 373 | 374 | # test case when all the file names match some member of search.strings 375 | search.strings <- c("csv", "txt") 376 | search_for_filenames_matching_NONE_of_the_patterns_and_output_file_names(directory, search.strings) 377 | [1] "all files contain contain an entry of search strings" 378 | [1] "all files contain contain an entry of search strings" 379 | # when there are no such files (no files in directory that don't have an entry of search.strings 380 | # in their file name), the search function both prints this message and returns it, so here it is 381 | # output twice 382 | 383 | search.strings <- c("00") 384 | search_for_filenames_matching_NONE_of_the_patterns_and_output_file_names(directory, search.strings) 385 | [1] "1.csv" "308.csv" "scrlisting.txt" "txt.csv" 386 | 387 | # test case when none of the file names match some member of search.strings, 388 | # one should get all the file names (but not the folder name) 389 | search.strings <- c("00987") 390 | search_for_filenames_matching_NONE_of_the_patterns_and_output_file_names(directory, search.strings) 391 | [1] "no files contain any of the search strings" 392 | [1] "001.csv" "002.csv" "003.txt" "004csvfile.txt" 393 | [5] "005txt2csv" "1.csv" "308.csv" "scrlisting.txt" 394 | [9] "txt.csv" 395 | 396 | # note [1] "no files 
contain any of the search strings" came from 397 | # "looking for" the files that matched ANY entry of search.strings - 398 | # in this case there were none, and so one obtained all the file names 399 | # in test_dir (and, correctly, not the folder name) 400 | ``` 401 | 402 | The next exercise is to use the **setdiff** function in place of using the %in% function to get 403 | the file names that are in Vall but not in Vany. 404 | 405 | Hint: 406 | In the function above you simply need to replace the following 2 lines with 1 line using setdiff 407 | ``` 408 | Vlogical <- !(Vall %in% Vany) # we want entries in Vall that are NOT in Vany so need to use ! 409 | filenames <- Vall[Vlogical] # the desired file names (names NOT in Vany) 410 | ``` 411 | 412 | A correct line using setdiff is: `filenames <- setdiff(Vall, Vany)` 413 | 414 | One should do the test runs again to check this works correctly. 415 | 416 | This exercise was intended to practice the very useful **%in%** function, and also illustrate 417 | that finding an existing R function (here setdiff) that can do exactly what you want can result in very concise 418 | easy to understand code, which leaves less room for bugs to occur and to hide. 419 | 420 | 421 | Hope this was informative and good practice. 422 | The next set of exercises will address dealing with using subsets of an individual row of a data frame. 423 | = = = = = = = = = = = = = = = = = = = = = = = = 424 | 425 | This work is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. 426 | To view a copy of this license, visit https://creativecommons.org/licenses/by-nc-sa/4.0/ or send a letter 427 | to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA. 
There is a full version of this license at this web site: 428 | https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode 429 | -------------------------------------------------------------------------------- /Eleventh-R-Practice-exercise-using-sapply-and-split-and-also-use-of-ellipsis-to-pass-in-additional-argumentsFeb11.Rmd: -------------------------------------------------------------------------------- 1 | 2 | --- 3 | # output: 4 | # md_document: 5 | # variant: markdown_github 6 | output: pdf_document 7 | fontsize: 12pt 8 | --- 9 | 10 | ## Eleventh R Practice exercise using sapply and split and also use of ellipsis to pass in additional arguments 11 | 12 | ### Alan E. Berger Feb 11, 2022; edit June 20 to emphasize requirement on first argument of FUN in sapply 13 | 14 | ### available at https://github.com/AlanBerger/Practice-programming-exercises-for-R 15 | 16 | ## Introduction 17 | 18 | This article will discuss **sapply** at length, and then practice using **split** and **sapply**. sapply is a version of **lapply** that when possible will give its output as a vector (or matrix when that is appropriate) instead of a list (lapply will produce a list; a list that is "naturally a vector" can be converted to the vector using the **unlist** function; using sapply avoids having to do that extra step). 19 | 20 | Much of what is described below for sapply is also applicable to lapply. 21 | 22 | We will go over some simple examples, and then use the iris data set to demonstrate sapply and split, and the ellipsis (...) functionality available in sapply. 23 | 24 | Note if you want to copy lines from this for use in R, it is best to copy from the .Rmd file, since sometimes text in the pdf file can contain formatting characters that R does not accept. 
To download an Rmd file, open the Rmd file to display it in GitHub and then toward the upper right of the resulting window there will be a "raw box" (to the left of "Blame"), on which one should be able to do (for Windows): right click, "save as" to download the .Rmd file as a text file. 25 | 26 | 27 | ``` {r} 28 | 29 | # The help for sapply indicates a "standard" call to sapply will 30 | # have the form: 31 | 32 | # sapply(X, FUN, ...) 33 | 34 | # Here X is a vector or data.frame or list 35 | # FUN is the name of a function that is available in the current R session 36 | # (which could be a user defined function, or a function that 37 | # "comes with R", or that has been "loaded" (via library) from a package), 38 | # OR is an anonymous function defined within the call to sapply. 39 | # The function specified by FUN will be applied to each entry of X if X 40 | # is a vector; to each column of X (taken as a vector) if X is 41 | # a data frame, to each entry of X if X is a list (a data.frame is a 42 | # special type of list; other vectors are converted with as.list). When possible the result from sapply will 43 | # be a vector (or matrix if that is appropriate). Otherwise sapply will 44 | # return a list (just as lapply would). 45 | 46 | # The optional ellipsis argument can be used to pass in additional 47 | # arguments to the function specified in the FUN argument of sapply 48 | # as illustrated below. 49 | 50 | # a simple case: square each element of a vector (yes we could do this 51 | # by x^2, but here we demonstrate use of sapply and an anonymous function): 52 | 53 | v <- 1:5 54 | sapply(X = v, FUN = function(y) y^2) 55 | 56 | # ******** IT IS BEST to use the form: argument.name = user.chosen.value 57 | # ******** when calling one of the apply family of functions.
In this call 58 | # ******** to sapply, the name of the first argument of sapply is X and 59 | # ******** the value chosen for X is v; and the second argument of sapply 60 | # ******** is FUN, and the value chosen for FUN is the 61 | # ******** anonymous function, function(y) y^2 62 | 63 | # Note y in the definition of this function is a "formal argument", 64 | # also called a "dummy argument", meaning any variable name permissible 65 | # in R could be used, for example 66 | 67 | # sapply(X = v, 68 | # FUN = function(user.selected.name) user.selected.name^2) 69 | 70 | # One could also optionally include curly brackets to delineate the 71 | # function: 72 | 73 | # sapply(X = v, FUN = function(y) {y^2}) 74 | 75 | # sapply, in effect, successively calls the function the user has 76 | # provided for FUN, here function(y) {y^2} 77 | # with each entry of v as its argument, 78 | # and "collects" the results in a vector. 79 | 80 | # My point of view is it is fine to use an anonymous function if it 81 | # is short (will fit in a line or so), but otherwise it makes the code 82 | # much easier to proofread and so avoid/detect bugs if one codes the 83 | # function as a stand-alone function, call it, for example, 84 | # user.function and uses its name in the FUN argument of sapply via 85 | # FUN = user.function 86 | # Here user function could also be any function that is available in the 87 | # current R session 88 | 89 | # demonstrate use of the ellipsis argument of sapply to pass in an 90 | # additional argument of FUN 91 | 92 | 93 | sapply(X = v, FUN = function(x, power) x^power, power = 3) 94 | 95 | 96 | # Here each entry of v is passed into the function as the value for x, 97 | # and the additional argument, power, of the function is supplied via 98 | # the third (ellipsis) argument of sapply. 
99 | 100 | # One could also rely on lexical scoping to "find" the value of power, 101 | # but that is not good programming practice when it can be avoided, 102 | # since the value where R "finds" it might not be what was wanted 103 | # (or it might have gotten changed from what you thought it would be). 104 | 105 | power <- 4 106 | sapply(X = v, FUN = function(x) x^power) # this worked but is not best practice 107 | 108 | 109 | # Here is an example with two additional arguments supplied using 110 | # the ellipsis functionality of sapply, and where sapply returns a matrix 111 | 112 | v <- 1:5 113 | sapply(X = v, FUN = function(x, power1, power2) {c(x^power1, x^power2)}, 114 | power1 = 2, power2 = 3) 115 | 116 | ``` 117 | 118 | ## The first argument of the function in sapply MUST BE for the entries of X 119 | 120 | Suppose one is calling sapply via, for example, 121 | 122 | result <- sapply(X = v, FUN = user.function, other.arg1 = a1, other.arg2 = a2) 123 | 124 | where v is a vector or data frame or list, so sapply will successively 125 | call user.function with each entry, refer to it as $E$, of v (each column if v is a data frame). 126 | **It is important to note that**: 127 | 128 | sapply will successively call user.function with $E$ **as the first argument of user.function** 129 | 130 | So user.function (if programmed by us) should be programmed that way, and if user.function is a built in function that comes with the base R installation, or has been "loaded" from a package, its first argument should be for entries of v. (Technically, this is not absolutely required, but not conforming with this "first argument condition" can lead to mysterious errors and is best avoided.) In the conceptual example above, other.arg1 and other.arg2 are other arguments of user.function and their values have been set to a1 and a2, respectively, in this call. 
Arguments passed into user.function via the ellipsis functionality of sapply must be called in the 131 | 132 | argument.name = chosen.value 133 | 134 | format. 135 | 136 | This "first argument condition" is pretty natural, in that for many R functions, the first argument (or first couple of arguments) are what the function "acts on" and following arguments (which have default values) govern options on how that is done. For example the mean function takes the mean of a vector, and its other options modulate how that is done, and read.table reads in a suitable file as a data frame, and it has quite a few options on how that is done, and the plot function can produce a scatterplot of y vs. x with a multitude of options allowing for fine detailed control of the form of the plot. 137 | 138 | ## Now use the iris data set to illustrate use of sapply 139 | 140 | From the R help on the iris data set (? iris): "This famous (Fisher's or Anderson's) iris data set gives the measurements in centimeters of the variables sepal length and width and petal length and width, respectively, for 50 flowers from each of 3 species of iris. The species are Iris ***setosa***, ***versicolor***, and ***virginica***" 141 | 142 | ``` {r} 143 | 144 | # from the R help on the iris data set: 145 | # iris is a data frame with 150 cases (rows) and 5 variables (columns) 146 | # named Sepal.Length, Sepal.Width, Petal.Length, Petal.Width, and Species, 147 | # respectively, for 50 flowers from each of 3 species of iris. 148 | # The species are Iris setosa, versicolor, and virginica. 149 | 150 | # There are equal numbers of the three types of flowers, and they are 151 | # grouped together in the data frame, but neither "equal numbers" nor 152 | # "grouped together" are necessary for using split together with sapply 153 | # as will be demonstrated below. 
154 | 155 | data(iris) # make the iris data set available to this R session 156 | iris.df <- iris # a copy, emphasizing it is a data frame 157 | 158 | # take a look at it 159 | # head(iris.df) 160 | 161 | # tail(iris.df) 162 | 163 | # place an NA into the Sepal.Width column, to later demonstrate 164 | # how to use the ellipsis (...) functionality for sapply 165 | # to pass in one or more additional optional arguments into the function 166 | # being used (for example na.rm = TRUE, trim = 0.11, for the mean function) 167 | 168 | iris.df[2,2] <- NA 169 | # check that was done 170 | head(iris.df) 171 | 172 | # An aside: Explanation of what the optional argument trim in 173 | # the mean function does: 174 | 175 | # we will use this information to do a "by hand" check of 176 | # the output obtained below from using sapply with the mean function 177 | # on the iris.df data frame 178 | 179 | # If V is a numeric vector and one does 180 | # mean(V, na.rm = TRUE, trim = w) 181 | # what will happen is that the mean function will, in effect, 182 | # first produce the vector Vn which is V but with any and all NAs removed 183 | # from V. 184 | # If L is the length of Vn (here assume it is > 0), then let K = w * L 185 | # (w was what the argument trim was set to; the default for trim is 0). 
186 | # Let Ki be the integer part of K, e.g., if K was 2.3 then Ki is 2 187 | # if K was 0.4 then Ki is 0 188 | # Then mean(V, na.rm = TRUE, trim = w) is evaluated as follows: 189 | # Let Vn.sorted be the result of doing sort(Vn) 190 | # Let Vn.sorted.trimmed be the result of: 191 | # removing the first Ki entries of Vn.sorted AND 192 | # removing the last Ki entries of Vn.sorted 193 | # (assume that Vn.sorted.trimmed still has at least 1 element left in it) 194 | 195 | # mean(V, na.rm = TRUE, trim = w) is then equal to mean(Vn.sorted.trimmed) 196 | 197 | 198 | # first use sapply on the 4 data columns of iris.df, 199 | # using na.rm = TRUE and trim = 0.11 200 | # (which will trim 16 entries from each end of the vector whose mean 201 | # is being calculated). The sapply command below will successively pass 202 | # each column of iris.df[, 1:4] AS A VECTOR into the mean function 203 | 204 | sapply(X = iris.df[, 1:4], FUN = mean, na.rm = TRUE, trim = 0.11) 205 | 206 | # Note that sapply "captured" the names of the columns of iris.df[, 1:4] 207 | 208 | # since the lengths of the 4 columns after removing NAs 209 | # were 150, 149, 150, 150; 210 | # setting trim equal to 0.11 in effect removed 16 elements from 211 | # each end of each column AFTER 212 | # any NAs were removed and the column was sorted; 213 | # then the mean was calculated 214 | 215 | # check the result from sapply above "by hand" 216 | 217 | x <- sort(na.omit(iris.df[[1]])) 218 | mean(x[17:(length(x) - 16)]) 219 | 220 | 221 | x <- sort(na.omit(iris.df[[2]])) 222 | mean(x[17:(length(x) - 16)]) 223 | 224 | # these and also checking columns 3 and 4 are OK 225 | 226 | # Note 227 | # sapply(X = iris.df[, 1:4], FUN = mean, na.rm = TRUE, trim = 0.11) 228 | # worked to get the optional arguments 229 | # na.rm = TRUE, trim = 0.11 230 | # into the mean function (FUN = mean) since the mean function 231 | # is "part of" R and it has na.rm and trim as optional arguments 232 | 233 | # When using an anonymous function 
in sapply, AND using the optional 234 | # ellipsis (...) argument of sapply to pass in additional arguments to 235 | # the function specified by FUN, one needs to either have those additional 236 | # arguments be declared arguments of the function (as with the power 237 | # argument(s) in the examples above), OR to have an ellipsis in the 238 | # argument list of the function specified by FUN, AND used appropriately 239 | # within the anonymous function. Again, the first argument 240 | # of the function FUN in sapply should be for 241 | # what will be passed from each entry of the first argument X of sapply. 242 | # For example in the call to sapply below, 243 | # x will successively have the value of each of the first 4 columns 244 | # of iris.df 245 | 246 | sapply(X = iris.df[, 1:4], 247 | FUN = function(x, ...) {mean(x, ...)}, 248 | na.rm = TRUE, trim = 0.11) 249 | 250 | 251 | # In effect, in the above call to sapply, 252 | # the ellipsis ... was "set to": na.rm = TRUE, trim = 0.11 253 | # and that got "passed through" via the ... in the argument list of 254 | # the anonymous function AND within the anonymous function itself 255 | # (here "into" the ... in the argument list of the mean function) 256 | # * * * It was necessary to have ... included in the argument list of 257 | # * * * the mean function within this anonymous function 258 | 259 | # Here is what happens if the ellipsis ... is left out of the 260 | # argument list of mean in the anonymous function 261 | # (it runs, but na.rm = TRUE, trim = 0.11 are not used, 262 | # so the result is not what we wanted): 263 | sapply(X = iris.df[, 1:4], 264 | FUN = function(x, ...) 
{mean(x)}, 265 | na.rm = TRUE, trim = 0.11) 266 | 267 | # The result is indeed the same as if na.rm = TRUE and trim = 0.11 were 268 | # NOT invoked when calling the mean function: 269 | sapply(X = iris.df[, 1:4], FUN = mean) 270 | 271 | 272 | # Note this also works correctly: 273 | sapply(X = iris.df[, 1:4], 274 | FUN = function(x, na.rm.value, trim.value) 275 | {mean(x, na.rm = na.rm.value, trim = trim.value)}, 276 | na.rm.value = TRUE, trim.value = 0.11) 277 | 278 | 279 | # ************ note various plotting functions, e.g., plot(x,y, ...) 280 | # have ... in their argument list, 281 | # so various of the many optional arguments for them can be passed in 282 | # by sapply. 283 | # For example, the first two arguments of plot, x and y, can be vectors 284 | # for making a scatterplot. 285 | # If X = data.frame.list is a list of data frames whose first two columns 286 | # are numeric values (our example will be constructed from iris.df), 287 | # and plotting_function is a function that 288 | # will make a scatterplot from data in the first two columns of a data 289 | # frame, then, one could, conceptually, do for example: 290 | 291 | # sapply(X = data.frame.list, FUN = plotting_function, 292 | # type = "p", pch = 1, col = "blue") 293 | # 294 | # Here plotting_function 295 | # should extract the x and y vectors from the first and second columns 296 | # of the data frame it is called with, and then call the plot function via 297 | # plot(x, y, ...) 298 | # In this setup, plotting_function must have an ellipsis in its argument 299 | # list. Then sapply would pass through, via the ellipsis functionality, 300 | # the various plotting arguments that were specified 301 | # (there are many many optional arguments for plot); 302 | # the ones given here "say": do a point plot with circle symbols, 303 | # and have the symbol color be blue 304 | 305 | plotting_function <- function(df, ...)
{ 306 | # Draw a scatter plot of column 2 of df vs. column 1 of df; here df is a data frame that will be passed in by sapply 307 | # (the value of the plot call, the last expression below, is what this function returns) 308 | x <- df[[1]] 309 | y <- df[[2]] 310 | 311 | # get x and y axis labels from column names of df 312 | xlabel <- colnames(df)[1] 313 | ylabel <- colnames(df)[2] 314 | 315 | # other arguments for the plot function are passed into plot from 316 | # the call to sapply via the ellipsis (the ... in this function's argument list) 317 | 318 | plot(x, y, xlab = xlabel, ylab = ylabel, ...) 319 | } 320 | 321 | 322 | # Demonstrate this: 323 | # Construct a list of two data frames, each with two columns, 324 | # from iris.df 325 | 326 | data.frame.list = list(iris.df[, c(1,2)], iris.df[, c(3,4)]) 327 | 328 | 329 | # invoke sapply, creating 2 plots, while using the ellipsis functionality 330 | # to pass plotting arguments into the plot function 331 | 332 | # one could do more informative plotting with this data, 333 | # but the point here is to illustrate 334 | # use of the ellipsis functionality in sapply 335 | 336 | sapply(X = data.frame.list, FUN = plotting_function, 337 | type = "p", pch = 1, col = "blue") 338 | 339 | # this produced the plots along with a "pro forma" "NULL" from each 340 | # call to the plot function 341 | # to suppress this useless text, one could use the invisible function: 342 | 343 | # invisible(sapply(X = data.frame.list, FUN = plotting_function, 344 | # type = "p", pch = 1, col = "blue")) 345 | 346 | ``` 347 | 348 | ## A note when using sapply and passing in arguments using its ellipsis capability 349 | 350 | When using the ellipsis functionality in sapply to pass arguments, 351 | the R help on sapply recommends specifying the X and FUN arguments explicitly, 352 | (not just by position), for example 353 | 354 | sapply(X = some.data.frame, FUN = function.that.does.plots, 355 | type = "p", pch = 1, col = "green") 356 | 357 | ## The split function.
358 | 359 | The **split** function can be used to "split up" a data frame df into a list 360 | of data frames that are subsets of df, call it df.split.list, 361 | based on a character or factor column of df. 362 | For each of the distinct entries E in the character (or factor) 363 | column of df being used to "do the split", there will be a 364 | data frame in df.split.list which is the subset of df containing all the 365 | rows of df for which the entry in the column being used to do the split 366 | equals E, and the name of that entry in df.split.list will be the 367 | character string E. 368 | 369 | ## The practice exercise 370 | 371 | The practice exercise is: 372 | given the name of one of the 4 numeric data columns in iris.df, 373 | compute the mean of the entries in that column that correspond to each of 374 | the 3 iris flower types (so one will compute 3 means; one mean for all the 375 | entries in that column whose iris type is setosa, one mean for the 376 | versicolor flowers and one mean for virginica flowers). 377 | Do this by using split on iris.df to produce a list of 3 data frames, 378 | one for each of the 3 types of iris flower. 379 | Then write a function that given one of these data frames, and the name 380 | of one of the 4 numeric data columns in iris.df, will 381 | compute the mean for the specified column. Use this function, and the list 382 | of 3 data frames produced by split, in sapply to get the result. 383 | 384 | Try doing this before looking at one possible solution in the R session given below. 
385 | 386 | ```{r} 387 | 388 | # make this session self contained 389 | data(iris) # make the iris data set available to this R session 390 | iris.df <- iris # a copy, emphasizing it is a data frame 391 | 392 | head(iris.df) # look at it 393 | 394 | # split iris.df by the 3 iris flower types (species) 395 | df.split.list <- split(iris.df, iris.df$Species) 396 | 397 | get.mean.of.specified.column <- function(df, column.name) { 398 | # Return the mean of the column of df whose name is column.name; df will be one of the 3 data frames in df.split.list 399 | # column.name will be the name (a character string) of one of the quantities measured for each flower: 400 | # (Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) 401 | 402 | mean.of.given.column <- mean(df[[column.name]]) 403 | # df[[column.name]] extracts the named column as a vector; there are no NAs in iris.df so here we don't need to use na.rm = TRUE 404 | # (in an example above I placed an NA in iris.df to test ways of passing 405 | # na.rm = TRUE into the mean function) 406 | 407 | return(mean.of.given.column) 408 | } 409 | 410 | # Use sapply for the various columns 411 | # (One could also use sapply (with a modified function) to return a matrix 412 | # containing the means for the 3 flower species for each of the 4 413 | # measured quantities. The modified function would return a vector of the 414 | # means for the 4 quantities for any of the 3 data frames in df.split.list) 415 | 416 | sapply(X = df.split.list, FUN = get.mean.of.specified.column, 417 | column.name = "Sepal.Length") 418 | 419 | sapply(X = df.split.list, FUN = get.mean.of.specified.column, 420 | column.name = "Sepal.Width") 421 | 422 | sapply(X = df.split.list, FUN = get.mean.of.specified.column, 423 | column.name = "Petal.Length") 424 | 425 | sapply(X = df.split.list, FUN = get.mean.of.specified.column, 426 | column.name = "Petal.Width") 427 | 428 | 429 | # Now program the function to be used in sapply to return the 430 | # vector of all 4 means.
Then sapply will return a matrix 431 | 432 | # Here is a solution 433 | 434 | get.all.4.means.in.df <- function(df) { 435 | # Return the vector of the means of the first 4 columns of df; df will be one of the 3 data frames in df.split.list 436 | # Its 4 numeric column names are 437 | # Sepal.Length, Sepal.Width, Petal.Length, Petal.Width 438 | 439 | # use sapply within this function to get the 4 means 440 | the.4.means <- sapply(df[, 1:4], mean) 441 | # this will also "capture" the column names 442 | 443 | # there are no NAs in iris.df so here we don't need to use na.rm = TRUE 444 | # (in an example above I placed an NA in iris.df to test ways of passing 445 | # na.rm = TRUE into the mean function) 446 | 447 | return(the.4.means) 448 | } 449 | 450 | # use sapply and this function to get the matrix of means 451 | result <- sapply(X = df.split.list, FUN = get.all.4.means.in.df) 452 | result 453 | 454 | # notice sapply conveniently captured the row and column names for 455 | # the matrix from the data frames it operated on 456 | 457 | rownames(result) 458 | 459 | # And in particular: 460 | 461 | colnames(result) 462 | 463 | # will be the same as 464 | 465 | names(df.split.list) 466 | 467 | # if one prefers one can transpose the matrix 468 | print(t(result)) 469 | 470 | ``` 471 | 472 | Hope this discussion has been helpful. 473 | 474 | 475 | = = = = = = = = = = = = = = = = = = = = = = = = 476 | 477 | This work is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. 478 | To view a copy of this license, visit https://creativecommons.org/licenses/by-nc-sa/4.0/ or send a letter 479 | to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA. There is a full version of this license at this web site: 480 | https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode 481 | --------------------------------------------------------------------------------