├── second-R-programming-exercise-if-else-if-else-syntax-and-logicPDF.pdf
├── Fourth-R-programming-exercise-find-prime-integers-less-than-or-equal-N.pdf
├── Sixth-R-function-paths-listing-files-string-search-constructing-a-data-framePDF.pdf
├── first-R-programming-exercise-column-number-from-Excel-column-letter-markdown-UTF8.pdf
├── Fifth-article-Review-of-getting-subsets-of-a-data-frame-constructing-data-framesPDF.pdf
├── Seventh-R-function-systematic-debugging-composing-a-function-constructing-a-data-frame.pdf
├── Nineth-R-Practice-exercise-using-a-row-of-a-data-frame-the-unlist-function-composing-a-function.pdf
├── tiny-subset-of-GSE18885-gene-expression-data-9-genes-WG-5-samples-Normal-Control-4-samples.tab.txt
├── Tenth-R-Practice-exercise-merging-annotation-data-into-a-gene-expression-analysis-results-data-frame.pdf
├── Twelth-R-Practice-exercise-using-Monte-Carlo-simulation-to-investigate-a-statistical-quantity-March-6.pdf
├── third-R-programming-exercise-using-for-loops-and-if-tests-to-check-if-a-positive-integer-is-a-prime.pdf
├── Eighth-R-Practice-exercise-composing-a-function-and-constructing-a-data-frame-files-not-containing-search-strings.pdf
├── Eleventh-R-Practice-exercise-using-sapply-and-split-and-also-use-of-ellipsis-to-pass-in-additional-argumentsFeb11.pdf
├── GPL6104-Illumina-microarray-platform-annotation-from-GEO-repository-small-subset-edited-example-Feb12.tab.txt
├── README
├── first-R-programming-exercise-column-number-from-Excel-column-letter-markdown-UTF8.Rmd
├── first-R-programming-exercise-column-number-from-Excel-column-letter-markdown-UTF8.md
├── Fourth-R-programming-exercise-find-prime-integers-less-than-or-equal-N.Rmd
├── second-R-programming-exercise-if-else-if-else-syntax-and-logic.Rmd
├── Fourth-R-programming-exercise-find-prime-integers-less-than-or-equal-N.md
├── second-R-programming-exercise-if-else-if-else-syntax-and-logic.md
├── Tenth-R-Practice-exercise-merging-annotation-data-into-a-gene-expression-analysis-results-data-frame.Rmd
├── third-R-programming-exercise-using-for-loops-and-if-tests-to-check-if-a-positive-integer-is-a-prime.Rmd
├── Fifth-article-Review-of-getting-subsets-of-a-data-frame-constructing-data-frames.Rmd
├── Eighth-R-Practice-exercise-composing-a-function-and-constructing-a-data-frame-files-not-containing-search-strings.md
├── Eighth-R-Practice-exercise-composing-a-function-and-constructing-a-data-frame-files-not-containing-search-strings.Rmd
└── Eleventh-R-Practice-exercise-using-sapply-and-split-and-also-use-of-ellipsis-to-pass-in-additional-argumentsFeb11.Rmd


/second-R-programming-exercise-if-else-if-else-syntax-and-logicPDF.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanBerger/Practice-programming-exercises-for-R/HEAD/second-R-programming-exercise-if-else-if-else-syntax-and-logicPDF.pdf


--------------------------------------------------------------------------------
/Fourth-R-programming-exercise-find-prime-integers-less-than-or-equal-N.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanBerger/Practice-programming-exercises-for-R/HEAD/Fourth-R-programming-exercise-find-prime-integers-less-than-or-equal-N.pdf


--------------------------------------------------------------------------------
/Sixth-R-function-paths-listing-files-string-search-constructing-a-data-framePDF.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanBerger/Practice-programming-exercises-for-R/HEAD/Sixth-R-function-paths-listing-files-string-search-constructing-a-data-framePDF.pdf


--------------------------------------------------------------------------------
/first-R-programming-exercise-column-number-from-Excel-column-letter-markdown-UTF8.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanBerger/Practice-programming-exercises-for-R/HEAD/first-R-programming-exercise-column-number-from-Excel-column-letter-markdown-UTF8.pdf


--------------------------------------------------------------------------------
/Fifth-article-Review-of-getting-subsets-of-a-data-frame-constructing-data-framesPDF.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanBerger/Practice-programming-exercises-for-R/HEAD/Fifth-article-Review-of-getting-subsets-of-a-data-frame-constructing-data-framesPDF.pdf


--------------------------------------------------------------------------------
/Seventh-R-function-systematic-debugging-composing-a-function-constructing-a-data-frame.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanBerger/Practice-programming-exercises-for-R/HEAD/Seventh-R-function-systematic-debugging-composing-a-function-constructing-a-data-frame.pdf


--------------------------------------------------------------------------------
/Nineth-R-Practice-exercise-using-a-row-of-a-data-frame-the-unlist-function-composing-a-function.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanBerger/Practice-programming-exercises-for-R/HEAD/Nineth-R-Practice-exercise-using-a-row-of-a-data-frame-the-unlist-function-composing-a-function.pdf


--------------------------------------------------------------------------------
/tiny-subset-of-GSE18885-gene-expression-data-9-genes-WG-5-samples-Normal-Control-4-samples.tab.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanBerger/Practice-programming-exercises-for-R/HEAD/tiny-subset-of-GSE18885-gene-expression-data-9-genes-WG-5-samples-Normal-Control-4-samples.tab.txt


--------------------------------------------------------------------------------
/Tenth-R-Practice-exercise-merging-annotation-data-into-a-gene-expression-analysis-results-data-frame.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanBerger/Practice-programming-exercises-for-R/HEAD/Tenth-R-Practice-exercise-merging-annotation-data-into-a-gene-expression-analysis-results-data-frame.pdf


--------------------------------------------------------------------------------
/Twelth-R-Practice-exercise-using-Monte-Carlo-simulation-to-investigate-a-statistical-quantity-March-6.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanBerger/Practice-programming-exercises-for-R/HEAD/Twelth-R-Practice-exercise-using-Monte-Carlo-simulation-to-investigate-a-statistical-quantity-March-6.pdf


--------------------------------------------------------------------------------
/third-R-programming-exercise-using-for-loops-and-if-tests-to-check-if-a-positive-integer-is-a-prime.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanBerger/Practice-programming-exercises-for-R/HEAD/third-R-programming-exercise-using-for-loops-and-if-tests-to-check-if-a-positive-integer-is-a-prime.pdf


--------------------------------------------------------------------------------
/Eighth-R-Practice-exercise-composing-a-function-and-constructing-a-data-frame-files-not-containing-search-strings.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanBerger/Practice-programming-exercises-for-R/HEAD/Eighth-R-Practice-exercise-composing-a-function-and-constructing-a-data-frame-files-not-containing-search-strings.pdf


--------------------------------------------------------------------------------
/Eleventh-R-Practice-exercise-using-sapply-and-split-and-also-use-of-ellipsis-to-pass-in-additional-argumentsFeb11.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AlanBerger/Practice-programming-exercises-for-R/HEAD/Eleventh-R-Practice-exercise-using-sapply-and-split-and-also-use-of-ellipsis-to-pass-in-additional-argumentsFeb11.pdf


--------------------------------------------------------------------------------
/GPL6104-Illumina-microarray-platform-annotation-from-GEO-repository-small-subset-edited-example-Feb12.tab.txt:
--------------------------------------------------------------------------------
 1 | Illumina Probe_ID	ILMN_Gene	Entrez_Gene_ID	Chromosome	Probe location in chromosome	Protein coded by gene - very short description
 2 | ILMN_1698220	PHTF2	57157	7	77424374-77424423	homeodomain transcription factor 2
 3 | ILMN_1810835	SPRR3	6707	1	151242655-151242704	small proline-rich protein 3
 4 | ILMN_1688580	CAMP	820	3	48241909-48241918:48241919-48241958	cathelicidin antimicrobial peptide
 5 | ILMN_1802867	RNASE3	6037	14	20430090-20430139	"ribonuclease, RNase A family, 3"
 6 | ILMN_1766736	BPI	671	20	36399055-36399104	bactericidal/permeability-increasing protein
 7 | ILMN_1753347	DEFA4	1669	8	6781040-6781073:6781660-6781675	"defensin, alpha 4"
 8 | ILMN_1749014	ACLY	47	17	37277254-37277303	ATP citrate lyase
 9 | ILMN_1785926	ZNF621	285268	3	40555813-40555862	zinc finger protein 621
10 | ILMN_1796316	MMP9	4318	20	44078320-44078369	matrix metallopeptidase 9
11 | ILMN_1706635	ELA2	1991	19	807179-807228	elastase 2
12 | ILMN_1705183	MPO	4353	17	53702640-53702689	myeloperoxidase
13 | ILMN_1806056	CEACAM8	1088	19	47776394-47776443	carcinoembryonic antigen-related cell adhesion 8
14 | ILMN_1730867	AZU1	566	19	781858-781907	azurocidin 1 antimicrobial protein
15 | ILMN_1813399	ATP2B1	490	12	88506745-88506794	"ATPase, Ca++ transporting variant 2"
16 | ILMN_1750599	ATP2B1	490	12	88516580-88516629	"ATPase, Ca++ transporting variant 1"
17 | 					
18 | 					
19 | small subset of annotation from the Gene Expression Omnibus for Illumina microarray platform GPL6104					
20 | for use in an R programming exercise					
21 | the full file is located at					
22 | https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GPL6104					
23 | 


--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
 1 | This repository will contain a sequence of programming exercises (currently 12) 
 2 | intended to fill the gap between learning the correct syntax of basic R commands and 
 3 | the programming assignments in the R Programming course in the Johns Hopkins University 
 4 | Data Science Specialization on Coursera. These exercises review basic R constructs 
 5 | and provide practice in "composing" an R function to carry out a particular task. 
 6 | The idea is to practice correct use of R constructs and built in functions 
 7 | (functions that "come with" the basic R installation), while "putting together" a correct 
 8 | sequence of groups of commands that in a logical sequence of steps will obtain the desired result.
 9 | 
10 | In these exercises, there will be a statement of what your function (or R code) should do - 
11 | what are the input variables and what the function should return - and an outline or sequence of "hints". 
12 | To get the most out of these exercises, try to write your function using as few hints as possible. A working 
13 | code for each function is provided. If at first doing the programming is too hard, it still should be helpful 
14 | to read the commentary on how the functions were "put together" and to look over the code to see how it works.  
15 |  
16 | Note there are often several ways to write a function that will obtain the correct result. 
17 | For these exercises the directions and hints may point toward a particular approach intended 
18 | to practice particular constructs in R and a particular line of reasoning.  
19 | There may well be an existing R function or package that will do what is stated for a given 
20 | practice exercise, but here (unlike other aspects of the R Programming course) the point 
21 | is to practice "putting together" a logical sequence of steps, 
22 | with each step a section of code, to obtain a working function, 
23 | not to find an existing solution or a quick solution using a 
24 | more powerful R construct that is better addressed later on.
25 | 
26 | For each exercise, an .md file and/or a .pdf file is given, and the R markdown (.Rmd) file that generated it is also given. 
27 | If you want to copy code into a file or into R,  do so from either the .Rmd or the .md files, since copying R code from these pdf files does not always work (some lines seem to have extra encoding that disrupts use in  R). 
28 | 
29 | A list of the exercises with their number; the R constructs they practice and / or R topics they address; and what the function(s) in that exercise do is given below. This listing is as of 10 March 2022
30 | 
31 | 1. Use of the R letters character vector and the R paste and which and tolower functions; construct a function that, given an Excel column letter, will return the corresponding integer column number, construct a function that will similarly deal with a vector of Excel column names 
32 | 
33 | 2. if, else if, else syntax and logic; construct a function that given a numeric value between 0 and 1 will return a corresponding character variable (a length 1 character vector) with 3 significant digits (practice using if constructs and the round function rather than R’s signif or format functions)
34 | 
35 | 3. Practice using a for loop and if tests and the R mod function %%, return within an if test, comments on debugging code, return an integer vector whose entries have names; construct a function called isItPrime(n) that tests whether a given positive integer is a prime number
36 | 
37 | 4. Accumulate entries in a vector using successive concatenation or by: starting with a sufficiently large initial vector and filling in the desired entries using a running index, and then trimming the vector to the appropriate size, use of the readline function to have interactive input from the user; construct a function called getPrimeNumbers(N) that given a positive integer N that is at least 2, returns in an integer vector all the prime numbers that are less than or equal to N (uses isItPrime(n) from the previous exercise)
38 | 
39 | 5. This file reviews topics in "dealing with" data frames (programming exercises using data frames are given in the next several files): Extracting a column of a data frame as a vector, review of the many ways to get a specified subset of a data frame, the which function for getting a vector of row numbers for which some condition is TRUE, the %in% function for determining the indices k of a vector V such that V[k] is an entry of some other vector W, creating a data frame by reading in a suitable text file using read.csv or read.table, creating a data frame using the data.frame function, concatenating suitable data frames using rbind and cbind 
40 | 
41 | 6. The path to a folder - full (absolute) paths and relative paths, the list.files function for listing names of files that are in a folder that have a given pattern in their file name, the grep function for finding which character strings have a given pattern in them, a brief introduction to regular expressions for pattern searching, the file.info function for obtaining information about a file, the colnames function for (re)naming the columns of a data frame; construct a function that finds all the file names in a given folder that have every pattern given in a character vector (called search.strings) in their file name, and construct a data frame containing these file names along with their most recent modification date and file size, do test runs for files in a folder set up to test these functions
42 | 
43 | 7. More practice composing a function to carry out a prescribed task, a systematic way to debug a function by running its code line by line in the R console sub-window of RStudio and examining what happens with each line, the unique function; construct a modified version of the function in the previous exercise that outputs the same information but for the files containing any of the patterns in search.strings (rather than all the patterns in search.strings)
44 | 
45 | 8. More practice composing a function to carry out a prescribed task, practice using the %in% function, the setdiff function; construct a function that returns the file names in a folder that do not contain any of the entries of search.strings in their file name
46 | 
47 | 9. Extracting part of a row of a data frame as a vector, the unlist and unname functions, reading in a data file using the web url of the file, more practice constructing a data frame; R code to analyze a small gene expression data set - getting results for each gene where the data for each gene is in a row of the input data frame
48 | 
49 | 10. Merging a subset of rows in a data frame dfy to a data frame dfx where dfy may contain more rows than dfx and the relevant subset of rows of dfy need to be reordered to line up with those in dfx, doing this "by hand" to practice basic R constructs, doing this with the R merge function, the row.names function, the identical function, and another debugging "investigation"; R code to append (merge) annotation data to a gene expression analysis results data frame 
50 | 
51 | 11. Detailed information on using sapply and split, and also use of  the ellipsis (...) functionality in sapply for passing in additional arguments to the function used in sapply. Most of this information is valid for lapply as well. Simple examples, and examples and exercises using the R iris data set
52 | 
53 | 12. An example of using Monte Carlo simulation (using an appropriate random number generator) to investigate a statistical question. The statistical question is: given some independent random samples s_1,..., s_k without replacement from the integers 1 through N (so none of the integers in 1 through N can be chosen more than once), what is a good estimate for N? This is a well known question with a known best (frequentist) estimate for N (references are given). The point here is to describe in detail Monte Carlo simulation to "explore" a question, and to give example R code to implement it.
54 | 
55 | Note the reader should not infer any endorsement or recommendation or approval for the material in these files from any of the sources or persons cited in this file or any of these files, or from any other entities mentioned in any of these files.
56 | 


--------------------------------------------------------------------------------
/first-R-programming-exercise-column-number-from-Excel-column-letter-markdown-UTF8.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | output: md_document
  3 | ---
  4 | 
  5 | ## "First programming exercise"
  6 | 
  7 | ### Alan Berger  Aug 20, 2020   minor edits Jan 18, 2021
  8 | 
  9 | ### additional R code, run the functions, and some text edits Aug 22, 2020
 10 | 
 11 | ## Introduction
 12 | 
 13 | This is the first in a sequence of programming exercises intended to fill the gap between learning the correct 
 14 | syntax of basic R commands and the programming assignments in the R Programming course in the Johns Hopkins University 
 15 | Data Science Specialization on Coursera. In this sequence of exercises in "composing" an R function to carry out a particular task, 
 16 | the idea is to practice correct use of R constructs and built in functions (functions that "come with" the basic R installation), while 
 17 | learning how to "put together" a correct sequence of blocks of commands that will obtain the desired result.
 18 | 
 19 | In these exercises, there will be a statement of what your function should do (what are the input variables and what the function 
 20 | should return) and a sequence of "hints". To get the most out of these exercises, try to write your function using as few hints as possible.  
 21 | Note there are often several ways to write a function that will obtain the correct result. For these exercises the directions and 
 22 | hints may point toward a particular approach intended to practice particular constructs in R and a particular line of reasoning.  
 23 | There may be an existing R function or package that will do what is stated for a given practice exercise, but here 
 24 | (unlike other aspects of the R Programming course) the point is to practice formulating a logical sequence of steps, 
 25 | with each step a section of code, to obtain a working function, not to find an existing solution or a quick solution using a 
 26 | more powerful R construct that is better addressed later on.
 27 | 
 28 | ## Motivation for this exercise
 29 | 
 30 |  The input data for many functions will be or include a tab delimited text file that comes 
 31 | from or is naturally viewed as an Excel spreadsheet. For example, the last programming assignment in the R Programming 
 32 | course uses a .csv (comma separated variables) text file appropriately read in using R's read.csv function. It has a total of 46 columns, 
 33 | only 5 of which are relevant to the assignment (Excel columns B, G, K, Q and W). For this and many other situations, 
 34 | it would be convenient to have an R function that would take as its input an Excel column name (as a character variable) 
 35 | and output the corresponding column number. 
 36 | 
 37 | ### Instruction for this function
 38 | 
 39 | For the first version of your function, just have it work for a column letter between a 
 40 | and z and only take as input a lower case letter.
 41 | The skeleton of your function should "look like"
 42 | 
 43 | ```
 44 | colnameToNumber <- function(colname) {
 45 | # convert an Excel column name
 46 | # to the corresponding column number;
 47 | # colname should be a lower case letter in the
 48 | # form of a character string, for example "a" or "r" 
 49 | # "between" a and z (including a and z)
 50 | 
 51 | #   coding lines
 52 | 
 53 |     return(colnumber)
 54 | }
 55 | ```
 56 | 
 57 | Directions: do NOT use a whole bunch of if statements (yes, one could do 26 if statements of the form 
 58 | `if(colname == "a") colnumber = 1` 
 59 | and so on, but, among other things, it would be easy to have typographic
 60 | mistakes, and writing all those out would get pretty boring). 
 61 | 
 62 | Note that the built in R variable (R object) **letters** is
 63 | the character vector with entries "a", "b", ... , "z" and note the built in R function **which** takes as input a logical 
 64 | vector (often defined by whether some condition is or is not true) and outputs the integer vector of the
 65 |   _indices_ of the input vector for which the logical value is **TRUE**, so for example `which(1:4 < 3)` is equal `c(1, 2)`, 
 66 | and `which(sqrt(1:4) == 2)` is the single value 4 (since `sqrt(1:4)` is the vector c(1, $\sqrt 2$, $\sqrt 3$, 2) which is 
 67 | `c(1, 1.414214, 1.732051, 2)` (to the number of digits printed).
 68 | 
 69 | Your function, when applied to b, g, k, q and w (as character variables) should return 2, 7, 11, 17 and 23, 
 70 | respectively, and `colnameToNumber("zz")` should 
 71 | "throw an error" along the lines of 
 72 | 
 73 | Error in colnameToNumber("zz") : no match to colname
 74 | 
 75 | This comes from the R statement: `stop("no match to colname")` which you should have occur in your function if it is the case that the input was not a lower case letter between a and z.  
 76 | 
 77 | Try writing your program now before going to the additional hints below (the more you do "on your own" the faster you will gain skill at programming).
 78 | 
 79 | ### Further hints 
 80 | 
 81 | You can use the **which** function to "pick out" the index k (an integer) of the **letters** vector for which `letters[k]` equals colname, this is the number you want to return.
 82 | 
 83 | How might you check for whether colname was a letter between "a" and "z"? What will the **which** function return if colname was not a "valid" value for 
 84 | the colnameToNumber function as currently constructed (colname is not equal to any entry of **letters**)? (Try it out in an R session.)
 85 | 
 86 | Things to think about: how would you "extend" your function to treat some additional Excel columns, say through column "dz" (check out the **paste** function, you could use 
 87 | it along with **letters** to construct   vectors containing additional column names and concatenate them together).  You could do this with several lines of 
 88 | code to get the vector of "a" through "dz" (a through z; aa through az, ba through bz; ca through cz; da through dz; then concatenate them into 1 vector).
 89 | To be able to find the column number for many more Excel columns you would want to use a for loop to construct and concatenate blocks of column names. 
 90 | 
 91 | How would you extend your function to easily deal with upper case letters or a "mix" of upper and lower case letters, 
 92 | as in the column name "Cz" (some people like me are sloppy typists, or will forget about a lower case restriction and use capital letters as Excel does).  
 93 | Now is the time to make use of Google to search for a built in R function that will convert any upper case letters in a character variable to lower case (and leave other characters as is).
 94 | 
 95 | How would you extend your function to treat a vector (of length > 1) of column names?
 96 | 
 97 | A working version of this function is given below, but try to do your own function before looking.
 98 | 
 99 | ### colnameToNumber function
100 | 
101 | ```{r}
colnameToNumber <- function(colname) {
# Convert an Excel column name to the corresponding column number.
#
# Argument:
#   colname  a single column name between "a" and "dz", for example
#            "k" or "dz"; upper case, lower case or a mixture of the
#            two is accepted, so "Cz", "cZ" and "cz" all give 104
# Returns:
#   the column number as a length 1 integer vector (1 for "a",
#   26 for "z", 27 for "aa", ... , 130 for "dz")
# Errors:
#   stops with "no match to colname" if colname is not one of the
#   130 column names treated here (for example "zz" or "a1"), and
#   stops if colname does not contain exactly 1 entry

# colname has to be exactly 1 column name: comparing a longer vector
# against the column names would silently recycle it, and an input
# such as c("a", "c") would then return 1 instead of reporting a problem
    if (length(colname) != 1) {
        stop("colname must contain exactly 1 column name")
    }

    colname <- tolower(colname)
# so only need to deal with lower case

# arrange to treat column names from "a" up through "dz":
# the 26 single letters followed by the 4 blocks "aa" to "az",
# "ba" to "bz", "ca" to "cz" and "da" to "dz"
    az <- letters  # c("a", ... , "z")
    firstletter <- rep(c("a", "b", "c", "d"), each = length(az))
    allnames <- c(az, paste0(firstletter, az))

# get the number corresponding to the input column name;
# match returns the position of colname in allnames, or NA when
# colname is not one of the column names (this includes NA input)
    colnumber <- match(colname, allnames)
    if (is.na(colnumber)) stop("no match to colname")
    return(colnumber)
}
133 | ```
134 | 
135 | Do check runs.
136 | 
137 | ```{r}
138 | colnameToNumber("a")
139 | colnameToNumber("z")
140 | colnameToNumber("aa")
141 | colnameToNumber("az")
142 | colnameToNumber("ba")
143 | colnameToNumber("bz")
144 | colnameToNumber("ca")
145 | colnameToNumber("cz")
146 | colnameToNumber("da")
147 | colnameToNumber("dz")
148 | # colnameToNumber("zz") # should get error message
149 | ```
150 | 
151 | Note because we used the built in **letters** character vector, the calls 
152 | to colnameToNumber above should be sufficient to check that it has been coded correctly. Contrast that 
153 | with the checks that would be needed if we had used if tests for each column name between a and dz.
154 | 
155 | For the last part of this exercise, write a function that uses the colnameToNumber function to
156 | convert a character _vector_ of column names to the corresponding vector of column numbers. One could modify 
157 | the colnameToNumber function to do this using a for loop, but here write a separate function that uses 
158 | the colnameToNumber function (this set up will be used as practice with the **sapply** function later on). 
159 | 
160 | Hints: The output of this function, call the function colvecToNumbers, will be an integer vector, call it colnumbers, 
161 | having the same **length**  as the input character vector, call it colvec. One can initialize colnumbers using the **integer**  
162 | function, and then "fill in" its entries in a for loop using the colnameToNumber function.
163 | 
164 | 
165 | A working version of this function is given below, but try to do your own function before looking.
166 | 
167 | ### colvecToNumbers function
168 | 
169 | ```{r}
colvecToNumbers <- function(colvec) {
# Translate a character vector of Excel column names into the integer
# vector of their column numbers, one entry per name and in the same
# order; every name is converted by colnameToNumber, so an error
# raised there stops this function as well

# input checks: something must have been supplied, and it must be of
# character type (a single string counts, it is a vector of length 1)
    howmany <- length(colvec)
    if (howmany < 1) {
        stop("colvec input to colvecToNumbers is empty")
    }
    if (!is.character(colvec)) {
        stop("colvec input to colvecToNumbers is not character")
    }

# set up the result with its final length, then fill it in
# one column name at a time
    result <- integer(howmany)
    for (k in seq_len(howmany)) {
        result[k] <- colnameToNumber(colvec[k])
    }

    return(result)
}
192 | ```
193 | 
194 | ### check runs for colvecToNumbers
195 | 
196 | ```{r}
197 | colvec <- c("b", "g", "k", "q", "w") # the hospital data relevant columns
198 | colvecToNumbers(colvec)
199 | colvec <- c("a", "aa", "ba", "ca", "da") # start of groups of 26 columns
200 | colvecToNumbers(colvec)
201 | colvec <- c("z", "az", "bz", "cz", "dz") # end of groups of 26 columns
202 | colvecToNumbers(colvec)
203 | ```
204 | 
205 | Agrees with what it should be.
206 | 
207 | Hope this programming exercise was informative and good practice with 
208 | writing a function.
209 | 
210 | Things to look forward to: later in the R Programming course one will learn 
211 | the "apply family" of powerful built in R functions. Not counting the lines 
212 | that check whether the input value of colvec is valid, one can write a version 
213 | of colvecToNumbers using the one!! line
214 | 
215 | sapply(colvec, colnameToNumber)
216 | 
217 | Nice and concise and only one line needs to be checked for any mistakes - **sapply** does 
218 | the for loop and creates the vector to be returned without any more lines of code needed. 
219 | Fewer lines of code (as long as they are reasonably "readable" - not a birds nest of 
220 | parentheses and brackets) mean fewer chances to make a mistake and fewer places for bugs to hide.
221 |  


--------------------------------------------------------------------------------
/first-R-programming-exercise-column-number-from-Excel-column-letter-markdown-UTF8.md:
--------------------------------------------------------------------------------
  1 | "First programming exercise"
  2 | ----------------------------
  3 | 
  4 | ### Alan Berger Aug 20, 2020 minor edits Jan 18, 2021
  5 | 
  6 | ### additional R code, run the functions, and some text edits Aug 22, 2020
  7 | 
  8 | Introduction
  9 | ------------
 10 | 
 11 | This is the first in a sequence of programming exercises intended to
 12 | fill the gap between learning the correct syntax of basic R commands and
 13 | the programming assignments in the R Programming course in the Johns
 14 | Hopkins University Data Science Specialization on Coursera. In this
 15 | sequence of exercises in "composing" an R function to carry out a
 16 | particular task, the idea is to practice correct use of R constructs and
 17 | built in functions (functions that "come with" the basic R installation),
 18 | while learning how to "put together" a correct sequence of blocks of
 19 | commands that will obtain the desired result.
 20 | 
 21 | In these exercises, there will be a statement of what your function
 22 | should do (what are the input variables and what the function should
 23 | return) and a sequence of "hints". To get the most out of these
 24 | exercises, try to write your function using as few hints as possible.  
 25 | Note there are often several ways to write a function that will obtain
 26 | the correct result. For these exercises the directions and hints may
 27 | point toward a particular approach intended to practice particular
 28 | constructs in R and a particular line of reasoning.  
 29 | There may be an existing R function or package that will do what is
 30 | stated for a given practice exercise, but here (unlike other aspects of
 31 | the R Programming course) the point is to practice formulating a logical
 32 | sequence of steps, with each step a section of code, to obtain a working
 33 | function, not to find an existing solution or a quick solution using a
 34 | more powerful R construct that is better addressed later on.
 35 | 
 36 | Motivation for this exercise
 37 | ----------------------------
 38 | 
 39 | The input data for many functions will be or include a tab delimited
 40 | text file that comes from or is naturally viewed as an Excel
 41 | spreadsheet. For example, the last programming assignment in the R
 42 | Programming course uses a .csv (comma separated values) text file
 43 | appropriately read in using R's read.csv function. It has a total of 46
 44 | columns, only 5 of which are relevant to the assignment (Excel columns
 45 | B, G, K, Q and W). For this and many other situations, it would be
 46 | convenient to have an R function that would take as its input an Excel
 47 | column name (as a character variable) and output the corresponding
 48 | column number.
 49 | 
 50 | ### Instruction for this function
 51 | 
 52 | For the first version of your function, just have it work for a column
 53 | letter between a and z and only take as input a lower case letter. The
 54 | skeleton of your function should "look like"
 55 | 
 56 |     colnameToNumber <- function(colname) {
 57 |     # convert an Excel column name
 58 |     # to the corresponding column number;
 59 |     # colname should be a single lower case letter given as
 60 |     # a character string, for example "a" or "r",
 61 |     # "between" a and z (including a and z)
 62 | 
 63 |     #   your coding lines go here; they must define colnumber
 64 | 
 65 |         return(colnumber)
 66 |     }
 67 | 
 68 | Directions: do NOT use a whole bunch of if statements (yes, one could do
 69 | 26 if statements of the form `if(colname == "a") colnumber = 1` and so
 70 | on, but, among other things, it would be easy to have typographic
 71 | mistakes, and writing all those out would get pretty boring).
 72 | 
 73 | Note that the built in R variable (R object) **letters** is the
 74 | character vector with entries "a", "b", ... , "z" and note the built in
 75 | R function **which** takes as input a logical vector (often defined by
 76 | whether some condition is or is not true) and outputs the integer vector
 77 | of the *indices* of the input vector for which the logical value is
 78 | **TRUE**, so for example `which(1:4 < 3)` is equal to `c(1, 2)`, and
 79 | `which(sqrt(1:4) == 2)` is the single value 4 (since `sqrt(1:4)` is the
 80 | vector c(1, $\sqrt{2}$, $\sqrt{3}$, 2), which is
 81 | `c(1, 1.414214, 1.732051, 2)` to the number of digits printed).
 82 | 
 83 | Your function, when applied to b, g, k, q and w (as character variables)
 84 | should return 2, 7, 11, 17 and 23, respectively, and
 85 | `colnameToNumber("zz")` should "throw an error" along the lines of
 86 | 
 87 | Error in colnameToNumber("zz") : no match to colname
 88 | 
 89 | This comes from the R statement: `stop("no match to colname")` which
 90 | you should have occur in your function if it is the case that the input
 91 | was not a lower case letter between a and z.
 92 | 
 93 | Try writing your program now before going to the additional hints below
 94 | (the more you do "on your own" the faster you will gain skill at
 95 | programming).
 96 | 
 97 | ### Further hints
 98 | 
 99 | You can use the **which** function to "pick out" the index k (an
100 | integer) of the **letters** vector for which `letters[k]` equals
101 | colname, this is the number you want to return.
102 | 
103 | How might you check for whether colname was a letter between "a" and
104 | "z"? What will the **which** function return if colname was not a
105 | "valid" value for the colnameToNumber function as currently constructed
106 | (colname is not equal to any entry of **letters**)? (Try it out in an R
107 | session.)
108 | 
109 | Things to think about: how would you "extend" your function to treat
110 | some additional Excel columns, say through column "dz" (check out the
111 | **paste** function, you could use it along with **letters** to construct
112 | vectors containing additional column names and concatenate them
113 | together). You could do this with several lines of code to get the
114 | vector of "a" through "dz" (a through z; aa through az, ba through bz;
115 | ca through cz; da through dz; then concatenate them into 1 vector). To
116 | be able to find the column number for many more Excel columns you would
117 | want to use a for loop to construct and concatenate blocks of column
118 | names.
119 | 
120 | How would you extend your function to easily deal with upper case
121 | letters or a "mix" of upper and lower case letters, as in the column
122 | name "Cz" (some people like me are sloppy typists, or will forget about
123 | a lower case restriction and use capital letters as Excel does).  
124 | Now is the time to make use of Google to search for a built in R
125 | function that will convert any upper case letters in a character
126 | variable to lower case (and leave other characters as is).
127 | 
128 | How would you extend your function to treat a vector (of length &gt; 1)
129 | of column names?
130 | 
131 | A working version of this function is given below, but try to do your
132 | own function before looking.
133 | 
134 | ### colnameToNumber function
135 | 
136 |     colnameToNumber <- function(colname){
137 |     # convert a single Excel column name ("a" through "dz") to the 
138 |     # corresponding column number (1 through 130);
139 |     # the result does not depend on whether the
140 |     # letter(s) in colname are upper or
141 |     # lower case or a mixture of upper and
142 |     # lower case; stops with an error if there is no (unique) match
143 |     # colname should be a character string such as "k" or "dz"
144 | 
145 |         colname <- tolower(colname)
146 |     # so from here on only lower case needs to be dealt with
147 | 
148 |     # arrange to treat column names from "a" up through "dz"
149 |         az <- letters  # c("a", ... , "z")
150 |         aaz <- paste("a", az, sep = "")  
151 |     # c("aa", "ab", ... , "az")
152 |     # additional column names: "ba" to "bz", "ca" to "cz", "da" to "dz"
153 |         baz <- paste("b", az, sep = "")
154 |         caz <- paste("c", az, sep = "")
155 |         daz <- paste("d", az, sep = "")
156 | 
157 |     # concatenate the column names; position in this vector = column number
158 |         colnames <- c(az, aaz, baz, caz, daz)
159 | 
160 |     # get the number corresponding to the input column name
161 |     # using the which function (it returns integer(0) when nothing matches)
162 |         colnumber <- which(colnames == colname)
163 |     # test for having found a (unique) match; anything else is an error
164 |         if(length(colnumber) != 1) stop("no match to colname")
165 |         return(colnumber)
166 |     }
167 | 
168 | Do check runs.
169 | 
170 |     colnameToNumber("a")
171 | 
172 |     ## [1] 1
173 | 
174 |     colnameToNumber("z")
175 | 
176 |     ## [1] 26
177 | 
178 |     colnameToNumber("aa")
179 | 
180 |     ## [1] 27
181 | 
182 |     colnameToNumber("az")
183 | 
184 |     ## [1] 52
185 | 
186 |     colnameToNumber("ba")
187 | 
188 |     ## [1] 53
189 | 
190 |     colnameToNumber("bz")
191 | 
192 |     ## [1] 78
193 | 
194 |     colnameToNumber("ca")
195 | 
196 |     ## [1] 79
197 | 
198 |     colnameToNumber("cz")
199 | 
200 |     ## [1] 104
201 | 
202 |     colnameToNumber("da")
203 | 
204 |     ## [1] 105
205 | 
206 |     colnameToNumber("dz")
207 | 
208 |     ## [1] 130
209 | 
210 |     # colnameToNumber("zz") # should get error message
211 | 
212 | Note because we used the built in **letters** character vector, the
213 | calls to colnameToNumber above should be sufficient to check that it has
214 | been coded correctly. Contrast that with the checks that would be needed
215 | if we had used if tests for each column name between a and dz.
216 | 
217 | For the last part of this exercise, write a function that uses the
218 | colnameToNumber function to convert a character *vector* of column names
219 | to the corresponding vector of column numbers. One could modify the
220 | colnameToNumber function to do this using a for loop, but here write a
221 | separate function that uses the colnameToNumber function (this set up
222 | will be used as practice with the **sapply** function later on).
223 | 
224 | Hints: The output of this function, call the function colvecToNumbers
225 | will be an integer vector, call it colnumbers, having the same
226 | **length** as the input character vector, call it colvec. One can
227 | initialize colnumbers using the **integer**  
228 | function, and then "fill in" its entries in a for loop using the
229 | colnameToNumber function.
230 | 
231 | A working version of this function is given below, but try to do your
232 | own function before looking.
233 | 
234 | ### colvecToNumbers function
235 | 
236 |     colvecToNumbers <- function(colvec){
237 |     # convert a character vector of Excel column names
238 |     # to the integer vector of corresponding column numbers
239 |     # each entry of colvec should be a column name between a and dz;
240 |     # an entry with no match makes colnameToNumber stop with an error
241 |     # check the input is a non-empty character vector
242 |     # note a single character string is a character vector of length 1
243 |     if(length(colvec) < 1) stop("colvec input to colvecToNumbers is empty")
244 |     if(!is.character(colvec)) stop("colvec input to colvecToNumbers is not character")
245 | 
246 |     # create the integer vector to be output (same length as colvec)
247 |         n <- length(colvec)
248 |         colnumbers <- integer(n)
249 | 
250 |     # use colnameToNumber to get each entry of colnumbers; 1:n is safe since n >= 1 here
251 |         for (i in 1:n) {
252 |             colname <- colvec[i]
253 |             colnumbers[i] <- colnameToNumber(colname) 
254 |         }
255 | 
256 |     return(colnumbers)
257 |     }
258 | 
259 | ### check runs for colvecToNumbers
260 | 
261 |     colvec <- c("b", "g", "k", "q", "w") # the hospital data relevant columns
262 |     colvecToNumbers(colvec)
263 | 
264 |     ## [1]  2  7 11 17 23
265 | 
266 |     colvec <- c("a", "aa", "ba", "ca", "da") # start of groups of 26 columns
267 |     colvecToNumbers(colvec)
268 | 
269 |     ## [1]   1  27  53  79 105
270 | 
271 |     colvec <- c("z", "az", "bz", "cz", "dz") # end of groups of 26 columns
272 |     colvecToNumbers(colvec)
273 | 
274 |     ## [1]  26  52  78 104 130
275 | 
276 | Agrees with what it should be.
277 | 
278 | Hope this programming exercise was informative and good practice with
279 | writing a function.
280 | 
281 | Things to look forward to: later in the R Programming course one will
282 | learn the "apply family" of powerful built in R functions. Not counting
283 | the lines that check whether the input value of colvec is valid, one can
284 | write a version of colvecToNumbers using the one!! line
285 | 
286 | sapply(colvec, colnameToNumber)
287 | 
288 | Nice and concise, and only one line needs to be checked for any mistakes:
289 | **sapply** does the for loop and creates the vector to be returned
290 | without any more lines of code needed. Fewer lines of code (as long as
291 | they are reasonably "readable" - not a bird's nest of parentheses and
292 | brackets) mean fewer chances to make a mistake and fewer places for bugs
293 | to hide.
294 | 


--------------------------------------------------------------------------------
/Fourth-R-programming-exercise-find-prime-integers-less-than-or-equal-N.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | output:
  3 |   md_document:
  4 |     variant: markdown_github
  5 | # output: pdf_document
  6 | ---
  7 | 
  8 | 
  9 | ## "Fourth R programming exercise find prime integers less than or equal N"
 10 | 
 11 | ### Alan E. Berger  November 22, 2020
 12 | 
 13 | ### version 1
 14 | 
 15 | ### available at https://github.com/AlanBerger/Practice-programming-exercises-for-R
 16 | 
 17 | ## Finish the construction of a function to return all the prime numbers between 1 and a positive integer N 
 18 | 
 19 | ## Introduction
 20 | 
 21 | This is the fourth in a sequence of programming exercises in "composing" an R function 
 22 | to carry out a particular task. The idea is to practice correct use of R constructs and 
 23 | built in functions (functions that "come with" the basic R installation), while learning how 
 24 | to "put together" a correct sequence of blocks of commands that will obtain the desired result.  
 25 | Note these exercises are quite cumulative - one should do them in order. 
 26 | 
 27 | In these exercises, there will be a statement of what your function should do 
 28 | (what are the input variables and what the function should return) and a sequence of "hints". 
 29 | To get the most out of these exercises, try to write your function using as few hints as possible.  
 30 | Note there are often several ways to write a function that will obtain the correct result. 
 31 | For these exercises the directions and hints may point toward a particular approach intended to 
 32 | practice particular constructs in R and a particular line of reasoning, 
 33 | even if there is a more efficient way to obtain the same result.  
 34 | There may also be an existing R function or package that will do what is stated for a given 
 35 | practice exercise, but here the point is to practice formulating a logical sequence of steps, 
 36 | with each step a section of code, to obtain a working function, not to find an existing 
 37 | solution or a quick solution using a more powerful R construct that is better addressed later on.
 38 | 
 39 | ## Motivation for this exercise
 40 | 
 41 | For this exercise, we will finish constructing the function getPrimeNumbers(N = 1000) which will 
 42 | return all the prime numbers between 1 and the positive integer N. We will use the isItPrime(n) function 
 43 | constructed in the previous exercise, which tests whether the positive integer n is a prime number. 
 44 | This illustrates construction of a function in several steps and in a modular fashion, allowing 
 45 | for flexibility and easier testing and debugging. 
 46 | 
 47 | ## Background
 48 | 
 49 | Recall the definitions and results about prime numbers from the previous exercise:  
 50 | A positive integer q **evenly divides** a positive integer n if there is a positive 
 51 | integer k such that n = k * q, for example 3 evenly divides 15; 6 evenly divides 24; but 4 does not evenly 
 52 | divide 9 (in integer arithmetic, since 9 = 2 * 4 with a **remainder** of 1).  
 53 | R provides the **mod** function **%%** such that n %% q gives the remainder **r** from integer 
 54 | dividing n by q (also phrased as **n equals r mod q**). So "q evenly divides n" is equivalent to `n %% q == 0` in R.
 55 | 
 56 | A positive integer p is called **prime** if p > 1 and the only positive integers that evenly divide p are 1 and p 
 57 | (so the first several prime numbers are 2, 3, 5, 7, 11, 13). In the previous exercise we used the mod function to 
 58 | construct the isItPrime(n) function. 
 59 | 
 60 | The function to be constructed is **getPrimeNumbers**, whose argument N is to be 
 61 | a positive integer greater than 1, and which should return, in a vector, call it for example primes_up_to_N, 
 62 | all the prime numbers between 2 and N (including 2, and if N is a prime number, N). 
 63 |  
 64 | ## Instructions for constructing **getPrimeNumbers**
 65 | 
 66 | In the previous exercise we constructed **isItPrime(n)** whose argument is a positive integer n that is 
 67 | at most 1,000,000 (just to avoid accidentally starting an extremely time consuming calculation) 
 68 | and which will return TRUE if n is a prime and FALSE otherwise. This is a copy of isItPrime, the same 
 69 | as in the previous exercise, except that here I have commented out the check for n being too large since that will 
 70 | be done in getPrimeNumbers:
 71 | 
 72 | ``` {r}
 73 | isItPrime <- function(n) {
 74 | # determine whether the positive integer n is prime
 75 | # using the mod function, Version 2; returns TRUE or FALSE
 76 | 
 77 | # check that the function argument is "admissible"
 78 | # test that n is a positive integer (or a real number that equals a positive integer)
 79 | n.int <- as.integer(n) 
 80 | # if n was a real number such as 3.2 then n.int will be n truncated 
 81 | # to an integer (for this example, 3)
 82 | # NOTE(review): for n >= 2^31 as.integer() gives NA (with a warning), so the next test errors
 83 | if(!(n.int == n)) stop("n is not an integer")
 84 | if(n < 1) stop("n is not positive")
 85 | 
 86 | # stop if n is "too large" to avoid a very long calculation
 87 | # if(n > 1000000) stop("n is > a million") 
 88 | 
 89 | # code to test if n is prime using R's mod function %%
 90 | # return TRUE or FALSE
 91 | 
 92 | if(n.int == 1) return(FALSE)
 93 | if(n.int == 2) return(TRUE)
 94 | # if got to here, n is at least 3
 95 | # test if an integer between 2 and sqrt(n) + 1 evenly divides n
 96 | # (for n >= 3, lastq is at least 2 and less than n, so n itself is never tested)
 97 | lastq <- as.integer(sqrt(n)) + 1L  
 98 | # the L in 1L "tells" R to treat 1 as an 
 99 | # integer value rather than a real (numeric) value
100 | # this could also have equivalently been done by 
101 | # lastq <- as.integer(sqrt(n) + 1)  
102 | for (q in 2:lastq) {
103 |    if((n %% q) == 0) return(FALSE)
104 |    }
105 | 
106 | # if got to here, no q in 2:lastq evenly divides n, so n is prime
107 | return(TRUE)
108 | }
109 | ```
110 | 
111 | Use a for loop and use isItPrime(n) to test each positive integer n between 2 and N to see if it is prime.
112 | Return the integers that are found to be prime in a vector called, for example, primes_up_to_N  
113 | 
114 | For the first version of getPrimeNumbers, use the following simple construction to obtain primes_up_to_N:
115 | initialize primes_up_to_N  to be integer(0), then in a for loop whose index, call it n, runs from 2L to N, use 
116 | isItPrime to test if n is a prime. If n is prime, append n to primes_up_to_N via the statement 
117 | 
118 | primes_up_to_N <- c(primes_up_to_N, n)
119 | 
120 | Try writing getPrimeNumbers now.
121 | 
122 | If you do getPrimeNumbers(N = 111) you should get
123 | 
124 | ```
125 | getPrimeNumbers(111)
126 |  [1]   2   3   5   7  11  13  17  19  23  29  31  37  41  43  47
127 | [16]  53  59  61  67  71  73  79  83  89  97 101 103 107 109
128 | ```
129 | 
130 | The number of values printed on each line in an R session depends on the width of the R console window.
131 | 
132 | A working version of getPrimeNumbers follows:
133 | 
134 | ``` {r}
135 | getPrimeNumbers <- function(N) {
136 | # N should be a positive integer that is at least 2 and at most 1,000,000
137 | # return a vector containing all the prime numbers between 2 and N
138 | # (including 2 and including N if N is a prime)
139 | 
140 | # check that the function argument is "admissible"
141 | # test that N is a positive integer (or a real number that equals a positive integer)
142 | N.int <- as.integer(N) 
143 | # if N was a real number such as 3.2 then N.int will be N truncated 
144 | # to an integer (for this example, 3)
145 | # NOTE(review): for N >= 2^31 as.integer() gives NA (with a warning), so the next test errors
146 | if(!(N.int == N)) stop("N is not an integer")
147 | if(N < 2) stop("N is not at least 2")
148 | 
149 | # stop if N is "too large" to avoid a very long calculation
150 | if(N > 1000000) stop("N is > a million") 
151 | 
152 | # initialize primes_up_to_N as an empty integer vector; primes get appended to it
153 | primes_up_to_N <- integer(0)
154 | 
155 | for (n in 2L:N.int) {
156 |    if(isItPrime(n)) {
157 |       primes_up_to_N <- c(primes_up_to_N, n)
158 |    }
159 | }
160 | 
161 | return(primes_up_to_N)
162 | }
163 | ```
164 | 
165 | ## Using a **running index** with a preset vector to obtain primes_up_to_N
166 |   
167 | In the next version of getPrimeNumbers, instead of doing
168 | 
169 | primes_up_to_N <- c(primes_up_to_N, n)
170 | 
171 | to "accumulate" the prime numbers in a vector, you are to initialize the integer vector   
172 | primes_up_to_N to be of length N to contain the prime numbers between 2 and N. Obviously this
173 | vector will generally be larger than needed, but we can place each prime number as it is found into successive 
174 | entries of primes_up_to_N using a **running index**, call it k. How this works is one initializes k to 
175 | 0 and then each time inside the for loop an integer n is found to be prime, one increases k by 1 and 
176 | then sets `primes_up_to_N[k] <- n`. When the for loop is completed, k will be the number of primes that 
177 | were found between 2 and N, and so one then "trims" primes_up_to_N by doing 
178 | 
179 | primes_up_to_N <- primes_up_to_N[1:k]
180 | 
181 | This takes more initial storage space, but is "cleaner" than successively creating new vectors by
182 | doing primes_up_to_N <- c(primes_up_to_N, n) and is a technique one should be familiar with.
183 | 
184 | Try writing a version of getPrimeNumbers that uses a predefined primes_up_to_N integer vector 
185 | (of length N) and a running index to fill in its entries, and then trim it to the correct length 
186 | before returning it. A working version is given below. 
187 | 
188 | ``` {r}
189 | getPrimeNumbers <- function(N) {
190 | # N should be a positive integer that is at least 2 and at most 1,000,000
191 | # return a vector containing all the prime numbers between 2 and N
192 | # (including 2 and including N if N is a prime)
193 | 
194 | # for this version use a predefined integer vector primes_up_to_N of
195 | # length N and a running index k to fill in entries, and then trim it
196 | # after the for loop is completed
197 | 
198 | # check that the function argument is "admissible"
199 | # test that N is a positive integer (or a real number that equals a positive integer)
200 | N.int <- as.integer(N) 
201 | # if N was a real number such as 3.2 then N.int will be N truncated 
202 | # to an integer (for this example, 3)
203 | 
204 | if(!(N.int == N)) stop("N is not an integer")
205 | if(N < 2) stop("N is not at least 2")
206 | 
207 | # if N is "too large" (> 1,000,000) then print N and stop with an error
208 | if(N > 1000000) {
209 |    cat("N = ", N, "\n")  # print N and also include going to a new output line 
210 |    stop("N is > a million") 
211 | }
212 | 
213 | # initialize primes_up_to_N with N zeros (more than enough room for the primes)
214 | primes_up_to_N <- integer(N)
215 | k <- 0  # the running index = number of primes found so far
216 | 
217 | for (n in 2L:N.int) {
218 |    if(isItPrime(n)) {
219 |       k <- k + 1  # get next location in primes_up_to_N 
220 |       primes_up_to_N[k] <- n
221 |    }
222 | }
223 | 
224 | primes_up_to_N <- primes_up_to_N[1:k]  # trim to correct length (k >= 1 since 2 is always found)
225 | return(primes_up_to_N)
226 | }
227 | 
228 | # do a test run
229 | getPrimeNumbers(111)
230 | ```
231 | 
232 | On my computer the latter version of getPrimeNumbers runs a bit faster than the 
233 | former version (for N = 1000000 the former takes about 18 seconds and the latter 12 seconds).
234 | 
235 | 
236 | ## Using the **readline** function to let the user decide whether to continue a run if N > a million 
237 | 
238 | 
239 | The next version of getPrimeNumbers is the same as the one immediately above except that 
240 | instead of stopping with an error if N is > 1,000,000 this version asks the user to decide whether or not to 
241 | continue with running the function by replying "yes" or "no" using the **readline** function if
242 | N is > 1,000,000 as illustrated below.
243 | 
244 | ``` {r}
245 | getPrimeNumbers <- function(N) {
246 | # N should be a positive integer that is at least 2
247 | # return a vector containing all the prime numbers between 2 and N
248 | # (including 2 and including N if N is a prime)
249 | 
250 | # for this version use a predefined integer vector primes_up_to_N of
251 | # length N and a running index k to fill in entries, and then trim it
252 | # after the for loop is completed
253 | 
254 | # check that the function argument is "admissible"
255 | # test that N is a positive integer (or a real number that equals a positive integer)
256 | N.int <- as.integer(N) 
257 | # if N was a real number such as 3.2 then N.int will be N truncated 
258 | # to an integer (for this example, 3)
259 | 
260 | if(!(N.int == N)) stop("N is not an integer")
261 | if(N < 2) stop("N is not at least 2")
262 | 
263 | # any reply other than exactly "yes" exits early, returning a message string instead of primes
264 | # if N is "large" (> 1,000,000) check with the user to see if the user wants to proceed
265 | if(N > 1000000) {
266 |    cat("N = ", N, "\n")  # print N and also include going to a new output line 
267 |    yes.or.no <- readline("this N is large, do you want to continue, type yes or no: ")
268 |    if(yes.or.no != "yes") return("N was large so exited getPrimeNumbers")
269 | }
270 | 
271 | 
272 | # initialize primes_up_to_N with N zeros (more than enough room for the primes)
273 | primes_up_to_N <- integer(N)
274 | k <- 0  # the running index = number of primes found so far
275 | 
276 | for (n in 2L:N.int) {
277 |    if(isItPrime(n)) {
278 |       k <- k + 1  # get next location in primes_up_to_N 
279 |       primes_up_to_N[k] <- n
280 |    }
281 | }
282 | 
283 | primes_up_to_N <- primes_up_to_N[1:k]  # trim to correct length (k >= 1 since 2 is always found)
284 | return(primes_up_to_N)
285 | }
286 | 
287 | # do a test run
288 | getPrimeNumbers(111)
289 | 
290 | # do a second test run
291 | primes.for.N.equal.a.million <- getPrimeNumbers(1000000)
292 | length(primes.for.N.equal.a.million) # should be 78498
293 | primes.for.N.equal.a.million[1000] # should be 7919
294 | primes.for.N.equal.a.million[10000] # should be 104729
295 | tail(primes.for.N.equal.a.million) # the last value should be 999983
296 | ```
297 | 
298 | Hope this programming exercise was informative and good practice.
299 | The next set of exercises will get into using data frames.
300 | 
301 | = = = = = = = = = = = = = = = = = = = = = = = = 
302 | 
303 | This work is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. 
304 | To view a copy of this license, visit https://creativecommons.org/licenses/by-nc-sa/4.0/ or send a letter 
305 | to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA. There is a full version of this license at this web site: 
306 | https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode      
307 | 


--------------------------------------------------------------------------------
/second-R-programming-exercise-if-else-if-else-syntax-and-logic.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | output: md_document
  3 | ---
  4 | 
  5 | ## "Second programming exercise if, else if, else syntax and logic"
  6 | 
  7 | ### Alan Berger  Aug 25, 2020  minor edits Jan 18, 2021
  8 | 
  9 | ### version 1
 10 | 
 11 | ## Introduction
 12 | 
 13 | This is the second in a sequence of programming exercises intended to fill the gap between learning the correct 
 14 | syntax of basic R commands and the programming assignments in the R Programming course in the Johns Hopkins University 
 15 | Data Science Specialization on Coursera. In this sequence of exercises in "composing" an R function to carry out a particular task, 
 16 | the idea is to practice correct use of R constructs and built in functions (functions that "come with" the basic R installation), while 
 17 | learning how to "put together" a correct sequence of blocks of commands that will obtain the desired result.  
 18 | Note these exercises are quite cumulative - one should do them in order. 
 19 | 
 20 | In these exercises, there will be a statement of what your function should do (what are the input variables and what the function 
 21 | should return) and a sequence of "hints". To get the most out of these exercises, 
 22 | try to write your function using as few hints as possible.  
 23 | Note there are often several ways to write a function that will obtain the correct result. For these exercises the directions and 
 24 | hints may point toward a particular approach intended to practice particular constructs in R and a particular line of reasoning.  
 25 | There may be an existing R function or package that will do what is stated for a given practice exercise, but here 
 26 | (unlike other aspects of the R Programming course) the point is to practice formulating a logical sequence of steps, 
 27 | with each step a section of code, to obtain a working function, not to find an existing solution or a quick solution using a 
 28 | more powerful R construct that is better addressed later on.
 29 | 
 30 | ## Motivation for this exercise
 31 | 
 32 | If statements are a basic construct for determining which commands or blocks of commands should be executed. 
 33 | The specific function for this exercise is described below.
 34 | 
 35 | ## Some if, else if, else templates
 36 | 
 37 | Below are templates for common if statement constructs.  Note it is helpful to use indentation of lines of code and
 38 | blank spaces between sections of code as illustrated below.  The number of spaces of indentation is somewhat 
 39 | a personal choice, balancing making code easy to read by having separate blocks of code stand out by having
 40 | more spaces in indentations, with not having too many R commands extend over more than 1 line 
 41 | (and in general, don't go over 80 characters in a line).
 42 | The motivation is that code that is easier to read is easier to proofread and spot bugs in.
 43 | 
 44 | ```
 45 | if (condition1) short.code  # short.code is a short R command 
 46 | # condition1 (and below, condition2, condition3) is a logical statement evaluating to TRUE or FALSE
 47 | # when condition1 is true, execute the short.code statement
 48 | 
 49 | 
 50 | 
 51 | if (condition1) { 
 52 |     code1  # where code1 (and below, code2, code3, code4) stands for one or more lines of code
 53 | }
 54 | # when condition1 is true, execute code1
 55 | 
 56 | 
 57 | 
 58 | if (condition1) { 
 59 |     code1
 60 |     } else {
 61 |     code2
 62 | } 
 63 | # when condition1 is TRUE, then the line(s) of code1 are executed,
 64 | # otherwise the line(s) of code2 are executed
 65 | 
 66 | 
 67 | 
 68 | if (condition1) { 
 69 |     code1
 70 |     } else if (condition2) {
 71 |     code2
 72 | }
 73 | # when condition1 is TRUE, code1 is executed
 74 | # when condition1 is FALSE, then code2 will be executed if condition2 is TRUE;
 75 | # when neither condition is TRUE, neither code1 nor code2 get executed, and
 76 | # R "proceeds" to the next line of code
 77 | 
 78 | 
 79 | 
 80 | # The full range of possibilities within one if, else if, else block are
 81 | # illustrated here
 82 | if (condition1) { 
 83 |     code1
 84 |     } else if (condition2) {
 85 |     code2
 86 |     } else if (condition3) {
 87 |     code3
 88 |     } else {
 89 |     code4
 90 | }
 91 | # there can be any (reasonable) number of "else ifs"
 92 | 
 93 | 
 94 | # Another quick way to use a sequence of if tests when there are only
 95 | # a few cases is shown in this example, of getting the rgb (red, green, blue)
 96 | # color scale values when there are only a couple possible color names.
 97 | # This is, if nothing else, code that is easy to read/understand
 98 | 
 99 | rgbvec <- 0 # when, after the if statements, rgbvec is still 0, throw an error
100 | if(color == "Magenta") rgbvec <- c(255, 0, 255)
101 | if(color == "ForestGreen") rgbvec <- c(34, 139, 34)
102 | if(color == "Cyan") rgbvec <- c(0, 255, 255)
103 | # test that color matched one of the above choices
104 | if(identical(rgbvec, 0)) stop("color did not match one of the choices")
105 | 
106 | # This is just an example, for "real" use for getting rgb colors one would want to have a data frame
107 | # with this information (many color names and corresponding rgb values) 
108 | # and extract the rgb values for a given color from it.
109 | # The color information above came from
110 | # https://en.wikipedia.org/wiki/Web_colors
111 | # side note: there are web sites for checking how a color figure would appear
112 | # to people with various types of color blindness, for example
113 | # https://www.color-blindness.com/coblis-color-blindness-simulator/
114 | ```
115 | 
116 | In many cases in a block of if, else if, else statements the conditions will be
117 | mutually exclusive (at most one of the conditions will be TRUE), and (when there is an else statement)
118 | one of the code blocks will certainly be executed.
119 | The range of values for which the conditions are TRUE can also be increasing or decreasing 
120 | sets ("nested" sets) if one orders the sequence of if tests appropriately as
121 | in the exercise program below.
122 | 
123 | 
124 | ### Instruction for the exercise function using an if, else if, else block
125 | 
126 | Constructing the function specified below is an exercise in using an if, else if, else block.
127 | Yes, one could do this by simply using the **signif** or **format** function, but the point here 
128 | is to practice if logic.
129 | 
130 | We are given a p-value, denoted by pval, (a number between 0 and 1) and we want to convert it 
131 | to a value having 3 significant digits (specifically, we want the corresponding character string). 
132 | For example, each of these values has 3 significant digits (for a number < 1, "leading 0's" to the right of the 
133 | decimal point before one encounters a non-zero digit "don't count" toward the number of significant digits):
134 | 0.123, 0.0123, 0.00123, 0.000123, 0.0000123, 0.00000123
135 | 
136 | We would like to do this since for a p-value such as 0.012345678, in most circumstances digits beyond 
137 | 0.0123 are just "clutter" since they would rarely matter, and also often wouldn't be justified 
138 | due to limited accuracy of the data. 
139 | An aside: one reason to keep many digits would be to check code by comparing results   
140 | with an independent calculation or "known" value. 
141 |       
142 | Background note (but not necessary for writing this function): p-values often come from some statistical 
143 | test, such as on whether measured values from (independent random) samples from two groups 
144 | indicate that the two groups have different group means (for what was being measured). 
145 | The pval is then (for this example) the probability that one 
146 | would have seen, just by random chance, a difference in group means as large in magnitude or larger 
147 | than the difference one actually observed, if the "truth" was that there was no difference between 
148 | the two groups (in what was being measured). 
149 | 
150 | The bottom line is that for this exercise we want to convert pval to a character string having 3 significant digits.  
151 | 
152 | Again, one could do this by simply using the **signif** or **format** function, but the point here 
153 | is to practice if logic. So you are to use the **round** function which takes as input a number (or vector) and 
154 | "rounds" the input value(s) to have only a specified number of digits appearing to the right of the decimal point.
155 | 
156 | The lines below illustrate the behavior of the **round** function, round(numeric.value, digits), which specifies 
157 | the number of digits to the right of the decimal point 
158 | in the returned value.
159 | 
160 | 
161 | ```
162 | > round(1.123, digits = 3)
163 | [1] 1.123
164 | > round(0.123, digits = 3)
165 | [1] 0.123
166 | > round(0.666666667, digits = 3)
167 | [1] 0.667
168 | > round(0.0123, digits = 3)
169 | [1] 0.012
170 | > round(0.0123, digits = 4)
171 | [1] 0.0123
172 | > round(0.00123, digits = 5)
173 | [1] 0.00123
174 | > round(0.000123, digits = 6)
175 | [1] 0.000123
176 | > round(0.0000123, digits = 7)
177 | [1] 1.23e-05
178 | ```
179 | 
180 | Your function, call it pval_To_3Sig_Digits should have the 1 argument pval (a number between 0 and 1, including 0 and 1)
181 | and should return a character string corresponding to pval rounded to 3 significant digits, using one if, else if, else 
182 | block of code (that will have multiple else if statements within it), and 
183 | the round function. If pval is < 0.00001, then return the character string "p < 0.00001"
184 | The **as.character** function will convert a numeric value to the corresponding character string
185 | 
186 | Some hints follow, try programming your function using as few hints as possible.
187 | 
188 | Think about how to order the conditions in the if, else if, else statements. Start by treating smaller p-values first and
189 | end with larger p-values. Note more than 1 test within an if, else if, else block of code might be satisfied: in 
190 | that case the "consequence" of the first test that is satisfied will be carried out and then control will pass 
191 | to the first statement after the if, else if, else block of code.
192 | 
193 | Larger hint: the beginning and end of your function should resemble
194 | 
195 | ```
196 | pval_To_3Sig_Digits <- function(pval) {
197 | # the input pval is to be a number between 0 and 1
198 | # use the round function and an if, else if, else block to
199 | # return a character string corresponding to pval rounded to
200 | # 3 significant digits
201 | 
202 | # check pvalue is a number between 0 and 1
203 | if(!is.numeric(pval)) stop("pval is not numeric")
204 | # check pval is between 0 and 1
205 | if(pval < 0 || pval > 1) stop("pval not between 0 and 1")
206 | 
207 | # the start of the if, else if, else block
208 | 
209 | if (pval < 0.00001) { 
210 |     pval.string <- "p < 0.00001"
211 | 
212 | #   note if one has gotten to here, pval must be >= 0.00001 so
213 | #   digits should be 7 to treat values like 0.0000123
214 | #   How small does pval have to be to not get more than
215 | #   3 significant digits from round(pval, digits = 7): this 
216 | #   determines the next else if condition
217 | 
218 |     } else if (pval < 0.0001) {
219 |     pval.string <- as.character(round(pval, digits = 7))
220 | #   if pval was >= 0.0001 then digits = 7 would in general get
221 | #   4 significant digits
222 | #   Hint: the code in the next else if block will use digits = 6,
223 | #   what should the condition be in the next else if so that 
224 | #   digits = 6 does not give more than 3 significant digits
225 | 
226 | 
227 | #   MORE CODE needs to be provided
228 | 
229 | 
230 |     } else {
231 |     pval.string <- as.character(round(pval, digits = 3))
232 | }
233 | 
234 | return(pval.string)
235 | }
236 | ```
237 | 
238 | One could also start by testing if pval >= 0.1 (the case handled with digits = 3) and "work downward" using successively 
239 | smaller values of pval; doing it this way, the else if tests will involve testing whether
240 | pval >= some value
241 | 
242 | ## A working version of pval_To_3Sig_Digits 
243 | 
244 | ```{r}
pval_To_3Sig_Digits <- function(pval) {
  # Convert a p-value to a character string showing 3 significant digits,
  # using the round function and one if, else if, else block.
  #
  # Argument:
  #   pval  a single number between 0 and 1 (0 and 1 are both allowed)
  # Returns:
  #   a character string: "p < 0.00001" when pval is below 0.00001,
  #   otherwise as.character() of pval rounded so that 3 significant
  #   digits remain
  # Stops with an error if pval is not numeric, is not a single
  # non-missing value, or lies outside [0, 1].

  # check pval is numeric
  if (!is.numeric(pval)) stop("pval is not numeric")

  # an if test needs exactly one TRUE/FALSE value, so a missing value (NA,
  # NaN), an empty vector, or a vector of several p-values would otherwise
  # fail below with a message that does not say what is wrong
  if (length(pval) != 1L || is.na(pval)) {
    stop("pval must be a single non-missing number")
  }

  # check pval is between 0 and 1
  if (pval < 0 || pval > 1) stop("pval not between 0 and 1")

  # The if, else if, else block. The tests run from the smallest p-values
  # to the largest, so reaching a given test means every earlier test was
  # FALSE: e.g. at (pval < 0.001) we already know pval >= 0.0001.
  # Each factor of 10 in pval moves the first non-zero digit one place to
  # the left, so one fewer digit after the decimal point is kept.
  if (pval < 0.00001) {
    pval.string <- "p < 0.00001"
  } else if (pval < 0.0001) {
    pval.string <- as.character(round(pval, digits = 7))
  } else if (pval < 0.001) {
    pval.string <- as.character(round(pval, digits = 6))
  } else if (pval < 0.01) {
    pval.string <- as.character(round(pval, digits = 5))
  } else if (pval < 0.1) {
    pval.string <- as.character(round(pval, digits = 4))
  } else {
    pval.string <- as.character(round(pval, digits = 3))
  }

  return(pval.string)
}
274 | ```
275 | 
276 | 
277 | Do some test runs
278 | 
279 | ```{r}
# one call for each branch of the if, else if, else block, plus the
# boundary values 0.00001 and 0
pval_To_3Sig_Digits(1.0)
pval_To_3Sig_Digits(0.123456)
pval_To_3Sig_Digits(0.0123456)
pval_To_3Sig_Digits(0.00123456)
pval_To_3Sig_Digits(0.000123456)
pval_To_3Sig_Digits(0.0000123456)
pval_To_3Sig_Digits(0.00001)
pval_To_3Sig_Digits(0.00000999)
pval_To_3Sig_Digits(0)

# a note just for future reference
# if one doesn't want scientific (exponential) notation
# one can use the options function to change
# "scipen" which is an R system variable (here an integer) governing
# when R will use scientific notation (for small or large numbers)
# options can change various R options for the current R session
getOption("scipen") # the current value (0 unless it has been changed)

# options() returns the previous value(s) of whatever it changes, so save
# that list in order to put back exactly what was there before, rather
# than assuming the previous value was 0
old.options <- options("scipen" = 999) # don't do scientific (exponential) notation
pval_To_3Sig_Digits(0.0000123456)
pval_To_3Sig_Digits(0.00001)
options(old.options)  # restore the value that was in effect before the change
getOption("scipen")  # check it was reset
302 | ```
303 | 
304 | Hope this programming exercise was informative and good practice with 
305 | writing a function with an if block.
306 | 


--------------------------------------------------------------------------------
/Fourth-R-programming-exercise-find-prime-integers-less-than-or-equal-N.md:
--------------------------------------------------------------------------------
  1 | "Fourth R programming exercise find prime integers less than or equal N"
  2 | ------------------------------------------------------------------------
  3 | 
  4 | ### Alan E. Berger November 22, 2020
  5 | 
  6 | ### version 1
  7 | 
  8 | ### available at <https://github.com/AlanBerger/Practice-programming-exercises-for-R>
  9 | 
 10 | Finish the construction of a function to return all the prime numbers between 1 and a positive integer N
 11 | --------------------------------------------------------------------------------------------------------
 12 | 
 13 | Introduction
 14 | ------------
 15 | 
 16 | This is the fourth in a sequence of programming exercises in "composing" an R function to carry out a particular task. The idea is to practice correct use of R constructs and built in functions (functions that "come with" the basic R installation), while learning how to "put together" a correct sequence of blocks of commands that will obtain the desired result.
 17 | Note these exercises are quite cumulative - one should do them in order.
 18 | 
 19 | In these exercises, there will be a statement of what your function should do (what are the input variables and what the function should return) and a sequence of "hints". To get the most out of these exercises, try to write your function using as few hints as possible.
 20 | Note there are often several ways to write a function that will obtain the correct result. For these exercises the directions and hints may point toward a particular approach intended to practice particular constructs in R and a particular line of reasoning, even if there is a more efficient way to obtain the same result.
 21 | There may also be an existing R function or package that will do what is stated for a given practice exercise, but here the point is to practice formulating a logical sequence of steps, with each step a section of code, to obtain a working function, not to find an existing solution or a quick solution using a more powerful R construct that is better addressed later on.
 22 | 
 23 | Motivation for this exercise
 24 | ----------------------------
 25 | 
 26 | For this exercise, we will finish constructing the function getPrimeNumbers(N = 1000) which will return all the prime numbers between 1 and the positive integer N. We will use the isItPrime(n) function constructed in the previous exercise, which tests whether the positive integer n is a prime number. This illustrates construction of a function in several steps and in a modular fashion, allowing for flexibility and easier testing and debugging.
 27 | 
 28 | Background
 29 | ----------
 30 | 
 31 | Recall the definitions and results about prime numbers from the previous exercise:
 32 | A positive integer q **evenly divides** a positive integer n if there is a positive integer k such that n = k \* q, for example 3 evenly divides 15; 6 evenly divides 24; but 4 does not evenly divide 9 (in integer arithmetic, since 9 = 2 \* 4 with a **remainder** of 1).
 33 | R provides the **mod** function **%%** such that n %% q gives the remainder **r** from integer dividing n by q (also phrased as **n equals r mod q**). So q evenly divides n is equivalent to n %% q = 0
 34 | 
 35 | A positive integer p is called **prime** if p &gt; 1 and the only positive integers that evenly divide p are 1 and p (so the first several prime numbers are 2, 3, 5, 7, 11, 13). In the previous exercise we used the mod function to construct the isItPrime(n) function.
 36 | 
 37 | The function to be constructed is **getPrimeNumbers**, whose argument N is to be a positive integer greater than 1, and which should return, in a vector, call it for example primes\_up\_to\_N, all the prime numbers between 2 and N (including 2, and if N is a prime number, N).
 38 | 
 39 | Instructions for constructing **getPrimeNumbers**
 40 | -------------------------------------------------
 41 | 
 42 | In the previous exercise we constructed **isItPrime(n)** whose argument is a positive integer n that is at most 1,000,000 (just to avoid accidentally starting an extremely time consuming calculation) and which will return TRUE if n is a prime and FALSE otherwise. This is a copy of isItPrime, the same as in the previous exercise, except that here I have commented out the check for n being too large since that will be done in getPrimeNumbers:
 43 | 
 44 | ``` r
 45 | isItPrime <- function(n) {
 46 | # determine whether the positive integer n is prime
 47 | # using the mod function, Version 2
 48 | 
 49 | # check that the function argument is "admissible"
 50 | # test that n is a positive integer (or a real number that equals a positive integer)
 51 | n.int <- as.integer(n) 
 52 | # if n was a real number such as 3.2 then n.int will be n truncated 
 53 | # to an integer (for this example, 3)
 54 | 
 55 | if(!(n.int == n)) stop("n is not an integer")
 56 | if(n < 1) stop("n is not positive")
 57 | 
 58 | # stop if n is "too large" to avoid a very long calculation
 59 | # if(n > 1000000) stop("n is > a million") 
 60 | 
 61 | # code to test if n is prime using R's mod function %%
 62 | # return TRUE or FALSE
 63 | 
 64 | if(n.int == 1) return(FALSE)
 65 | if(n.int == 2) return(TRUE)
 66 | # if got to here, n is at least 3
 67 | # test if an integer between 2 and sqrt(n) + 1 evenly divides n
 68 | 
 69 | lastq <- as.integer(sqrt(n)) + 1L  
 70 | # the L in 1L "tells" R is treat 1 as an 
 71 | # integer value rather than a real (numeric) value
 72 | # this could also have equivalently been done by 
 73 | # lastq <- as.integer(sqrt(n) + 1)  
 74 | for (q in 2:lastq) {
 75 |    if((n %% q) == 0) return(FALSE)
 76 |    }
 77 | 
 78 | # if got to here, n is prime
 79 | return(TRUE)
 80 | }
 81 | ```
 82 | 
 83 | Use a for loop and use isItPrime(n) to test each positive integer n between 2 and N to see if it is prime. Return the integers that are found to be prime in a vector called, for example, primes\_up\_to\_N
 84 | 
 85 | For the first version of getPrimeNumbers, use the following simple construction to obtain primes\_up\_to\_N: initialize primes\_up\_to\_N to be integer(0), then in a for loop whose index, call it n, runs from 2L to N, use isItPrime to test if n is a prime. If n is prime, append n to primes\_up\_to\_N via the statement
 86 | 
 87 | primes\_up\_to\_N &lt;- c(primes\_up\_to\_N, n)
 88 | 
 89 | Try writing getPrimeNumbers now.
 90 | 
 91 | If you do getPrimeNumbers(N = 111) you should get
 92 | 
 93 |     getPrimeNumbers(111)
 94 |      [1]   2   3   5   7  11  13  17  19  23  29  31  37  41  43  47
 95 |     [16]  53  59  61  67  71  73  79  83  89  97 101 103 107 109
 96 | 
 97 | The number of values printed on each line in an R session depends on the width of the R console window.
 98 | 
 99 | A working version of getPrimeNumbers follows:
100 | 
101 | ``` r
getPrimeNumbers <- function(N) {
  # Return, as an integer vector, every prime number p with 2 <= p <= N.
  #
  # Argument:
  #   N  a whole number (integer, or a real number equal to an integer)
  #      that is at least 2 and at most 1,000,000
  # Returns:
  #   the primes between 2 and N in increasing order; N itself is included
  #   when it is prime
  #
  # This first version builds the result by appending each prime that is
  # found to the end of the vector collected so far.

  # ---- check that the argument is "admissible" ----
  # as.integer drops any fractional part (3.2 becomes 3), so the converted
  # value only equals N when N was a whole number to begin with
  last.candidate <- as.integer(N)
  if (last.candidate != N) stop("N is not an integer")
  if (N < 2) stop("N is not at least 2")

  # refuse a "too large" N rather than start a very long calculation
  if (N > 1000000) stop("N is > a million")

  # ---- collect the primes ----
  primes_up_to_N <- integer(0)  # no primes found yet

  for (n in 2L:last.candidate) {
    if (!isItPrime(n)) next  # not a prime, go on to the next candidate
    primes_up_to_N <- c(primes_up_to_N, n)  # append the newly found prime
  }

  return(primes_up_to_N)
}
130 | ```
131 | 
132 | Using a **running index** with a preset vector to obtain primes\_up\_to\_N
133 | --------------------------------------------------------------------------
134 | 
135 | In the next version of getPrimeNumbers, instead of doing
136 | 
137 | primes\_up\_to\_N &lt;- c(primes\_up\_to\_N, n)
138 | 
139 | to "accumulate" the prime numbers in a vector, you are to initialize the integer vector
140 | primes\_up\_to\_N to be of length N to contain the prime numbers between 2 and N. Obviously this vector will generally be larger than needed, but we can place each prime number as it is found into successive entries of primes\_up\_to\_N using a **running index**, call it k. The way this works is that one initializes k to 0 and then each time inside the for loop an integer n is found to be prime, one increases k by 1 and then sets primes\_up\_to\_N\[k\] &lt;- n. When the for loop is completed, k will be the number of primes that were found between 2 and N, and so one then "trims" primes\_up\_to\_N by doing
141 | 
142 | primes\_up\_to\_N &lt;- primes\_up\_to\_N\[1:k\]
143 | 
144 | This takes more initial storage space, but is "cleaner" than successively creating new vectors by doing primes\_up\_to\_N &lt;- c(primes\_up\_to\_N, n) and is a technique one should be familiar with.
145 | 
146 | Try writing a version of getPrimeNumbers that uses a predefined primes\_up\_to\_N integer vector (of length N) and a running index to fill in its entries, and then trim it to the correct length before returning it. A working version is given below.
147 | 
148 | ``` r
getPrimeNumbers <- function(N) {
  # Return, as an integer vector, every prime number p with 2 <= p <= N.
  #
  # Argument:
  #   N  a whole number (integer, or a real number equal to an integer)
  #      that is at least 2 and at most 1,000,000
  # Returns:
  #   the primes between 2 and N in increasing order; N itself is included
  #   when it is prime
  #
  # This version sets up the result vector at its largest possible length
  # (N) before the loop, fills it in using a running index k, and trims
  # off the unused entries once the loop has finished.

  # ---- check that the argument is "admissible" ----
  # as.integer drops any fractional part (3.2 becomes 3), so the converted
  # value only equals N when N was a whole number to begin with
  last.candidate <- as.integer(N)
  if (last.candidate != N) stop("N is not an integer")
  if (N < 2) stop("N is not at least 2")

  # refuse a "too large" N, showing the offending value before stopping
  if (N > 1000000) {
    cat("N = ", N, "\n")  # the "\n" moves the output to a new line
    stop("N is > a million")
  }

  # ---- collect the primes ----
  primes_up_to_N <- integer(last.candidate)  # room for N values, all 0 for now
  k <- 0L  # running index: how many primes have been stored so far

  for (n in 2L:last.candidate) {
    if (!isItPrime(n)) next  # not a prime, go on to the next candidate
    k <- k + 1L              # next free location in primes_up_to_N
    primes_up_to_N[k] <- n
  }

  # keep only the k entries that were filled in; this is the same as
  # primes_up_to_N[1:k] because k is at least 1 (2 is always found)
  return(primes_up_to_N[seq_len(k)])
}
187 | 
188 | # do a test run
189 | getPrimeNumbers(111)
190 | ```
191 | 
192 |     ##  [1]   2   3   5   7  11  13  17  19  23  29  31  37  41  43  47  53  59  61  67
193 |     ## [20]  71  73  79  83  89  97 101 103 107 109
194 | 
195 | On my computer the latter version of getPrimeNumbers runs a bit faster than the former version (for N = 1000000 the former takes about 18 seconds and the latter 12 seconds).
196 | 
197 | Using the **readline** function to let the user decide whether to continue a run if N &gt; a million
198 | ----------------------------------------------------------------------------------------------------
199 | 
200 | The next version of getPrimeNumbers is the same as the one immediately above except that instead of stopping with an error if N is &gt; 1,000,000 this version asks the user to decide whether or not to continue with running the function by replying "yes" or "no" using the **readline** function if N is &gt; 1,000,000 as illustrated below.
201 | 
202 | ``` r
getPrimeNumbers <- function(N) {
  # Return, as an integer vector, every prime number p with 2 <= p <= N.
  #
  # Argument:
  #   N  a whole number (integer, or a real number equal to an integer)
  #      that is at least 2
  # Returns:
  #   the primes between 2 and N in increasing order; N itself is included
  #   when it is prime. When N is > 1,000,000 and the user does not answer
  #   "yes" to the prompt, a character string saying that the function
  #   exited is returned instead.
  #
  # This version sets up the result vector at its largest possible length
  # (N) before the loop, fills it in using a running index k, and trims
  # off the unused entries once the loop has finished.

  # ---- check that the argument is "admissible" ----
  # as.integer drops any fractional part (3.2 becomes 3), so the converted
  # value only equals N when N was a whole number to begin with
  last.candidate <- as.integer(N)
  if (last.candidate != N) stop("N is not an integer")
  if (N < 2) stop("N is not at least 2")

  # ---- a "large" N: let the user decide whether to go ahead ----
  if (N > 1000000) {
    cat("N = ", N, "\n")  # the "\n" moves the output to a new line
    reply <- readline("this N is large, do you want to continue, type yes or no: ")
    # anything other than exactly "yes" is taken as a request to quit
    if (reply != "yes") return("N was large so exited getPrimeNumbers")
  }

  # ---- collect the primes ----
  primes_up_to_N <- integer(last.candidate)  # room for N values, all 0 for now
  k <- 0L  # running index: how many primes have been stored so far

  for (n in 2L:last.candidate) {
    if (!isItPrime(n)) next  # not a prime, go on to the next candidate
    k <- k + 1L              # next free location in primes_up_to_N
    primes_up_to_N[k] <- n
  }

  # keep only the k entries that were filled in; this is the same as
  # primes_up_to_N[1:k] because k is at least 1 (2 is always found)
  return(primes_up_to_N[seq_len(k)])
}
244 | 
245 | # do a test run
246 | getPrimeNumbers(111)
247 | ```
248 | 
249 |     ##  [1]   2   3   5   7  11  13  17  19  23  29  31  37  41  43  47  53  59  61  67
250 |     ## [20]  71  73  79  83  89  97 101 103 107 109
251 | 
252 | ``` r
253 | # do a second test run
254 | primes.for.N.equal.a.million <- getPrimeNumbers(1000000)
255 | length(primes.for.N.equal.a.million) # should be 78498
256 | ```
257 | 
258 |     ## [1] 78498
259 | 
260 | ``` r
261 | primes.for.N.equal.a.million[1000] # should be 7919
262 | ```
263 | 
264 |     ## [1] 7919
265 | 
266 | ``` r
267 | primes.for.N.equal.a.million[10000] # should be 104729
268 | ```
269 | 
270 |     ## [1] 104729
271 | 
272 | ``` r
273 | tail(primes.for.N.equal.a.million) # the last value should be 999983
274 | ```
275 | 
276 |     ## [1] 999931 999953 999959 999961 999979 999983
277 | 
278 | Hope this programming exercise was informative and good practice. The next set of exercises will get into using data frames.
279 | 
280 | = = = = = = = = = = = = = = = = = = = = = = = =
281 | 
282 | This work is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. To view a copy of this license, visit <https://creativecommons.org/licenses/by-nc-sa/4.0/> or send a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA. There is a full version of this license at this web site: <https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode>
283 | 


--------------------------------------------------------------------------------
/second-R-programming-exercise-if-else-if-else-syntax-and-logic.md:
--------------------------------------------------------------------------------
  1 | "Second programming exercise if, else if, else syntax and logic"
  2 | ----------------------------------------------------------------
  3 | 
  4 | ### Alan Berger Aug 25, 2020 minor edits Jan 18, 2021
  5 | 
  6 | ### version 1
  7 | 
  8 | Introduction
  9 | ------------
 10 | 
 11 | This is the second in a sequence of programming exercises intended to
 12 | fill the gap between learning the correct syntax of basic R commands and
 13 | the programming assignments in the R Programming course in the Johns
 14 | Hopkins University Data Science Specialization on Coursera. In this
 15 | sequence of exercises in "composing" an R function to carry out a
 16 | particular task, the idea is to practice correct use of R constructs and
 17 | built in functions (functions that "come with" the basic R installation),
 18 | while learning how to "put together" a correct sequence of blocks of
 19 | commands that will obtain the desired result.  
 20 | Note these exercises are quite cumulative - one should do them in order.
 21 | 
 22 | In these exercises, there will be a statement of what your function
 23 | should do (what are the input variables and what the function should
 24 | return) and a sequence of "hints". To get the most out of these
 25 | exercises, try to write your function using as few hints as possible.  
 26 | Note there are often several ways to write a function that will obtain
 27 | the correct result. For these exercises the directions and hints may
 28 | point toward a particular approach intended to practice particular
 29 | constructs in R and a particular line of reasoning.  
 30 | There may be an existing R function or package that will do what is
 31 | stated for a given practice exercise, but here (unlike other aspects of
 32 | the R Programming course) the point is to practice formulating a logical
 33 | sequence of steps, with each step a section of code, to obtain a working
 34 | function, not to find an existing solution or a quick solution using a
 35 | more powerful R construct that is better addressed later on.
 36 | 
 37 | Motivation for this exercise
 38 | ----------------------------
 39 | 
 40 | If statements are a basic construct for determining which commands or
 41 | blocks of commands should be executed. The specific function for this
 42 | exercise is described below.
 43 | 
 44 | Some if, else if, else templates
 45 | --------------------------------
 46 | 
 47 | Below are templates for common if statement constructs. Note it is
 48 | helpful to use indentation of lines of code and blank spaces between
 49 | sections of code as illustrated below. The number of spaces of
 50 | indentation is somewhat a personal choice, balancing making code easy to
 51 | read by having separate blocks of code stand out by having more spaces
 52 | in indentations, with not having too many R commands extend over more
 53 | than 1 line (and in general, don't go over 80 characters in a line). The
 54 | motivation is that code that is easier to read is easier to proofread
 55 | and spot bugs in.
 56 | 
 57 |     if (condition1) short.code  # short.code is a short R command 
 58 |     # condition1 (and below, condition2, condition3) is a logical statement evaluating to TRUE or FALSE
 59 |     # when condition1 is true, execute the short.code statement
 60 | 
 61 | 
 62 | 
 63 |     if (condition1) { 
 64 |         code1  # where code1 (and below, code2, code3, code4) stands for one or more lines of code
 65 |     }
 66 |     # when condition1 is true, execute code1
 67 | 
 68 | 
 69 | 
 70 |     if (condition1) { 
 71 |         code1
 72 |         } else {
 73 |         code2
 74 |     } 
 75 |     # when condition1 is TRUE, then the line(s) of code1 are executed,
 76 |     # otherwise the line(s) of code2 are executed
 77 | 
 78 | 
 79 | 
 80 |     if (condition1) { 
 81 |         code1
 82 |         } else if (condition2) {
 83 |         code2
 84 |     }
 85 |     # when condition1 is TRUE, code1 is executed
 86 |     # when condition1 is FALSE, then code2 will be executed if condition2 is TRUE;
 87 |     # when neither condition is TRUE, neither code1 nor code2 get executed, and
 88 |     # R "proceeds" to the next line of code
 89 | 
 90 | 
 91 | 
 92 |     # The full range of possibilities within one if, else if, else block are
 93 |     # illustrated here
 94 |     if (condition1) { 
 95 |         code1
 96 |         } else if (condition2) {
 97 |         code2
 98 |         } else if (condition3) {
 99 |         code3
100 |         } else {
101 |         code4
102 |     }
103 |     # there can be any (reasonable) number of "else ifs"
104 | 
105 | 
106 |     # Another quick way to use a sequence of if tests when there are only
107 |     # a few cases is shown in this example, of getting the rgb (red, green, blue)
108 |     # color scale values when there are only a couple possible color names.
109 |     # This is, if nothing else, code that is easy to read/understand
110 | 
111 |     rgbvec <- 0 # when, after the if statements, rgbvec is still 0, throw an error
112 |     if(color == "Magenta") rgbvec <- c(255, 0, 255)
113 |     if(color == "ForestGreen") rgbvec <- c(34, 139, 34)
114 |     if(color == "Cyan") rgbvec <- c(0, 255, 255)
115 |     # test that color matched one of the above choices
116 |     if(identical(rgbvec, 0)) stop("color did not match one of the choices")
117 | 
118 |     # This is just an example, for "real" use for getting rgb colors one would want to have a data frame
119 |     # with this information (many color names and corresponding rgb values) 
120 |     # and extract the rgb values for a given color from it.
121 |     # The color information above came from
122 |     # https://en.wikipedia.org/wiki/Web_colors
123 |     # side note: there are web sites for checking how a color figure would appear
124 |     # to people with various types of color blindness, for example
125 |     # https://www.color-blindness.com/coblis-color-blindness-simulator/
126 | 
127 | In many cases in a block of if, else if, else statements the conditions
128 | will be mutually exclusive (at most one of the conditions will be TRUE),
129 | and (when there is an else statement) one of the code blocks will
130 | certainly be executed. The range of values for which the conditions are
131 | TRUE can also be increasing or decreasing sets ("nested" sets) if one
132 | orders the sequence of if tests appropriately as in the exercise program
133 | below.
134 | 
135 | ### Instruction for the exercise function using an if, else if, else block
136 | 
137 | Constructing the function specified below is an exercise in using an if,
138 | else if, else block. Yes, one could do this by simply using the
139 | **signif** or **format** function, but the point here is to practice if
140 | logic.
141 | 
142 | We are given a p-value, denoted by pval, (a number between 0 and 1) and
143 | we want to convert it to a value having 3 significant digits
144 | (specifically, we want the corresponding character string). For example
145 | each of these values has 3 significant digits (for a number &lt; 1,
146 | "leading 0's" to the right of the decimal point before one encounters a
147 | non-zero digit "don't count" toward the number of significant digits):
148 | 0.123, 0.0123, 0.00123, 0.000123, 0.0000123, 0.00000123
149 | 
150 | We would like to do this since for a p-value such as 0.012345678, in
151 | most circumstances digits beyond 0.0123 are just "clutter" since they
152 | would rarely matter, and also often wouldn't be justified due to limited
153 | accuracy of the data. An aside: one reason to keep many digits would be
154 | to check code by comparing results  
155 | with an independent calculation or "known" value.
156 | 
157 | Background note (but not necessary for writing this function): p-values
158 | often come from some statistical test, such as on whether measured
159 | values from (independent random) samples from two groups indicate that
160 | the two groups have different group means (for what was being measured).
161 | The pval is then (for this example) the probability that one would have
162 | seen, just by random chance, a difference in group means as large in
163 | magnitude or larger than the difference one actually observed, if the
164 | "truth" was that there was no difference between the two groups (in what
165 | was being measured).
166 | 
167 | The bottom line is that for this exercise we want to convert pval to a
168 | character string having 3 significant digits.
169 | 
170 | Again, one could do this by simply using the **signif** or **format**
171 | function, but the point here is to practice if logic. So you are to use
172 | the **round** function which takes as input a number (or vector) and
173 | "rounds" the input value(s) to have only a specified number of digits
174 | appearing to the right of the decimal point.
175 | 
176 | The lines below illustrate the behavior of the **round** function,
177 | round(numeric.value, digits), which specifies the number of digits to
178 | the right of the decimal point in the returned value.
179 | 
180 |     > round(1.123, digits = 3)
181 |     [1] 1.123
182 |     > round(0.123, digits = 3)
183 |     [1] 0.123
184 |     > round(0.666666667, digits = 3)
185 |     [1] 0.667
186 |     > round(0.0123, digits = 3)
187 |     [1] 0.012
188 |     > round(0.0123, digits = 4)
189 |     [1] 0.0123
190 |     > round(0.00123, digits = 5)
191 |     [1] 0.00123
192 |     > round(0.000123, digits = 6)
193 |     [1] 0.000123
194 |     > round(0.0000123, digits = 7)
195 |     [1] 1.23e-05
196 | 
197 | Your function, call it pval\_To\_3Sig\_Digits should have the 1 argument
198 | pval (a number between 0 and 1, including 0 and 1) and should return a
199 | character string corresponding to pval rounded to 3 significant digits,
200 | using one if, else if, else block of code (that will have multiple else
201 | if statements within it), and the round function. If pval is &lt;
202 | 0.00001, then return the character string "p &lt; 0.00001". The
203 | **as.character** function will convert a numeric value to the
204 | corresponding character string.
205 | 
206 | Some hints follow, try programming your function using as few hints as
207 | possible.
208 | 
209 | Think about how to order the conditions in the if, else if, else
210 | statements. Start by treating smaller p-values first and end with larger
211 | p-values. Note more than 1 test within an if, else if, else block of
212 | code might be satisfied: in that case the "consequence" of the first
213 | test that is satisfied will be carried out and then control will pass to
214 | the first statement after the if, else if, else block of code.
215 | 
216 | Larger hint: the beginning and end of your function should resemble
217 | 
218 |     pval_To_3Sig_Digits <- function(pval) {
219 |     # the input pval is to be a number between 0 and 1 (0 and 1 included)
220 |     # use the round function and an if, else if, else block to
221 |     # return a character string corresponding to pval rounded to
222 |     # 3 significant digits (or "p < 0.00001" when pval is below 0.00001)
223 | 
224 |     # check pval is numeric
225 |     if(!is.numeric(pval)) stop("pval is not numeric")
226 |     # check pval is between 0 and 1
227 |     if(pval < 0 || pval > 1) stop("pval not between 0 and 1")
228 | 
229 |     # the start of the if, else if, else block
230 | 
231 |     if (pval < 0.00001) { 
232 |         pval.string <- "p < 0.00001"
233 | 
234 |     #   note if one has gotten to here, pval must be >= 0.00001 so
235 |     #   digits should be 7 to treat values like 0.0000123
236 |     #   How small does pval have to be to not get more than
237 |     #   3 significant digits from round(pval, digits = 7): this 
238 |     #   determines the next else if condition
239 | 
240 |         } else if (pval < 0.0001) {
241 |         pval.string <- as.character(round(pval, digits = 7))
242 |     #   if pval was >= 0.0001 then digits = 7 would in general get
243 |     #   4 or more significant digits
244 |     #   Hint: the code in the next else if block will use digits = 6,
245 |     #   what should the condition be in the next else if so that 
246 |     #   digits = 6 does not give more than 3 significant digits
247 | 
248 | 
249 |     #   MORE CODE needs to be provided
250 | 
251 | 
252 |         } else {
253 |         pval.string <- as.character(round(pval, digits = 3))
254 |     }
255 | 
256 |     return(pval.string)
257 |     }
258 | 
259 | One could also start by testing if pval &gt;= 0.1 and "work downward"
260 | using successively smaller values of pval; doing it this way, the else
261 | if tests will involve testing whether pval &gt;= some value.
262 | 
263 | A working version of pval\_To\_3Sig\_Digits
264 | -------------------------------------------
265 | 
266 |     pval_To_3Sig_Digits <- function(pval) {
267 |     # Input: pval, a number between 0 and 1 (0 and 1 included).
268 |     # Returns: the character string for pval rounded (with the round
269 |     # function) to 3 significant digits, or the string "p < 0.00001"
270 |     # when pval is below 0.00001. Stops with an error on invalid input.
271 | 
272 |     # check pval is numeric
273 |     if(!is.numeric(pval)) stop("pval is not numeric")
274 |     # check pval is between 0 and 1
275 |     if(pval < 0 || pval > 1) stop("pval not between 0 and 1")
276 | 
277 |     # the if, else if, else block: the tests go from small to large pval, so
278 |     # the first test that is TRUE selects the digits value giving 3 significant digits
279 |     if (pval < 0.00001) { 
280 |         pval.string <- "p < 0.00001"
281 |         } else if (pval < 0.0001) {
282 |         pval.string <- as.character(round(pval, digits = 7))
283 |         } else if (pval < 0.001) {
284 |         pval.string <- as.character(round(pval, digits = 6))
285 |         } else if (pval < 0.01) {
286 |         pval.string <- as.character(round(pval, digits = 5))
287 |         } else if (pval < 0.1) {
288 |         pval.string <- as.character(round(pval, digits = 4))
289 |         } else {
290 |         pval.string <- as.character(round(pval, digits = 3))
291 |     }
292 | 
293 |     return(pval.string)
294 |     }
295 | 
296 | Do some test runs
297 | 
298 |     pval_To_3Sig_Digits(1.0)
299 | 
300 |     ## [1] "1"
301 | 
302 |     pval_To_3Sig_Digits(0.123456)
303 | 
304 |     ## [1] "0.123"
305 | 
306 |     pval_To_3Sig_Digits(0.0123456)
307 | 
308 |     ## [1] "0.0123"
309 | 
310 |     pval_To_3Sig_Digits(0.00123456)
311 | 
312 |     ## [1] "0.00123"
313 | 
314 |     pval_To_3Sig_Digits(0.000123456)
315 | 
316 |     ## [1] "0.000123"
317 | 
318 |     pval_To_3Sig_Digits(0.0000123456)
319 | 
320 |     ## [1] "1.23e-05"
321 | 
322 |     pval_To_3Sig_Digits(0.00001)
323 | 
324 |     ## [1] "1e-05"
325 | 
326 |     pval_To_3Sig_Digits(0.00000999)
327 | 
328 |     ## [1] "p < 0.00001"
329 | 
330 |     pval_To_3Sig_Digits(0)
331 | 
332 |     ## [1] "p < 0.00001"
333 | 
334 |     # a note just for future reference:
335 |     # if one doesn't want scientific (exponential) notation,
336 |     # one can use the options function to change "scipen",
337 |     # which is an R system variable (here an integer) governing 
338 |     # when R will use scientific notation (for small or large numbers);
339 |     # options can change various R options for the current R session
340 |     getOption("scipen") # look up the current value (0 here) so it can be restored below
341 | 
342 |     ## [1] 0
343 | 
344 |     options("scipen" = 999) # don't do scientific (exponential) notation
345 |     pval_To_3Sig_Digits(0.0000123456)
346 | 
347 |     ## [1] "0.0000123"
348 | 
349 |     pval_To_3Sig_Digits(0.00001)
350 | 
351 |     ## [1] "0.00001"
352 | 
353 |     options("scipen" = 0)  # reset to the default value we retrieved above
354 |     getOption("scipen")  # check it was reset
355 | 
356 |     ## [1] 0
357 | 
358 | Hope this programming exercise was informative and good practice with
359 | writing a function with an if block.
360 | 


--------------------------------------------------------------------------------
/Tenth-R-Practice-exercise-merging-annotation-data-into-a-gene-expression-analysis-results-data-frame.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 |  output:
  3 |   md_document:
  4 |     variant: markdown_github
  5 | # output: pdf_document
  6 | ---
  7 | 
  8 | ## Tenth R Practice exercise merging annotation data into a gene expression analysis results data frame.Rmd
  9 | 
 10 | ### Alan E. Berger  Feb 17, 2020
 11 | 
 12 | ### available at https://github.com/AlanBerger/Practice-programming-exercises-for-R 
 13 | 
 14 | ## Introduction
 15 | 
 16 | This is the tenth in a sequence of programming exercises in "composing" an R function 
 17 | to carry out a particular task. Several of these "exercise files" likely 
 18 | will take several sessions to master the content.  The material below practices composing a logical
 19 | sequence of steps to program a function that will accomplish a specified task, and 
 20 | preparing a corresponding data frame. 
 21 | 
 22 | The idea of this set of exercises is to practice correct use of R constructs and 
 23 | built in functions (functions that "come with" the basic R installation), while learning how 
 24 | to "put together" a correct sequence of blocks of commands that will obtain the desired result.  
 25 | Note these exercises are quite cumulative - one should do them in order. 
 26 | 
 27 | In these exercises, there will be a statement of what your function should do 
 28 | (what are the input variables and what the function should return) and a sequence of "hints". 
 29 | To get the most out of these exercises, try to write your function using as few hints as possible.  
 30 | Note there are often several ways to write a function that will obtain the correct result. 
 31 | For these exercises the directions and hints may point toward a particular approach intended to 
 32 | practice particular constructs in R and a particular line of reasoning, 
 33 | even if there is a more efficient way to obtain the same result. 
 34 | There may also be an existing R function or package that will do what is stated for a given 
 35 | practice exercise, but here the point is to practice formulating a logical sequence of steps, 
 36 | with each step a section of code, to obtain a working function, not to find an existing 
 37 | solution or a quick solution using a more powerful R construct that is better addressed later on.
 38 | 
 39 | ## Motivation for this exercise
 40 | 
 41 | In some cases, such as with a gene expression data set, one will want to combine analysis results as 
 42 | obtained in the previous exercise with annotation information on the probes and on the genes that is in a 
 43 | separate file that can be read in as a data frame.
 44 | 
 45 | In the R code below we repeat the analysis, done in the previous exercise, of a small subset of gene expression 
 46 | data comparing expression levels in PBMC samples from patients with Wegener's granulomatosis (WG) with samples 
 47 | from normal controls (NC). We then also read in a small subset of the annotation file for the Illumina microarray 
 48 | platform used to measure these expression levels. The web site containing the full expression data set is: 
 49 | https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE18885 
 50 | and the web site containing the full annotation data for the microarray platform used in obtaining this data is: 
 51 | https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GPL6104  
 52 | 
 53 | What we will want to do is, conceptually, for each row **r** of the analysis results data frame, "find" the row **ra** 
 54 | of the annotation data frame that has the same Illumina Probe_ID and in effect append selected columns of row ra 
 55 | from the annotation data frame to row r of the analysis results data frame. R has a function **merge** that will 
 56 | do this, but for the first exercise we will practice using basic R constructs to compose R code that will do this - 
 57 | the second exercise here will use the merge function. First read through the code below, that provides the data frames 
 58 | one will use.
 59 | 
 60 | ``` {r}
 61 | ##############  analyze the gene expression data
 62 | 
 63 | # the url for reading the little gene expression data file into an R data frame using 
 64 | # read.delim  (for reading in tab delimited text files) is assembled in the next 3 lines 
 65 | url.for.data.file <- paste0("https://raw.githubusercontent.com/AlanBerger/",
 66 |     "Practice-programming-exercises-for-R/master/tiny-subset-of-GSE18885-",
 67 |     "gene-expression-data-9-genes-WG-5-samples-Normal-Control-4-samples.tab.txt")
 68 | # paste0 joins the 3 pieces with nothing in between, so the url has no embedded line breaks
 69 | # read in the data as a data frame
 70 | ma <- read.delim(url.for.data.file, nrows = 9, check.names = FALSE, 
 71 |                  stringsAsFactors = FALSE)
 72 | 
 73 | # display ma
 74 | ma
 75 | 
 76 | # now, in a for loop, get the p-values and fold changes 
 77 | num.genes <- nrow(ma)  # the number of genes in this data frame
 78 | 
 79 | gene <- ma$gene  # the column of gene names
 80 | probe.vec <- ma[[1]]  # the column of Illumina Probe_IDs
 81 | # get vectors to hold the p-value and fold change values
 82 | p.value <- numeric(num.genes)
 83 | fold.change <- numeric(num.genes)
 84 | 
 85 | for (i in 1:num.genes) {
 86 | # get the vector for the NC expression values (columns 3 to 6) and the vector 
 87 | # for the WG expression values (columns 7 to 11) for the ith gene
 88 |    NCvec <- unlist(ma[i, 3:6])
 89 |    WGvec <- unlist(ma[i, 7:11])
 90 | # NOTE(review): 2^(difference of means) is a fold change only if the values are log2 - confirm
 91 | # calculate the p-value and fold change
 92 |    pval <- t.test(NCvec, WGvec)$p.value  # two-sided unequal variance (Welch) t-test
 93 |    p.value[i] <- pval 
 94 |    WG.over.NC.fold.change <- 2^(mean(WGvec) - mean(NCvec))
 95 |    fold.change[i] <- WG.over.NC.fold.change  
 96 | }
 97 | 
 98 | # Construct the desired data frame. 
 99 | analysis.results <- data.frame(probe.vec, gene, p.value, fold.change, 
100 |                                stringsAsFactors = FALSE, check.names = FALSE)
101 | colnames(analysis.results) <- c("Illumina PROBE_ID", "gene", "two-sided p-value", 
102 |                                 "WG/NC fold change")
103 | analysis.results 
104 | 
105 | ##############  read in the annotation data file (a small subset of the full annotation)
106 | 
107 | # read in the short edited Illumina microarray annotation data file called
108 | # GPL6104-Illumina-microarray-platform-annotation-from-GEO-repository-
109 | # small-subset-edited-example-Feb12.tab.txt
110 | # (the url is built with paste0 so that it contains no embedded line breaks)
111 | url.for.annotation.file <- paste0(
112 | "https://raw.githubusercontent.com/AlanBerger/Practice-programming-exercises-for-R/",
113 | "master/GPL6104-Illumina-microarray-platform-annotation-from-GEO-repository-small-",
114 | "subset-edited-example-Feb12.tab.txt")
115 | 
116 | annotation.df  <- read.delim(url.for.annotation.file, nrows = 15, check.names = FALSE, 
117 |                  stringsAsFactors = FALSE)
118 | 
119 | # Note the use of nrows = 15 since there is information on the 
120 | # source of this data in later rows that should not be read in as data.
121 | # The choice check.names = FALSE "tells" R to leave the column headers as is 
122 | 
123 | annotation.df
124 | 
125 | # We see that this annotation data file has data for more Illumina probes than 
126 | # are in the analysis results data frame, and that the probe IDs are not in the 
127 | # same order as in the analysis results data frame.  For the purpose of the 
128 | # practice exercise below we will only append the columns containing the gene name,  
129 | # Chromosome number (which chromosome the gene is located on) 
130 | # and the short description of the protein encoded by the gene. 
131 | # Repeating the gene name gives an indicator to use to double check that the "merge"
132 | # was done correctly.
133 | 
134 | # Note this data frame has an example of more than 1 probe for a given 
135 | # gene (ATP2B1), where different parts of the same gene are "queried". 
136 |  
137 | # To keep things simpler with this example, for each probe ID in the analysis results, 
138 | # there is 1 row of the annotation data frame with the same probe ID. 
139 |  
140 | # The merge function can handle the case where there is no matching probe ID in 
141 | # the annotation file for a probe ID in the analysis results data frame, in which case 
142 | # we would want to append NA's indicating that information is not available in the 
143 | # annotation file being used.
144 | 
145 | columns.to.keep <- c(1, 2, 4, 6)  # keep just these columns of the annotation data frame 
146 | # to have print outs easy to see
147 | # we need to keep the probe IDs in column 1 to be able to match rows
148 | 
149 | annotation.df <- annotation.df[, columns.to.keep]
150 | # from now on annotation.df will refer to this shortened version of the
151 | # annotation data frame
152 | 
153 | annotation.df 
154 | 
155 | ```
156 | 
157 | ## Programming Exercise:  Append to analysis.results information from the annotation data frame
158 | 
159 | Approach: Form a vector of row numbers, call it annot.rows, such that for each row r of the analysis.results 
160 | data frame, annot.rows[r] will contain the row number of the row of the annotation.df data frame that has the same 
161 | Illumina Probe_ID as does row r of analysis.results.  
162 | Then column binding  annotation.df[annot.rows, ] to analysis.results (using **cbind**) will yield the desired result.
163 | 
164 | Hint: Use a for loop, and use the **which** function to find, for each row r of analysis.results the
165 | row number ra of annotation.df that has the same probe ID as does row r of analysis.results 
166 | 
167 | A working version of R code which does this is given below.
168 | 
169 | ``` {r}
170 | # use the analysis.results and annotation.df obtained in the R code above.
171 | nrows <- nrow(analysis.results) 
172 | # create the integer vector annot.rows of length nrows to hold the 
173 | # row numbers of annotation.df matching (with respect to the probe ID) the 
174 | # analysis.results rows 
175 | annot.rows <- vector(mode = "integer", length = nrows)  
176 | 
177 | # get the Illumina probe IDs vector from the annotation data frame
178 | annotation.df.probe.ids <- annotation.df[[1]]  
179 | 
180 | for (r in 1:nrows) {
181 |    probe.id <- analysis.results[r, 1]
182 | #  find the row ra of annotation.df whose Illumina probe ID matches probe.id (stop unless exactly 1 row matches)
183 | ra <- which(annotation.df.probe.ids == probe.id)
184 | if (length(ra) != 1) stop("did not find unique matching probe id row")
185 | annot.rows[r] <- ra
186 | }
187 | 
188 | # append the correct rows (correctly lined up) of annotation.df to analysis.results 
189 | analysis.results.with.annotation <- cbind(analysis.results, annotation.df[annot.rows, ])
190 | analysis.results.with.annotation  # display it
191 | 
192 | # Note the row numbers are from the rows of annotation.df whose probe IDs 
193 | # matched up with those in analysis.results
194 | ```    
195 | 
196 | ## Second exercise: use the R **merge** function to append matching annotation lines to analysis.results 
197 | 
198 | The R merge function can combine two data frames in various ways. See for example the web page by Joachim Schork 
199 | which is a page in https://statisticsglobe.com/ titled "Merge Data Frames by Column Names in R (3 Examples)": 
200 | https://statisticsglobe.com/r-merging-data-frames-by-column-names-merge-function   
201 | See also the R help on the merge function (via ? merge). 
202 | 
203 | While it is good practice to use basic R constructs until they are easy for you to use, using an available 
204 | R function can greatly simplify code which then makes it easier to keep free of bugs. Code that uses the 
205 | merge function is given below. The merge function is capable of a number of types of merging in addition to the 
206 | example below.
207 | 
208 | ``` {r}
209 | # Use the R merge function to append annotation to the analysis results
210 | 
211 | # recall that annotation.df is referring to the shortened version of the annotation
212 | # merged.df <- merge(x = analysis.results, y = annotation.df, 
213 | #              by.x = "Illumina PROBE_ID", by.y = "Illumina Probe_ID", 
214 | #              all.x = TRUE, all.y = FALSE, sort = FALSE)
215 | 
216 | # What the above call to merge will do (once the comment symbols # are removed) is: 
217 | # use the by.x = "Illumina PROBE_ID" column of analysis.results as the "guide" 
218 | # and for each row r of analysis.results, the merge function will in effect search 
219 | # to find the row ra of annotation.df such that the entry of row ra in column 
220 | # by.y = "Illumina Probe_ID" of annotation.df  matches the entry of row r in 
221 | # the column by.x = "Illumina PROBE_ID" of analysis.results
222 | # Note these 2 column names are not exactly the same so we need to specify
223 | # the column names in x and y to be used to do matching of rows, 
224 | # using the arguments by.x and by.y
225 | # The merge function will, in effect, append row ra of annotation.df to row r 
226 | # of analysis.results 
227 | # The choice all.x = TRUE means: if there is no match for the entry of row r in 
228 | # the column "Illumina PROBE_ID" of analysis.results anywhere in the column 
229 | # "Illumina Probe_ID" of annotation.df, then NA's (in place of annotation values) 
230 | # are appended to row r of analysis.results
231 | # The choice all.y = FALSE means don't include rows of annotation.df other than 
232 | # those appended to analysis.results as described above. 
233 | # The choice sort = FALSE means do not sort the resulting data frame  
234 | # (any sorting would have been done for this call to merge using 
235 | # the "Illumina PROBE_ID" column). 
236 | 
237 | merged.df <- merge(x = analysis.results, y = annotation.df, 
238 |               by.x = "Illumina PROBE_ID", by.y = "Illumina Probe_ID", 
239 |               all.x = TRUE, all.y = FALSE, sort = FALSE)
240 | # display it
241 | merged.df 
242 | 
243 | # Note the Illumina Probe_ID column of annotation.df is NOT included in merged.df
244 | # Let's check that merged.df is the same as analysis.results.with.annotation obtained 
245 | # above. First we need to remove the Illumina Probe_ID column (column 5, which came from 
246 | # annotation.df) that is included in analysis.results.with.annotation before we check.
247 | analysis.results.with.annotation <- analysis.results.with.annotation[, -5]
248 | 
249 | # check if they are the same
250 | identical(analysis.results.with.annotation, merged.df)
251 | 
252 | 
253 | # What happened? they looked the same -- so now a little "adventure" 
254 | # in finding out what happened -- this sort of thing "comes with the territory" 
255 | # when programming in any language (they each have their own quirks).
256 | 
257 | # Let's look closer
258 | attributes(analysis.results.with.annotation)
259 | 
260 | attributes(merged.df)
261 | 
262 | # So the row names were different
263 | 
264 | # Looks like we can fix this by setting the row names of
265 | # analysis.results.with.annotation to be those for merged.df
266 | row.names(analysis.results.with.annotation) <- row.names(merged.df)
267 | identical(analysis.results.with.annotation, merged.df)
268 | 
269 | 
270 | # Now what ????   Let's look at the attributes again
271 | 
272 | attributes(analysis.results.with.annotation)
273 | 
274 | attributes(merged.df)
275 | 
276 | # So the row names for analysis.results.with.annotation are 1:9 as characters
277 | # and the row names for merged.df are 1:9 as integers -
278 | # As I said, every language has its quirks
279 | row.names(analysis.results.with.annotation) <- 1:9
280 | attributes(analysis.results.with.annotation)
281 | 
282 | # Now if they aren't identical we really do have problems
283 | identical(analysis.results.with.annotation, merged.df) 
284 | 
285 | # So some semblance of order is restored.  The problem was that 
286 | # row.names(merged.df) returned a character vector
287 | str(row.names(merged.df))
288 | 
289 | # One final verification: I'm going to remove the row of the annotation
290 | # data frame corresponding to the probe ID for the BPI gene 
291 | # and then use the merge function
292 | annotation.df <- annotation.df[-5, ]
293 | 
294 | merged.df <- merge(x = analysis.results, y = annotation.df, 
295 |               by.x = "Illumina PROBE_ID", by.y = "Illumina Probe_ID", 
296 |               all.x = TRUE, all.y = FALSE, sort = FALSE)
297 | # display it
298 | merged.df 
299 | 
300 | # Note merge filled in NA's for the annotation columns for the row for the probe ID 
301 | # corresponding to BPI as expected.
302 | # The merge function also placed the row for which there was no match for the 
303 | # probe ID  in the annotation file at the bottom of the merged data frame.
304 | 
305 | # This illustrates the kind of "exploring" one should do when using a new R function,
306 | # particularly one that has a somewhat complex range of options and for which the 
307 | # output has a range of possibilities, in order to be confident about what it will 
308 | # do when called a certain way
309 |  
310 | ```
311 | 
312 | Hope this was informative and good practice. 
313 | The next exercise will contain further practice in using data frames, and point out some 
314 | types of logical mistakes that may result in actual output that, however, is incorrect, 
315 | rather than an error message. This is the most dangerous type of mistake, in that if the 
316 | incorrect output is not obviously wrong, the mistake might not be recognized until it 
317 | causes serious consequences.  That is why it is always wise to do, whenever possible, test 
318 | runs for cases where one knows or can independently calculate the true result. 
319 | 
320 | = = = = = = = = = = = = = = = = = = = = = = = = 
321 | 
322 | This work is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. 
323 | To view a copy of this license, visit https://creativecommons.org/licenses/by-nc-sa/4.0/ or send a letter 
324 | to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA. There is a full version of this license at this web site: 
325 | https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode   
326 | 
327 | Note the reader should not infer any endorsement or recommendation or approval for the material in this article from 
328 | any of the sources or persons cited above or any other entities mentioned in this article.
329 |    
330 | 


--------------------------------------------------------------------------------
/third-R-programming-exercise-using-for-loops-and-if-tests-to-check-if-a-positive-integer-is-a-prime.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | #output:
  3 | #  md_document:
  4 | #    variant: markdown_github
  5 | output: pdf_document
  6 | ---
  7 | 
  8 | 
  9 | ## "Third programming exercise using for loops and if tests to check if a positive integer is a prime"
 10 | 
 11 | ### Alan E. Berger  Sept 24, 2020   minor edits 21 Nov 2020
 12 | 
 13 | ### version 1
 14 | 
 15 | ### available at https://github.com/AlanBerger/Practice-programming-exercises-for-R
 16 | 
 17 | ## Introduction
 18 | 
 19 | The construction of a function to return all the prime numbers between 1 and a positive integer N 
 20 | will be done through a sequence of exercises.
 21 | 
 22 | This is the third in a sequence of programming exercises in "composing" an R function 
 23 | to carry out a particular task. The idea is to practice correct use of R constructs and 
 24 | built in functions (functions that "come with" the basic R installation), while learning how 
 25 | to "put together" a correct sequence of blocks of commands that will obtain the desired result.  
 26 | Note these exercises are quite cumulative - one should do them in order. 
 27 | 
 28 | In these exercises, there will be a statement of what your function should do 
 29 | (what are the input variables and what the function should return) and a sequence of "hints". 
 30 | To get the most out of these exercises, try to write your function using as few hints as possible.  
 31 | Note there are often several ways to write a function that will obtain the correct result. 
 32 | For these exercises the directions and hints may point toward a particular approach intended to 
 33 | practice particular constructs in R and a particular line of reasoning, 
 34 | even if there is a more efficient way to obtain the same result.  
 35 | There may also be an existing R function or package that will do what is stated for a given 
 36 | practice exercise, but here the point is to practice formulating a logical sequence of steps, 
 37 | with each step a section of code, to obtain a working function, not to find an existing 
 38 | solution or a quick solution using a more powerful R construct that is better addressed later on.
 39 | 
 40 | ## Motivation for this exercise
 41 | 
 42 | If statements and for loops are basic constructs for constructing a function that will carry out a specified computation. 
 43 | This exercise practices writing a code containing if statements and a for loop 
 44 | that through a sequence of steps computes the desired result. 
 45 | The specific function for this exercise is described below.
 46 | 
 47 | ## Background
 48 | 
 49 | The function to be written, getPrimeNumbers(N = 1000), in several steps with increasing amount of 
 50 | efficiency and use of logic in writing the code,
 51 | is to return all the prime numbers between 1 and the integer N. 
 52 | For this exercise one will write a function, isItPrime, to test if a given positive integer is a prime.
 53 | The isItPrime function will then be used in the next exercise to construct getPrimeNumbers.
 54 | 
 55 | Some basic definitions and results about prime numbers needed in writing these functions follow. 
 56 | A positive integer q **evenly divides** a positive integer n if there is a positive 
 57 | integer k such that n = k * q, for example 3 evenly divides 15; 6 evenly divides 24; but 4 does not evenly 
 58 | divide 9 (in integer arithmetic, since 9 = 2 * 4 with a **remainder** of 1).  R provides the **mod** function **%%** such 
 59 | that n %% q gives the remainder **r** from integer dividing n by q (also phrased as **n equals r mod q**). 
 60 | So q evenly divides n is equivalent to n %% q = 0
 61 | 
 62 | Here are a few sample results from using the mod function
 63 | 
 64 | ``` {r}
 65 | 9 %% 4  # remainder 1: 4 does not evenly divide 9
 66 | 15 %% 3  # remainder 0: 3 evenly divides 15
 67 | 24 %% 6  # remainder 0: 6 evenly divides 24
 68 | 32 %% 15  # remainder 2: 15 does not evenly divide 32
 69 | ```
 70 | 
 71 | A positive integer p is called **prime** if p > 1 and the only positive integers that evenly divide p are 1 and p 
 72 | (so the first several prime numbers are 2, 3, 5, 7, 11, 13). We can thus use the mod function to test whether a 
 73 | given positive integer is prime. There are more sophisticated approaches, see for example the Wikipedia article 
 74 | ["Primality Test"](https://en.wikipedia.org/wiki/Primality_test#:~:text=9%20External%20links-,Simple%20methods,composite%2C%20otherwise%20it%20is%20prime.)   
 75 | 
 76 | This is an example of it often being the case that one needs to learn something about the data and/or the science 
 77 | that is relevant to the program one is writing in order to properly do an analysis or correctly carry out a calculation. 
 78 | This can sometimes be essential to avoid serious mistakes, or to write a program that does not take 
 79 | an impractical amount of time to do the calculation.
 80 | 
 81 | The function to be constructed by the end of this sequence of exercises is **getPrimeNumbers**, whose argument N is to be 
 82 | a positive integer greater than 1, and which should return, in a vector, call it for example primes_up_to_N, 
 83 | all the prime numbers between 2 and N (including 2, and if N is a prime number, N). 
 84 |  
 85 | This sequence of exercises will work through the construction of several functions leading up to getPrimeNumbers, 
 86 | the point of which is to practice basic R constructs and "putting together" code to get a working function. 
 87 | This is also an example of writing several functions that together give a modular construction of code to 
 88 | obtain a desired result. It is often easier to test and debug a sequence of functions than one large function, 
 89 | and in some cases the individual functions can be used or quickly modified for use for another purpose. 
 90 | 
 91 | 
 92 | ### Instruction for the first function in this exercise
 93 | 
 94 | The first function in this sequence will be **isItPrime(n)** whose argument is a positive integer n that is  
 95 | at most 1,000,000 (just to avoid accidentally starting an extremely time consuming calculation) 
 96 | which will return either TRUE if n is a prime and FALSE otherwise. 
 97 | To do this, treat the two cases n equal 1 (not a prime) and n equal 2 (a prime) separately 
 98 | (at the beginning of the function). Then for n greater than 2 simply check whether any integer 
 99 | between 2 and (n-1) evenly divides n (we will make this more efficient in the next version 
100 | of isItPrime).  Note my usage of the phrase: values "between X and Y" includes the "endpoints" X and Y.
101 | A skeleton of this function is:
102 | 
103 | ```
104 | isItPrimeV1 <- function(n) {
105 | # determine whether the positive integer n is prime
106 | # using the mod function, return TRUE or FALSE accordingly
107 | 
108 | # check that the function argument is "admissible"
109 | # test that n is a positive integer (or a real number that equals a positive integer)
110 | n.int <- as.integer(n) 
111 | # if n was a real number such as 3.2 then n.int will be n truncated 
112 | # to an integer (for this example, 3)
113 | 
114 | if(!(n.int == n)) stop("n is not an integer")
115 | if(n < 1) stop("n is not positive")
116 | 
117 | # stop if n is "too large" to avoid a very long calculation
118 | if(n > 1000000) stop("n is > a million") 
119 | 
120 | # code to test if n is prime using R's mod function %%
121 | 
122 | # special cases
123 | if(n == 1) return(FALSE)
124 | if(n == 2) return(TRUE)
125 | 
126 | ##### rest of code to test if n is prime when n is at least 3
127 | 
128 | }
129 | ```
130 | 
131 | Try programming this now. 
132 |  
133 | 
134 | 
135 | First hint: 
136 | 
137 | positive integer q evenly divides positive integer n 
138 | if and only if   n mod q is 0
139 | 
140 | 
141 | 
142 | 
143 | Second hint:
144 | 
145 | positive integer n greater than 2 is a prime if and only if
146 | no integer between 2 and (n-1) evenly divides n; test using a for loop
147 | 
148 | 
149 | note 2:(n-1) (what you want for the range of the for loop) 
150 | is **not** the same as 2:n-1 which 
151 | equals (2:n) - 1 and is 1,...,(n-1) 
152 | 
153 | 
154 | A working code is given below.
155 | 
156 | 
157 | 
158 | ```{r}
159 | isItPrimeV1 <- function(n) {
160 | # determine whether the positive integer n is prime by checking, using the mod function %%,
161 | # whether any integer between 2 and (n-1) evenly divides n; returns TRUE or FALSE
162 | 
163 | # check that the function argument is "admissible"
164 | # test that n is a positive integer (or a real number that equals a positive integer)
165 | n.int <- as.integer(n) 
166 | # if n has a fractional part (such as 3.2) then n.int is n truncated to an integer (here 3), so n.int != n
167 | # NOTE(review): for n outside R's integer range as.integer(n) is NA, so the next if test errors before the size check
168 | 
169 | if(!(n.int == n)) stop("n is not an integer")
170 | if(n < 1) stop("n is not positive")
171 | 
172 | # stop if n is "too large" to avoid a very long calculation
173 | if(n > 1000000) stop("n is > a million") 
174 | 
175 | # special cases that are handled "by hand":
176 | # 1 is not a prime, 2 is a prime
177 | 
178 | if(n.int == 1) return(FALSE)
179 | if(n.int == 2) return(TRUE)
180 | # if got to here, n is at least 3
181 | # test if an integer between 2 and (n-1) evenly divides n
182 | 
183 | for (q in 2:(n-1)) {
184 |    if((n %% q) == 0) return(FALSE)
185 |    }
186 | 
187 | # if got to here, no integer between 2 and (n-1) evenly divides n, so n is prime
188 | return(TRUE)  
189 | }
190 | 
191 | #####
192 | # do a few test runs on small values of n
193 | isItPrimeV1(2)   # 2 is prime
194 | isItPrimeV1(3)   # 3 is prime
195 | isItPrimeV1(4)   # 4 = 2 * 2 is not prime
196 | isItPrimeV1(5)   # 5 is prime
197 | isItPrimeV1(6)   # 6 = 2 * 3 is not prime
198 | 
199 | # test several Mersenne numbers (numbers of the form 2^k - 1)
200 | # if you are curious, Google "Mersenne number" 
201 | isItPrimeV1(2^17 - 1)  # known to be prime
202 | isItPrimeV1(2^11 - 1)  # known to be not prime
203 | isItPrimeV1(2^6 - 1)   # known to be not prime
204 | ```
205 | 
206 | ## Instruction for the second function in this exercise
207 | 
208 | Often one can improve the efficiency or accuracy of a program if one learns some more 
209 | about the subject matter.  Here, consider the possible positive integers q (q at least 2) 
210 | that could evenly divide a positive integer n that is at least 3 
211 | (recall we are treating n equal 1 and n equal 2 "by hand"). If q is between 2 and (n-1) and evenly 
212 | divides n, then n = q * k for some positive integer k, and k must be between 2 and (n-1) (since k equal 
213 | 1 would be too small because q is at most (n-1), and k equal n would be too large because q is at least 2).  
214 | In fact, by similar reasoning, one of the values k and q must be at or below $\sqrt n$ since 
215 | otherwise q * k would be > n. Hence, if n is not a prime, it must be evenly divisible by an integer 
216 | between 2 and $\sqrt n$. So we only have to run the for loop in isItPrime 
217 | from 2 through as.integer(sqrt(n)). While that doesn't make much practical difference in computation 
218 | time for modest size n, in other circumstances a little analysis can make a substantial difference 
219 | in run time and or accuracy.
220 | 
221 | Now modify the isItPrimeV1 function to take advantage of this additional information, call it isItPrime
222 | 
223 | Try writing it - and do the test runs.  Did you get the correct answer that it is TRUE that 3 is prime?
224 | If not - What went wrong?
225 | 
226 | 
227 | Here is a version **that fails for n = 3** (but works for other positive integers)
228 | 
229 | ```{r}
230 | isItPrime <- function(n) {
231 | # determine whether the positive integer n is prime
232 | # using the mod function, Version 2 (first attempt: this version returns the wrong answer for n = 3)
233 | 
234 | # check that the function argument is "admissible"
235 | # test that n is a positive integer (or a real number that equals a positive integer)
236 | n.int <- as.integer(n) 
237 | # if n has a fractional part (such as 3.2) then n.int is n truncated to an integer (here 3), so n.int != n
238 | # NOTE(review): for n outside R's integer range as.integer(n) is NA, so the next if test errors before the size check
239 | 
240 | if(!(n.int == n)) stop("n is not an integer")
241 | if(n < 1) stop("n is not positive")
242 | 
243 | # stop if n is "too large" to avoid a very long calculation
244 | if(n > 1000000) stop("n is > a million") 
245 | 
246 | # special cases that are handled "by hand":
247 | # 1 is not a prime, 2 is a prime
248 | 
249 | if(n.int == 1) return(FALSE)
250 | if(n.int == 2) return(TRUE)
251 | # if got to here, n is at least 3
252 | # test if an integer in the for loop range 2:lastq, with lastq equal as.integer(sqrt(n)), evenly divides n
253 | 
254 | lastq <- as.integer(sqrt(n))
255 | for (q in 2:lastq) {
256 |    if((n %% q) == 0) return(FALSE)
257 |    }
258 | 
259 | # if got to here, no q in the for loop evenly divided n, so n is declared prime
260 | return(TRUE) 
261 | }
262 | 
263 | 
264 | # do a few test runs on small values of n
265 | isItPrime(2)
266 | isItPrime(3)  #  this should return TRUE, did it?, if not why not?
267 | isItPrime(4)
268 | isItPrime(5)
269 | isItPrime(6)
270 | 
271 | # test several Mersenne numbers (numbers of the form 2^k - 1)
272 | isItPrime(2^17 - 1)  # known to be prime
273 | isItPrime(2^11 - 1)  # known to be not prime
274 | isItPrime(2^6 - 1)   # known to be not prime
275 | ```
276 | 
277 | Debugging syntax errors (such as forgetting a parenthesis or bracket or curly brace or typing one of these 
278 | when another is required, or typing a left one when a right one is needed etc.) or 
279 | using the $ form to extract a column from a data frame with a **variable** that contains a column 
280 | name but not an actual name of a column (which doesn't even give an error message! - R just returns **NULL**), 
281 | and debugging errors in the logical construction of the code as in this case, can be a frustrating part of programming, 
282 | but is a necessary skill that one learns with practice (more on this in later exercises).
283 | 
284 | In the test cases, this code failed for n = 3. So look at each line of the code (from the top) and think about 
285 | (or, in general (but not needed here), print out, or for larger objects use head or tail or str (structure) to look at) 
286 | what takes place in each line (that is, what was the new value of the variable that was created or modified in that line).
287 | Note the upper right RStudio sub-window (Global Environment) displays helpful information on existing R objects 
288 | if you run the lines of a function in the R console (not within a function), which is often a useful way to 
289 | do testing and debugging. Or, in this case, since the code worked before, look at the effect of what was changed, 
290 | which was to run the for loop from 2 to lastq equal to (in the case that failed) as.integer(sqrt(3)). 
291 | Well, sqrt(3) is 1.732 (to 4 significant digits) so lastq is 1 when n = 3, and the range of the for loop 
292 | in this case is the two values {2, 1} and 1 evenly divides any integer, so that is why the code failed 
293 | (returned FALSE when n was 3).  This is an example of the type of reasoning used to track down a coding error.
294 | The failure only occurs with n equal 3 since for n larger than 3, as.integer(sqrt(n)) is at least 2.
295 | One easy fix is to do the case n = 3 "by hand": add the if test:  if(n.int == 3) return(TRUE)  
296 | Another way is to increase lastq by 1: lastq <- as.integer(sqrt(n)) + 1L 
297 | (With some algebra one can check that this value of lastq is < n when n is at least 3, so 
298 | it is alright to use this value of lastq in isItPrime). 
299 | Here is a version that works:
300 | 
301 | ```{r}
302 | isItPrime <- function(n) {
303 | # determine whether the positive integer n is prime
304 | # using the mod function, Version 2; returns TRUE or FALSE
305 | 
306 | # check that the function argument is "admissible"
307 | # test that n is a positive integer (or a real number that equals a positive integer)
308 | n.int <- as.integer(n) 
309 | # if n has a fractional part (such as 3.2) then n.int is n truncated to an integer (here 3), so n.int != n
310 | # NOTE(review): for n outside R's integer range as.integer(n) is NA, so the next if test errors before the size check
311 | 
312 | if(!(n.int == n)) stop("n is not an integer")
313 | if(n < 1) stop("n is not positive")
314 | 
315 | # stop if n is "too large" to avoid a very long calculation
316 | if(n > 1000000) stop("n is > a million") 
317 | 
318 | # special cases that are handled "by hand":
319 | # 1 is not a prime, 2 is a prime
320 | 
321 | if(n.int == 1) return(FALSE)
322 | if(n.int == 2) return(TRUE)
323 | # if got to here, n is at least 3
324 | # test if an integer between 2 and as.integer(sqrt(n)) + 1 evenly divides n
325 | 
326 | lastq <- as.integer(sqrt(n)) + 1L  
327 | # the L in 1L "tells" R to treat 1 as an 
328 | # integer value rather than a real (numeric) value
329 | # this could also have equivalently been done by 
330 | # lastq <- as.integer(sqrt(n) + 1)  
331 | for (q in 2:lastq) {
332 |    if((n %% q) == 0) return(FALSE)
333 |    }
334 | 
335 | # if got to here, no integer between 2 and lastq evenly divides n, so n is prime
336 | return(TRUE)
337 | }
338 | 
339 | 
340 | # do a few test runs on small values of n
341 | isItPrime(2)
342 | isItPrime(3)  
343 | isItPrime(4)
344 | isItPrime(5)
345 | isItPrime(6)
346 | 
347 | # test several Mersenne numbers (numbers of the form 2^k - 1)
348 | isItPrime(2^17 - 1)  # known to be prime
349 | isItPrime(2^11 - 1)  # known to be not prime
350 | isItPrime(2^6 - 1)   # known to be not prime
351 | ```
352 | 
353 | ## Instruction for the third function in this exercise
354 | 
355 | Suppose we are curious to see, for values of n that were not prime, what was the first value of q 
356 | in the for loop that evenly divided n. Modify isItPrime (call the modified function isItPrimeV2) so that it 
357 | returns a vector with 3 named entries:
358 | 
359 | c(is_n_prime = 1, n = n, firstq = n)  when n is prime, and
360 | 
361 | c(is_n_prime = 0, n = n, firstq = q)  when n is not a prime
362 | 
363 | Running these test cases should give the results that follow:
364 | 
365 | ```
366 | # do a couple test runs
367 | isItPrimeV2(2)
368 | isItPrimeV2(3)  
369 | isItPrimeV2(4)
370 | isItPrimeV2(5)
371 | isItPrimeV2(6)
372 | 
373 | # test several Mersenne numbers
374 | isItPrimeV2(2^17 - 1)  # known to be prime
375 | isItPrimeV2(2^11 - 1)  # known to be not prime
376 | isItPrimeV2(2^6 - 1)   # known to be not prime
377 | 
378 | ##### should get these results:
379 | 
380 | isItPrimeV2(2)
381 | is_n_prime          n     firstq 
382 |          1          2          2 
383 | 
384 | isItPrimeV2(3)  
385 | is_n_prime          n     firstq 
386 |          1          3          3 
387 | 
388 | isItPrimeV2(4)
389 | is_n_prime          n     firstq 
390 |          0          4          2 
391 | 
392 | isItPrimeV2(5)
393 | is_n_prime          n     firstq 
394 |          1          5          5 
395 | 
396 | isItPrimeV2(6)
397 | is_n_prime          n     firstq 
398 |          0          6          2 
399 |  
400 | # test several Mersenne numbers
401 | isItPrimeV2(2^17 - 1)  # known to be prime
402 | is_n_prime          n     firstq 
403 |          1     131071     131071 
404 | 
405 | isItPrimeV2(2^11 - 1)  # known to be not prime
406 | is_n_prime          n     firstq 
407 |          0       2047         23 
408 | 
409 | isItPrimeV2(2^6 - 1)   # known to be not prime
410 | is_n_prime          n     firstq 
411 |          0         63          3 
412 | ```
413 | 
414 | 
415 | A working version of the code is:
416 | 
417 | 
418 | ```{r}
419 | isItPrimeV2 <- function(n) {
420 | # determine whether the positive integer n is prime using the mod function,
421 | # and return a named vector that also reports the first divisor found (details below)
422 | 
423 | # check that the function argument is "admissible"
424 | # test that n is a positive integer (or a real number that equals a positive integer)
425 | n.int <- as.integer(n) 
426 | # if n has a fractional part (such as 3.2) then n.int is n truncated to an integer (here 3), so n.int != n
427 | # NOTE(review): for n outside R's integer range as.integer(n) is NA, so the next if test errors before the size check
428 | 
429 | if(!(n.int == n)) stop("n is not an integer")
430 | if(n < 1) stop("n is not positive")
431 | 
432 | # stop if n is "too large" to avoid a very long calculation
433 | if(n > 1000000) stop("n is > a million") 
434 | 
435 | # code to test if n is prime using R's mod function %%
436 | # return c(is_n_prime = 1, n = n, firstq = n)  when n is prime
437 | # return c(is_n_prime = 0, n = n, firstq = q)  when n is not a prime
438 | # where firstq is the first (smallest) integer 
439 | # greater than 1 that evenly divides n (firstq is set to 1 if n is 1)
440 | # (q is the index of the for loop below)
441 | 
442 | if(n.int == 1) return(c(is_n_prime = 0, n = 1, firstq = 1))  
443 | if(n.int == 2) return(c(is_n_prime = 1, n = 2, firstq = 2))
444 | # if got to here, n is at least 3
445 | # test if an integer between 2 and as.integer(sqrt(n)) + 1 evenly divides n
446 | 
447 | lastq <- as.integer(sqrt(n)) + 1L  
448 | # the L in 1L "tells" R to treat 1 as an 
449 | # integer value rather than a real (numeric) value
450 | # this could also have equivalently been done by 
451 | # lastq <- as.integer(sqrt(n) + 1)  
452 | for (q in 2:lastq) {
453 |    if((n %% q) == 0) return(c(is_n_prime = 0, n = n.int, firstq = q))
454 |    }
455 | 
456 | # if got to here, no integer between 2 and lastq evenly divides n, so n is prime
457 | return(c(is_n_prime = 1, n = n.int, firstq = n.int))
458 | }
459 | 
460 | 
461 | # do a few test runs on small values of n
462 | isItPrimeV2(2)
463 | isItPrimeV2(3)  
464 | isItPrimeV2(4)
465 | isItPrimeV2(5)
466 | isItPrimeV2(6)
467 | 
468 | # test several Mersenne numbers (numbers of the form 2^k - 1)
469 | isItPrimeV2(2^17 - 1)  # known to be prime
470 | isItPrimeV2(2^11 - 1)  # known to be not prime
471 | isItPrimeV2(2^6 - 1)   # known to be not prime
472 | ```
473 | 
474 | Hope this programming exercise was informative and good practice.
475 | The next programming exercise will be to use your isItPrime function 
476 | as the "engine" for writing getPrimeNumbers(N = 1000), which will return all the prime numbers 
477 | between 1 and the positive integer N. 
478 | 
479 | = = = = = = = = = = = = = = = = = = = = = = = = 
480 | 
481 | This work is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. 
482 | To view a copy of this license, visit https://creativecommons.org/licenses/by-nc-sa/4.0/ or send a letter 
483 | to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA. There is a full version of this license at this web site: 
484 | https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode      
485 | 


--------------------------------------------------------------------------------
/Fifth-article-Review-of-getting-subsets-of-a-data-frame-constructing-data-frames.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | output:
  3 |   md_document:
  4 |     variant: markdown_github
  5 | # output: pdf_document
  6 | ---
  7 | 
  8 | ## Fifth article: Review of getting subsets of a data frame, constructing data frames
  9 | 
 10 | ### Alan E. Berger  December 9, 2020
 11 | 
 12 | ### version 1
 13 | 
 14 | ### available at https://github.com/AlanBerger/Practice-programming-exercises-for-R
 15 | 
 16 | ## Motivation 
 17 | 
 18 | Data frames, which are analogous to Excel spreadsheets for which the entries in each column are of the same 
 19 | "type" and each column has the same number of rows, are a fundamental way of handling data in R.
 20 | 
 21 | I'll first review various ways of extracting subsets of a data frame, and then review several ways to 
 22 | construct a data frame from multiple vectors or from smaller data frames. This is a fairly long article - 
 23 | it is not intended to be read all at one time.
 24 | 
 25 | R often has several equivalent ways of doing something. Perhaps?? this is from R having been developed in 
 26 | a collaborative fashion by several people with their own favorite ways of doing various programming constructs, so 
 27 | they all got included. One can choose one's favorite ways of doing things, but one needs to be familiar with all 
 28 | the commonly used constructs in order to be able to understand code written by others (and, importantly, 
 29 | to understand code in instructions and examples for R packages one wants to use).
 30 | 
 31 | ## Review of how to get specified subsets of a data frame: A. getting a single column 
 32 | 
 33 | This material is taken/modified from a pinned post of mine 
 34 | "Examples of extracting as a vector a single column from a data frame" in the Week 2 Discussion Forum for 
 35 | the R Programming Course in the Johns Hopkins Data Science Specialization on Coursera. 
 36 | 
 37 | If you are taking this course, then note in the Week 1 pinned posts in the Discussion Forum, that Leonard Greski has 
 38 | written a very good more general article on getting row and column subsets from a data 
 39 | frame "Forms of the Extract Operator in R" (this article also contains some more advanced material covered later 
 40 | in the R course, so read what is relevant to where you are in the R Programming course and refer back later as you 
 41 | learn more R); and one is also well advised to read Al Warren's pinned post in the Week 2 Discussion 
 42 | Forum "Subsetting with bracket notation".
 43 | 
 44 | Getting (often referred to as *extracting*) a single column from a data frame is a common step in an R function, 
 45 | and one usually will want to get the column in the form of a vector, not as a data frame with that one column.
 46 | 
 47 | Let's see how to get, as a vector, for example the sulfate column of a simple example data frame;
 48 | 
 49 | ```
 50 | df <- data.frame(sulfate = c(4.79, 1.46, 4.28, NA), nitrate = c(0.299, NA, 4.280, 3.560))
 51 | df
 52 |   sulfate nitrate
 53 | 1    4.79   0.299
 54 | 2    1.46      NA
 55 | 3    4.28   4.280
 56 | 4      NA   3.560
 57 | ```
 58 | 
 59 | To get the sulfate column as a vector you can use any of the following 5 equivalent statements:
 60 | 
 61 | ```
 62 | df[["sulfate"]] # double brackets
 63 | [1] 4.79 1.46 4.28   NA
 64 | # or
 65 | df$sulfate  # note sulfate does not need to be in quotes for the $ form of extraction 
 66 | #            (but a text string with blanks in it would need to be)
 67 | [1] 4.79 1.46 4.28   NA
 68 | # or
 69 | df[, "sulfate"]  # single brackets but note the comma so we get all the rows in the sulfate column
 70 | [1] 4.79 1.46 4.28   NA
 71 | # or one could use the column number
 72 | df[[1]]
 73 | [1] 4.79 1.46 4.28   NA
 74 | df[, 1]
 75 | [1] 4.79 1.46 4.28   NA
 76 |  
 77 | # Note that df["sulfate"] # single brackets, no comma, 
 78 | # is a 1 column data frame containing the sulfate column; if you are getting 
 79 | # 1 column from a data frame you will usually want it as a vector
 80 | 
 81 | df["sulfate"] # single brackets gives a data frame
 82 |   sulfate
 83 | 1    4.79
 84 | 2    1.46
 85 | 3    4.28
 86 | 4      NA
 87 | 
 88 | class(df["sulfate"])
 89 | [1] "data.frame"
 90 | # Note for example the mean function "expects" a vector and
 91 | # will return NA and give a not very informative message if you
 92 | # "feed it" a data frame
 93 | 
 94 | mean(df["sulfate"])
 95 | [1] NA
 96 | Warning message:
 97 | In mean.default(df["sulfate"]) :
 98 |   argument is not numeric or logical: returning NA
 99 | 
100 | # If pollutant is an R variable containing the text string "sulfate" 
101 | # then these will work to extract the column as a vector
102 | pollutant <- "sulfate"
103 | df[[pollutant]] 
104 | [1] 4.79 1.46 4.28   NA
105 | # or
106 | df[, pollutant]
107 | [1] 4.79 1.46 4.28   NA
108 | 
109 | # BUT NOT
110 | df$pollutant 
111 | NULL
112 | ```
113 | 
114 | `df$pollutant` does NOT work since pollutant is NOT an actual column name; it is a variable *containing*  
115 | the text string sulfate which is not acceptable for the \$ form of getting/extracting a 
116 | column from a data frame as a vector (those are the R "rules" and we have to live with them).
117 | And note R does NOT even warn you about this type of mistake - it just cheerfully gives back 
118 | NULL which can lead to v e r y mysterious bugs. Similarly, mistyping the name of a column in the 
119 | following example commands results in NULL (with NO warning): `df$sulfffate` and also `df[["sulffffate"]]`. 
120 | Programming requires very careful attention to details - one might be tempted to think R should be 
121 | able to "figure out" what you meant, but recall what type of mischief an "auto correct" in a word 
122 | processor or message app can create - and in a programming language you wouldn't even get to view 
123 | in real time what the "compiler" had done to your code. Better to know that if you program correctly 
124 | exactly what you want, some "gremlin" won't be changing it!
125 | 
126 | 
127 | ## Review of how to get specified subsets of a data frame: B. subsetting rows and/or columns
128 | 
129 | If v is a vector of row indices (that are in the range of the number of rows of the data frame df) 
130 | one can get the rows of df corresponding to v
131 | 
132 | ```
133 | df <- data.frame(sulfate = c(4.79, 1.46, 4.28, NA), nitrate = c(0.299, NA, 4.280, 3.560))
134 | df
135 |   sulfate nitrate
136 | 1    4.79   0.299
137 | 2    1.46      NA
138 | 3    4.28   4.280
139 | 4      NA   3.560
140 | 
141 | v <- c(1, 3, 2, 2, 2)
142 | df[v, ]   
143 |     sulfate nitrate
144 | 1      4.79   0.299
145 | 3      4.28   4.280
146 | 2      1.46      NA
147 | 2.1    1.46      NA
148 | 2.2    1.46      NA
149 | 
150 | # note reordering and repeats are allowed
151 | # note the indication of repeats in the row numbers R generates (R "does not like" 
152 | # duplicate row names and so does modifications to make them unique)
153 | ```
154 | 
155 | Note R does not issue warnings or errors for indices that are "out of range",
156 | it just fills in NA's
157 | 
158 | ```
159 | v <- c(1, 3, 2, 6, 2)  # 6 is out of the range of the number of rows of df
160 | df[v, ]
161 |     sulfate nitrate
162 | 1      4.79   0.299
163 | 3      4.28   4.280
164 | 2      1.46      NA
165 | NA       NA      NA
166 | 2.1    1.46      NA
167 | ```
168 | 
169 | One can use negative row indices to **exclude** those rows:
170 | 
171 | ```
172 | df
173 |   sulfate nitrate
174 | 1    4.79   0.299
175 | 2    1.46      NA
176 | 3    4.28   4.280
177 | 4      NA   3.560
178 | 
179 | v <- c(-1, -3)  # exclude rows 1, 3 and keep the rest
180 | df[v, ]
181 |   sulfate nitrate
182 | 2    1.46      NA
183 | 4      NA    3.56
184 | ```
185 | 
186 | One can also specify desired columns (and repeats of columns)
187 | 
188 | ```
189 | v <- c(1, 3, 2, 2, 2)
190 | # if we just want the first column (sulfate) with these rows
191 | # we can do
192 | df[v, 1]
193 | [1] 4.79 4.28 1.46 1.46 1.46
194 | # or
195 | df[v, "sulfate"]
196 | [1] 4.79 4.28 1.46 1.46 1.46
197 | # or
198 | df[v, ]$sulfate
199 | [1] 4.79 4.28 1.46 1.46 1.46
200 |  
201 | # we can also specify several columns
202 | w <- c(1, 2, 1, 2)
203 | df[v, w]
204 |     sulfate nitrate sulfate.1 nitrate.1
205 | 1      4.79   0.299      4.79     0.299
206 | 3      4.28   4.280      4.28     4.280
207 | 2      1.46      NA      1.46        NA
208 | 2.1    1.46      NA      1.46        NA
209 | 2.2    1.46      NA      1.46        NA
210 | ```
211 | 
212 | Note R "does not like" duplicate column names and so does modifications to make them unique.
213 | 
214 | Also one can use a logical vector V whose length equals the number of rows of df; rows where the 
215 | corresponding entry of V is TRUE are kept, rows where the corresponding entry of V is FALSE are 
216 | not kept.
217 | 
218 | ```
219 | df
220 |   sulfate nitrate
221 | 1    4.79   0.299
222 | 2    1.46      NA
223 | 3    4.28   4.280
224 | 4      NA   3.560
225 | 
226 | logicalVector <- c(T, F, F, T)
227 | df[logicalVector, ]
228 |   sulfate nitrate
229 | 1    4.79   0.299
230 | 4      NA   3.560 
231 | ```
232 | 
233 | 
234 | ## The **which** function
235 | 
236 | If one has a vector V, one can ask for which entries of V some logical condition is TRUE; R's **which** function
237 | does this: the conceptual description is
238 | 
239 | which(some logical condition on each entry of V)
240 | 
241 | returns the vector of the **indices** of V for which the condition is TRUE 
242 | (Any entries of V that are NA will be considered to yield FALSE, so those indices of V will **not** be included 
243 | in the result.) If there are no indices for which the condition is TRUE, the which function returns an empty 
244 | integer vector (integer(0))
245 | 
246 | For example
247 | 
248 | ```
249 | df
250 |   sulfate nitrate
251 | 1    4.79   0.299
252 | 2    1.46      NA
253 | 3    4.28   4.280
254 | 4      NA   3.560
255 | 
256 | result <- which(df[["sulfate"]] > 2)
257 | result
258 | [1] 1 3
259 | # any entries of V that are NA are considered to yield FALSE 
260 | 
261 | # one can then use result to get the rows of df for which the condition was TRUE
262 | df[result, ]
263 |   sulfate nitrate
264 | 1    4.79   0.299
265 | 3    4.28   4.280
266 | 
267 | # Note the result of having an NA involved in the following:
268 | 
269 | df  # repeating what df is
270 |   sulfate nitrate
271 | 1    4.79   0.299
272 | 2    1.46      NA
273 | 3    4.28   4.280
274 | 4      NA   3.560
275 |  
276 | V <- df[["sulfate"]] > 2  # a logical vector with a value for each row of df
277 | V
278 | [1]  TRUE FALSE  TRUE    NA
279 | 
280 | df[V, ]  # keeps rows of the data frame where V is TRUE, but note the effect of the NA
281 |    sulfate nitrate
282 | 1     4.79   0.299
283 | 3     4.28   4.280
284 | NA      NA      NA
285 | ```
286 | 
287 | I like the conceptual viewpoint of the which function.  Apparently it is rather universal in 
288 | that, for example, **IDL** has the corresponding function called **where** and **MATLAB** has a 
289 | corresponding function called **find**
290 | 
291 | 
292 | ## The **%in%** function
293 | 
294 | The **%in%** function addresses the question of whether or not each entry of some vector v occurs in another vector w. 
295 | It returns a logical vector z with z[k] being TRUE if v[k] is equal to some entry in w, and z[k] FALSE if 
296 | v[k] is not equal to any entry in w.  This can be used to obtain a logical vector for use in selecting 
297 | rows of a data frame 
298 | 
299 | An example of how %in% behaves:
300 | 
301 | ```
302 | ?"%in%"  # look at the help on %in% Note because of the special
303 | #            character % one needs to "protect" %in% by enclosing it in
304 | #            either quotes or apostrophes when "asking for help" on it
305 | 
306 | v <- c(1, 2, 3, 4, 5, NA)
307 | w <- c(12, 3, 8, 22, 4)
308 | v %in% w
309 | [1] FALSE FALSE  TRUE  TRUE FALSE FALSE
310 | 
311 | v <- c(1, 2, 3, 4, 5, NA)
312 | w <- c(12, 3, 8, 22, 4, NA)
313 | v %in% w
314 | [1] FALSE FALSE  TRUE  TRUE FALSE  TRUE
315 | 
316 | # note %in% will declare a match for an NA in v if there is an NA in w
317 | ```
318 | 
319 | 
320 | ## Creating data frames
321 | 
322 | ### Reading in a data file as a data frame
323 | 
324 | 
325 | As noted above, data frames are a fundamental way that R handles data. Many data files that are text files (as opposed 
326 | to binary files) are naturally suitable for reading in as a data frame using for example **read.csv** 
327 | (where the column separator (delimiter) is a comma), or 
328 | more generally **read.table** The options for read.table also apply for read.csv but note some of the important 
329 | default choices are different, in particular the default for "telling R" whether there is a header line containing 
330 | column names is **header = TRUE** for read.csv and **header = FALSE** for read.table, and the default column separator 
331 | for read.csv is  `sep = ","`  while 
332 | for read.table one should usually specify it since the default is "white space"; a common column separator other 
333 | than comma is a tab, which is specified by `sep = "\t"` (the backslash "tells" R to interpret the t in a special way).
334 | 
335 | To start with, when learning R, there are 2 other options one should be aware of: **stringsAsFactors = FALSE** "tells" R to 
336 | read in character columns as character data, not as factors (conversion to factors was the default before R version 4.0.0; 
337 | since R 4.0.0 the default is stringsAsFactors = FALSE). Unless a column is to be used as a factor in a statistical analysis it is likely better to have it read in as character data.
338 | 
339 | The second option one should be aware of when starting to learn read.csv and read.table is **na.strings** This option lets 
340 | one specify the character string (or several character strings) that should be interpreted as NA (the default is `"NA"`). 
341 | For example `na.strings = c("NA", "data is missing", "not available", "the experimenter dropped the sample", "the experimenter was texting when 
342 | the data should have been measured")` If there is character data other than NA signifying missing data in a column that 
343 | should be read in as numeric data, and R is not "informed" about this, then R will read in that column as factor or 
344 | character data, which can lead to "issues" that are best avoided by properly reading in the data.
345 | 
346 | Another option to be aware of if one is dealing with a file that has "non-standard for R" column 
347 | names is **check.names** which if set equal TRUE (the default), then R will modify read in column names to conform with what 
348 | R considers standard. That means blank spaces and many characters that are not letters will get replaced by a period.
349 | I find this rather annoying since I like to use long descriptive column headers in files I create, and as long as I am not 
350 | having R use the column names other than to write them back out after I have done some analysis on the data, it is OK to 
351 | instruct R to leave the column names alone (by setting check.names = FALSE).
352 | 
353 | 
354 | ## Creating a data frame from smaller data frames and/or vectors and matrices: A. the **data.frame** function
355 | 
356 | Looking over the R help on **data.frame**, one sees that it can combine objects that are or can be converted to be 
357 | data frames into one combined data frame. As with read.csv 
358 | and read.table, one may well want to use the option **stringsAsFactors = FALSE** 
359 | (unless one needs to have one or more factor columns). (The data frame function will do *recycling* on rows 
360 | but I would recommend having the number of rows in objects being combined into a data frame all be the same.) 
361 | Note the R **rep** (replicate) function can be used to replicate "patterns", for example
362 | 
363 | ```
364 | rep(c(1,2), times = 4)  # repeat the pattern 4 times
365 | [1] 1 2 1 2 1 2 1 2
366 | 
367 | # rep can also be used this way:
368 | rep(c(1,2), each = 4)
369 | [1] 1 1 1 1 2 2 2 2
370 | ``` 
371 | 
372 | ## Some examples with the data.frame function:
373 | 
374 | ```
375 | df1 <- data.frame(sulfate = c(4.79, 1.46, 4.28, NA), nitrate = c(0.299, NA, 4.280, 3.560))
376 | df1  # our continuing example data frame
377 |   sulfate nitrate
378 | 1    4.79   0.299
379 | 2    1.46      NA
380 | 3    4.28   4.280
381 | 4      NA   3.560
382 | 
383 | v1 <- c(1,2,3,4)
384 | v2 <- c(TRUE, FALSE, TRUE, TRUE)
385 | v3 <- c("a", "b", "c", "d")
386 | 
387 | m1 <- matrix(1:12, nrow = 4, ncol = 3)
388 | m1
389 |      [,1] [,2] [,3]
390 | [1,]    1    5    9
391 | [2,]    2    6   10
392 | [3,]    3    7   11
393 | [4,]    4    8   12
394 | 
395 | df <- data.frame(df1, v1, v2, m1, v3, stringsAsFactors = FALSE)
396 | df
397 |   sulfate nitrate v1    v2 X1 X2 X3 v3
398 | 1    4.79   0.299  1  TRUE  1  5  9  a
399 | 2    1.46      NA  2 FALSE  2  6 10  b
400 | 3    4.28   4.280  3  TRUE  3  7 11  c
401 | 4      NA   3.560  4  TRUE  4  8 12  d
402 | 
403 | # The row names of df are the row names of the first argument of data.frame, i.e.,
404 | # the first of the objects being combined into one data frame
405 | 
406 | # If one wants to change some column names one could do, for example,
407 | 
408 | colnames(df)[c(5, 6, 7)] <- c("m1", "m2", "m3")
409 | df
410 |   sulfate nitrate v1    v2 m1 m2 m3 v3
411 | 1    4.79   0.299  1  TRUE  1  5  9  a
412 | 2    1.46      NA  2 FALSE  2  6 10  b
413 | 3    4.28   4.280  3  TRUE  3  7 11  c
414 | 4      NA   3.560  4  TRUE  4  8 12  d
415 | 
416 | # and similarly with row names
417 | rownames(df) <- c("r1", "r2", "r3", "r4")
418 | df 
419 |    sulfate nitrate v1    v2 m1 m2 m3 v3
420 | r1    4.79   0.299  1  TRUE  1  5  9  a
421 | r2    1.46      NA  2 FALSE  2  6 10  b
422 | r3    4.28   4.280  3  TRUE  3  7 11  c
423 | r4      NA   3.560  4  TRUE  4  8 12  d
424 | ```
425 | 
426 | ## Creating a data frame from smaller data frames and/or vectors and matrices: B. **rbind**
427 | 
428 | If one has several data frames that have the same number of columns AND the same column names, 
429 | then one can  "stack them vertically" using the **rbind** (row bind) function. For example with 2 data frames 
430 | df1 and df2 
431 | 
432 | ```
433 | df1 <- data.frame(sulfate = c(4.79, 1.46, 4.28, NA), nitrate = c(0.299, NA, 4.280, 3.560))
434 | df1
435 |   sulfate nitrate
436 | 1    4.79   0.299
437 | 2    1.46      NA
438 | 3    4.28   4.280
439 | 4      NA   3.560
440 | 
441 | df2 <- data.frame(sulfate = c(24.79, 21.46, 24.28, NA), nitrate = c(2.299, NA, 2.280, 2.560))
442 | df2
443 |   sulfate nitrate
444 | 1   24.79   2.299
445 | 2   21.46      NA
446 | 3   24.28   2.280
447 | 4      NA   2.560
448 | 
449 | # then one can do
450 | df <- rbind(df1, df2)
451 | df
452 |   sulfate nitrate
453 | 1    4.79   0.299
454 | 2    1.46      NA
455 | 3    4.28   4.280
456 | 4      NA   3.560
457 | 5   24.79   2.299
458 | 6   21.46      NA
459 | 7   24.28   2.280
460 | 8      NA   2.560
461 | 
462 | # The column names for all the items being "rbinded" must be the same 
463 | # (except for an important exception described below).
464 | 
465 | # will get an error message if the column names don't match: for example below I have 
466 | # the column names in df2 not matching those in df1
467 | 
468 | colnames(df2)[2] <- "another.name"
469 | df2
470 |   sulfate another.name
471 | 1   24.79        2.299
472 | 2   21.46           NA
473 | 3   24.28        2.280
474 | 4      NA        2.560
475 | 
476 | df <- rbind(df1, df2)  # gets error message
477 | Error in match.names(clabs, names(xi)) : 
478 |   names do not match previous names
479 | ```
480 | 
481 | 
482 | ## Creating a data frame from smaller data frames and/or vectors and matrices: C. Using rbind in a loop
483 | 
484 | 
485 | In some circumstances one might be reading in or constructing a succession of data frames, each with 
486 | the same number of columns and the same column names, and want to combine them vertically. 
487 | One can do this in a loop if one initializes an empty data frame via `df <- data.frame()`
488 | 
489 | One can rbind any data.frame to this empty data frame; this is the exception to the rule 
490 | on same number of columns and column names so then this "conceptual" for loop will work:
491 | 
492 | ```
493 | df <- data.frame()  # initialize an empty data frame
494 | for (i in some.set) {
495 |    read in or derive a data frame dfi corresponding to i (each dfi must have the same
496 |    number of columns and the same column names)
497 |    df <- rbind(df, dfi)
498 | }
499 | 
500 | # after this loop the data frame df will consist of all the data frames dfi stacked vertically
501 | ```
502 | 
503 | 
504 | ## Creating a data frame from smaller data frames and/or vectors and matrices: D. the **cbind** function
505 | 
506 | 
507 | The **cbind** (column bind) function can combine data frames or combine objects that are or can be converted to be 
508 | data frames. cbind is the same as data.frame except that the default for cbind is check.names = FALSE
509 | 
510 | 
511 | Hope this review was informative.
512 | The next set of exercises will get into practicing using and creating data frames.
513 | 
514 | = = = = = = = = = = = = = = = = = = = = = = = = 
515 | 
516 | This work is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. 
517 | To view a copy of this license, visit https://creativecommons.org/licenses/by-nc-sa/4.0/ or send a letter 
518 | to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA. There is a full version of this license at this web site: 
519 | https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode   
520 | 
521 | Some of the material above (Review of how to get specified subsets of a data frame: getting a single column) was 
522 | taken/modified from a post of mine in the Discussion Forum for the R Programming Course in 
523 | the Johns Hopkins Data Science Specialization on Coursera, as noted above. 
524 | As such Coursera and Coursera authorized Partners retain additional rights to that material as described in 
525 | their "Terms of Use" https://www.coursera.org/about/terms 
526 | 
527 | Note the reader should not infer any endorsement or recommendation or approval for the material in this article from
528 | any of the sources or persons cited above or any other entities mentioned in this article.
529 | 
530 |    
531 | 


--------------------------------------------------------------------------------
/Eighth-R-Practice-exercise-composing-a-function-and-constructing-a-data-frame-files-not-containing-search-strings.md:
--------------------------------------------------------------------------------
  1 | Eighth R Practice exercise composing a function and constructing a data frame; files not containing search strings
  2 | ------------------------------------------------------------------------------------------------------------------
  3 | 
  4 | ### Alan E. Berger Feb 2, 2020
  5 | 
  6 | ### available at <https://github.com/AlanBerger/Practice-programming-exercises-for-R>
  7 | 
  8 | Introduction
  9 | ------------
 10 | 
 11 | This is the eighth in a sequence of programming exercises in "composing" an R function to carry out a particular task. Several of these "exercise files" likely
 12 | will take several sessions to master the content. The material below practices composing a logical sequence of steps to program a function that will accomplish a specified task, and preparing a corresponding data frame.
 13 | 
 14 | The idea of this set of exercises is to practice correct use of R constructs and built in functions (functions that "come with" the basic R installation), while learning how to "put together" a correct sequence of blocks of commands that will obtain the desired result.
 15 | Note these exercises are quite cumulative - one should do them in order.
 16 | 
 17 | In these exercises, there will be a statement of what your function should do (what are the input variables and what the function should return) and a sequence of "hints". To get the most out of these exercises, try to write your function using as few hints as possible.
 18 | Note there are often several ways to write a function that will obtain the correct result. For these exercises the directions and hints may point toward a particular approach intended to practice particular constructs in R and a particular line of reasoning, even if there is a more efficient way to obtain the same result.
 19 | There may also be an existing R function or package that will do what is stated for a given practice exercise, but here the point is to practice formulating a logical sequence of steps, with each step a section of code, to obtain a working function, not to find an existing solution or a quick solution using a more powerful R construct that is better addressed later on.
 20 | 
 21 | Motivation for this exercise
 22 | ----------------------------
 23 | 
 24 | We will compose a function that returns a vector of the names of files in a folder that do NOT contain any of the entries of search.strings in their name. This will be an opportunity to practice use of the **%in%** function; and then use of the **setdiff** function applied to two vectors: `setdiff(V1, V2)` for vectors V1 and V2 of the same type (e.g., numeric or character) will give a vector consisting of the entries of V1 that are **not** equal to any entry of V2.
 25 | 
 26 | In the previous (seventh) exercise file we prepared the function
 27 | 
 28 | **search\_for\_filenames\_matching\_any\_of\_the\_patterns\_and\_output\_file\_info**(directory, search.strings)
 29 | 
 30 | that returns a data frame containing information on the file names in directory that match **ANY entry** of search.strings (i.e., that have 1 or more of the entries of search.strings in their file name). The first column of the returned data frame has the names of the files. The final version is copied here:
 31 | 
 32 | ``` r
 33 | search_for_filenames_matching_any_of_the_patterns_and_output_file_info <- 
 34 |                                          function(directory, search.strings){
 35 | # Summary: return a data frame (file.name, modif.date, size.in.bytes) for the files matching any search string, and also write it to scrlisting.txt in directory.
 36 | # directory is an absolute path (full path) or a path relative to the R working directory to the
 37 | # folder to be searched. If want to search the R working directory itself, 
 38 | # can set directory = "." with the line of code:   directory <- "." 
 39 | # or could set directory to be the full path to the R working directory.
 40 | 
 41 | # search.strings is a character string vector
 42 | 
 43 | # Return a data frame of the file names (not including folders) in directory that contain
 44 | # ANY of the entries of the search.strings vector somewhere in their file name,
 45 | # (or at the beginning of the filename, or at the end of the filename, if so specified).
 46 | # The search will be case insensitive (treats lower case and upper case letters as the same).
 47 | 
 48 | # The first step is to initialize the filenames vector: filenames <- character(0)
 49 | 
 50 | # In a for loop, use R's list.files function to list all the files (file names) matching the
 51 | # entries of search.strings, one by one, 
 52 | # eliminating names of folders, and appending them to filenames
 53 | 
 54 | # Use the unique function to eliminate duplicates (keep only 1 copy of each file name)
 55 | 
 56 | # For the files whose names contain any of the character strings in search.strings, use
 57 | # R's file.info function to get the file size and last modification time as in the previous function.
 58 | 
 59 | # The output data frame will contain the file size and the last modification time
 60 | # for each file that has any member of search.strings somewhere in its file name 
 61 | # (or at the beginning or at the end of the file name, if so specified)
 62 | #
 63 | # We will get the file names without the folder path leading to the files included in the name. 
 64 | 
 65 | # check that search.strings is a non-empty character vector 
 66 |  
 67 | ns <- length(search.strings)
 68 | if(ns < 1) stop("no entries in search.strings")
 69 | if(!is.character(search.strings)) stop("search.strings is not a character vector")
 70 | 
 71 | # We will return a data frame whose first column contains the files in directory
 72 | # that contain any character string in the search.strings vector somewhere in their name.
 73 | # The second column will contain the last time (and date) the file was modified,
 74 | # and the third column will be the file size (in bytes).
 75 | 
 76 | # The first step is to initialize the filenames vector 
 77 | filenames <- character(0)
 78 | 
 79 | # Then in a for loop, for each entry S of search strings, 
 80 | # use list.files to get the vector V of the file names in directory that 
 81 | # contain S in their file name and append V to filenames (after eliminating any folder names). 
 82 | # do not include the path to the file in the file name.
 83 | 
 84 | # To eliminate names of any folders (directories) that are in V: 
 85 | # do paste(directory, "/", V, sep = "") to get the filenames including either
 86 | # the relative path from the R working directory or the absolute path (depending on what 
 87 | # directory is); use these names in the R dir.exists function to check for folder 
 88 | # (directory) names in filenames
 89 | 
 90 |    for (k in 1:ns) {
 91 |       V <- list.files(directory, pattern = search.strings[k],  
 92 |                       full.names = FALSE, ignore.case = TRUE)
 93 | #     exclude directory names from V (we need to do this since we are "adding" 
 94 | #     the file names in V to filenames and want only file names, not folder names)
 95 | #     if V is empty (character(0)) then skip this
 96 |       if(length(V) > 0) V <- V[!dir.exists(paste(directory, "/", V, sep = ""))]  
 97 |       filenames <- c(filenames, V)
 98 |    }
 99 | 
100 | ### It is important to note we could have also "run the for loop" in this fashion:
101 | ### for (S in search.strings) {
102 | ###    V <- list.files(directory, pattern = S,  
103 | ##############   rest of the for loop
104 | ###
105 | # Note: when no file name matches, a character string (not a data frame) is printed and returned, so callers need to check for that case.
106 | nf <- length(filenames)
107 | if(nf == 0) {
108 |    print("no files contain any of the search strings")
109 |    return("no files contain any of the search strings")
110 | }
111 | 
112 | filenames <- unique(filenames)
113 | nf <- length(filenames)  # need to do this since may have eliminated some duplicate name(s)
114 | 
115 | # If got to here, at least 1 file has a character string in search.strings
116 | # in its name, so get the information on these file(s) into a data frame.
117 | 
118 | ################################## get the data frame to be output
119 | 
120 | # Get the desired output data frame using vectors
121 | 
122 | dfcolnames <- c("file.name", "modif.date", "size.in.bytes")
123 | # initialize the 3 vectors that will hold this information on the files
124 | # whose names matched any of the members of search.strings
125 | 
126 | fname <- character(0)
127 | fdate <- character(0)
128 | fsize <- numeric(0)
129 | 
130 | for(k in 1:nf) {
131 |     finfo <- file.info(paste(directory, "/", filenames[k], sep = ""))
132 | # needed to include the path to the file so file.info can locate it
133 |     fname <- c(fname, filenames[k])
134 |     fdate <- c(fdate, as.character(finfo$mtime))
135 |     fsize <- c(fsize, finfo$size)
136 | }
137 | 
138 | df <- data.frame(fname, fdate, fsize, stringsAsFactors = FALSE)
139 | colnames(df) <- dfcolnames
140 | 
141 | ################################## finished getting the data frame to be output
142 | 
143 | # Write the data frame out to a tab delimited text file called scrlisting.txt in directory 
144 | # (i.e., in the folder specified by the argument directory this function was called with).
145 | 
146 | outpfilename <- paste(directory, "/", "scrlisting.txt", sep = "")
147 | # One can rename this "scratch file" as desired after viewing it (best viewed in Excel or equivalent).
148 | write.table(df, file = outpfilename, 
149 |             append = FALSE, quote = FALSE, sep = "\t",
150 |             row.names = FALSE, col.names = TRUE)
151 | # This call to write.table will write out a data frame
152 | # as one would usually want; it specifies the column separator to be a tab
153 | 
154 | return(df)
155 | }
156 | ```
157 | 
158 | Exercises
159 | ---------
160 | 
161 | The function for this exercise will be to construct a modified version of the search function called
162 | 
163 |     search_for_filenames_matching_NONE_of_the_patterns_and_output_file_names <- 
164 |                                                 function(directory, search.strings)
165 | 
166 | that will return the vector of file names in directory that do **NOT** match **ANY entry** of search.strings
167 | 
168 | One way to view this, is that the first function in this sequence obtained the **intersection** of the sets of file names matching each entry of search strings, while the previous function (the function above) obtained the **union** U of the sets of file names matching each entry of search.strings This programming exercise will be to construct a version that will return the vector of file names in directory that **DO NOT match ANY** entry of search.strings. This can be viewed as obtaining, relative to the set of all the files in directory, the **complement of U**.
169 | 
170 | One could as in the previous functions return information on these files, but this is more a practice exercise in using the %in% function (and in the next exercise, using the setdiff function) so we are just going to concentrate on getting the vector of the file names.
171 | 
172 | Hints: Use the
173 | 
174 | **search\_for\_filenames\_matching\_any\_of\_the\_patterns\_and\_output\_file\_info**
175 | 
176 | function to find the file names in directory that match one or more of the entries in search.strings; call this vector **Vany**. Then remove these file names from the vector of all the file names in directory. One approach for this is to use **list.files** to get the vector of all the file and folder names in directory, then remove the folder names from this vector, resulting in the vector **Vall**. Then construct a logical vector Vlogical the same length as Vall such that: Vlogical\[k\] is TRUE if Vall\[k\] is **not** an entry of Vany (and Vlogical\[k\] is FALSE if Vall\[k\] is an entry of Vany). The `%in%` function is very convenient for this (and this is good practice using it).
177 | 
178 | Recall from the fifth article in this series: The **%in%** function addresses the question of whether or not each entry of some vector v occurs in another vector w: `z <- v %in% w` obtains a logical vector z with z\[k\] being TRUE if v\[k\] is equal to some entry in w, and z\[k\] being FALSE if v\[k\] is not equal to any entry in w. Note then the logical vector `nz <- !z` has the property that nz\[k\] will be TRUE if v\[k\] is NOT an entry of w, and nz\[k\] will be FALSE if v\[k\] is an entry of w; hence v\[nz\] gives the entries of v that are NOT in w.
179 | 
180 | An example of how %in% behaves:
181 | 
182 |     ?"%in%"  # look at the help on %in% Note because of the special
183 |     #            character % one needs to "protect" %in% by enclosing it in
184 |     #            either quotes or apostrophes when "asking for help" on it
185 | 
186 |     v <- c(1, 2, 3, 4, 5, NA)
187 |     w <- c(12, 3, 8, 22, 4)
188 |     v %in% w
189 |     [1] FALSE FALSE  TRUE  TRUE FALSE FALSE
190 | 
191 |     v <- c(1, 2, 3, 4, 5, NA)
192 |     w <- c(12, 3, 8, 22, 4, NA)
193 |     v %in% w
194 |     [1] FALSE FALSE  TRUE  TRUE FALSE  TRUE
195 | 
196 |     # note %in% will declare a match for an NA in v if there is an NA in w
197 | 
198 | From the discussion above, Vall\[Vlogical\] is the desired vector of file names, and we can obtain Vlogical using the `%in%` function and logical negation (!). Try doing this - a working version is given below.
199 | 
200 | ``` r
201 | search_for_filenames_matching_NONE_of_the_patterns_and_output_file_names <- 
202 |                                          function(directory, search.strings){
203 | 
204 | # directory is an absolute path (full path) or a path relative to the R working directory to the
205 | # folder to be searched. If want to search the R working directory itself, 
206 | # can set directory = "." 
207 | # or could set directory to be the full path to the R working directory.
208 | 
209 | # search.strings is a text string vector
210 | 
211 | # Return a vector of the file names (not including folders) in directory that contain
212 | # NONE of the entries of the search.strings vector in their file name,
213 | # the search will be case insensitive (treats lower case and upper case letters as the same).
214 | # Since this is mainly an exercise in using the %in% function, just return the vector of file 
215 | # names (and don't bother with getting the last modification date or file size).
216 | 
217 | # Use R's list.files function to list all the files (file names) in directory 
218 | # then eliminate names of folders; put the result in Vall 
219 | 
220 | # Then use the previous function
221 | # search_for_filenames_matching_any_of_the_patterns_and_output_file_info
222 | # to find all the files in directory that match some entry in search.strings
223 | # and put the vector of these file names in Vany
224 | 
225 | # Then eliminate from Vall any entries that are in Vany using the %in% function 
226 | # and logical negation (!)
227 | 
228 | # We will get the file names without the folder path leading to the files included in the name. 
229 | 
230 | # check that search.strings is a non-empty character vector 
231 |  
232 | ns <- length(search.strings)
233 | if(ns < 1) stop("no entries in search.strings")
234 | if(!is.character(search.strings)) stop("search.strings is not a character vector")
235 | 
236 | Vall <- list.files(directory, full.names = FALSE, ignore.case = TRUE)
237 | nf <- length(Vall)
238 | if(nf == 0) {
239 |    print("no files in directory")
240 |    return("no files in directory")
241 | }
242 | #  exclude directory names from Vall 
243 | Vall <- Vall[!dir.exists(paste(directory, "/", Vall, sep = ""))]  
244 | 
245 | nf <- length(Vall)
246 | if(nf == 0) {
247 |    print("no files in directory")
248 |    return("no files in directory")
249 | }
250 | 
251 | # now get Vany
252 | df.Vany <- 
253 | search_for_filenames_matching_any_of_the_patterns_and_output_file_info(directory, search.strings)
254 | # (side effect: when it finds any matches, that function also writes the file scrlisting.txt into directory)
255 | # need to handle the case that NO file names were a match for any of the search strings 
256 | # in which case df.Vany is the character string: "no files contain any of the search strings" 
257 | # rather than a data frame  (NOTE(review): is.character(df.Vany) would express this test more directly)
258 | if(class(df.Vany) == "character") {
259 |    Vany <- character(0)
260 |    } else {
261 |    Vany <- df.Vany$file.name  # the column of file names
262 | }
263 | # When programming, one should always be sure the function handles "extreme cases", 
264 | # here that would be when NONE of the file names in directory match some entry in search.strings, 
265 | # and when ALL of the file names match search.strings 
266 | 
267 | # eliminate names in Vany from Vall
268 | 
269 | Vlogical <- !(Vall %in% Vany)  # we want entries in Vall that are NOT in Vany so need to use !
270 | filenames <- Vall[Vlogical]  # the desired file names (names NOT in Vany)
271 | 
272 | nf <- length(filenames)
273 | if(nf == 0) {
274 |    print("all files contain contain an entry of search strings")
275 |    return("all files contain contain an entry of search strings")
276 | }
277 | 
278 | # If got to here, at least 1 file has no entry of search.strings in its name
279 | return(filenames)  # here we are just returning the file names without other information on them
280 | }
281 | ```
282 | 
283 | ### Test runs on my computer
284 | 
285 | Recalling from the previous two exercise files, I have constructed in my R working directory a folder called test\_dir containing a small number of files with names I picked to conveniently test the functions that search for file names satisfying various conditions. Here are all the file names (and the one folder name) in the test\_dir folder:
286 | 
287 |     directory <- "test_dir"
288 |     list.files(directory)  # 9 files and 1 folder
289 |      [1] "001.csv"         "002.csv"         "003.txt"        
290 |      [4] "004csvfile.txt"  "005txt2csv"      "1.csv"          
291 |      [7] "308.csv"         "folder001txtcsv" "scrlisting.txt" 
292 |     [10] "txt.csv"        
293 | 
294 | We will use this folder and files (9 filenames and 1 folder name) to test the search function written immediately above (you can construct a similar test\_dir folder and files with these filenames and also a folder in it called "folder001txtcsv" to run tests, the only difference will be the dates and sizes of the files). Or you could test the function using a suitable folder in your computer for which you can pick search.strings so that you will get a known reasonable length vector for the test run.
295 | 
296 | Some test runs
297 | --------------
298 | 
299 |     directory <- "test_dir"
300 |     list.files(directory)  # 9 files and 1 folder
301 |      [1] "001.csv"         "002.csv"         "003.txt"         
302 |      [4] "004csvfile.txt"  "005txt2csv"      "1.csv"           
303 |      [7] "308.csv"         "folder001txtcsv" "scrlisting.txt"     
304 |     [10] "txt.csv"        
305 |      
306 |     search.strings <- c("2", "3", "4", "5") 
307 |     search_for_filenames_matching_NONE_of_the_patterns_and_output_file_names(directory, search.strings) 
308 |     [1] "001.csv"        "1.csv"          "scrlisting.txt"    "txt.csv"
309 | 
310 |     search.strings <- c("csv") 
311 |     search_for_filenames_matching_NONE_of_the_patterns_and_output_file_names(directory, search.strings) 
312 |     [1] "003.txt"        "scrlisting.txt"
313 | 
314 |     # test case when all the file names match some member of search.strings
315 |     search.strings <- c("csv", "txt")  
316 |     search_for_filenames_matching_NONE_of_the_patterns_and_output_file_names(directory, search.strings) 
317 |     [1] "all files contain contain an entry of search strings"
318 |     [1] "all files contain contain an entry of search strings"
319 |     # when there are no such files (no files in directory that don't have an entry of search.strings 
320 |     # in their file name), the search function both prints this message and returns it, so here it is 
321 |     # output twice 
322 | 
323 |     search.strings <- c("00")
324 |     search_for_filenames_matching_NONE_of_the_patterns_and_output_file_names(directory, search.strings) 
325 |     [1] "1.csv"          "308.csv"        "scrlisting.txt"      "txt.csv" 
326 | 
327 |     # test case when none of the file names match some member of search.strings, 
328 |     # one should get all the file names (but not the folder name)
329 |     search.strings <- c("00987")
330 |     search_for_filenames_matching_NONE_of_the_patterns_and_output_file_names(directory, search.strings) 
331 |     [1] "no files contain any of the search strings"
332 |     [1] "001.csv"        "002.csv"        "003.txt"        "004csvfile.txt"
333 |     [5] "005txt2csv"     "1.csv"          "308.csv"        "scrlisting.txt"
334 |     [9] "txt.csv"       
335 | 
336 |     # note  [1] "no files contain any of the search strings" came from
337 |     # "looking for" the files that matched ANY entry of search.strings - 
338 |     # in this case there were none, and so one obtained all the file names 
339 |     # in test_dir (and, correctly, not the folder name)
340 | 
341 | The next exercise is to use the **setdiff** function in place of using the %in% function to get the file names that are in Vall but not in Vany.
342 | 
343 | Hint: In the function above you simply need to replace the following 2 lines with 1 line using setdiff
344 | 
345 |     Vlogical <- !(Vall %in% Vany)  # we want entries in Vall that are NOT in Vany so need to use !
346 |     filenames <- Vall[Vlogical]  # the desired file names (names NOT in Vany)
347 | 
348 | A correct line using setdiff is: `filenames <- setdiff(Vall, Vany)`
349 | 
350 | One should do the test runs again to check this works correctly.
351 | 
352 | This exercise was intended to practice the very useful **%in%** function, and also illustrate that finding an existing R function (here setdiff) that can do exactly what you want can result in very concise easy to understand code, which leaves less room for bugs to occur and to hide.
353 | 
354 | Hope this was informative and good practice. The next set of exercises will address dealing with using subsets of an individual row of a data frame. = = = = = = = = = = = = = = = = = = = = = = = =
355 | 
356 | This work is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. To view a copy of this license, visit <https://creativecommons.org/licenses/by-nc-sa/4.0/> or send a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA. There is a full version of this license at this web site: <https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode>
357 | 


--------------------------------------------------------------------------------
/Eighth-R-Practice-exercise-composing-a-function-and-constructing-a-data-frame-files-not-containing-search-strings.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 |  output:
  3 |   md_document:
  4 |     variant: markdown_github
  5 | # output: pdf_document
  6 | ---
  7 | 
  8 | 
  9 | ## Eighth R Practice exercise composing a function and constructing a data frame; files not containing search strings
 10 | 
 11 | ### Alan E. Berger  Feb 2, 2020
 12 | 
 13 | ### available at https://github.com/AlanBerger/Practice-programming-exercises-for-R 
 14 | 
 15 | ## Introduction
 16 | 
 17 | This is the eighth in a sequence of programming exercises in "composing" an R function 
 18 | to carry out a particular task. Several of these "exercise files" likely  
 19 | will take several sessions to master the content.  The material below practices composing a logical
 20 | sequence of steps to program a function that will accomplish a specified task, and 
 21 | preparing a corresponding data frame. 
 22 | 
 23 | The idea of this set of exercises is to practice correct use of R constructs and 
 24 | built in functions (functions that "come with" the basic R installation), while learning how 
 25 | to "put together" a correct sequence of blocks of commands that will obtain the desired result.  
 26 | Note these exercises are quite cumulative - one should do them in order. 
 27 | 
 28 | In these exercises, there will be a statement of what your function should do 
 29 | (what are the input variables and what the function should return) and a sequence of "hints". 
 30 | To get the most out of these exercises, try to write your function using as few hints as possible.  
 31 | Note there are often several ways to write a function that will obtain the correct result. 
 32 | For these exercises the directions and hints may point toward a particular approach intended to 
 33 | practice particular constructs in R and a particular line of reasoning, 
 34 | even if there is a more efficient way to obtain the same result.  
 35 | There may also be an existing R function or package that will do what is stated for a given 
 36 | practice exercise, but here the point is to practice formulating a logical sequence of steps, 
 37 | with each step a section of code, to obtain a working function, not to find an existing 
 38 | solution or a quick solution using a more powerful R construct that is better addressed later on.
 39 | 
 40 | ## Motivation for this exercise
 41 | 
 42 | We will compose a function that returns a vector of the names of files in a folder that do NOT contain any of the 
 43 | entries of search.strings in their name. This will be an opportunity to practice use of the **%in%** function; 
 44 | and then use of the **setdiff** function applied to two vectors: `setdiff(V1, V2)` for vectors V1 and V2 of 
 45 | the same type (e.g., numeric or character) will give a vector consisting of the entries of V1 that 
 46 | are **not** equal to any entry of V2. 
 47 | 
 48 | 
 49 | In the previous (seventh) exercise file we prepared the function
 50 | 
 51 | **search_for_filenames_matching_any_of_the_patterns_and_output_file_info**(directory, search.strings) 
 52 | 
 53 | that returns a data frame containing information on the file names in directory that match **ANY entry** of 
 54 | search.strings (i.e., that have 1 or more of the entries of search.strings in their file name). 
 55 | The first column of the returned data frame has the names of the files. The final version is copied here:
 56 | 
 57 | ``` {r}
 58 | search_for_filenames_matching_any_of_the_patterns_and_output_file_info <- 
 59 |                                          function(directory, search.strings){
 60 | 
 61 | # directory is an absolute path (full path) or a path relative to the R working directory to the
 62 | # folder to be searched. If you want to search the R working directory itself, 
 63 | # you can set directory = "." with the line of code:   directory <- "." 
 64 | # or you could set directory to be the full path to the R working directory.
 65 | 
 66 | # search.strings is a character string vector
 67 | 
 68 | # Return a data frame of the file names (not including folders) in directory that contain
 69 | # ANY of the entries of the search.strings vector somewhere in their file name
 70 | # (each entry is passed to list.files as its pattern argument, i.e., as a regular expression, so an entry can use ^ or $ to match only at the beginning or at the end of a file name).
 71 | # The search will be case insensitive (treats lower case and upper case letters as the same).
 72 | 
 73 | # The first step is to initialize the filenames vector: filenames <- character(0)
 74 | 
 75 | # In a for loop, use R's list.files function to list all the files (file names) matching the
 76 | # entries of search.strings, one by one, 
 77 | # eliminating names of folders, and appending them to filenames
 78 | 
 79 | # Use the unique function to eliminate duplicates (keep only 1 copy of each file name)
 80 | 
 81 | # For the files whose names contain any of the character strings in search.strings, use
 82 | # R's file.info function to get the file size and last modification time as in the previous function.
 83 | 
 84 | # The output data frame will contain the file size and the last modification time
 85 | # for each file that has any member of search.strings somewhere in its file name 
 86 | # (or at the beginning or at the end of the file name, if so specified)
 87 | # If NO file name matches, a character string message is printed and returned instead of a data frame.
 88 | # We will get the file names without the folder path leading to the files included in the name. 
 89 | 
 90 | # check that search.strings is a non-empty character vector 
 91 |  
 92 | ns <- length(search.strings)
 93 | if(ns < 1) stop("no entries in search.strings")
 94 | if(!is.character(search.strings)) stop("search.strings is not a character vector")
 95 | 
 96 | # We will return a data frame whose first column contains the files in directory
 97 | # that contain any character string in the search.strings vector somewhere in their name.
 98 | # The second column will contain the last time (and date) the file was modified,
 99 | # and the third column will be the file size (in bytes).
100 | 
101 | # The first step is to initialize the filenames vector 
102 | filenames <- character(0)
103 | 
104 | # Then in a for loop, for each entry S of search.strings, 
105 | # use list.files to get the vector V of the file names in directory that 
106 | # contain S in their file name and append V to filenames (after eliminating any folder names). 
107 | # Do not include the path to the file in the file name (full.names = FALSE).
108 | 
109 | # To eliminate names of any folders (directories) that are in V: 
110 | # do paste(directory, "/", V, sep = "") to get the filenames including either
111 | # the relative path from the R working directory or the absolute path (depending on what 
112 | # directory is); use these names in the R dir.exists function to check for folder 
113 | # (directory) names in V
114 | 
115 |    for (k in 1:ns) {
116 |       V <- list.files(directory, pattern = search.strings[k],  
117 |                       full.names = FALSE, ignore.case = TRUE)
118 | #     exclude directory names from V (we need to do this since we are "adding" 
119 | #     the file names in V to filenames and want only file names, not folder names);
120 | #     if V is empty (character(0)) then skip this
121 |       if(length(V) > 0) V <- V[!dir.exists(paste(directory, "/", V, sep = ""))]  
122 |       filenames <- c(filenames, V)
123 |    }
124 | 
125 | ### It is important to note we could have also "run the for loop" in this fashion:
126 | ### for (S in search.strings) {
127 | ###    V <- list.files(directory, pattern = S,  
128 | ##############   rest of the for loop
129 | ###
130 | 
131 | nf <- length(filenames)
132 | if(nf == 0) {
133 |    print("no files contain any of the search strings")
134 |    return("no files contain any of the search strings")  # note: a character string, not a data frame
135 | }
136 | 
137 | filenames <- unique(filenames)
138 | nf <- length(filenames)  # need to do this since we may have eliminated some duplicate name(s)
139 | 
140 | # If we got to here, at least 1 file has a character string in search.strings
141 | # in its name, so get the information on these file(s) into a data frame.
142 | 
143 | ################################## get the data frame to be output
144 | 
145 | # Get the desired output data frame using vectors
146 | 
147 | dfcolnames <- c("file.name", "modif.date", "size.in.bytes")
148 | # initialize the 3 vectors that will hold this information on the files
149 | # whose names matched any of the members of search.strings
150 | 
151 | fname <- character(0)
152 | fdate <- character(0)
153 | fsize <- numeric(0)
154 | 
155 | for(k in 1:nf) {
156 |     finfo <- file.info(paste(directory, "/", filenames[k], sep = ""))
157 | # we needed to include the path to the file so file.info can locate it
158 |     fname <- c(fname, filenames[k])
159 |     fdate <- c(fdate, as.character(finfo$mtime))
160 |     fsize <- c(fsize, finfo$size)
161 | }
162 | 
163 | df <- data.frame(fname, fdate, fsize, stringsAsFactors = FALSE)
164 | colnames(df) <- dfcolnames
165 | 
166 | ################################## finished getting the data frame to be output
167 | 
168 | # Write the data frame out to a tab delimited text file called scrlisting.txt in directory 
169 | # (i.e., in the folder specified by the argument directory this function was called with).
170 | 
171 | outpfilename <- paste(directory, "/", "scrlisting.txt", sep = "")
172 | # One can rename this "scratch file" as desired after viewing it (best viewed in Excel or equivalent).
173 | write.table(df, file = outpfilename, 
174 |             append = FALSE, quote = FALSE, sep = "\t",
175 |             row.names = FALSE, col.names = TRUE)
176 | # This call to write.table will write out a data frame
177 | # as one would usually want; it specifies the column separator to be a tab
178 | 
179 | return(df)
180 | }
181 | 
182 | ```
183 | 
184 | ## Exercises
185 | 
186 | The function for this exercise will be to construct a modified version of the search function called 
187 | 
188 | ```
189 | search_for_filenames_matching_NONE_of_the_patterns_and_output_file_names <- 
190 |                                             function(directory, search.strings)
191 | ```
192 | 
193 | that will return the vector of file names in directory that do **NOT** match **ANY entry** of search.strings 
194 | 
195 | One way to view this is that the first function in this sequence obtained the **intersection** of the 
196 | sets of file names matching each entry of search.strings, while the previous function (the function above)
197 | obtained the **union** U of the sets of file names matching each entry of search.strings. 
198 | This programming exercise will be to construct a version that will return 
199 | the vector of file names in directory that **DO NOT match ANY** entry of search.strings. 
200 | This can be viewed as obtaining, relative to the set of all the files in directory, the **complement of U**. 
201 | 
202 | One could as in the previous functions return information on 
203 | these files, but this is more a practice exercise in using the %in% function (and in the next exercise, using 
204 | the setdiff function) so we are just going to concentrate on getting the vector of the file names.
205 | 
206 | Hints: Use the 
207 | 
208 | **search_for_filenames_matching_any_of_the_patterns_and_output_file_info** 
209 | 
210 | function to find the file names in directory that match one or more of the entries 
211 | in search.strings, call this vector **Vany**. Then remove these file names from the vector of all the file names 
212 | in directory. One approach for this is to use **list.files** to get the vector of all the file and folder names 
213 | in directory, then remove the folder names from this vector, resulting in the vector **Vall**.
214 | Then construct a logical vector Vlogical the same length as Vall such that: Vlogical[k] is TRUE if Vall[k] is
215 | **not** an entry of Vany (and Vlogical[k] is FALSE if Vall[k] is an entry of Vany). The `%in%` function 
216 | is very convenient for this (and this is good practice using it). 
217 | 
218 | Recall from the fifth article in this series: The **%in%** function addresses the question of whether or 
219 | not each entry of some vector v occurs in another vector w: `z <- v %in% w` obtains a logical 
220 | vector z with z[k] being TRUE if v[k] is equal to some entry in w, 
221 | and z[k] being FALSE if v[k] is not equal to any entry in w. Note then the 
222 | logical vector `nz <- !z` has the property that nz[k] will be TRUE if v[k] is NOT an entry of w, and nz[k] will 
223 | be FALSE if v[k] is an entry of w; hence v[nz] gives the entries of v that are NOT in w.
224 | 
225 | An example of how %in% behaves:
226 | 
227 | ```
228 | ?"%in%"  # look at the help on %in% Note because of the special
229 | #            character % one needs to "protect" %in% by enclosing it in
230 | #            either quotes or apostrophes when "asking for help" on it
231 | 
232 | v <- c(1, 2, 3, 4, 5, NA)
233 | w <- c(12, 3, 8, 22, 4)
234 | v %in% w
235 | [1] FALSE FALSE  TRUE  TRUE FALSE FALSE
236 | 
237 | v <- c(1, 2, 3, 4, 5, NA)
238 | w <- c(12, 3, 8, 22, 4, NA)
239 | v %in% w
240 | [1] FALSE FALSE  TRUE  TRUE FALSE  TRUE
241 | 
242 | # note %in% will declare a match for an NA in v if there is an NA in w
243 | ```
244 | 
245 | From the discussion above, Vall[Vlogical] is the desired vector of file names, and we can obtain Vlogical 
246 | using the `%in%` function and logical negation (!). 
247 | Try doing this - a working version is given below.
248 | 
249 | ``` {r}
250 | search_for_filenames_matching_NONE_of_the_patterns_and_output_file_names <- 
251 |                                          function(directory, search.strings){
252 | 
253 | # directory is an absolute path (full path) or a path relative to the R working directory to the
254 | # folder to be searched. If you want to search the R working directory itself, 
255 | # you can set directory = "." 
256 | # or you could set directory to be the full path to the R working directory.
257 | 
258 | # search.strings is a text string vector
259 | 
260 | # Return a vector of the file names (not including folders) in directory that contain
261 | # NONE of the entries of the search.strings vector in their file name,
262 | # the search will be case insensitive (treats lower case and upper case letters as the same).
263 | # Since this is mainly an exercise in using the %in% function, just return the vector of file 
264 | # names (and don't bother with getting the last modification date or file size).
265 | # In the edge cases (no files in directory, or every file matches) a message string is printed and returned instead.
266 | # Use R's list.files function to list all the files (file names) in directory 
267 | # then eliminate names of folders; put the result in Vall 
268 | 
269 | # Then use the previous function
270 | # search_for_filenames_matching_any_of_the_patterns_and_output_file_info
271 | # to find all the files in directory that match some entry in search.strings
272 | # and put the vector of these file names in Vany
273 | 
274 | # Then eliminate from Vall any entries that are in Vany using the %in% function 
275 | # and logical negation (!)
276 | 
277 | # We will get the file names without the folder path leading to the files included in the name. 
278 | 
279 | # check that search.strings is a non-empty character vector 
280 |  
281 | ns <- length(search.strings)
282 | if(ns < 1) stop("no entries in search.strings")
283 | if(!is.character(search.strings)) stop("search.strings is not a character vector")
284 | 
285 | Vall <- list.files(directory, full.names = FALSE, ignore.case = TRUE)
286 | nf <- length(Vall)
287 | if(nf == 0) {
288 |    print("no files in directory")
289 |    return("no files in directory")
290 | }
291 | #  exclude directory names from Vall 
292 | Vall <- Vall[!dir.exists(paste(directory, "/", Vall, sep = ""))]  
293 | 
294 | nf <- length(Vall)
295 | if(nf == 0) {
296 |    print("no files in directory")
297 |    return("no files in directory")
298 | }
299 | 
300 | # now get Vany (note: this call also writes scrlisting.txt into directory when there are matches)
301 | df.Vany <- 
302 | search_for_filenames_matching_any_of_the_patterns_and_output_file_info(directory, search.strings)
303 | 
304 | # need to handle the case that NO file names were a match for any of the search strings 
305 | # in which case df.Vany is the character string: "no files contain any of the search strings" 
306 | # rather than a data frame; use is.character for this test (class() can return more than one entry)
307 | if(is.character(df.Vany)) {
308 |    Vany <- character(0)
309 |    } else {
310 |    Vany <- df.Vany$file.name  # the column of file names
311 | }
312 | # When programming, one should always be sure the function handles "extreme cases", 
313 | # here that would be when NONE of the file names in directory match any entry in search.strings, 
314 | # and when ALL of the file names match search.strings 
315 | 
316 | # eliminate names in Vany from Vall
317 | 
318 | Vlogical <- !(Vall %in% Vany)  # we want entries in Vall that are NOT in Vany so need to use !
319 | filenames <- Vall[Vlogical]  # the desired file names (names NOT in Vany)
320 | 
321 | nf <- length(filenames)
322 | if(nf == 0) {
323 |    print("all files contain contain an entry of search strings")
324 |    return("all files contain contain an entry of search strings")
325 | }
326 | 
327 | # If we got to here, at least 1 file has no entry of search.strings in its name
328 | return(filenames)  # here we are just returning the file names without other information on them
329 | }
330 | 
331 | ```
332 | 
333 | ### Test runs on my computer
334 | 
335 | Recalling from the previous two exercise files, I have constructed in my R working directory a folder 
336 | called test_dir containing a small number of files with names I picked to conveniently test 
337 | the functions that search for file names satisfying various conditions. 
338 | Here are all the file names (and the one folder name) in the test_dir folder: 
339 | 
340 | ```
341 | directory <- "test_dir"
342 | list.files(directory)  # 9 files and 1 folder
343 |  [1] "001.csv"         "002.csv"         "003.txt"        
344 |  [4] "004csvfile.txt"  "005txt2csv"      "1.csv"          
345 |  [7] "308.csv"         "folder001txtcsv" "scrlisting.txt" 
346 | [10] "txt.csv"        
347 | ``` 
348 | 
349 | We will use this folder and files (9 filenames and 1 folder name) to test the search function 
350 | written immediately above (you can construct a similar test_dir folder and files with these filenames and also a 
351 | folder in it called "folder001txtcsv" to run tests, the only difference will be the dates and sizes of the files). 
352 | Or you could test the function using a suitable folder in your computer for which you can pick search.strings so that 
353 | you will get a known reasonable length vector for the test run.
354 | 
355 | 
356 | ## Some test runs
357 | 
358 | ```
359 | directory <- "test_dir"
360 | list.files(directory)  # 9 files and 1 folder
361 |  [1] "001.csv"         "002.csv"         "003.txt"         
362 |  [4] "004csvfile.txt"  "005txt2csv"      "1.csv"           
363 |  [7] "308.csv"         "folder001txtcsv" "scrlisting.txt"     
364 | [10] "txt.csv"        
365 |  
366 | search.strings <- c("2", "3", "4", "5") 
367 | search_for_filenames_matching_NONE_of_the_patterns_and_output_file_names(directory, search.strings) 
368 | [1] "001.csv"        "1.csv"          "scrlisting.txt"    "txt.csv"
369 | 
370 | search.strings <- c("csv") 
371 | search_for_filenames_matching_NONE_of_the_patterns_and_output_file_names(directory, search.strings) 
372 | [1] "003.txt"        "scrlisting.txt"
373 | 
374 | # test case when all the file names match some member of search.strings
375 | search.strings <- c("csv", "txt")  
376 | search_for_filenames_matching_NONE_of_the_patterns_and_output_file_names(directory, search.strings) 
377 | [1] "all files contain contain an entry of search strings"
378 | [1] "all files contain contain an entry of search strings"
379 | # when there are no such files (no files in directory that don't have an entry of search.strings 
380 | # in their file name), the search function both prints this message and returns it, so here it is 
381 | # output twice 
382 | 
383 | search.strings <- c("00")
384 | search_for_filenames_matching_NONE_of_the_patterns_and_output_file_names(directory, search.strings) 
385 | [1] "1.csv"          "308.csv"        "scrlisting.txt"      "txt.csv" 
386 | 
387 | # test case when none of the file names match any member of search.strings, 
388 | # one should get all the file names (but not the folder name)
389 | search.strings <- c("00987")
390 | search_for_filenames_matching_NONE_of_the_patterns_and_output_file_names(directory, search.strings) 
391 | [1] "no files contain any of the search strings"
392 | [1] "001.csv"        "002.csv"        "003.txt"        "004csvfile.txt"
393 | [5] "005txt2csv"     "1.csv"          "308.csv"        "scrlisting.txt"
394 | [9] "txt.csv"       
395 | 
396 | # note  [1] "no files contain any of the search strings" came from
397 | # "looking for" the files that matched ANY entry of search.strings - 
398 | # in this case there were none, and so one obtained all the file names 
399 | # in test_dir (and, correctly, not the folder name)
400 | ```
401 | 
402 | The next exercise is to use the **setdiff** function in place of using the %in% function to get 
403 | the file names that are in Vall but not in Vany.
404 | 
405 | Hint:
406 | In the function above you simply need to replace the following 2 lines with 1 line using setdiff
407 | ```
408 | Vlogical <- !(Vall %in% Vany)  # we want entries in Vall that are NOT in Vany so need to use !
409 | filenames <- Vall[Vlogical]  # the desired file names (names NOT in Vany)
410 | ```
411 | 
412 | A correct line using setdiff is: `filenames <- setdiff(Vall, Vany)`
413 | 
414 | One should do the test runs again to check this works correctly.
415 | 
416 | This exercise was intended to practice the very useful **%in%** function, and also illustrate 
417 | that finding an existing R function (here setdiff) that can do exactly what you want can result in very concise 
418 | easy to understand code, which leaves less room for bugs to occur and to hide. 
419 | 
420 | 
421 | Hope this was informative and good practice.
422 | The next set of exercises will address dealing with using subsets of an individual row of a data frame.
423 | = = = = = = = = = = = = = = = = = = = = = = = = 
424 | 
425 | This work is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. 
426 | To view a copy of this license, visit https://creativecommons.org/licenses/by-nc-sa/4.0/ or send a letter 
427 | to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA. There is a full version of this license at this web site: 
428 | https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode      
429 | 


--------------------------------------------------------------------------------
/Eleventh-R-Practice-exercise-using-sapply-and-split-and-also-use-of-ellipsis-to-pass-in-additional-argumentsFeb11.Rmd:
--------------------------------------------------------------------------------
  1 | 
  2 | ---
  3 | # output:
  4 | #  md_document:
  5 | #    variant: markdown_github
  6 |  output: pdf_document
  7 |  fontsize: 12pt     
  8 | ---
  9 | 
 10 | ## Eleventh R Practice exercise using sapply and split and also use of ellipsis to pass in additional arguments
 11 | 
 12 | ### Alan E. Berger  Feb 11, 2022; edit June 20 to emphasize requirement on first argument of FUN in sapply 
 13 | 
 14 | ### available at https://github.com/AlanBerger/Practice-programming-exercises-for-R 
 15 | 
 16 | ## Introduction
 17 | 
 18 | This article will discuss **sapply** at length, and then practice using **split** and **sapply**. sapply is a version of **lapply**  that when possible will give its output as a vector (or matrix when that is appropriate) instead of a list (lapply will produce a list;  a list that is "naturally a vector" can be converted to the vector using the **unlist** function; using sapply avoids having to do that extra step).
 19 | 
 20 | Much of what is described below for sapply is also applicable to lapply.
 21 | 
 22 | We will go over some simple examples, and then use the iris data set to demonstrate sapply and split, and the ellipsis (...) functionality available in sapply.
 23 | 
 24 | Note if you want to copy lines from this for use in R, it is best to copy from the .Rmd file, since sometimes text in the pdf file can contain formatting characters that R does not accept. To download an Rmd file, open the Rmd file to display it in GitHub and then toward the upper right of the resulting window there will be a "raw box" (to the left of "Blame"), on which one should be able to do (for Windows): right click, "save as" to download the .Rmd file as a text file.  
 25 | 
 26 | 
 27 | ``` {r}
 28 | 
 29 | # The help for sapply indicates a "standard" call to sapply will 
 30 | # have the form:
 31 | 
 32 | # sapply(X, FUN, ...)
 33 | 
 34 | # Here X is a vector or data.frame or list
 35 | # FUN is the name of a function that is available in the current R session 
 36 | # (which could be a user defined function, or a function that 
 37 | # "comes with R", or that has been "loaded" (via library) from a package), 
 38 | # OR is an anonymous function defined within the call to sapply. 
 39 | # The function specified by FUN will be applied to each entry of X if X 
 40 | # is a vector; to each column of X (taken as a vector) if X is 
 41 | # a data frame, to each entry of X if X is a list (vectors and data.frames
 42 | # are special types of list). When possible the result from sapply will 
 43 | # be a vector (or matrix if that is appropriate). Otherwise sapply will 
 44 | # return a list (just as lapply would).
 45 | 
 46 | # The optional ellipsis argument can be used to pass in additional 
 47 | # arguments to the function specified in the FUN argument of sapply 
 48 | # as illustrated below.
 49 | 
 50 | # a simple case: square each element of a vector (yes we could do this 
 51 | # by x^2, but here we demonstrate use of sapply and an anonymous function):
 52 | 
 53 | v <- 1:5
 54 | sapply(X = v, FUN = function(y) y^2) 
 55 | 
 56 | # ********  IT IS BEST to use the form: argument.name = user.chosen.value 
 57 | # ********  when calling one of the apply family of functions. In this call 
 58 | # ********  to sapply, the name of the first argument of sapply is X and 
 59 | # ********  the value chosen for X is v; and the second argument of sapply
 60 | # ********  is FUN, and the value chosen for FUN is the
 61 | # ********  anonymous function, function(y) y^2
 62 | 
 63 | # Note y in the definition of this function is a "formal argument", 
 64 | # also called a "dummy argument", meaning any variable name permissible 
 65 | # in R could be used, for example
 66 | 
 67 | # sapply(X = v, 
 68 | # FUN = function(user.selected.name) user.selected.name^2)
 69 | 
 70 | # One could also optionally include curly brackets to delineate the 
 71 | # function:
 72 | 
 73 | # sapply(X = v, FUN = function(y) {y^2}) 
 74 | 
 75 | # sapply, in effect, successively calls the function the user has
 76 | # provided for FUN, here function(y) {y^2} 
 77 | # with each entry of v as its argument, 
 78 | # and "collects" the results in a vector.
 79 | 
 80 | # My point of view is it is fine to use an anonymous function if it 
 81 | # is short (will fit in a line or so), but otherwise it makes the code 
 82 | # much easier to proofread and so avoid/detect bugs if one codes the 
 83 | # function as a stand-alone function, call it, for example, 
 84 | # user.function and uses its name in the FUN argument of sapply via
 85 | #    FUN = user.function  
 86 | # Here user.function could also be any function that is available in the
 87 | # current R session 
 88 | 
 89 | # demonstrate use of the ellipsis argument of sapply to pass in an 
 90 | # additional argument of FUN
 91 | 
 92 | 
 93 | sapply(X = v, FUN = function(x, power) x^power, power = 3)
 94 | 
 95 | 
 96 | # Here each entry of v is passed into the function as the value for x, 
 97 | # and the additional argument, power, of the function is supplied via 
 98 | # the third (ellipsis) argument of sapply.
 99 | 
100 | # One could also rely on lexical scoping to "find" the value of power, 
101 | # but that is not good programming practice when it can be avoided, 
102 | # since the value where R "finds" it might not be what was wanted 
103 | # (or it might have gotten changed from what you thought it would be).
104 | 
105 | power <- 4
106 | sapply(X = v, FUN = function(x) x^power)  # this worked but is not best practice
107 | 
108 | 
109 | # Here is an example with two additional arguments supplied using 
110 | # the ellipsis functionality of sapply, and where sapply returns a matrix
111 | # (2 rows and 5 columns: one column for each entry of v)
112 | v <- 1:5
113 | sapply(X = v, FUN = function(x, power1, power2) {c(x^power1, x^power2)}, 
114 |        power1 = 2, power2 = 3)
115 | 
116 | ```
117 | 
118 | ##  The first argument of the function in sapply MUST BE for the entries of X
119 | 
120 | Suppose one is calling sapply via, for example,
121 | 
122 | result <- sapply(X = v, FUN = user.function, other.arg1 = a1, other.arg2 = a2)
123 | 
124 | where v is a vector or data frame or list, so sapply will successively 
125 | call user.function with each entry, refer to it as $E$, of v (each column if v is a data frame).
126 | **It is important to note that**:
127 | 
128 | sapply will successively call user.function with $E$ **as the first argument of user.function**
129 | 
130 | So user.function (if programmed by us) should be programmed that way, and if user.function is a built in function that comes with the base R installation, or has been "loaded" from a package, its first argument should be for entries of v. (Technically, this is not absolutely required, but not conforming with this "first argument condition" can lead to mysterious errors and is best avoided.) In the conceptual example above, other.arg1 and other.arg2 are other arguments of user.function and their values have been set to a1 and a2, respectively, in this call.  Arguments passed into user.function via the ellipsis functionality of sapply must be supplied in the 
131 | 
132 | argument.name = chosen.value
133 | 
134 | format.
135 | 
136 | This "first argument condition" is pretty natural, in that for many R functions, the first argument (or first couple of arguments) are what the function "acts on" and following arguments (which have default values) govern options on how that is done. For example the mean function takes the mean of a vector, and its other options modulate how that is done, and read.table reads in a suitable file as a data frame, and it has quite a few options on how that is done, and the plot function can produce a scatterplot of y vs. x with a multitude of options allowing for fine detailed control of the form of the plot. 
137 | 
138 | ## Now use the iris data set to illustrate use of sapply
139 | 
140 | From the R help on the iris data set (? iris): "This famous (Fisher's or Anderson's) iris data set gives the measurements in centimeters of the variables sepal length and width and petal length and width, respectively, for 50 flowers from each of 3 species of iris. The species are Iris ***setosa***, ***versicolor***, and ***virginica***"
141 | 
142 | ``` {r}
143 | 
144 | # from the R help on the iris data set:
145 | # iris is a data frame with 150 cases (rows) and 5 variables (columns) 
146 | # named Sepal.Length, Sepal.Width, Petal.Length, Petal.Width, and Species,
147 | # respectively, for 50 flowers from each of 3 species of iris. 
148 | # The species are Iris setosa, versicolor, and virginica.
149 | 
150 | # There are equal numbers of the three types of flowers, and they are 
151 | # grouped together in the data frame, but neither "equal numbers" nor 
152 | # "grouped together" are necessary for using split together with sapply 
153 | # as will be demonstrated below.
154 | 
155 | data(iris)  # make the iris data set available to this R session
156 | iris.df <- iris  # a copy, emphasizing it is a data frame
157 | 
158 | # take a look at it
159 | # head(iris.df)
160 |  
161 | # tail(iris.df)
162 | 
163 | # place an NA into the Sepal.Width column, to later demonstrate 
164 | # how to use the ellipsis (...) functionality for sapply
165 | # to pass in one or more additional optional arguments into the function
166 | # being used (for example na.rm = TRUE, trim = 0.11, for the mean function)
167 | 
168 | iris.df[2,2] <- NA
169 | # check that was done
170 | head(iris.df)
171 | 
172 | # An aside: Explanation of what the optional argument trim in 
173 | # the mean function does:
174 | 
175 | # we will use this information to do a "by hand" check of 
176 | # the output obtained below from using sapply with the mean function 
177 | # on the iris.df data frame 
178 | 
179 | # If V is a numeric vector and one does 
180 | # mean(V, na.rm = TRUE, trim = w)
181 | # what will happen is that the mean function will, in effect, 
182 | # first produce the vector Vn which is V but with any and all NAs removed 
183 | # from V.
184 | # If L is the length of Vn (here assume it is > 0), then let K = w * L
185 | # (w was what the argument trim was set to; the default for trim is 0).
186 | # Let Ki be the integer part of K, e.g., if K was 2.3 then Ki is 2
187 | # if K was 0.4 then Ki is 0
188 | # Then mean(V, na.rm = TRUE, trim = w) is evaluated as follows:
189 | # Let Vn.sorted be the result of doing sort(Vn)
190 | # Let Vn.sorted.trimmed be the result of:
191 | #     removing the first Ki entries of Vn.sorted AND 
192 | #     removing the last Ki entries of Vn.sorted
193 | # (assume that Vn.sorted.trimmed still has at least 1 element left in it)
194 | 
195 | # mean(V, na.rm = TRUE, trim = w) is then equal to mean(Vn.sorted.trimmed)
196 | 
197 | 
198 | # first use sapply on the 4 data columns of iris.df, 
199 | # using na.rm = TRUE and trim = 0.11 
200 | # (0.11 * 150 = 16.5 and 0.11 * 149 = 16.39, so Ki = 16 entries are trimmed 
201 | # from each end of the vector whose mean is being calculated). The sapply command below will successively pass 
202 | # each column of iris.df[, 1:4] AS A VECTOR into the mean function
203 | 
204 | sapply(X = iris.df[, 1:4], FUN = mean, na.rm = TRUE, trim = 0.11)
205 | 
206 | # Note that sapply "captured" the names of the columns of iris.df[, 1:4] as the names of the result
207 | 
208 | # The lengths of the 4 columns after removing NAs 
209 | # were 150, 149, 150, 150;
210 | # setting trim equal to 0.11 in effect removed 16 elements from 
211 | # each end of each column AFTER 
212 | # any NAs were removed and the column was sorted; 
213 | # then the mean was calculated
214 | 
215 | # check the result from sapply above "by hand": drop NAs, sort, then keep entries 17 through (length - 16)
216 | 
217 | x <- sort(na.omit(iris.df[[1]]))
218 | mean(x[17:(length(x) - 16)])
219 | 
220 | 
221 | x <- sort(na.omit(iris.df[[2]]))
222 | mean(x[17:(length(x) - 16)])
223 | 
224 | # both values agree with the sapply output above; the same check for columns 3 and 4 is also OK
225 | 
226 | # Note
227 | # sapply(X = iris.df[, 1:4], FUN = mean, na.rm = TRUE, trim = 0.11)
228 | # worked to get the optional arguments 
229 | # na.rm = TRUE, trim = 0.11
230 | # into the mean function (FUN = mean) since the mean function
231 | # is "part of" R and it has na.rm and trim as optional arguments
232 | 
233 | # When using an anonymous function in sapply, AND using the optional 
234 | # ellipsis (...) argument of sapply to pass in additional arguments to 
235 | # the function specified by FUN, one needs to either have those additional 
236 | # arguments be declared arguments of the function (as with the power 
237 | # argument(s) in the examples above), OR to have an ellipsis in the 
238 | # argument list of the function specified by FUN, AND used appropriately 
239 | # within the anonymous function. Again, the first argument 
240 | # of the function FUN in sapply should be for 
241 | # what will be passed from each entry of the first argument X of sapply. 
242 | # For example in the call to sapply below, 
243 | # x will successively have the value of each of the first 4 columns 
244 | # of iris.df
245 | 
246 | sapply(X = iris.df[, 1:4], 
247 |       FUN = function(x, ...) {mean(x, ...)}, 
248 |       na.rm = TRUE, trim = 0.11)
249 | 
250 | 
251 | # In effect, in the above call to sapply,
252 | #     the ellipsis ...   was "set to":   na.rm = TRUE, trim = 0.11
253 | # and that got "passed through" via the ... in the argument list of 
254 | # the anonymous function AND within the anonymous function itself 
255 | # (here "into" the ... in the argument list of the mean function)
256 | #    * * *  It was necessary to have ... included in the argument list of  
257 | #    * * *  the call to the mean function within this anonymous function
258 |  
259 | # Here is what happens if the ellipsis ... is left out of the 
260 | # argument list of mean in the anonymous function 
261 | # (it runs, but na.rm = TRUE, trim = 0.11 are silently ignored, so the column 
262 | # containing the NA gives NA and no trimming is done; not what we wanted):
263 | sapply(X = iris.df[, 1:4], 
264 |       FUN = function(x, ...) {mean(x)}, 
265 |       na.rm = TRUE, trim = 0.11)
266 | 
267 | # The result is indeed the same as if na.rm = TRUE and trim = 0.11 were 
268 | # NOT supplied when calling the mean function:
269 | sapply(X = iris.df[, 1:4], FUN = mean)  
270 | 
271 | 
272 | # Note this also works correctly (the extra arguments are declared by name in the anonymous function instead of using ...):
273 | sapply(X = iris.df[, 1:4], 
274 |  FUN = function(x, na.rm.value, trim.value) 
275 |                       {mean(x, na.rm = na.rm.value, trim = trim.value)}, 
276 |                 na.rm.value = TRUE, trim.value = 0.11)
277 | 
278 | 
279 | #  ************  Note that various plotting functions, e.g., plot(x, y, ...),
280 | # have ... in their argument list,
281 | # so many of their optional arguments can be passed in 
282 | # by sapply.
283 | # For example, the first two arguments of plot, x and y, can be vectors 
284 | # for making a scatterplot. 
285 | # If X = data.frame.list is a list of data frames whose first two columns 
286 | # are numeric values (our example will be constructed from iris.df), 
287 | # and  plotting_function  is a function that
288 | # will make a scatterplot from data in the first two columns of a data 
289 | # frame, then, one could, conceptually, do for example:
290 | 
291 | # sapply(X = data.frame.list, FUN = plotting_function, 
292 | #        type = "p", pch = 1, col = "blue")
293 | #         
294 | # Here plotting_function
295 | # should extract the x and y vectors from the first and second columns 
296 | # of the data frame it is called with, and then call the plot function via
297 | # plot(x, y, ...)
298 | # In this setup, plotting_function must have an ellipsis in its argument 
299 | # list. Then sapply would pass through, via the ellipsis functionality, 
300 | # the various plotting arguments that were specified 
301 | # (there are many many optional arguments for plot); 
302 | # the ones given here "say": do a point plot with circle symbols, 
303 | # and have the symbol color be blue 
304 | 
305 | plotting_function <- function(df, ...) {
306 | # df: a data frame that will be passed in by sapply; ...: any further arguments, forwarded unchanged to plot
307 | # draws a scatter plot of column 2 of df (y axis) vs. column 1 of df (x axis); returns the value of the plot call
308 |    x <- df[[1]]
309 |    y <- df[[2]]
310 | 
311 | # get x and y axis labels from the names of the first two columns of df
312 |   xlabel <- colnames(df)[1]
313 |   ylabel <- colnames(df)[2]
314 | 
315 | # other arguments for the plot function (e.g., type, pch, col) are passed 
316 | # into plot from the call to sapply via the ellipsis
317 | 
318 |    plot(x, y, xlab = xlabel, ylab = ylabel, ...)
319 | }
320 | 
321 | 
322 | # Demonstrate this:
323 | # Construct a list of two data frames, each with two columns, 
324 | # from iris.df: columns 1 and 2 (the sepal measurements) and columns 3 and 4 (the petal measurements)
325 | 
326 | data.frame.list = list(iris.df[, c(1,2)], iris.df[, c(3,4)])
327 | 
328 | 
329 | # invoke sapply, creating 2 plots (one per data frame in the list), while 
330 | # using the ellipsis functionality to pass plotting arguments into the plot function
331 | 
332 | # one could do more informative plotting with this data, 
333 | # but the point here is to illustrate 
334 | # use of the ellipsis functionality in sapply
335 | 
336 | sapply(X = data.frame.list, FUN = plotting_function,
337 |    type = "p", pch = 1, col = "blue")  
338 | 
339 | # this produced the plots; sapply also printed a "pro forma" "NULL" for each 
340 | # call, since plotting_function returns the value of its call to the plot function
341 | # to suppress this useless text, one could use the invisible function:
342 | 
343 | # invisible(sapply(X = data.frame.list, FUN = plotting_function,
344 | #   type = "p", pch = 1, col = "blue"))  
345 | 
346 | ```
347 | 
348 | ## A note when using sapply and passing in arguments using its ellipsis capability
349 | 
350 | When using the ellipsis functionality in sapply to pass arguments, 
351 | the R help on sapply recommends specifying the X and FUN arguments explicitly
352 | by name (not just by position), for example:
353 | 
354 |     sapply(X = some.data.frame, FUN = function.that.does.plots, 
355 |            type = "p", pch = 1, col = "green")
356 | 
357 | ## The split function. 
358 | 
359 | The **split** function can be used to "split up" a data frame df into a list 
360 | of data frames that are subsets of df, call it df.split.list, 
361 | based on a character or factor column of df. 
362 | For each of the distinct entries E in the character (or factor) 
363 | column of df being used to "do the split", there will be a 
364 | data frame in df.split.list which is the subset of df containing all the 
365 | rows of df for which the entry in the column being used to do the split 
366 | equals E, and the name of that entry in df.split.list will be the 
367 | character string E.
368 | 
369 | ## The practice exercise 
370 | 
371 | The practice exercise is: 
372 | given the name of one of the 4 numeric data columns in iris.df,
373 | compute the mean of the entries in that column that correspond to each of 
374 | the 3 iris flower types (so one will compute 3 means; one mean for all the  
375 | entries in that column whose iris type is setosa, one mean for the 
376 | versicolor flowers and one mean for virginica flowers). 
377 | Do this by using split on iris.df to produce a list of 3 data frames, 
378 | one for each of the 3 types of iris flower. 
379 | Then write a function that, given one of these data frames and the name 
380 | of one of the 4 numeric data columns in iris.df, will 
381 | compute the mean of the specified column. Use this function, and the list 
382 | of 3 data frames produced by split, in sapply to get the result.
383 | 
384 | Try doing this before looking at one possible solution in the R session given below.
385 | 
386 | ```{r}
387 | 
388 | # make this session self contained (iris.df is re-created from iris, so it has no NA in it)
389 | data(iris)  # make the iris data set available to this R session
390 | iris.df <- iris  # a copy, emphasizing it is a data frame
391 | 
392 | head(iris.df)  # look at it
393 | 
394 | # split iris.df by the 3 iris flower types (species) into a list of 3 data frames, one per species
395 | df.split.list <- split(iris.df, iris.df$Species)
396 | 
397 | get.mean.of.specified.column <- function(df, column.name) {
398 | # returns the mean of one column of df; df will be one of the 3 data frames in df.split.list
399 | # column.name is the name (a character string) of the column to average, one of the quantities measured for each flower:
400 | # (Sepal.Length, Sepal.Width, Petal.Length, Petal.Width)
401 | 
402 | mean.of.given.column <- mean(df[[column.name]]) 
403 | # iris.df was re-created from iris at the top of this session, so it has no NAs and na.rm = TRUE is not needed here 
404 | # (in an example above I placed an NA in iris.df to test ways of passing
405 | #  na.rm = TRUE into the mean function)
406 | 
407 | return(mean.of.given.column)
408 | }
409 | 
410 | # Use sapply once for each of the columns; column.name is passed into the function via the ellipsis of sapply
411 | # (One could also use sapply (with a modified function) to return a matrix
412 | # containing the means for the 3 flower species for each of the 4
413 | # measured quantities. The modified function would return a vector of the
414 | # means for the 4 quantities for any of the 3 data frames in df.split.list) 
415 | 
416 | sapply(X = df.split.list, FUN = get.mean.of.specified.column, 
417 |        column.name = "Sepal.Length")
418 | 
419 | sapply(X = df.split.list, FUN = get.mean.of.specified.column, 
420 |        column.name = "Sepal.Width")
421 | 
422 | sapply(X = df.split.list, FUN = get.mean.of.specified.column, 
423 |        column.name = "Petal.Length")
424 | 
425 | sapply(X = df.split.list, FUN = get.mean.of.specified.column, 
426 |        column.name = "Petal.Width")
427 | 
428 | 
429 | # Now program the function to be used in sapply to return the 
430 | # vector of all 4 means. Then sapply will return a matrix
431 | 
432 | # Here is a solution
433 | 
434 | get.all.4.means.in.df <- function(df) {
435 | # returns a vector holding the means of the first 4 columns of df; df will be one of the 3 data frames in df.split.list
436 | # Its 4 numeric column names are
437 | #     Sepal.Length, Sepal.Width, Petal.Length, Petal.Width
438 | 
439 | # use sapply within this function to get the 4 means (one call to mean per column)
440 | the.4.means <- sapply(df[, 1:4], mean)
441 | # this will also "capture" the column names, as the names of the.4.means
442 | 
443 | # there are no NAs in iris.df so here don't need to use na.rm = TRUE 
444 | # (in an example above I placed an NA in iris.df to test ways of passing
445 | #  na.rm = TRUE into the mean function)
446 | 
447 | return(the.4.means)
448 | }
449 | 
450 | # use sapply and this function to get the matrix of means (one row per measured quantity, one column per species)
451 | result <- sapply(X = df.split.list, FUN = get.all.4.means.in.df) 
452 | result
453 | 
454 | # notice sapply conveniently captured the row and column names for 
455 | # the matrix: rows from the names of the vector the function returns, columns from the names of the list
456 | 
457 | rownames(result)
458 | 
459 | # And in particular:
460 | 
461 | colnames(result)
462 | 
463 | # will be the same as
464 | 
465 | names(df.split.list)
466 | 
467 | # if one prefers species as rows and measured quantities as columns, one can transpose the matrix
468 | print(t(result))
469 |  
470 | ```
471 | 
472 | Hope this discussion has been helpful.
473 | 
474 | 
475 | = = = = = = = = = = = = = = = = = = = = = = = = 
476 | 
477 | This work is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. 
478 | To view a copy of this license, visit https://creativecommons.org/licenses/by-nc-sa/4.0/ or send a letter 
479 | to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA. There is a full version of this license at this web site: 
480 | https://creativecommons.org/licenses/by-nc-sa/4.0/legalcode      
481 | 


--------------------------------------------------------------------------------