├── Individual Scripts
├── 1. Packages.R
├── 10. Formulas.R
├── 11. Data Tables.R
├── 12. Differences in Subsetting Syntax.R
├── 13. Hashmaps.R
├── 14. Control Flow Structures.R
├── 15. DPLYR.R
├── 16. Functions.R
├── 17. Data Visualization.R
├── 18. Reading and Writing Data.R
├── 19. Web Scraping and Parsing.R
├── 2. Variables.R
├── 20. Feature Selection.R
├── 21. Linear Regression.R
├── 22. Logistic Rregression.R
├── 23. Random Forrest.R
├── 24. XGBoost.R
├── 25. Future To Do List.R
├── 3. R Scalars.R
├── 4. Types of Data Structures.R
├── 5. Vectors.R
├── 6. Matrices.R
├── 7. Lists.R
├── 8. Data Frames.R
└── 9. Factors.R
├── Learn R by Example.R
├── README.md
└── SplitRFilePSScript.ps1

/Individual Scripts/1. Packages.R:
--------------------------------------------------------------------------------
## Packages

# Example of how to download the 'stringi' package from CRAN and load it:

install.packages('stringi')
require(stringi)

# You can also use the library function to load and attach add-on packages.

library(stringi)

installed.packages()

# We'll create a function which takes a list of packages as input and checks if the packages are
# installed. If not, the function installs the missing packages and loads them into the R session.

install_and_load_package <- function(packages) {

  missing_packages <- packages[!(packages %in% installed.packages()[, "Package"])]

  if (length(missing_packages))
    install.packages(missing_packages, dependencies = TRUE)

  # Load all of the packages by applying the require function to each item
  sapply(packages, require, character.only = TRUE)

}

# Let's try loading a few packages.

install_and_load_package('rvest')

packages_to_load <- c("ggplot2", "reshape2")

install_and_load_package( packages_to_load )

install_and_load_package( c('randomForest', 'xgboost', 'Rcpp') )

--------------------------------------------------------------------------------
/Individual Scripts/10. Formulas.R:
--------------------------------------------------------------------------------
## Formulas

# Statistical functions in R make heavy use of formulas. The formula interface allows you to concisely specify
# which columns to use when fitting a model, as well as the behavior of the model. When running model functions
# like lm (Linear Regression), the formula specifies which regression coefficients shall be estimated.

# On the left side of the ~, the dependent variable is specified, while the right-hand side contains the independent
# variables.
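# (A concrete aside, not part of the original script: with the mtcars data used below,
# a formula such as mpg ~ wt + hp would model fuel efficiency as a function of weight
# and horsepower. The variables named in a formula can be listed with base R's all.vars():

all.vars(mpg ~ wt + hp) # returns "mpg" "wt" "hp"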
9 | 10 | formula1 <- formula(dependent_variable ~ independent_variable) 11 | 12 | class(formula1) # returns "formula" 13 | 14 | # Use the mtcars data frame to build our model / show our examples: 15 | 16 | data_frame_to_use = mtcars 17 | 18 | # Create new columns in our data frame marking the dependent / independent variables to use for our examples: 19 | 20 | data_frame_to_use$dependent_variable = mtcars$mpg 21 | data_frame_to_use$independent_variable = mtcars$wt 22 | 23 | # Run the linear regression model function: 24 | 25 | model1 <- lm(formula1, data = data_frame_to_use) 26 | 27 | # Technically, the formula call above is redundant because the tilde-operator is an infix function that returns 28 | # an object with a formula class: 29 | 30 | formula1 <- dependent_variable ~ independent_variable 31 | 32 | class(formula1) # returns "formula" 33 | 34 | # The advantage of the formula function over ~ is that it also allows an environment for evaluation to be specified: 35 | 36 | form_mt <- formula(dependent_variable ~ independent_variable, env = data_frame_to_use) 37 | 38 | # The formula operator + means to include a column, not to mathematically add two columns together: 39 | 40 | data_frame_to_use$independent_variable2 = data_frame_to_use$vs 41 | 42 | formula1 <- formula( dependent_variable ~ independent_variable + independent_variable2, data = data_frame_to_use) 43 | 44 | # Some more basic formula operator examples: 45 | 46 | # "-" below means: Include independent_variable but exclude independent_variable2: 47 | 48 | formula1 <- formula( dependent_variable ~ independent_variable - independent_variable2, data = data_frame_to_use) 49 | 50 | # ":" below means: Estimate the independent_variable and independent_variable2 interactions: 51 | 52 | formula1 <- formula( dependent_variable ~ independent_variable:independent_variable2, data = data_frame_to_use) 53 | 54 | # "*" below means: Include columns as well as their interactions. In other words, include independent_variable 55 | # and independent_variable2 as well as their interactions 56 | 57 | formula1 <- formula( dependent_variable ~ independent_variable * independent_variable2, data = data_frame_to_use) 58 | 59 | # Same as: 60 | 61 | formula1 <- formula( dependent_variable ~ independent_variable 62 | + independent_variable2 63 | + independent_variable:independent_variable2 64 | , data = data_frame_to_use) 65 | 66 | # "|" below means: Estimate dependent_variable as a function of independent_variable conditional 67 | # on independent_variable2: 68 | 69 | formula1 <- formula( dependent_variable ~ independent_variable | independent_variable2, data = data_frame_to_use) 70 | 71 | # Finally, "." is shorthand for using all available variables. In the below case, the data argument is used to 72 | # obtain the available variables which are not on the left hand side: 73 | 74 | formula1 <- formula( dependent_variable ~ . , data = data_frame_to_use) 75 | 76 | -------------------------------------------------------------------------------- /Individual Scripts/11. Data Tables.R: -------------------------------------------------------------------------------- 1 | ## Data Tables 2 | 3 | # Data.table is a package that extends the functionality of data frames from base R, particularly improving on their 4 | # performance and syntax. Functions that work on a data.frame will also work with a data.table. There are 5 | # many ways to create, load or coerce to a data.table. 
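# (Aside, not in the original script: one such way, assuming the data.table package
# loaded just below, is setDT(), which converts an existing data.frame or list to a
# data.table in place, without copying the object. A minimal sketch:
#
#   df <- data.frame(a = 1:3, b = letters[1:3])
#   data.table::setDT(df)   # df now behaves as a data.table
# )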
6 | 7 | # There is a constructor of the same name: 8 | 9 | install_and_load_package('data.table') 10 | 11 | data_table1 <- data.table( 12 | x = letters[1:5], 13 | y = 1:5, 14 | z = (1:5) > 3 15 | ) 16 | 17 | data_table1 18 | # x y z 19 | # 1: a 1 FALSE 20 | # 2: b 2 FALSE 21 | # 3: c 3 FALSE 22 | # 4: d 4 TRUE 23 | # 5: e 5 TRUE 24 | 25 | # Unlike data.frame, data.table will not coerce strings to factors: 26 | 27 | sapply(data_table1, class) 28 | # x y z 29 | # "character" "integer" "logical" 30 | 31 | # If you have another R object (such as a matrix), you must use as.data.table to coerce it to a data.table: 32 | 33 | mat <- matrix(0, ncol = 10, nrow = 10) 34 | 35 | data_table2 <- as.data.table(mat) 36 | # or 37 | data_table2 <- data.table(mat) 38 | 39 | -------------------------------------------------------------------------------- /Individual Scripts/12. Differences in Subsetting Syntax.R: -------------------------------------------------------------------------------- 1 | ## Differences in Subsetting Syntax 2 | 3 | # A data.table is one of several two-dimensional data structures available in R, besides data.frame, matrix and (2D) 4 | # array. All of these classes use a very similar but not identical syntax for subsetting, the A[rows, cols] schema. 5 | 6 | # Consider the following data stored in a matrix, a data.frame and a data.table: 7 | 8 | matrix <- matrix(1:12, nrow=4, dimnames=list(letters[1:4], c('X', 'Y', 'Z'))) 9 | 10 | matrix 11 | # X Y Z 12 | # a 1 5 9 13 | # b 2 6 10 14 | # c 3 7 11 15 | # d 4 8 12 16 | 17 | data_frame <- as.data.frame(matrix) 18 | data_table <- as.data.table(matrix) 19 | 20 | matrix[2:3] # returns 2 3, the 2nd and 3rd items, as if matrix were a vector (because it is!) 21 | 22 | data_frame[2:3] # returns the 2nd and 3rd columns 23 | # Y Z 24 | # a 5 9 25 | # b 6 10 26 | # c 7 11 27 | # d 8 12 28 | 29 | data_table[2:3] # returns the 2nd and 3rd rows! 30 | # X Y Z 31 | # 1: 2 6 10 32 | # 2: 3 7 11 33 | 34 | # If you want to be sure of what will be returned, it is better to be explicit. 35 | # To get specific rows, just add a comma after the range: 36 | 37 | matrix[2:3, ] # returns the 2nd and 3rd rows 38 | data_frame[2:3, ] # returns the 2nd and 3rd rows 39 | data_table[2:3, ] # returns the 2nd and 3rd rows 40 | 41 | # But, if you want to subset columns, some cases are interpreted differently. All three can be subset the same way 42 | # with integer or character indices not stored in a variable. 43 | 44 | matrix[, 2:3] # returns the 2nd and 3rd columns 45 | data_frame[, 2:3] # returns the 2nd and 3rd columns 46 | data_table[, 2:3] # returns the 2nd and 3rd columns 47 | matrix[, c("Y", "Z")] # returns the 2nd and 3rd columns 48 | data_frame[, c("Y", "Z")] # returns the 2nd and 3rd columns 49 | data_table[, c("Y", "Z")] # returns the 2nd and 3rd columns 50 | 51 | # The setkey() function sorts a data.table and marks it as sorted. The sorted columns are the key. The key can be any 52 | # columns in any order. The columns are always sorted in ascending order. The table is changed by reference, which 53 | # means that the entire table isn't copied and re-arranged. The rows are just swapped. 54 | 55 | # Setting keys in data.table: 56 | 57 | setkey(data_table, Y) 58 | 59 | # Setting secondary indices: 60 | 61 | # Indexing is a way of sorting a number of records on multiple fields. Creating an index on a field in a table creates 62 | # another data structure which holds the field value, and a pointer to the record it relates to. 
This index structure 63 | # is then sorted, allowing efficient binary searches to be performed on it. The downside of this is that more memory 64 | # is needed to hold the extra indexing data, although more efficient searches can be performed. 65 | 66 | # In a manner similar to key, you can setindex(DT, key.col) or setindexv(DT, "key.col.string"), where DT is 67 | # your data.table. Remove all indices with setindex(DT, NULL). 68 | 69 | # Let us set x as index: 70 | 71 | setindex(data_table, X) 72 | 73 | # There are many reasons to write code that is guaranteed to work with data.frame and data.table. Maybe you are 74 | # forced to use data.frame, or you may need to share some code that you don't know how will be used. So, there are 75 | # some main strategies for achieving this, in order of convenience: 76 | 77 | # 1. Use syntax that behaves the same for both classes. 78 | # 2. Use a common function that does the same thing as the shortest syntax. 79 | # 3. Force data.table to behave as data.frame (ex.: call the specific method print.data.frame). 80 | # 4. Treat them as list, which they ultimately are. 81 | # 5. Convert the table to a data.frame before doing anything (bad idea if it is a huge table). 82 | # 6. Convert the table to data.table, if dependencies are not a concern. 83 | 84 | -------------------------------------------------------------------------------- /Individual Scripts/13. Hashmaps.R: -------------------------------------------------------------------------------- 1 | ## Hashmaps 2 | 3 | # Although R does not provide a native hash table structure, similar functionality can be achieved by leveraging the 4 | # fact that the environment object returned from new.env (by default) provides hashed key lookups. The following 5 | # two statements are equivalent, as the hash parameter defaults to TRUE: 6 | 7 | hash_map <- new.env(hash = TRUE) 8 | hash_map <- new.env() 9 | 10 | # Insertion of elements may be done using either of the '<-' or '$' methods: 11 | 12 | hash_map[["key"]] = "value" 13 | hash_map$key2 = "value2" 14 | 15 | hash_map$key # returns "value" 16 | hash_map[["key2"]] # returns "value2" 17 | 18 | # Elements can be removed using rm: 19 | 20 | rm("key", envir = hash_map) 21 | 22 | ls.str(hash_map) # returns key2 : chr "value2" 23 | 24 | # One of the major benefits of using environment objects as hash tables is their ability to store virtually any 25 | # type of object as a value, even other environments: 26 | 27 | hash_map2 <- new.env() 28 | hash_map2[["a"]] <- LETTERS 29 | hash_map2[["b"]] <- as.list(x = 1:5, y = matrix(rnorm(10), 2)) 30 | hash_map2[["c"]] <- head(mtcars, 3) 31 | hash_map2[["d"]] <- Sys.Date() 32 | hash_map2[["e"]] <- Sys.time() 33 | 34 | -------------------------------------------------------------------------------- /Individual Scripts/14. 
Control Flow Structures.R:
--------------------------------------------------------------------------------
## Control Flow Structures

# Standard if / else if / else statement:

x <- 0

if (x < 0) {
  print("Negative")
} else if (x > 0) {
  print("Positive")
} else {
  print("Zero")
}

# Outputs:
# [1] "Zero"

# R allows us to write inline constructs such as the one below:

x <- 3

y <- if(x > 3) "Larger than 3" else "Less than or equal to 3"

y # returns "Less than or equal to 3"

# Standard for loop:

values <- c("value1","value2")

for (value in values) {
  print(value) # prints out "value1", followed by "value2"
}

# To illustrate the effect of good for loop construction, we will calculate the mean of each column in four different
# ways:

# 1. Using a poorly optimized for loop
# 2. Using a well optimized for loop
# 3. Using an *apply family of functions
# 4. Using the colMeans function

# 1. Using a poorly optimized for loop example (mean time to run: ~290 ms):

poor_column_mean <- NULL

for ( i in 1 : length(mtcars) ) {
  poor_column_mean[i] <- mean(mtcars[[i]])
}

# 2. Using a well optimized for loop example (mean time to run: ~260 ms):

better_column_mean <- vector("numeric", length(mtcars))

for (i in seq_along(mtcars)) {
  better_column_mean[i] <- mean(mtcars[[i]])
}

# 3. Using an *apply family of functions example (mean time to run: ~120 ms):

vapply_column_mean <- vapply(mtcars, mean, numeric(1))

# 4. Using the colMeans function (mean time to run: ~180 ms):

colMeans_column_mean <- colMeans(mtcars)

# The while loop

counter <- 0

while (counter < 3) {
  cat(counter, "\n")
  counter <- counter + 1
}

# Output:
# 0
# 1
# 2

# The repeat loop

vector <- c("Repeat","loop")
counter <- 0

repeat {
  print(vector)
  counter <- counter + 1

  if(counter >= 2) {
    break
  }
}

# Outputs:
# [1] "Repeat" "loop"
# [1] "Repeat" "loop"

--------------------------------------------------------------------------------
/Individual Scripts/15. DPLYR.R:
--------------------------------------------------------------------------------
## DPLYR

# dplyr introduces a grammar of data manipulation in R. It provides a consistent interface to work with data no
# matter where it is stored: data.frame, data.table, or a database. The key pieces of dplyr are written using Rcpp,
# which makes it very fast for working with in-memory data.
#
# dplyr's philosophy is to have small functions that do one thing well. The five simple functions (filter, arrange,
# select, mutate, and summarise) can be used to reveal new ways to describe data. When combined with group_by,
# these functions can be used to calculate group-wise summary statistics.
10 | 11 | install_and_load_package('dplyr') 12 | 13 | mtcars_table <- as_data_frame(tibble::rownames_to_column(mtcars, "cars")) 14 | 15 | # Filter helps subset rows that match certain criteria: 16 | 17 | filter(mtcars_table, cyl == 4) # returns all cars that have 4 cylinders 18 | 19 | filter(mtcars_table, cyl == 4 | cyl == 6, gear == 5) # returns the cars which have either 4 or 6 cylinders and 5 gears 20 | 21 | slice(mtcars_table, 6:9) # returns rows 6 through 9 22 | 23 | # Arrange is used to sort the data by a specified variable(s). 24 | 25 | arrange(mtcars_table, hp) # orders the data by horsepower - hp 26 | 27 | arrange(mtcars_table, desc(mpg), cyl) # orders the data by miles per gallon in desc order, followed by # of cylinders 28 | 29 | # Select is used to select only a subset of variables 30 | 31 | select (mtcars_table, mpg, disp, wt, qsec, vs) # returns mpg, disp, wt, qsec, and vs from mtcars_tbl 32 | 33 | select (mtcars_table, cylinders = cyl, displacement = disp) # returns and renames the cylinders and displacement columns 34 | 35 | select (mtcars_table, mpg:wt) # returns all of the columns between the mpg and wt columns 36 | 37 | # Mutate can be used to add new columns to the data. 38 | 39 | mutate(mtcars_table, weight_ton = wt / 2, weight_pounds = weight_ton * 2000) # Adds 2 new columns to the data frame 40 | 41 | # To retain only the newly created columns, use transmute instead of mutate: 42 | 43 | transmute(mtcars_table, weight_ton = wt/2, weight_pounds = weight_ton * 2000) # Only has the 2 columns specified 44 | 45 | # Summarise calculates summary statistics of variables by collapsing multiple values to a single value. 46 | 47 | summarise(mtcars_table, mean_mpg = mean(mpg), sd_mpg = sd(mpg), 48 | mean_disp = mean(disp), sd_disp = sd(disp)) 49 | 50 | # group_by can be used to perform group wise operations on data. 51 | 52 | by_cyl <- group_by(mtcars_table, cyl) 53 | summarise(by_cyl, mean_mpg = mean(mpg), sd_mpg = sd(mpg)) 54 | 55 | # Putting it all together: 56 | 57 | # Example with intermediate results (simple): 58 | 59 | selected <- select(mtcars_table, cars:hp, gear) 60 | ordered <- arrange(selected, cyl, desc(mpg)) 61 | by_cyl <- group_by(ordered, gear) 62 | filter(by_cyl, mpg > 20, hp > 75) 63 | 64 | # Example without intermediate results (more complex): 65 | 66 | filter( 67 | group_by( 68 | arrange( 69 | select( 70 | mtcars_table, cars:hp 71 | ), cyl, desc(mpg) 72 | ), cyl 73 | ),mpg > 20, hp > 75 74 | ) 75 | 76 | # dplyr operations can be chained using the pipe %>% operator: 77 | 78 | mtcars_table %>% 79 | select(cars:hp) %>% 80 | arrange(cyl, desc(mpg)) %>% 81 | group_by(cyl) %>% 82 | filter(mpg > 20, hp > 75) 83 | 84 | # summarise_all() is used to apply functions to all (non-grouping) columns: 85 | 86 | mtcars_table %>% 87 | summarise_all(n_distinct) 88 | 89 | # To summarise specific multiple columns, use summarise_at: 90 | 91 | mtcars_table %>% 92 | group_by(cyl) %>% 93 | summarise_at(c("mpg", "disp", "hp"), mean) 94 | 95 | # To select columns conditionally, use summarise_if: 96 | 97 | mtcars_table %>% 98 | group_by(cyl) %>% 99 | summarise_if(is.numeric, mean) 100 | 101 | -------------------------------------------------------------------------------- /Individual Scripts/16. 
Functions.R:
--------------------------------------------------------------------------------
## Functions

# Anonymous functions

df <- data.frame(first = 5:9, second = (0:4)^2, third = -1:3)

# Calculate the root mean square for each column in a data.frame:

apply( df, 2, function(x) { sqrt(sum(x^2)) })
#     first    second     third
# 15.968719 18.814888  3.872983

# This function takes as input a vector (vec in this example) and outputs the same vector with the
# vector's length (6 in this case) subtracted from each of the vector's elements:

vec <- 4:9

subtract.length <- function(x) { x - length(x) }

subtract.length(vec) # returns -2 -1 0 1 2 3

# The below function is a more complicated example which calls another function and returns a data frame:

vec2 <- (4:7)/2

msdf <- function(x, multiplier=4) {
  mult <- x * multiplier
  subl <- subtract.length(x)
  data.frame(mult, subl)
}

msdf(vec2, 5)
#   mult subl
# 1 10.0 -2.0
# 2 12.5 -1.5
# 3 15.0 -1.0
# 4 17.5 -0.5

# Apply Functions

# apply: Applies a function to the rows or columns of a matrix (and higher-dimensional analogues). It's not
# advisable to use it for data frames as it will coerce to a matrix first.

matrix <- matrix(seq(1,16), 4, 4)

matrix
#      [,1] [,2] [,3] [,4]
# [1,]    1    5    9   13
# [2,]    2    6   10   14
# [3,]    3    7   11   15
# [4,]    4    8   12   16

apply(matrix, 1, min) # applies the min function to each row and returns [1] 1 2 3 4

apply(matrix, 2, max) # applies the max function to each column and returns [1] 4 8 12 16

# lapply: Applies a function to each element of a list and returns a list containing the results.

list <- list(first = 1, second = 1:5, third = c(1,3,5))

lapply(list, FUN = sum)
# $first
# [1] 1
#
# $second
# [1] 15
#
# $third
# [1] 9

# sapply: Applies a function to each element of a list and returns a vector containing the results.

sapply(list, FUN = sum)
#  first second  third
#      1     15      9

sapply(list, FUN = length)
#  first second  third
#      1      5      3

--------------------------------------------------------------------------------
/Individual Scripts/17. Data Visualization.R:
--------------------------------------------------------------------------------
## Data Visualization

# Plotting data

# Below are some simple examples of how to plot a line in R, how to fit a line to some points, and how to add
# more points to a graph.
7 | 8 | # Make a very simply plot of (x, y) values and plot them: 9 | 10 | x <- c(4, 6, 8, 11, 15, 18) 11 | y <- c(2.8, 4.6, 6.2, 5.5, 7.8, 8.8) 12 | 13 | plot(x, y) 14 | 15 | # We can use a bunch of parameters to produce a more descriptive plot, as shown below: 16 | 17 | plot( x , y 18 | , xlab="X Axis Label" 19 | , ylab="Y Axis Label" 20 | , main = "Plot Title" 21 | , xlim = c(0, 20) # X axis range 22 | , ylim = c(0, 10) # Y axis range 23 | , pch = 4 # Set the plotting symbol to 'X' 24 | , col = "red" # Set the plot color to red 25 | ) 26 | 27 | # Create a linear model and plot it: 28 | 29 | lin_model <- lm(y ~ x) 30 | 31 | abline(lin_model) 32 | 33 | # Add more points to our graph: 34 | 35 | x2 <- c(3.3, 6.6, 9.9, 13.2) 36 | y2 <- c(1.6, 3.3, 5, 6.6 ) 37 | 38 | # Create a 2D line plot for the 4 new values: 39 | 40 | points( x2, y2 41 | , type="o" # Use a line plot 42 | , col = "blue" 43 | ) 44 | 45 | # Histograms 46 | 47 | # A histogram plots the frequencies that data appears within certain ranges. Below, we plot a simple histogram 48 | # showing our mtcars data horse power distribution. 49 | 50 | data(mtcars) 51 | 52 | hist(mtcars$hp, main = "Distribution of HP", xlab = "Horse Power") 53 | 54 | # For our histogram, R will automatically calculate the intervals to use, although we can specify the amount of 55 | # breaks we want using the breaks option: 56 | 57 | hist(mtcars$hp, main = "Distribution of HP", xlab = "HP", breaks = 4) 58 | 59 | # Boxplot 60 | 61 | # A boxplot provides a graphical view of the median, quartiles, maximum, and minimum of a data set. 62 | 63 | boxplot(mtcars$mpg, main = "Boxplot for Miles per Gallon Data") 64 | 65 | # We can also create a boxplot of a numerical variable grouped by a categorical variable 66 | 67 | # Use the iris data set to create a boxplot of the sepal.length column grouped by species: 68 | 69 | data(iris) 70 | 71 | boxplot(Sepal.Length ~ Species, data = iris, main = "Boxplot of Sepal Length Grouped by Species") 72 | 73 | # ggplot2 74 | 75 | # ggplot is a popular visualization package which we can use to create elegant and complex plots. 76 | 77 | # Let's illustrate some simple plots we can make using this library. 78 | 79 | install_and_load_package('ggplot2') 80 | 81 | # Create a regular dot plot of Sepal (length, width) points using our iris data: 82 | 83 | ggplot( iris 84 | , aes(x = Sepal.Length, y = Sepal.Width)) + # Specify the aesthetic mappings (variable mappings) 85 | geom_point() # Specify the geometric object. Here, we specify points (dots) to obtain a plot of points 86 | 87 | # Aesthetic mappings allow us to use properties within our data to influence the visual characeristics of our graphs. 88 | # Lets make the same plot as above, except we will specify a different color for each flower species: 89 | 90 | ggplot( iris 91 | , aes(x = Sepal.Length, y = Sepal.Width, color = Species)) + 92 | geom_point() 93 | 94 | # We can also make plots using different geometric shapes / objects. 
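# (Aside, not in the original script: earlier we drew boxplots with base R; the ggplot2
# equivalent is the geom_boxplot() geometric object. A minimal sketch using the same iris data:

ggplot( iris
      , aes(x = Species, y = Sepal.Length)) +
  geom_boxplot()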
# Create a line chart for the iris data set:

ggplot( iris
      , aes(x = Sepal.Length, y = Sepal.Width, color = Species)) +
  geom_line()

# Create a smoothed line chart:

ggplot( iris
      , aes(x = Sepal.Length, y = Sepal.Width, color = Species)) +
  geom_smooth()

# Create a bar chart:

ggplot( data = iris
      , aes(x = Sepal.Width)) +
  geom_bar()

# Create a histogram (which is similar to the bar chart above):

ggplot( data = iris
      , aes(x = Sepal.Width)) +
  geom_histogram()

# Sometimes, we may want to display multiple plots in one image using different facets. An advantage of
# this method is that all axes share the same scale across the charts, making it easy to compare them at a glance.
# You can construct a plot with multiple facets by using the facet_wrap() function.

# Use the iris dataset and the facet_wrap function to plot the iris sepal width across the different species:

ggplot( iris
      , aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point() +
  facet_wrap(~Species)

# Create a new graph with labels added to our visuals using the labs function:

ggplot( iris
      , aes(x = Sepal.Length, y = Sepal.Width, color = Species)) +
  geom_point() +
  labs( title = "Width and Length Iris Data",
        subtitle = "Simple description of our data can be included here",
        x = "Sepal Length",
        y = "Sepal Width",
        color = "Species"
  )

# Use the facet_grid function to facet our data by more than one categorical variable:

data(mpg)

ggplot( mpg
      , aes(x = displ, y = hwy)) +
  geom_point() +
  facet_grid(year ~ cyl) # Create a facet grid for year / cylinder data

# Time Series Data

# Time series data can be stored as a ts object. ts objects contain information about seasonal frequency that is used
# by ARIMA functions. A ts object also allows elements of the series to be selected by date using the window command.

# Create a dummy dataset of 100 observations:

x <- rnorm(100)

# Convert this vector to a ts object with 100 annual observations:

x <- ts(x, start = c(1900), freq = 1)

plot(x)

# Convert this vector to a ts object with 100 monthly observations starting in July:

x <- ts(x, start = c(1900, 7), freq = 12)

plot(x)

# Exploratory Data Analysis with time-series data

# Let's load some air passenger data:

data(AirPassengers)

# Plot the raw data:

plot(AirPassengers)

# Fit a trend line:

abline(reg=lm(AirPassengers~time(AirPassengers)))

--------------------------------------------------------------------------------
/Individual Scripts/18. Reading and Writing Data.R:
--------------------------------------------------------------------------------
## Reading and Writing Data

# cat takes one or more character vectors as arguments and prints them to the console. If the character vector
# has a length greater than 1, arguments are separated by a space (by default):

cat(c("hello", "world", "\n")) # outputs 'hello world'

# Reading from or writing to a file connection

# We don't always have the liberty to read from or write to a local system path.
To establish a file connection 11 | # to read data, use the file() command in read mode ("r" is for read mode): 12 | 13 | stdin_connection <- file("stdin", "r") # when just standard input/output for files are available 14 | 15 | file_connection <- file("README.md", "r") # when file is local 16 | 17 | # We can use the readline method to read the contents of the file. The n parameters specifies the number of lines 18 | # we want to read. Setting n to 1 means that we're reading the file line by line: 19 | 20 | read_file = function(file_path) { 21 | connection = file(file_path, "r") 22 | while ( TRUE ) { 23 | line = readLines(connection, n = 1, warn = FALSE) 24 | if ( length(line) == 0 ) { 25 | break 26 | } 27 | print(line) 28 | } 29 | close(connection) 30 | } 31 | 32 | read_file("README.md") # prints the results of README.md 33 | 34 | # You can change value of n (say 10, 20 etc.) for reading data blocks (i.e. we can use 10 to read 10 lines in 35 | # one go). To read complete file in one go set n = -1. 36 | 37 | all_lines <- readLines(file_connection, n = -1, warn = FALSE) 38 | 39 | print(all_lines) # prints the results of README.md 40 | 41 | # Close the open connections: 42 | 43 | close(file_connection) 44 | 45 | close(stdin_connection) 46 | 47 | # After processing data, you can write the results back to the file connection using many different commands like 48 | # writeLines(),cat() etc. which are capable of writing to a file connection. 49 | 50 | write_file_connection <- file("result.data", "w") # when file is local 51 | 52 | # Then write the data as follows: 53 | 54 | writeLines("text", write_file_connection, sep = "\n") 55 | 56 | close(write_file_connection) 57 | 58 | # Delete the results.data file: 59 | 60 | file.remove("result.data") 61 | 62 | # Importing .csv files 63 | 64 | # Get the file path of a CSV included in R's utils package 65 | 66 | csv_path <- system.file("misc", "exDIF.csv", package = "utils") 67 | 68 | df <- read.csv(csv_path) 69 | 70 | # The data.table package introduces the function fread. While it is similar to read.table, fread is usually faster and 71 | # more flexible. It tries to 'guess' the file's delimiter automatically: 72 | 73 | dt <- fread(csv_path) 74 | 75 | # To return an ordinary data.frame, set the data.table parameter to FALSE: 76 | 77 | df <- fread(csv_path, data.table = FALSE) 78 | 79 | # Data can be written to a CSV file using write.csv(): 80 | 81 | write.csv(mtcars, "mtcars.csv") 82 | 83 | # Importing multiple csv files: 84 | 85 | files = list.files(pattern="*.csv") 86 | 87 | data_list = lapply(files, read.table, header = TRUE) 88 | 89 | print (data_list) 90 | 91 | -------------------------------------------------------------------------------- /Individual Scripts/19. Web Scraping and Parsing.R: -------------------------------------------------------------------------------- 1 | ## Web Scraping and Parsing 2 | 3 | # rvest is a package for web scraping and parsing by Hadley Wickham inspired by Python's Beautiful Soup. It 4 | # leverages Hadley's xml2 package's libxml2 bindings for HTML parsing. 
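# (Aside, not in the original script: the typical rvest workflow is read_html() to fetch
# a page, html_nodes() with a CSS selector to pick elements, and html_text() or html_attr()
# to extract their contents. A minimal sketch, assuming rvest is loaded as below:
#
#   page  <- read_html("https://en.wikipedia.org/wiki/R_(programming_language)")
#   links <- html_nodes(page, "a")
#   head(html_attr(links, "href"))   # the first few link targets
# )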
5 | 6 | # To scrape the table of R milestones from the Wikipedia page on R, the code would look like: 7 | 8 | install_and_load_package('rvest') 9 | 10 | url <- 'https://en.wikipedia.org/wiki/R_(programming_language)' 11 | 12 | # Scrape HTML from website and use pipe operators to transform it into a data frame: 13 | 14 | url %>% 15 | # Read the url html: 16 | read_html() %>% 17 | # Select HTML tag with class="wikitable": 18 | html_node(css = '.wikitable') %>% 19 | # Parse table into data.frame: 20 | html_table() %>% 21 | # Trim the description to 100 characters for printing: 22 | dplyr::mutate(Description = substr(Description, 1, 100)) 23 | 24 | -------------------------------------------------------------------------------- /Individual Scripts/2. Variables.R: -------------------------------------------------------------------------------- 1 | ## Variables 2 | 3 | # In R, variables are assigned values using the infix-assignment operator <-. The operator = can also be used for 4 | # assigning values to variables, however, its proper use is for associating values with parameter names in function 5 | # calls. 6 | 7 | variable1 <- 22 8 | variable2 = 23 9 | 10 | # It is also possible to make assignments to variables using ->. 11 | 12 | 3 -> x 13 | 14 | # Private Variables: 15 | 16 | # A leading dot in a name of a variable or function in R is commonly used to denote that the variable or function is 17 | # meant to be hidden. 18 | 19 | .private_variable <- 'private' 20 | 21 | .private_variable 22 | # [1] "private" 23 | 24 | # The ls function which lists the objects will not include the private variable: 25 | 26 | ls() # returns a list which doesn't include 'private_variable' 27 | 28 | -------------------------------------------------------------------------------- /Individual Scripts/20. Feature Selection.R: -------------------------------------------------------------------------------- 1 | ## Feature Selection 2 | 3 | # Feature selection is about removing extraneous features / data cleaning. 4 | 5 | # A feature that has near zero variance is a good candidate for removal. You can manually detect numerical 6 | # variance below your own threshold: 7 | 8 | data(iris) 9 | 10 | variances <- apply(iris, 2, var) 11 | 12 | variances[which(variances <= 0.0025)] # returns character(0) 13 | 14 | # Or, you can use the caret package to find near zero variance: 15 | 16 | install_and_load_package('caret') 17 | 18 | names(iris)[nearZeroVar(iris)] # returns character(0) 19 | 20 | # Removing features with high numbers of NA. 
If a feature is largely lacking data, it is a good candidate 21 | # for removal: 22 | 23 | install_and_load_package('VIM') 24 | 25 | # Load the sleep data: 26 | 27 | data(sleep) 28 | 29 | # Use the colMeans and is.na functions to find the ratio of missing values for each column 30 | 31 | colMeans( is.na(sleep) ) 32 | # BodyWgt BrainWgt NonD Dream Sleep Span Gest 33 | # 0.00000000 0.00000000 0.22580645 0.19354839 0.06451613 0.06451613 0.06451613 34 | 35 | # In the above case, we may want to remove NonD and Dream, which have around 20% of their values missing 36 | 37 | # To drop columns, there's the subset command, which we could use as demonstrated below: 38 | 39 | sleep_with_missing_col <- subset(sleep, select = -c(NonD, Dream)) 40 | 41 | # Or we can use regular frame filtering and %in% operator: 42 | 43 | columns_to_drop <- c("NonD", "Dream") 44 | 45 | sleep_with_missing_col <- sleep[ , !(names(sleep) %in% columns_to_drop)] 46 | 47 | # Removing Closely Correlated Features 48 | 49 | # Closely correlated features may add variance to your model, and removing one of a correlated pair might help 50 | # reduce that. There are lots of ways to detect correlation. Here's one: 51 | 52 | install_and_load_package('purrr') 53 | 54 | # Select correlatable vars (numeric ones) 55 | 56 | to_correlate <- mtcars %>% keep ( is.numeric ) 57 | 58 | # Calculate correlation matrix 59 | 60 | correlation_matrix <- cor(to_correlate) 61 | 62 | correlation_matrix 63 | 64 | # Pick only one out of each highly correlated pair's mirror image 65 | 66 | correlation_matrix[upper.tri(correlation_matrix)] <- 0 67 | 68 | # We don't remove the highly-correlated-with-itself group 69 | 70 | diag(correlation_matrix) <- 0 71 | 72 | # Find features that are highly correlated with another feature at the +- 0.85 level 73 | 74 | apply(correlation_matrix, 2, function(x) any( abs(x) >= 0.85 ) ) 75 | # mpg cyl disp hp drat wt qsec vs am gear carb 76 | # TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 77 | 78 | # We'll want to look at what MPG is correlated to so strongly, and decide what to keep and what to toss. 79 | # Same for cyl and disp. Alternatively, we might need to combine some strongly correlated features. 80 | 81 | -------------------------------------------------------------------------------- /Individual Scripts/21. Linear Regression.R: -------------------------------------------------------------------------------- 1 | ## Linear Regression 2 | 3 | # The built-in mtcars data frame contains information about 32 cars, including their weight, fuel efficiency 4 | # (in miles per gallon), speed, etc. If we are interested in the relationship between fuel efficiency (mpg) 5 | # and weight (wt) we may start plotting those variables with: 6 | 7 | plot(mpg ~ wt, data = mtcars, col = 2) 8 | 9 | # The plots shows a (linear) relationship!. Then if we want to perform linear regression to determine the 10 | # coefficients of a linear model, we would use the lm function. 11 | # The ~ here means "explained by", so the formula mpg ~ wt means we are predicting mpg as explained by wt: 12 | 13 | example_model <- lm(mpg ~ disp, data = mtcars) 14 | 15 | # We can use the summary function to display the key output / results: 16 | 17 | summary(example_model) 18 | 19 | # Using the 'predict' function: Once a model is built predict is the main function to test with new data. 
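# (Aside, not in the original script: predict() for lm objects can also return interval
# estimates. A minimal sketch, assuming the example_model fitted above:
#
#   predict(example_model, newdata = data.frame(disp = 200), interval = "confidence")
#
# returns the fitted value together with lower and upper confidence bounds.)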
20 | 21 | # First, we sample from our original data 22 | 23 | set.seed(1234) 24 | 25 | new_data <- sample(mtcars$disp, 5) 26 | 27 | new_data # returns [1] 258.0 71.1 75.7 145.0 400.0 28 | 29 | # Create a new data frame with the same column names as the original data 30 | 31 | new_df <- data.frame( disp = new_data ) 32 | 33 | new_df 34 | 35 | predict(example_model, new_df) 36 | 37 | # Checking accuracy 38 | 39 | # Let's create a new data frame and use our model to make predictions and check our end results. 40 | # Create a new data frame containing the first 10 results from our mtcars data: 41 | 42 | new_df2 = data.frame(mpg = mtcars$mpg[1:10], disp = mtcars$disp[1:10]) 43 | 44 | # Use our linear model to make predictions on the data set created above: 45 | 46 | predictions <- predict(example_model, new_df2) 47 | 48 | # Calculate the root mean square error comparing our generated results and the original data set: 49 | 50 | sqrt(mean( (predictions - new_df2$mpg)^2 , na.rm = TRUE)) # returns 2.325148 51 | 52 | # Checking for nonlinearity with polynomial regression 53 | 54 | # Sometimes when working with linear regression we need to check for non-linearity in the data. One way to do this 55 | # is to fit a polynomial model and check whether it fits the data better than a linear model. There are other reasons, 56 | # such as theoretical, that indicate to fit a quadratic or higher order model because it is believed that the variables 57 | # relationship is inherently polynomial in nature. 58 | 59 | # Let's fit a quadratic model for the mtcars dataset. For a linear model see Linear regression on the mtcars dataset. 60 | 61 | # A linear fit will show that disp is not significant. 62 | 63 | fit0 = lm(mpg ~ wt + disp, mtcars) 64 | 65 | summary(fit0) 66 | 67 | # Output: 68 | # Coefficients: 69 | # Estimate Std. Error t value Pr(>|t|) 70 | # (Intercept) 34.96055 2.16454 16.151 4.91e-16 *** 71 | # wt -3.35082 1.16413 -2.878 0.00743 ** 72 | # disp -0.01773 0.00919 -1.929 0.06362 . 73 | # --- 74 | # Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 75 | # 76 | # Residual standard error: 2.917 on 29 degrees of freedom 77 | # Multiple R-squared: 0.7809, Adjusted R-squared: 0.7658 78 | # F-statistic: 51.69 on 2 and 29 DF, p-value: 2.744e-10 79 | 80 | # Then, to get the result of a quadratic model, we added I(disp^2). The new model appears better when looking at 81 | # R^2 and all variables are significant. 82 | 83 | fit1 = lm(mpg ~ wt + disp + I( disp ^ 2 ), mtcars) 84 | 85 | summary(fit1) 86 | 87 | # Output: 88 | # Coefficients: 89 | # Estimate Std. Error t value Pr(>|t|) 90 | # (Intercept) 41.4019837 2.4266906 17.061 2.5e-16 *** 91 | # wt -3.4179165 0.9545642 -3.581 0.001278 ** 92 | # disp -0.0823950 0.0182460 -4.516 0.000104 *** 93 | # I(disp^2) 0.0001277 0.0000328 3.892 0.000561 *** 94 | # --- 95 | # Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 96 | # 97 | # Residual standard error: 2.391 on 28 degrees of freedom 98 | # Multiple R-squared: 0.8578, Adjusted R-squared: 0.8426 99 | # F-statistic: 56.32 on 3 and 28 DF, p-value: 5.563e-12 100 | 101 | # Another way to specify polynomial regression is using poly with parameter raw=TRUE, otherwise orthogonal 102 | # polynomials will be considered (see the help(ploy) for more information). We get the same result using: 103 | 104 | summary(lm(mpg ~ wt + poly(disp, 2, raw=TRUE), mtcars)) 105 | 106 | -------------------------------------------------------------------------------- /Individual Scripts/22. 
Logistic Rregression.R:
--------------------------------------------------------------------------------
## Logistic Regression

# Example logistic regression on the Titanic dataset

# Read the data:

url <- "http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt"

titanic <- read.csv(file = url, stringsAsFactors = FALSE)

# Clean the missing values. In this case, we replace the missing age values with the average:

titanic$age[is.na(titanic$age)] <- mean(titanic$age, na.rm = TRUE)

# Train the model:

titanic.train <- glm( survived ~ pclass + sex + age
                    , family = binomial
                    , data = titanic
                    )

# Display the model summary:

summary(titanic.train)

# Output:
# Call: glm(formula = survived ~ pclass + sex + age, family = binomial,
#     data = titanic)
#
# Coefficients:
# (Intercept)    pclass2nd    pclass3rd      sexmale          age
#     4.52216     -1.49523     -2.84127     -3.08671     -0.04931
#
# Degrees of Freedom: 632 Total (i.e. Null); 628 Residual
#  (680 observations deleted due to missingness)
# Null Deviance:     869.5
# Residual Deviance: 539.7   AIC: 549.7

--------------------------------------------------------------------------------
/Individual Scripts/23. Random Forrest.R:
--------------------------------------------------------------------------------
## Random Forest

# Random forests are an ensemble learning method mainly used for classification. The method is based on generating
# a large number of decision trees, each constructed using a different subset of your training set.

# Load the package using our function:

install_and_load_package('randomForest')

# Let's split our data set into training and validation data and train a random forest algorithm:

# Set random seed to make results reproducible:

set.seed(17)

# Calculate the size of each of the data sets:

data_set_size <- floor( nrow(iris) / 2 )

# Generate a random sample of "data_set_size" indexes

indexes <- sample(1:nrow(iris), size = data_set_size)

# Assign the data to the correct sets

training <- iris[indexes,]
validation1 <- iris[-indexes,]

# Some important random forest parameters:
# ntree: Defines the number of trees to be generated. It is typical to test a range of values for this parameter
#   (i.e. 100, 200, 300, 400, 500) and choose the one that minimises the OOB estimate of error rate.
# mtry: Is the number of features used in the construction of each tree. These features are selected at random,
#   which is where the "random" in "random forests" comes from. The default value for this parameter, when
#   performing classification, is sqrt(number of features).
# importance: Enables the algorithm to calculate variable importance.

# Perform the training:

rf_classifier = randomForest(Species ~ ., data = training, ntree = 100, mtry = 2, importance=TRUE)

# Validation set assessment #1: looking at the confusion matrix. The confusion matrix is a good way of looking at
# how good our classifier is performing when presented with new data.
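# (Aside, not in the original script: because the model was trained with importance = TRUE,
# variable importance can also be inspected once training is done. A minimal sketch using
# functions from the randomForest package:
#
#   importance(rf_classifier)   # importance measures for each feature
#   varImpPlot(rf_classifier)   # dot plot of the same information
#
# The confusion matrix assessment continues below.)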
prediction_for_table <- predict(rf_classifier, validation1[, -5])

table(observed = validation1[, 5], predicted = prediction_for_table)
#             predicted
# observed     setosa versicolor virginica
#   setosa         29          0         0
#   versicolor      0         20         3
#   virginica       0          1        22

--------------------------------------------------------------------------------
/Individual Scripts/24. XGBoost.R:
--------------------------------------------------------------------------------
## XGBoost

# XGBoost is a library designed and optimized for boosting trees algorithms.
# It's a highly successful algorithm, having won multiple machine learning competitions.

# Import the iris dataset

install_and_load_package( c('datasets', 'xgboost'))

data(iris)

# The first 4 columns of the dataset supply the feature vector, while the last column contains the
# corresponding labels:

x = as.matrix(iris[, 1:4])
y = as.numeric(factor(iris[, 5])) - 1

# Run the xgboost command:

model <- xgboost(data = x, label = y, nrounds = 10)
# [1]  train-rmse:0.685905
# [2]  train-rmse:0.494086
# [3]  train-rmse:0.357192
# [4]  train-rmse:0.262149
# [5]  train-rmse:0.194319
# [6]  train-rmse:0.147978
# [7]  train-rmse:0.110566
# [8]  train-rmse:0.083971
# [9]  train-rmse:0.064658
# [10] train-rmse:0.050646

# We can see that the algorithm went through 10 iterations, and that the training error, as measured by rmse,
# is decreasing, which is great.

# We didn't supply any test data. Therefore, to assess the performance of the model on unseen data,
# we perform cross-validation:

set.seed(1)

cv <- xgb.cv( data = x
            , label = y
            , nfold = 5
            , nrounds = 60
            )

# Let's take a closer look at some XGBoost hyperparameters:
# objective: specifies the task of XGBoost and determines the output: the most common ones are linear
#   (reg:linear) and logistic regression (reg:logistic), multiclass logistic regression (multi:softprob)
#   and ranking (rank:pairwise).
# eval_metric: specifies the evaluation metric used by XGBoost. It heavily depends on the objective
#   (rmse for regression, error for classification, mean average precision for ranking) and users can
#   create custom ones.
# max_depth: specifies the maximum depth of the trees (maximal number of splits) used by XGBoost. The
#   bigger this number, the more abstract our model can be, but we risk overfitting the model to the data.
# eta: controls the learning rate: the higher eta, the higher the contribution of new trees. A smaller
#   number will prevent overfitting, but increases the number of rounds XGBoost will need to converge.

params <- list("objective" = "multi:softprob",
               "eval_metric" = "mlogloss",
               "max_depth" = 6,
               "eta" = 0.3,
               "gamma" = 0,
               "colsample_bytree" = 1,
               "min_child_weight" = 1,
               "num_class" = 3)

# early_stopping_rounds: This parameter specifies when to stop the training. Setting it to 8
# means that the algorithm stops if there has not been an improvement in the test score (test-rmse)
# for 8 rounds.
70 | 71 | cv <- xgb.cv( data = x 72 | , label = y 73 | , nfold = 5 74 | , nrounds = 100 75 | , early_stopping_rounds = 8 76 | , params = params) 77 | 78 | # After running the above, we can see that our algoirthm will use test_mlogloss for early stopping and will 79 | # stop at ~20 iterations. 80 | 81 | -------------------------------------------------------------------------------- /Individual Scripts/25. Future To Do List.R: -------------------------------------------------------------------------------- 1 | ## Future To Do List 2 | 3 | # 1. Add in caret package examples. 4 | # 2. Come up with clearer machine learning explanations / examples. 5 | 6 | -------------------------------------------------------------------------------- /Individual Scripts/3. R Scalars.R: -------------------------------------------------------------------------------- 1 | ## R Scalars 2 | 3 | # Some simple examples of declaring and using scalars: 4 | 5 | scalar1 <- 1 6 | scalar2 <- 2 7 | 8 | scalar1 + scalar2 # returns 3 9 | 10 | # Numeric classes and storage modes 11 | 12 | # Doubles are R's default numeric value. They are double precision vectors, meaning that they take up 8 bytes of 13 | # memory for each value in the vector. 14 | 15 | # Integers are represented by a number with an L after it. Any number without an L after it will be considered a 16 | # double. 17 | 18 | is.double( 1 ) # returns TRUE 19 | is.double( 1.0 ) # returns TRUE 20 | is.double( 1L ) # returns FALSE 21 | 22 | # Numeric represents integers and doubles and is the default mode assigned to vectors of numbers. The function 23 | # is.numeric() will determine if a vector is numeric. It is important to note that although integers and doubles 24 | # will pass is.numeric(), the function as.numeric() will always attempt to convert to type double. 25 | 26 | x <- 12.3 27 | y <- 12L 28 | 29 | typeof(x) # returns "double" 30 | typeof(y) # returns "integer" 31 | 32 | # Logical to numeric conversion 33 | 34 | as.numeric(TRUE) # returns 1 35 | 36 | # While TRUE == 1, it is a double and not an integer 37 | 38 | is.integer(as.numeric(TRUE)) # returns FALSE 39 | 40 | # The logical class 41 | 42 | # ! Not !x 43 | 44 | !TRUE # returns FALSE 45 | 46 | # The || operator evaluates the left condition and if the left condition is TRUE the right side is never evaluated: 47 | 48 | 7 > 6 || stop ( "X is too small" ) # returns TRUE 49 | 50 | 7 > 6 | stop ( "X is too small" ) # returns Error: X is too small 51 | 52 | # The && operator will likewise return FALSE without evaluation of the second argument when the first element 53 | # of the first argument is FALSE: 54 | 55 | 7 <= 6 && stop ( "X is too small" ) # returns FALSE 56 | 57 | 7 <= 6 & stop ( "X is too small" ) # returns Error: X is too small 58 | 59 | -------------------------------------------------------------------------------- /Individual Scripts/4. Types of Data Structures.R: -------------------------------------------------------------------------------- 1 | ## Types of Data Structures 2 | 3 | # There are no scalar data types in R. Vectors of length-one act like scalars. 4 | 5 | # There are 4 main data structures: 6 | 7 | # 1. Vectors: Atomic vectors must be sequence of same-class objects: a sequence of numbers, a sequence of 8 | # logicals, or a sequence of characters. Vectors are usually created with c(), short for combine: 9 | 10 | vec <- c(1, 4.5, 8.5, 10) 11 | 12 | # 2. Matrices: A matrix of numbers, logicals or characters. Matrices are vectors with a dimension attribute. 
The 13 | # dimension attribute is itself an integer vector of length 2 (number of rows, number of columns). 14 | 15 | matr <- matrix(1:6, ncol = 3, nrow = 2) 16 | 17 | # 3. Lists: A special type of vector that can contain elements of different classes. 18 | 19 | lst <- list(4:8, "g", c(FALSE, FALSE, TRUE), c(1.5, 8.8)) 20 | 21 | # 4. Data Frames: Data frames are represented as a special type of list where every element of the list has to 22 | # have the same length. Each element of the list can be thought of as a column and the length of each element 23 | # of the list is the number of rows. Unlike matrices, data frames can store different classes of objects in 24 | # each column. 25 | 26 | df <- data.frame( x = 1:3, y = c("str1", "str1", "str3")) 27 | 28 | -------------------------------------------------------------------------------- /Individual Scripts/5. Vectors.R: -------------------------------------------------------------------------------- 1 | ## Vectors 2 | 3 | # Atomic vectors must be sequence of same-class objects. 4 | 5 | vector <- c(2, 3, 7, 10) 6 | vector2 <- c("a", "b", "c") 7 | 8 | vector[1] # returns 1, the first element 9 | vector[4] # returns 10, our last element 10 | 11 | vector1 <- c(1,2,3,4) 12 | vector2 <- c(2,3,4,5) 13 | 14 | vector1 + vector2 # returns 3 5 7 9 15 | 16 | vector1 + scalar1 # returns 2 3 4 5 17 | 18 | vector1 ^ 2 # returns 1 4 9 16 19 | 20 | # Some vector functions: 21 | 22 | # Length outputs the number of elements in the vector: 23 | 24 | vector1 <- 1:4 25 | 26 | length(vector1) # returns: 4 27 | 28 | # Sum adds all of the elements of a vector: 29 | 30 | sum(vector1) # returns: 10 31 | 32 | # Adding vectors of different lengths: Shorter vectors in the expression are recycled as often as need be 33 | # until they match the length of the longest vector. In particular a constant is simply repeated. In the below 34 | # example, vector1's first element is recycled since its length is less than that of vector2: 35 | 36 | vector2 = 1:5 37 | 38 | vector1 + vector2 # returns 2 4 6 8 6 (with a warning message) 39 | 40 | # The Character Class 41 | 42 | # Characters are what other languages call 'string vectors.' 43 | 44 | x <- "The quick brown fox jumps over the lazy dog" 45 | 46 | class(x) # returns "character" 47 | 48 | # String Manipulation 49 | 50 | # Count pattern inside string: 51 | 52 | stri_count_fixed("babab", "b") # returns 3 53 | stri_count_fixed("babab", "ba") # returns 2 54 | 55 | # With regex 56 | 57 | stri_count_regex("a1 b2 a3 b4 aa", "a.") # returns 3 58 | stri_count_regex("a1 b2 a3 b4 aa", "a\\d") # returns 2 59 | 60 | # Duplicating strings 61 | 62 | stri_dup("abc", 3) # returns "abcabcabc" 63 | 64 | -------------------------------------------------------------------------------- /Individual Scripts/6. Matrices.R: -------------------------------------------------------------------------------- 1 | ## Matrices 2 | 3 | # Like vectors, matrices must be made of same-class elements. 4 | 5 | # Under the hood, a matrix is a special kind of vector with two dimensions. You can create matrices using the 6 | # matrix function as shown below: 7 | 8 | matrix(data = 1:6, nrow = 2, ncol = 3) 9 | # [,1] [,2] [,3] 10 | # [1,] 1 3 5 11 | # [2,] 2 4 6 12 | 13 | matrix2 <- matrix( data = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12), nrow = 4, ncol = 3, byrow = F) 14 | 15 | matrix2 16 | # [,1] [,2] [,3] 17 | # [1,] 1 5 9 18 | # [2,] 2 6 10 19 | # [3,] 3 7 11 20 | # [4,] 4 8 12 21 | 22 | matrix2[1, 1] # returns 1, the first entry of our matrix. 
23 | matrix2[1, 2] # returns 5, the element on the first row, second column. 24 | matrix2[4, 3] # returns 12, the last element of our matrix (fourth row, third column). 25 | 26 | # The rows and columns of a matrix can have names. You can look at these using the functions rownames and colnames. 27 | # As shown below, the rows and columns don't initially have names, and are denoted by NULL: 28 | 29 | rownames(matrix2) # returns NULL 30 | colnames(matrix2) # returns NULL 31 | 32 | # However, you can assign values to them: 33 | 34 | rownames(matrix2) <- c("Row 1", "Row 2", "Row 3", "Row 4") 35 | colnames(matrix2) <- c("Col 1", "Col 2", "Col 3") 36 | 37 | matrix2 38 | # Col 1 Col 2 Col 3 39 | # Row 1 1 5 9 40 | # Row 2 2 6 10 41 | # Row 3 3 7 11 42 | # Row 4 4 8 12 43 | 44 | # The 'class', 'is', and 'as' functions can be used to check and coerce data structures: 45 | 46 | class(matrix2) # returns "matrix" 47 | is.matrix(matrix2) # returns TRUE 48 | as.vector(matrix2) # returns 1 2 3 4 5 6 7 8 9 10 11 12 49 | 50 | # Matrix creation through cbind (column bind) function: 51 | 52 | matrix1 <- cbind(1:4, 5:8, 9:12) 53 | 54 | matrix1 55 | # [,1] [,2] [,3] 56 | # [1,] 1 5 9 57 | # [2,] 2 6 10 58 | # [3,] 3 7 11 59 | # [4,] 4 8 12 60 | 61 | # Matrix creation through rbind (row bind) function. 62 | 63 | matrix2 = rbind( c(1,5,9), c(2,6,10), c(3,7,11), c(4,8,12) ) 64 | # [,1] [,2] [,3] 65 | # [1,] 1 5 9 66 | # [2,] 2 6 10 67 | # [3,] 3 7 11 68 | # [4,] 4 8 12 69 | 70 | # Matrix + scalar example: 71 | # R adds the scalar value to each entry in the matrix: 72 | 73 | matrix1 + scalar1 74 | # [,1] [,2] [,3] 75 | # [1,] 2 6 10 76 | # [2,] 3 7 11 77 | # [3,] 4 8 12 78 | # [4,] 5 9 13 79 | 80 | # Matrix + vector example: 81 | # R does the operation in a column-wise manner: 82 | 83 | matrix1 + vector1 84 | # [,1] [,2] [,3] 85 | # [1,] 2 6 10 86 | # [2,] 4 8 12 87 | # [3,] 6 10 14 88 | # [4,] 8 12 16 89 | 90 | # Matrix + matrix example (it's always component wise): 91 | 92 | matrix1 + matrix2 93 | # [,1] [,2] [,3] 94 | # [1,] 2 10 18 95 | # [2,] 4 12 20 96 | # [3,] 6 14 22 97 | # [4,] 8 16 24 98 | 99 | # Matrix standard product example (it's always component wise): 100 | 101 | matrix1 * matrix2 102 | # [,1] [,2] [,3] 103 | # [1,] 1 25 81 104 | # [2,] 4 36 100 105 | # [3,] 9 49 121 106 | # [4,] 16 64 144 107 | 108 | -------------------------------------------------------------------------------- /Individual Scripts/7. Lists.R: -------------------------------------------------------------------------------- 1 | ## Lists 2 | 3 | # Lists are a special type of vector where each element can be anything, even another list. 4 | 5 | list_example <- list( A = c(5,6,7,8) 6 | , B = letters[1:6] 7 | , CC = list( 5, "Z") 8 | ) 9 | 10 | example_list <- list (course = 'stat' 11 | , date = '04/07/2009' 12 | , num_isc = 7 13 | , num_cons = 6 14 | , num_mat = as.character(c(45020, 45679, 46789, 43126, 42345, 47568, 45674)) 15 | , results = c(30, 19, 29, NA, 25, 26 ,27) 16 | ) 17 | 18 | # Extracting elements from a list can be done by name (if the list is named) or by index: 19 | 20 | example_list$date # returns the date element "04/07/2009" 21 | example_list[1] # returns the 1st element "stat" 22 | 23 | # Lists have two very important uses: 24 | 25 | # 1) Since functions can only return a single value, it is common to return complicated results in a list. 26 | # 2) Lists are also the underlying fundamental class for data frames. Under the hood, a data frame is a list of 27 | # vectors all having the same length. 
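# (Aside, not in the original script: note the difference between single and double brackets
# when extracting from a list. Single brackets return a list containing the selected elements,
# while double brackets return the element itself:

example_list[1]          # a list of length 1 (still of class "list") holding "stat"
example_list[["course"]] # the bare element, the character string "stat"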

# Using a list to return function results:

example_function <- function(x) list( xplus = x + 10, xsq = x ^ 2 )

results = example_function (7)

results$xplus # returns 17
results$xsq # returns 49

# Using a list to create a data frame:

list1 <- list(x = 1:2, y = c("A","B"))

data_frame1 <- data.frame(list1)

data_frame1
# x y
# 1 1 A
# 2 2 B

is.list(data_frame1) # returns TRUE

--------------------------------------------------------------------------------
/Individual Scripts/8. Data Frames.R:
--------------------------------------------------------------------------------
## Data Frames

# A data.frame is a special kind of list: it is rectangular. Each element (column) of the list has the same length,
# and each row has a "row name". Each column has its own class, but the class of one column can be different
# from the class of another column (unlike a matrix, where all elements must have the same class).

example_frame <- data.frame( matr = as.character(c(45020, 45679, 46789, 43126, 42345, 47568, 45674))
                           , res_S = c(30, 19, 29, NA, 25, 26, 27)
                           , res_O = c(3, 3, 1, NA, 3, 2, NA)
                           )

example_frame
# matr res_S res_O
# 1 45020 30 3
# 2 45679 19 3
# 3 46789 29 1
# 4 43126 NA NA
# 5 42345 25 3
# 6 47568 26 2
# 7 45674 27 NA

# Subsetting rows and columns from a data frame

# We can access elements of a data frame using matrix notation (with single brackets data[rows, columns]):

example_frame[1, 1] # returns 45020

# Get the first row:

example_frame[1, ] # returns 45020 30 3

# Get the first 2 rows:

example_frame[1:2, ]
# matr res_S res_O
# 1 45020 30 3
# 2 45679 19 3

# Get the first column:

example_frame[, 1] # returns 45020 45679 46789 43126 42345 47568 45674

# Get the res_O column:

example_frame$res_O # returns 3 3 1 NA 3 2 NA

# Get the res_S column:

example_frame[, 'res_S'] # returns 30 19 29 NA 25 26 27

# Get the first and third columns:

example_frame[, c(1, 3)]
# matr res_O
# 1 45020 3
# 2 45679 3
# 3 46789 1
# 4 43126 NA
# 5 42345 3
# 6 47568 2
# 7 45674 NA

# Get the first 4 rows of the res_S and res_O columns:

example_frame[1:4, c("res_S", "res_O")]
# res_S res_O
# 1 30 3
# 2 19 3
# 3 29 1
# 4 NA NA

# If you extract multiple columns, you will get a data frame back. However, if you extract a single column, you
# will get a vector, not a data frame, under the default options.

# Multiple columns return a data frame:

class(mtcars[, c("mpg", "cyl")]) # returns "data.frame"

# Single column returns a vector:

class(mtcars[, "mpg"]) # returns "numeric"

# When you use single brackets and no commas, you will get a data frame of the selected columns back,
# because data frames are lists of columns.

class(mtcars["mpg"]) # returns "data.frame"

mtcars[c("mpg", "cyl", "disp")] # returns a data frame containing the columns "mpg", "cyl", and "disp"

# To extract a single column as a vector when treating your data.frame as a list, you can use double brackets [[.
# This will only work for a single column at a time.
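
# As with lists in general, [[ drops the column name and returns just the underlying vector, so the result
# can be passed straight to vector functions - a small illustrative sketch:

mean(mtcars[["mpg"]]) # returns the mean miles per gallon across all cars, roughly 20.1
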

# Extract a single column by name as a vector:

mtcars[["mpg"]]

# A single column can be extracted using the magical shortcut $ without using a quoted column name.
# Columns accessed by $ will always be vectors, not data frames.

mtcars$mpg # returns the column "mpg" as a vector

# Logical vectors indicate specific elements to keep. We can use a condition such as < to generate a logical vector,
# and extract only the rows that meet the condition.

mtcars[mtcars$mpg < 15, ] # returns all cars which have miles per gallon (mpg) less than 15

mtcars[mtcars$cyl == 4, ] # returns all columns for rows where the value of cyl is 4

mtcars[mtcars$cyl == 4, c("cyl", "mpg", "hp")] # returns the cyl, mpg, and hp columns where the value of cyl is 4

# Convenience functions to manipulate data.frames

# The subset() function allows you to subset a data.frame in a more convenient way (subset also works with other
# classes):

# Return the rows for which cyl == 6, keeping only the columns mpg and hp:

subset(mtcars, subset = cyl == 6, select = c("mpg", "hp"))

# Same as:

mtcars[mtcars$cyl == 6, c("mpg", "hp")]

# The transform() function is a convenience function to change columns inside a data.frame.

# The below example adds another column named mpg2 with the result of mpg^2 to the mtcars data.frame:

mtcars <- transform(mtcars, mpg2 = mpg^2)

# Both with() and within() let you evaluate expressions inside the data.frame environment.

# The below example shows how to create, change and/or remove multiple columns in the airquality data.frame:

aq <- within(airquality, {
    lOzone <- log(Ozone) # creates new lOzone column
    Month <- factor(month.abb[Month]) # converts the existing Month column to a factor
    cTemp <- round((Temp - 32) * 5/9, 1) # creates new cTemp column
    S.cT <- Solar.R / cTemp # creates new S.cT column
    rm(Day, Temp) # removes Day and Temp columns
})

# It is important to note that, in R versions before 4.0.0, data frames coerce characters to factors by default.

# The behavior can be set explicitly with the stringsAsFactors parameter (since R 4.0.0 it defaults to FALSE,
# so character columns are kept as characters). Example:

df3 <- data.frame(x = 1:3, y = c("a", "b", "c"), stringsAsFactors = FALSE)

# If the data has already been created, factor columns can be converted to character columns as shown below.

person <- data.frame( jobs = c("scientist", "analyst")
                    , pay = c(160000, 100000)
                    , age = c(30, 25)
                    )

# Convert all columns to character:

person[] <- lapply(person, as.character)

# We can remove all rows from the data frame which have missing (NA) values using the
# complete.cases function, which returns a logical vector indicating which cases have
# no missing values:

person_with_missing_info <- data.frame( jobs = c("scientist", "secret agent", "analyst")
                                      , pay = c(160000, NA, 120000)
                                      , age = c(30, NA, 45)
                                      )

person_with_missing_info[complete.cases(person_with_missing_info), ]
# jobs pay age
# 1 scientist 160000 30
# 3 analyst 120000 45

# Let's say we want to omit only the rows which have missing jobs info:

person_with_missing_info[complete.cases(person_with_missing_info[, c('jobs')]), ]
# jobs pay age
# 1 scientist 160000 30
# 2 secret agent NA NA
# 3 analyst 120000 45

# Let's say we want to replace NA values with zeros - we can do this simply using the is.na function:

person_with_missing_info[is.na(person_with_missing_info)] <- 0

# Joining data frames

# We can do inner/outer/cross joins on data frames. As an example, let's take the 2 frames below.
# As a note, the R rep function replicates the input vector or list.

data_frame1 = data.frame(CustomerId = c(1:6), Product = c(rep("Toaster", 3), rep("Radio", 3)))

data_frame2 = data.frame(CustomerId = c(2, 4, 6), State = c(rep("Alabama", 2), rep("Ohio", 1)))

# By using the merge function and its optional parameters, we can do SQL style joins on our data.
# R automatically joins the frames by common variable names. Some examples are provided below:

merge(x = data_frame1, y = data_frame2) # Performs an inner join

merge(x = data_frame1, y = data_frame2, by = "CustomerId", all = TRUE) # Performs an outer join

merge(x = data_frame1, y = data_frame2, by = "CustomerId", all.x = TRUE) # Performs a left join

merge(x = data_frame1, y = data_frame2, by = "CustomerId", all.y = TRUE) # Performs a right join

merge(x = data_frame1, y = data_frame2, by = NULL) # Performs a cross join

# You can merge / join on multiple columns by giving 'by' a vector, e.g., by = c("CustomerId", "OrderId")

# We can also use the sqldf package, which allows you to express these operations in SQL:

install_and_load_package('sqldf')

# Inner join:

result_frame <- sqldf("SELECT CustomerId, Product, State
                       FROM data_frame1
                       JOIN data_frame2 USING(CustomerId)")

# Left join:

result_frame <- sqldf("SELECT CustomerId, Product, State
                       FROM data_frame1
                       LEFT JOIN data_frame2 USING(CustomerId)")

--------------------------------------------------------------------------------
/Individual Scripts/9. Factors.R:
--------------------------------------------------------------------------------
## Factors

# Factors are used to represent categorical data and can be unordered or ordered. One can think of a factor as
# an integer vector where each integer has a label. Factors are important in statistical modeling and are treated
# specially by modelling functions like lm() and glm().
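
# For instance, when a factor appears on the right hand side of a model formula, lm() automatically expands it
# into dummy (indicator) variables rather than treating it as a plain number. A small illustrative sketch using
# the built-in mtcars data, treating the cyl column as categorical:

cyl_factor_model <- lm(mpg ~ factor(cyl), data = mtcars)

coef(cyl_factor_model) # returns an intercept plus one coefficient for each non-baseline cyl level (6 and 8)
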

# Using factors with labels is better than using integers because factors are self-describing. Having a variable
# that has values "Male" and "Female" is better than a variable that has values 1 and 2.

factor_example <- factor(c("yes", "yes", "no", "yes"))

factor_example
# [1] yes yes no yes
# Levels: no yes

--------------------------------------------------------------------------------
/Learn R by Example.R:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/photonlines/Learn-R-by-Example/31a02695139f26cd28b7b546a7122c711224e7a9/Learn R by Example.R

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Learn R by Example

This is an R script meant to serve those who want to get an overview of R by looking through code / code comments and useful examples.

A summary of the file contents included in this repository is provided below:

- Learn R by Example.R : File which contains the entire script / R overview.
- 'Individual Scripts' directory : Directory which contains the Learn R by Example file contents split up into separate sections / R scripts.
- SplitRFilePSScript.ps1 : PowerShell script which splits the Learn R by Example file into individual sections and saves them in the 'Individual Scripts' directory.

## License
This code is released under the Creative Commons BY-SA license.
Most of the original examples / content have been modified and taken from the R Notes for Professionals book available here: https://books.goalkicker.com/RBook/
The original book content was compiled from Stack Overflow Documentation, and written by the great people at Stack Overflow.

--------------------------------------------------------------------------------
/SplitRFilePSScript.ps1:
--------------------------------------------------------------------------------
# The following PowerShell script divides the original 'Learn R by Example.R' file
# into multiple ones and copies them into another output directory.

# As a note, some of the individual scripts may not work when run in isolation,
# since some of the examples re-use the install_and_load_package function or
# data from previous sections / examples.

$InputFileName = "Learn R by Example.R"
$OutputDirectoryName = "Individual Scripts"

# Flag used to control whether you want to re-create the output directory containing
# the individual R files. If it's set to True, the output directory is deleted and
# re-created before the files are copied over. Otherwise, the scripts are copied
# over and any files already present within the output directory are overwritten.

$ReCreateOutputDirectory = $True

# Section delimiter which controls how the original R file is sub-divided. Currently,
# the beginning of each section is marked with the '##' comment characters followed by
# the section title, and these markers are used to split the original R file.

$SectionDelimiter = "##"

# Fetches the PowerShell script root directory path and returns it:

Function Get-RootDirectoryPath {

    $ScriptRoot = ""

    # Try to make sure that the script is compatible with both newer and older
    # versions of PowerShell:

    Try {
        $ScriptRoot = Get-Variable -Name PSScriptRoot -ValueOnly -ErrorAction Stop
    } Catch {
        $ScriptRoot = Split-Path $script:MyInvocation.MyCommand.Path
    }

    return $ScriptRoot
}

$RootDirectoryPath = Get-RootDirectoryPath

$InputFilePath = (Join-Path $RootDirectoryPath $InputFileName)

$OutputDirectoryPath = (Join-Path $RootDirectoryPath $OutputDirectoryName)

# If the re-create directory flag is set to true, delete the output
# directory prior to copying the files over.

If ($ReCreateOutputDirectory) {

    Remove-Item $OutputDirectoryPath -Recurse -ErrorAction Ignore

}

# Create the output directory (if it doesn't exist)

New-Item -ItemType Directory -Force -Path $OutputDirectoryPath

# Fetch the number of lines in our R file so we can create a progress indicator

$NumberOfLinesInFile = 0

gc $InputFilePath -read 100 | % { $NumberOfLinesInFile += $_.Length }

# Read the Learn R by Example file line by line:

$Reader = New-Object System.IO.StreamReader($InputFilePath)

Try {

    $SectionNumber = 1
    $LineNumber = 1

    While (($Line = $Reader.ReadLine()) -ne $null) {

        # If we find a new section delimiter / marker, we create a new file to output
        # our results to. The new file name is set to the section number followed by
        # the R section title which is provided after the delimiter we want to use.
        # For example, for the first section, the delimiter '##' in '## Section title'
        # will create a new '1. Section title.R' file in our output directory.

        If ($Line -match ($SectionDelimiter + " (.+\n?)")) {

            $SectionTitle = $matches[1].Trim()
            $OutputFileName = ($SectionNumber).ToString() + ". " + $SectionTitle + ".R"
            $SectionNumber = $SectionNumber + 1

        }

        # Add the line we just processed to our output directory / file:

        Add-Content (Join-Path $OutputDirectoryPath $OutputFileName) $Line

        # Update the progress indicator to show the percentage processed and the current line number:

        $PercentComplete = ($LineNumber / $NumberOfLinesInFile) * 100
        Write-Progress -Activity 'Processing R File' -Status "On line $LineNumber" -PercentComplete $PercentComplete

        $LineNumber = $LineNumber + 1

    }

} Finally {

    $Reader.Dispose()

}
--------------------------------------------------------------------------------