├── Individual Scripts
├── 1. Packages.R
├── 10. Formulas.R
├── 11. Data Tables.R
├── 12. Differences in Subsetting Syntax.R
├── 13. Hashmaps.R
├── 14. Control Flow Structures.R
├── 15. DPLYR.R
├── 16. Functions.R
├── 17. Data Visualization.R
├── 18. Reading and Writing Data.R
├── 19. Web Scraping and Parsing.R
├── 2. Variables.R
├── 20. Feature Selection.R
├── 21. Linear Regression.R
├── 22. Logistic Rregression.R
├── 23. Random Forrest.R
├── 24. XGBoost.R
├── 25. Future To Do List.R
├── 3. R Scalars.R
├── 4. Types of Data Structures.R
├── 5. Vectors.R
├── 6. Matrices.R
├── 7. Lists.R
├── 8. Data Frames.R
└── 9. Factors.R
├── Learn R by Example.R
├── README.md
└── SplitRFilePSScript.ps1

/Individual Scripts/1. Packages.R:
--------------------------------------------------------------------------------
## Packages

# Example of how to download the 'stringi' package from CRAN and load it:

install.packages('stringi')
require(stringi)

# You can also use the library function to load and attach add-on packages.

library(stringi)

installed.packages()

# We'll create a function which takes a list of packages as input and checks if the packages are
# installed. If not, the function installs the missing packages and loads them into the R session.

install_and_load_package <- function(packages) {

  missing_packages <- packages[!(packages %in% installed.packages()[, "Package"])]

  if (length(missing_packages))
    install.packages(missing_packages, dependencies = TRUE)

  # Load all of the packages by applying the require function to each item
  sapply(packages, require, character.only = TRUE)

}

# Let's try loading a few packages.

install_and_load_package('rvest')

packages_to_load <- c("ggplot2", "reshape2")

install_and_load_package( packages_to_load )

install_and_load_package( c('randomForest', 'xgboost', 'Rcpp') )

--------------------------------------------------------------------------------
/Individual Scripts/10. Formulas.R:
--------------------------------------------------------------------------------
## Formulas

# Statistical functions in R make heavy use of formulas. The formula interface allows you to concisely specify
# which columns to use when fitting a model, as well as the behavior of the model. When running model functions
# like lm (Linear Regression), the formula specifies which regression coefficients shall be estimated.

# On the left side of the ~, the dependent variable is specified, while the right-hand side contains the independent
# variables.
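# (A concrete aside, not part of the original script: with the mtcars data used below,
# a formula such as mpg ~ wt + hp would model fuel efficiency as a function of weight
# and horsepower. The variables named in a formula can be listed with base R's all.vars():

all.vars(mpg ~ wt + hp) # returns "mpg" "wt" "hp"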
9 | 10 | formula1 <- formula(dependent_variable ~ independent_variable) 11 | 12 | class(formula1) # returns "formula" 13 | 14 | # Use the mtcars data frame to build our model / show our examples: 15 | 16 | data_frame_to_use = mtcars 17 | 18 | # Create new columns in our data frame marking the dependent / independent variables to use for our examples: 19 | 20 | data_frame_to_use$dependent_variable = mtcars$mpg 21 | data_frame_to_use$independent_variable = mtcars$wt 22 | 23 | # Run the linear regression model function: 24 | 25 | model1 <- lm(formula1, data = data_frame_to_use) 26 | 27 | # Technically, the formula call above is redundant because the tilde-operator is an infix function that returns 28 | # an object with a formula class: 29 | 30 | formula1 <- dependent_variable ~ independent_variable 31 | 32 | class(formula1) # returns "formula" 33 | 34 | # The advantage of the formula function over ~ is that it also allows an environment for evaluation to be specified: 35 | 36 | form_mt <- formula(dependent_variable ~ independent_variable, env = data_frame_to_use) 37 | 38 | # The formula operator + means to include a column, not to mathematically add two columns together: 39 | 40 | data_frame_to_use$independent_variable2 = data_frame_to_use$vs 41 | 42 | formula1 <- formula( dependent_variable ~ independent_variable + independent_variable2, data = data_frame_to_use) 43 | 44 | # Some more basic formula operator examples: 45 | 46 | # "-" below means: Include independent_variable but exclude independent_variable2: 47 | 48 | formula1 <- formula( dependent_variable ~ independent_variable - independent_variable2, data = data_frame_to_use) 49 | 50 | # ":" below means: Estimate the independent_variable and independent_variable2 interactions: 51 | 52 | formula1 <- formula( dependent_variable ~ independent_variable:independent_variable2, data = data_frame_to_use) 53 | 54 | # "*" below means: Include columns as well as their interactions. In other words, include independent_variable 55 | # and independent_variable2 as well as their interactions 56 | 57 | formula1 <- formula( dependent_variable ~ independent_variable * independent_variable2, data = data_frame_to_use) 58 | 59 | # Same as: 60 | 61 | formula1 <- formula( dependent_variable ~ independent_variable 62 | + independent_variable2 63 | + independent_variable:independent_variable2 64 | , data = data_frame_to_use) 65 | 66 | # "|" below means: Estimate dependent_variable as a function of independent_variable conditional 67 | # on independent_variable2: 68 | 69 | formula1 <- formula( dependent_variable ~ independent_variable | independent_variable2, data = data_frame_to_use) 70 | 71 | # Finally, "." is shorthand for using all available variables. In the below case, the data argument is used to 72 | # obtain the available variables which are not on the left hand side: 73 | 74 | formula1 <- formula( dependent_variable ~ . , data = data_frame_to_use) 75 | 76 | -------------------------------------------------------------------------------- /Individual Scripts/11. Data Tables.R: -------------------------------------------------------------------------------- 1 | ## Data Tables 2 | 3 | # Data.table is a package that extends the functionality of data frames from base R, particularly improving on their 4 | # performance and syntax. Functions that work on a data.frame will also work with a data.table. There are 5 | # many ways to create, load or coerce to a data.table. 
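# (Aside, not in the original script: one such way, assuming the data.table package
# loaded just below, is setDT(), which converts an existing data.frame or list to a
# data.table in place, without copying the object. A minimal sketch:
#
#   df <- data.frame(a = 1:3, b = letters[1:3])
#   data.table::setDT(df)   # df now behaves as a data.table
# )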
6 | 7 | # There is a constructor of the same name: 8 | 9 | install_and_load_package('data.table') 10 | 11 | data_table1 <- data.table( 12 | x = letters[1:5], 13 | y = 1:5, 14 | z = (1:5) > 3 15 | ) 16 | 17 | data_table1 18 | # x y z 19 | # 1: a 1 FALSE 20 | # 2: b 2 FALSE 21 | # 3: c 3 FALSE 22 | # 4: d 4 TRUE 23 | # 5: e 5 TRUE 24 | 25 | # Unlike data.frame, data.table will not coerce strings to factors: 26 | 27 | sapply(data_table1, class) 28 | # x y z 29 | # "character" "integer" "logical" 30 | 31 | # If you have another R object (such as a matrix), you must use as.data.table to coerce it to a data.table: 32 | 33 | mat <- matrix(0, ncol = 10, nrow = 10) 34 | 35 | data_table2 <- as.data.table(mat) 36 | # or 37 | data_table2 <- data.table(mat) 38 | 39 | -------------------------------------------------------------------------------- /Individual Scripts/12. Differences in Subsetting Syntax.R: -------------------------------------------------------------------------------- 1 | ## Differences in Subsetting Syntax 2 | 3 | # A data.table is one of several two-dimensional data structures available in R, besides data.frame, matrix and (2D) 4 | # array. All of these classes use a very similar but not identical syntax for subsetting, the A[rows, cols] schema. 5 | 6 | # Consider the following data stored in a matrix, a data.frame and a data.table: 7 | 8 | matrix <- matrix(1:12, nrow=4, dimnames=list(letters[1:4], c('X', 'Y', 'Z'))) 9 | 10 | matrix 11 | # X Y Z 12 | # a 1 5 9 13 | # b 2 6 10 14 | # c 3 7 11 15 | # d 4 8 12 16 | 17 | data_frame <- as.data.frame(matrix) 18 | data_table <- as.data.table(matrix) 19 | 20 | matrix[2:3] # returns 2 3, the 2nd and 3rd items, as if matrix were a vector (because it is!) 21 | 22 | data_frame[2:3] # returns the 2nd and 3rd columns 23 | # Y Z 24 | # a 5 9 25 | # b 6 10 26 | # c 7 11 27 | # d 8 12 28 | 29 | data_table[2:3] # returns the 2nd and 3rd rows! 30 | # X Y Z 31 | # 1: 2 6 10 32 | # 2: 3 7 11 33 | 34 | # If you want to be sure of what will be returned, it is better to be explicit. 35 | # To get specific rows, just add a comma after the range: 36 | 37 | matrix[2:3, ] # returns the 2nd and 3rd rows 38 | data_frame[2:3, ] # returns the 2nd and 3rd rows 39 | data_table[2:3, ] # returns the 2nd and 3rd rows 40 | 41 | # But, if you want to subset columns, some cases are interpreted differently. All three can be subset the same way 42 | # with integer or character indices not stored in a variable. 43 | 44 | matrix[, 2:3] # returns the 2nd and 3rd columns 45 | data_frame[, 2:3] # returns the 2nd and 3rd columns 46 | data_table[, 2:3] # returns the 2nd and 3rd columns 47 | matrix[, c("Y", "Z")] # returns the 2nd and 3rd columns 48 | data_frame[, c("Y", "Z")] # returns the 2nd and 3rd columns 49 | data_table[, c("Y", "Z")] # returns the 2nd and 3rd columns 50 | 51 | # The setkey() function sorts a data.table and marks it as sorted. The sorted columns are the key. The key can be any 52 | # columns in any order. The columns are always sorted in ascending order. The table is changed by reference, which 53 | # means that the entire table isn't copied and re-arranged. The rows are just swapped. 54 | 55 | # Setting keys in data.table: 56 | 57 | setkey(data_table, Y) 58 | 59 | # Setting secondary indices: 60 | 61 | # Indexing is a way of sorting a number of records on multiple fields. Creating an index on a field in a table creates 62 | # another data structure which holds the field value, and a pointer to the record it relates to. 
This index structure 63 | # is then sorted, allowing efficient binary searches to be performed on it. The downside of this is that more memory 64 | # is needed to hold the extra indexing data, although more efficient searches can be performed. 65 | 66 | # In a manner similar to key, you can setindex(DT, key.col) or setindexv(DT, "key.col.string"), where DT is 67 | # your data.table. Remove all indices with setindex(DT, NULL). 68 | 69 | # Let us set x as index: 70 | 71 | setindex(data_table, X) 72 | 73 | # There are many reasons to write code that is guaranteed to work with data.frame and data.table. Maybe you are 74 | # forced to use data.frame, or you may need to share some code that you don't know how will be used. So, there are 75 | # some main strategies for achieving this, in order of convenience: 76 | 77 | # 1. Use syntax that behaves the same for both classes. 78 | # 2. Use a common function that does the same thing as the shortest syntax. 79 | # 3. Force data.table to behave as data.frame (ex.: call the specific method print.data.frame). 80 | # 4. Treat them as list, which they ultimately are. 81 | # 5. Convert the table to a data.frame before doing anything (bad idea if it is a huge table). 82 | # 6. Convert the table to data.table, if dependencies are not a concern. 83 | 84 | -------------------------------------------------------------------------------- /Individual Scripts/13. Hashmaps.R: -------------------------------------------------------------------------------- 1 | ## Hashmaps 2 | 3 | # Although R does not provide a native hash table structure, similar functionality can be achieved by leveraging the 4 | # fact that the environment object returned from new.env (by default) provides hashed key lookups. The following 5 | # two statements are equivalent, as the hash parameter defaults to TRUE: 6 | 7 | hash_map <- new.env(hash = TRUE) 8 | hash_map <- new.env() 9 | 10 | # Insertion of elements may be done using either of the '<-' or '$' methods: 11 | 12 | hash_map[["key"]] = "value" 13 | hash_map$key2 = "value2" 14 | 15 | hash_map$key # returns "value" 16 | hash_map[["key2"]] # returns "value2" 17 | 18 | # Elements can be removed using rm: 19 | 20 | rm("key", envir = hash_map) 21 | 22 | ls.str(hash_map) # returns key2 : chr "value2" 23 | 24 | # One of the major benefits of using environment objects as hash tables is their ability to store virtually any 25 | # type of object as a value, even other environments: 26 | 27 | hash_map2 <- new.env() 28 | hash_map2[["a"]] <- LETTERS 29 | hash_map2[["b"]] <- as.list(x = 1:5, y = matrix(rnorm(10), 2)) 30 | hash_map2[["c"]] <- head(mtcars, 3) 31 | hash_map2[["d"]] <- Sys.Date() 32 | hash_map2[["e"]] <- Sys.time() 33 | 34 | -------------------------------------------------------------------------------- /Individual Scripts/14. 
Control Flow Structures.R:
--------------------------------------------------------------------------------
## Control Flow Structures

# Standard if / else if / else statement:

x <- 0

if (x < 0) {
  print("Negative")
} else if (x > 0) {
  print("Positive")
} else {
  print("Zero")
}

# Outputs:
# [1] "Zero"

# R allows us to write inline constructs such as the one below:

x <- 3

y <- if(x > 3) "Larger than 3" else "Less than or equal to 3"

y # returns "Less than or equal to 3"

# Standard for loop:

values <- c("value1","value2")

for (value in values) {
  print(value) # prints out "value1", followed by "value2"
}

# To illustrate the effect of good for loop construction, we will calculate the mean of each column in four different
# ways:

# 1. Using a poorly optimized for loop
# 2. Using a well optimized for loop
# 3. Using an *apply family of functions
# 4. Using the colMeans function

# 1. Using a poorly optimized for loop example (mean time to run: ~290 ms):

poor_column_mean <- NULL

for ( i in 1 : length(mtcars) ) {
  poor_column_mean[i] <- mean(mtcars[[i]])
}

# 2. Using a well optimized for loop example (mean time to run: ~260 ms):

better_column_mean <- vector("numeric", length(mtcars))

for (i in seq_along(mtcars)) {
  better_column_mean[i] <- mean(mtcars[[i]])
}

# 3. Using an *apply family of functions example (mean time to run: ~120 ms):

vapply_column_mean <- vapply(mtcars, mean, numeric(1))

# 4. Using the colMeans function (mean time to run: ~180 ms):

colMeans_column_mean <- colMeans(mtcars)

# The while loop

counter <- 0

while (counter < 3) {
  cat(counter, "\n")
  counter <- counter + 1
}

# Output:
# 0
# 1
# 2

# The repeat loop

vector <- c("Repeat","loop")
counter <- 0

repeat {
  print(vector)
  counter <- counter + 1

  if(counter >= 2) {
    break
  }
}

# Outputs:
# [1] "Repeat" "loop"
# [1] "Repeat" "loop"

--------------------------------------------------------------------------------
/Individual Scripts/15. DPLYR.R:
--------------------------------------------------------------------------------
## DPLYR

# dplyr introduces a grammar of data manipulation in R. It provides a consistent interface to work with data no
# matter where it is stored: data.frame, data.table, or a database. The key pieces of dplyr are written using Rcpp,
# which makes it very fast for working with in-memory data.
#
# dplyr's philosophy is to have small functions that do one thing well. The five simple functions (filter, arrange,
# select, mutate, and summarise) can be used to reveal new ways to describe data. When combined with group_by,
# these functions can be used to calculate group-wise summary statistics.
10 | 11 | install_and_load_package('dplyr') 12 | 13 | mtcars_table <- as_data_frame(tibble::rownames_to_column(mtcars, "cars")) 14 | 15 | # Filter helps subset rows that match certain criteria: 16 | 17 | filter(mtcars_table, cyl == 4) # returns all cars that have 4 cylinders 18 | 19 | filter(mtcars_table, cyl == 4 | cyl == 6, gear == 5) # returns the cars which have either 4 or 6 cylinders and 5 gears 20 | 21 | slice(mtcars_table, 6:9) # returns rows 6 through 9 22 | 23 | # Arrange is used to sort the data by a specified variable(s). 24 | 25 | arrange(mtcars_table, hp) # orders the data by horsepower - hp 26 | 27 | arrange(mtcars_table, desc(mpg), cyl) # orders the data by miles per gallon in desc order, followed by # of cylinders 28 | 29 | # Select is used to select only a subset of variables 30 | 31 | select (mtcars_table, mpg, disp, wt, qsec, vs) # returns mpg, disp, wt, qsec, and vs from mtcars_tbl 32 | 33 | select (mtcars_table, cylinders = cyl, displacement = disp) # returns and renames the cylinders and displacement columns 34 | 35 | select (mtcars_table, mpg:wt) # returns all of the columns between the mpg and wt columns 36 | 37 | # Mutate can be used to add new columns to the data. 38 | 39 | mutate(mtcars_table, weight_ton = wt / 2, weight_pounds = weight_ton * 2000) # Adds 2 new columns to the data frame 40 | 41 | # To retain only the newly created columns, use transmute instead of mutate: 42 | 43 | transmute(mtcars_table, weight_ton = wt/2, weight_pounds = weight_ton * 2000) # Only has the 2 columns specified 44 | 45 | # Summarise calculates summary statistics of variables by collapsing multiple values to a single value. 46 | 47 | summarise(mtcars_table, mean_mpg = mean(mpg), sd_mpg = sd(mpg), 48 | mean_disp = mean(disp), sd_disp = sd(disp)) 49 | 50 | # group_by can be used to perform group wise operations on data. 51 | 52 | by_cyl <- group_by(mtcars_table, cyl) 53 | summarise(by_cyl, mean_mpg = mean(mpg), sd_mpg = sd(mpg)) 54 | 55 | # Putting it all together: 56 | 57 | # Example with intermediate results (simple): 58 | 59 | selected <- select(mtcars_table, cars:hp, gear) 60 | ordered <- arrange(selected, cyl, desc(mpg)) 61 | by_cyl <- group_by(ordered, gear) 62 | filter(by_cyl, mpg > 20, hp > 75) 63 | 64 | # Example without intermediate results (more complex): 65 | 66 | filter( 67 | group_by( 68 | arrange( 69 | select( 70 | mtcars_table, cars:hp 71 | ), cyl, desc(mpg) 72 | ), cyl 73 | ),mpg > 20, hp > 75 74 | ) 75 | 76 | # dplyr operations can be chained using the pipe %>% operator: 77 | 78 | mtcars_table %>% 79 | select(cars:hp) %>% 80 | arrange(cyl, desc(mpg)) %>% 81 | group_by(cyl) %>% 82 | filter(mpg > 20, hp > 75) 83 | 84 | # summarise_all() is used to apply functions to all (non-grouping) columns: 85 | 86 | mtcars_table %>% 87 | summarise_all(n_distinct) 88 | 89 | # To summarise specific multiple columns, use summarise_at: 90 | 91 | mtcars_table %>% 92 | group_by(cyl) %>% 93 | summarise_at(c("mpg", "disp", "hp"), mean) 94 | 95 | # To select columns conditionally, use summarise_if: 96 | 97 | mtcars_table %>% 98 | group_by(cyl) %>% 99 | summarise_if(is.numeric, mean) 100 | 101 | -------------------------------------------------------------------------------- /Individual Scripts/16. 
Functions.R:
--------------------------------------------------------------------------------
## Functions

# Anonymous functions

df <- data.frame(first = 5:9, second = (0:4)^2, third = -1:3)

# Calculate the root mean square for each column in a data.frame:

apply( df, 2, function(x) { sqrt(sum(x^2)) })
#     first    second     third
# 15.968719 18.814888  3.872983

# This function takes as input a vector (vec in this example) and outputs the same vector with the
# vector's length (6 in this case) subtracted from each of the vector's elements:

vec <- 4:9

subtract.length <- function(x) { x - length(x) }

subtract.length(vec) # returns -2 -1 0 1 2 3

# The below function is a more complicated example which calls another function and returns a data frame:

vec2 <- (4:7)/2

msdf <- function(x, multiplier=4) {
  mult <- x * multiplier
  subl <- subtract.length(x)
  data.frame(mult, subl)
}

msdf(vec2, 5)
#   mult subl
# 1 10.0 -2.0
# 2 12.5 -1.5
# 3 15.0 -1.0
# 4 17.5 -0.5

# Apply Functions

# apply: Applies a function to the rows or columns of a matrix (and higher-dimensional analogues). It's not
# advisable to use it for data frames as it will coerce to a matrix first.

matrix <- matrix(seq(1,16), 4, 4)

matrix
#      [,1] [,2] [,3] [,4]
# [1,]    1    5    9   13
# [2,]    2    6   10   14
# [3,]    3    7   11   15
# [4,]    4    8   12   16

apply(matrix, 1, min) # applies the min function to each row and returns [1] 1 2 3 4

apply(matrix, 2, max) # applies the max function to each column and returns [1] 4 8 12 16

# lapply: Applies a function to each element of a list and returns a list containing the results.

list <- list(first = 1, second = 1:5, third = c(1,3,5))

lapply(list, FUN = sum)
# $first
# [1] 1
#
# $second
# [1] 15
#
# $third
# [1] 9

# sapply: Applies a function to each element of a list and returns a vector containing the results.

sapply(list, FUN = sum)
#  first second  third
#      1     15      9

sapply(list, FUN = length)
#  first second  third
#      1      5      3

--------------------------------------------------------------------------------
/Individual Scripts/17. Data Visualization.R:
--------------------------------------------------------------------------------
## Data Visualization

# Plotting data

# Below are some simple examples of how to plot a line in R, how to fit a line to some points, and how to add
# more points to a graph.
7 | 8 | # Make a very simply plot of (x, y) values and plot them: 9 | 10 | x <- c(4, 6, 8, 11, 15, 18) 11 | y <- c(2.8, 4.6, 6.2, 5.5, 7.8, 8.8) 12 | 13 | plot(x, y) 14 | 15 | # We can use a bunch of parameters to produce a more descriptive plot, as shown below: 16 | 17 | plot( x , y 18 | , xlab="X Axis Label" 19 | , ylab="Y Axis Label" 20 | , main = "Plot Title" 21 | , xlim = c(0, 20) # X axis range 22 | , ylim = c(0, 10) # Y axis range 23 | , pch = 4 # Set the plotting symbol to 'X' 24 | , col = "red" # Set the plot color to red 25 | ) 26 | 27 | # Create a linear model and plot it: 28 | 29 | lin_model <- lm(y ~ x) 30 | 31 | abline(lin_model) 32 | 33 | # Add more points to our graph: 34 | 35 | x2 <- c(3.3, 6.6, 9.9, 13.2) 36 | y2 <- c(1.6, 3.3, 5, 6.6 ) 37 | 38 | # Create a 2D line plot for the 4 new values: 39 | 40 | points( x2, y2 41 | , type="o" # Use a line plot 42 | , col = "blue" 43 | ) 44 | 45 | # Histograms 46 | 47 | # A histogram plots the frequencies that data appears within certain ranges. Below, we plot a simple histogram 48 | # showing our mtcars data horse power distribution. 49 | 50 | data(mtcars) 51 | 52 | hist(mtcars$hp, main = "Distribution of HP", xlab = "Horse Power") 53 | 54 | # For our histogram, R will automatically calculate the intervals to use, although we can specify the amount of 55 | # breaks we want using the breaks option: 56 | 57 | hist(mtcars$hp, main = "Distribution of HP", xlab = "HP", breaks = 4) 58 | 59 | # Boxplot 60 | 61 | # A boxplot provides a graphical view of the median, quartiles, maximum, and minimum of a data set. 62 | 63 | boxplot(mtcars$mpg, main = "Boxplot for Miles per Gallon Data") 64 | 65 | # We can also create a boxplot of a numerical variable grouped by a categorical variable 66 | 67 | # Use the iris data set to create a boxplot of the sepal.length column grouped by species: 68 | 69 | data(iris) 70 | 71 | boxplot(Sepal.Length ~ Species, data = iris, main = "Boxplot of Sepal Length Grouped by Species") 72 | 73 | # ggplot2 74 | 75 | # ggplot is a popular visualization package which we can use to create elegant and complex plots. 76 | 77 | # Let's illustrate some simple plots we can make using this library. 78 | 79 | install_and_load_package('ggplot2') 80 | 81 | # Create a regular dot plot of Sepal (length, width) points using our iris data: 82 | 83 | ggplot( iris 84 | , aes(x = Sepal.Length, y = Sepal.Width)) + # Specify the aesthetic mappings (variable mappings) 85 | geom_point() # Specify the geometric object. Here, we specify points (dots) to obtain a plot of points 86 | 87 | # Aesthetic mappings allow us to use properties within our data to influence the visual characeristics of our graphs. 88 | # Lets make the same plot as above, except we will specify a different color for each flower species: 89 | 90 | ggplot( iris 91 | , aes(x = Sepal.Length, y = Sepal.Width, color = Species)) + 92 | geom_point() 93 | 94 | # We can also make plots using different geometric shapes / objects. 
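# (Aside, not in the original script: earlier we drew boxplots with base R; the ggplot2
# equivalent is the geom_boxplot() geometric object. A minimal sketch using the same iris data:

ggplot( iris
      , aes(x = Species, y = Sepal.Length)) +
  geom_boxplot()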
# Create a line chart for the iris data set:

ggplot( iris
      , aes(x = Sepal.Length, y = Sepal.Width, color = Species)) +
  geom_line()

# Create a smoothed line chart:

ggplot( iris
      , aes(x = Sepal.Length, y = Sepal.Width, color = Species)) +
  geom_smooth()

# Create a bar chart:

ggplot( data = iris
      , aes(x = Sepal.Width)) +
  geom_bar()

# Create a histogram (which is similar to the bar chart above):

ggplot( data = iris
      , aes(x = Sepal.Width)) +
  geom_histogram()

# Sometimes, we may want to display multiple plots in one image using different facets. An advantage of
# this method is that all axes share the same scale across the charts, making it easy to compare them at a glance.
# You can construct a plot with multiple facets by using the facet_wrap() function.

# Use the iris dataset and the facet_wrap function to plot the iris sepal width across the different species:

ggplot( iris
      , aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point() +
  facet_wrap(~Species)

# Create a new graph with labels added to our visuals using the labs function:

ggplot( iris
      , aes(x = Sepal.Length, y = Sepal.Width, color = Species)) +
  geom_point() +
  labs( title = "Width and Length Iris Data",
        subtitle = "Simple description of our data can be included here",
        x = "Sepal Length",
        y = "Sepal Width",
        color = "Species"
  )

# Use the facet_grid function to facet our data by more than one categorical variable:

data(mpg)

ggplot( mpg
      , aes(x = displ, y = hwy)) +
  geom_point() +
  facet_grid(year ~ cyl) # Create a facet grid for year / cylinder data

# Time Series Data

# Time series data can be stored as a ts object. ts objects contain information about seasonal frequency that is used
# by ARIMA functions. A ts object also allows elements of the series to be selected by date using the window command.

# Create a dummy dataset of 100 observations:

x <- rnorm(100)

# Convert this vector to a ts object with 100 annual observations:

x <- ts(x, start = c(1900), freq = 1)

plot(x)

# Convert this vector to a ts object with 100 monthly observations starting in July:

x <- ts(x, start = c(1900, 7), freq = 12)

plot(x)

# Exploratory Data Analysis with time-series data

# Let's load some air passenger data:

data(AirPassengers)

# Plot the raw data:

plot(AirPassengers)

# Fit a trend line:

abline(reg=lm(AirPassengers~time(AirPassengers)))

--------------------------------------------------------------------------------
/Individual Scripts/18. Reading and Writing Data.R:
--------------------------------------------------------------------------------
## Reading and Writing Data

# cat takes one or more character vectors as arguments and prints them to the console. If the character vector
# has a length greater than 1, arguments are separated by a space (by default):

cat(c("hello", "world", "\n")) # outputs 'hello world'

# Reading from or writing to a file connection

# We don't always have the liberty to read from or write to a local system path.
To establish a file connection 11 | # to read data, use the file() command in read mode ("r" is for read mode): 12 | 13 | stdin_connection <- file("stdin", "r") # when just standard input/output for files are available 14 | 15 | file_connection <- file("README.md", "r") # when file is local 16 | 17 | # We can use the readline method to read the contents of the file. The n parameters specifies the number of lines 18 | # we want to read. Setting n to 1 means that we're reading the file line by line: 19 | 20 | read_file = function(file_path) { 21 | connection = file(file_path, "r") 22 | while ( TRUE ) { 23 | line = readLines(connection, n = 1, warn = FALSE) 24 | if ( length(line) == 0 ) { 25 | break 26 | } 27 | print(line) 28 | } 29 | close(connection) 30 | } 31 | 32 | read_file("README.md") # prints the results of README.md 33 | 34 | # You can change value of n (say 10, 20 etc.) for reading data blocks (i.e. we can use 10 to read 10 lines in 35 | # one go). To read complete file in one go set n = -1. 36 | 37 | all_lines <- readLines(file_connection, n = -1, warn = FALSE) 38 | 39 | print(all_lines) # prints the results of README.md 40 | 41 | # Close the open connections: 42 | 43 | close(file_connection) 44 | 45 | close(stdin_connection) 46 | 47 | # After processing data, you can write the results back to the file connection using many different commands like 48 | # writeLines(),cat() etc. which are capable of writing to a file connection. 49 | 50 | write_file_connection <- file("result.data", "w") # when file is local 51 | 52 | # Then write the data as follows: 53 | 54 | writeLines("text", write_file_connection, sep = "\n") 55 | 56 | close(write_file_connection) 57 | 58 | # Delete the results.data file: 59 | 60 | file.remove("result.data") 61 | 62 | # Importing .csv files 63 | 64 | # Get the file path of a CSV included in R's utils package 65 | 66 | csv_path <- system.file("misc", "exDIF.csv", package = "utils") 67 | 68 | df <- read.csv(csv_path) 69 | 70 | # The data.table package introduces the function fread. While it is similar to read.table, fread is usually faster and 71 | # more flexible. It tries to 'guess' the file's delimiter automatically: 72 | 73 | dt <- fread(csv_path) 74 | 75 | # To return an ordinary data.frame, set the data.table parameter to FALSE: 76 | 77 | df <- fread(csv_path, data.table = FALSE) 78 | 79 | # Data can be written to a CSV file using write.csv(): 80 | 81 | write.csv(mtcars, "mtcars.csv") 82 | 83 | # Importing multiple csv files: 84 | 85 | files = list.files(pattern="*.csv") 86 | 87 | data_list = lapply(files, read.table, header = TRUE) 88 | 89 | print (data_list) 90 | 91 | -------------------------------------------------------------------------------- /Individual Scripts/19. Web Scraping and Parsing.R: -------------------------------------------------------------------------------- 1 | ## Web Scraping and Parsing 2 | 3 | # rvest is a package for web scraping and parsing by Hadley Wickham inspired by Python's Beautiful Soup. It 4 | # leverages Hadley's xml2 package's libxml2 bindings for HTML parsing. 
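# (Aside, not in the original script: the typical rvest workflow is read_html() to fetch
# a page, html_nodes() with a CSS selector to pick elements, and html_text() or html_attr()
# to extract their contents. A minimal sketch, assuming rvest is loaded as below:
#
#   page  <- read_html("https://en.wikipedia.org/wiki/R_(programming_language)")
#   links <- html_nodes(page, "a")
#   head(html_attr(links, "href"))   # the first few link targets
# )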
5 | 6 | # To scrape the table of R milestones from the Wikipedia page on R, the code would look like: 7 | 8 | install_and_load_package('rvest') 9 | 10 | url <- 'https://en.wikipedia.org/wiki/R_(programming_language)' 11 | 12 | # Scrape HTML from website and use pipe operators to transform it into a data frame: 13 | 14 | url %>% 15 | # Read the url html: 16 | read_html() %>% 17 | # Select HTML tag with class="wikitable": 18 | html_node(css = '.wikitable') %>% 19 | # Parse table into data.frame: 20 | html_table() %>% 21 | # Trim the description to 100 characters for printing: 22 | dplyr::mutate(Description = substr(Description, 1, 100)) 23 | 24 | -------------------------------------------------------------------------------- /Individual Scripts/2. Variables.R: -------------------------------------------------------------------------------- 1 | ## Variables 2 | 3 | # In R, variables are assigned values using the infix-assignment operator <-. The operator = can also be used for 4 | # assigning values to variables, however, its proper use is for associating values with parameter names in function 5 | # calls. 6 | 7 | variable1 <- 22 8 | variable2 = 23 9 | 10 | # It is also possible to make assignments to variables using ->. 11 | 12 | 3 -> x 13 | 14 | # Private Variables: 15 | 16 | # A leading dot in a name of a variable or function in R is commonly used to denote that the variable or function is 17 | # meant to be hidden. 18 | 19 | .private_variable <- 'private' 20 | 21 | .private_variable 22 | # [1] "private" 23 | 24 | # The ls function which lists the objects will not include the private variable: 25 | 26 | ls() # returns a list which doesn't include 'private_variable' 27 | 28 | -------------------------------------------------------------------------------- /Individual Scripts/20. Feature Selection.R: -------------------------------------------------------------------------------- 1 | ## Feature Selection 2 | 3 | # Feature selection is about removing extraneous features / data cleaning. 4 | 5 | # A feature that has near zero variance is a good candidate for removal. You can manually detect numerical 6 | # variance below your own threshold: 7 | 8 | data(iris) 9 | 10 | variances <- apply(iris, 2, var) 11 | 12 | variances[which(variances <= 0.0025)] # returns character(0) 13 | 14 | # Or, you can use the caret package to find near zero variance: 15 | 16 | install_and_load_package('caret') 17 | 18 | names(iris)[nearZeroVar(iris)] # returns character(0) 19 | 20 | # Removing features with high numbers of NA. 
If a feature is largely lacking data, it is a good candidate 21 | # for removal: 22 | 23 | install_and_load_package('VIM') 24 | 25 | # Load the sleep data: 26 | 27 | data(sleep) 28 | 29 | # Use the colMeans and is.na functions to find the ratio of missing values for each column 30 | 31 | colMeans( is.na(sleep) ) 32 | # BodyWgt BrainWgt NonD Dream Sleep Span Gest 33 | # 0.00000000 0.00000000 0.22580645 0.19354839 0.06451613 0.06451613 0.06451613 34 | 35 | # In the above case, we may want to remove NonD and Dream, which have around 20% of their values missing 36 | 37 | # To drop columns, there's the subset command, which we could use as demonstrated below: 38 | 39 | sleep_with_missing_col <- subset(sleep, select = -c(NonD, Dream)) 40 | 41 | # Or we can use regular frame filtering and %in% operator: 42 | 43 | columns_to_drop <- c("NonD", "Dream") 44 | 45 | sleep_with_missing_col <- sleep[ , !(names(sleep) %in% columns_to_drop)] 46 | 47 | # Removing Closely Correlated Features 48 | 49 | # Closely correlated features may add variance to your model, and removing one of a correlated pair might help 50 | # reduce that. There are lots of ways to detect correlation. Here's one: 51 | 52 | install_and_load_package('purrr') 53 | 54 | # Select correlatable vars (numeric ones) 55 | 56 | to_correlate <- mtcars %>% keep ( is.numeric ) 57 | 58 | # Calculate correlation matrix 59 | 60 | correlation_matrix <- cor(to_correlate) 61 | 62 | correlation_matrix 63 | 64 | # Pick only one out of each highly correlated pair's mirror image 65 | 66 | correlation_matrix[upper.tri(correlation_matrix)] <- 0 67 | 68 | # We don't remove the highly-correlated-with-itself group 69 | 70 | diag(correlation_matrix) <- 0 71 | 72 | # Find features that are highly correlated with another feature at the +- 0.85 level 73 | 74 | apply(correlation_matrix, 2, function(x) any( abs(x) >= 0.85 ) ) 75 | # mpg cyl disp hp drat wt qsec vs am gear carb 76 | # TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 77 | 78 | # We'll want to look at what MPG is correlated to so strongly, and decide what to keep and what to toss. 79 | # Same for cyl and disp. Alternatively, we might need to combine some strongly correlated features. 80 | 81 | -------------------------------------------------------------------------------- /Individual Scripts/21. Linear Regression.R: -------------------------------------------------------------------------------- 1 | ## Linear Regression 2 | 3 | # The built-in mtcars data frame contains information about 32 cars, including their weight, fuel efficiency 4 | # (in miles per gallon), speed, etc. If we are interested in the relationship between fuel efficiency (mpg) 5 | # and weight (wt) we may start plotting those variables with: 6 | 7 | plot(mpg ~ wt, data = mtcars, col = 2) 8 | 9 | # The plots shows a (linear) relationship!. Then if we want to perform linear regression to determine the 10 | # coefficients of a linear model, we would use the lm function. 11 | # The ~ here means "explained by", so the formula mpg ~ wt means we are predicting mpg as explained by wt: 12 | 13 | example_model <- lm(mpg ~ disp, data = mtcars) 14 | 15 | # We can use the summary function to display the key output / results: 16 | 17 | summary(example_model) 18 | 19 | # Using the 'predict' function: Once a model is built predict is the main function to test with new data. 
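# (Aside, not in the original script: predict() for lm objects can also return interval
# estimates. A minimal sketch, assuming the example_model fitted above:
#
#   predict(example_model, newdata = data.frame(disp = 200), interval = "confidence")
#
# returns the fitted value together with lower and upper confidence bounds.)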
20 | 21 | # First, we sample from our original data 22 | 23 | set.seed(1234) 24 | 25 | new_data <- sample(mtcars$disp, 5) 26 | 27 | new_data # returns [1] 258.0 71.1 75.7 145.0 400.0 28 | 29 | # Create a new data frame with the same column names as the original data 30 | 31 | new_df <- data.frame( disp = new_data ) 32 | 33 | new_df 34 | 35 | predict(example_model, new_df) 36 | 37 | # Checking accuracy 38 | 39 | # Let's create a new data frame and use our model to make predictions and check our end results. 40 | # Create a new data frame containing the first 10 results from our mtcars data: 41 | 42 | new_df2 = data.frame(mpg = mtcars$mpg[1:10], disp = mtcars$disp[1:10]) 43 | 44 | # Use our linear model to make predictions on the data set created above: 45 | 46 | predictions <- predict(example_model, new_df2) 47 | 48 | # Calculate the root mean square error comparing our generated results and the original data set: 49 | 50 | sqrt(mean( (predictions - new_df2$mpg)^2 , na.rm = TRUE)) # returns 2.325148 51 | 52 | # Checking for nonlinearity with polynomial regression 53 | 54 | # Sometimes when working with linear regression we need to check for non-linearity in the data. One way to do this 55 | # is to fit a polynomial model and check whether it fits the data better than a linear model. There are other reasons, 56 | # such as theoretical, that indicate to fit a quadratic or higher order model because it is believed that the variables 57 | # relationship is inherently polynomial in nature. 58 | 59 | # Let's fit a quadratic model for the mtcars dataset. For a linear model see Linear regression on the mtcars dataset. 60 | 61 | # A linear fit will show that disp is not significant. 62 | 63 | fit0 = lm(mpg ~ wt + disp, mtcars) 64 | 65 | summary(fit0) 66 | 67 | # Output: 68 | # Coefficients: 69 | # Estimate Std. Error t value Pr(>|t|) 70 | # (Intercept) 34.96055 2.16454 16.151 4.91e-16 *** 71 | # wt -3.35082 1.16413 -2.878 0.00743 ** 72 | # disp -0.01773 0.00919 -1.929 0.06362 . 73 | # --- 74 | # Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 75 | # 76 | # Residual standard error: 2.917 on 29 degrees of freedom 77 | # Multiple R-squared: 0.7809, Adjusted R-squared: 0.7658 78 | # F-statistic: 51.69 on 2 and 29 DF, p-value: 2.744e-10 79 | 80 | # Then, to get the result of a quadratic model, we added I(disp^2). The new model appears better when looking at 81 | # R^2 and all variables are significant. 82 | 83 | fit1 = lm(mpg ~ wt + disp + I( disp ^ 2 ), mtcars) 84 | 85 | summary(fit1) 86 | 87 | # Output: 88 | # Coefficients: 89 | # Estimate Std. Error t value Pr(>|t|) 90 | # (Intercept) 41.4019837 2.4266906 17.061 2.5e-16 *** 91 | # wt -3.4179165 0.9545642 -3.581 0.001278 ** 92 | # disp -0.0823950 0.0182460 -4.516 0.000104 *** 93 | # I(disp^2) 0.0001277 0.0000328 3.892 0.000561 *** 94 | # --- 95 | # Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 96 | # 97 | # Residual standard error: 2.391 on 28 degrees of freedom 98 | # Multiple R-squared: 0.8578, Adjusted R-squared: 0.8426 99 | # F-statistic: 56.32 on 3 and 28 DF, p-value: 5.563e-12 100 | 101 | # Another way to specify polynomial regression is using poly with parameter raw=TRUE, otherwise orthogonal 102 | # polynomials will be considered (see the help(ploy) for more information). We get the same result using: 103 | 104 | summary(lm(mpg ~ wt + poly(disp, 2, raw=TRUE), mtcars)) 105 | 106 | -------------------------------------------------------------------------------- /Individual Scripts/22. 
Logistic Rregression.R:
--------------------------------------------------------------------------------
## Logistic Regression

# Example logistic regression on the Titanic dataset

# Read the data:

url <- "http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt"

titanic <- read.csv(file = url, stringsAsFactors = FALSE)

# Clean the missing values. In this case, we replace the missing age values with the average:

titanic$age[is.na(titanic$age)] <- mean(titanic$age, na.rm = TRUE)

# Train the model:

titanic.train <- glm( survived ~ pclass + sex + age
                    , family = binomial
                    , data = titanic
                    )

# Display the model summary:

summary(titanic.train)

# Output:
# Call: glm(formula = survived ~ pclass + sex + age, family = binomial,
#     data = titanic)
#
# Coefficients:
# (Intercept)    pclass2nd    pclass3rd      sexmale          age
#     4.52216     -1.49523     -2.84127     -3.08671     -0.04931
#
# Degrees of Freedom: 632 Total (i.e. Null); 628 Residual
#  (680 observations deleted due to missingness)
# Null Deviance:     869.5
# Residual Deviance: 539.7   AIC: 549.7

--------------------------------------------------------------------------------
/Individual Scripts/23. Random Forrest.R:
--------------------------------------------------------------------------------
## Random Forest

# Random forests are an ensemble learning method mainly used for classification. The method is based on generating
# a large number of decision trees, each constructed using a different subset of your training set.

# Load the package using our function:

install_and_load_package('randomForest')

# Let's split our data set into training and validation data and train a random forest algorithm:

# Set random seed to make results reproducible:

set.seed(17)

# Calculate the size of each of the data sets:

data_set_size <- floor( nrow(iris) / 2 )

# Generate a random sample of "data_set_size" indexes

indexes <- sample(1:nrow(iris), size = data_set_size)

# Assign the data to the correct sets

training <- iris[indexes,]
validation1 <- iris[-indexes,]

# Some important random forest parameters:
# ntree: Defines the number of trees to be generated. It is typical to test a range of values for this parameter
#   (i.e. 100, 200, 300, 400, 500) and choose the one that minimises the OOB estimate of error rate.
# mtry: Is the number of features used in the construction of each tree. These features are selected at random,
#   which is where the "random" in "random forests" comes from. The default value for this parameter, when
#   performing classification, is sqrt(number of features).
# importance: Enables the algorithm to calculate variable importance.

# Perform the training:

rf_classifier = randomForest(Species ~ ., data = training, ntree = 100, mtry = 2, importance=TRUE)

# Validation set assessment #1: looking at the confusion matrix. The confusion matrix is a good way of looking at
# how good our classifier is performing when presented with new data.
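# (Aside, not in the original script: because the model was trained with importance = TRUE,
# variable importance can also be inspected once training is done. A minimal sketch using
# functions from the randomForest package:
#
#   importance(rf_classifier)   # importance measures for each feature
#   varImpPlot(rf_classifier)   # dot plot of the same information
#
# The confusion matrix assessment continues below.)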
prediction_for_table <- predict(rf_classifier, validation1[, -5])

table(observed = validation1[, 5], predicted = prediction_for_table)
#             predicted
# observed     setosa versicolor virginica
#   setosa         29          0         0
#   versicolor      0         20         3
#   virginica       0          1        22

--------------------------------------------------------------------------------
/Individual Scripts/24. XGBoost.R:
--------------------------------------------------------------------------------
## XGBoost

# XGBoost is a library designed and optimized for boosting trees algorithms.
# It's a highly successful algorithm, having won multiple machine learning competitions.

# Import the iris dataset

install_and_load_package( c('datasets', 'xgboost'))

data(iris)

# The first 4 columns of the dataset supply the feature vector, while the last column contains the
# corresponding labels:

x = as.matrix(iris[, 1:4])
y = as.numeric(factor(iris[, 5])) - 1

# Run the xgboost command:

model <- xgboost(data = x, label = y, nrounds = 10)
# [1]  train-rmse:0.685905
# [2]  train-rmse:0.494086
# [3]  train-rmse:0.357192
# [4]  train-rmse:0.262149
# [5]  train-rmse:0.194319
# [6]  train-rmse:0.147978
# [7]  train-rmse:0.110566
# [8]  train-rmse:0.083971
# [9]  train-rmse:0.064658
# [10] train-rmse:0.050646

# We can see that the algorithm went through 10 iterations, and that the training error, as measured by rmse,
# is decreasing, which is great.

# We didn't supply any test data. Therefore, to assess the performance of the model on unseen data,
# we perform cross-validation:

set.seed(1)

cv <- xgb.cv( data = x
            , label = y
            , nfold = 5
            , nrounds = 60
            )

# Let's take a closer look at some XGBoost hyperparameters:
# objective: specifies the task of XGBoost and determines the output: the most common ones are linear
#   (reg:linear) and logistic regression (reg:logistic), multiclass logistic regression (multi:softprob)
#   and ranking (rank:pairwise).
# eval_metric: specifies the evaluation metric used by XGBoost. It heavily depends on the objective
#   (rmse for regression, error for classification, mean average precision for ranking) and users can
#   create custom ones.
# max_depth: specifies the maximum depth of the trees (maximal number of splits) used by XGBoost. The
#   bigger this number, the more abstract our model can be, but we risk overfitting the model to the data.
# eta: controls the learning rate: the higher eta, the higher the contribution of new trees. A smaller
#   number will prevent overfitting, but increases the number of rounds XGBoost will need to converge.

params <- list("objective" = "multi:softprob",
               "eval_metric" = "mlogloss",
               "max_depth" = 6,
               "eta" = 0.3,
               "gamma" = 0,
               "colsample_bytree" = 1,
               "min_child_weight" = 1,
               "num_class" = 3)

# early_stopping_rounds: This parameter specifies when to stop the training. Setting it to 8
# means that the algorithm stops if there has not been an improvement in the test score (test-rmse)
# for 8 rounds.
70 | 71 | cv <- xgb.cv( data = x 72 | , label = y 73 | , nfold = 5 74 | , nrounds = 100 75 | , early_stopping_rounds = 8 76 | , params = params) 77 | 78 | # After running the above, we can see that our algoirthm will use test_mlogloss for early stopping and will 79 | # stop at ~20 iterations. 80 | 81 | -------------------------------------------------------------------------------- /Individual Scripts/25. Future To Do List.R: -------------------------------------------------------------------------------- 1 | ## Future To Do List 2 | 3 | # 1. Add in caret package examples. 4 | # 2. Come up with clearer machine learning explanations / examples. 5 | 6 | -------------------------------------------------------------------------------- /Individual Scripts/3. R Scalars.R: -------------------------------------------------------------------------------- 1 | ## R Scalars 2 | 3 | # Some simple examples of declaring and using scalars: 4 | 5 | scalar1 <- 1 6 | scalar2 <- 2 7 | 8 | scalar1 + scalar2 # returns 3 9 | 10 | # Numeric classes and storage modes 11 | 12 | # Doubles are R's default numeric value. They are double precision vectors, meaning that they take up 8 bytes of 13 | # memory for each value in the vector. 14 | 15 | # Integers are represented by a number with an L after it. Any number without an L after it will be considered a 16 | # double. 17 | 18 | is.double( 1 ) # returns TRUE 19 | is.double( 1.0 ) # returns TRUE 20 | is.double( 1L ) # returns FALSE 21 | 22 | # Numeric represents integers and doubles and is the default mode assigned to vectors of numbers. The function 23 | # is.numeric() will determine if a vector is numeric. It is important to note that although integers and doubles 24 | # will pass is.numeric(), the function as.numeric() will always attempt to convert to type double. 25 | 26 | x <- 12.3 27 | y <- 12L 28 | 29 | typeof(x) # returns "double" 30 | typeof(y) # returns "integer" 31 | 32 | # Logical to numeric conversion 33 | 34 | as.numeric(TRUE) # returns 1 35 | 36 | # While TRUE == 1, it is a double and not an integer 37 | 38 | is.integer(as.numeric(TRUE)) # returns FALSE 39 | 40 | # The logical class 41 | 42 | # ! Not !x 43 | 44 | !TRUE # returns FALSE 45 | 46 | # The || operator evaluates the left condition and if the left condition is TRUE the right side is never evaluated: 47 | 48 | 7 > 6 || stop ( "X is too small" ) # returns TRUE 49 | 50 | 7 > 6 | stop ( "X is too small" ) # returns Error: X is too small 51 | 52 | # The && operator will likewise return FALSE without evaluation of the second argument when the first element 53 | # of the first argument is FALSE: 54 | 55 | 7 <= 6 && stop ( "X is too small" ) # returns FALSE 56 | 57 | 7 <= 6 & stop ( "X is too small" ) # returns Error: X is too small 58 | 59 | -------------------------------------------------------------------------------- /Individual Scripts/4. Types of Data Structures.R: -------------------------------------------------------------------------------- 1 | ## Types of Data Structures 2 | 3 | # There are no scalar data types in R. Vectors of length-one act like scalars. 4 | 5 | # There are 4 main data structures: 6 | 7 | # 1. Vectors: Atomic vectors must be sequence of same-class objects: a sequence of numbers, a sequence of 8 | # logicals, or a sequence of characters. Vectors are usually created with c(), short for combine: 9 | 10 | vec <- c(1, 4.5, 8.5, 10) 11 | 12 | # 2. Matrices: A matrix of numbers, logicals or characters. Matrices are vectors with a dimension attribute. 
The 13 | # dimension attribute is itself an integer vector of length 2 (number of rows, number of columns). 14 | 15 | matr <- matrix(1:6, ncol = 3, nrow = 2) 16 | 17 | # 3. Lists: A special type of vector that can contain elements of different classes. 18 | 19 | lst <- list(4:8, "g", c(FALSE, FALSE, TRUE), c(1.5, 8.8)) 20 | 21 | # 4. Data Frames: Data frames are represented as a special type of list where every element of the list has to 22 | # have the same length. Each element of the list can be thought of as a column and the length of each element 23 | # of the list is the number of rows. Unlike matrices, data frames can store different classes of objects in 24 | # each column. 25 | 26 | df <- data.frame( x = 1:3, y = c("str1", "str1", "str3")) 27 | 28 | -------------------------------------------------------------------------------- /Individual Scripts/5. Vectors.R: -------------------------------------------------------------------------------- 1 | ## Vectors 2 | 3 | # Atomic vectors must be sequence of same-class objects. 4 | 5 | vector <- c(2, 3, 7, 10) 6 | vector2 <- c("a", "b", "c") 7 | 8 | vector[1] # returns 1, the first element 9 | vector[4] # returns 10, our last element 10 | 11 | vector1 <- c(1,2,3,4) 12 | vector2 <- c(2,3,4,5) 13 | 14 | vector1 + vector2 # returns 3 5 7 9 15 | 16 | vector1 + scalar1 # returns 2 3 4 5 17 | 18 | vector1 ^ 2 # returns 1 4 9 16 19 | 20 | # Some vector functions: 21 | 22 | # Length outputs the number of elements in the vector: 23 | 24 | vector1 <- 1:4 25 | 26 | length(vector1) # returns: 4 27 | 28 | # Sum adds all of the elements of a vector: 29 | 30 | sum(vector1) # returns: 10 31 | 32 | # Adding vectors of different lengths: Shorter vectors in the expression are recycled as often as need be 33 | # until they match the length of the longest vector. In particular a constant is simply repeated. In the below 34 | # example, vector1's first element is recycled since its length is less than that of vector2: 35 | 36 | vector2 = 1:5 37 | 38 | vector1 + vector2 # returns 2 4 6 8 6 (with a warning message) 39 | 40 | # The Character Class 41 | 42 | # Characters are what other languages call 'string vectors.' 43 | 44 | x <- "The quick brown fox jumps over the lazy dog" 45 | 46 | class(x) # returns "character" 47 | 48 | # String Manipulation 49 | 50 | # Count pattern inside string: 51 | 52 | stri_count_fixed("babab", "b") # returns 3 53 | stri_count_fixed("babab", "ba") # returns 2 54 | 55 | # With regex 56 | 57 | stri_count_regex("a1 b2 a3 b4 aa", "a.") # returns 3 58 | stri_count_regex("a1 b2 a3 b4 aa", "a\\d") # returns 2 59 | 60 | # Duplicating strings 61 | 62 | stri_dup("abc", 3) # returns "abcabcabc" 63 | 64 | -------------------------------------------------------------------------------- /Individual Scripts/6. Matrices.R: -------------------------------------------------------------------------------- 1 | ## Matrices 2 | 3 | # Like vectors, matrices must be made of same-class elements. 4 | 5 | # Under the hood, a matrix is a special kind of vector with two dimensions. You can create matrices using the 6 | # matrix function as shown below: 7 | 8 | matrix(data = 1:6, nrow = 2, ncol = 3) 9 | # [,1] [,2] [,3] 10 | # [1,] 1 3 5 11 | # [2,] 2 4 6 12 | 13 | matrix2 <- matrix( data = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12), nrow = 4, ncol = 3, byrow = F) 14 | 15 | matrix2 16 | # [,1] [,2] [,3] 17 | # [1,] 1 5 9 18 | # [2,] 2 6 10 19 | # [3,] 3 7 11 20 | # [4,] 4 8 12 21 | 22 | matrix2[1, 1] # returns 1, the first entry of our matrix. 
23 | matrix2[1, 2] # returns 5, the element on the first row, second column. 24 | matrix2[4, 3] # returns 12, the last element of our matrix (fourth row, third column). 25 | 26 | # The rows and columns of a matrix can have names. You can look at these using the functions rownames and colnames. 27 | # As shown below, the rows and columns don't initially have names, and are denoted by NULL: 28 | 29 | rownames(matrix2) # returns NULL 30 | colnames(matrix2) # returns NULL 31 | 32 | # However, you can assign values to them: 33 | 34 | rownames(matrix2) <- c("Row 1", "Row 2", "Row 3", "Row 4") 35 | colnames(matrix2) <- c("Col 1", "Col 2", "Col 3") 36 | 37 | matrix2 38 | # Col 1 Col 2 Col 3 39 | # Row 1 1 5 9 40 | # Row 2 2 6 10 41 | # Row 3 3 7 11 42 | # Row 4 4 8 12 43 | 44 | # The 'class', 'is', and 'as' functions can be used to check and coerce data structures: 45 | 46 | class(matrix2) # returns "matrix" 47 | is.matrix(matrix2) # returns TRUE 48 | as.vector(matrix2) # returns 1 2 3 4 5 6 7 8 9 10 11 12 49 | 50 | # Matrix creation through cbind (column bind) function: 51 | 52 | matrix1 <- cbind(1:4, 5:8, 9:12) 53 | 54 | matrix1 55 | # [,1] [,2] [,3] 56 | # [1,] 1 5 9 57 | # [2,] 2 6 10 58 | # [3,] 3 7 11 59 | # [4,] 4 8 12 60 | 61 | # Matrix creation through rbind (row bind) function. 62 | 63 | matrix2 = rbind( c(1,5,9), c(2,6,10), c(3,7,11), c(4,8,12) ) 64 | # [,1] [,2] [,3] 65 | # [1,] 1 5 9 66 | # [2,] 2 6 10 67 | # [3,] 3 7 11 68 | # [4,] 4 8 12 69 | 70 | # Matrix + scalar example: 71 | # R adds the scalar value to each entry in the matrix: 72 | 73 | matrix1 + scalar1 74 | # [,1] [,2] [,3] 75 | # [1,] 2 6 10 76 | # [2,] 3 7 11 77 | # [3,] 4 8 12 78 | # [4,] 5 9 13 79 | 80 | # Matrix + vector example: 81 | # R does the operation in a column-wise manner: 82 | 83 | matrix1 + vector1 84 | # [,1] [,2] [,3] 85 | # [1,] 2 6 10 86 | # [2,] 4 8 12 87 | # [3,] 6 10 14 88 | # [4,] 8 12 16 89 | 90 | # Matrix + matrix example (it's always component wise): 91 | 92 | matrix1 + matrix2 93 | # [,1] [,2] [,3] 94 | # [1,] 2 10 18 95 | # [2,] 4 12 20 96 | # [3,] 6 14 22 97 | # [4,] 8 16 24 98 | 99 | # Matrix standard product example (it's always component wise): 100 | 101 | matrix1 * matrix2 102 | # [,1] [,2] [,3] 103 | # [1,] 1 25 81 104 | # [2,] 4 36 100 105 | # [3,] 9 49 121 106 | # [4,] 16 64 144 107 | 108 | -------------------------------------------------------------------------------- /Individual Scripts/7. Lists.R: -------------------------------------------------------------------------------- 1 | ## Lists 2 | 3 | # Lists are a special type of vector where each element can be anything, even another list. 4 | 5 | list_example <- list( A = c(5,6,7,8) 6 | , B = letters[1:6] 7 | , CC = list( 5, "Z") 8 | ) 9 | 10 | example_list <- list (course = 'stat' 11 | , date = '04/07/2009' 12 | , num_isc = 7 13 | , num_cons = 6 14 | , num_mat = as.character(c(45020, 45679, 46789, 43126, 42345, 47568, 45674)) 15 | , results = c(30, 19, 29, NA, 25, 26 ,27) 16 | ) 17 | 18 | # Extracting elements from a list can be done by name (if the list is named) or by index: 19 | 20 | example_list$date # returns the date element "04/07/2009" 21 | example_list[1] # returns the 1st element "stat" 22 | 23 | # Lists have two very important uses: 24 | 25 | # 1) Since functions can only return a single value, it is common to return complicated results in a list. 26 | # 2) Lists are also the underlying fundamental class for data frames. Under the hood, a data frame is a list of 27 | # vectors all having the same length. 
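# (Aside, not in the original script: note the difference between single and double brackets
# when extracting from a list. Single brackets return a list containing the selected elements,
# while double brackets return the element itself:

example_list[1]          # a list of length 1 (still of class "list") holding "stat"
example_list[["course"]] # the bare element, the character string "stat"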

# Using a list to return function results:

example_function <- function(x) list( xplus = x + 10, xsq = x ^ 2 )

results = example_function (7)

results$xplus # returns 17
results$xsq # returns 49

# Using a list to create a data frame:

list1 <- list(x = 1:2, y = c("A","B"))

data_frame1 <- data.frame(list1)

data_frame1
# x y
# 1 1 A
# 2 2 B

is.list(data_frame1) # returns TRUE

--------------------------------------------------------------------------------
/Individual Scripts/8. Data Frames.R:
--------------------------------------------------------------------------------
## Data Frames

# A data.frame is a special kind of list: it is rectangular. Each element (column) of the list has the same length,
# and each row has a "row name". Each column has its own class, but the class of one column can be different
# from the class of another column (unlike a matrix, where all elements must have the same class).

example_frame <- data.frame( matr = as.character(c(45020, 45679, 46789, 43126, 42345, 47568, 45674))
                           , res_S = c(30, 19, 29, NA, 25, 26, 27)
                           , res_O = c(3, 3, 1, NA, 3, 2, NA)
                           )

example_frame
# matr res_S res_O
# 1 45020 30 3
# 2 45679 19 3
# 3 46789 29 1
# 4 43126 NA NA
# 5 42345 25 3
# 6 47568 26 2
# 7 45674 27 NA

# Subsetting rows and columns from a data frame

# We can access elements of a data frame using matrix notation (with single brackets data[rows, columns]):

example_frame[1, 1] # returns 45020

# Get the first row:

example_frame[1, ] # returns 45020 30 3

# Get the first 2 rows:

example_frame[1:2, ]
# matr res_S res_O
# 1 45020 30 3
# 2 45679 19 3

# Get the first column:

example_frame[, 1] # returns 45020 45679 46789 43126 42345 47568 45674

# Get the res_O column:

example_frame$res_O # returns 3 3 1 NA 3 2 NA

# Get the res_S column:

example_frame[, 'res_S'] # returns 30 19 29 NA 25 26 27

# Get the first and third columns:

example_frame[, c(1, 3)]
# matr res_O
# 1 45020 3
# 2 45679 3
# 3 46789 1
# 4 43126 NA
# 5 42345 3
# 6 47568 2
# 7 45674 NA

# Get the first 4 rows of the res_S and res_O columns:

example_frame[1:4, c("res_S", "res_O")]
# res_S res_O
# 1 30 3
# 2 19 3
# 3 29 1
# 4 NA NA

# If you extract multiple columns, you will get a data frame back. However, if you extract a single column, you
# will get a vector, not a data frame, under the default options.

# Multiple columns return a data frame:

class(mtcars[, c("mpg", "cyl")]) # returns "data.frame"

# Single column returns a vector:

class(mtcars[, "mpg"]) # returns "numeric"

# When you use single brackets and no commas, you will get a data frame of the selected columns back,
# because data frames are lists of columns.

class(mtcars["mpg"]) # returns "data.frame"

mtcars[c("mpg", "cyl", "disp")] # returns a data frame containing the columns "mpg", "cyl", and "disp"

# To extract a single column as a vector when treating your data.frame as a list, you can use double brackets [[.
# This will only work for a single column at a time.
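
# As with lists in general, [[ drops the column name and returns just the underlying vector, so the result
# can be passed straight to vector functions - a small illustrative sketch:

mean(mtcars[["mpg"]]) # returns the mean miles per gallon across all cars, roughly 20.1
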

# Extract a single column by name as a vector:

mtcars[["mpg"]]

# A single column can be extracted using the magical shortcut $ without using a quoted column name.
# Columns accessed by $ will always be vectors, not data frames.

mtcars$mpg # returns the column "mpg" as a vector

# Logical vectors indicate specific elements to keep. We can use a condition such as < to generate a logical vector,
# and extract only the rows that meet the condition.

mtcars[mtcars$mpg < 15, ] # returns all cars which have miles per gallon (mpg) less than 15

mtcars[mtcars$cyl == 4, ] # returns all columns for rows where the value of cyl is 4

mtcars[mtcars$cyl == 4, c("cyl", "mpg", "hp")] # returns the cyl, mpg, and hp columns where the value of cyl is 4

# Convenience functions to manipulate data.frames

# The subset() function allows you to subset a data.frame in a more convenient way (subset also works with other
# classes):

# Return the rows for which cyl == 6, keeping only the columns mpg and hp:

subset(mtcars, subset = cyl == 6, select = c("mpg", "hp"))

# Same as:

mtcars[mtcars$cyl == 6, c("mpg", "hp")]

# The transform() function is a convenience function to change columns inside a data.frame.

# The below example adds another column named mpg2 with the result of mpg^2 to the mtcars data.frame:

mtcars <- transform(mtcars, mpg2 = mpg^2)

# Both with() and within() let you evaluate expressions inside the data.frame environment.

# The below example shows how to create, change and/or remove multiple columns in the airquality data.frame:

aq <- within(airquality, {
    lOzone <- log(Ozone) # creates new lOzone column
    Month <- factor(month.abb[Month]) # converts the existing Month column to a factor
    cTemp <- round((Temp - 32) * 5/9, 1) # creates new cTemp column
    S.cT <- Solar.R / cTemp # creates new S.cT column
    rm(Day, Temp) # removes Day and Temp columns
})

# It is important to note that, in R versions before 4.0.0, data frames coerce characters to factors by default.

# The behavior can be set explicitly with the stringsAsFactors parameter (since R 4.0.0 it defaults to FALSE,
# so character columns are kept as characters). Example:

df3 <- data.frame(x = 1:3, y = c("a", "b", "c"), stringsAsFactors = FALSE)

# If the data has already been created, factor columns can be converted to character columns as shown below.

person <- data.frame( jobs = c("scientist", "analyst")
                    , pay = c(160000, 100000)
                    , age = c(30, 25)
                    )

# Convert all columns to character:

person[] <- lapply(person, as.character)

# We can remove all rows from the data frame which have missing (NA) values using the
# complete.cases function, which returns a logical vector indicating which cases have
# no missing values:

person_with_missing_info <- data.frame( jobs = c("scientist", "secret agent", "analyst")
                                      , pay = c(160000, NA, 120000)
                                      , age = c(30, NA, 45)
                                      )

person_with_missing_info[complete.cases(person_with_missing_info), ]
# jobs pay age
# 1 scientist 160000 30
# 3 analyst 120000 45

# Let's say we want to omit only the rows which have missing jobs info:

person_with_missing_info[complete.cases(person_with_missing_info[, c('jobs')]), ]
# jobs pay age
# 1 scientist 160000 30
# 2 secret agent NA NA
# 3 analyst 120000 45

# Let's say we want to replace NA values with zeros - we can do this simply using the is.na function:

person_with_missing_info[is.na(person_with_missing_info)] <- 0

# Joining data frames

# We can do inner/outer/cross joins on data frames. As an example, let's take the 2 frames below.
# As a note, the R rep function replicates the input vector or list.

data_frame1 = data.frame(CustomerId = c(1:6), Product = c(rep("Toaster", 3), rep("Radio", 3)))

data_frame2 = data.frame(CustomerId = c(2, 4, 6), State = c(rep("Alabama", 2), rep("Ohio", 1)))

# By using the merge function and its optional parameters, we can do SQL style joins on our data.
# R automatically joins the frames by common variable names. Some examples are provided below:

merge(x = data_frame1, y = data_frame2) # Performs an inner join

merge(x = data_frame1, y = data_frame2, by = "CustomerId", all = TRUE) # Performs an outer join

merge(x = data_frame1, y = data_frame2, by = "CustomerId", all.x = TRUE) # Performs a left join

merge(x = data_frame1, y = data_frame2, by = "CustomerId", all.y = TRUE) # Performs a right join

merge(x = data_frame1, y = data_frame2, by = NULL) # Performs a cross join

# You can merge / join on multiple columns by giving 'by' a vector, e.g., by = c("CustomerId", "OrderId")

# We can also use the sqldf package, which allows you to express these operations in SQL:

install_and_load_package('sqldf')

# Inner join:

result_frame <- sqldf("SELECT CustomerId, Product, State
                       FROM data_frame1
                       JOIN data_frame2 USING(CustomerId)")

# Left join:

result_frame <- sqldf("SELECT CustomerId, Product, State
                       FROM data_frame1
                       LEFT JOIN data_frame2 USING(CustomerId)")

--------------------------------------------------------------------------------
/Individual Scripts/9. Factors.R:
--------------------------------------------------------------------------------
## Factors

# Factors are used to represent categorical data and can be unordered or ordered. One can think of a factor as
# an integer vector where each integer has a label. Factors are important in statistical modeling and are treated
# specially by modelling functions like lm() and glm().
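
# For instance, when a factor appears on the right hand side of a model formula, lm() automatically expands it
# into dummy (indicator) variables rather than treating it as a plain number. A small illustrative sketch using
# the built-in mtcars data, treating the cyl column as categorical:

cyl_factor_model <- lm(mpg ~ factor(cyl), data = mtcars)

coef(cyl_factor_model) # returns an intercept plus one coefficient for each non-baseline cyl level (6 and 8)
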

# Using factors with labels is better than using integers because factors are self-describing. Having a variable
# that has values "Male" and "Female" is better than a variable that has values 1 and 2.

factor_example <- factor(c("yes", "yes", "no", "yes"))

factor_example
# [1] yes yes no yes
# Levels: no yes

--------------------------------------------------------------------------------
/Learn R by Example.R:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/photonlines/Learn-R-by-Example/31a02695139f26cd28b7b546a7122c711224e7a9/Learn R by Example.R

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Learn R by Example

This is an R script meant to serve those who want to get an overview of R by looking through code / code comments and useful examples.

A summary of the file contents included in this repository is provided below:

- Learn R by Example.R : File which contains the entire script / R overview.
- 'Individual Scripts' directory : Directory which contains the Learn R by Example file contents split up into separate sections / R scripts.
- SplitRFilePSScript.ps1 : PowerShell script which splits the Learn R by Example file into individual sections and saves them in the 'Individual Scripts' directory.

## License
This code is released under the Creative Commons BY-SA license.
Most of the original examples / content have been modified and taken from the R Notes for Professionals book available here: https://books.goalkicker.com/RBook/
The original book content was compiled from Stack Overflow Documentation, and written by the great people at Stack Overflow.

--------------------------------------------------------------------------------
/SplitRFilePSScript.ps1:
--------------------------------------------------------------------------------
# The following PowerShell script divides the original 'Learn R by Example.R' file
# into multiple ones and copies them into another output directory.

# As a note, some of the individual scripts may not work when run in isolation,
# since some of the examples re-use the install_and_load_package function or
# data from previous sections / examples.

$InputFileName = "Learn R by Example.R"
$OutputDirectoryName = "Individual Scripts"

# Flag used to control whether you want to re-create the output directory containing
# the individual R files. If it's set to True, the output directory is deleted and
# re-created before the files are copied over. Otherwise, the scripts are copied
# over and any files already present within the output directory are overwritten.

$ReCreateOutputDirectory = $True

# Section delimiter which controls how the original R file is sub-divided. Currently,
# the beginning of each section is marked with the '##' comment characters followed by
# the section title, and these markers are used to split the original R file.

$SectionDelimiter = "##"

# Fetches the PowerShell script root directory path and returns it:

Function Get-RootDirectoryPath {

    $ScriptRoot = ""

    # Try to make sure that the script is compatible with both newer and older
    # versions of PowerShell:

    Try {
        $ScriptRoot = Get-Variable -Name PSScriptRoot -ValueOnly -ErrorAction Stop
    } Catch {
        $ScriptRoot = Split-Path $script:MyInvocation.MyCommand.Path
    }

    return $ScriptRoot
}

$RootDirectoryPath = Get-RootDirectoryPath

$InputFilePath = (Join-Path $RootDirectoryPath $InputFileName)

$OutputDirectoryPath = (Join-Path $RootDirectoryPath $OutputDirectoryName)

# If the re-create directory flag is set to true, delete the output
# directory prior to copying the files over.

If ($ReCreateOutputDirectory) {

    Remove-Item $OutputDirectoryPath -Recurse -ErrorAction Ignore

}

# Create the output directory (if it doesn't exist)

New-Item -ItemType Directory -Force -Path $OutputDirectoryPath

# Fetch the number of lines in our R file so we can create a progress indicator

$NumberOfLinesInFile = 0

gc $InputFilePath -read 100 | % { $NumberOfLinesInFile += $_.Length }

# Read the Learn R by Example file line by line:

$Reader = New-Object System.IO.StreamReader($InputFilePath)

Try {

    $SectionNumber = 1
    $LineNumber = 1

    While (($Line = $Reader.ReadLine()) -ne $null) {

        # If we find a new section delimiter / marker, we create a new file to output
        # our results to. The new file name is set to the section number followed by
        # the R section title which is provided after the delimiter we want to use.
        # For example, for the first section, the delimiter '##' in '## Section title'
        # will create a new '1. Section title.R' file in our output directory.

        If ($Line -match ($SectionDelimiter + " (.+\n?)")) {

            $SectionTitle = $matches[1].Trim()
            $OutputFileName = ($SectionNumber).ToString() + ". " + $SectionTitle + ".R"
            $SectionNumber = $SectionNumber + 1

        }

        # Add the line we just processed to our output directory / file:

        Add-Content (Join-Path $OutputDirectoryPath $OutputFileName) $Line

        # Update the progress indicator to show the percentage processed and the current line number:

        $PercentComplete = ($LineNumber / $NumberOfLinesInFile) * 100
        Write-Progress -Activity 'Processing R File' -Status "On line $LineNumber" -PercentComplete $PercentComplete

        $LineNumber = $LineNumber + 1

    }

} Finally {

    $Reader.Dispose()

}
--------------------------------------------------------------------------------