├── README.md
└── Workshop_rstats
    ├── Part1_Introduction
    │   ├── 00_InstallWorkshopPackages.R
    │   ├── 01_data_types.R
    │   ├── 02_common_commands.R
    │   ├── 03_indexing.R
    │   ├── 04_string_manipulation.R
    │   ├── 05_loops.R
    │   ├── 06_ifelse_statements.R
    │   ├── 07_functions.R
    │   └── 08_packages.R
    ├── Part2_DataCleanup
    │   └── dataCleanup.R
    ├── Part3_VectorData
    │   └── vectorProcessing.R
    ├── Part4_Rasters
    │   └── rasterProcessing.R
    ├── README.md
    └── VM_setup.sh
/README.md:
--------------------------------------------------------------------------------
1 | # FOSS4G_Boston2017
2 | Repository for all things FOSS4G 2017
3 |
--------------------------------------------------------------------------------
/Workshop_rstats/Part1_Introduction/00_InstallWorkshopPackages.R:
--------------------------------------------------------------------------------
1 | # Learning R: installPackages.R
2 | # author: Tina Cormier
3 | # date: August, 2017
4 |
5 | # Description: Installs the packages we need during the FOSS4G workshop.
6 | p1 <- proc.time() # timing your code :)
7 |
8 | #################################
9 | install.packages("raster") # raster AND vector analysis
10 | install.packages("rgdal") # bindings to GDAL (must have gdal installed).
11 | install.packages("maptools") # provides various mapping functions, BUT I don't recommend using it to read/write files bc it drops projection information.
12 | install.packages("devtools") # makes creating R packages easier (also allows us to install packages from other locations)
13 | devtools::install_github("hadley/ggplot2@v2.2.0") # data visualization package that breaks graphics into component parts (e.g., plot, layers, scales). Uses devtools, installed above.
14 | install.packages("GGally") # extension of ggplot
15 | devtools::install_github("dgrtwo/gganimate") # create animated ggplot2 plots
16 | # install.packages("plotly") # a graphing package for interactive plots
17 | install.packages("data.table") # enhanced version of data.frames. FAST
18 | install.packages("lubridate") # package that facilitates working with dates and times
19 | install.packages("RColorBrewer") # premade color palettes for plots
20 | install.packages("colorRamps") # package to build custom gradient color ramps
21 | install.packages("sf") # simple features
22 | install.packages("tmap") # layer-based approach to building thematic maps
23 | install.packages("rasterVis") # visualization of raster data
24 | install.packages("gdalUtils") # wrappers for GDAL utilities
25 | install.packages("ggmap") # visualize spatial data atop static maps from online sources
26 | install.packages("rgeos") # interfact to GEOS (Geometry Engine - Open Source)
27 | install.packages("dplyr") # A fast set of tools for working with data frame like objects
28 | install.packages("sp") # working with spatial objects (this package will be installed with raster, but wanted to explicitly recognize it as important)
29 | install.packages("leaflet") # interactive web mapping using Leaflet - we won't use this today, but web mappers may want to play with it!
30 | install.packages("rmarkdown") # convert to and from rmarkdown files
31 | install.packages("knitr") # a tool for dynamic report generation in R
32 | install.packages("rlist") # a set of functions for working with lists
33 | install.packages("snow") # simple parallel computing in R
34 |
35 | #################################
36 | p2 <- proc.time()
37 | p2-p1
38 |
39 | # Note that several of these packages are in the tidyverse package, which would be great to explore.
40 | # You can install that if you wish - it installs multiple packages that are part of the "tidyverse,"
41 | # a collection of packages with similar philosophies that are designed to work together.
42 | # (e.g., ggplot, dplyr, lubridate, stringr, etc.)
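# For example (optional for this workshop):
# install.packages("tidyverse")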
43 |
44 | # Needed?
45 | # install.packages("reshape")
46 |
--------------------------------------------------------------------------------
/Workshop_rstats/Part1_Introduction/01_data_types.R:
--------------------------------------------------------------------------------
1 | # Learning R: 01_data_types.R
2 | # author: Tina Cormier
3 | # date: August, 2017
4 |
5 |
6 | ######## REMARKS #########
7 |
8 | # Everything in R is an object.
9 | #
10 | # The assignment operator '<-' is traditionally used to store
11 | # objects as variables; however "=" will also work.
12 | #
13 | # R is case sensitive.
14 | #
15 | # In R, variable names may NOT start with a number.
16 | #
17 | # Take care not to use c, t, cat, F, T, or D as variable names - they name
18 | # built-in functions/values, and masking them invites confusing bugs (example below).
19 | #
20 | # Vectors and Data Frames are the most common data types
21 | # we will use in R to start.
22 | #
23 |
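# A quick example of why masking matters: overwriting T changes later logical tests.
T <- 0
isTRUE(T) # FALSE - T no longer stands for TRUE
rm(T)     # remove the mask so T means TRUE again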
24 | ######## BASIC DATA TYPES ########
25 |
26 | # vectors, matrices, arrays, data frames, lists, factors, functions
27 |
28 |
29 | # VECTORS - The basic data object in R.
30 | # A vector is a one-dimensional array of arbitrary length.
31 |
32 | # VECTOR TYPES - all elements in a vector must be of the same type.
33 | # Numbers
34 | num <- 4
35 | num
36 |
37 | # ***Note that R is case sensitive:
38 | num <- 4
39 | # is NOT the same as
40 | Num <- 10
41 |
42 | num
43 | Num
44 |
45 | # **Also, note that variable names may NOT start with a number:
46 | # NOT OK (syntax error - uncomment to see):
47 | # 1num <- 5
48 | # This works
49 | num1 <- 5
50 |
51 | # Can do mathematical operations to numbers
52 | # multiply, divide, add, subtract, exponents
53 | num * 5
54 | num / 2
55 | num + 8
56 | num - 1
57 | num^8
58 |
59 |
60 | # Strings
61 | string <- "hello world"
62 | string
63 |
64 |
65 | # Logical (no quotes - "TRUE" in quotes would be a character string, not a logical)
66 | logic <- TRUE
67 | logic
68 | # logical test - boolean
69 | logic == FALSE
70 | logic == F
71 | logic == T
72 |
73 |
74 | # Vectors can be lists of like values (numbers, strings, logical, etc.)
75 | # The ":" creates sequences that increment/decrement by 1.
76 | x <- 1:10
77 | x
78 | x <- 1:-10
79 | x
80 |
81 | # Use "c" to "combine" or "concatenate" values.
82 | x <- c(2, 4, 6, 8)
83 | x
84 | x <- c("R", "stats", "for", "the", "win")
85 | x
86 |
87 | # Use "seq" to create a sequence.
88 | x <- seq(2, 8, by = 2)
89 | x
90 |
91 | # Can do mathematical operations on vectors. This is key for efficiency!
92 | x*10
93 |
94 |
95 | # MATRICES - A two-dimensional array with an arbitrary number
96 | # of rows and columns. All elements of the matrix must be
97 | # of the same type (numeric, character, etc.).
98 |
99 | x <- matrix(c(1,2,3, 11,12,13), nrow = 3, ncol=2, dimnames = list(c("row1", "row2", "row3"), c("C.1", "C.2")))
100 | x
101 |
102 | # is an object a matrix (T/F)?
103 | tx <- is.matrix(x)
104 | tx
105 |
106 | # Set object as a matrix.
107 | x <- warpbreaks
108 | x
109 | class(x)
110 | x <- as.matrix(warpbreaks[1:10,])
111 | class(x)
112 | x
113 |
114 | # ARRAYS - A matrix of arbitrary dimensions.
115 | x <- array(c(1,2,3, 11,12,13, 20,21,22), dim=c(3,3), dimnames = list(c("row1", "row2", "row3"), c("C.1", "C.2", "C.3")))
116 | x
117 |
118 | # DATA FRAMES - Most data will be stored in data frames,
119 | # rectangular arrays that are usually formed by combining
120 | # vectors (columns). Data frames, as opposed to matrices,
121 | # can contain vectors of different types. Data frames are
122 | # LIKE your typical spreadsheet or database table.
123 |
124 | # Constructing data frames
125 | x <- seq(0, 10, length = 20)
126 | x
127 | y <- rep(c("a", "b", "c", "d"), 5)
128 | y
129 |
130 | my.df <- data.frame(numbers=x, letters=y)
131 | my.df
132 |
133 | # A method for adding columns to a data frame.
134 | # cbind means "column-binding."
135 | newCol <- seq(1, 2, length=20)
136 | my.df <- cbind(my.df, newCol)
137 | my.df
138 |
139 | # Another method for adding columns to a data frame. "$" indicates
140 | # column names in a data frame. This method is particularly useful
141 | # for adding NEW columns.
142 | # Try looking at the column named "numbers"
143 | my.df$numbers
144 | # Now add another new column
145 | my.df$newest_column <- log(my.df$newCol)
146 | my.df
147 |
148 | # Can also delete column.
149 | my.df$newest_column <- NULL
150 | my.df
151 |
152 | # LISTS - Lists are generic vectors. Lists contain elements,
153 | # each of which can contain any type of R object (i.e., the elements
154 | # of a list do not have to be of the same type).
155 |
156 | # Here is an example of a list that contains 3 different classes:
157 | # numbers, matrix, and even a function.
158 |
159 | complicated.list <- list("a"=1:4, "b"=1:3, "c"=matrix(1:4, nrow=2), "d"=sd)
160 |
161 | # What class is object complicated.list?
162 | class(complicated.list)
163 |
164 | # What classes are the individual elements in complicated.list?
165 | # The "apply" group of functions operates nicely on lists (and other
166 | # objects) to apply a function to each element in the list.
167 | lapply(complicated.list, class)
168 |
169 | # FACTORS - Represent an efficient way to store categorical values.
170 | # Factors are frequently used in statistical modeling. For instance, when
171 | # classifying land cover, the response "land cover type" is a categorical
172 | # variable. Models handle factors differently than continuous variables,
173 | # such as biomass, precipitation, or elevation. Factors can be ordered
174 | # or unordered
175 |
176 | x <- c(3,2,1,3,1,2,1,3,1,2,3,3,2)
177 | x
178 | class(x)
179 | fx <- factor(x)
180 | fx
181 | class(fx)
182 | levels(fx)
183 |
184 | # Ordered vs. unordered factors
185 | # unordered
186 | days <- c("sunday", "monday", "wednesday", "friday", "saturday","tuesday", "sunday","wednesday", "thursday", "friday", "saturday")
187 | days
188 | str(days)
189 | un.days <- factor(days)
190 |
191 | # Note that the days are not in order
192 | table(un.days)
193 |
194 | # But we know days happen in a certain order, and we can enforce that order.
195 | or.days <- ordered(days, levels=c("sunday", "monday", "tuesday", "wednesday", "thursday", "friday", "saturday"))
196 | table(or.days)
197 |
198 | # FUNCTIONS - R objects that allow the user to reduce complex
199 | # procedures into a single command. Useful for procedures that are
200 | # run repeatedly with different inputs. R has a large number of "built-in"
201 | # functions, like "mean," "plot," etc. Users can create their own
202 | # functions as well.
203 |
204 | # Built-in functions - examples
205 | x <- c(5,48,12,45)
206 | x
207 |
208 | # Mean
209 | mean(x)
210 |
211 | # Sort
212 | sort(x)
213 |
214 | # Print
215 | print(x)
216 | paste("Working on process ", x, ". Almost done!", sep="")
217 |
218 | # Plot
219 | plot(x, x^2)
220 |
221 |
222 |
223 |
224 |
225 |
--------------------------------------------------------------------------------
/Workshop_rstats/Part1_Introduction/02_common_commands.R:
--------------------------------------------------------------------------------
1 | # Learning R: 02_common_commands.R
2 | # author: Tina Cormier
3 | # date: August, 2017
4 |
5 | # Examples of common, useful commands in R.
6 |
7 | # Commands are usually issued in the form of FUNCTIONS in R.
8 | # Functions are called by entering the name of the function,
9 | # followed by parentheses. Some functions have arguments,
10 | # which are specified within the parentheses.
11 |
12 | # Getting help
13 | # A web-based set of help pages
14 | help.start()
15 | # Show details of a function
16 | # We will go over what a function is and how to
17 | # know what the arguments are.
18 | help(mean)
19 | # A short cut to do the same thing
20 | ?mean
21 | # Gives a list of functions involving the word "mean"
22 | help.search("mean")
23 | # A short cut to do the same thing. The ?? utility can handle misspelled words.
24 | ??mean
25 | # Run the examples from the help page
26 | example(mean)
27 |
28 |
29 | # The quit function
30 | # q()
31 |
32 | # Setting your working directory
33 | # The default location where R will look to read and
34 | # write files.
35 |
36 | # To see current working directory
37 | getwd()
38 |
39 | # To change or set your working directory
40 | setwd("/home/user/R_workshop/data/")
41 | getwd()
42 |
43 | # To list user-defined objects and functions
44 | ls()
45 | # or
46 | objects()
47 |
48 | # To remove a variable from the workspace
49 | x <- "hello world"
50 | x
51 | rm(x)
52 | x
53 | # or to remove ALL variables from the workspace
54 | rm(list=ls())
55 |
56 | # Exploring data
57 | cnames <- c("speed", "distance") # (avoid the name "colnames" - it's a base R function)
58 | x <- data.frame(seq(1,50, by=1.5), seq(1, 185, by=5.6))
59 | # Change the names of the columns
60 | names(x) <- cnames
61 |
62 | # Determining the class/type of an object
63 | class(x)
64 | class(x$speed)
65 |
66 | # Finding the structure of an object
67 | str(x)
68 |
69 | # Names
70 | names(x)
71 |
72 | # Quick view of the data (without printing thousands of lines)
73 | head(x)
74 | head(x, 3)
75 | tail(x)
76 |
77 | # Finding the dimensions of an object (rows,columns).
78 | # Works for multi-dimensional objects. For vectors,
79 | # use length().
80 | dim(x)
81 | length(x$speed)
82 |
83 | ncol(x)
84 | nrow(x)
85 |
86 | # Sorting
87 | y <- sample(0:50, 20, replace=FALSE)
88 | sort(y)
89 | rev(y) # rev() just reverses the current order - it does not sort
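# To actually sort in descending order, use sort() with decreasing=TRUE:
sort(y, decreasing=TRUE)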
90 |
91 | # Matching
92 | # Find position of maximum/minimum value in a vector
93 | which.max(y)
94 | which.min(y)
95 |
96 | # Find positions of values in y that are also in x$speed (logical vector)
97 | y %in% x$speed
98 |
99 | # Some math
100 | mean(x$speed)
101 | max(x$speed)
102 | min(x$distance)
103 | range(x$distance)
104 |
105 | x/10
106 |
107 | summary(x)
108 |
109 | # Subsetting
110 | subset(x, x$speed < 31.0)
111 |
112 | # Tables
113 | apples <- c("McIntosh", "Granny Smith", "Red Delicious", "McIntosh", "McIntosh", "Granny Smith")
114 | table(apples)
115 |
116 | # Putting vectors together to create data frames/matrices.
117 | y <- sample(0:50, 20, replace=FALSE)
118 | z <- sample(89:500, 20, replace=FALSE)
119 |
120 | # Create a matrix treating y and z as rows.
121 | rbind(y,z)
122 |
123 | # Create a matrix treating y and z as columns.
124 | cbind(y,z)
125 |
126 |
127 |
128 |
--------------------------------------------------------------------------------
/Workshop_rstats/Part1_Introduction/03_indexing.R:
--------------------------------------------------------------------------------
1 | # Learning R: 03_indexing.R
2 | # author: Tina Cormier
3 | # date: August, 2017
4 |
5 | # Indexing allows access to individual elements or subsets of objects.
6 | # Indexing can be used to both extract part of an object and/or to replace
7 | # parts of or add to an object.
8 |
9 |
10 | ################ VECTORS AND MATRICES ################
11 | # For vectors and matrices, the form x[i] or y[row,col] is generally used.
12 | x <- c(3,7,9,2,NA,5)
13 | x
14 |
15 | # Extract element 4.
16 | x[4]
17 |
18 | # Extract elements 1-3.
19 | x[1:3]
20 |
21 | # Adding a "-" sign means "except."
22 | # Extract all values except the 3rd
23 | x[-3]
24 |
25 | # Locate missing data
26 | is.na(x)
27 |
28 | # Remove missing data
29 | x[!is.na(x)]
30 | # or
31 | na.omit(x)
32 |
33 | # Or replace missing data:
34 | # Here, you would read this line as "x, where x is NA, is now 10."
35 | x[is.na(x)] <- 10
36 | x
37 |
38 | # Now a matrix
39 | y <- matrix(c(2,7,9,2,NA,5), nrow=2)
40 | y
41 | dim(y)
42 |
43 | # Find value at row 1, column 3
44 | y[1,3]
45 |
46 | # Extract row 1, columns 1 and 2
47 | y[1,1:2]
48 |
49 | # Extract columns 1 and 3
50 | y[,c(1,3)]
51 |
52 | # Locate missing data
53 | is.na(y)
54 |
55 | ################ DATA FRAMES ################
56 |
57 | # Indexing data frames can be done the same way as matrices.
58 | # The format df$colname can also be used.
59 |
60 | df <- as.data.frame(y)
61 | df
62 | names(df)
63 | names(df) <- c("car", "toy", "sky")
64 |
65 | # extract values from column called "toy"
66 | df$toy
67 | # or
68 | df[,2]
69 |
70 | # Do you want to know if an entire row is complete (no NA values)?
71 | complete.cases(df)
72 | # Now remove rows that are NOT complete - we often need to do this for modeling.
73 | df.complete <- df[complete.cases(df),]
74 |
75 | ################ LISTS ################
76 |
77 | # Indexing lists
78 | mylist <- list(x=x, y=y)
79 | mylist
80 | length(mylist)
81 |
82 | # To access the first element in the list
83 | mylist[[1]]
84 |
85 | # To access the 1st column of the 2nd list element
86 | mylist[[2]][,1]
87 |
88 |
89 | ################ WHICH FUNCTION ################
90 | # Returns indices of TRUE values in a logical vector.
91 | l <- LETTERS
92 | l
93 | which(l=="T")
94 | l[20]
95 |
96 | my.list <- c("helicopter", "plane", "car", "bus", "bicycle", "horse")
97 | your.list <- c("fly", "run", "car", "drive", "pedal", "horse")
98 |
99 | # Which positions in your.list match my.list
100 | which(your.list %in% my.list)
101 |
102 | # To get the actual values, instead of the positions
103 | your.list[which(your.list %in% my.list)]
104 |
--------------------------------------------------------------------------------
/Workshop_rstats/Part1_Introduction/04_string_manipulation.R:
--------------------------------------------------------------------------------
1 | # Learning R: 04_string_manipulation.R
2 | # author: Tina Cormier
3 | # date: August, 2017
4 |
5 | # Parsing strings in R.
6 |
7 | # Use the print function
8 | print("hello world")
9 |
10 | # Set a variable to a character vector
11 | x <- "Woods Hole Research Center"
12 | x
13 |
14 | # Get the length of a character vector
15 | length(x)
16 |
17 | # Why is this different from Python's answer? length() counts elements in
18 | # the vector, not characters. For the number of characters, try:
19 | nchar(x)
20 |
21 | # Concatenate vectors using "print" and "paste"
22 | x <- "hello world, I am "
23 | y <- "from"
24 | z <- " the United States."
25 |
26 | print(paste(x, y, z, sep=""))
27 |
28 | # Substrings in a character vector.
29 | x <- "my name is Tina"
30 | x
31 | substr(x, 12, 15)
32 |
33 | # Splitting strings according to a substring - result is a list.
34 | x <- "/Users/tcormier/Documents/capacity_building/lidar_workshop_Nepal_20161202/tcormier_intro_to_R/july_precip.tif"
35 | z <- strsplit(x, "/")
36 | z
37 | class(z)
38 | # To pull out the 3rd element in the split
39 | z[[1]][3]
40 |
41 | # OR, can use the "unlist" command to place each element into a vector.
42 | z <- unlist(strsplit(x, "/"))
43 | z
44 | z[3]
45 | class(z)
46 |
47 | # Creating new file names from existing files.
48 | unlist(strsplit(x, "\\."))
49 | newfile <- paste(unlist(strsplit(x, "\\."))[1], "_NEW.tif", sep="")
50 | newfile
51 |
52 | # String substitution
53 | # sub() substitutes the first occurrence of "tcormier" with "awesome":
54 | sub("tcormier", "awesome", x)
55 | gsub("tcormier", "awesome", x) # gsub() substitutes all occurrences (same result here - "tcormier" appears once)
56 |
57 | # To substitute all occurrences of a string.
58 | x <- "mary had a little lamb, little lamb, little lamb"
59 | # Using sub
60 | sub("little", "HUGE", x)
61 | # Now use gsub and note the difference
62 | gsub("little", "HUGE", x)
63 |
64 | # Searching for patterns (using regular expressions)
65 | x <- c("image.tif", "biomass.tif", "help.csv", "precipitation.tif", "precipitation.xls")
66 | # Find positions of all elements of x that end with .tif ("\\." escapes the dot; "$" anchors to the end)
67 | grep("\\.tif$", x)
68 | # To find the "values" of all elements of x that end
69 | # with .tif, rather than the positions:
70 | grep("\\.tif$", x, value=TRUE)
71 |
72 |
--------------------------------------------------------------------------------
/Workshop_rstats/Part1_Introduction/05_loops.R:
--------------------------------------------------------------------------------
1 | # Learning R: 05_loops.R
2 | # author: Tina Cormier
3 | # date: August, 2017
4 |
5 | ################ WHILE LOOP ################
6 | a=0
7 | while (a < 10) {
8 | a=a+1
9 | print(a)
10 | }
11 |
12 |
13 | ################ FOR LOOP ################
14 | # for loop on a vector
15 | a = c("harry", "joe", "peter")
16 | for (name in a) {
17 | print(name)
18 | }
19 |
20 | # Loop over each row in a data frame
21 | x <- data.frame(seq(1,50, by=1.5), seq(1, 185, by=5.6))
22 | names(x) <- c("A", "B")
23 | x
24 |
25 | for (i in c(1:nrow(x))) {
26 | # DO things.
27 | y <- x$A[i] + x$B[i]
28 | print(paste("the value of A is ", x$A[i], ", the value of B is ", x$B[i], ", and A + B is ", y, sep=""))
29 | }
30 |
31 | ################ "APPLY" FUNCTIONS ################
32 | # apply and friends
33 | # Can make code more efficient (to run and to read)
34 | # by eliminating loops.
35 |
36 | # lapply - loop over a list and perform a function on each element.
37 | # lapply always returns a list object, regardless of input class.
38 | # sapply - same as lapply but simplifies the result, if possible
39 | x <- c("bob", "harry", "abc", "i", "lisa")
40 | x
41 | # Using lapply to determine the number of characters in each element
42 | lapply(x, nchar)
43 |
44 | # Another example
45 | x <- list(a = 1:10, b = rnorm(20))
46 | x
47 | lapply(x, mean)
48 | # sapply returns a pretty vector instead of a list.
49 | x.sapply <- sapply(x, mean)
50 | x.sapply
51 | # Now the same thing using a loop, for reference (5 lines instead of 1):
52 | x.loop <- vector()
53 | for (i in c(1:length(x))) {
54 | i.mean <- mean(x[[i]])
55 | x.loop[i] <- i.mean
56 | }# end loop
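# Compare - the loop produces the same values as the one-line sapply call:
x.loop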
57 |
58 | # apply - apply a function over the margins of an array
59 | # Useful if you want to perform the same function to all
60 | # rows or columns in a matrix, for example.
61 | x <- matrix(rnorm(200), 20, 10)
62 | x
63 |
64 | # Calculate the mean for each column (should be 10 columns)
65 | apply(x, 2, mean)
66 |
67 | # Calculate the mean for each row (should be 20 rows)
68 | apply(x, 1, mean)
69 |
70 | # tapply - apply a function over subsets of a vector
71 | attach(iris)
72 | str(iris)
73 | head(iris)
74 |
75 | # For each species type, find mean petal length -
76 | # this would involve looping over each unique species
77 | # if we wrote a loop. Here = one liner!
78 | tapply(Petal.Length, Species, mean)
79 |
--------------------------------------------------------------------------------
/Workshop_rstats/Part1_Introduction/06_ifelse_statements.R:
--------------------------------------------------------------------------------
1 | # Learning R: 06_ifelse_statements.R
2 | # author: Tina Cormier
3 | # date: August, 2017
4 |
5 | # ifelse(condition, value if true, value if false)
6 | score <- 5
7 | contest <- ifelse(score >=2, "winner", "try again")
8 | contest
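# Note that ifelse() is vectorized - it evaluates the condition element-by-element,
# so it also works on a whole vector of scores at once:
scores <- c(1, 3, 5)
ifelse(scores >= 2, "winner", "try again")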
9 |
10 | # OR, another way of writing it, which is useful
11 | # when you need to do something more complex depending
12 | # on the condition.
13 | score <- 1
14 | if (score >=2) {
15 | contest <- "winner"
16 | } else {
17 | contest <- "try again"
18 | }
19 | print(contest)
20 |
21 | # If you have multiple conditions
22 | score <- 50
23 | # score <- 3
24 | # score <- 1
25 | if (score >= 10) {
26 | contest <- "superb"
27 | } else if (score >= 2) {
28 | contest <- "pretty good"
29 | } else {
30 | contest <- "try again"
31 | }
32 | print(contest)
33 |
34 |
--------------------------------------------------------------------------------
/Workshop_rstats/Part1_Introduction/07_functions.R:
--------------------------------------------------------------------------------
1 | # Learning R: 07_functions.R
2 | # author: Tina Cormier
3 | # date: August, 2017
4 |
5 | # If you find yourself writing the same code over and over again,
6 | # it might be time to make it a function!
7 |
8 | # Defining a function
9 | hello <- function() {
10 | print("Hello World")
11 | }
12 | hello()
13 |
14 | # A simple function
15 | foo <- function(x) {
16 | print(paste0("the argument is ", x))
17 | }
18 | foo("tina")
19 | foo(67)
20 |
21 | # A more complex function (with positional arguments)
22 | add.stuff <- function(x,y,z) {
23 | print(paste("Arg x = ", x, sep=""))
24 | print(paste("Arg y = ", y, sep=""))
25 | print(paste("Arg z = ", z, sep=""))
26 | return(sum(x,y,z))
27 | }
28 |
29 | add.stuff(4,5,6)
30 | add.stuff(1,5) # Error: argument "z" is missing, with no default - hence the defaults below
31 |
32 | # Naming arguments and setting default values
33 | add.stuff <- function(x=2,y=1,z=NULL) {
34 | print(paste("Arg x = ", x, sep=""))
35 | print(paste("Arg y = ", y, sep=""))
36 | print(paste("Arg z = ", z, sep=""))
37 | return(sum(x,y,z))
38 | }
39 |
40 | add.stuff()
41 | add.stuff(y=4, z=9, x=23)
42 |
43 | # A note of advice: make your function do one thing, and keep it small and flexible.
44 | # Do your testing outside the function.
45 |
46 |
47 | ################ A NOTE ABOUT VARIABLES & FUNCTIONS ################
48 |
49 | x <- "I live in the global environment"
50 | x
51 |
52 | var <- function() {
53 | x <- "I live inside of my function"
54 | print(x)
55 | }
56 |
57 | # So what is the value of "x" now?
58 | var()
59 | x
60 |
61 | # We can assign the value of x in the function to a new
62 | # variable that is accessible outside of the function
63 | # (i.e., bring the value of x from inside the function into
64 | # the global environment by assigning it to a variable):
65 | y <- var()
66 | y
67 |
68 | # x is still unchanged
69 | x
70 |
71 | # OR
72 |
73 | # To export a variable within a function to the global
74 | # environment, use "<<-". Use caution not to unintentionally
75 | # overwrite other global variables!
76 | var2 <- function() {
77 | x <<- "Yay, now I'm global"
78 | }
79 |
80 | # So what is the value of "x" now?
81 | var2()
82 | x
83 |
--------------------------------------------------------------------------------
/Workshop_rstats/Part1_Introduction/08_packages.R:
--------------------------------------------------------------------------------
1 | # Learning R: 08_packages.R
2 | # author: Tina Cormier
3 | # date: August, 2017
4 |
5 | # The base version of R is powerful in itself; however
6 | # the capabilities of R can be extended through user-created
7 | # packages. Packages are collections of R functions, data,
8 | # and compiled code. Currently, there are over 10,000 packages
9 | # available on CRAN (the Comprehensive R Archive Network),
10 | # Bioconductor, and other repositories.
11 |
12 | # R already has a standard set of packages available when you
13 | # install. Other packages are available for download. Once
14 | # installed, they have to be loaded into your R session to be
15 | # used.
16 |
17 | # Get your library location (where does R look for packages?)
18 | .libPaths()
19 | # See all installed packages
20 | library()
21 | # See all currently loaded packages
22 | search()
23 |
24 | # You can expand/enhance the types of analyses you do by adding
25 | # other packages.
26 |
27 | # First choose your closest CRAN mirror (you'll be given a list to pick from),
28 | # then install a package:
29 | chooseCRANmirror()
30 | install.packages("foreign")
31 |
32 | # OR
33 |
34 | # Essential dependencies will also be downloaded and installed.
35 | install.packages("randomForest")
36 | # Installing to a specific (non-default) library location
37 | install.packages("zyp", lib="/Users/tcormier/Library/R/2.11/library")
38 |
39 | # If you want to install ALL dependencies:
40 | install.packages("randomForest", dependencies=TRUE)
41 |
42 | # OR
43 |
44 | # Go to the "Tools" menu, choose "Packages."
45 |
46 | # To USE the new package, must load it into your R session or script.
47 | # Can use either "library" or "require." "require" is designed for use
48 | # inside of other functions; it will return "FALSE" and give a warning
49 | # (rather than an error, as the "library" function will do by default)
50 | # if the package does not exist.
51 | library(randomForest)
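# Because require() returns TRUE/FALSE, a common "install if missing" pattern
# (a minimal sketch, using a package we already installed above) is:
if (!require(foreign)) {
  install.packages("foreign")
  library(foreign)
}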
52 |
53 | # To remove a package
54 | remove.packages("zyp", lib="/Users/tcormier/Library/R/2.11/library")
55 |
56 |
--------------------------------------------------------------------------------
/Workshop_rstats/Part2_DataCleanup/dataCleanup.R:
--------------------------------------------------------------------------------
1 | # Learning R: dataCleanup.R
2 | # author: Tina Cormier
3 | # date: August, 2017
4 |
5 | # Sometimes we just get crappy data handed to us. Ok, a lot of the time.
6 | # R is an excellent tool for finding and fixing issues.
7 |
8 | # Someone [your boss, a client, the governor - anyone] gives you a text file
9 | # representing some point observations. He wants you to do a GIS analysis with
10 | # the points, but when you open it, you realize you've got some issues to solve
11 | # first.
12 |
13 | ######################################
14 | library(data.table) # enhanced version of data.frames. FAST.
15 | library(ggplot2) # data visualization package that breaks graphics into component parts (e.g., plot, layers, scales).
16 | wd <- "/home/user/R_workshop/data/"
17 | setwd(wd)
18 |
19 | # You could also use read.csv from base R here, but fread is way faster if you have large tables.
20 | # OH, and by default, it sets "stringsAsFactors" to FALSE, which is amazing.
21 | system.time(birds <- fread("eBird/ebd_NE_4spp_workshopData.txt", sep=" "))
22 | system.time(birds2 <- read.table("eBird/ebd_NE_4spp_workshopData.txt", stringsAsFactors = F, header = T))
23 |
24 | # A quick preview of the table:
25 | head(birds)
26 |
27 | # Dimensions?
28 | dim(birds)
29 |
30 | # What about the structure (i.e., what are the characteristics of the fields?)
31 | str(birds)
32 |
33 | # Ok, we see geo coordinates, let's plot them quick and dirty just to get an idea of
34 | # the data:
35 | plot(birds$LONGITUDE, birds$LATITUDE, pch=20, col='orange')
36 |
37 | # Eeeek. Something bad happened.
38 | # LAT and LON are swapped AND it looks like we have an errant state.
39 | # We need to swap the LONGITUDE and LATITUDE fields, and there are many
40 | # ways to do that.
41 | # One way is to create two new columns and assign the correct values to them.
42 | birds$x <- birds$LATITUDE
43 | birds$y <- birds$LONGITUDE
44 |
45 | # Write your own plot statement here using the names of your new columns,
46 | # and make the points 'blue':
47 |
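# One possible answer (commented out - write your own first!):
# plot(birds$x, birds$y, pch=20, col='blue')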
48 |
49 | # Another way to swap the values in your columns - this way makes the change in place rather
50 | # than creating new columns.
51 | birds[ , c("LATITUDE", "LONGITUDE")] <- birds[ , c("LONGITUDE","LATITUDE")]
52 |
53 | # Ok, now we have a state included that is outside of New England. As GIS analysts,
54 | # our minds go right to "clipping it out," but we can do something even easier!
55 | # Let's see what states we have in the data set:
56 | unique(birds$STATE_PROVINCE)
57 |
58 | # Ohio is the culprit
59 | # Let's remove OHIO from our point data, which we can do quickly with a tabular
60 | # function rather than a spatial one!
61 | # First, how many records do we have?
62 | dim(birds)
63 | birds <- birds[birds$STATE_PROVINCE != 'Ohio',]
64 | dim(birds)
65 | unique(birds$STATE_PROVINCE)
66 | plot(birds$LONGITUDE, birds$LATITUDE, pch=20, col='orange')
67 |
68 | # Now let's say we want to see a plot of date vs. count of birds
69 | # observed. First, let's see how our date field is formatted.
70 | str(birds$OBSERVATION.DATE)
71 |
72 | # Since R understands dates and can do math based on them (and can do other things,
73 | # like convert from a regular date to julian day), we'll convert that field to a date field.
74 | # We'll use the lubridate package to help us.
75 | library(lubridate) # package that facilitates working with dates and times.
76 |
77 | # Let's let lubridate guess the date format for us, since that can be a pain (especially
78 | # if they aren't consistent. HINT: they're not consistent).
79 | # Let's create a new variable equal to "birds," so we don't have to start over
80 | # if we (when) mess it up!
81 | birds2 <- birds
82 | birds2$OBSERVATION.DATE <- as_date(birds2$OBSERVATION.DATE)
83 | # Ok, it ran without error, but let's check it out anyway.
84 | str(birds2$OBSERVATION.DATE)
85 |
86 | # Excellent, the field is now formatted as a date.
87 | # One other check: should all be between 2014 and 2017 - have to have some knowledge of your data set.
88 | unique(year(birds2$OBSERVATION.DATE))
89 | # SHOOT, we have issues! One way to ID where they are:
90 | birds2$OBSERVATION.DATE[grep("^20",year(birds2$OBSERVATION.DATE), invert = T)]
91 | # Almost the same command, but this time find the positions of the messed-up dates.
92 | # Note grep vs. grepl: grepl ("grep logical") returns TRUE/FALSE for each element rather than positions or values.
93 | pos <- which(grepl("^20",year(birds2$OBSERVATION.DATE)) == F)
94 |
95 | # Let's fix those that we identified as different.
96 | # First, what were they in our original table? How are the ones we flagged
97 | # different in format than the rest of the table?
98 | birds$OBSERVATION.DATE[pos]
99 | # vs.
100 | head(birds$OBSERVATION.DATE)
101 | # Note the difference in format.
102 |
103 | # We can use lubridate to fix the format, but we'll need to treat them separately.
104 | # Start over with dates from data set before you changed the dates.
105 | # We'll tell R it's mdy format just at those positions, and it will format them
106 | # to the more standard year, month, day like the rest of the table.
107 | birds2 <- birds
108 | birds2$OBSERVATION.DATE[pos] <- as.character(mdy(birds2$OBSERVATION.DATE[pos]))
109 | # Quick look to see what happened:
110 | birds2$OBSERVATION.DATE[pos] # BOOM!
111 |
112 | # Now run the original command again and convert all to date format (instead of character):
113 | birds2$OBSERVATION.DATE <- as_date(birds2$OBSERVATION.DATE)
114 | # And check again to make sure we did what we thought we did!
115 | unique(year(birds2$OBSERVATION.DATE)) # YAY!
116 |
117 | # Since we fixed that, let's go back to using just the 'birds' variable.
118 | birds <- birds2
119 |
120 | # Why do we care if we have something in date format (vs. character)? Sometimes we don't. BUT,
121 | # if we want to plot a time series or group things by date, then this makes it way easier.
122 | # Since R sees it as a special date field, we can also do things like this, which you can't do with text:
123 | max(birds$OBSERVATION.DATE)
124 | median(birds$OBSERVATION.DATE)
125 | min(birds$OBSERVATION.DATE)
126 |
127 | # Let's label each row with its year
128 | birds$year <- year(birds$OBSERVATION.DATE) # Look - no parsing!
129 |
130 | # Observations by species across years (we'll learn more about ggplot later!),
131 | # but look - no date formatting in the graph!
132 | # First make sure our count field is numeric (hint, it isn't, see?)
133 | str(birds)
134 | birds$OBSERVATION.COUNT <- as.numeric(birds$OBSERVATION.COUNT)
135 |
136 | ggplot(data=birds, aes(x=OBSERVATION.DATE, y=OBSERVATION.COUNT, color=COMMON.NAME)) + geom_point(alpha=0.25)
137 |
138 | # Moving on from dates, which could be a course in itself,
139 | # let's check for duplicate rows!
140 | # Are there any?
141 | birds[duplicated(birds),]
142 | # Seems to be! How many?
143 | dim(birds[duplicated(birds),])
144 | # There are a couple of ways to rid your data set of duplicates, but this one is easy
145 | # to remember: duplicated function.
146 | # First check to see how many records are in your dataset:
147 | dim(birds)
148 | birds <- birds[!duplicated(birds),]
149 | # Now how many are there?
150 |
151 | # Lastly, for our purposes, let's say we only want data with complete rows (no NAs).
152 | # Are there any incomplete rows?
153 | birds[!complete.cases(birds),]
154 | # Of course there are! How many?
155 | nrow(birds[!complete.cases(birds),])
156 | # Remove! - but do a row check first
157 | nrow(birds)
158 | birds <- birds[complete.cases(birds),]
159 | # Check rows again to make sure we removed the incomplete rows.
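nrow(birds)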
160 |
161 | # Excellent! Let's write out this new clean table (or look in your data directory for my version.)
162 | write.table(birds, "eBird/ebd_NE_4spp_workshopData_dataCleanup.txt", row.names=F)
163 | # or
164 | fwrite(birds, "eBird/ebd_NE_4spp_workshopData_dataCleanup.txt", sep=" ")
165 |
--------------------------------------------------------------------------------
/Workshop_rstats/Part3_VectorData/vectorProcessing.R:
--------------------------------------------------------------------------------
1 | # Learning R: vectorProcessing.R
2 | # author: Tina Cormier
3 | # date: August, 2017
4 |
5 | # Description: Learn how to do basic geoprocessing and visualization with vector files.
6 |
7 | ######################################
8 | library(raster) # raster AND vector analysis
9 | library(data.table) # enhanced version of data.frames. FAST.
10 | library(rgdal) # bindings to GDAL (must have gdal installed).
11 | library(ggplot2) # a plotting system in R (much NICER than base plotting)
12 | library(RColorBrewer) # Color palettes
13 | library(colorRamps) # package to build gradient color ramps
14 | library(sf) # simple features
15 | library(rlist) # a set of functions for working with lists
16 | library(dplyr) # A fast set of tools for working with data frame like objects
17 | # library(plotly) # a graphing package for interactive plots
18 | library(tmap) # layer-based approach to building thematic maps
19 | library(gganimate) # create animated ggplot2 plots
20 | library(lubridate) # package that facilitates working with dates and times
21 | ######################################
22 | wd <- "/home/user/R_workshop/data/"
23 | setwd(wd)
24 |
25 | # 1. Re-open cleaned birds txt file
26 | birds <- fread("eBird/ebd_NE_4spp_workshopData_dataCleanup.txt", sep=" ")
27 |
28 | # 2. Convert to spatial points df
29 | # Although we've plotted the points, we haven't yet defined our birds data table as
30 | # a spatial object. First, which columns are the coordinates?
31 | birds.xy <- birds[,c("LONGITUDE", "LATITUDE")]
32 | birds.sp <- SpatialPointsDataFrame(coords=birds.xy, data=birds, proj4string = CRS("+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0"))
33 | birds.sp
34 | # We could write this out to a shapefile if we want to now:
35 | shapefile(birds.sp, "eBird/ebd_NE_4spp_workshopData_sp.shp", overwrite=T)
36 | # Thanks ESRI - cut our field names. How about a geojson file instead?
37 | writeOGR(birds.sp, "eBird/test_geojson", layer="birds", driver="GeoJSON")
38 |
39 | # 3. Visualize
40 | plot(birds.sp)
41 | # That's kind of ugly, but we can see our geography issues have been resolved! How about this?
42 | # qtm = quick thematic map = parallel structure to ggplot2's 'qplot'
43 | qtm(shp = birds.sp, symbols.col="COMMON.NAME")
44 | # Nicer - but points are huge - let's adjust
45 | qtm(shp = birds.sp, symbols.col="COMMON.NAME", symbols.size=0.15)
46 | # Great! Now let's fix the legend title, and add a north arrow and scale bar.
47 | m <- qtm(shp = birds.sp, symbols.col="COMMON.NAME", symbols.size=0.15,
48 | title="Species Observations\n2014 - 2017", symbols.title.col="Common Name") +
49 | tm_compass() + tm_scale_bar()
50 | m
51 |
52 | # a map by date?
53 | p <- ggplot(birds.sp@data, aes(x=LONGITUDE, y=LATITUDE)) + geom_point()
54 | p # Will get a weird warning here because we aren't using the most updated version of ggplot.
55 |
56 | # add some color and frame by year
57 | p <- ggplot(birds.sp@data, aes(x=LONGITUDE, y=LATITUDE, color=COMMON.NAME, frame=year)) + geom_point()
58 | p
59 |
60 | gganimate(p) # cool!
61 | gganimate(p, "birds_byYear.gif")
62 |
63 | # 4. Bring in county layer
64 | county <- shapefile("boundaries/USA_adm2_conus_albers.shp")
65 | county
66 | # plot(county)
67 | str(county) # YIKES!
68 |
69 | # NOTE sf package: New package that simplifies life with spatial objects in R
70 | county.sf <- read_sf("boundaries/USA_adm2_conus_albers.shp") # note the speed!
71 | # Be careful plotting this one though! If you just execute plot(county.sf), you'll get a separate plot
72 | # for each column in the table...SLOW. Specify which column
73 | # plot(county.sf[,"NAME_2"])
74 |
75 | # For reference - to reach the attribute table of the county variable (s4 object), we need to call
76 | # the "data" slot, and we do that with an @ symbol. Like '$' references column names, '@' references
77 | # slots in an object. (Simple features eliminates this and lets you treat the spatial object as a data frame.)
78 | head(county)
79 | # example of how to extract specific rows from the attribute table
80 | county@data[county$NAME_2 == 'Rockingham',] # results in rows from the table (non-spatial)
81 | county[county$NAME_2 == 'Rockingham',] # results in polygon selection (spatial)
82 |
83 | # 5. Project points to match county
84 | # What is the projection of each of our data sets?
85 | # There is a handy function in the raster package to help us out
86 | projection(birds.sp)
87 | projection(county)
88 | # or on the simple features layer - just multiple ways of doing the same thing!
89 | st_crs(county.sf)
90 |
91 | # Let's continue on with county polygons, but know that simple features exist and you should check them out!
92 | # WHY are we using something a bit older, slower, and clunkier? Well, eventually we want to integrate some raster
93 | # stuff, and the raster package can't deal with sfs just yet (it's super new, y'all!). If you ONLY work with
94 | # vectors, I suggest trying out simple features, which seems poised to replace the sp package in the future.
95 |
96 | # Ok, back to our regularly scheduled reprojection:
97 | birds.proj <- spTransform(birds.sp, CRS(projection(county)))
98 | plot(birds.sp)
99 | plot(birds.proj)
100 | # Yay, projected
101 |
102 | # 6. Clip points by counties to remove points in the ocean. Also show how intersect does same thing.
103 | # Looks like we have some points in the ocean! We'll use the raster package's intersect function.
104 |
105 | # The raster::intersect function uses a command you're probably familiar with AND it appended the county layer's
106 | # attributes onto the table from birds.proj
107 | # birds.int <- raster::intersect(birds.proj, county) # you can try running this line and see what happens.
108 |
109 | # On the VM, we probably get a memory error here. This is a good time to
110 | # illustrate looping to get around a memory error.
111 | for (i in 1:length(county$ID_2)) {
112 | print(i)
113 | }
114 |
115 | # Cool, now let's really do stuff in that loop.
116 | # First, set up output
117 | birds.int <- list()
118 | # i=1 # (uncomment to step through the loop body manually)
119 | for (i in 1:length(county$ID_2)) {
120 | print(paste(county$NAME_2[i], county$NAME_1[i], sep=", "))
121 | bi <- raster::intersect(birds.proj, county[i,])
122 | if (nrow(bi@data) == 0) {
123 | print("no intersecting features")
124 | next()
125 | } else {
126 | birds.int <- list.append(birds.int, bi)
127 | }
128 | }
129 | #
130 | # Ok, let's kill that - there's no sense in looping over states/counties
131 | # where we know we don't have data = inefficient!
132 | # We have state info in the bird data set, so let's select from
133 | # the counties every state that matches in the bird data:
134 | county.sel <- county[county$NAME_1 %in% birds.proj$STATE_PROVINCE,] # WOAH, WTH is that? It's awesomeness.
135 | # Did that work?
136 | # plot(county.sel)
137 | unique(county.sel$NAME_1)
138 |
139 | # Try our loop again, but with county.sel in place of county:
140 | birds.intlist <- list()
141 | for (i in 1:length(county.sel$ID_2)) {
142 | print(paste(county.sel$NAME_2[i], county.sel$NAME_1[i], sep=", "))
143 | bi <- raster::intersect(birds.proj, county.sel[i,])
144 | if (nrow(bi@data) == 0) {
145 | print("no intersecting features")
146 | next()
147 | } else {
148 | birds.intlist <- list.append(birds.intlist, bi)
149 | }
150 | }
151 |
152 | birds.intlist
153 | # Well, we don't want a list of features, we want one spatial object containing
154 | # all of the features.
155 | birds.int <- do.call(bind, birds.intlist)
156 |
157 | # What has changed about our spatial points?
158 | head(birds.int)
159 |
160 | # A side note: If we ONLY wanted to clip/subset the points, R can use the same indexing syntax - square brackets -
161 | # to select a subset as it does for other objects, AND it's faster!
162 | birds.sub <- birds.proj[county,]
163 |
164 | # Back to birds.int. Now we have two "state" fields - one originating from the eBird data set and
165 | # the other from our admin boundaries. Let's do a quick QA check to see if the state listed in
166 | # eBird == admin boundary state.
167 | birds.check <- birds.int[birds.int$STATE_PROVINCE != birds.int$NAME_1,]
168 | head(birds.check) # SHOOT! More data clean up.
169 | # Which one is right? Don't know, so let's trash any that don't match.
170 | birds.int <- birds.int[birds.int$STATE_PROVINCE == birds.int$NAME_1,]
171 | # Check again to see if we have any non-matching
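# One quick check - count the rows where the two state fields still disagree (should be 0):
sum(birds.int$STATE_PROVINCE != birds.int$NAME_1)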
172 |
173 |
174 | # 8. Summarize spp by county and state
175 | # Let's check in with our "apply" family of functions to get some quick info
176 | # tapply = table apply to get counts per county and per state
177 | birds.pc <- tapply(birds.int$OBSERVATION.COUNT, birds.int$ID_2, sum)
178 | birds.pc # Why did we do this by ID and not by county name?
179 | birds.ps <- tapply(birds.int$OBSERVATION.COUNT, birds.int$ID_1, sum)
180 | birds.ps
181 |
182 | # or by state name:
183 | birds.ps <- tapply(birds.int$OBSERVATION.COUNT, birds.int$NAME_1, sum) # Why doesn't summarizing by name work with counties?
184 | birds.ps
185 |
186 | # 9. Let's make some plots!
187 | # Plots of spp observed over time
188 | # Let's sum the number of observations per date per species, just to get an idea
189 | birds.agg <- aggregate(OBSERVATION.COUNT~OBSERVATION.DATE+COMMON.NAME, birds.int@data, FUN=sum)
190 | # Now we need "OBSERVATION.DATE" as a date - remember how to do that?
191 | library(lubridate)
192 | birds.agg$OBSERVATION.DATE <- as_date(birds.agg$OBSERVATION.DATE)
193 | summary(birds.agg)
194 |
195 | # Now some plotting
196 | ggplot(data=birds.agg, aes(x=OBSERVATION.DATE, y=OBSERVATION.COUNT, color=COMMON.NAME)) + geom_line()
197 | # Interesting, but large spikes of big birding days (great backyard bird count, etc.?) are making it hard to see trends.
198 | # Let's look at just trend lines
199 | # NOTE: Aesthetics supplied to ggplot() are used as defaults for every layer.
200 | # you can override them, or supply different aesthetics for each layer.
201 | ggplot(data=birds.agg, aes(x=OBSERVATION.DATE, y=OBSERVATION.COUNT, color=COMMON.NAME)) + geom_smooth()
202 | # ok, can we remove the se bars?
203 | ggplot(data=birds.agg, aes(x=OBSERVATION.DATE, y=OBSERVATION.COUNT, color=COMMON.NAME)) + geom_smooth(se=F)
204 |
205 | # Maybe nicer as a barplot? - but only by year
206 | birds.agg$year <- year(birds.agg$OBSERVATION.DATE)
207 | ggplot(data=birds.agg, aes(x=year, y=OBSERVATION.COUNT, fill=COMMON.NAME)) + geom_bar(stat='identity')
208 | # Why stat=identity? From the geom_bar() docs: by default, geom_bar uses stat="count", which makes the
209 | # height of the bar proportional to the number of cases in each group. If you want the heights of the
210 | # bars to represent values in the data (we do, because the values ARE the counts), use stat="identity"
211 | # and map a variable to the y aesthetic.
212 |
213 | # Now we'd like side by side bars:
214 | ggplot(data=birds.agg, aes(x=year, y=OBSERVATION.COUNT, fill=COMMON.NAME)) +
215 | geom_bar(stat='identity', position='dodge')
216 | # We can save plots to objects for later printing to a graphics device (pdf, png, etc.)
217 | # It also helps you set up your base graph and you can try adding new/different geoms.
218 | # The typical syntax=
219 | p <- ggplot(data=birds.agg, aes(x=year, y=OBSERVATION.COUNT, fill=COMMON.NAME))
220 | p <- p + geom_bar(stat='identity', position='dodge')
221 | p
222 | # Can we make it interactive? - commented out because this will only work with the most
223 | # recent version of ggplot2. We have installed a slightly older version to work with ggmap.
224 | # Sigh, it's complicated. But I'm leaving this here for you to try at another point!
225 | # ggplotly(p) # There are a lot of ways to customize, but it's this simple to get started!
226 |
227 | # Let's aggregate again, but this time by county ID
228 | birds.cy <- aggregate(OBSERVATION.COUNT~ID_2, birds.int@data, FUN=sum)
229 | # Let's add the county names on there, which requires some matching between tables
230 | # We could join the tables, but we don't want all of the fields from birds.int - just one!
231 | birds.cy$county <- birds.int$NAME_2[match(birds.cy$ID_2, birds.int$ID_2)] # match gives the first match only, but that's all we need here.
232 | birds.cy
233 |
234 | # New county layer with just these 6 New England states
235 | county.ne <- county[county$NAME_1 %in% birds.int$NAME_1,]
236 | # plot(county.ne)
237 | # Let's write this to a shapefile for later :)
238 | shapefile(county.ne, "boundaries/county_ne.shp", overwrite = T)
239 |
240 | # 10. Map where county color is # bird observations?
241 | # First, need to join the count information from birds.cy to our new county.ne spatial polygons data frame.
242 | # There are multiple ways we could do it - I'll show you two:
243 | # a. the base R way
244 | county.ne$count <- birds.cy$OBSERVATION.COUNT[match(county.ne$ID_2, birds.cy$ID_2)]
245 |
246 | # Now the tidyverse way (a way of thinking about programming that is cleaner and easier - ggplot is in that group)
247 | # Let's use a new variable to be safe
248 | county.ne2 <- county.ne@data %>% left_join(birds.cy) # cool, right? - it appends the other fields as well.
249 |
250 | # Let's take a quick glance at a summary of the table before we map:
251 | summary(county.ne)
252 | # Let's recode count to 0 where it is NA
253 | county.ne$count[is.na(county.ne$count)] <- 0
254 | summary(county.ne)
255 |
256 | # Ok, let's map! # choropleth
257 | qtm(county.ne, fill = "count")
258 | # try a different color palette
259 | qtm(county.ne, fill = "count", style='col_blind')
260 | # now something else
261 | qtm(county.ne, fill = "count", fill.palette="-YlGnBu")
262 | # not thrilled with the breaks on this map - let's set our own!
263 | # Let's see how deciles look
264 | breaks <- quantile(county.ne$count, probs=seq(0,1,by=0.1))
265 | qtm(county.ne, fill = "count", fill.palette="Purples", fill.style="fixed",fill.breaks=breaks)
266 |
267 | # Another common GIS function = dissolve. Let's dissolve counties to states
268 | states <- aggregate(county.ne, by="NAME_1")
269 | qtm(states) # complex boundaries, so could take a moment to plot.
270 |
271 |
272 | # 11. Birding hotspots?
273 | # Let's try ggmap
274 | library(ggmap) # Spatial visualization with ggplot2
275 | m <- qmap(location="new england", zoom=6) # set location and background
276 | m
277 | m + geom_point(data=birds.int@data, aes(x=LONGITUDE, y=LATITUDE)) # Well, that's something.
278 | # Let's try to get the density (2-D kernel density estimation)!
279 | m + stat_density2d(data=birds.int@data, aes(x = LONGITUDE, y = LATITUDE))
280 |
281 | # How about filled contours to show birding hot spots? We'll use default colors for now
282 | # Hints as we go along: try changing the # of bins, the colors, and the alpha levels. Just play!
283 | m + stat_density2d(data=birds.int@data, aes(x = LONGITUDE, y = LATITUDE, fill = ..level..,alpha=..level..), bins = 30, geom = "polygon")
284 |
285 | # Hmmm, Massachusetts is popular - let's zoom in and make a few changes
286 | mm <- qmap(location='boston', zoom=10, color="bw") # Can make background map black and white.
287 | # OR
288 | # Choose another source
289 | mm <- qmap(location='boston', zoom=10, source="google", maptype="satellite")
290 | # OR
291 | # Our original - the background you choose will affect which colors you like as we go along.
292 | mm <- qmap(location='boston', zoom=10)
293 | mm
294 |
295 | # first, with default colors:
296 | mm.density <- mm + stat_density2d(data=birds.int@data[birds.int@data$NAME_1=='Massachusetts',], aes(x = LONGITUDE, y = LATITUDE, fill = ..level..,alpha=..level..), bins = 15, geom = "polygon") +
297 | ggtitle("Birding Hotspots") +
298 | guides(alpha=FALSE) # Turn off alpha legend item
299 |
300 | mm.density
301 |
302 | # Now start to play by using scale_fill_gradient argument (you may find you like the default best!)
303 | # Pick some new colors - here are a bunch of ideas - try your own too!
304 | display.brewer.all()
305 | # Also check out https://www.nceas.ucsb.edu/~frazier/RSpatialGuides/colorPaletteCheatsheet.pdf for more ideas
306 | # density.colors <- c("gray20", "gray50", brewer.pal(9, "Blues")[6:9]) # custom mix
307 | # density.colors <- brewer.pal(9, "PuBuGn")[5:9] # RColorBrewer package
308 | # density.colors <- rev(brewer.pal(11, "Spectral")[1:4]) # RColorBrewer package
309 | # density.colors <- rev(heat.colors(4, alpha=1)) # From colorRamp package
310 |
311 | # These next few options are actually creating a color ramp function, which accepts a number (# of colors in ramp)
312 | # density.colors <- blue2green2red(50) # From colorRamp package
313 | # density.fun <- colorRampPalette(c("gray30", "blue4", "green4", "red3"))
314 | # density.fun <- colorRampPalette(c("gray40", "black")) # A little grayscale to make the points pop?
315 | density.fun <- colorRampPalette(c("darkslategray","darkred", "red"))
316 | density.colors <- density.fun(4)
317 |
318 | mm.density <- mm + stat_density2d(data=birds.int@data[birds.int@data$NAME_1=='Massachusetts',], aes(x = LONGITUDE, y = LATITUDE, fill = ..level..,alpha=..level..), bins = 15, geom = "polygon") +
319 | scale_fill_gradientn(colors=density.colors) +
320 | ggtitle("Birding Hotspots") +
321 | guides(alpha=FALSE) #+
322 | # scale_alpha_continuous(range=c(0.06,0.5)) # By default, the alpha scales from 0.1 to 1. We can adjust to our liking, especially when point density is high.
323 |
324 | mm.density
325 |
326 | # add points on top, colored by species - kind of a mess, but you can do it.
327 | mm.density + geom_point(data = birds.int@data, aes(x=LONGITUDE, y=LATITUDE, color=COMMON.NAME), size=0.6, alpha=0.5) +
328 | scale_colour_manual(values=brewer.pal(4, "Set1")) + # You can try different point colors too! Comment out for defaults.
329 | guides(color=guide_legend(override.aes=list(fill=NA, linetype = 0, alpha=1, size=1))) # no boxes around legend items
330 |
331 | # Now we have a projected, intersected point file and a clipped county file. Let's write them out for later (we already
332 | # wrote the county to a shapefile, but for the sake of learning). Won't use shapefiles because they will truncate our field
333 | # names in the ebird data. Not geoJSON because they can't handle projections. How about an
334 | # RDATA file for later? Excellent!
335 | save(birds.int, county.ne, file="eBird/birds_counties.RDATA")
336 |
--------------------------------------------------------------------------------
/Workshop_rstats/Part4_Rasters/rasterProcessing.R:
--------------------------------------------------------------------------------
1 | # Learning R: rasterProcessing.R
2 | # author: Tina Cormier
3 | # date: August, 2017
4 |
5 | # Description: Learn how to do basic geoprocessing and visualization with raster files.
6 |
7 | ######################################
8 | library(raster) # raster AND vector analysis
9 | library(rasterVis) # visualization of raster data
10 | library(data.table) # enhanced version of data.frames. FAST.
11 | library(rgdal) # bindings to GDAL (must have gdal installed).
12 | library(ggplot2) # a plotting system in R (much NICER than base plotting)
13 | library(gdalUtils) # wrappers for GDAL utilities
14 | library(GGally) # extension of ggplot
15 | ######################################
16 |
17 | # Time to stop hard coding everything in the body of the code.
18 | # We'll list our user-input variables up here at the top and build from those.
19 | # What is our current working directory?
20 | getwd()
21 | # Let's set a new one!
22 | wd <- "/home/user/R_workshop/data/"
23 | setwd(wd)
24 |
25 | # User variables
26 | # rdata file containing some variables from our previous session
27 | rdata <- "eBird/birds_counties.RDATA"
28 | # canopy cover img
29 | canopy.file <- "images/lc/nlcd2011_usfs_conus_canopy_cover.tif"
30 | # impervious surface img
31 | imp.file <- "images/lc/nlcd_2011_impervious_2011_edition_2014_10_10.tif"
32 | # landsat directory - just one tile
33 | ls.dir <- "images/ls/"
34 |
35 | # Goal: We'd like to understand a little bit about where we tend to find birds of certain species.
36 |
37 | # 1. Open some layers
38 | # Open rdata file
39 | load(rdata)
40 | ls() # what objects are in the global environment? Look! birds.int and county.ne from our last script!
41 |
42 | # Open the canopy cover layer
43 | cc <- raster(canopy.file)
44 |
45 | # impervious surfaces
46 | imp <- raster(imp.file)
47 | # Inspect the new datasets (histograms, plots, quick maps)
48 | cc
49 | imp
50 | # quick plot - we'll work on styling later!
51 | plot(cc)
52 | # let's assign an NA value and try plotting again (we know the NA value from knowing the data, looking in qgis, gdalinfo etc.)
53 | # When the layer was created, NAvalue wasn't defined for some reason, but we can define it.
54 | NAvalue(cc) <- 255
55 | # plot again
56 | plot(cc)
57 |
58 | # Look at impervious surfaces:
59 | plot(imp)
60 |
61 | # The nodata value is 127. Assign NA value to imp and plot again:
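# One way to do it (mirroring what we did for the canopy layer above):
# NAvalue(imp) <- 127
# plot(imp)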
62 |
63 |
64 | # How about our points on top? Quick and dirty = not pretty
65 | system.time(plot(birds.int, col="blue", pch=19, cex=0.7, add=T))
66 |
67 |
68 | # Projection check:
69 | projection(cc)
70 | # projection(lc)
71 | projection(imp)
72 | projection(county.ne)
73 |
74 | # Excellent! Same projection. But if we had to project cc to match our county polygons,
75 | # this is how we'd do it. projectRaster will automatically work in parallel if you start
76 | # a cluster. It may still be slower than gdalwarp with the -multi flag. This can be a time-consuming step,
77 | # so we won't do it today, but here's the code.
78 | # beginCluster()
79 | # cc.proj <- projectRaster(cc, crs=projection(county.ne))
80 | # endCluster()
81 |
82 |
83 | # For a moment we'll work just on a subset of the larger raster:
84 | # Clip the raster to one county in our shapefile. When the raster is much larger in extent than my clip polygon (or raster),
85 | # I usually do this in two steps to make it faster. 1. crop to polygon layer (just crops to extent);
86 | # 2. mask by same polygon, which actually extracts the shape of the polygon.
87 | # For the sake of speed today, we'll work with just one of the most interesting counties in Massachusetts.
88 | barn <- county.ne[county.ne$NAME_1 == "Massachusetts" & county.ne$NAME_2 == "Barnstable",]
89 | system.time(cc.crop <- crop(cc, barn)) # you can use system.time() to time a process.
90 | plot(barn)
91 | # the masking can be done in parallel - this won't work on the VM if you have only
92 | # allocated one core
93 | # beginCluster()
94 | # system.time(cc.mask <- clusterR(cc.crop, mask, args=list(mask=barn)))
95 | # endCluster()
96 |
97 | # Same code but single core processing - for work in OSGeo-Live VM
98 | system.time(cc.mask <- mask(cc.crop, barn))
99 | plot(cc.mask)
100 |
101 | # Extract raster values to points - think of this as linking field data with image data.
102 | # We want to know the %canopy cover and %impervious surface
103 | # at each point. Let's go back to the bigger data set and work in parallel!
104 | # extract() is one of several commands that automatically run in parallel if you start a
105 | # cluster.
106 | # beginCluster() # cluster won't work on OSGeo-Live VM
107 | system.time(birds.int$canopy_cover <- extract(cc, birds.int))
108 | system.time(birds.int$imp_surface <- extract(imp, birds.int))
109 | # endCluster()
110 |
111 | # Now let's see what we got
112 | head(birds.int)
113 | summary(birds.int)
114 |
115 | # Cool! Now let's REALLY see what it looks like. Start basic with canopy cover:
116 | hist(birds.int$canopy_cover, breaks=20)
117 | # Density plot
118 | hist(birds.int$canopy_cover, freq=F)
119 |
120 | # now let's try it with ggplot
121 | ggplot(birds.int@data, aes(canopy_cover)) + geom_histogram(binwidth = 3) # basic
122 | ggplot(birds.int@data, aes(canopy_cover, fill=COMMON.NAME)) + geom_histogram(binwidth = 3) # stack based on species
123 | ggplot(birds.int@data, aes(canopy_cover, fill=COMMON.NAME)) + geom_density() # smoothed kernel density plot ... hmm.
124 | ggplot(birds.int@data, aes(canopy_cover, color=COMMON.NAME)) + geom_density() # smoothed density plot OR
125 | ggplot(birds.int@data, aes(canopy_cover, fill=COMMON.NAME, color=COMMON.NAME)) + geom_density(alpha=0.1) # smoothed density plot
126 | # Add some labels?
127 | p <- ggplot(birds.int@data, aes(canopy_cover, fill=COMMON.NAME, color=COMMON.NAME)) +
128 | geom_density(alpha=0.1) + xlab("% Canopy Cover") + ggtitle("Species Distribution by Canopy Cover") +
129 | scale_fill_discrete(name="Common Name") + scale_color_discrete(name="Common Name") # These two are just to change the legend title.
130 | p
131 |
132 | # What if we want a different plot per species?
133 | p + facet_wrap(~COMMON.NAME)
134 | # or a grid of x by y: state by species
135 | p + facet_grid(STATE_PROVINCE ~ COMMON.NAME)
136 | # or species on same plot, but by state?
137 | p + facet_wrap(~STATE_PROVINCE, ncol=3)
138 |
139 |
140 | # Ok, let's say we like this last one and want to save it. There are two ways:
141 | # 1. ggsave will attempt to save your last plot (unless you define a different one) with sensible defaults.
142 | ggsave("birds_densityPlot.png") # will save to your working directory unless you specify a path.
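# If you want more control, ggsave() also accepts an explicit plot object, dimensions, and resolution.
# A sketch, using the faceted plot from above - the file name, size, and dpi here are just example values:
ggsave("birds_densityPlot_byState.png", plot = p + facet_wrap(~STATE_PROVINCE, ncol=3),
       width = 10, height = 6, dpi = 300)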
143 |
144 | # 2. Open a graphics device and save. First, need to set your plot to a variable
145 | p <- ggplot(birds.int@data, aes(canopy_cover, fill=COMMON.NAME, color=COMMON.NAME)) +
146 | geom_density(alpha=0.1) + xlab("% Canopy Cover") + ggtitle("Species Distribution by Canopy Cover") +
147 | scale_fill_discrete(name="Common Name") + scale_color_discrete(name="Common Name")
148 | # Anything I plot or print underneath this command will go to the pdf device rather than the interactive plot window
149 | # or the console.
150 | pdf("birds_density.pdf", width=6.5, height=5)
151 | print(p)
152 | dev.off() # Be sure to turn off your device
153 |
154 | # Box plot of impervious surface by year and species
155 | ggplot(birds.int@data, aes(x=as.character(year), y=imp_surface, fill=COMMON.NAME)) + geom_boxplot(outlier.shape=1)
156 | # Wow, those outliers are distracting and are actually a small % of the data! Let's not show them.
157 | imp.p <- ggplot(birds.int@data, aes(x=as.character(year), y=imp_surface, fill=COMMON.NAME)) + geom_boxplot(outlier.shape=NA)
158 | cc.p <- ggplot(birds.int@data, aes(x=as.character(year), y=canopy_cover, fill=COMMON.NAME)) + geom_boxplot(outlier.shape=NA)
159 |
160 | plot(imp.p)
161 | plot(cc.p)
162 |
163 | # Hmm, maybe the canopy cover might tell a better story if we classified the image
164 | # from a continuous layer into a thematic layer? You can easily classify images in R. Of course,
165 | # we could just reclassify the values we extracted to our bird points, but that's no fun!
166 | # beginCluster() # Cluster code here for later when you're on a multi-core machine.
167 | # Note: if this were a more complex reclassification (e.g., c(0,2,1, 2,5,2, 4,10,3)), we could create
168 | # a 3-column matrix to pass to the rcl argument - see the sketch after the reclassify call below. This will take a few minutes!
169 | # system.time(cc.reclass <- clusterR(cc, reclassify, args=list(rcl=c(0,30,1, 30,60,2, 60,100,3))))
170 | # plot(cc.reclass)
171 | # Same code single core:
172 | system.time(cc.reclass <- reclassify(cc, rcl=c(0,30,1, 30,60,2, 60,100,3)))
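# The same breakpoints expressed in the 3-column matrix form of rcl (from, to, becomes) -
# just a sketch to show the structure; it produces the same classes as the vector above.
rcl.mat <- matrix(c(0, 30, 1,
                    30, 60, 2,
                    60, 100, 3), ncol = 3, byrow = TRUE)
# system.time(cc.reclass <- reclassify(cc, rcl = rcl.mat))  # equivalent to the call above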
173 |
174 | # Extract values to points again:
175 | birds.int$canopy_class <- extract(cc.reclass, birds.int)
176 | # endCluster()
177 |
178 | # What are the first 3 values in our new canopy class field?
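# One way to check (a sketch):
head(birds.int$canopy_class, 3)
# or equivalently: birds.int$canopy_class[1:3]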
179 |
180 | # Pause to write birds.int (with our extracted rasters) and county.ne to rdata file.
181 | save(birds.int, county.ne, file="birds_counties_extract.RDATA")
182 |
183 | # Bar plot!
184 | ggplot(data=birds.int@data, aes(x=canopy_class, fill=COMMON.NAME)) + geom_bar(width=0.75)
185 | # or
186 | ggplot(data=birds.int@data, aes(x=canopy_class, fill=COMMON.NAME)) + geom_bar(width=0.75, position = 'dodge')
187 |
188 | # One more cool visual we can do straight from R after doing some spatial processing.
189 | # This is helpful for you modelers - especially when you have lots of variables!
190 | ggpairs(birds.int@data[, c("canopy_cover", "imp_surface", "canopy_class")])
191 | # To get the actual correlation numbers:
192 | cors <- cor(birds.int@data[, c("canopy_cover", "imp_surface", "canopy_class")])
193 | cors
194 | # How hard is it to do a simple model?
195 | mod <- lm(imp_surface ~ canopy_cover, data=birds.int@data)
196 | summary(mod)
197 | plot(mod)
198 |
199 | ######### SAT IMAGE ###########
200 | # Now let's work with some satellite imagery.
201 | # Let's list the TIF files within the ls.dir
202 | ls.files <- list.files(ls.dir, "*.TIF$", full.names = T)
203 | ls.files # We have bands 4, 5, 6 from a Landsat 8 image from July 4, 2016 (red, NIR, SWIR)
204 |
205 | # Even though they were delivered to us as individual bands, we can open them as a stack.
206 | ls <- stack(ls.files) # You may get warnings here - ignore.
207 | ls
208 |
209 | # Let's look:
210 | plot(ls)
211 | levelplot(ls)
212 | levelplot(ls[[1]])
213 |
214 | # Hmm, they both plot the bands individually. Can we look at them together?
215 | plotRGB(ls, 3,2,1, scale=65535)
216 |
217 | # You can do math on rasters just like other objects in R.
218 | # Look at the min/max stats on band 1:
219 | ls[[1]]
220 | ls * 10
221 |
222 | # Let's calculate our own band: NDVI = (NIR - R)/(NIR + R) = greenness
223 | system.time(ndvi <- (ls[[2]] - ls[[1]])/(ls[[2]] + ls[[1]]))
224 | ndvi
225 | plot(ndvi)
226 |
227 | # Maybe for this exercise we don't need 30 m data - let's aggregate to 90 m, often a better
228 | # and quicker option than resampling.
229 | ndvi.agg <- aggregate(ndvi, 3, mean)
230 | ndvi.agg
231 | plot(ndvi.agg)
232 |
233 | # Discuss resample() as a method for snapping one raster to another - too slow to run live on our VM, but see the sketch below.
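# A sketch of what that would look like, left commented out because it's slow on the VM:
# snap the 30 m NDVI onto the grid of the aggregated 90 m layer using bilinear interpolation.
# ndvi.snap <- resample(ndvi, ndvi.agg, method = "bilinear")
# ndvi.snap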
234 |
235 |
236 |
237 |
238 |
--------------------------------------------------------------------------------
/Workshop_rstats/README.md:
--------------------------------------------------------------------------------
1 | # Geo with R? Yes we Can!
2 | ### Tina A. Cormier
3 |
4 |
5 | ## :heavy_check_mark: Pre-workshop Instructions:
6 |
7 | To cut down on setup during our short time together, please do the following **prior to the workshop**:
8 |
9 | 1. *Download* and *install* VirtualBox and *download* and *unzip* OSGeo-Live using the links found on the [OSGeo-Live website](https://live.osgeo.org/en/quickstart/virtualization_quickstart.html).
10 |    * OSGeo-Live took about 30-35 minutes to download for me with decent speeds, so please don't wait until the workshop to download it (speeds are expected to be slower with everyone hitting the network at the same time).
11 |
12 | * You do not need to set up your virtual machine. We will do that together and make some custom tweaks to the settings for a better experience!
13 |
14 | 2. Download the [workshop data](https://drive.google.com/open?id=0B4DQJSUPD0brVktPSXZFcmx2MEU) into a folder on your machine called `R_workshop`. You should now have a directory structure like `/[your_path]/R_workshop/data/`. You should also have a directory structure like `/[your_path]/R_workshop/code/` that contains the code in this repository.
15 |
16 |
17 | ## It's Workshop Day :clap:
18 |
19 | ##### Getting Started
20 | ###### VB setup
21 | 1. Start the VirtualBox application and click on the New button to create a new VM, and then Next.
22 | 2. Enter a name such as OSGeo-Live, and choose Linux as the “Operating system”, and Ubuntu as the “Version”.
23 | 3. In the next screen set the memory to *at least* 1024 MB (or more if your host computer has more than 4GB). On my 16GB machine, I chose 8192 MB. More than that froze my computer.
24 | 4. Continue to the next screen and choose “Use existing hard disk”. Now click on the button (a folder icon) to browse to where you saved the OSGeo-Live vmdk file. Select this file, press Next and Create.
25 | 5. Once the VM is created, click on the Settings button. In the “General” section, go to the Advanced tab, and select “Bidirectional” for the shared clipboard (this doesn't really work but it makes me feel better to choose it).
26 | 6. Go to the “Display” section and increase video memory to 32 or 64 MB.
27 | 7. In the “Shared Folders” section, click the “Add folder” (green + icon on the right), browse to your workshop "data" directory, and add it. Also select Auto-mount.
28 | 8. Now boot up the VM by clicking the Start (green arrow) button.
29 |
30 | ###### The remainder of the instructions are to be run from within the VM.
31 |
32 | 9. Under menu (bottom left button) -> Preferences -> Monitor Settings, you can adjust the display resolution to better fit your monitor. Be careful not to make it so large that the menu bar goes off your screen - it's a pain to get it back. Use the "Apply" button to test different settings before choosing one to save. Alt+F1 will get you back to the menu if you lose it off the screen, but just don't do that.
33 | 10. Add yourself to the vboxsf group so that the shared folders (defined above) are accessible. Open a terminal and enter the following command:
34 |
35 | ``` sudo usermod -a -G vboxsf user ```
36 |
37 | 11. Above, we defined a Shared Folder path on the host system and named it “R_workshop” in the VM Settings. The shared folder will appear in the file system under /media/sf_R_workshop/. To mount this folder in the user’s home directory, enter the following two commands into the terminal (make sure you are in `/home/user/` first; type `pwd` to check):
38 |
39 | ```mkdir R_workshop```
40 |
41 | ```sudo mount -t vboxsf -o uid=user,rw R_workshop /home/user/R_workshop```
42 |
43 | *Note* that the mount command will need to be run each time you log in to the VM.
44 |
45 | 12. Change directories into R_workshop (`cd R_workshop`). You should have two folders there: code and data. CD into the code directory and run the following commands:
46 |
47 | `chmod 777 VM_setup.sh`
48 |
49 | `./VM_setup.sh`
50 |
51 | 13. Go to https://www.rstudio.com/products/rstudio/download/#download and download the version named: **RStudio 1.0.153 - Ubuntu 16.04+/Debian 9+ (64-bit)**.
52 |
53 |
54 |
55 |
--------------------------------------------------------------------------------
/Workshop_rstats/VM_setup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | sudo apt-get install libgdal1-dev libproj-dev libudunits2-dev libv8-3.14-dev imagemagick
4 | sudo apt install apt-file
5 | sudo apt-file update
--------------------------------------------------------------------------------