├── 00_extract_datasets.R ├── 01_PlotSelection.R ├── 02_BuildDataset.Rmd ├── 03_AuthorList.R ├── 04_FiguresTables.Rmd ├── 05_Demo.Rmd ├── 998_fixSplotOpen.R ├── README.md ├── _public ├── 02_BuildDataset.html ├── 04_FiguresTables.html └── 05_Demo.pdf ├── _resampling ├── 01_running_a_global_PCA_on_bioclimatic_and_soil_variables.R ├── 02_resampling_sPlot_within_the_PC1-PC2_environmental_space.R ├── 03_extracting_selected_plots_from_the_sPlot_database.R ├── README.md └── _functions_TH │ ├── BetaJtu_OpenMP.cpp │ ├── BetaJtu_RcppParallel.cpp │ ├── bray.part.C_RcppParallel.cpp │ ├── bray.part.OpenMP.cpp │ ├── cast.cpp │ ├── cast_binary.cpp │ └── hcr.C.cpp └── sPlot2_PlotDistribution.R /00_extract_datasets.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | load("/data/sPlot/releases/sPlot2.1/sPlot_header_20161124.RData") 3 | load("/data/sPlot/releases/sPlot2.1/DT2_20161025.RData") 4 | 5 | 6 | 7 | ###run Header fix 8 | affiliations <- fread("../_sPlot_Management/Affiliations_20180704.csv") 9 | databases <- fread("../_sPlot_Management/Databases.out_20180704.csv") 10 | sel <- fread("_data/Resampled1.csv")$x 11 | 12 | header.ress <- header.fix %>% 13 | filter(PlotID %in% sel) 14 | 15 | 16 | res.source <- header.fix %>% 17 | dplyr::select(PlotID, Dataset, `GIVD ID`) %>% 18 | filter(PlotID %in% sel) %>% 19 | group_by(`GIVD ID`) %>% 20 | summarise(num.plots.res=n()) %>% 21 | distinct() %>% 22 | left_join(databases %>% 23 | filter(label!="BIOTA_South_Africa_2") %>% ### two dataset labels for the same GIVD 24 | dplyr::select(`GIVD ID`, Custodian, `Deputy custodian`), by="GIVD ID") %>% 25 | left_join(header.fix %>% 26 | group_by(`GIVD ID`) %>% 27 | summarise(tot.plots=n()), by="GIVD ID") %>% 28 | left_join(affiliations %>% 29 | filter(Sequence_affiliations==1) %>% 30 | dplyr::select(Name, `E-Mail`) %>% 31 | rename(Custodian=Name), by="Custodian") %>% 32 | rename(`Custodian E-Mail`=`E-Mail`) %>% 33 | left_join(affiliations %>% 34 | filter(Sequence_affiliations==1) %>% 35 | dplyr::select(Name, `E-Mail`) %>% 36 | rename(`Deputy custodian`=Name), by="Deputy custodian") %>% 37 | rename(`Deputy custodian E-Mail`=`E-Mail`) %>% 38 | left_join(header.fix %>% 39 | dplyr::select(`GIVD ID`, Dataset) %>% 40 | filter(Dataset!="BIOTA_South_Africa_2") %>% ### two dataset labels for the same GIVD 41 | distinct(), by="GIVD ID") %>% 42 | mutate(perc.plots=round(num.plots.res/tot.plots*100,1)) %>% 43 | dplyr::select(`GIVD ID`,Dataset, num.plots.res, tot.plots, perc.plots, Custodian, `Deputy custodian`, 44 | `Custodian E-Mail`, `Deputy custodian E-Mail`)# %>% 45 | arrange(desc(perc.plots)) 46 | 47 | 48 | 49 | 50 | fwrite(res.source[,1:6], "_output/resampling_source_custodian_perc.csv") 51 | 52 | 53 | #### create figure to compare plot density before and after resampling in environmental space 54 | 55 | 56 | 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /01_PlotSelection.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | ##### code from HELGE BRUELHEIDE 3 | 4 | load("/data/sPlot2.0/plot_data.RData") # server directory 5 | # selected pool of plots by Jonathan 6 | str(plot_data) 7 | dim(plot_data) #799400 54 8 | 9 | load("c:\\Daten\\iDiv2\\sPlot2\\JonathanLenoir\\plotToRemove.RData") 10 | 11 | str(output) 12 | output[[1]] 13 | plotToRemove <- output[[2]] 14 | rm(output) 15 | class(plotToRemove) 16 | length(plotToRemove[[1]]) # First iteration containing 
700037 IDs 17 | for (i in 1:100) { 18 | plotToRemove[[i]] <- na.omit(plotToRemove[[i]]) 19 | } 20 | str(plotToRemove) 21 | # List of 100 22 | str(plotToRemove[[1]]) #atomic [1:700036] 23 | 24 | posit <- match(plotToRemove[[1]], plot_data$PlotID) 25 | # this gives the index for the first set, which we have chosen for paper #2. 26 | 27 | plot_data_sel1 <- plot_data[-posit, ] 28 | dim(plot_data_sel1)[1] # 99364 selected relev?s 29 | # and this gives the list of plots selected 30 | 31 | #If you repeat this three times 32 | posit <- match(plotToRemove[[2]], plot_data$PlotID) 33 | plot_data_sel2 <- plot_data[-posit, ] 34 | posit <- match(plotToRemove[[3]], plot_data$PlotID) 35 | plot_data_sel3 <- plot_data[-posit, ] 36 | str(plot_data_sel1) 37 | PlotID_run_1_2_3 <- 38 | sort(unique(c(plot_data_sel1$PlotObservationID,plot_data_sel2$PlotObservationID, 39 | plot_data_sel3$PlotObservationID))) 40 | str(PlotID_run_1_2_3) 41 | # int [1:150116] 16 17 18 20 22 23 26 30 31 41 ... 42 | 43 | ##### FMS TAKES OVER FROM HERE ##### 44 | ## reimport selected plots 45 | PlotID123 <- read_csv("_data/PlotID_run_1_2_3.csv") %>% 46 | rename(PlotObservationID=x) 47 | #load header and fix 48 | load("/data/sPlot/releases/sPlot2.1/sPlot_header_20161124.RData") 49 | source("/data/sPlot/users/Francesco/_sPlot_Management/Fix.header.R") 50 | header <- fix.header(header, exclude.sophy = F) 51 | databases <- read_csv("/data/sPlot/users/Francesco/_sPlot_Management/Consortium/Databases.out.csv") 52 | 53 | 54 | header.sel <- header %>% 55 | filter(PlotObservationID %in% PlotID123$PlotObservationID) 56 | 57 | 58 | 59 | ### Data from Stephan - Check for ID matches 60 | header.TV <- read_delim(file = "_data/sPlot-2.1_header.csv", delim = "\t", 61 | col_types = cols(PlotObservationID = col_double(), 62 | PlotID = col_double(), 63 | `TV2 relevé number` = col_double(), 64 | `Original nr in database` = col_character(), 65 | ORIGDB_NR = col_character(), 66 | ORIGDB_NR_1 = col_character(), 67 | ORIG_REL_N = col_character(), 68 | Longitude = col_double(), 69 | Latitude = col_double(), 70 | `Location uncertainty (m)` = col_double(), 71 | Dataset = col_character())) 72 | 73 | ## import answers from custodians 74 | answers <- read_csv(file="_management/resampling_answers.csv") 75 | 76 | header.sel.TV <- header.sel %>% 77 | left_join(header.TV %>% 78 | dplyr::select(PlotObservationID, `TV2 relevé number`:ORIG_REL_N), 79 | by="PlotObservationID") %>% 80 | dplyr::select(PlotObservationID, Dataset, `GIVD ID`, Longitude:Latitude, Country, `Location uncertainty (m)`, `TV2 relevé number`, `Original nr in database`, ORIG_REL_N) %>% 81 | left_join(databases %>% 82 | dplyr::select(`GIVD ID`, Custodian, `Deputy custodian`, `Still in sPlot`))# %>% 83 | ## join answers from google spreadsheet 84 | #left_join(answers) %>% 85 | #filter(`Yes/Conditional/No`!="Yes") 86 | 87 | write_csv(header.sel.TV, path="_output/header.sel123_tocheck_withIDs.csv") 88 | 89 | ### ADD NA-US-002 90 | header.sel.TV.add.NAUS002 <- header.sel.TV %>% 91 | filter(`GIVD ID`=="NA-US-002") 92 | write_csv(header.sel.TV.add.NAUS002, path="_output/header.sel123_addNAUS002.csv") 93 | 94 | ### ADD SOPHY 95 | header.sel.TV.Sophy <- header.sel.TV %>% 96 | filter(`GIVD ID`=="EU-FR-003") %>% 97 | dplyr::select(-`Still in sPlot`) %>% 98 | mutate(`Usable in sPlot Project 02`="") 99 | write_csv(header.sel.TV.Sophy, path="_output/header.sel123_Sophy.csv") 100 | 101 | ### EXTRACT RELEVE numbers for EU-CZ-001 102 | header.TV.joined <- header.TV %>% 103 | left_join(header %>% 104 | 
dplyr::select(-Longitude, -Latitude, -`Location uncertainty (m)`, -Dataset), 105 | by="PlotObservationID") %>% 106 | dplyr::select(PlotObservationID, Dataset, `GIVD ID`, Longitude:Latitude, Country, `Location uncertainty (m)`, `TV2 relevé number`, `Original nr in database`, ORIG_REL_N) %>% 107 | left_join(databases %>% 108 | dplyr::select(`GIVD ID`, Custodian, `Deputy custodian`, `Still in sPlot`)) 109 | header.sel.TV.EUCZ001 <- header.TV.joined %>% 110 | filter(`GIVD ID`=="EU-CZ-001") 111 | write_csv(header.sel.TV.EUCZ001, path="_output/header.sel123_EUCZ001.csv") 112 | 113 | ## extract RAINFOR Data + DAta From PERU 114 | header.sel.TV.0000001 <- header.TV.joined %>% 115 | filter(`GIVD ID`=="00-00-001") 116 | write_csv(header.sel.TV.0000001, path="_output/header.sel123_0000001.csv") 117 | 118 | header.sel.TV.Peru <- header.TV.joined %>% 119 | filter(Country=="Peru") 120 | write_csv(header.sel.TV.0000001, path="_output/header.Peru.csv") 121 | 122 | ### EXTRACT RELEVE NUMBERS FOR AS-KG-001 123 | header.sel.TV.add.ASKG001 <- header.sel.TV %>% 124 | filter(`GIVD ID`=="AS-KG-001") 125 | write_csv(header.sel.TV.add.ASKG001, path="_output/header.sel123_ASKG001.csv") 126 | 127 | ### EXTRACT RELEVE NUMBERS FOR SALVIAS 00-00-002 128 | header.sel.TV.add.SALVIAS <- header.sel.TV %>% 129 | filter(`GIVD ID`=="00-00-003") 130 | write_csv(header.sel.TV.add.SALVIAS, path="_output/header.sel123_SALVIAS.csv") 131 | 132 | 133 | ##summarize 134 | header.sel.TV.summary <- header.sel %>% 135 | group_by(`GIVD ID`) %>% 136 | summarize(n.sel.plot=n()) %>% 137 | left_join(header %>% 138 | group_by(`GIVD ID`) %>% 139 | summarize(n.tot.plot=n()), 140 | by="GIVD ID") %>% 141 | mutate(share.perc=n.sel.plot/n.tot.plot*100) 142 | 143 | 144 | 145 | ## Prepare demonstration subset for Miguel Alvarez, custodian of SWEA 146 | load("/data/sPlot/releases/sPlot2.1/DT2_20161025.RData") 147 | swea.header <- header.sel %>% 148 | filter(`GIVD ID`=="AF-00-006") 149 | 150 | swea.IDs <- header.sel.TV %>% 151 | filter(`GIVD ID`=="AF-00-006") %>% 152 | dplyr::select(-Custodian, -`Deputy custodian`, -`Still in sPlot`) 153 | 154 | swea.dt <- DT2 %>% 155 | filter(PlotObservationID %in% (swea.header %>% 156 | pull(PlotObservationID))) %>% 157 | dplyr::select(-Taxon.group, -Layer) 158 | 159 | write_csv(swea.header, path="_output/header.SWEA.csv") 160 | write_csv(swea.IDs, path="_output/IDs.SWEA.csv") 161 | write_csv(swea.dt, path="_output/DT2.SWEA.csv") 162 | 163 | 164 | 165 | #################################################################################################################### 166 | ######### Reimport answers from dataset custodians and build redundant selection of plots that can be used ######### 167 | #################################################################################################################### 168 | rm(list=ls()) 169 | ## reimport selected plots 170 | PlotID123 <- read_csv("_data/PlotID_run_1_2_3.csv") %>% 171 | rename(PlotObservationID=x) 172 | #load header and fix 173 | load("/data/sPlot/releases/sPlot2.1/sPlot_header_20161124.RData") 174 | 175 | ## GIT checkout to last stable version for sPlot 2.1 in GIT for consortium management data 176 | source("/data/sPlot/users/Francesco/_sPlot_Management/Fix.header.R") 177 | header <- fix.header(header, exclude.sophy = F) 178 | # GIT checkout baco to master! 
- i.e., switch back to most up-to-date version in GIT consortium 179 | databases <- read_csv("/data/sPlot/users/Francesco/_sPlot_Management/Consortium/Databases.out.csv") 180 | 181 | header.sel <- header %>% 182 | filter(PlotObservationID %in% PlotID123$PlotObservationID) 183 | 184 | ### Data from Stephan - Check for ID matches 185 | header.TV <- read_delim(file = "_data/sPlot-2.1_header.csv", delim = "\t", 186 | col_types = cols(PlotObservationID = col_double(), 187 | PlotID = col_double(), 188 | `TV2 relevé number` = col_double(), 189 | `Original nr in database` = col_character(), 190 | ORIGDB_NR = col_character(), 191 | ORIGDB_NR_1 = col_character(), 192 | ORIG_REL_N = col_character(), 193 | Longitude = col_double(), 194 | Latitude = col_double(), 195 | `Location uncertainty (m)` = col_double(), 196 | Dataset = col_character())) 197 | 198 | ## import answers from custodians GIVD level 199 | library(openxlsx) 200 | #SWEA CONFIRMED ! - reimport answers 201 | 202 | #answers <- read_csv(file="_management/resampling_answers.csv") 203 | answers <- openxlsx::read.xlsx("_management/resampling_answers.xlsx", sheet = 2) 204 | answers <- answers %>% 205 | mutate(`Yes/Conditional/No`=fct_recode(`Yes/Conditional/No`, No="NO", Yes="yes")) %>% 206 | # Manually set some dataset to yes 207 | # Rasmus Revermann and Donald Walker's acceptance is conditional, 208 | # but depends on conditions others than the selection of plot 209 | # Brian Enquist confirmed for SALVIAS [29.04.2020] 210 | mutate(`Yes/Conditional/No`=replace(`Yes/Conditional/No`, 211 | list=GIVD.ID %in% c("NA-US-014","AF-00-009", 212 | "AF-00-006", "00-00-003"), 213 | values="Yes")) 214 | 215 | # join header with Turboveg information by PlotObservationID, and attach dataset-level answers from custodians 216 | header.sel.TV <- header.sel %>% 217 | left_join(header.TV %>% 218 | dplyr::select(PlotObservationID, `TV2 relevé number`:ORIG_REL_N), 219 | by="PlotObservationID") %>% 220 | dplyr::select(PlotObservationID, Dataset, `GIVD ID`, Longitude:Latitude, Country, `Location uncertainty (m)`, `TV2 relevé number`, `Original nr in database`, ORIG_REL_N) %>% 221 | left_join(databases %>% 222 | dplyr::select(`GIVD ID`, Custodian, `Deputy custodian`, `Still in sPlot`)) %>% 223 | left_join(answers %>% 224 | rename(`GIVD ID`=GIVD.ID)) 225 | 226 | ## import google spreadsheet with plots marked individually 227 | plots.checked <- openxlsx::read.xlsx("_management/header.sel123_checked_20200319.xlsx", sheet = 1) 228 | plots.checked <- plots.checked %>% 229 | as.tbl() %>% 230 | mutate(Usable.in.Paper.02=ifelse(GIVD.ID=="00-RU-002", NA, Usable.in.Paper.02)) %>% 231 | dplyr::select(PlotObservationID, Usable.in.Paper.02) 232 | 233 | 234 | 235 | 236 | ## DB whose plots are marked in other sources: 237 | # CZ 238 | # Siberia - 00-RU-002 [Milan] 239 | # Germany 240 | # NA-US-002 241 | 242 | ## import answers from custodians GIVD level 243 | ## Czechia - EU-CZ-001 244 | CZ.id <- read_csv("_management/Header_EUCZ001_20191029_checked.csv") 245 | CZ.id <- CZ.id %>% 246 | mutate(Canbeused=ifelse(Canbeused=="Y", "Yes", "No")) %>% 247 | dplyr::select(PlotObservationID, Usable.in.Paper.02=Canbeused) 248 | 249 | ## GVRD 250 | GVRD <- openxlsx::read.xlsx("_management/GVRD_check_openaccess_short.xlsx", sheet = 1, rowNames=F) 251 | GVRD <- header.sel.TV %>% 252 | filter(`GIVD ID`=="EU-DE-014") %>% 253 | dplyr::select(PlotObservationID, `GIVD ID`, `TV2 relevé number`) %>% 254 | left_join(GVRD %>% 255 | as.tbl() %>% 256 | 
mutate(Usable.in.Paper.02=ifelse(Usable.in.Paper.02=="YES", "Yes", "No")) %>% 257 | dplyr::select(`GIVD ID`=GIVD.ID, `TV2 relevé number`=TV2.relevé.number, Usable.in.Paper.02), 258 | by=c("GIVD ID", "TV2 relevé number")) %>% 259 | dplyr::select(-`TV2 relevé number`, -`GIVD ID`) 260 | 261 | ## Siberia - This part of code ONLY selects the usable plots. Header and DT will need to be replaced with the new version afterwards 262 | ### Import updated data from 00-RU-002 263 | Siberia.public <- openxlsx::read.xlsx("_management/Siberia_update/00-RU-002-potentially-public-plots.xlsx", sheet=1) 264 | Siberia.new <- read_delim("_management/Siberia_update/Siberia_Chytry_header.csv", delim="\t", 265 | col_types = cols(PlotObservationID = col_double(), 266 | PlotID = col_double(), 267 | `TV2 relevé number` = col_double(), 268 | Country = col_character(), 269 | `Cover abundance scale` = col_character(), 270 | `Date of recording` = col_character(), 271 | `Relevé area (m²)` = col_double(), 272 | `Altitude (m)` = col_double(), 273 | `Aspect (°)` = col_logical(), 274 | `Slope (°)` = col_logical(), 275 | `Cover total (%)` = col_logical(), 276 | `Cover tree layer (%)` = col_double(), 277 | `Cover shrub layer (%)` = col_double(), 278 | `Cover herb layer (%)` = col_double(), 279 | `Cover moss layer (%)` = col_double(), 280 | `Cover lichen layer (%)` = col_logical(), 281 | `Cover algae layer (%)` = col_logical(), 282 | `Cover litter layer (%)` = col_double(), 283 | `Cover open water (%)` = col_double(), 284 | `Cover bare rock (%)` = col_double(), 285 | `Height (highest) trees (m)` = col_double(), 286 | `Height lowest trees (m)` = col_double(), 287 | `Height (highest) shrubs (m)` = col_double(), 288 | `Height lowest shrubs (m)` = col_double(), 289 | `Aver. height (high) herbs (cm)` = col_double(), 290 | `Aver. 
height lowest herbs (cm)` = col_double(), 291 | `Maximum height herbs (cm)` = col_double(), 292 | `Maximum height cryptogams (mm)` = col_double(), 293 | `Mosses identified (y/n)` = col_character(), 294 | `Lichens identified (y/n)` = col_character(), 295 | COMMUNITY = col_character(), 296 | SUBSTRATE = col_character(), 297 | Locality = col_character(), 298 | ORIG_NUM = col_character(), 299 | ALLIAN_REV = col_character(), 300 | REV_AUTHOR = col_character(), 301 | Forest = col_logical(), 302 | Grassland = col_logical(), 303 | Wetland = col_logical(), 304 | `Sparse vegetation` = col_logical(), 305 | Shrubland = col_logical(), 306 | `Plants recorded` = col_character(), 307 | `Herbs identified (y/n)` = col_character(), 308 | Naturalness = col_integer(), 309 | EUNIS = col_character(), 310 | FIELD_NO = col_character(), 311 | Longitude = col_double(), 312 | Latitude = col_double(), 313 | `Location uncertainty (m)` = col_double(), 314 | Dataset = col_character(), 315 | `Access regime` = col_character())) %>% 316 | mutate(Usable.in.Paper.02=ifelse(FIELD_NO %in% Siberia.public$FIELD_NO, "Yes", "No")) %>% 317 | dplyr::select(-PlotObservationID) %>% 318 | # attach PlotObservationID from header 319 | left_join(header.sel %>% 320 | filter(Dataset=="Siberia_Chytry") %>% 321 | dplyr::select(PlotObservationID) %>% 322 | left_join(header.TV %>% 323 | dplyr::select(PlotObservationID, `TV2 relevé number`), 324 | by="PlotObservationID"), 325 | by=c("TV2 relevé number")) %>% 326 | dplyr::select(PlotObservationID, Usable.in.Paper.02) %>% 327 | # exclude new plots not in sPlot 2.1 328 | filter(!is.na(PlotObservationID)) 329 | 330 | ### NA-US-002 - USA_vegbank 331 | vegbank0 <- openxlsx::read.xlsx("_management/sPlot-Open-NA-US-002.xlsx", sheet=1) 332 | vegbank <- vegbank0 %>% 333 | mutate(Bob.answer=replace(Bob.answer, 334 | list=Bob.answer %in% c("No ", "no"), 335 | values="No")) %>% 336 | mutate(Bob.answer=replace(Bob.answer, 337 | list=Bob.answer=="yes", 338 | values="Yes")) %>% 339 | mutate(`Usable.in.Paper.02`=Bob.answer) %>% 340 | dplyr::select(PlotObservationID, Usable.in.Paper.02) %>% 341 | filter(!is.na(PlotObservationID)) 342 | 343 | ## SOPHY 344 | sophy <- read_csv("_management/RLV_espaces_naturels-buffer_10m.csv") %>% 345 | dplyr::select(PlotObservationID=PlotObserv) %>% 346 | mutate(Usable.in.Paper.02="Yes") 347 | 348 | 349 | ### the turboveg codes of the new and old database match, but there are many new plots in Siberia.new. 
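## Optional sanity checks (a minimal sketch, assuming the Siberia.new pipe above
## ran unchanged, so the object holds only PlotObservationID and Usable.in.Paper.02):
## report rows duplicated by the TV2-relevé-number join, if any, and tally how many
## of the matched Siberia plots were flagged as usable before coalescing below.
Siberia.new %>% filter(duplicated(PlotObservationID))  # duplicated plot IDs, if any
Siberia.new %>% count(Usable.in.Paper.02)              # usable vs. non-usable Siberia plots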
350 | # check <- header.sel %>% 351 | # filter(Dataset=="Siberia_Chytry") %>% 352 | # left_join(header.TV %>% 353 | # dplyr::select(PlotObservationID, `TV2 relevé number`:ORIG_REL_N), 354 | # by="PlotObservationID") %>% 355 | # dplyr::select(Dataset, PlotObservationID, `Original nr in database`, `GIVD ID`, Country:`Slope (°)`, Longitude:Latitude, `Location uncertainty (m)`, `TV2 relevé number`) %>% 356 | # left_join(Siberia.new %>% 357 | # dplyr::select(Dataset, PlotID:`Slope (°)`, Latitude, Longitude), 358 | # by=c("Dataset", "TV2 relevé number")) 359 | 360 | 361 | 362 | 363 | 364 | 365 | ### Coalesce files 366 | header.sel.final <- header.sel.TV %>% 367 | dplyr::select(-Longitude, -Latitude, -`Location uncertainty (m)`, -ORIG_REL_N, -`TV2 relevé number`, -`Original nr in database`) %>% 368 | left_join(plots.checked, by="PlotObservationID") %>% 369 | left_join(CZ.id, by="PlotObservationID") %>% 370 | left_join(GVRD, by="PlotObservationID") %>% 371 | left_join(Siberia.new, by="PlotObservationID") %>% 372 | left_join(vegbank, by="PlotObservationID") %>% 373 | mutate(Usable=ifelse(`Yes/Conditional/No`=="Yes", "Yes", NA)) %>% 374 | mutate(Usable=ifelse(`Yes/Conditional/No`=="No", "No", Usable)) %>% 375 | mutate(Usable=coalesce(Usable, Usable.in.Paper.02.x, Usable.in.Paper.02.y, Usable.in.Paper.02.x.x, Usable.in.Paper.02.y.y, Usable.in.Paper.02)) %>% 376 | dplyr::select(-Usable.in.Paper.02.x, -Usable.in.Paper.02.y, -Usable.in.Paper.02.x.x, -Usable.in.Paper.02.y.y, -Usable.in.Paper.02) %>% 377 | dplyr::select(-`Still in sPlot`, -`Yes/Conditional/No`) %>% 378 | ##join France data from SOPHY 379 | left_join(sophy, by="PlotObservationID") %>% 380 | mutate(Usable=coalesce(Usable, Usable.in.Paper.02)) %>% 381 | mutate(Usable=ifelse(`GIVD ID`=="EU-FR-003" & is.na(Usable), 382 | "No", Usable)) %>% 383 | dplyr::select(-Usable.in.Paper.02) %>% 384 | mutate(Usable=ifelse(is.na(Usable), "Unknown", Usable)) 385 | 386 | 387 | summary.sel.final <- header.sel.final %>% 388 | group_by(`GIVD ID`, Dataset, Custodian, `Deputy custodian`) %>% 389 | summarize(usable=sum(Usable=="Yes"), 390 | not.usable=sum(Usable=="No"), 391 | unknown=sum(Usable=="Unknown")) %>% 392 | arrange(desc(unknown), desc(not.usable), desc(usable), desc(`GIVD ID`)) 393 | 394 | print(summary.sel.final, n=20) 395 | #AS-KG-001# still waiting for plot selection 396 | #00-RU-001# need to be reimported from turboveg (asked Stephan Hennekens) 397 | 398 | write_csv(summary.sel.final, "_output/summary.sel.final.csv") 399 | write_csv(header.sel.final, "_output/header.sel.final.csv") 400 | 401 | 402 | ### Export plot ID of Angola data for Rasmus Revermann. Angola data will be updated. 403 | 404 | angola.sel <- header.sel.TV %>% 405 | filter(PlotObservationID %in% header.sel.final$PlotObservationID) %>% 406 | filter(Dataset=="Angola") %>% 407 | select(1:6, 8) 408 | write_csv(angola.sel, "_output/Angola_sel.csv") 409 | -------------------------------------------------------------------------------- /03_AuthorList.R: -------------------------------------------------------------------------------- 1 | ### Code to administer the author list and their affiliations 2 | ### of sPlotOpen_Manuscript 3 | ### This code also formats authors affiliations to Manubot's standards (yaml) 4 | library(tidyverse) 5 | library(stringr) 6 | filter <- dplyr::filter 7 | 8 | #### 0. 
Ancillary functions #### 9 | # Function 1 - Extract name initials 10 | # credits: https://codereview.stackexchange.com/questions/150624/extracting-initials-with-r 11 | initials <- function(full.name) { 12 | # Returns initials of a full name 13 | # Input will contain only letters (uppercase and/or lowercase) plus 14 | # single spaces between words. Folks like Joseph Gordon-Levitt, 15 | # Conan O’Brien, and David J. Malan won’t be using your program. (If only!) 16 | if (nchar(full.name) == 0) { 17 | stop ("Valid name please") 18 | } 19 | isspace <- integer(0) 20 | fn.split <- unlist(strsplit(full.name, fixed = TRUE, split = "")) 21 | isspace <- which(fn.split == " ") 22 | init <- toupper(fn.split[c(1, (isspace+1))]) 23 | paste(init, collapse = "") 24 | } 25 | 26 | #### Function 2 - format name, orcid, email and affiliation info into the metadata.yaml standard for manubot 27 | create.yaml <- function(x, file.output){ 28 | tmp <- affiliations %>% 29 | # mutate(github="") %>% 30 | filter(name==x) %>% 31 | pivot_longer(!Sequence_affiliations, names_to = "tag") %>% 32 | arrange(Sequence_affiliations) %>% 33 | dplyr::select(-Sequence_affiliations) %>% 34 | mutate(tag=factor(tag, levels=c("github", "name","initials", 35 | "orcid", "twitter", "email", 36 | "affiliations","correspondence", "symbol_str"))) %>% 37 | distinct() %>% 38 | bind_rows(data.frame(tag="affiliations", 39 | value=paste0("\n - ", 40 | paste({.} %>% 41 | filter(tag=="affiliations") %>% 42 | pull(value), 43 | collapse="\n - ")))) %>% 44 | group_by(tag) %>% 45 | slice(n()) %>% 46 | ungroup() %>% 47 | mutate(tag=as.character(tag)) %>% 48 | mutate(tag=ifelse(tag=="github", " - github", paste0(" ", tag))) %>% 49 | filter(!is.na(value)) %>% 50 | unite(tag:value, 51 | sep = ": ", 52 | col = "newtag", 53 | remove = T) %>% 54 | mutate(newtag=str_remove(newtag, pattern = '"')) %>% 55 | mutate(newtag=str_remove(newtag, pattern = '"')) %>% 56 | mutate(newtag=gsub(pattern="†", replacement = '"†"', x=newtag)) %>% 57 | mutate(newtag=str_replace_all(string = newtag, pattern=", , ", replacement = ", ")) %>% 58 | mutate(newtag=str_replace_all(string = newtag, pattern=", , ", replacement = ", ")) %>% 59 | mutate(newtag=str_replace_all(string = newtag, pattern=" NA,", replacement = "")) %>% 60 | mutate(newtag=gsub(pattern=", $", replacement = "", x=newtag)) 61 | 62 | write_lines(tmp[[1]], file = file.output, append=T) 63 | } 64 | 65 | 66 | ## function 3 - ### get affiliation data from google sheet 67 | get.affiliation.gs <- function(name.to.match, aff.gs){ 68 | n.tmp <- which(paste(aff.gs$`First Name`, aff.gs$`Last Name`, sep=" ") == name.to.match) 69 | aff.gs0 <- aff.gs %>% mutate_at(.vars=vars(starts_with("Postal code")), .funs=funs(as.character)) 70 | if(length(n.tmp)==0){stop("No matched name")} 71 | out <- aff.gs0[n.tmp,] %>% 72 | mutate(Name=paste(`First Name`, `Last Name`, sep=" ")) %>% 73 | mutate(Sequence_affiliations=1) %>% 74 | dplyr::select(Surname=`Last Name`, Name, Sequence_affiliations, `Preferred email`, `Alternative email`, `ORCID`, 75 | `Department/Institute/Faculty`, `University/Institution`, Street, `Postal code`, Town, Country) %>% 76 | rename(`E-Mail`=`Preferred email`) %>% 77 | rename(`Second E-Mail`=`Alternative email`) 78 | if(is.na(aff.gs0[n.tmp,]$"Would you like to add another affiliation")){return(out); stop()} 79 | if(aff.gs0[n.tmp,]$"Would you like to add another affiliation"=="Yes"){ 80 | out <- out %>% 81 | bind_rows(aff.gs0[n.tmp,] %>% 82 | mutate(Name=paste(`First Name`, `Last Name`, sep=" ")) %>% 83 | 
mutate(Sequence_affiliations=2) %>% 84 | dplyr::select(Surname=`Last Name`, Name, Sequence_affiliations, `Preferred email`, `Alternative email`, `ORCID`, 85 | `Department/Institute/Faculty_1`, `University/Institution_1`, Street_1, `Postal code_1`, Town_1, Country_1) %>% 86 | rename_at(.vars = vars(ends_with("_1")), 87 | .funs = funs(sub("_1", "", .))) %>% 88 | rename(`E-Mail`=`Preferred email`) %>% 89 | rename(`Second E-Mail`=`Alternative email`))} 90 | if(is.na(aff.gs0[n.tmp,]$"Would you like to add another affiliation_1")){return(out); stop()} 91 | if(aff.gs0[n.tmp,]$"Would you like to add another affiliation_1"=="Yes"){ 92 | out <- out %>% 93 | bind_rows(aff.gs0[n.tmp,] %>% 94 | mutate(Name=paste(`First Name`, `Last Name`, sep=" ")) %>% 95 | mutate(Sequence_affiliations=3) %>% 96 | dplyr::select(Surname=`Last Name`,Name, Sequence_affiliations, `Preferred email`, `Alternative email`, `ORCID`, 97 | `Department/Institute/Faculty_2`, `University/Institution_2`, Street_2, `Postal code_2`, Town_2, Country_2) %>% 98 | rename_at(.vars = vars(ends_with("_2")), 99 | .funs = funs(sub("_2", "", .))) %>% 100 | rename(`E-Mail`=`Preferred email`) %>% 101 | rename(`Second E-Mail`=`Alternative email`))} 102 | return(out) 103 | } 104 | 105 | 106 | #### 1. Import data #### 107 | ##import sPlotOpen data 108 | path <- "_sPlotOpenDB" 109 | load(file = file.path(path, "sPlotOpen.RData")) 110 | table1 <- read_csv("_output/Table1_Databases.csv") 111 | 112 | ## Import affiliation information of sPlot members 113 | allroles <- read_csv("/data/sPlot/users/Francesco/_sPlot_Management/Consortium/roles.csv") 114 | allaffiliations <- read_csv("/data/sPlot/users/Francesco/_sPlot_Management/Consortium/Affiliations.csv") %>% 115 | left_join(allroles %>% 116 | dplyr::select(Name,Surname), 117 | by="Name") 118 | 119 | #### 2. Start coauthor list #### 120 | #### 2.1. First authors, core sPlot + last author #### 121 | first <- allaffiliations %>% 122 | filter(Name %in% c("Francesco Maria Sabatini", "Jonathan Lenoir")) %>% 123 | arrange(Name) 124 | ##third author Tarek Habab 125 | third <- tibble(Name="Tarek Hattab", 126 | `E-Mail`="Tarek.Hattab@ifremer.fr", 127 | ORCID="0000-0002-1420-5758", 128 | `Department/Institute/Faculty`="CNRS, IFREMER and IRD", 129 | `University/Institution`="MARBEC, Univ Montpellier", 130 | Town="Sète", 131 | Country="France", 132 | Surname="Hattab") 133 | 134 | custodians <- unique(table1$Custodian) 135 | # Splot core team 136 | core <- allaffiliations %>% 137 | filter(Name %in% (allroles %>% 138 | filter(`Core team` == T) %>% 139 | filter(!Surname %in% c("Bruelheide", "Sabatini", "Lenoir")) %>% 140 | pull(Name))) %>% 141 | arrange(Surname) 142 | #last author 143 | last <- allaffiliations %>% 144 | filter(Name=="Helge Bruelheide") 145 | 146 | #### 2.2. 
Additional opt-ins #### 147 | #Import affiliation info for additioal opt-in coauthors 148 | #(Update 09 November 2020) 149 | optins <- read_csv("_management/Opt-in - Project #02 (Responses) - Form responses 1.csv") %>% 150 | filter(X3=="I read and understand sPlot rules and would like to opt-in to this project") %>% 151 | filter(!Name %in% c("Alicia Acosta", "Bruno Herault")) %>% #already among custodians, but with name spelled differently 152 | dplyr::select(Surname, Name, `Email address`, Affiliation, Address) %>% 153 | #join affiliations from our records 154 | left_join(allaffiliations, by="Name") %>% 155 | #replace affiliations with opt-in input if record is not in archive 156 | mutate(`University/Institution`=ifelse(is.na(`E-Mail`), Affiliation, `University/Institution`)) %>% 157 | mutate(Street=ifelse(is.na(`E-Mail`), Address, Street)) %>% 158 | mutate(Surname=coalesce(Surname.y, Surname.x)) %>% 159 | dplyr::select(-Surname.x, -Surname.y) %>% 160 | mutate(`E-Mail`=ifelse(is.na(`E-Mail`), `Email address`, `E-Mail`)) %>% 161 | mutate(Sequence_affiliations=ifelse(is.na(Sequence_affiliations), 1, Sequence_affiliations)) %>% 162 | dplyr::select(Name, Surname, Sequence_affiliations:Country) %>% 163 | mutate_if(.predicate = ~is.character(.), 164 | .funs = list(~gsub(pattern = ",$", replacement = "", x = {.}))) %>% 165 | filter(!Name %in% c(first, third, custodians, core)) %>% 166 | mutate(ORCID=replace(ORCID, 167 | list=Name=="Inger Greve Alsos", 168 | values="0000-0002-8610-1085")) %>% 169 | mutate(Town=replace(Town, 170 | list=Name=="John-Arvid Grytnes", 171 | values="Bergen")) %>% 172 | mutate(Country=replace(Country, 173 | list=Name=="John-Arvid Grytnes", 174 | values="Norway")) %>% 175 | mutate(ORCID=ifelse(Surname=="Zobel", "0000-0001-7957-6704", ORCID)) %>% 176 | mutate(ORCID=ifelse(Surname=="Brunet", "0000-0003-2667-4575", ORCID)) %>% 177 | mutate(Street=str_replace(Street, "Czech Repunlic", "Czech Republic")) %>% 178 | mutate(Street=str_replace(Street, "Jr. José Sabogal #913", "Jr. 
José Sabogal 913")) 179 | 180 | 181 | 182 | 183 | ### second batch of opt-ins (mostly from TRY) 184 | ### 14/12/2020 185 | optin2 <- openxlsx::read.xlsx("_management/UpdateAffiliations - sPlot (Responses).xlsx", sheet=1) 186 | colnames(optin2)[16:22] <- paste0(colnames(optin2)[16:22], "_1") 187 | colnames(optin2)[23:28] <- paste0(colnames(optin2)[23:28], "_2") 188 | optin2 <- optin2[,-c(33,34,35)] 189 | optin2 <- optin2 %>% 190 | rename_all(.funs=~gsub(pattern=".", replacement=" ", x=., fixed=T)) 191 | 192 | names_to_import <- c('Meelis Pärtel', 'Sophie Gachet', 'Josep Penuela', 'Dirk Nikolaus Karger', 'Gregory Richard Guerin', 'Attila Lengyel', #20.11.2020 193 | 'Frederic Lens', 'Débora Vanessa Lingner', "Arindam Banerjee", "Farideh Fazayeli", 194 | "Hanhuai Shan") # update 14.12.2020 195 | 196 | 197 | optin2.aff <- NULL 198 | for(n in names_to_import) { 199 | optin2.aff <- optin2.aff %>% 200 | bind_rows(get.affiliation.gs(n, aff.gs = optin2)) 201 | } 202 | 203 | optin2.aff <- optin2.aff %>% 204 | mutate(Name=replace(Name, 205 | list=Name=="Josep Penuela", 206 | values="Josep Peñuelas")) %>% 207 | mutate(Surname=replace(Surname, 208 | list=Surname=="Penuela", 209 | values="Peñuelas")) %>% 210 | mutate(Country=replace(Country, 211 | list=Surname=="Peñuelas", 212 | values="Spain")) %>% 213 | rowwise() %>% 214 | mutate(Town=ifelse(Surname=="Peñuelas", paste0(Town, ", Catalonia"), Town)) %>% 215 | ungroup() %>% 216 | #Add second affiliation Lens 217 | bind_rows( 218 | tibble( 219 | Name = "Frederic Lens", 220 | Surname = "Lens", 221 | Sequence_affiliations=2, 222 | ORCID="0000-0002-5001-0149", 223 | `E-Mail` = "frederic.lens@naturalis.nl", 224 | `Department/Institute/Faculty` = "Institute of Biology Leiden", 225 | `University/Institution` = "Leiden University", 226 | Street = "Sylviusweg 72", 227 | `Postal code` = "2333 BE", 228 | Town = "Leiden", 229 | Country = "The Netherlands" 230 | ) 231 | ) 232 | 233 | 234 | 235 | #### 2.3. 
Merge lists #### 236 | # first + core sPlot [alphabetical] + (custodians + opt-ins + TRY) [alphabetical] + last author 237 | affiliations0 <- first %>% 238 | bind_rows(third) %>% 239 | bind_rows(core) %>% 240 | bind_rows(allaffiliations %>% 241 | filter(Name %in% custodians) %>% 242 | filter(!Name %in% core$Name) %>% 243 | filter(!Name %in% first$Name) %>% 244 | filter(!Name %in% last$Name) %>% 245 | ## add additional coauthors from SOPHY 246 | bind_rows( 247 | tibble( 248 | Name = "Guillermo Hinojos Mendoza", 249 | `E-Mail` = "ghinojos@asessc.net", 250 | `Department/Institute/Faculty` = "Pépinière d’Entreprises l’Espélidou, Parc d’Activités du Vinobre", 251 | `University/Institution` = "ASES Ecological and Sustainable Services", 252 | Street = "555 Chemin des Traverses, Lachapelle-sous-Aubenas", 253 | `Postal code` = "07200", 254 | Town = "Aubenas", 255 | Country = "France", 256 | Surname = "Hinojos Mendoza" 257 | ) 258 | ) %>% 259 | ## add additional coauthors from AF-CD-001 260 | bind_rows(allaffiliations %>% 261 | filter(Name == "Elizabeth Kearsley") %>% 262 | mutate( 263 | `Department/Institute/Faculty` = 264 | replace( 265 | `Department/Institute/Faculty`, 266 | list = Name == "Elizabeth Kearsley", 267 | values = 268 | "Department Environment, Computational and Applied Vegetation Ecology (UGent-CAVELab)" 269 | )) %>% 270 | mutate(Surname = "Kearsley")) %>% 271 | bind_rows( 272 | tibble( 273 | Name = "Wannes Hubau", 274 | Sequence_affiliations = c(1, 2), 275 | `E-Mail` = "wannes.hubau@ugent.be", 276 | `Department/Institute/Faculty` = c( 277 | "Department Environment, Laboratory of Wood Biology (UGent-WoodLab)", 278 | "Service of Wood Biology" 279 | ), 280 | `University/Institution` = c("Ghent University", "Royal Museum for Central Africa"), 281 | Street = c("Coupure Links 653", "Leuvensesteenweg 13"), 282 | `Postal code` = c("9000", "3080"), 283 | Town = c("Ghent", "Tervuren"), 284 | Country = "Belgium", 285 | Surname = "Hubau" 286 | ) 287 | ) %>% 288 | bind_rows( 289 | tibble( 290 | Name = "Marijn Bauters", 291 | Sequence_affiliations = c(1, 2), 292 | `E-Mail` = "marijn.bauters@ugent.be", 293 | `Department/Institute/Faculty` = 294 | c( 295 | "Department Green chemistry and technology, Isotope Bioscience laboratory (UGent-ISOFYS)", 296 | "Department Environment, Computational and Applied Vegetation Ecology (UGent-CAVELab)" 297 | ), 298 | `University/Institution` = "Ghent University", 299 | Street = "Coupure Links 653", 300 | `Postal code` = "9000", 301 | Town = "Ghent", 302 | Country = "Belgium", 303 | Surname = "Bauters" 304 | ) 305 | ) %>% 306 | 307 | ### Authors from RAINFOR - as recommended by Oliver Phillips 308 | bind_rows( 309 | tibble( 310 | Name = "Abel Monteagudo Mendoza", 311 | Sequence_affiliations = 1:2, 312 | ORCID="0000-0002-1047-845X", 313 | `E-Mail` = "amonteagudomendoza@gmail.com", 314 | `Department/Institute/Faculty` = "", 315 | `University/Institution` = c("Jardín Botánico de Missouri Oxapampa", "Universidad Nacional de San Antonio Abad del Cusco"), 316 | Street = c("Bolognesi Mz-E-6", "Av. 
de la Cultura 733"), 317 | `Postal code` = NA, 318 | Town = c("Oxapampa, Pasco", "Cusco"), 319 | Country = "Peru", 320 | Surname = "Monteagudo Mendoza" 321 | ) 322 | ) %>% 323 | bind_rows( 324 | tibble( 325 | Name = "Rodolfo Vásquez Martínez", 326 | Sequence_affiliations = 1, 327 | `E-Mail` = "neotaxon@yahoo.com", 328 | `Department/Institute/Faculty` = "", 329 | `University/Institution` = "Jardín Botánico de Missouri Oxapampa", 330 | Street = "Bolognesi Mz-E-6", 331 | `Postal code` = NA, 332 | Town = "Oxapampa, Pasco", 333 | Country = "Peru", 334 | Surname = "Vásquez Martínez" 335 | ) 336 | ) %>% 337 | bind_rows( 338 | tibble( 339 | Name = "Luzmila Arroyo", 340 | Sequence_affiliations = 1, 341 | `E-Mail` = "luzmilaarroyo@hotmail.com", 342 | `Department/Institute/Faculty` = "Dirección de la Carrera de Biología", 343 | `University/Institution` = "Universidad Autónoma Gabriel René Moreno", 344 | Street = NA, 345 | `Postal code` = NA, 346 | Town = "Santa Cruz de la Sierra", 347 | Country = "Bolivia", 348 | Surname = "Arroyo" 349 | ) 350 | ) %>% 351 | bind_rows( 352 | tibble( 353 | Name = "Timothy Killeen", 354 | Sequence_affiliations = 1, 355 | `E-Mail` = "timothy.j.killeen@gmail.com", 356 | `Department/Institute/Faculty` = 357 | "Museo de Historia Natural Noel Kempff Mercado", 358 | `University/Institution` = "Universidad Autonoma Gabriel Rene Moreno", 359 | Street = NA, 360 | `Postal code` = NA, 361 | Town = "Santa Cruz de la Sierra", 362 | Country = "Bolivia", 363 | Surname = "Killeen" 364 | ) 365 | ) %>% 366 | ## Add possible additional coauthors here 367 | bind_rows(allaffiliations %>% 368 | filter(Name%in% c(#"Anita Smyth", 369 | "Alireza Naqinezhad", 370 | "Sylvia Haider", 371 | "Pavel Shirokikh", 372 | "Alicia T.R. Acosta", 373 | "Bruno Hérault", 374 | 'Petr Petřík', 375 | "Donald M. Waller", 376 | "Yves Bergeron"))) %>% ## was custodian when writing the paper 377 | ## 378 | bind_rows(optins) %>% 379 | bind_rows(optin2.aff) %>% 380 | ## Joop Schaminée is not custodian anymore. Exclude? 
381 | ### 382 | arrange(Surname, Sequence_affiliations)) %>% 383 | bind_rows(last) %>% 384 | dplyr:::select(name=Name, email=`E-Mail`, orcid=ORCID, everything(), -Surname, -`Second E-Mail`) %>% 385 | replace_na(list(`Department/Institute/Faculty`="", Street="", `Postal code`="", Town="", Country="" )) %>% 386 | unite(`University/Institution`, `Department/Institute/Faculty`, Street:Country, sep = ", ", col="affiliations", remove=T) %>% 387 | mutate(affiliations=str_replace_all(string = affiliations, pattern=", , ", replacement = ", ")) %>% 388 | ### Add Github accounts, if required 389 | mutate(github="") %>% 390 | mutate(github=ifelse(name=="Miguel Alvarez", "kamapu", github)) %>% 391 | mutate(github=ifelse(name=="Francesco Maria Sabatini", "fmsabatini", github)) %>% 392 | mutate(github=ifelse(name=="Jonathan Lenoir", "lenjon", github)) %>% 393 | mutate(github=ifelse(name=="Helge Bruelheide", "Bruelheide", github)) %>% 394 | ### Add Twitter account, if required 395 | mutate(twitter=NA) %>% 396 | mutate(twitter=ifelse(name=="Francesco Maria Sabatini", "sPlot_iDiv", twitter)) %>% 397 | mutate(twitter=ifelse(name=="Jonathan Lenoir", "EkoLogIt", twitter)) %>% 398 | mutate(twitter=ifelse(name=="Helge Bruelheide", "HelgeBruelheide", twitter)) %>% 399 | ### Add correspondence 400 | mutate(correspondence=ifelse(name=="Francesco Maria Sabatini", "true", NA)) %>% 401 | ### Add equal contribution 402 | mutate(symbol_str=ifelse(name %in% c("Francesco Maria Sabatini", "Jonathan Lenoir"), 403 | '"†"', NA)) %>% 404 | ### Add initials 405 | rowwise() %>% 406 | mutate(initials=initials(name)) %>% 407 | ungroup() %>% 408 | ## correct typo 409 | mutate(name=replace(name, list=name=="Andraž Carni", values="Andraž Čarni")) 410 | 411 | affiliations <- affiliations0 %>% 412 | ## Update affiliation by Tsipe Aavik & Martin Zobel 413 | mutate(`affiliations`= replace(`affiliations`, 414 | list = name %in% c("Tsipe Aavik", "Martin Zobel"), 415 | values = (affiliations0 %>% filter(name=="Meelis Pärtel") %>% pull(`affiliations`)))) %>% 416 | mutate(orcid= replace(orcid, 417 | list = name=="Tsipe Aavik", 418 | values = "0000-0001-5232-3950")) 419 | 420 | 421 | 422 | #### 2.4 opt-out #### 423 | ## exclude authors who declined offer 424 | affiliations <- affiliations %>% 425 | filter(!name %in% c("Marten Winter", 426 | "Ching-Feng Li", 427 | "Kim Sarah Jacobsen", 428 | "Desalegn Wana", 429 | "Milan Valachovič", 430 | "Philippe Marchand")) #Not yet custodian when writing the paper 431 | 432 | 433 | #### 3. Create metadata.yaml file #### 434 | affi.out <- "../_manuscript/content/metadata.yaml" #create empty affiliation file 435 | ##populate yaml file with affiliation info 436 | write_lines(c( 437 | "---", 438 | 'title: "sPlotOpen – An environmentally-balanced, open-access, global dataset of vegetation plots"', 439 | 'keywords:', 440 | ' - vegetation', 441 | ' - database', 442 | ' - plants', 443 | ' - biodiversity', 444 | ' - functional traits', 445 | ' - big-data', 446 | ' - manubot', 447 | 'lang: en-US', 448 | 'authors:'), file = affi.out) 449 | lapply(affiliations %>% 450 | dplyr::select(name) %>% 451 | distinct() %>% 452 | pull(name), create.yaml, affi.out) 453 | 454 | 455 | 456 | 457 | #### 4. 
Create list of email addresses #### 458 | email <- affiliations %>% 459 | dplyr::select(name, email) %>% 460 | distinct() %>% 461 | mutate(export = paste0(name, " <", email, ">")) %>% 462 | dplyr::select(export) 463 | write_delim(email, file = "_output/Author_email.txt", delim="/t") 464 | 465 | ## Create email list for Opt-ins 466 | #### this list was used to invite additional coauthors, besides the custodians 467 | roles <- read_csv("/data/sPlot/users/Francesco/_sPlot_Management/Consortium/roles.csv") 468 | email.optins <- roles %>% 469 | filter(`Still in sPlot`) %>% 470 | arrange(Surname) %>% 471 | left_join(allaffiliations, by="Name") %>% 472 | dplyr::select(Name, email=`E-Mail`) %>% 473 | distinct() %>% 474 | filter(!Name %in% affiliations$name) %>% 475 | mutate(export = paste0(Name, " <", email, ">")) %>% 476 | dplyr::select(export) 477 | write_delim(email.optins, file = "_output/Optins_email", delim="/t" ) 478 | 479 | 480 | 481 | 482 | #### 5. Create checklist to approve submission #### 483 | approve.checklist <- affiliations %>% 484 | distinct(name) %>% 485 | arrange(name) %>% 486 | mutate(name=paste0(" - [ ] ", name)) 487 | write_delim(approve.checklist, file = "_output/Author_checklist.txt", delim="/t" ) 488 | ## second batch 489 | approve.checklist <- optin2.aff %>% 490 | #filter(Name %in% affiliations$name) %>% 491 | distinct(Name) %>% 492 | arrange(Name) %>% 493 | mutate(Name=paste0(" - [ ] ", Name)) 494 | write_delim(approve.checklist, file = "_output/Author_checklist_batch2.txt", delim="/t" ) 495 | 496 | 497 | 498 | #### 6. Create list of ORCIDs 499 | affiliations %>% 500 | distinct(name, orcid) %>% 501 | unite(name, name, orcid, sep=", ") %>% 502 | mutate(name=str_remove(name, pattern=" NA$")) %>% 503 | mutate(name=paste0(name, "\n")) %>% 504 | #pull(name) %>% 505 | #cat() %>% 506 | View() 507 | 508 | 509 | 510 | 511 | -------------------------------------------------------------------------------- /04_FiguresTables.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Project#02 - Figures and Tables" 3 | author: "Francesco Maria Sabatini" 4 | date: "4/28/2020" 5 | output: 6 | html_document: 7 | toc: true 8 | theme: united 9 | --- 10 | 11 |
12 | ![](https://www.idiv.de/fileadmin/content/Files_sDiv/sDiv_Workshops_Photos_Docs/sDiv_WS_Documents_sPlot/splot-long-rgb.png "sPlot Logo") 13 |
14 | 15 | 16 | 17 | 18 | **Timestamp:** `r date()` 19 | **Drafted:** Francesco Maria Sabatini 20 | **Revised:** 21 | **Version:** 1.0 22 | 23 | This report documents the creation of figures and tables for the sPlotOpen manuscript. 24 | 25 | ```{r results="hide", message=F, warning=F} 26 | library(tidyverse) 27 | #library(openxlsx) 28 | library(bib2df) 29 | library(knitr) 30 | library(kableExtra) 31 | library(viridis) 32 | library(plotbiomes) 33 | library(cowplot) 34 | 35 | library(raster) 36 | library(sp) 37 | library(sf) 38 | library(rgdal) 39 | library(rnaturalearth) 40 | library(dggridR) 41 | # library(rgeos) 42 | 43 | library(Taxonstand) 44 | 45 | #save temporary files 46 | write("TMPDIR = /data/sPlot/users/Francesco/_tmp", file=file.path(Sys.getenv('TMPDIR'), '.Renviron')) 47 | write("R_USER = /data/sPlot/users/Francesco/_tmp", file=file.path(Sys.getenv('R_USER'), '.Renviron')) 48 | #rasterOptions(tmpdir="/data/sPlot/users/Francesco/_tmp") 49 | ``` 50 | 51 | #Load sPlotOpen data and create spatial objects 52 | ```{r} 53 | load(file.path("_sPlotOpenDB", "sPlotOpen.RData")) 54 | #header.oa <- header.oa %>% 55 | # filter(!is.na(SoilClim_PC1)) 56 | ``` 57 | 58 | Data Preparation for spatial plotting 59 | ```{r, cache=T, results="hide", warning=F, message=F} 60 | header.sf <- SpatialPointsDataFrame(coords= header.oa %>% 61 | dplyr::select(Longitude, Latitude), 62 | proj4string = CRS("+init=epsg:4326"), 63 | data=data.frame(PlotObservationID=header.oa$PlotObservationID, 64 | Dataset=header.oa$Dataset)) %>% 65 | st_as_sf() %>% 66 | st_transform(crs = "+proj=eck4") 67 | ``` 68 | 69 | # Load ancillary geographic data and create figure templates 70 | 71 | Country boundaries and world graticule 72 | ```{r, cache=T, results="hide", warning=F, message=F} 73 | #data downloaded from rnaturalearth package 74 | countries <- readOGR("/data/sPlot/users/Francesco/Ancillary_Data/naturalearth/ne_110m_admin_0_countries.shp") %>% 75 | st_as_sf() %>% 76 | st_transform(crs = "+proj=eck4") %>% 77 | st_geometry() 78 | graticules <- readOGR("/data/sPlot/users/Francesco/Ancillary_Data/naturalearth/ne_110m_graticules_15.shp") %>% 79 | st_as_sf() %>% 80 | st_transform(crs = "+proj=eck4") %>% 81 | st_geometry() 82 | 83 | bb <- readOGR("/data/sPlot/users/Francesco/Ancillary_Data/naturalearth/ne_110m_wgs84_bounding_box.shp") %>% 84 | st_as_sf() %>% 85 | st_transform(crs = "+proj=eck4") %>% 86 | st_geometry() 87 | ``` 88 | 89 | 90 | Continent boundaries 91 | ```{r} 92 | sPDF <- rworldmap::getMap(resolution="coarse") 93 | continent <- sPDF[,"continent"] 94 | crs(continent) <- CRS("+init=epsg:4326") 95 | continent@data[243,"continent"] <- "South America" ## Manually correct missing data 96 | # create clipped version of continent to avoid going beyond 180 lON 97 | coords <- data.frame(x=c(-180,180,180,-180), 98 | y=c(-90,-90,90,90)) 99 | bboxc = Polygon(coords) 100 | bboxc = SpatialPolygons(list(Polygons(list(bboxc), ID = "a")), proj4string=crs(continent)) 101 | continent_clipped <- rgeos::gIntersection(continent[-137,], bboxc, byid=T) # polygon 137 gives problems... 
workaround 102 | continent_clipped <- continent_clipped %>% 103 | st_as_sf() 104 | ``` 105 | 106 | Template of Global map - with country borders 107 | ```{r, cache=T, results="hide", warning=F, message=F} 108 | # create ggplot template of the world map 109 | w3a <- ggplot() + 110 | geom_sf(data = bb, col = "grey20", fill = "white") + 111 | geom_sf(data = graticules, col = "grey20", lwd = 0.1) + 112 | geom_sf(data = countries, fill = "grey90", col = NA, lwd = 0.3) + 113 | coord_sf(crs = "+proj=eck4") + 114 | theme_minimal() + 115 | theme(axis.text = element_blank(), 116 | legend.title=element_text(size=12), 117 | legend.text=element_text(size=12), 118 | legend.background = element_rect(size=0.1, linetype="solid", colour = 1), 119 | legend.key.height = unit(1.1, "cm"), 120 | legend.key.width = unit(1.1, "cm")) + 121 | scale_fill_viridis() 122 | ``` 123 | 124 | Create template of Global Map - without country borders 125 | ```{r} 126 | w4a <- ggplot() + 127 | geom_sf(data = bb, col = "grey20", fill = "white") + 128 | geom_sf(data = continent_clipped, fill = "grey90", col = NA, lwd = 0.3) + 129 | geom_sf(data = bb, col = "grey20", fill = NA) + 130 | #geom_sf(data = graticules, col = "grey20", lwd = 0.1) + 131 | coord_sf(crs = "+proj=eck4") + 132 | theme_minimal() + 133 | theme(axis.text = element_blank(), 134 | legend.title=element_text(size=12), 135 | legend.text=element_text(size=12), 136 | legend.background = element_rect(size=0.1, linetype="solid", colour = 1), 137 | legend.key.height = unit(1.1, "cm"), 138 | legend.key.width = unit(1.1, "cm")) 139 | 140 | ``` 141 | 142 | 143 | 144 | # Figures 145 | Figures and tables for the manuscript in the [sPlotOpen_Manuscript](https://fmsabatini.github.io/sPlotOpen_Manuscript/) project. 146 | 147 | ## Figure 1 - Geographic distribution of plots 148 | 149 | Map of plot distribution - Version 1 - Coloured points. 150 | Each colour represents a database. 
Please note there are not enough colours in the palette to represent all 105 datasets 151 | ```{r, fig.width=8, fig.height=6, fig.align="center", warning=F, message=F, cache=T} 152 | Figure1a <- w3a + 153 | geom_sf(data=header.sf, aes(color=Dataset), pch="+", size=1, alpha=0.8) + # aes(col=Dataset), 154 | geom_sf(data = countries, col = "grey20", fill=NA, lwd = 0.3) + 155 | theme(legend.position = "none") 156 | ``` 157 | 158 | Version 2 - hexagons 159 | ```{r, fig.width=8, fig.height=6, fig.align="center", message=F, cache=T} 160 | header2 <- header.oa %>% 161 | dplyr::filter(Resample_1) %>% 162 | dplyr::select(PlotObservationID, Latitude, Longitude) %>% 163 | filter(!(abs(Longitude) >171 & abs(Latitude>70))) 164 | dggs <- dgconstruct(spacing=300, metric=T, resround='down') 165 | 166 | #Get the corresponding grid cells for each plot 167 | header2$cell <- dgGEO_to_SEQNUM(dggs, header2$Longitude, header2$Latitude)$seqnum 168 | 169 | #Calculate number of plots for each cell 170 | header.dggs <- header2 %>% 171 | group_by(cell) %>% 172 | summarise(value.out=log(n(), 10)) 173 | 174 | #Get the grid cell boundaries for cells 175 | grid <- dgcellstogrid(dggs, header.dggs$cell, frame=F) %>% 176 | st_as_sf() %>% 177 | mutate(cell = header.dggs$cell) %>% 178 | mutate(value.out=header.dggs$value.out) %>% 179 | st_transform("+proj=eck4") %>% 180 | st_wrap_dateline(options = c("WRAPDATELINE=YES")) 181 | 182 | ## plotting 183 | Figure1b <- w3a + 184 | geom_sf(data=grid, aes(fill=value.out),lwd=0, alpha=0.9) + 185 | geom_sf(data = countries, col = "grey20", fill=NA, lwd = 0.3) + 186 | scale_fill_viridis( 187 | name="# plots", breaks=0:5, labels = c("1", "10", "100", 188 | "1,000", "10,000", "100,000"), option="viridis") 189 | 190 | #ggsave("_output/figure1.png", plot=Figure1, width=8, height=4, units="in", dpi=300) 191 | ``` 192 | Panel with two versions of figure 1 193 | ```{r, fig.width=5.5, fig.height=5, fig.align="center", cache=T} 194 | fig1.leg <- get_legend(Figure1b + 195 | guides(fill = guide_colourbar(barwidth = 1, barheight = 6)) + 196 | theme(legend.title = element_text(size = 7), 197 | legend.text = element_text(size = 7))) 198 | fig1_panel <- plot_grid(Figure1a, NULL, 199 | Figure1b + 200 | theme(legend.position = "none"), fig1.leg, 201 | nrow=2, ncol=2, byrow = T, #labels = c("a","", "b", ""), 202 | rel_widths = c(0.84,0.16)) 203 | ggsave("_output/figure1.pdf", plot=fig1_panel, width=5.5, height=5, units="in", dpi=300) 204 | fig1_panel 205 | ``` 206 | 207 | 208 | ## Figure 2 - PCA graph + world map of selected plots 209 | Only for Resample draw #1 210 | Import PCA data 211 | ```{r} 212 | ### load PCA ordination of the world 213 | load("_data/pca3.RData") 214 | path.sPlot <- "/data/sPlot2.0/" 215 | load(paste(path.sPlot, "splot.world2.RData", sep="/")) 216 | ``` 217 | 218 | 219 | ```{r} 220 | plot_data <- header.oa %>% 221 | dplyr::filter(Resample_1) %>% 222 | dplyr::select(PlotObservationID, Longitude, Latitude) 223 | ## code adapted from @lenjon's 'resampling_2d_JL.R' 224 | CRSlonlat <- CRS("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs +towgs84=0,0,0") 225 | coords <- cbind(plot_data$Longitude, plot_data$Latitude) 226 | coords <- SpatialPoints(coords, proj4string=CRSlonlat) 227 | plot_data <- SpatialPointsDataFrame(coords, plot_data)#, proj4string=CRSlonlat) 228 | 229 | 230 | # Create world rasters of PCA values and extract plot values by geographic intersection 231 | # raster at half a degree resolution (cf. 
30 arc minute resolution) 232 | rgeo <- raster(nrows=360, ncols=720, xmn=-180, xmx=180, ymn=-90, ymx=90) 233 | rgeo <- disaggregate(rgeo, fact=12) # raster at 2.5 arc minute resolution 234 | splot.world2$cellID <- cellFromXY(rgeo, cbind(splot.world2$RAST_X, splot.world2$RAST_Y)) 235 | 236 | ### create rasters from PCA 237 | posit <- splot.world2$cellID 238 | temp <- getValues(rgeo) 239 | 240 | temp[posit] <- pca3$x[, 1] 241 | PC1_r <- setValues(rgeo, temp) 242 | temp[posit] <- pca3$x[, 2] 243 | PC2_r <- setValues(rgeo, temp) 244 | 245 | # Extract pca valus from PCA rasters 246 | plot_data@data$pc1_val <- extract(PC1_r, coordinates(plot_data)) 247 | plot_data@data$pc2_val <- extract(PC2_r, coordinates(plot_data)) 248 | 249 | 250 | # Compute the density of environmental conditions available at the global scale across the entire bivariate (PC1-PC2) environmental space 251 | res <- 100 # Setting the number of bins per PCA axis to 100 252 | reco <- raster(nrows=res, ncols=res, xmn=min(pca3$x[, 1]), xmx=max(pca3$x[, 1]), 253 | ymn=min(pca3$x[, 2]), ymx=max(pca3$x[, 2])) 254 | PC1_PC2_r <- rasterize(pca3$x[, 1:2], reco, fun="count") 255 | plot_data <- plot_data@data 256 | plot_data$pc_cellID <- cellFromXY(reco, cbind(plot_data$pc1_val, plot_data$pc2_val)) 257 | 258 | 259 | # Compute the sampling effort (number of vegetation plots) per environmental unit (cell) across the entire bivariate (PC1-PC2) environmental space 260 | sPlot_reco <- rasterize(plot_data[, c("pc1_val", "pc2_val")], reco, fun="count") 261 | # Put zero values for the empty cells (cf. there is no vegeteation plots available for those environmental conditions: gaps) 262 | temp1 <- getValues(PC1_PC2_r) 263 | temp1[!is.na(temp1)] <- 0 264 | temp2 <- getValues(sPlot_reco) 265 | temp2[which(temp1==0&is.na(temp2))] <- 0 266 | sPlot_reco <- setValues(reco, temp2) 267 | 268 | plot_data <- plot_data %>% 269 | rename(PC1=pc1_val, PC2=pc2_val) 270 | ``` 271 | 272 | Transform to tibbles 273 | ```{r} 274 | PC1_PC2.tbl <- data.frame(xyFromCell(PC1_PC2_r, cell = 1:10000)) %>% 275 | rename(PC1=x, PC2=y) %>% 276 | mutate(values=getValues(PC1_PC2_r)) %>% 277 | as_tibble() 278 | 279 | sPlot_reco.tbl <- data.frame(xyFromCell(sPlot_reco, cell = 1:10000)) %>% 280 | rename(PC1=x, PC2=y) %>% 281 | mutate(values=getValues(sPlot_reco)) %>% 282 | as_tibble() 283 | ``` 284 | 285 | 286 | 287 | Figure2 - Bottom left. 288 | Make gridded heatmap plot 289 | ```{r} 290 | pca.heatmap <- ggplot() + 291 | geom_tile(data=PC1_PC2.tbl %>% 292 | dplyr::filter(!is.na(values)), aes(x=PC1, y=PC2), 293 | col = gray(0.8), 294 | fill = gray(0.8)) + 295 | geom_tile(data=sPlot_reco.tbl %>% 296 | dplyr::filter(!is.na(values)) %>% 297 | dplyr::filter(values>0), ### !!! 298 | aes(x=PC1, y=PC2, col=values, fill=values)) + 299 | scale_fill_viridis("Number\nof plots", option = "magma") + 300 | scale_color_viridis("Number\nof plots", option = "magma") + 301 | theme_bw() + 302 | coord_equal() + 303 | theme(legend.position = "bottom", 304 | panel.grid.major = element_blank(), 305 | panel.grid.minor = element_blank()) 306 | 307 | ``` 308 | 309 | Figure 2 - Bottom right 310 | Randomly select four pixels with >45 plots, extract center coordinates and add to heatmap. 
311 | ```{r} 312 | set.seed(558) 313 | ABC <- sPlot_reco.tbl %>% 314 | filter(!is.na(values)) %>% 315 | filter(values>45 & values<=50) %>% 316 | mutate(upper=PC2 > 1) %>% 317 | mutate(right=PC1 > 0) %>% 318 | group_by(upper, right) %>% 319 | sample_n(1) %>% 320 | ungroup() %>% 321 | mutate(label=c("D", "C", "A", "B")) 322 | 323 | 324 | ### add labels to heatmap 325 | pca.heatmap2 <- pca.heatmap + 326 | geom_point(data=ABC, aes(x=PC1, y=PC2), pch=21, size=2, col="black", fill="white") + 327 | ggrepel::geom_label_repel(data=ABC, aes(x=PC1, y=PC2, label=label), size = 2.5, fill = alpha(c("white"),0.7)) + 328 | theme(legend.position = "left") 329 | ``` 330 | 331 | For each of the selected grid cells, get all plots belonging to that cell, and show their geographical distribution. 332 | ```{r} 333 | inset.list <- list() 334 | rangex <- max(PC1_PC2.tbl$PC1) - min(PC1_PC2.tbl$PC1) 335 | rangey <- max(PC1_PC2.tbl$PC2) - min(PC1_PC2.tbl$PC2) 336 | for(i in c("A", "B", "C", "D")){ 337 | tmp.ABC <- ABC %>% 338 | filter(label==i) 339 | xmin <- tmp.ABC$PC1 - 0.5*rangex/100 340 | xmax <- tmp.ABC$PC1 + 0.5*rangex/100 341 | ymin <- tmp.ABC$PC2 - 0.5*rangey/100 342 | ymax <- tmp.ABC$PC2 + 0.5*rangey/100 343 | coords.ABC <- plot_data %>% 344 | mutate(sel=(PC1 > xmin & PC1 < xmax & 345 | PC2 > ymin & PC2 < ymax)) %>% 346 | filter(sel) %>% 347 | dplyr::select(Longitude, Latitude) %>% 348 | rowwise() %>% 349 | mutate_at(.vars=vars(Longitude, Latitude), 350 | .funs=~jitter(.)) %>% 351 | ungroup() %>% 352 | SpatialPoints(proj4string = CRS("+init=epsg:4326")) %>% 353 | st_as_sf() %>% 354 | st_transform(crs = "+proj=eck4") 355 | inset.list[[i]] <- w4a + 356 | geom_sf(data=coords.ABC, col=2, pch="+", size=2) 357 | } 358 | ``` 359 | 360 | Figure 2 - Top 361 | 362 | ```{r} 363 | splot.world2.eckert <- SpatialPointsDataFrame(coords=splot.world2 %>% 364 | dplyr::select(RAST_X, RAST_Y), 365 | data = splot.world2 %>% 366 | dplyr::select(RAST_ID), 367 | proj4string = CRS("+init=epsg:4326")) %>% 368 | spTransform(CRS("+proj=eck4")) 369 | 370 | splot.world2.eckert <- data.frame(splot.world2.eckert@coords, 371 | splot.world2.eckert@data) 372 | 373 | PCA_tbl <- as_tibble(splot.world2.eckert) %>% 374 | dplyr::select(RAST_ID, RAST_X, RAST_Y) %>% 375 | left_join(as_tibble(as.data.frame(pca3$x[,1:2]) %>% 376 | rownames_to_column(var="RAST_ID")) %>% 377 | mutate(RAST_ID=as.integer(RAST_ID)), 378 | by="RAST_ID") %>% 379 | mutate(PC0=1:n()) 380 | 381 | ggpc1 <- w4a + 382 | geom_tile(data=PCA_tbl %>% 383 | mutate(PC1=ifelse(PC1> 6, 6, PC1)) %>% 384 | mutate(PC1=ifelse(PC1< -6, -6, PC1)), 385 | aes(x=RAST_X,y=RAST_Y, fill=PC1, color=PC1)) + 386 | geom_sf(data = bb, col = "grey20", fill = NA) + 387 | scale_fill_distiller("PC1", type = "seq", palette = "Spectral", 388 | direction=-1, limits = c(-6.1,6.1), breaks=seq(-6,6, by=3), labels=c("<-6", -3, 0, 3,">6" )) + 389 | scale_color_distiller("PC1", type = "seq", palette = "Spectral", 390 | direction=-1, limits = c(-6.1,6.1), breaks=seq(-6,6, by=3), labels=c("<-6", -3, 0, 3,">6" )) + 391 | theme_bw() + 392 | theme(panel.grid.major = element_blank(), 393 | panel.grid.minor = element_blank(), 394 | axis.text = element_blank(), 395 | panel.border = element_blank(), 396 | axis.title = element_blank()) 397 | 398 | 399 | 400 | ggpc2 <- w4a + 401 | geom_tile(data=PCA_tbl %>% 402 | mutate(PC2=ifelse(PC2> 9, 9, PC2)) %>% 403 | mutate(PC2=ifelse(PC2< -6, -6, PC2)), 404 | aes(x=RAST_X,y=RAST_Y, fill=PC2, color=PC2)) + 405 | geom_sf(data = bb, col = "grey20", fill = NA) + 406 | 
scale_fill_distiller("PC2", type = "seq", palette = "Spectral", 407 | direction=+1, limits = c(-6.1,9.1), breaks=seq(-6,9, length.out = 6), 408 | labels=c("<-6", -3, 0, 3, 6,">9" )) + 409 | scale_color_distiller("PC2", type = "seq", palette = "Spectral", 410 | direction=+1, limits = c(-6.1,9.1), breaks=seq(-6,9, length.out = 6), 411 | labels=c("<-6", -3, 0, 3, 6,">9" )) + 412 | theme_bw() + 413 | theme(panel.grid.major = element_blank(), 414 | panel.grid.minor = element_blank(), 415 | axis.text = element_blank(), 416 | panel.border = element_blank(), 417 | axis.title = element_blank()) 418 | ``` 419 | 420 | 421 | Figure 2 - Panel 422 | ```{r, fig.width=8, fig.height=5.6, fig.align="center", cache=T, cache.lazy = FALSE} 423 | library(cowplot) 424 | varexpl <- round(pca3$sdev^2/sum(pca3$sdev^2)*100,1) 425 | panel.out1 <- 426 | plot_grid(plot_grid(NULL, 427 | ggpc1 + theme(plot.margin=margin(r=.5, unit="cm")), 428 | ggpc2 + theme(plot.margin=margin(r=.5, unit="cm")), 429 | nrow=1, rel_widths = c(0.13,1,1)), 430 | plot_grid( 431 | pca.heatmap2 + 432 | theme(legend.position=c(0.13, 0.68), 433 | legend.background = element_blank()) + 434 | xlab(paste("PC1 (", varexpl[1], "%) - cold/seasonal to hot/stable", sep="")) + 435 | ylab(paste("PC2 (", varexpl[2], "%) - dry to wet", sep="")) + 436 | xlim(c(-15,15)) + ylim(c(-5,22)), 437 | plot_grid(plot_grid(inset.list[[1]], inset.list[[4]], NULL, 438 | nrow=3, labels = c("A", "D", ""), rel_heights = c(1,1,0.1)), 439 | plot_grid(inset.list[[2]], inset.list[[3]], NULL, 440 | nrow=3, labels = c("B", "C", ""), rel_heights = c(1,1,0.1)))), 441 | nrow=2, rel_heights = c(1,1.5), align="v") 442 | 443 | 444 | ggsave(filename = "_output/figure2.tiff", width=8, height=5.6, units = "in", dpi=300, plot=panel.out1) 445 | 446 | panel.out1 447 | ``` 448 | 449 | 450 | ## Figure 3 - Whittaker Biome Graph 451 | Get climatic data 452 | ```{r} 453 | load("/data/sPlot/releases/sPlot2.1/sPlot_header_chelsa_20161124.RData") 454 | climate.oa <- climate %>% 455 | filter(PlotID %in% (header.oa %>% 456 | filter(Resample_1) %>% 457 | pull(PlotObservationID))) %>% 458 | dplyr::select(-POINT_X, -POINT_Y) %>% 459 | rename(PlotObservationID=PlotID) 460 | ``` 461 | 462 | Figure 3 - Left 463 | Create plot of Schultz' biomes 464 | ```{r} 465 | biome.order <- c('Polar and subpolar zone' ,'Alpine' ,'Boreal zone' ,'Temperate midlatitudes' , 466 | 'Dry midlatitudes' ,'Dry tropics and subtropics' ,'Subtropics with year-round rain' , 467 | 'Subtropics with winter rain' ,'Tropics with summer rain' ,'Tropics with year-round rain') 468 | biome.labs <- c('Polar & subpolar' ,'Alpine' ,'Boreal zone' ,'Temperate midlatitudes' , 469 | 'Dry midlatitudes' ,'Dry tropics & subtropics' ,'Subtropics - year-round\n rain' , 470 | 'Subtropics - winter\n rain' ,'Tropics - summer rain' ,'Tropics - year-round\n rain') 471 | 472 | 473 | mypalette <- palette(c('#CAB2D6','#6A3D9A', #violets 474 | '#A6CEE3','#1F78B4', #blues 475 | '#FDBF6F','#FF7F00', #orange 476 | '#B2DF8A','#33A02C', #greens 477 | '#FB9A99','#E31A1C' #reds 478 | )) 479 | 480 | # Plot of Temp vs Prec + sBiomes 481 | biome.schu <- ggplot() + 482 | theme_classic() + 483 | #geom_path(aes(x, y), data=contour_95) + 484 | geom_point(data=climate.oa %>% 485 | left_join(header.oa %>% 486 | dplyr::filter(Resample_1) %>% 487 | dplyr::select(PlotObservationID, Biome), 488 | by="PlotObservationID") %>% 489 | filter(bio12<4500 & bio01>-11) %>% 490 | mutate(Biome=factor(Biome, levels=biome.order, labels=biome.labs)) 491 | # mutate(alpha=ifelse(Biome=="Tropics with 
summer rain", 1, 1/3)) 492 | , #filter out for plotting reasons 493 | aes(x=bio01, y=bio12, col=Biome), 494 | alpha=1/3, 495 | cex=1/15) + 496 | xlab("Temperature (°C)") + 497 | ylab("Precipitation (mm)") + 498 | scale_color_manual(values = c('#CAB2D6','#6A3D9A', #violets 499 | '#A6CEE3','#1F78B4', #blues 500 | '#FDBF6F','#FF7F00', #orange 501 | '#B2DF8A','#33A02C', #greens 502 | '#FB9A99','#E31A1C' #reds 503 | ), name="sBiomes") + 504 | guides(color = guide_legend(override.aes = list(size=5, shape=15, alpha=1))) + 505 | theme(#plot.margin=margin(r=-0.1, unit="cm"), 506 | axis.title = element_text(size=9), 507 | axis.text = element_text(size=9), 508 | legend.text = element_text(size=8), 509 | legend.title = element_text(size=9)) 510 | ``` 511 | 512 | Figure 3 - Right 513 | Create Whittaker plot 514 | ```{r} 515 | whitt.biome <- whittaker_base_plot() + 516 | theme_classic() + 517 | geom_point(data=climate.oa %>% 518 | filter(bio12<4500 & bio01>-11), #filter out for plotting reasons 519 | aes(x=bio01, y=bio12/10), 520 | alpha=1/4, 521 | cex=1/20) + 522 | theme(axis.text.y = element_blank(), 523 | axis.text = element_text(size=9), 524 | axis.title.y = element_blank(), 525 | axis.title = element_text(size=9), 526 | plot.margin=margin(l=-0.2, r=-0.1, unit="cm"), 527 | legend.text = element_text(size=8), 528 | legend.title = element_text(size=9)) 529 | ``` 530 | 531 | Figure 3 - Panel 532 | ```{r, fig.width=8, fig.height=4, fig.align="center", message=F, cache=T} 533 | #Make a panel with the two plots 534 | panel.biomes <- cowplot::plot_grid( biome.schu, whitt.biome, nrow=1, 535 | align = "h", rel_widths = c(0.5, .5)) 536 | ggsave(filename="_output/figure3.tiff", plot = panel.biomes, width = 10, height=4, units="in", dpi=300) 537 | panel.biomes 538 | ``` 539 | 540 | # Supplementary figures 541 | 542 | 543 | 544 | ## Figure S1 - Biplot 545 | ```{r, fig.width=5.7, fig.height=7, fig.align="center", warning=F} 546 | library(ggrepel) 547 | varexpl <- round(pca3$sdev^2/sum(pca3$sdev^2)*100,1) 548 | 549 | mydata <- pca3$x %>% 550 | as_tibble() %>% 551 | dplyr::select(1:2) %>% 552 | mutate(count=1) 553 | myarrows <- pca3$rotation[,1:2] %>% 554 | as.data.frame() %>% 555 | rownames_to_column("mylab") %>% 556 | mutate_at(.vars=vars(-mylab), .funs = list(~.*25)) %>% 557 | filter(!mylab %in% c("T_ANN", "P_ANN")) 558 | 559 | 560 | ggpca3 <- ggplot(data=mydata) + 561 | stat_summary_2d( 562 | aes(x = PC1, y = PC2, z = count), 563 | bins = 100, 564 | fun = function(x) {log10(sum(x)+1)}, 565 | alpha=1/2 566 | ) + 567 | geom_segment(data=myarrows, 568 | aes(x=0, xend=PC1, y=0, yend=PC2), 569 | arrow = arrow(length = unit(0.08, "inches")), alpha=0.8) + 570 | geom_label_repel(data=myarrows, 571 | aes(x=PC1, y=PC2, label=mylab), size=2, 572 | position = position_dodge(2),segment.alpha=0.5, segment.colour=gray(0.8)) + 573 | scale_fill_viridis("Number of 2.5 arcmin\n terrestrial grid cells", option = "magma", alpha=1/2, 574 | limits=c(0,5), breaks=log10(c(9,99,999,9999)), labels=10^(1:4)) + 575 | theme_bw() + 576 | coord_equal() + 577 | theme(legend.position = c(0.2, 0.85), 578 | panel.grid.major = element_blank(), 579 | panel.grid.minor = element_blank()) + 580 | #guides(fill = guide_legend(override.aes = list(alpha=1/3))) + 581 | xlab(paste("PC1 (", varexpl[1], "%)\n (cold and seasonal to hot and stable)", sep="")) + 582 | ylab(paste("PC2 (", varexpl[2], "%)\n (dry to wet)", sep="")) + 583 | ylim(c(-6,22)) 584 | 585 | ggsave("_output/figureS1.pdf", plot = ggpca3, width=5.7, height=7, unit="in", dpi=300) 586 | ggpca3 
587 | ``` 588 | 589 | 590 | # Tables 591 | ## Table 1 - Database level information 592 | Import databases and create reference tags 593 | ```{r, message=F, warning=F} 594 | #load(file.path("_sPlotOpenDB", "sPlotOpen.RData")) 595 | #Import BibTex 596 | bib.db <- bib2df("/data/sPlot/users/Francesco/_sPlot_Management/Consortium/sPlot_References.bib") 597 | #Import database-level information 598 | databases <- read_csv("/data/sPlot/users/Francesco/_sPlot_Management/Consortium/Databases.out.csv") 599 | 600 | # create citation tags that can be picked up by Manubot 601 | databases <- databases %>% 602 | left_join(bib.db %>% 603 | dplyr::select(BIBTEXKEY, DOI, URL), 604 | by="BIBTEXKEY") %>% 605 | mutate(tag=NA) %>% 606 | rowwise() %>% 607 | mutate(tag=ifelse(!is.na(DOI), 608 | paste0("@doi:", DOI), 609 | tag)) %>% 610 | mutate(tag=ifelse( (is.na(tag) & `GIVD ID` %in% unique(header.oa$GIVD_ID) & !is.na(Citation)), 611 | paste0("@", word(Citation, 1)), 612 | tag)) %>% 613 | dplyr::select(-DOI, -URL, -BIBTEXKEY) 614 | 615 | 616 | ``` 617 | 618 | Create Table 1 619 | 620 | ```{r} 621 | table1 <- databases %>% 622 | filter(`Still in sPlot`==T, 623 | Via!="Aggregator") %>% 624 | dplyr::select(-Via, -`Still in sPlot`, -label) %>% 625 | distinct() %>% 626 | left_join(header.oa %>% 627 | group_by(GIVD_ID) %>% 628 | summarize(contributed_plots=n(), .groups = 'drop'), 629 | by=c("GIVD ID"="GIVD_ID")) %>% 630 | filter(!is.na(contributed_plots)) %>% 631 | replace_na(list(tag="", 632 | `Deputy custodian`="")) %>% 633 | dplyr::select(`GIVD ID`, `Dataset name`=`DB_name GIVD`, Custodian, `Deputy custodian`, `Nr. open-access plots` = contributed_plots, Ref=tag) %>% 634 | arrange(`GIVD ID`) %>% 635 | # replace citation rendered wrongly using doi 636 | mutate(Ref=replace(Ref, 637 | list=`GIVD ID`=="AU-AU-002", 638 | values="@isbn:9781315368252")) 639 | 640 | 641 | write_csv(table1, "_output/Table1_Databases.csv") 642 | ``` 643 | 644 | ```{r, echo=F} 645 | knitr::kable(table1%>% 646 | slice(1:20), 647 | caption="Table 1 - Database level information [only first 20 rows shown]") %>% 648 | kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), 649 | full_width = F, position = "center") 650 | ``` 651 | 652 | 653 | ## Table 2 - Metadata and environmental data included in header 654 | Define unit of measurements for columns 655 | ```{r} 656 | um <- c('Latitude'='° (WGS84)', 657 | 'Longitude'='° (WGS84)', 658 | 'Location_uncertainty'='m', 659 | 'Releve_area'='m^2^', 660 | 'Elevation'='m a.s.l.', 661 | 'Aspect'='°', 662 | 'Slope'='°', 663 | 'Date_of_recording' = 'dd-mm-yyyy', 664 | 'Cover_total'='%', 665 | 'Cover_tree_layer'='%', 666 | 'Cover_shrub_layer'='%', 667 | 'Cover_herb_layer'='%', 668 | 'Cover_moss_layer'='%', 669 | 'Cover_lichen_layer'='%', 670 | 'Cover_algae_layer'='%', 671 | 'Cover_litter_layer'='%', 672 | 'Cover_bare_rocks'='%', 673 | 'Cover_cryptogams'='%', 674 | 'Cover_bare_soil'='%', 675 | 'Height_trees_highest'='m', 676 | 'Height_trees_lowest'='m', 677 | 'Height_shrubs_highest'='m', 678 | 'Height_shrubs_lowest'='m', 679 | 'Height_herbs_average'='cm', 680 | 'Height_herbs_lowest'='cm', 681 | 'Height_herbs_highest' = 'cm') 682 | um <- data.frame(Variable=names(um), `Unit of Measurement`=um) 683 | 684 | ``` 685 | 686 | Create table 2 687 | ```{r} 688 | table2 <- header.oa %>% 689 | dplyr::summarize_at(.vars=vars(!starts_with("PlotObservationID")), 690 | .funs = list(xxxNo.records=~sum(!is.na(.)), 691 | xxxType.of.variable=~ifelse("logical" %in% class(.), "b", 692 | ifelse("ordered" 
%in% class(.), 693 | "o", 694 | ifelse(any(class(.) %in% c("character", "factor")), 695 | "n", 696 | ifelse(class(.)=="Date", 697 | "d", 698 | "q")))), 699 | xxxLevels=~(ifelse(is.numeric(.)|lubridate::is.Date(.), 700 | paste(range(., na.rm=T), collapse=" - "), 701 | ifelse(is.ordered(.), 702 | paste(paste(1:nlevels(.), 703 | levels(.), sep=" = "), collapse=", "), 704 | ifelse(is.factor(.), 705 | paste(levels(.), collapse=", "), 706 | ifelse(is.logical(.), 707 | paste(names(table(.)), "=",table(.), 708 | collapse="; "), 709 | ""))))))) %>% 710 | gather(key="Variable") %>% 711 | separate(Variable, into = c("Variable", "feature"), sep="_xxx") %>% 712 | spread(key=feature, value = value) %>% 713 | rename(`Range/Levels`=Levels) %>% 714 | mutate(Variable=factor(Variable, levels=colnames(header.oa))) %>% 715 | arrange(Variable) %>% 716 | left_join(um, by="Variable") %>% 717 | mutate(Unit.of.Measurement=as.character(Unit.of.Measurement)) %>% 718 | replace_na(list(Unit.of.Measurement="")) %>% 719 | dplyr::select(Variable, `Range/Levels`, 720 | `Unit of Measurement`=Unit.of.Measurement, 721 | `Nr. of plots with information`=No.records, `Type`=Type.of.variable) 722 | 723 | write_csv(table2, "_output/Table2_header.csv") 724 | ``` 725 | 726 | ```{r, echo=F} 727 | knitr::kable(table2, 728 | caption="Table 2 - Variables in header") %>% 729 | kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), 730 | full_width = F, position = "center") 731 | ``` 732 | 733 | 734 | ## Sink tables for Manubot 735 | ```{r} 736 | out.file <- "_output/91.ManubotTables.md" 737 | readr::write_lines("## Supplementary Material {.page_break_before}\n", file = out.file) 738 | readr::write_lines("Table: List of databases contributing to sPlotOpen, the environmentally-balanced, open-access, global dataset of vegetation plots. Databases are ordered based on their ID in the Global Index of Vegetation Databases (GVID ID). {#tbl:Table1 tag='1'}\n", file = out.file, append=T) 739 | readr::write_lines("\n \n", file = out.file, append=T) 740 | kable1 <- kable(table1, format = "markdown") 741 | ## fix header table 1 742 | kable1[2] <- "|:------------|:--------------------------------------------|:--------------------|:--------------------|--------:|:--------|" 743 | readr::write_lines(kable1, file = out.file, append=T) 744 | readr::write_lines("\n \n \n", file = out.file, append=T) 745 | readr::write_lines("Table: Description of the variables contained in the ‘header’ matrix, together with their range (if numeric) or possible levels (if nominal or binary) and the number of non-empty (i.e., non NA) records. Variable types can be n - nominal (i.e., qualitative variable), o - ordinal, q - quantitative, or b - binary (i.e., boolean), or d - date {#tbl:Table2 tag='2'}. Additional details on the variables are in Bruelheide et al. (2019) [@doi:10.1111/jvs.12710]. GIVD codes derive from Dengler et al. (2011) [@doi:10.1111/j.1654-1103.2011.01265.x]. Biomes refer to Schultz 2005 [@doi:10.1007/3-540-28527-x], modified to include also the world mountain regions by Körner et al. (2017)[@doi:10.1007/s00035-016-0182-6]. The column ESY refers to the EUNIS Habitat Classification Expert system described in Chytrý et al. 
(2020) [@doi:10.1111/avsc.12519].\n", file = out.file, append=T) 746 | kable2 <- kable(table2, format = "markdown") 747 | ## fix header table 2 748 | kable2[2] <- "|:---------------------------|:-------------------------------------------------------------|:-------------------|:-----------|:-----|" 749 | readr::write_lines(kable2, file = out.file, append=T) 750 | ``` 751 | 752 | 753 | # SessionInfo 754 | ```{r} 755 | sessionInfo() 756 | ``` 757 | 758 | -------------------------------------------------------------------------------- /05_Demo.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Appendix 1 - sPlotOpen - Demo" 3 | author: "Francesco Maria Sabatini, Jonathan Lenoir, Helge Bruelheide" 4 | date: "07/04/2021" 5 | output: html_document 6 | --- 7 | 8 | Appendix to the paper: Sabatini, Lenoir et al., sPlotOpen – An environmentally-balanced, open-access, global dataset of vegetation plots. *Global Ecology and Biogeography*. 9 |
10 | 11 | This demo illustrates how to import and manipulate sPlotOpen data to create some basic graphics or tables, together with a reference list. As a worked example, the code below will: 12 | 13 | 1. select all plots containing at least one *Quercus* species from sPlotOpen's resampled iteration #1 14 | 2. show some summaries at the biome level 15 | 3. graph the distribution of the community weighted mean of a selected functional trait 16 | 4. show the geographical location of all selected plots 17 | 5. create a reference list based on the plots actually selected. 18 | 19 |
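As a side note before running the demo: besides the bundled `.RData` loaded below, the same objects are also exported as tab-delimited text files (`sPlotOpen_header.txt`, `sPlotOpen_DT.txt`, `sPlotOpen_CWM_CWV.txt`; cf. `998_fixSplotOpen.R`). A minimal sketch of an equivalent plain-text import is shown here; the `_sPlotOpenDB/` folder name is an assumption, so adjust the path to your local copy:

```{r, eval=F}
# Hypothetical alternative to load("_sPlotOpenDB/sPlotOpen.RData"):
# read the tab-delimited exports instead (eval=F, so this chunk is skipped when knitting)
library(readr)
header.oa  <- read_delim("_sPlotOpenDB/sPlotOpen_header.txt",  delim = "\t")
DT2.oa     <- read_delim("_sPlotOpenDB/sPlotOpen_DT.txt",      delim = "\t")
CWM_CWV.oa <- read_delim("_sPlotOpenDB/sPlotOpen_CWM_CWV.txt", delim = "\t")
```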
20 | 21 | ```{r, message=F, warning=F} 22 | #load libraries 23 | library(tidyverse) 24 | library(sf) 25 | library(raster) 26 | library(rnaturalearth) 27 | library(RefManageR) 28 | ``` 29 | 30 | ## Import data 31 | ```{r} 32 | load("_sPlotOpenDB/sPlotOpen.RData") 33 | ls() 34 | ``` 35 | ## Extract all plots containing at least a *Quercus* species 36 | Use only the first resampled iteration of sPlotOpen 37 | ```{r} 38 | #select only the first resample 39 | header.oa1 <- header.oa %>% 40 | filter(Resample_1 == T) 41 | DT2.oa1 <- DT2.oa %>% 42 | filter(PlotObservationID %in% header.oa1$PlotObservationID) 43 | CWM_CWV.oa1 <- CWM_CWV.oa %>% 44 | filter(PlotObservationID %in% header.oa1$PlotObservationID) 45 | ``` 46 | 47 | ```{r} 48 | #get all plots containing at least one Quercus species 49 | plotlist.quercus <- DT2.oa1 %>% 50 | filter(str_detect(Species, "^Quercus")) %>% 51 | distinct(PlotObservationID) %>% 52 | pull(PlotObservationID) 53 | 54 | header.quercus <- header.oa1 %>% 55 | filter(PlotObservationID %in% plotlist.quercus & 56 | Resample_1 == T) 57 | 58 | DT2.quercus <- DT2.oa1 %>% 59 | filter(PlotObservationID %in% plotlist.quercus) 60 | 61 | CWM_CWV.quercus <- CWM_CWV.oa1 %>% 62 | mutate(Quercus=ifelse(PlotObservationID %in% plotlist.quercus, T, F)) 63 | 64 | ``` 65 | There are `r length(plotlist.quercus)` plots containing at least a *Quercus* species in sPlotOpen's resampled iteration 1. 66 |
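To put this count in perspective, it can be expressed as a share of all plots in resampling iteration 1. This is a minimal sketch that only uses the objects created above:

```{r}
# percentage of iteration-1 plots that contain at least one Quercus species
round(length(plotlist.quercus) / nrow(header.oa1) * 100, 1)
```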
67 | 68 | ## Number of plots with *Quercus* across biomes 69 | Summarize the number of plots containing at least one *Quercus* species across biomes. 70 | 71 | ```{r} 72 | header.quercus %>% 73 | group_by(Biome) %>% 74 | summarize(n = n()) 75 | ``` 76 | \pagebreak 77 | 78 | ## Compare Community Weighted Means 79 | Compare the distribution of the community weighted mean of stem density between plots containing and not containing a *Quercus* species. 80 | 81 | ```{r, warning=F} 82 | ggplot(data = CWM_CWV.quercus) + 83 | geom_density(aes(x = StemDens_CWM, fill = Quercus), col = NA, alpha = 0.5) + 84 | theme_bw() 85 | ``` 86 | \pagebreak 87 | 88 | ## Geographical distribution of plots containing a *Quercus* species 89 |
90 | Download some spatial data of the world and create a template map using the r package `rnaturalearth`, first. Transform all geographical data to Eckert IV projection. 91 | ```{r, message=F, warning=F, results="hide"} 92 | countries <- ne_countries(returnclass = "sf") %>% 93 | st_transform(crs = "+proj=eck4") %>% 94 | st_geometry() 95 | graticules <- ne_download(type = "graticules_15", category = "physical", 96 | returnclass = "sf") %>% 97 | st_transform(crs = "+proj=eck4") %>% 98 | st_geometry() 99 | bb <- ne_download(type = "wgs84_bounding_box", category = "physical", 100 | returnclass = "sf") %>% 101 | st_transform(crs = "+proj=eck4") %>% 102 | st_geometry() 103 | ``` 104 | 105 | Template of Global map - with country borders 106 | ```{r, results="hide", warning=F, message=F} 107 | w3a <- ggplot() + 108 | geom_sf(data = bb, col = "grey20", fill = "white") + 109 | geom_sf(data = graticules, col = "grey20", lwd = 0.1) + 110 | geom_sf(data = countries, fill = "grey90", col = NA, lwd = 0.3) + 111 | coord_sf(crs = "+proj=eck4") + 112 | theme_minimal() + 113 | theme(axis.text = element_blank(), 114 | legend.title = element_text(size=12), 115 | legend.text = element_text(size=12), 116 | legend.background = element_rect(size = 0.1, linetype = "solid", colour = 1), 117 | legend.key.height = unit(1.1, "cm"), 118 | legend.key.width = unit(1.1, "cm")) 119 | ``` 120 | Project selected plots to Eckert IV and transform them to sf, before plotting. 121 | ```{r, results="hide", warning=F, message=F} 122 | header.quercus.sf <- SpatialPointsDataFrame(coords = header.quercus %>% 123 | dplyr::select(Longitude, Latitude), 124 | proj4string = CRS("+init=epsg:4326"), 125 | data=header.quercus %>% 126 | dplyr::select(-Longitude, -Latitude)) %>% 127 | st_as_sf() %>% 128 | st_transform(crs = "+proj=eck4") 129 | ``` 130 | 131 | Show all plots containing at least one *Quercus* species. Color code based on biomes. 
132 | ```{r, fig.width=8, fig.height=9, fig.align="center", warning=F, message=F} 133 | (Figure1a <- w3a + 134 | geom_sf(data = header.quercus.sf, aes(color = Biome), 135 | pch = 16, size = 0.8, alpha = 0.8) + 136 | geom_sf(data = countries, col = "grey20", fill=NA, lwd = 0.3) + 137 | theme(legend.position = "bottom", 138 | legend.title = element_blank()) + 139 | guides(color = guide_legend(ncol = 2, 140 | override.aes = list(size = 2)))) 141 | ``` 142 | \pagebreak 143 | 144 | # Create a reference list for selected plots 145 | Create reference list as BibText 146 | ```{r} 147 | sPlotOpen_citation(IDs=plotlist.quercus, level = "database", 148 | out.file = "_output/demo.bib") 149 | # show first few lines of output file 150 | read_lines("_output/demo.bib", n_max = 25) 151 | ``` 152 | 153 | Convert to reference list 154 | ```{r, warning=F} 155 | mybib <- RefManageR::ReadBib("_output/demo.bib", check = FALSE) 156 | mybib 157 | ``` 158 | 159 | ## sessionInfo() 160 | ```{r} 161 | sessionInfo() 162 | ``` 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | -------------------------------------------------------------------------------- /998_fixSplotOpen.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | 3 | ## Load sPlotOpen data 4 | load("~/share/groups/sPlot/releases/_sPlotOpenDB/v10/sPlotOpen.RData") 5 | 6 | # Fix DT2 and CWM based on Issue #7 7 | # https://github.com/fmsabatini/sPlotOpen_Code/issues/7 8 | 9 | ## Identify SALVIAS plots in sPlotOpen to be replaced 10 | tofix <- header.oa %>% 11 | filter(Dataset=="Salvias") %>% 12 | dplyr::select(PlotObservationID:Location_uncertainty) %>% 13 | left_join(metadata.oa %>% 14 | dplyr::select(PlotObservationID, Original_nr_in_database), 15 | by="PlotObservationID") 16 | 17 | ## Import SALVIAS corrected data 18 | header.salvias <- read_delim("_data/Update_SALVIAS/SALVIAS_correct_header.csv") %>% 19 | dplyr::select(New_PlotObservationID = PlotObservationID, 20 | Original_nr_in_database = `Original nr in database`) 21 | 22 | ## attach New Plot ID to tofix object 23 | tofix <- tofix %>% 24 | dplyr::select(PlotObservationID, Original_nr_in_database) %>% 25 | left_join(header.salvias, by="Original_nr_in_database") 26 | 27 | ## Import correct salvias DT table, and match new IDs with old IDs 28 | DT2.salvias0 <- read_delim("_data/Update_SALVIAS/SALVIAS_correct_species.csv") %>% 29 | rename(New_PlotObservationID = PlotObservationID) %>% 30 | left_join(tofix, by="New_PlotObservationID") %>% 31 | filter(!is.na(PlotObservationID)) 32 | 33 | ## double check species richness (they should be almost identical) 34 | 35 | checking <- DT2.oa %>% 36 | filter(PlotObservationID %in% tofix$PlotObservationID) %>% 37 | count(PlotObservationID) %>% 38 | left_join(DT2.salvias0 %>% 39 | count(PlotObservationID) %>% 40 | rename(n_new=n), 41 | by="PlotObservationID") 42 | plot(checking[,2:3]) 43 | ## There are minor mismatches, probably due to taxonomic standardization 44 | ## The most serious seems plots 56497 NADUGANI - Unclear why 45 | 46 | 47 | 48 | ### Taxonomic standardization 49 | # ## First Try - backengineer Backbone from sPlotOpen itself 50 | # bb.sPlotOpen <- DT2.oa %>% 51 | # filter(PlotObservationID %in% tofix$PlotObservationID) %>% 52 | # distinct(Species, Original_species) 53 | # # Check wheter all taxa from DT2.salvias can be resolved this way 54 | # DT2.salvias0 %>% 55 | # distinct(`Species name`) %>% 56 | # left_join(bb.sPlotOpen, 57 | # by=c("Species name" = "Original_species")) %>% 58 | # 
filter(is.na(Species)) 59 | # ## Too many unmatched entries --> 22,765 60 | 61 | ## Second Try - Load Backbone from sPlot 3 62 | #load("~/share/groups/sPlot/releases/sPlot2.1/backbone.splot2.1.try3.is.vascular.Rdata") 63 | load("~/share/groups/sPlot/releases/sPlot3.0/Backbone3.0.RData") 64 | 65 | 66 | DT2.salvias <- DT2.salvias0 %>% 67 | # some string cleaning 68 | mutate(Original_species=str_remove(`Species name`, pattern = " species$")) %>% 69 | mutate(Original_species=str_remove(Original_species, " \\[CM$| \\[Derris$| subf\\.$| RS$| SP\\:$| M$| NPZ$| LC$")) %>% 70 | # Try matching both with original species name, and with the clean version of the same string from the Backbone 71 | left_join(Backbone %>% 72 | dplyr::select(Name_sPlot_TRY, Name_short) %>% 73 | distinct(), 74 | by=c("Original_species" = "Name_sPlot_TRY")) %>% 75 | left_join(Backbone %>% 76 | dplyr::select(Name_submitted, Name_short2 = Name_short) %>% 77 | distinct(), 78 | by=c("Original_species" = "Name_submitted")) %>% 79 | mutate(Name_short=coalesce(Name_short, Name_short2)) %>% 80 | # replace all entries returning 'No suitable match' as NAs 81 | mutate(Name_short = replace(Name_short, 82 | list=str_detect(Name_short, "suitable"), 83 | values=NA)) %>% 84 | #distinct(Original_species, Name_short) %>% 85 | #filter(is.na(Name_short)) %>% 86 | #View() 87 | mutate(Relative_cover = NA) %>% 88 | dplyr::select(PlotObservationID, 89 | Species = Name_short, 90 | Original_species = `Species name`, 91 | #Layer, 92 | ## The column names of stem count and the TV2 species number seem reversed! 93 | Original_abundance = `TV2 species nr`, 94 | Abundance_scale = `Cover code`, 95 | Relative_cover, 96 | New_PlotObservationID) 97 | dim(DT2.salvias) 98 | 99 | DT2.salvias %>% 100 | distinct(Species, Original_species) %>% 101 | filter(is.na(Species)) %>% 102 | nrow() 103 | ## Using the backbone from sPlot3.0 only 31 taxa remain unmatched for a total of 931 species x plot obs 104 | 105 | ### Check for non-vascular plants 106 | #Complement taxon group using info from sPlot 3.0 107 | ## Assign genera to taxon group 108 | taxon.groups <- DT2.salvias %>% 109 | as_tibble() %>% 110 | distinct(Species) %>% 111 | mutate(species2=Species) %>% 112 | separate(species2, into=c("Genus"), sep=" ") %>% 113 | distinct(Species, Genus) %>% 114 | left_join(Backbone %>% 115 | distinct(Name_short, `Taxon group`) %>% 116 | rename(species=Name_short, 117 | Taxon.group=`Taxon group`) %>% 118 | separate(species, into=c("Genus"), sep=" ") %>% 119 | mutate(Taxon.group=as.character(Taxon.group)) %>% 120 | mutate(Taxon.group=ifelse(Taxon.group=="Unknown", NA, Taxon.group)) %>% 121 | distinct(Genus, .keep_all=T), 122 | by="Genus") %>% 123 | distinct(Species, .keep_all = T) %>% 124 | dplyr::select(-Genus) 125 | #27 Nas out of 7222 126 | 127 | DT2.salvias <- DT2.salvias %>% 128 | # attach taxon group info from Backbone 3.0 129 | left_join(taxon.groups, by="Species") %>% 130 | # exclue non vascular plants 131 | filter(is.na(Taxon.group) | 132 | !Taxon.group %in% c("Alga", "Lichen", "Moss")) %>% 133 | # calculate relative cover 134 | left_join({.} %>% 135 | group_by(PlotObservationID) %>% 136 | summarize(tot.cover=sum(Original_abundance), .groups = 'drop'), 137 | by=c("PlotObservationID")) %>% 138 | mutate(Relative_cover=Original_abundance/tot.cover) %>% 139 | dplyr::select(-tot.cover, -Taxon.group) 140 | ## Only one moss taxon, with three entries. 
Excluded 141 | dim(DT2.salvias) 142 | 143 | 144 | 145 | ### Delete wrong entries from DT2.oa and replace with correct salvias data 146 | DT2.oa.out <- DT2.oa %>% 147 | filter(!PlotObservationID %in% tofix$PlotObservationID) %>% 148 | bind_rows(DT2.salvias %>% 149 | dplyr::select(all_of(colnames(DT2.oa)))) %>% 150 | arrange(PlotObservationID, Species) 151 | 152 | dim(DT2.oa) 153 | dim(DT2.oa.out) 154 | 155 | 156 | ### Check what the 2 thousand rows were before 157 | #species in old version (with wrong SALVIA plots) but not in the corrected version 158 | setdiff(DT2.oa %>% distinct(Species) %>% arrange(Species) %>% pull(), 159 | DT2.oa.out %>% distinct(Species) %>% arrange(Species) %>% pull()) 160 | #504 species were excluded 161 | 162 | #species in the corrected version that were not present in the old version 163 | setdiff(DT2.oa.out %>% distinct(Species) %>% arrange(Species) %>% pull(), 164 | DT2.oa %>% distinct(Species) %>% arrange(Species) %>% pull()) 165 | #352 new species were included 166 | 167 | 168 | #selecting random tropical species for closer inspection 169 | header.oa %>% filter(PlotObservationID %in% 170 | c(DT2.oa %>% filter(Species == 'Bauhinia divaricata') %>% pull(PlotObservationID))) %>% 171 | View() 172 | 173 | 174 | header.oa %>% filter(PlotObservationID %in% 175 | c(DT2.oa.out %>% filter(Species == 'Tabebuia rosea') %>% pull(PlotObservationID))) %>% 176 | View() 177 | 178 | 179 | ### Compute CWM\CWV for DT2.salvias 180 | #Load species level gap-filled trait data 181 | load("~/share/groups/sPlot/releases/sPlot2.0/TRY.all.mean.sd.3.by.genus.species.Rdata") 182 | TRY <- TRY.all.mean.sd.3.by.genus.species 183 | 184 | #Merge species data table with traits, and calculate species coverage for each plot, both based on relative cover, and number of species having trait info. 185 | CWM_CWV.oa0.salvias <- DT2.oa.out %>% 186 | as.tbl() %>% 187 | dplyr::select(PlotObservationID, Species, Relative_cover) %>% 188 | left_join(TRY %>% 189 | dplyr::rename(Species=StandSpeciesName) %>% 190 | dplyr::select(Species, LeafArea.mean:Wood.vessel.length.mean), 191 | by="Species") %>% 192 | rename_at(.vars=vars(ends_with(".mean")), 193 | .funs=~gsub(pattern=".mean", replacement="", x=.)) 194 | 195 | # number of species with trait information. 
196 | CWM_CWV.oa0.salvias %>% 197 | distinct(Species, .keep_all = T) %>% 198 | filter(!is.na(SLA)) %>% 199 | nrow() 200 | 201 | # Calculate coverage for each trait in each plot 202 | CWM_CWV.oa2.salvias <- CWM_CWV.oa0.salvias %>% 203 | mutate_at(.vars = vars(LeafArea), 204 | .funs = list(~if_else(is.na(.),0,1) * Relative_cover)) %>% 205 | group_by(PlotObservationID) %>% 206 | summarize(TraitCoverage_cover=sum(LeafArea, na.rm=T), 207 | Species_richness=n(), 208 | TraitCoverage_pa=mean(LeafArea>0), 209 | .groups = 'drop') 210 | 211 | 212 | #Calculate CWM and CWV for each trait in each plot 213 | # Ancillary function to calculate CWV 214 | variance2.fun <- function(trait, abu){ 215 | res <- as.double(NA) 216 | #nam <- nam[!is.na(trait)] 217 | abu <- abu[!is.na(trait)] 218 | trait <- trait[!is.na(trait)] 219 | abu <- abu/sum(abu) 220 | if (length(trait)>1){ 221 | # you need more than 1 observation to calculate 222 | # skewness and kurtosis 223 | # for calculation see 224 | # http://r.789695.n4.nabble.com/Weighted-skewness-and-curtosis-td4709956.html 225 | m.trait <- weighted.mean(trait,abu) 226 | res <- sum(abu*(trait-m.trait)^2) 227 | } 228 | res 229 | } 230 | 231 | CWM_CWV.oa1.salvias <- CWM_CWV.oa0.salvias %>% 232 | group_by(PlotObservationID) %>% 233 | summarize_at(.vars= vars(LeafArea:Wood.vessel.length), 234 | .funs = list(CWM=~weighted.mean(., Relative_cover, na.rm=T), 235 | CWV=~variance2.fun(., Relative_cover))) 236 | 237 | #Assemble output 238 | CWM_CWV.oa.salvias <- header.oa %>% 239 | dplyr::select(PlotObservationID) %>% 240 | left_join(CWM_CWV.oa2.salvias, by="PlotObservationID") %>% 241 | left_join(CWM_CWV.oa1.salvias, by="PlotObservationID") %>% 242 | #Rename fields to follow convention 243 | rename_all(.funs=~gsub('\\.', '_', x = .)) 244 | 245 | dim(CWM_CWV.oa.salvias) 246 | dim(header.oa) 247 | 248 | ## Check CWM correlations 249 | plot(CWM_CWV.oa$LeafArea_CWM, 250 | CWM_CWV.oa.salvias$LeafArea_CWM, 251 | col = (CWM_CWV.oa %>% 252 | pull(PlotObservationID) %in% tofix$PlotObservationID + 1)) 253 | ## All identical, except SALVIAS data, marked in pink 254 | 255 | 256 | # Fix header.oa based on Issue #6 257 | # https://github.com/fmsabatini/sPlotOpen_Code/issues/6 258 | header.oa <- header.oa %>% 259 | mutate_at(vars(starts_with('Cover')), 260 | .funs=~(ifelse(. > 100, 100, .))) 261 | 262 | 263 | ### Export 264 | DT2.oa <- DT2.oa.out 265 | CWM_CWV.oa <- CWM_CWV.oa.salvias 266 | path <- "_sPlotOpenDBv20" 267 | save(DT2.oa, header.oa, metadata.oa, CWM_CWV.oa, reference.oa, sPlotOpen_citation, file = file.path(path, "sPlotOpen.RData")) 268 | 269 | ## Export to csv files 270 | write_delim(DT2.oa, file = file.path(path, "sPlotOpen_DT.txt"), delim="\t") 271 | write_delim(CWM_CWV.oa, file = file.path(path, "sPlotOpen_CWM_CWV.txt"), delim="\t") 272 | write_delim(header.oa, file = file.path(path, "sPlotOpen_header.txt"), delim="\t") 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # sPlotOpen_Code 2 | This project documents the construction of the sPlotOpen database. The whole workflow is described in RMarkdown, which contains also code for creating figures and tables of the sister project [sPlotOpen_Manuscript](https://github.com/fmsabatini/sPlotOpen_Manuscript). 3 |
4 | **DATA VERSION 2.0 - All samples from draws 1, 2 and 3.** 5 |
6 | 7 | + **html Report on database construction** at https://htmlpreview.github.io/?https://github.com/fmsabatini/sPlotOpen_Code/blob/Res123/_public/02_BuildDataset.html 8 | 9 | This is an initiative by [sPlot - The Global Vegetation-Plot Database](https://www.idiv.de/en/splot.html) 10 | -------------------------------------------------------------------------------- /_public/05_Demo.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmsabatini/sPlotOpen_Code/06fa267d1fb0cf7d7a167a2c5bb1e3bb393ec7a1/_public/05_Demo.pdf -------------------------------------------------------------------------------- /_resampling/01_running_a_global_PCA_on_bioclimatic_and_soil_variables.R: -------------------------------------------------------------------------------- 1 | # 2 | # 27 January 2017 3 | # 4 | library(raster) 5 | library(fBasics) 6 | library(factoextra) 7 | library(maptools) 8 | library(RColorBrewer) 9 | library(gridExtra) 10 | # 11 | data(wrld_simpl) 12 | # 13 | # Load all bioclimatic and soil variables at 2.5 arc minute resolution across terrestrial Earth 14 | # 15 | load("splot.world2.RData") 16 | ls() # splot.world2 17 | # 18 | # Run the PCA on the entire matrix of environmental variables (climate + soil) 19 | # 20 | pca3 <- prcomp(splot.world2[, c(5:15,17:35)], center=T, scale=T) 21 | # 22 | save(pca3, file="pca3.RData") 23 | # 24 | # Check summary statistics and interpret the PCA axes 25 | # 26 | summary(pca3) # Three first PCA axes account for 75% of the total inertia with 47% and 23% loadings on the first and second PCA axes, respectively 27 | var <- get_pca_var(pca3) 28 | var$coord[, 1:3] 29 | var_cor_func <- function(var.loadings, comp.sdev){var.loadings*comp.sdev} 30 | loadings <- pca3$rotation 31 | sdev <- pca3$sdev 32 | var.cor <- t(apply(loadings, 1, var_cor_func, sdev)) 33 | var.cor[, 1:3] 34 | var.cos2 <- var.cor^2 35 | comp.cos2 <- apply(var.cos2, 2, sum) 36 | contrib <- function(var.cos2, comp.cos2){var.cos2*100/comp.cos2} 37 | var.contrib <- t(apply(var.cos2, 1, contrib, comp.cos2)) 38 | var.contrib[, 1:3] 39 | # 40 | # Map the first PCA axes in space at 2.5 arc minute resolution 41 | # 42 | rgeo <- raster(nrows=360, ncols=720, xmn=-180, xmx=180, ymn=-90, ymx=90) # raster at half a degree resolution (cf. 
30 arc minute resolution) 43 | rgeo <- disaggregate(rgeo, fact=12) # raster at 2.5 arc minute resolution 44 | splot.world2$cellID <- cellFromXY(rgeo, cbind(splot.world2$RAST_X, splot.world2$RAST_Y)) 45 | # 46 | posit <- splot.world2$cellID 47 | temp <- getValues(rgeo) 48 | # 49 | temp[posit] <- pca3$x[, 1] 50 | PC1_r <- setValues(rgeo, temp) 51 | ramp_PC1 <- colorRampPalette(brewer.pal(11, "Spectral")) 52 | breaks_PC1 <- quantile(PC1_r, probs=seq(0, 1, 0.01)) 53 | # 54 | save(PC1_r, file="PC1_r.RData") 55 | # 56 | temp[posit] <- pca3$x[, 2] 57 | PC2_r <- setValues(rgeo, temp) 58 | ramp_PC2 <- colorRampPalette(brewer.pal(11, "Spectral")) 59 | breaks_PC2 <- quantile(PC2_r, probs=seq(0, 1, 0.01)) 60 | # 61 | save(PC2_r, file="PC2_r.RData") 62 | # 63 | temp[posit] <- pca3$x[, 3] 64 | PC3_r <- setValues(rgeo, temp) 65 | ramp_PC3 <- colorRampPalette(brewer.pal(11, "Spectral")) 66 | breaks_PC3 <- quantile(PC3_r, probs=seq(0, 1, 0.01)) 67 | # 68 | save(PC3_r, file="PC3_r.RData") 69 | # 70 | # Plot PCA outputs 71 | # 72 | tiff(filename="PCA_outputs.tiff", width=24, height=8, res=300, unit="cm") 73 | p1 <- fviz_screeplot(pca3, ncp=10) 74 | p2 <- fviz_pca_var(pca3, axes=c(1, 2)) 75 | p3 <- fviz_pca_var(pca3, axes=c(2, 3)) 76 | grid.arrange(p1, p2, p3, ncol=3, nrow=1) 77 | multiplot(p1, p2, p3, cols=3) 78 | dev.off() 79 | # 80 | # Plot the global distribution of PCA axis 1 at 2.5 arc minute resolution 81 | # 82 | tiff(filename="PCA_axis1_at_2.5_arc_minute.tiff", width=20, height=12, res=300, unit="cm") 83 | par(mar=c(4, 4, 4, 1)) 84 | plot(PC1_r, breaks=breaks_PC1, col=rev(ramp_PC1(100)), legend=FALSE) 85 | plot(PC1_r, breaks=breaks_PC1, col=rev(ramp_PC1(100)), legend.only=TRUE, legend.width=1, legend.shrink=0.75, axis.args=list(at=round(seq(min(breaks_PC1), max(breaks_PC1), length.out=10)), cex.axis=0.6), legend.args=list(text="PC1", side=3, font=2, line=0.4, cex=0.8)) 86 | plot(wrld_simpl, add=T, border="darkgrey", lwd=0.1) 87 | title(main="First PCA axis (cold and seasonal to hot and stable) \nat 2.5 arc minute resolution") 88 | dev.off() 89 | # 90 | # Plot the global distribution of PCA axis 2 at 2.5 arc minute resolution 91 | # 92 | tiff(filename="PCA_axis2_at_2.5_arc_minute.tiff", width=20, height=12, res=300, unit="cm") 93 | par(mar=c(4, 4, 4, 1)) 94 | plot(PC2_r, breaks=breaks_PC2, col=ramp_PC2(100), legend=FALSE) 95 | plot(PC2_r, breaks=breaks_PC2, col=ramp_PC2(100), legend.only=TRUE, legend.width=1, legend.shrink=0.75, axis.args=list(at=round(seq(min(breaks_PC2), max(breaks_PC2), length.out=10)), cex.axis=0.6), legend.args=list(text="PC2", side=3, font=2, line=0.4, cex=0.8)) 96 | plot(wrld_simpl, add=T, border="darkgrey", lwd=0.1) 97 | title(main="Second PCA axis (dry to wet) \nat 2.5 arc minute resolution") 98 | dev.off() 99 | # 100 | # Plot the global distribution of PCA axis 3 at 2.5 arc minute resolution 101 | # 102 | tiff(filename="PCA_axis3_at_2.5_arc_minute.tiff", width=20, height=12, res=300, unit="cm") 103 | par(mar=c(4, 4, 4, 1)) 104 | plot(PC3_r, breaks=breaks_PC3, col=ramp_PC3(100), legend=FALSE) 105 | plot(PC3_r, breaks=breaks_PC3, col=ramp_PC3(100), legend.only=TRUE, legend.width=1, legend.shrink=0.75, axis.args=list(at=round(seq(min(breaks_PC3), max(breaks_PC3), length.out=10)), cex.axis=0.6), legend.args=list(text="PC2", side=3, font=2, line=0.4, cex=0.8)) 106 | plot(wrld_simpl, add=T, border="darkgrey", lwd=0.1) 107 | title(main="Third PCA axis (sandy to clayey) \nat 2.5 arc minute resolution") 108 | dev.off() 109 | # 110 | 
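# Optional check (sketch): the share of total variance captured by each PCA axis
# can be read directly from pca3$sdev; the same computation is used in
# 04_FiguresTables.Rmd to label the PC1/PC2 axes of Figure 2.
varexpl <- round(pca3$sdev^2 / sum(pca3$sdev^2) * 100, 1)
varexpl[1:3]  # percentage of total inertia on PC1, PC2 and PC3
#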
-------------------------------------------------------------------------------- /_resampling/02_resampling_sPlot_within_the_PC1-PC2_environmental_space.R: -------------------------------------------------------------------------------- 1 | # 2 | # 01 February 2017 3 | # 4 | library(spdep) 5 | library(raster) 6 | library(maptools) 7 | library(fBasics) 8 | library(devtools) 9 | library(parallelsugar) 10 | library(colorRamps) 11 | library(Rcpp) 12 | library(bigmemory) 13 | library(RcppArmadillo) 14 | library(RcppParallel) 15 | # 16 | data(wrld_simpl) 17 | # 18 | # Loading the header data from the sPlot database v2.1 19 | # 20 | load("sPlot_header_20161124.RData") 21 | ls() # header 22 | dim(header) # 1121244 relevés and 51 variables 23 | length(which(is.na(header$Longitude))) # 558 relevés without longitude coordinates 24 | length(which(is.na(header$Latitude))) # 558 relevés without latitude coordinates 25 | posit <- which(is.na(header$Longitude)&is.na(header$Latitude)) 26 | plot_data <- header[-posit, ] 27 | dim(plot_data) # 1120686 relevés and 51 variables 28 | rm(header) 29 | # 30 | # Make plot_data as a spatial dataframe 31 | # 32 | CRSlonlat <- CRS("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs +towgs84=0,0,0") 33 | coords <- cbind(plot_data$Longitude, plot_data$Latitude) 34 | coords <- SpatialPoints(coords, proj4string=CRSlonlat) 35 | plot_data <- SpatialPointsDataFrame(coords, plot_data, proj4string=CRSlonlat) 36 | class(plot_data) 37 | # 38 | # Compute the initial sampling effort across the geographical space per spatial unit of 2.5 arc minute 39 | # 40 | rgeo <- raster(nrows=360, ncols=720, xmn=-180, xmx=180, ymn=-90, ymx=90) # raster at half a degree resolution (cf. 30 arc minute resolution) 41 | rgeo <- disaggregate(rgeo, fact=12) # raster at 2.5 arc minute resolution hich is about 5 km at the equator (25 km2 is the approximate area of a spatial unit at the equator) 42 | init_seff_rgeo <- rasterize(plot_data@data[, c("POINT_X", "POINT_Y")], rgeo, fun="count") 43 | sum(getValues(init_seff_rgeo), na.rm=TRUE) # 1120686 relevés 44 | # 45 | # Remove plots for which location accuracy is above 2821 m (cf. 
radius of a buffer circle of 25 km2 around the plot location) 46 | # 47 | plot_data <- plot_data@data 48 | buffer <- round(sqrt(25/pi)*1000) 49 | posit <- which(plot_data[, "Location uncertainty (m)"]>buffer) # Be careful, there are 261 plots without location uncertainty but with coordinates that we will keep in the end 50 | length(posit) # 280224 relevés for which location accuracy exceeds a radius of 2821 m 51 | plot_data <- plot_data[-posit, ] 52 | dim(plot_data) # 840462 relevés and 51 variables 53 | # 54 | # Remove plots from wetlands 55 | # 56 | posit <- which(is.na(plot_data$Grassland)&is.na(plot_data$Forest)&is.na(plot_data$Shrubland)&is.na(plot_data$Sparse.vegetation)&plot_data$Wetland==1) 57 | length(posit) # 8323 relevés that are pure wetlands 58 | plot_data <- plot_data[-posit, ] 59 | dim(plot_data) # 832139 relevés and 51 variables 60 | # 61 | # Remove plots from anthropogenic vegetation types 62 | # 63 | posit <- which(plot_data$Naturalness==3) 64 | length(posit) # 30942 relevés from anthropogenic vegetation types 65 | plot_data <- plot_data[-posit, ] 66 | dim(plot_data) # 801197 relevés and 51 variables 67 | # 68 | # Import DT2 which is the full dataset (free of non-vascular plants) with species composition 69 | # 70 | load("DT2_20161025.RData") 71 | dim(DT2) # 22195966 species occurrences and 7 variables 72 | # 73 | # Match it with the plot_data dataframe 74 | # 75 | length(unique(DT2$PlotObservationID)) # 1117369 relevés 76 | length(unique(DT2$PlotObservationID))-length(unique(plot_data$PlotObservationID)) # 316172 relevés in DT2 but missing from plot_data 77 | posit <- match(DT2$PlotObservationID, plot_data$PlotObservationID) 78 | any(is.na(posit)) # TRUE: some relevés (n = 316172) in DT2 are not in plot_data 79 | length(DT2$PlotObservationID[is.na(posit)]) # 6168698 rows in DT2 corresponding to the species lists of the relevés missing from plot_data 80 | DT2 <- DT2[is.finite(posit), ] 81 | length(unique(DT2$PlotObservationID)) # 799400 relevés 82 | posit <- match(plot_data$PlotObservationID, DT2$PlotObservationID) 83 | any(is.na(posit)) # TRUE: some relevés (n = 1797) in plot_data are not in DT2 (cf. plots with only non-vascular plants?) 84 | length(plot_data$PlotObservationID[is.na(posit)]) # 1797 relevés in plot_data are not in DT2 (cf. plots with only non-vascular plants?) 
85 | plot_data <- plot_data[is.finite(posit), ] 86 | length(unique(plot_data$PlotObservationID)) # 799400 relevés which matches with DT2 87 | # 88 | save(plot_data, file="plot_data.RData") 89 | # 90 | # Make plot_data as a spatial dataframe again 91 | # 92 | coords <- cbind(plot_data$Longitude, plot_data$Latitude) 93 | coords <- SpatialPoints(coords, proj4string=CRSlonlat) 94 | plot_data <- SpatialPointsDataFrame(coords, plot_data, proj4string=CRSlonlat) 95 | class(plot_data) 96 | # 97 | # Check for relevés with identical spatial coordinates (just for information) 98 | # 99 | coordID <- paste(plot_data@data$Longitude, plot_data@data$Latitude, sep=":") 100 | length(coordID) # 799400 relevés 101 | length(unique(coordID)) # 509977 relevés with unique coordinates (about 64% of the relevés have unique coordinates) 102 | # 103 | # Plot the global sampling effort per spatial unit of 2.5 arc minute 104 | # 105 | seff_rgeo <- rasterize(plot_data@data[, c("POINT_X", "POINT_Y")], rgeo, fun="count") 106 | sum(getValues(seff_rgeo), na.rm=TRUE) # 799400 relevés 107 | tiff(filename="Sampling_effort_at_2.5_arc_minute.tiff", width=20, height=12, res=300, unit="cm") 108 | par(mar=c(4, 4, 4, 1)) 109 | plot(log(seff_rgeo), legend=FALSE, asp=0, col=rev(divPalette(n=100, name="Spectral")), xlab="Longitude", ylab="Latitude") 110 | plot(log(seff_rgeo), legend.only=TRUE, col=rev(divPalette(n=100, name="Spectral")), legend.width=1, legend.shrink=0.75, axis.args=list(at=seq(log(minValue(seff_rgeo)), log(maxValue(seff_rgeo)), length.out=5), labels=round(exp(seq(log(minValue(seff_rgeo)), log(maxValue(seff_rgeo)), length.out=5))), cex.axis=0.6), legend.args=list(text="N", side=3, font=2, line=0, cex=0.8)) 111 | plot(wrld_simpl, add=T, border="darkgrey", lwd=0.1) 112 | title(main="Number (log-scale) of plots \nper 2.5 arc-minute spatial unit") 113 | dev.off() 114 | # 115 | # Plot the global sampling effort per spatial unit of 0.5 degree 116 | # 117 | seff_rgeo <- aggregate(seff_rgeo, fact=12, fun=sum) 118 | sum(getValues(seff_rgeo), na.rm=TRUE) # 799400 relevés 119 | tiff(filename="Sampling_effort_at_0.5_degree.tiff", width=20, height=12, res=300, unit="cm") 120 | par(mar=c(4, 4, 4, 1)) 121 | plot(log(seff_rgeo), legend=FALSE, asp=0, col=rev(divPalette(n=100, name="Spectral")), xlab="Longitude", ylab="Latitude") 122 | plot(log(seff_rgeo), legend.only=TRUE, col=rev(divPalette(n=100, name="Spectral")), legend.width=1, legend.shrink=0.75, axis.args=list(at=seq(log(minValue(seff_rgeo)), log(maxValue(seff_rgeo)), length.out=5), labels=round(exp(seq(log(minValue(seff_rgeo)), log(maxValue(seff_rgeo)), length.out=5))), cex.axis=0.6), legend.args=list(text="N", side=3, font=2, line=0, cex=0.8)) 123 | plot(wrld_simpl, add=T, border="darkgrey", lwd=0.1) 124 | title(main="Number (log-scale) of plots \nper 30 arc-minute spatial unit") 125 | dev.off() 126 | # 127 | # Plot the global sampling effort per spatial unit of 1 degree 128 | # 129 | seff_rgeo <- aggregate(seff_rgeo, fact=2, fun=sum) 130 | sum(getValues(seff_rgeo), na.rm=TRUE) # 799400 relevés 131 | tiff(filename="Sampling_effort_at_1_degree.tiff", width=20, height=12, res=300, unit="cm") 132 | par(mar=c(4, 4, 4, 1)) 133 | plot(log(seff_rgeo), legend=FALSE, asp=0, col=rev(divPalette(n=100, name="Spectral")), xlab="Longitude", ylab="Latitude") 134 | plot(log(seff_rgeo), legend.only=TRUE, col=rev(divPalette(n=100, name="Spectral")), legend.width=1, legend.shrink=0.75, axis.args=list(at=seq(log(minValue(seff_rgeo)), log(maxValue(seff_rgeo)), length.out=5), 
labels=round(exp(seq(log(minValue(seff_rgeo)), log(maxValue(seff_rgeo)), length.out=5))), cex.axis=0.6), legend.args=list(text="N", side=3, font=2, line=0, cex=0.8)) 135 | plot(wrld_simpl, add=T, border="darkgrey", lwd=0.1) 136 | title(main="Number (log-scale) of plots \nper 1 degree spatial unit") 137 | dev.off() 138 | # 139 | # Plot the global sampling effort per spatial unit of 2 degrees 140 | # 141 | seff_rgeo <- aggregate(seff_rgeo, fact=2, fun=sum) 142 | sum(getValues(seff_rgeo), na.rm=TRUE) # 799400 relevés 143 | tiff(filename="Sampling_effort_at_2_degrees.tiff", width=20, height=12, res=300, unit="cm") 144 | par(mar=c(4, 4, 4, 1)) 145 | plot(log(seff_rgeo), legend=FALSE, asp=0, col=rev(divPalette(n=100, name="Spectral")), xlab="Longitude", ylab="Latitude") 146 | plot(log(seff_rgeo), legend.only=TRUE, col=rev(divPalette(n=100, name="Spectral")), legend.width=1, legend.shrink=0.75, axis.args=list(at=seq(log(minValue(seff_rgeo)), log(maxValue(seff_rgeo)), length.out=5), labels=round(exp(seq(log(minValue(seff_rgeo)), log(maxValue(seff_rgeo)), length.out=5))), cex.axis=0.6), legend.args=list(text="N", side=3, font=2, line=0, cex=0.8)) 147 | plot(wrld_simpl, add=T, border="darkgrey", lwd=0.1) 148 | title(main="Number (log-scale) of plots \nper 2 degrees spatial unit") 149 | dev.off() 150 | # 151 | # Plot the difference with the initial sampling effort per spatial unit of 2 degrees 152 | # 153 | init_seff_rgeo <- aggregate(init_seff_rgeo, fact=48, fun=sum) 154 | sum(getValues(init_seff_rgeo), na.rm=TRUE) # 1120686 relevés 155 | diff_seff_rgeo <- init_seff_rgeo-seff_rgeo 156 | sum(getValues(diff_seff_rgeo), na.rm=TRUE) # 316728 relevés lost after data filtering 157 | tiff(filename="Plot_loss_at_2_degrees.tiff", width=20, height=12, res=300, unit="cm") 158 | par(mar=c(4, 4, 4, 1)) 159 | plot(log(diff_seff_rgeo+1), legend=FALSE, asp=0, col=c("grey", rev(divPalette(n=99, name="Spectral"))), xlab="Longitude", ylab="Latitude") 160 | plot(log(diff_seff_rgeo+1), legend.only=TRUE, col=c("grey", rev(divPalette(n=99, name="Spectral"))), legend.width=1, legend.shrink=0.75, axis.args=list(at=seq(0, log(maxValue(diff_seff_rgeo)+1), length.out=5), labels=round(seq(0, exp(log(maxValue(diff_seff_rgeo)+1)), length.out=5)), cex.axis=0.6), legend.args=list(text="N", side=3, font=2, line=0, cex=0.8)) 161 | plot(wrld_simpl, add=T, border="darkgrey", lwd=0.1) 162 | title(main="Number (log-scale) of plots \nper 2 degrees spatial unit") 163 | dev.off() 164 | # 165 | # Compute the global sampling effort across the bivariate (PC1-PC2) environmental space (not the geographical space) 166 | # 167 | load("PC1_r.RData") 168 | load("PC2_r.RData") 169 | plot_data@data$cellID <- cellFromXY(rgeo, cbind(plot_data@data$POINT_X, plot_data@data$POINT_Y)) 170 | plot_data@data$pc1_val <- extract(PC1_r, coordinates(plot_data)) 171 | plot_data@data$pc2_val <- extract(PC2_r, coordinates(plot_data)) 172 | load("pca3.RData") 173 | res <- 100 # Setting the number of bins per PCA axis to 100 174 | reco <- raster(nrows=res, ncols=res, xmn=min(pca3$x[, 1]), xmx=max(pca3$x[, 1]), ymn=min(pca3$x[, 2]), ymx=max(pca3$x[, 2])) 175 | PC1_PC2_r <- rasterize(pca3$x[, 1:2], reco, fun="count") # Compute the density of geographic grid cells across the entire bivariate (PC1-PC2) environmental space 176 | sPlot_reco <- rasterize(plot_data@data[, c("pc1_val", "pc2_val")], reco, fun="count") # Compute the sampling effort (number of vegetation plots) per environmental unit (cell) across the entire bivariate (PC1-PC2) environmental space 177 | temp1 <- 
getValues(PC1_PC2_r) 178 | temp1[!is.na(temp1)] <- 0 # Put zero values for the empty cells (cf. there is no existing terrestrial grid cell available on Earth for the focal PC1-PC2 grid cell condition) 179 | temp2 <- getValues(sPlot_reco) 180 | temp2[which(temp1==0&is.na(temp2))] <- 0 # Put zero values for the empty cells (cf. there is no vegeteation plots available for those environmental conditions: gaps) 181 | sPlot_reco <- setValues(reco, temp2) 182 | # 183 | # Plot the number of 2.5 arc-minute cells for each cell of the PC1-PC2 space 184 | # 185 | tiff(filename="Global_availability_PC1-PC2.tiff", width=12, height=12, res=300, unit="cm") 186 | par(mar=c(4, 4, 4, 1)) 187 | plot(log(PC1_PC2_r), asp=0, col=rev(divPalette(n=100, name="RdBu")), xlab="PC1 (cold and seasonal to hot and stable)", ylab="PC2 (dry to wet)", legend=FALSE) 188 | plot(log(PC1_PC2_r), asp=0, col=rev(divPalette(n=100, name="RdBu")), legend.only=TRUE, legend.width=1, legend.shrink=0.75, axis.args=list(at=seq(log(minValue(PC1_PC2_r)), log(maxValue(PC1_PC2_r)), length.out=5), labels=round(seq(exp(log(minValue(PC1_PC2_r))), exp(log(maxValue(PC1_PC2_r))), length.out=5)), cex.axis=0.6), legend.args=list(text="N", side=3, font=2, line=0, cex=0.8)) 189 | title(main="Number of 2.5 arc-minute spatial units \nper environmental cell (log scale)") 190 | dev.off() 191 | # 192 | # Plot the number of sPlot relevés for each cell of the PC1-PC2 space 193 | # 194 | tiff(filename="Sampling_effort_PC1-PC2.tiff", width=12, height=12, res=300, unit="cm") 195 | par(mar=c(4, 4, 4, 1)) 196 | plot(log(sPlot_reco+1), asp=0, col=c("grey", rev(divPalette(n=99, name="RdBu"))), xlab="PC1 (cold and seasonal to hot and stable)", ylab="PC2 (dry to wet)", legend=FALSE) 197 | plot(log(sPlot_reco+1), asp=0, col=c("grey", rev(divPalette(n=99, name="RdBu"))), legend.only=TRUE, legend.width=1, legend.shrink=0.75, axis.args=list(at=seq(0, log(maxValue(sPlot_reco)+1), length.out=5), labels=round(seq(0, exp(log(maxValue(sPlot_reco)+1)), length.out=5)), cex.axis=0.6), legend.args=list(text="N", side=3, font=2, line=0, cex=0.8)) 198 | title(main="Number of sPlot relevés \nper environmental cell (log scale)") 199 | dev.off() 200 | # 201 | # Plot for each cell of the PC1-PC2 space the ratio between the relative proportion of sPlot relevés and the relative proportion of spatial units available worldwide 202 | # 203 | tiff(filename="Sampling_effort_ratio_PC1-PC2.tiff", width=12, height=12, res=300, unit="cm") 204 | par(mar=c(4, 4, 4, 1)) 205 | ratio_reco <- (sPlot_reco/max(getValues(sPlot_reco), na.rm=T))/(PC1_PC2_r/max(getValues(PC1_PC2_r), na.rm=T)) 206 | plot(log(ratio_reco+1), asp=0, col=c("grey", rev(divPalette(n=99, name="Spectral"))), xlab="PC1 cold and seasonal to hot and stable)", ylab="PC2 (dry to wet)") 207 | title(main="Oversampled (>0.69) versus \nundersampled (<0.69) PC1-PC2 cells") 208 | dev.off() 209 | # 210 | # Run a sensitivity analysis to define the most appropriate resolution of the bivariate (PC1-PC2) environmental space 211 | # 212 | res <- seq(10, 500, 10) 213 | ncell_disp <- c() 214 | ncell_samp <- c() 215 | seff_med <- c() 216 | seff_mean <- c() 217 | seff_max <- c() 218 | seff_min <- c() 219 | nbrel_sel <- c() 220 | for (i in 1:length(res)) { 221 | print(paste(i, "of", length(res), sep=" ")) 222 | r <- raster(nrows=res[i], ncols=res[i], xmn=min(pca3$x[, 1]), xmx=max(pca3$x[, 1]), ymn=min(pca3$x[, 2]), ymx=max(pca3$x[, 2])) 223 | temp <- rasterize(pca3$x[, 1:2], r, fun="count") 224 | ncell_disp <- c(ncell_disp, 
length(which(getValues(temp)>0))) 225 | temp <- rasterize(plot_data@data[, c("pc1_val", "pc2_val")], r, fun="count") 226 | temp <- getValues(temp) 227 | temp <- na.omit(temp) 228 | ncell_samp <- c(ncell_samp, length(which(temp)>0)) 229 | seff_med <- c(seff_med, median(temp)) 230 | seff_mean <- c(seff_mean, mean(temp)) 231 | seff_max <- c(seff_max, max(temp)) 232 | seff_min <- c(seff_min, min(temp)) 233 | nbrel_sel <- c(nbrel_sel, length(temp[which(temp>median(temp)), ])*median(temp)+sum(temp[which(temp<=median(temp)), ])) 234 | } 235 | plot(res, seff_med) 236 | plot(res, seff_max) 237 | plot(res, seff_mean) 238 | plot(res, nbrel_sel) 239 | plot(res, ncell_samp/ncell_disp, ylim=c(0, 1)) 240 | # 241 | # Resample sPlot within the PC1-PC2 environmental space to get an environmentally-balanced subset 242 | # 243 | plot_data <- plot_data@data 244 | save(plot_data, file="plot_data.RData") # Save the latest version of plot_data 245 | Sys.setenv("PKG_CXXFLAGS"="-fopenmp") # Set environment variables for other processes called from within R 246 | sourceCpp("/_functions_TH/bray.part.OpenMP.cpp") # Source C++ fonctions written by Tarek Hattab from folder "_functions_TH" 247 | sourceCpp("/_functions_TH/bray.part.C_RcppParallel.cpp") # Source C++ fonctions written by Tarek Hattab from folder "_functions_TH" 248 | sourceCpp("/_functions_TH/hcr.C.cpp") # Source C++ fonctions written by Tarek Hattab from folder "_functions_TH" 249 | sourceCpp("/_functions_TH/cast_binary.cpp") # Source C++ fonctions written by Tarek Hattab from folder "_functions_TH" 250 | BigBrayPart <- function(bigMat) { 251 | zeros <- big.matrix(nrow=nrow(bigMat), ncol=nrow(bigMat), init=0, type=typeof(bigMat), shared=FALSE, backingfile=paste("BrayMatrix_",i,sep=""), backingpath=getwd(), descriptorfile=paste("BrayMatrix_",i,".desc",sep="")) 252 | bray_distance_OpenMP(bigMat@address, zeros@address) 253 | return(zeros) 254 | } 255 | res <- 100 # Set the resolution of the environmental space based on the sensitivity analysis 256 | r <- raster(nrows=res, ncols=res, xmn=min(pca3$x[, 1]), xmx=max(pca3$x[, 1]), ymn=min(pca3$x[, 2]), ymx=max(pca3$x[, 2])) # Prepare the environmental space restricted to sPlot relevés only (not the entire environmental space available at a global extent) 257 | pca_sPlot_r <- rasterize(plot_data[, c("pc1_val", "pc2_val")], r, fun="count") 258 | cutoff <- median(values(pca_sPlot_r), na.rm=TRUE) # Compute the cutoff value above which relevés have to be resampled for a given cell 259 | tempZoneOut <- coordinates(pca_sPlot_r)[which(values(pca_sPlot_r)>cutoff), ] # Select only the coordinates of the environmental cells for which the total number of sPlot relevés available exceeds the cutoff value 260 | repet <- 100 # Set the number of repetitions for the HCR function 261 | sp_data <- DT2[, c(1, 2, 7)] # Prepare the species data table that will be used by the HCR approach 262 | names(sp_data) <- c("plot_id", "sp_name", "rel_cov") 263 | save(sp_data, file="sp_data.RData") # Save the latest version of sp_data 264 | plotToRemove <- as.list(rep(NA, repet)) # Prepare an empty object to store the IDs of the relevés to be removed 265 | for (i in 1:nrow(tempZoneOut)) { 266 | print("--------") 267 | print(paste(i, "out of", nrow(tempZoneOut), "cells", sep=" ")) 268 | plot(pca_sPlot_r, asp=0, xlab="PC1 (cold and seasonal to hot and stable)", ylab="PC2 (dry to wet)") 269 | points(tempZoneOut[, 1], tempZoneOut[, 2], cex=0.5) 270 | points(tempZoneOut[i, 1], tempZoneOut[i, 2], col="red", pch=19) 271 | sel.plot <- 
which(plot_data$pc1_val > tempZoneOut[i, 1]-(res(r)[1]/2) & 272 | plot_data$pc1_val < tempZoneOut[i, 1]+(res(r)[1]/2) & 273 | plot_data$pc2_val > tempZoneOut[i, 2]-(res(r)[2]/2) & 274 | plot_data$pc2_val < tempZoneOut[i, 2]+(res(r)[2]/2)) 275 | print(paste("This cell contains", length(sel.plot), "relevés", sep=" ")) 276 | idZoneOut <- plot_data[sel.plot, "PlotID"] 277 | sel.comm <- sp_data[which(sp_data$plot_id%in%idZoneOut), c("plot_id", "sp_name", "rel_cov")] 278 | sel.comm <- na.omit(sel.comm) 279 | sel.comm [, 2]<- factor(sel.comm[, 2], labels=seq(1:length(unique(sel.comm[, 2])))) 280 | sel.comm [, 2]<- as.numeric(sel.comm[, 2]) 281 | comm.data <- castC(iD=sel.comm[, 1], sp=sel.comm[, 2], cov=sel.comm[, 3]) 282 | rowNames <- comm.data[, 1] 283 | comm.data <- comm.data[, -1] 284 | print(paste("The total number of species is", dim(comm.data)[[2]], sep=" ")) 285 | gc() 286 | if (nrow(comm.data) > 20000) { 287 | bigComMatrix <- as.big.matrix(comm.data,shared=FALSE, backingfile=paste("Matrix_",i,sep=""), backingpath=getwd(), descriptorfile=paste("Matrix_", i, ".desc", sep="")) ; brayBalDist <- BigBrayPart(bigComMatrix) 288 | } else { 289 | brayBalDist <- bray_distance_RcppParallel(comm.data); brayBalDist <- as.big.matrix(brayBalDist) 290 | } 291 | for (j in 1:repet) { 292 | selectedPlot <- HcrCPP(brayBalDist@address, nout=cutoff, nsampl=1000) 293 | selectedPlot <- rowNames[selectedPlot] 294 | selectedPlotIndex <- which(idZoneOut%in%selectedPlot) 295 | plotToRemove[[j]] <- c(plotToRemove[[j]], idZoneOut[-selectedPlotIndex]) 296 | } 297 | output <- list(i, plotToRemove) 298 | save(output, file="plotToRemove.RData") 299 | } 300 | # 301 | -------------------------------------------------------------------------------- /_resampling/03_extracting_selected_plots_from_the_sPlot_database.R: -------------------------------------------------------------------------------- 1 | # 2 | # 01 February 2017, updated on 11 December 2020 3 | # 4 | library(raster) 5 | library(fBasics) 6 | library(maptools) 7 | # 8 | data(wrld_simpl) 9 | # 10 | load("plot_data.RData") 11 | load("plotToRemove.RData") 12 | load("pca3.RData") 13 | # 14 | ls() 15 | # 16 | output[[1]] 17 | # 18 | # 858 cells from the PC1-PC2 space have been resampled with a cutoff value of 19 | # 50 plots maximum per cell with 50 being the median value of the total number 20 | # of plots across all grid cells of the PC1-PC2 space thus also being a good 21 | # compromise between quantity and quality in terms of extracting a subset of 22 | # sPlot that has a balanced sampling effort across the PC1-PC2 space 23 | # 24 | plotToRemove <- output[[2]] 25 | rm(output) 26 | class(plotToRemove) 27 | # 28 | # A list of 100 vectors (100 different resampling iterations) that contains the 29 | # IDs of sPlot releves to remove from the plot_data object 30 | # 31 | length(plotToRemove[[1]]) 32 | # 33 | # First iteration containing 700037 IDs 34 | # 35 | length(plotToRemove[[2]]) 36 | # 37 | # Second iteration containing 700022 IDs 38 | # 39 | head(plotToRemove[[1]]) 40 | head(plotToRemove[[2]]) 41 | # 42 | # First ID is NA for each vector in the list which is normal (cf. 
a property of 43 | # the resampling loop); we have to clean that up (see below) 44 | # 45 | for (i in 1:100) { 46 | plotToRemove[[i]] <- na.omit(plotToRemove[[i]]) 47 | } 48 | # 49 | # One extraction example from the first vector in the plotToRemove list object 50 | # 51 | posit <- match(plotToRemove[[1]], plot_data$PlotID) 52 | plot_sel <- plot_data[-posit, c("PlotID")] 53 | length(plot_sel) 54 | # 55 | # A total of 99364 plots seem to be selected, which is too many given that 56 | # 50*858 grid cells give only 42900 plots; even if some grid cells have fewer 57 | # than 50 plots, the total should not reach 99364, so 58 | # something is wrong 59 | # 60 | length(which(is.na(plot_data$pc1_val))) 61 | # 62 | # It seems that 42878 plots in the plot_data object have NAs for PC1 and these 63 | # mostly correspond to coastal pixels where data from SoilGrids are unavailable 64 | # 65 | posit <- which(is.na(plot_data$pc1_val)) 66 | plot_data <- plot_data[-posit, ] 67 | dim(plot_data)[[1]] 68 | # 69 | # After removing rows with NAs for PC1, the plot_data object has 756522 plots 70 | # instead of 799400 plots 71 | # 72 | posit <- match(plotToRemove[[1]], plot_data$PlotID) 73 | plot_sel <- plot_data[-posit, c("PlotID")] 74 | length(plot_sel) 75 | # 76 | # In the end, the true selection from the first resampling iteration is 56486 77 | # plots instead of 99364 plots 78 | # 79 | plot_sel <- list() 80 | for (i in 1:100) { 81 | posit <- match(plotToRemove[[i]], plot_data$PlotID) 82 | plot_sel[[i]] <- plot_data[-posit, c("PlotID")] 83 | } 84 | save(plot_sel, file="plot_sel.RData") 85 | # 86 | -------------------------------------------------------------------------------- /_resampling/README.md: -------------------------------------------------------------------------------- 1 | # Documentation on the code to resample sPlot 2 | 3 | This is the open-source R code (CC-BY Jonathan Lenoir & Tarek Hattab) that implements the resampling of the original sPlot database v2.1 into environmentally-balanced subsets. 4 | 5 | There are three main files, containing the code to: 6 | 7 | **1.** Run a global principal component analysis (PCA) encapsulating the entire environmental space covered by terrestrial systems ["*01_running_a_global_PCA_on_bioclimatic_and_soil_variables.R*"] 8 | 9 | **2.** Resample the sPlot vegetation database in a systematic manner across the 2D-space covered by the first two principal components (PCs) of the PCA from step 1 [*02_resampling_sPlot_within_the_PC1-PC2_environmental_space.R*] 10 | 11 | **3.** Extract the set of selected vegetation plots from the sPlot database using the outputs produced by the resampling algorithm in step 2 [*03_extracting_selected_plots_from_the_sPlot_database.R*] 12 | 13 | # 1. The global PCA 14 | 15 | The R script file "*01_running_a_global_PCA_on_bioclimatic_and_soil_variables.R*" contains the code to run the PCA. It first loads a large dataset of 30 variables reflecting bioclimatic and soil conditions. All these variables can be downloaded from the original open-access archives storing the global raster grids (CHELSA, CGIAR-CSI & SoilGrids). After running the global PCA and interpreting the meaning of the first two to three PCs, the script makes it possible to: (1) extract the scores of the first three PCs (PC1, PC2 & PC3); (2) store these data in global raster grids, one for each PC; and (3) map the outputs at the global extent. 16 | 17 | # 2. 
The resampling algorithm 18 | 19 | The R script file "*02_resampling_sPlot_within_the_PC1-PC2_environmental_space.R*" contains the code that: (1) loads the original sPlot database v2.1 (both the header data and the species composition data stored into two different dataframes); (2) cleans sPlot v2.1 to extract the set of vegetation plots to be used for the resampling procedure; (3) runs a sensitivity analysis to define the most appropriate grid resolution to define the bivariate (PC1-PC2) environmental space to be used to perform the resampling; and (4) resamples sPlot within the PC1-PC2 environmental space using the heterogeneity‐constrained random (HCR) reseamping algorithm (several times to get several possible iterations: 100). 20 | 21 | The HCR resampling algorithm was initially written by Attila Lengyel. We adapted the original algorithm and code to our own need for the resampling of sPlot. Due to the large amount of data in sPlot, the large dissimilarity matrices generated by the HCR algorithm (pairwise distance matrices based on plant community composition) and to speed up the processing time to resample sPlot, Tarek Hattab wrote several useful functions in C++ (see the folder entitled "*_functions_TH*" that contains all the functions written by Tarek and that are then sourced from R using the sourceCpp function from the Rcpp package). Note that you need to install Rtools on Windows machines to be able to run this set of functions. Also, we do not take any responsibility if our code makes your computer explode... 22 | 23 | # 3. The result 24 | 25 | The actual output of the resampling algorithm does not list the set of vegetation plot IDs selected by the resampling apporach for each of the 100 HCR runs but rather the list of vegetation plot IDs not selected by the resampling approach, i.e., the garbage. This is the reason why the main output of the resampling algorithm is stored in a list object called "plotToRemove". The last R script file "*03_extracting_selected_plots_from_the_sPlot_database.R*" thus contains the code to remove (for each of the 100 HCR runs) the set of non-selected plots from the sPlot 2.1 database so that we get in the end several possible realisations of environmentally-balanced datasets representative of sPlot. 26 | -------------------------------------------------------------------------------- /_resampling/_functions_TH/BetaJtu_OpenMP.cpp: -------------------------------------------------------------------------------- 1 | // To enable the functionality provided by Armadillo's various macros, 2 | 3 | // simply include them before you include the RcppArmadillo headers. 
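// NOTE: this file fills a pre-allocated big.matrix with the pairwise turnover component of
// Jaccard dissimilarity for presence/absence data. For two plots with a shared species count a
// and species unique to each plot b and c, JacElement() below returns
// 2*min(b, c) / (a + 2*min(b, c)), i.e. the quantity used as the turnover partition of Jaccard
// dissimilarity in beta-diversity partitioning. The exported betaJtu() takes two big.matrix
// external pointers (an integer 0/1 input and a double output with the same number of rows) and
// writes the result in place on a hard-coded 10 OpenMP threads. It is not sourced by the
// resampling script above, which relies on the abundance-based Bray-Curtis variants instead.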
4 | 5 | #define ARMA_NO_DEBUG 6 | 7 | // [[Rcpp::depends(RcppArmadillo, BH, bigmemory)]] 8 | 9 | // [[Rcpp::plugins(cpp11)]] 10 | 11 | // [[Rcpp::plugins(openmp)]] 12 | 13 | #include 14 | 15 | #include 16 | 17 | #include 18 | 19 | #include 20 | 21 | 22 | using namespace Rcpp; 23 | 24 | using namespace arma; 25 | 26 | template 27 | 28 | 29 | inline double JacElement(InputIterator1 begin1, InputIterator1 end1, InputIterator2 begin2){ 30 | 31 | // value to return 32 | 33 | double rval = 0; 34 | 35 | int compA = 0; 36 | 37 | int compB = 0; 38 | 39 | int compC = 0; 40 | 41 | // set iterators to beginning of ranges 42 | 43 | InputIterator1 it1 = begin1; 44 | 45 | InputIterator2 it2 = begin2; 46 | 47 | // for each input item 48 | 49 | while (it1 != end1) { 50 | 51 | // take the value and increment the iterator 52 | 53 | double d1 = *it1++; 54 | 55 | double d2 = *it2++; 56 | 57 | if (d1 == 1 && d2 ==1) { 58 | 59 | compA += 1; 60 | 61 | } 62 | 63 | if (d1 > d2) { 64 | 65 | compB += 1; 66 | 67 | } 68 | 69 | if (d1 < d2) { 70 | 71 | compC += 1; 72 | 73 | } 74 | 75 | } 76 | 77 | if (compB < compC) { 78 | 79 | double numerator = 2 * compB; 80 | 81 | double denominator = compA + numerator; 82 | 83 | rval = numerator / denominator; 84 | 85 | } 86 | 87 | else { 88 | 89 | double numerator = 2 * compC; 90 | 91 | double denominator = compA + numerator; 92 | 93 | rval = numerator / denominator; 94 | 95 | } 96 | 97 | return rval; 98 | 99 | } 100 | 101 | void BetaJtu (const arma::Mat& mat, arma::Mat rmat){ 102 | 103 | int nRows = mat.n_rows; 104 | 105 | omp_set_num_threads(10); 106 | 107 | #pragma omp parallel for 108 | 109 | for (int i = 0; i < nRows; i++) { 110 | 111 | for (int j = 0; j < nRows; j++) { 112 | 113 | arma::Row row1 = mat.row(i); 114 | 115 | arma::Row row2 = mat.row(j); 116 | 117 | rmat(i,j) = JacElement(row1.begin(), row1.end(), row2.begin()); 118 | 119 | } 120 | 121 | } 122 | 123 | } 124 | 125 | 126 | // [[Rcpp::export]] 127 | 128 | void betaJtu(SEXP mat, SEXP rmat) { 129 | 130 | XPtr xpMat(mat); 131 | 132 | XPtr xpOutMat(rmat); 133 | 134 | // create the worker 135 | BetaJtu(arma::Mat((int *)xpMat->matrix(), xpMat->nrow(), xpMat->ncol(), false), 136 | 137 | arma::Mat((double *)xpOutMat->matrix(), xpOutMat->nrow(), xpOutMat->ncol(), false)); 138 | 139 | } 140 | -------------------------------------------------------------------------------- /_resampling/_functions_TH/BetaJtu_RcppParallel.cpp: -------------------------------------------------------------------------------- 1 | // To enable the functionality provided by Armadillo's various macros, 2 | 3 | // simply include them before you include the RcppArmadillo headers. 
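// NOTE: RcppParallel counterpart of BetaJtu_OpenMP.cpp. The exported betaJtu_RcppParallel()
// takes an ordinary integer presence/absence matrix (plots in rows), computes the same pairwise
// turnover index over the lower triangle with a RcppParallel::Worker, mirrors it to the upper
// triangle, and returns the full plot-by-plot dissimilarity matrix to R. It is therefore meant
// for community matrices small enough to be handled as regular R matrices; the big.matrix/OpenMP
// version above is the one intended for very large cells.
// Sketch of a call from R (assuming the file has been compiled with Rcpp::sourceCpp()):
//   d <- betaJtu_RcppParallel(comm01)   # comm01: plots x species 0/1 integer matrix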
4 | 5 | #define ARMA_NO_DEBUG 6 | 7 | // [[Rcpp::depends(RcppArmadillo, BH, bigmemory,RcppParallel)]] 8 | 9 | // [[Rcpp::plugins(cpp11)]] 10 | 11 | #include 12 | 13 | #include 14 | 15 | #include 16 | 17 | #include 18 | 19 | using namespace Rcpp; 20 | 21 | using namespace arma; 22 | 23 | using namespace RcppParallel; 24 | 25 | 26 | template 27 | 28 | inline double JacElement(InputIterator1 begin1, InputIterator1 end1, InputIterator2 begin2){ 29 | 30 | // value to return 31 | 32 | double rval = 0; 33 | 34 | int compA = 0; 35 | 36 | int compB = 0; 37 | 38 | int compC = 0; 39 | 40 | // set iterators to beginning of ranges 41 | 42 | InputIterator1 it1 = begin1; 43 | 44 | InputIterator2 it2 = begin2; 45 | 46 | // for each input item 47 | 48 | while (it1 != end1) { 49 | 50 | // take the value and increment the iterator 51 | 52 | double d1 = *it1++; 53 | 54 | double d2 = *it2++; 55 | 56 | if (d1 == 1 && d2 ==1) { 57 | 58 | compA += 1; 59 | 60 | } 61 | 62 | if (d1 > d2) { 63 | 64 | compB += 1; 65 | 66 | } 67 | 68 | if (d1 < d2) { 69 | 70 | compC += 1; 71 | 72 | } 73 | 74 | } 75 | 76 | if (compB < compC) { 77 | 78 | double numerator = 2 * compB; 79 | 80 | double denominator = compA + numerator; 81 | 82 | rval = numerator / denominator; 83 | 84 | } 85 | 86 | else { 87 | 88 | double numerator = 2 * compC; 89 | 90 | double denominator = compA + numerator; 91 | 92 | rval = numerator / denominator; 93 | 94 | } 95 | 96 | return rval; 97 | 98 | } 99 | 100 | struct JtuDistance : public Worker { 101 | 102 | // input matrix to read from 103 | 104 | const RMatrix mat; 105 | 106 | // output matrix to write to 107 | 108 | RMatrix rmat; 109 | 110 | // initialize from Rcpp input and output matrixes (the RMatrix class 111 | 112 | // can be automatically converted to from the Rcpp matrix type) 113 | 114 | JtuDistance(const IntegerMatrix mat, NumericMatrix rmat) 115 | : mat(mat), rmat(rmat) {} 116 | 117 | // function call operator that work for the specified range (begin/end) 118 | 119 | void operator()(std::size_t begin, std::size_t end) { 120 | 121 | for (std::size_t i = begin; i < end; i++) { 122 | 123 | for (std::size_t j = 0; j < i; j++) { 124 | 125 | // rows we will operate on 126 | 127 | RMatrix::Row row1 = mat.row(i); 128 | 129 | RMatrix::Row row2 = mat.row(j); 130 | 131 | rmat(j,i) = JacElement(row1.begin(), row1.end(), row2.begin()); 132 | 133 | rmat(i,j) = rmat(j,i); 134 | 135 | } 136 | 137 | } 138 | 139 | } 140 | 141 | }; 142 | 143 | // [[Rcpp::export]] 144 | 145 | NumericMatrix betaJtu_RcppParallel(IntegerMatrix mat) { 146 | 147 | // allocate the matrix we will return 148 | NumericMatrix rmat(mat.nrow(), mat.nrow()); 149 | 150 | // create the worker 151 | JtuDistance JtuDistance(mat, rmat); 152 | 153 | // call it with parallelFor 154 | parallelFor(0, mat.nrow(), JtuDistance); 155 | 156 | 157 | return rmat; 158 | } -------------------------------------------------------------------------------- /_resampling/_functions_TH/bray.part.C_RcppParallel.cpp: -------------------------------------------------------------------------------- 1 | // To enable the functionality provided by Armadillo's various macros, 2 | 3 | // simply include them before you include the RcppArmadillo headers. 
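// NOTE: BrayElement() below computes, for two abundance (or relative-cover) rows, the
// balanced-variation component of Bray-Curtis dissimilarity, min(B, C) / (A + min(B, C)), where
// A is the sum of species-wise minima and B, C are the abundance totals unique to each of the
// two plots. The exported bray_distance_RcppParallel() returns the full symmetric plot-by-plot
// matrix and is the routine the resampling script calls for cells holding up to 20,000 plots;
// larger cells are handled by the big.matrix/OpenMP version in bray.part.OpenMP.cpp
// (see BigBrayPart() in the resampling script).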
4 | 5 | #define ARMA_NO_DEBUG 6 | 7 | // [[Rcpp::depends(RcppArmadillo, BH, bigmemory,RcppParallel)]] 8 | 9 | // [[Rcpp::plugins(cpp11)]] 10 | 11 | #include 12 | 13 | #include 14 | 15 | #include 16 | 17 | #include 18 | 19 | using namespace Rcpp; 20 | 21 | using namespace arma; 22 | 23 | using namespace RcppParallel; 24 | 25 | 26 | template 27 | 28 | inline double BrayElement(InputIterator1 begin1, InputIterator1 end1, InputIterator2 begin2){ 29 | 30 | // value to return 31 | 32 | double rval = 0; 33 | 34 | double sumPi = 0; 35 | 36 | double sumPj = 0; 37 | 38 | double aComp = 0; 39 | 40 | // set iterators to beginning of ranges 41 | 42 | InputIterator1 it1 = begin1; 43 | 44 | InputIterator2 it2 = begin2; 45 | 46 | // for each input item 47 | 48 | while (it1 != end1) { 49 | 50 | // take the value and increment the iterator 51 | 52 | double d1 = *it1++; 53 | 54 | double d2 = *it2++; 55 | 56 | sumPi += d1; 57 | 58 | sumPj += d2; 59 | 60 | aComp += std::min(d1,d2); 61 | 62 | } 63 | 64 | double bComp = sumPi - aComp; 65 | 66 | double cComp = sumPj - aComp; 67 | 68 | double minBC = 0.0; 69 | 70 | if (bComp < cComp) { 71 | 72 | minBC = bComp; 73 | 74 | } 75 | 76 | else { 77 | 78 | minBC = cComp; 79 | 80 | } 81 | 82 | rval = minBC / (aComp + minBC); 83 | 84 | return rval; 85 | } 86 | 87 | struct BrayDistance : public Worker { 88 | 89 | // input matrix to read from 90 | 91 | const RMatrix mat; 92 | 93 | // output matrix to write to 94 | 95 | RMatrix rmat; 96 | 97 | // initialize from Rcpp input and output matrixes (the RMatrix class 98 | 99 | // can be automatically converted to from the Rcpp matrix type) 100 | 101 | BrayDistance(const NumericMatrix mat, NumericMatrix rmat) 102 | : mat(mat), rmat(rmat) {} 103 | 104 | // function call operator that work for the specified range (begin/end) 105 | 106 | void operator()(std::size_t begin, std::size_t end) { 107 | 108 | for (std::size_t i = begin; i < end; i++) { 109 | 110 | for (std::size_t j = 0; j < i; j++) { 111 | 112 | // rows we will operate on 113 | 114 | RMatrix::Row row1 = mat.row(i); 115 | 116 | RMatrix::Row row2 = mat.row(j); 117 | 118 | rmat(j,i) = BrayElement(row1.begin(), row1.end(), row2.begin()); 119 | 120 | rmat(i,j) = rmat(j,i); 121 | 122 | } 123 | 124 | } 125 | 126 | } 127 | 128 | }; 129 | 130 | // [[Rcpp::export]] 131 | 132 | NumericMatrix bray_distance_RcppParallel(NumericMatrix mat) { 133 | 134 | // allocate the matrix we will return 135 | NumericMatrix rmat(mat.nrow(), mat.nrow()); 136 | 137 | // create the worker 138 | BrayDistance BrayDistance(mat, rmat); 139 | 140 | // call it with parallelFor 141 | parallelFor(0, mat.nrow(), BrayDistance); 142 | 143 | 144 | return rmat; 145 | } 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | -------------------------------------------------------------------------------- /_resampling/_functions_TH/bray.part.OpenMP.cpp: -------------------------------------------------------------------------------- 1 | // To enable the functionality provided by Armadillo's various macros, 2 | 3 | // simply include them before you include the RcppArmadillo headers. 
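// NOTE: same balanced-variation Bray-Curtis index as bray.part.C_RcppParallel.cpp, but written
// for very large community matrices: both the input and the pre-allocated output are passed as
// big.matrix external pointers, bray_distance_OpenMP() fills the output in place and returns
// nothing, and the double loop runs on a hard-coded 10 OpenMP threads. Both matrices are read
// and written as doubles here, so the big.matrix objects created on the R side (see
// BigBrayPart() in the resampling script) should be of type "double".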
4 | 5 | #define ARMA_NO_DEBUG 6 | 7 | // [[Rcpp::depends(RcppArmadillo, BH, bigmemory)]] 8 | 9 | // [[Rcpp::plugins(cpp11)]] 10 | 11 | // [[Rcpp::plugins(openmp)]] 12 | 13 | #include 14 | 15 | #include 16 | 17 | #include 18 | 19 | #include 20 | 21 | 22 | using namespace Rcpp; 23 | 24 | using namespace arma; 25 | 26 | template 27 | 28 | inline double BrayElement(InputIterator1 begin1, InputIterator1 end1, InputIterator2 begin2){ 29 | 30 | // value to return 31 | 32 | double rval = 0; 33 | 34 | double sumPi = 0; 35 | 36 | double sumPj = 0; 37 | 38 | double aComp = 0; 39 | 40 | // set iterators to beginning of ranges 41 | 42 | InputIterator1 it1 = begin1; 43 | 44 | InputIterator2 it2 = begin2; 45 | 46 | // for each input item 47 | 48 | while (it1 != end1) { 49 | 50 | // take the value and increment the iterator 51 | 52 | double d1 = *it1++; 53 | 54 | double d2 = *it2++; 55 | 56 | sumPi += d1; 57 | 58 | sumPj += d2; 59 | 60 | aComp += std::min(d1,d2); 61 | 62 | } 63 | 64 | double bComp = sumPi - aComp; 65 | 66 | double cComp = sumPj - aComp; 67 | 68 | double minBC = 0.0; 69 | 70 | if (bComp < cComp) { 71 | 72 | minBC = bComp; 73 | 74 | } 75 | 76 | else { 77 | 78 | minBC = cComp; 79 | 80 | } 81 | 82 | rval = minBC / (aComp + minBC); 83 | 84 | return rval; 85 | } 86 | 87 | 88 | void BrayDistance(const arma::Mat& mat, arma::Mat rmat){ 89 | 90 | int nRows = mat.n_rows; 91 | 92 | omp_set_num_threads(10); 93 | 94 | #pragma omp parallel for 95 | 96 | for (int i = 0; i < nRows; i++) { 97 | 98 | for (int j = 0; j < nRows; j++) { 99 | 100 | arma::Row row1 = mat.row(i); 101 | 102 | arma::Row row2 = mat.row(j); 103 | 104 | rmat(i,j) = BrayElement(row1.begin(), row1.end(), row2.begin()); 105 | 106 | } 107 | 108 | } 109 | 110 | rmat = rmat.t(); 111 | 112 | } 113 | 114 | // [[Rcpp::export]] 115 | 116 | void bray_distance_OpenMP(SEXP mat, SEXP rmat) { 117 | 118 | XPtr xpMat(mat); 119 | 120 | XPtr xpOutMat(rmat); 121 | 122 | // create the worker 123 | BrayDistance(arma::Mat((double *)xpMat->matrix(), xpMat->nrow(), xpMat->ncol(), false), 124 | arma::Mat((double *)xpOutMat->matrix(), xpOutMat->nrow(), xpOutMat->ncol(), false)); 125 | 126 | } 127 | -------------------------------------------------------------------------------- /_resampling/_functions_TH/cast.cpp: -------------------------------------------------------------------------------- 1 | // To enable the functionality provided by Armadillo's various macros, 2 | 3 | // simply include them before you include the RcppArmadillo headers. 
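// NOTE: castC() below turns a long-format table (plot ID, species ID, cover) into a wide
// plots x species matrix, with the plot IDs stored in the first column and covers summed when a
// species occurs more than once in a plot. cast_binary.cpp (next file) exports a function with
// the same name that writes presence (1) instead of the summed cover; that binary version is the
// one sourced by the resampling script, and sourcing both in one session simply makes the later
// definition mask the earlier one. In both versions the species IDs are used directly as column
// indices, which is why the resampling script recodes species names to consecutive integers
// (1..n) before calling castC().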
4 | 5 | #define ARMA_NO_DEBUG 6 | 7 | // [[Rcpp::depends(RcppArmadillo, BH, bigmemory)]] 8 | 9 | // [[Rcpp::plugins(cpp11)]] 10 | 11 | #include 12 | 13 | #include 14 | 15 | #include 16 | 17 | #include 18 | 19 | using namespace Rcpp; 20 | 21 | using namespace arma; 22 | 23 | IntegerVector whichEq( IntegerVector x, int value) { 24 | 25 | int nx = x.size(); 26 | std::vector y; 27 | y.reserve(nx); 28 | 29 | for(int i = 0; i < nx; i++) { 30 | if (x[i] == value) y.push_back(i); 31 | } 32 | 33 | return wrap(y); 34 | } 35 | 36 | 37 | // [[Rcpp::export]] 38 | 39 | NumericMatrix castC(IntegerVector iD, IntegerVector sp , NumericVector cov){ 40 | 41 | int sizeI = iD.size(); 42 | 43 | IntegerVector plotNb = unique(iD); 44 | 45 | IntegerVector spNb = unique(sp); 46 | 47 | int rowSize = plotNb.size(); 48 | 49 | int colSize= spNb.size() ; 50 | 51 | NumericMatrix comMat(rowSize, colSize +1); 52 | 53 | comMat(_ , 0) = plotNb; 54 | 55 | for (int i = 0; i < sizeI ; i++) { 56 | 57 | IntegerVector rowI = whichEq (plotNb,iD[i]); 58 | 59 | int plotID = rowI[0]; 60 | 61 | int species = sp[i]; 62 | 63 | comMat(plotID, species) += cov[i]; 64 | 65 | } 66 | 67 | return comMat; 68 | 69 | } 70 | -------------------------------------------------------------------------------- /_resampling/_functions_TH/cast_binary.cpp: -------------------------------------------------------------------------------- 1 | // To enable the functionality provided by Armadillo's various macros, 2 | 3 | // simply include them before you include the RcppArmadillo headers. 4 | 5 | #define ARMA_NO_DEBUG 6 | 7 | // [[Rcpp::depends(RcppArmadillo, BH, bigmemory)]] 8 | 9 | // [[Rcpp::plugins(cpp11)]] 10 | 11 | #include 12 | 13 | #include 14 | 15 | #include 16 | 17 | #include 18 | 19 | using namespace Rcpp; 20 | 21 | using namespace arma; 22 | 23 | IntegerVector whichEq( IntegerVector x, int value) { 24 | 25 | int nx = x.size(); 26 | std::vector y; 27 | y.reserve(nx); 28 | 29 | for(int i = 0; i < nx; i++) { 30 | if (x[i] == value) y.push_back(i); 31 | } 32 | 33 | return wrap(y); 34 | } 35 | 36 | 37 | // [[Rcpp::export]] 38 | 39 | IntegerMatrix castC(IntegerVector iD, IntegerVector sp , NumericVector cov){ 40 | 41 | int sizeI = iD.size(); 42 | 43 | IntegerVector plotNb = unique(iD); 44 | 45 | IntegerVector spNb = unique(sp); 46 | 47 | int rowSize = plotNb.size(); 48 | 49 | int colSize= spNb.size() ; 50 | 51 | IntegerMatrix comMat(rowSize, colSize +1); 52 | 53 | comMat(_ , 0) = plotNb; 54 | 55 | for (int i = 0; i < sizeI ; i++) { 56 | 57 | IntegerVector rowI = whichEq (plotNb,iD[i]); 58 | 59 | int plotID = rowI[0]; 60 | 61 | int species = sp[i]; 62 | 63 | comMat(plotID, species) = 1; 64 | 65 | } 66 | 67 | return comMat; 68 | 69 | } 70 | -------------------------------------------------------------------------------- /_resampling/_functions_TH/hcr.C.cpp: -------------------------------------------------------------------------------- 1 | // To enable the functionality provided by Armadillo's various macros, 2 | 3 | // simply include them before you include the RcppArmadillo headers. 
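// NOTE: HcrCPP() implements the heterogeneity-constrained random (HCR) selection on a
// big.matrix of pairwise dissimilarities. It draws nsampl random subsets of nout plots, computes
// the mean and the variance of the within-subset dissimilarities for each draw, ranks the draws
// by decreasing mean and by increasing variance, and returns the draw with the smallest summed
// rank, i.e. the subset that is as heterogeneous as possible while keeping its pairwise
// dissimilarities evenly spread. In the resampling script it is called for every over-sampled
// cell as HcrCPP(brayBalDist@address, nout=cutoff, nsampl=1000).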
4 | 5 | #define ARMA_NO_DEBUG 6 | 7 | // [[Rcpp::depends(RcppArmadillo, BH, bigmemory)]] 8 | 9 | // [[Rcpp::plugins(cpp11)]] 10 | 11 | #include 12 | 13 | #include 14 | 15 | #include 16 | 17 | #include 18 | 19 | using namespace Rcpp; 20 | 21 | using namespace arma; 22 | 23 | //seqInt function 24 | 25 | IntegerVector seqInt(double x, double y, double by) { 26 | 27 | IntegerVector anOut(1); 28 | 29 | // compute sequence 30 | 31 | double min_by = 1.e-8; 32 | 33 | if (by < min_by) min_by = by/100; 34 | 35 | double i = x + by; 36 | 37 | anOut(0) = x; 38 | 39 | while(i/min_by < y/min_by + 1) { 40 | 41 | anOut.push_back(i); 42 | 43 | i += by; 44 | 45 | } 46 | 47 | return anOut; 48 | 49 | } 50 | 51 | // stl_sort 52 | 53 | NumericVector stl_sort(NumericVector x) { 54 | 55 | NumericVector y = clone(x); 56 | 57 | std::sort(y.begin(), y.end()); 58 | 59 | return y; 60 | } 61 | 62 | // stl_sort_int 63 | 64 | IntegerVector stl_sort_int( IntegerVector x) { 65 | 66 | IntegerVector y = clone(x); 67 | 68 | std::sort(y.begin(), y.end()); 69 | 70 | return y; 71 | } 72 | 73 | // HCR function 74 | 75 | IntegerVector Hcr(arma::Mat dMat , int nout , int nsampl) { 76 | 77 | int nplots = dMat.n_rows; 78 | 79 | NumericVector meand( nsampl, 0.0 ); 80 | 81 | NumericVector vard( nsampl, 0.0 ); 82 | 83 | IntegerMatrix sel( nout, nsampl ); 84 | 85 | arma::Mat selMat( nout, nout ); 86 | 87 | IntegerVector plotID = seqInt( 0 , nplots -1 , 1 ); 88 | 89 | for (int i = 0 ; i < nsampl ; i++ ) { 90 | 91 | IntegerVector sampleID = Rcpp::RcppArmadillo::sample(plotID, nout, false , NumericVector::create()); 92 | 93 | sel( _ , i) = sampleID; 94 | 95 | arma::uvec permuID = as(sampleID); 96 | 97 | selMat = dMat.submat( permuID , permuID ); 98 | 99 | double sum = 0; 100 | 101 | int n = 0; 102 | 103 | for (int j = 0 ; j < nout ; j++){ 104 | 105 | for (int k = 0 ; k < nout ; k++){ 106 | 107 | if (j > k ) { 108 | 109 | sum += selMat(j,k); 110 | 111 | n += 1 ; 112 | 113 | } 114 | 115 | } 116 | } 117 | 118 | double tmpMean = sum/n ; 119 | 120 | double sce = 0; 121 | 122 | for (int j = 0 ; j < nout; j++){ 123 | 124 | for (int k = 0 ; k < nout; k++){ 125 | 126 | if (j > k ) { 127 | 128 | sce += pow(selMat(j,k) - tmpMean, 2.0); 129 | 130 | } 131 | 132 | } 133 | } 134 | 135 | double tmpVar = sce / (n - 1) ; 136 | 137 | meand[i] = tmpMean; 138 | 139 | vard[i] = tmpVar; 140 | 141 | } 142 | 143 | NumericVector sortdemean = stl_sort(-meand); 144 | 145 | NumericVector sortdevar = stl_sort(vard); 146 | 147 | IntegerVector rankdecmean = match(-meand, sortdemean); 148 | 149 | IntegerVector rankdevar = match(vard, sortdevar); 150 | 151 | IntegerVector sumVarMean = rankdecmean + rankdevar; 152 | 153 | IntegerVector sorVarMean = stl_sort_int(sumVarMean); 154 | 155 | IntegerVector rankFinal = match(sumVarMean, sorVarMean); 156 | 157 | int selecSample = which_min(rankFinal); 158 | 159 | IntegerVector res = sel(_, selecSample); 160 | 161 | return res; 162 | 163 | } 164 | 165 | // [[Rcpp::export]] 166 | 167 | IntegerVector HcrCPP(SEXP pInBigMat, int nout , int nsampl) { 168 | 169 | // First we tell Rcpp that the object we've been given is an external pointer. 
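// (On the R side this is the @address slot of the big.matrix, e.g. brayBalDist@address in the
// resampling script.) Note that Hcr() samples positions from seqInt(0, nplots-1, 1), so the
// vector returned to R holds 0-based row positions; when these are used directly as R
// subscripts (as in rowNames[selectedPlot]), a value of 0 is silently dropped and the remaining
// positions are shifted by one, so adding 1 to the returned indices before subscripting is
// probably what is intended.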
170 | 171 | XPtr xpMat(pInBigMat); 172 | 173 | IntegerVector res = Hcr(arma::Mat((double *)xpMat->matrix(), xpMat->nrow(), xpMat->ncol(), false), nout , nsampl); 174 | 175 | return wrap(res); 176 | } -------------------------------------------------------------------------------- /sPlot2_PlotDistribution.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(viridis) 3 | 4 | #library(raster) 5 | library(sp) 6 | library(sf) 7 | library(rgdal) 8 | library(rnaturalearth) 9 | library(dggridR) 10 | # library(rgeos) 11 | 12 | #save temporary files 13 | write("TMPDIR = /data/sPlot/users/Francesco/_tmp", file=file.path(Sys.getenv('TMPDIR'), '.Renviron')) 14 | write("R_USER = /data/sPlot/users/Francesco/_tmp", file=file.path(Sys.getenv('R_USER'), '.Renviron')) 15 | #rasterOptions(tmpdir="/data/sPlot/users/Francesco/_tmp") 16 | 17 | ## import data 18 | load(file = "_output/sPlot_OPEN.RData") 19 | load("/data/sPlot/releases/sPlot3.0/header_sPlot3.0.RData") 20 | 21 | ## Data Preparation for spatial plotting 22 | header.oa.sf <- SpatialPointsDataFrame(coords= header.oa %>% 23 | select(POINT_X, POINT_Y), 24 | proj4string = CRS("+init=epsg:4326"), 25 | data=data.frame(PlotObservationID=header.oa$PlotObservationID, 26 | Dataset=header.oa$Dataset)) %>% 27 | st_as_sf() %>% 28 | st_transform(crs = "+proj=eck4") 29 | 30 | header <- header %>% 31 | filter(!is.na(Latitude)) 32 | header.sf <- SpatialPointsDataFrame(coords= header %>% 33 | 34 | select(Longitude, Latitude), 35 | proj4string = CRS("+init=epsg:4326"), 36 | data=data.frame(PlotObservationID=header$PlotObservationID, 37 | Dataset=header$Dataset)) %>% 38 | st_as_sf() %>% 39 | st_transform(crs = "+proj=eck4") 40 | 41 | 42 | 43 | ### Template of Global map 44 | #download data from rnaturalearth package 45 | countries <- ne_countries(returnclass = "sf") %>% 46 | st_transform(crs = "+proj=eck4") %>% 47 | st_geometry() 48 | graticules <- ne_download(type = "graticules_15", category = "physical", 49 | returnclass = "sf") %>% 50 | st_transform(crs = "+proj=eck4") %>% 51 | st_geometry() 52 | bb <- ne_download(type = "wgs84_bounding_box", category = "physical", 53 | returnclass = "sf") %>% 54 | st_transform(crs = "+proj=eck4") %>% 55 | st_geometry() 56 | 57 | # create ggplot template of the world map 58 | w3a <- ggplot() + 59 | geom_sf(data = bb, col = "grey20", fill = "white") + 60 | geom_sf(data = graticules, col = "grey20", lwd = 0.1) + 61 | #geom_sf(data = countries, fill = "grey90", col = NA, lwd = 0.3) + 62 | coord_sf(crs = "+proj=eck4") + 63 | theme_minimal() + 64 | theme(axis.text = element_blank(), 65 | legend.title=element_text(size=12), 66 | legend.text=element_text(size=12), 67 | legend.background = element_rect(size=0.1, linetype="solid", colour = 1), 68 | legend.key.height = unit(1.1, "cm"), 69 | legend.key.width = unit(1.1, "cm")) + 70 | scale_fill_viridis() 71 | 72 | 73 | ### Map of plot distribution 74 | ### all plots from sPlot 3.0 75 | w3all <- w3a + 76 | geom_sf(data=header.sf, color=gray(0.5), pch="+", size=1, alpha=1/5) + # aes(col=Dataset), 77 | geom_sf(data = countries, col = "grey10", fill=NA, lwd = 0.3) + 78 | theme(legend.position = "none") 79 | 80 | ### open.access plots from paper #02 81 | w3oa <- w3all + 82 | geom_sf(data=header.oa.sf, color="red", pch="+", size=1, alpha=1/4) + # aes(col=Dataset), 83 | geom_sf(data = countries, col = "grey10", fill=NA, lwd = 0.3) + 84 | theme(legend.position = "none") 85 | 86 | ggsave(filename="_output/sPlot3_distribution.png", 
device="png", width=8, height=5, dpi = 300, plot = w3all) 87 | ggsave(filename="_output/sPlot21_oa_distribution.png", device="png", width=8, height=5, dpi = 300, plot = w3oa) 88 | 89 | 90 | 91 | 92 | ### Version 2 - hexagons 93 | header2 <- header.oa %>% 94 | select(PlotObservationID, POINT_Y, POINT_X) %>% 95 | filter(!(abs(POINT_X) > 171 & abs(POINT_Y) > 70)) 96 | dggs <- dgconstruct(spacing=300, metric=T, resround='down') 97 | 98 | #Get the corresponding grid cells for each plot 99 | header2$cell <- dgGEO_to_SEQNUM(dggs, header2$POINT_X, header2$POINT_Y)$seqnum 100 | 101 | #Calculate the number of plots for each cell (log10 scale) 102 | header.dggs <- header2 %>% 103 | group_by(cell) %>% 104 | summarise(value.out=log(n(), 10)) 105 | 106 | #Get the grid cell boundaries for cells 107 | grid <- dgcellstogrid(dggs, header.dggs$cell, frame=F) %>% 108 | st_as_sf() %>% 109 | mutate(cell = header.dggs$cell) %>% 110 | mutate(value.out=header.dggs$value.out) %>% 111 | st_transform("+proj=eck4") %>% 112 | st_wrap_dateline(options = c("WRAPDATELINE=YES")) 113 | 114 | ## plotting 115 | w3a + 116 | geom_sf(data=grid, aes(fill=value.out), lwd=0, alpha=0.9) + 117 | geom_sf(data = countries, col = "grey10", fill=NA, lwd = 0.3) + 118 | scale_fill_viridis( 119 | name="# plots", breaks=0:5, labels = c("1", "10", "100", 120 | "1,000", "10,000", "100,000"), option="viridis" ) 121 | 122 | --------------------------------------------------------------------------------
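The hexagon map in the last block above is displayed but never written to disk. A minimal sketch of how it could be saved, mirroring the ggsave() calls used for the point maps earlier in the script (the object name w3hex and the output filename are illustrative and not part of the original code):

w3hex <- w3a +
  geom_sf(data=grid, aes(fill=value.out), lwd=0, alpha=0.9) +
  geom_sf(data = countries, col = "grey10", fill=NA, lwd = 0.3) +
  scale_fill_viridis(name="# plots", breaks=0:5,
                     labels = c("1", "10", "100", "1,000", "10,000", "100,000"),
                     option="viridis")
ggsave(filename="_output/sPlot_oa_hex_distribution.png", device="png",
       width=8, height=5, dpi=300, plot=w3hex)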