├── 00_extract_datasets.R ├── 01_PlotSelection.R ├── 02_BuildDataset.Rmd ├── 03_AuthorList.R ├── 04_FiguresTables.Rmd ├── 05_Demo.Rmd ├── 998_fixSplotOpen.R ├── README.md ├── _public ├── 02_BuildDataset.html ├── 04_FiguresTables.html └── 05_Demo.pdf ├── _resampling ├── 01_running_a_global_PCA_on_bioclimatic_and_soil_variables.R ├── 02_resampling_sPlot_within_the_PC1-PC2_environmental_space.R ├── 03_extracting_selected_plots_from_the_sPlot_database.R ├── README.md └── _functions_TH │ ├── BetaJtu_OpenMP.cpp │ ├── BetaJtu_RcppParallel.cpp │ ├── bray.part.C_RcppParallel.cpp │ ├── bray.part.OpenMP.cpp │ ├── cast.cpp │ ├── cast_binary.cpp │ └── hcr.C.cpp └── sPlot2_PlotDistribution.R /00_extract_datasets.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | load("/data/sPlot/releases/sPlot2.1/sPlot_header_20161124.RData") 3 | load("/data/sPlot/releases/sPlot2.1/DT2_20161025.RData") 4 | 5 | 6 | 7 | ###run Header fix 8 | affiliations <- fread("../_sPlot_Management/Affiliations_20180704.csv") 9 | databases <- fread("../_sPlot_Management/Databases.out_20180704.csv") 10 | sel <- fread("_data/Resampled1.csv")$x 11 | 12 | header.ress <- header.fix %>% 13 | filter(PlotID %in% sel) 14 | 15 | 16 | res.source <- header.fix %>% 17 | dplyr::select(PlotID, Dataset, `GIVD ID`) %>% 18 | filter(PlotID %in% sel) %>% 19 | group_by(`GIVD ID`) %>% 20 | summarise(num.plots.res=n()) %>% 21 | distinct() %>% 22 | left_join(databases %>% 23 | filter(label!="BIOTA_South_Africa_2") %>% ### two dataset labels for the same GIVD 24 | dplyr::select(`GIVD ID`, Custodian, `Deputy custodian`), by="GIVD ID") %>% 25 | left_join(header.fix %>% 26 | group_by(`GIVD ID`) %>% 27 | summarise(tot.plots=n()), by="GIVD ID") %>% 28 | left_join(affiliations %>% 29 | filter(Sequence_affiliations==1) %>% 30 | dplyr::select(Name, `E-Mail`) %>% 31 | rename(Custodian=Name), by="Custodian") %>% 32 | rename(`Custodian E-Mail`=`E-Mail`) %>% 33 | left_join(affiliations %>% 34 | filter(Sequence_affiliations==1) %>% 35 | dplyr::select(Name, `E-Mail`) %>% 36 | rename(`Deputy custodian`=Name), by="Deputy custodian") %>% 37 | rename(`Deputy custodian E-Mail`=`E-Mail`) %>% 38 | left_join(header.fix %>% 39 | dplyr::select(`GIVD ID`, Dataset) %>% 40 | filter(Dataset!="BIOTA_South_Africa_2") %>% ### two dataset labels for the same GIVD 41 | distinct(), by="GIVD ID") %>% 42 | mutate(perc.plots=round(num.plots.res/tot.plots*100,1)) %>% 43 | dplyr::select(`GIVD ID`,Dataset, num.plots.res, tot.plots, perc.plots, Custodian, `Deputy custodian`, 44 | `Custodian E-Mail`, `Deputy custodian E-Mail`)# %>% 45 | arrange(desc(perc.plots)) 46 | 47 | 48 | 49 | 50 | fwrite(res.source[,1:6], "_output/resampling_source_custodian_perc.csv") 51 | 52 | 53 | #### create figure to compare plot density before and after resampling in environmental space 54 | 55 | 56 | 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /01_PlotSelection.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | ##### code from HELGE BRUELHEIDE 3 | 4 | load("/data/sPlot2.0/plot_data.RData") # server directory 5 | # selected pool of plots by Jonathan 6 | str(plot_data) 7 | dim(plot_data) #799400 54 8 | 9 | load("c:\\Daten\\iDiv2\\sPlot2\\JonathanLenoir\\plotToRemove.RData") 10 | 11 | str(output) 12 | output[[1]] 13 | plotToRemove <- output[[2]] 14 | rm(output) 15 | class(plotToRemove) 16 | length(plotToRemove[[1]]) # First iteration containing 
700037 IDs 17 | for (i in 1:100) { 18 | plotToRemove[[i]] <- na.omit(plotToRemove[[i]]) 19 | } 20 | str(plotToRemove) 21 | # List of 100 22 | str(plotToRemove[[1]]) #atomic [1:700036] 23 | 24 | posit <- match(plotToRemove[[1]], plot_data$PlotID) 25 | # this gives the index for the first set, which we have chosen for paper #2. 26 | 27 | plot_data_sel1 <- plot_data[-posit, ] 28 | dim(plot_data_sel1)[1] # 99364 selected relev?s 29 | # and this gives the list of plots selected 30 | 31 | #If you repeat this three times 32 | posit <- match(plotToRemove[[2]], plot_data$PlotID) 33 | plot_data_sel2 <- plot_data[-posit, ] 34 | posit <- match(plotToRemove[[3]], plot_data$PlotID) 35 | plot_data_sel3 <- plot_data[-posit, ] 36 | str(plot_data_sel1) 37 | PlotID_run_1_2_3 <- 38 | sort(unique(c(plot_data_sel1$PlotObservationID,plot_data_sel2$PlotObservationID, 39 | plot_data_sel3$PlotObservationID))) 40 | str(PlotID_run_1_2_3) 41 | # int [1:150116] 16 17 18 20 22 23 26 30 31 41 ... 42 | 43 | ##### FMS TAKES OVER FROM HERE ##### 44 | ## reimport selected plots 45 | PlotID123 <- read_csv("_data/PlotID_run_1_2_3.csv") %>% 46 | rename(PlotObservationID=x) 47 | #load header and fix 48 | load("/data/sPlot/releases/sPlot2.1/sPlot_header_20161124.RData") 49 | source("/data/sPlot/users/Francesco/_sPlot_Management/Fix.header.R") 50 | header <- fix.header(header, exclude.sophy = F) 51 | databases <- read_csv("/data/sPlot/users/Francesco/_sPlot_Management/Consortium/Databases.out.csv") 52 | 53 | 54 | header.sel <- header %>% 55 | filter(PlotObservationID %in% PlotID123$PlotObservationID) 56 | 57 | 58 | 59 | ### Data from Stephan - Check for ID matches 60 | header.TV <- read_delim(file = "_data/sPlot-2.1_header.csv", delim = "\t", 61 | col_types = cols(PlotObservationID = col_double(), 62 | PlotID = col_double(), 63 | `TV2 relevé number` = col_double(), 64 | `Original nr in database` = col_character(), 65 | ORIGDB_NR = col_character(), 66 | ORIGDB_NR_1 = col_character(), 67 | ORIG_REL_N = col_character(), 68 | Longitude = col_double(), 69 | Latitude = col_double(), 70 | `Location uncertainty (m)` = col_double(), 71 | Dataset = col_character())) 72 | 73 | ## import answers from custodians 74 | answers <- read_csv(file="_management/resampling_answers.csv") 75 | 76 | header.sel.TV <- header.sel %>% 77 | left_join(header.TV %>% 78 | dplyr::select(PlotObservationID, `TV2 relevé number`:ORIG_REL_N), 79 | by="PlotObservationID") %>% 80 | dplyr::select(PlotObservationID, Dataset, `GIVD ID`, Longitude:Latitude, Country, `Location uncertainty (m)`, `TV2 relevé number`, `Original nr in database`, ORIG_REL_N) %>% 81 | left_join(databases %>% 82 | dplyr::select(`GIVD ID`, Custodian, `Deputy custodian`, `Still in sPlot`))# %>% 83 | ## join answers from google spreadsheet 84 | #left_join(answers) %>% 85 | #filter(`Yes/Conditional/No`!="Yes") 86 | 87 | write_csv(header.sel.TV, path="_output/header.sel123_tocheck_withIDs.csv") 88 | 89 | ### ADD NA-US-002 90 | header.sel.TV.add.NAUS002 <- header.sel.TV %>% 91 | filter(`GIVD ID`=="NA-US-002") 92 | write_csv(header.sel.TV.add.NAUS002, path="_output/header.sel123_addNAUS002.csv") 93 | 94 | ### ADD SOPHY 95 | header.sel.TV.Sophy <- header.sel.TV %>% 96 | filter(`GIVD ID`=="EU-FR-003") %>% 97 | dplyr::select(-`Still in sPlot`) %>% 98 | mutate(`Usable in sPlot Project 02`="") 99 | write_csv(header.sel.TV.Sophy, path="_output/header.sel123_Sophy.csv") 100 | 101 | ### EXTRACT RELEVE numbers for EU-CZ-001 102 | header.TV.joined <- header.TV %>% 103 | left_join(header %>% 104 | 
dplyr::select(-Longitude, -Latitude, -`Location uncertainty (m)`, -Dataset), 105 | by="PlotObservationID") %>% 106 | dplyr::select(PlotObservationID, Dataset, `GIVD ID`, Longitude:Latitude, Country, `Location uncertainty (m)`, `TV2 relevé number`, `Original nr in database`, ORIG_REL_N) %>% 107 | left_join(databases %>% 108 | dplyr::select(`GIVD ID`, Custodian, `Deputy custodian`, `Still in sPlot`)) 109 | header.sel.TV.EUCZ001 <- header.TV.joined %>% 110 | filter(`GIVD ID`=="EU-CZ-001") 111 | write_csv(header.sel.TV.EUCZ001, path="_output/header.sel123_EUCZ001.csv") 112 | 113 | ## extract RAINFOR Data + DAta From PERU 114 | header.sel.TV.0000001 <- header.TV.joined %>% 115 | filter(`GIVD ID`=="00-00-001") 116 | write_csv(header.sel.TV.0000001, path="_output/header.sel123_0000001.csv") 117 | 118 | header.sel.TV.Peru <- header.TV.joined %>% 119 | filter(Country=="Peru") 120 | write_csv(header.sel.TV.0000001, path="_output/header.Peru.csv") 121 | 122 | ### EXTRACT RELEVE NUMBERS FOR AS-KG-001 123 | header.sel.TV.add.ASKG001 <- header.sel.TV %>% 124 | filter(`GIVD ID`=="AS-KG-001") 125 | write_csv(header.sel.TV.add.ASKG001, path="_output/header.sel123_ASKG001.csv") 126 | 127 | ### EXTRACT RELEVE NUMBERS FOR SALVIAS 00-00-002 128 | header.sel.TV.add.SALVIAS <- header.sel.TV %>% 129 | filter(`GIVD ID`=="00-00-003") 130 | write_csv(header.sel.TV.add.SALVIAS, path="_output/header.sel123_SALVIAS.csv") 131 | 132 | 133 | ##summarize 134 | header.sel.TV.summary <- header.sel %>% 135 | group_by(`GIVD ID`) %>% 136 | summarize(n.sel.plot=n()) %>% 137 | left_join(header %>% 138 | group_by(`GIVD ID`) %>% 139 | summarize(n.tot.plot=n()), 140 | by="GIVD ID") %>% 141 | mutate(share.perc=n.sel.plot/n.tot.plot*100) 142 | 143 | 144 | 145 | ## Prepare demonstration subset for Miguel Alvarez, custodian of SWEA 146 | load("/data/sPlot/releases/sPlot2.1/DT2_20161025.RData") 147 | swea.header <- header.sel %>% 148 | filter(`GIVD ID`=="AF-00-006") 149 | 150 | swea.IDs <- header.sel.TV %>% 151 | filter(`GIVD ID`=="AF-00-006") %>% 152 | dplyr::select(-Custodian, -`Deputy custodian`, -`Still in sPlot`) 153 | 154 | swea.dt <- DT2 %>% 155 | filter(PlotObservationID %in% (swea.header %>% 156 | pull(PlotObservationID))) %>% 157 | dplyr::select(-Taxon.group, -Layer) 158 | 159 | write_csv(swea.header, path="_output/header.SWEA.csv") 160 | write_csv(swea.IDs, path="_output/IDs.SWEA.csv") 161 | write_csv(swea.dt, path="_output/DT2.SWEA.csv") 162 | 163 | 164 | 165 | #################################################################################################################### 166 | ######### Reimport answers from dataset custodians and build redundant selection of plots that can be used ######### 167 | #################################################################################################################### 168 | rm(list=ls()) 169 | ## reimport selected plots 170 | PlotID123 <- read_csv("_data/PlotID_run_1_2_3.csv") %>% 171 | rename(PlotObservationID=x) 172 | #load header and fix 173 | load("/data/sPlot/releases/sPlot2.1/sPlot_header_20161124.RData") 174 | 175 | ## GIT checkout to last stable version for sPlot 2.1 in GIT for consortium management data 176 | source("/data/sPlot/users/Francesco/_sPlot_Management/Fix.header.R") 177 | header <- fix.header(header, exclude.sophy = F) 178 | # GIT checkout baco to master! 
- i.e., switch back to most up-to-date version in GIT consortium 179 | databases <- read_csv("/data/sPlot/users/Francesco/_sPlot_Management/Consortium/Databases.out.csv") 180 | 181 | header.sel <- header %>% 182 | filter(PlotObservationID %in% PlotID123$PlotObservationID) 183 | 184 | ### Data from Stephan - Check for ID matches 185 | header.TV <- read_delim(file = "_data/sPlot-2.1_header.csv", delim = "\t", 186 | col_types = cols(PlotObservationID = col_double(), 187 | PlotID = col_double(), 188 | `TV2 relevé number` = col_double(), 189 | `Original nr in database` = col_character(), 190 | ORIGDB_NR = col_character(), 191 | ORIGDB_NR_1 = col_character(), 192 | ORIG_REL_N = col_character(), 193 | Longitude = col_double(), 194 | Latitude = col_double(), 195 | `Location uncertainty (m)` = col_double(), 196 | Dataset = col_character())) 197 | 198 | ## import answers from custodians GIVD level 199 | library(openxlsx) 200 | #SWEA CONFIRMED ! - reimport answers 201 | 202 | #answers <- read_csv(file="_management/resampling_answers.csv") 203 | answers <- openxlsx::read.xlsx("_management/resampling_answers.xlsx", sheet = 2) 204 | answers <- answers %>% 205 | mutate(`Yes/Conditional/No`=fct_recode(`Yes/Conditional/No`, No="NO", Yes="yes")) %>% 206 | # Manually set some dataset to yes 207 | # Rasmus Revermann and Donald Walker's acceptance is conditional, 208 | # but depends on conditions others than the selection of plot 209 | # Brian Enquist confirmed for SALVIAS [29.04.2020] 210 | mutate(`Yes/Conditional/No`=replace(`Yes/Conditional/No`, 211 | list=GIVD.ID %in% c("NA-US-014","AF-00-009", 212 | "AF-00-006", "00-00-003"), 213 | values="Yes")) 214 | 215 | # join header with Turboveg information by PlotObservationID, and attach dataset-level answers from custodians 216 | header.sel.TV <- header.sel %>% 217 | left_join(header.TV %>% 218 | dplyr::select(PlotObservationID, `TV2 relevé number`:ORIG_REL_N), 219 | by="PlotObservationID") %>% 220 | dplyr::select(PlotObservationID, Dataset, `GIVD ID`, Longitude:Latitude, Country, `Location uncertainty (m)`, `TV2 relevé number`, `Original nr in database`, ORIG_REL_N) %>% 221 | left_join(databases %>% 222 | dplyr::select(`GIVD ID`, Custodian, `Deputy custodian`, `Still in sPlot`)) %>% 223 | left_join(answers %>% 224 | rename(`GIVD ID`=GIVD.ID)) 225 | 226 | ## import google spreadsheet with plots marked individually 227 | plots.checked <- openxlsx::read.xlsx("_management/header.sel123_checked_20200319.xlsx", sheet = 1) 228 | plots.checked <- plots.checked %>% 229 | as.tbl() %>% 230 | mutate(Usable.in.Paper.02=ifelse(GIVD.ID=="00-RU-002", NA, Usable.in.Paper.02)) %>% 231 | dplyr::select(PlotObservationID, Usable.in.Paper.02) 232 | 233 | 234 | 235 | 236 | ## DB whose plots are marked in other sources: 237 | # CZ 238 | # Siberia - 00-RU-002 [Milan] 239 | # Germany 240 | # NA-US-002 241 | 242 | ## import answers from custodians GIVD level 243 | ## Czechia - EU-CZ-001 244 | CZ.id <- read_csv("_management/Header_EUCZ001_20191029_checked.csv") 245 | CZ.id <- CZ.id %>% 246 | mutate(Canbeused=ifelse(Canbeused=="Y", "Yes", "No")) %>% 247 | dplyr::select(PlotObservationID, Usable.in.Paper.02=Canbeused) 248 | 249 | ## GVRD 250 | GVRD <- openxlsx::read.xlsx("_management/GVRD_check_openaccess_short.xlsx", sheet = 1, rowNames=F) 251 | GVRD <- header.sel.TV %>% 252 | filter(`GIVD ID`=="EU-DE-014") %>% 253 | dplyr::select(PlotObservationID, `GIVD ID`, `TV2 relevé number`) %>% 254 | left_join(GVRD %>% 255 | as.tbl() %>% 256 | 
mutate(Usable.in.Paper.02=ifelse(Usable.in.Paper.02=="YES", "Yes", "No")) %>% 257 | dplyr::select(`GIVD ID`=GIVD.ID, `TV2 relevé number`=TV2.relevé.number, Usable.in.Paper.02), 258 | by=c("GIVD ID", "TV2 relevé number")) %>% 259 | dplyr::select(-`TV2 relevé number`, -`GIVD ID`) 260 | 261 | ## Siberia - This part of code ONLY selects the usable plots. Header and DT will need to be replaced with the new version afterwards 262 | ### Import updated data from 00-RU-002 263 | Siberia.public <- openxlsx::read.xlsx("_management/Siberia_update/00-RU-002-potentially-public-plots.xlsx", sheet=1) 264 | Siberia.new <- read_delim("_management/Siberia_update/Siberia_Chytry_header.csv", delim="\t", 265 | col_types = cols(PlotObservationID = col_double(), 266 | PlotID = col_double(), 267 | `TV2 relevé number` = col_double(), 268 | Country = col_character(), 269 | `Cover abundance scale` = col_character(), 270 | `Date of recording` = col_character(), 271 | `Relevé area (m²)` = col_double(), 272 | `Altitude (m)` = col_double(), 273 | `Aspect (°)` = col_logical(), 274 | `Slope (°)` = col_logical(), 275 | `Cover total (%)` = col_logical(), 276 | `Cover tree layer (%)` = col_double(), 277 | `Cover shrub layer (%)` = col_double(), 278 | `Cover herb layer (%)` = col_double(), 279 | `Cover moss layer (%)` = col_double(), 280 | `Cover lichen layer (%)` = col_logical(), 281 | `Cover algae layer (%)` = col_logical(), 282 | `Cover litter layer (%)` = col_double(), 283 | `Cover open water (%)` = col_double(), 284 | `Cover bare rock (%)` = col_double(), 285 | `Height (highest) trees (m)` = col_double(), 286 | `Height lowest trees (m)` = col_double(), 287 | `Height (highest) shrubs (m)` = col_double(), 288 | `Height lowest shrubs (m)` = col_double(), 289 | `Aver. height (high) herbs (cm)` = col_double(), 290 | `Aver. 
height lowest herbs (cm)` = col_double(), 291 | `Maximum height herbs (cm)` = col_double(), 292 | `Maximum height cryptogams (mm)` = col_double(), 293 | `Mosses identified (y/n)` = col_character(), 294 | `Lichens identified (y/n)` = col_character(), 295 | COMMUNITY = col_character(), 296 | SUBSTRATE = col_character(), 297 | Locality = col_character(), 298 | ORIG_NUM = col_character(), 299 | ALLIAN_REV = col_character(), 300 | REV_AUTHOR = col_character(), 301 | Forest = col_logical(), 302 | Grassland = col_logical(), 303 | Wetland = col_logical(), 304 | `Sparse vegetation` = col_logical(), 305 | Shrubland = col_logical(), 306 | `Plants recorded` = col_character(), 307 | `Herbs identified (y/n)` = col_character(), 308 | Naturalness = col_integer(), 309 | EUNIS = col_character(), 310 | FIELD_NO = col_character(), 311 | Longitude = col_double(), 312 | Latitude = col_double(), 313 | `Location uncertainty (m)` = col_double(), 314 | Dataset = col_character(), 315 | `Access regime` = col_character())) %>% 316 | mutate(Usable.in.Paper.02=ifelse(FIELD_NO %in% Siberia.public$FIELD_NO, "Yes", "No")) %>% 317 | dplyr::select(-PlotObservationID) %>% 318 | # attach PlotObservationID from header 319 | left_join(header.sel %>% 320 | filter(Dataset=="Siberia_Chytry") %>% 321 | dplyr::select(PlotObservationID) %>% 322 | left_join(header.TV %>% 323 | dplyr::select(PlotObservationID, `TV2 relevé number`), 324 | by="PlotObservationID"), 325 | by=c("TV2 relevé number")) %>% 326 | dplyr::select(PlotObservationID, Usable.in.Paper.02) %>% 327 | # exclude new plots not in sPlot 2.1 328 | filter(!is.na(PlotObservationID)) 329 | 330 | ### NA-US-002 - USA_vegbank 331 | vegbank0 <- openxlsx::read.xlsx("_management/sPlot-Open-NA-US-002.xlsx", sheet=1) 332 | vegbank <- vegbank0 %>% 333 | mutate(Bob.answer=replace(Bob.answer, 334 | list=Bob.answer %in% c("No ", "no"), 335 | values="No")) %>% 336 | mutate(Bob.answer=replace(Bob.answer, 337 | list=Bob.answer=="yes", 338 | values="Yes")) %>% 339 | mutate(`Usable.in.Paper.02`=Bob.answer) %>% 340 | dplyr::select(PlotObservationID, Usable.in.Paper.02) %>% 341 | filter(!is.na(PlotObservationID)) 342 | 343 | ## SOPHY 344 | sophy <- read_csv("_management/RLV_espaces_naturels-buffer_10m.csv") %>% 345 | dplyr::select(PlotObservationID=PlotObserv) %>% 346 | mutate(Usable.in.Paper.02="Yes") 347 | 348 | 349 | ### the turboveg codes of the new and old database match, but there are many new plots in Siberia.new. 
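## Optional sanity checks (a minimal sketch, assuming the Siberia.new pipe above
## ran unchanged, so the object holds only PlotObservationID and Usable.in.Paper.02):
## report rows duplicated by the TV2-relevé-number join, if any, and tally how many
## of the matched Siberia plots were flagged as usable before coalescing below.
Siberia.new %>% filter(duplicated(PlotObservationID))  # duplicated plot IDs, if any
Siberia.new %>% count(Usable.in.Paper.02)              # usable vs. non-usable Siberia plots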
350 | # check <- header.sel %>% 351 | # filter(Dataset=="Siberia_Chytry") %>% 352 | # left_join(header.TV %>% 353 | # dplyr::select(PlotObservationID, `TV2 relevé number`:ORIG_REL_N), 354 | # by="PlotObservationID") %>% 355 | # dplyr::select(Dataset, PlotObservationID, `Original nr in database`, `GIVD ID`, Country:`Slope (°)`, Longitude:Latitude, `Location uncertainty (m)`, `TV2 relevé number`) %>% 356 | # left_join(Siberia.new %>% 357 | # dplyr::select(Dataset, PlotID:`Slope (°)`, Latitude, Longitude), 358 | # by=c("Dataset", "TV2 relevé number")) 359 | 360 | 361 | 362 | 363 | 364 | 365 | ### Coalesce files 366 | header.sel.final <- header.sel.TV %>% 367 | dplyr::select(-Longitude, -Latitude, -`Location uncertainty (m)`, -ORIG_REL_N, -`TV2 relevé number`, -`Original nr in database`) %>% 368 | left_join(plots.checked, by="PlotObservationID") %>% 369 | left_join(CZ.id, by="PlotObservationID") %>% 370 | left_join(GVRD, by="PlotObservationID") %>% 371 | left_join(Siberia.new, by="PlotObservationID") %>% 372 | left_join(vegbank, by="PlotObservationID") %>% 373 | mutate(Usable=ifelse(`Yes/Conditional/No`=="Yes", "Yes", NA)) %>% 374 | mutate(Usable=ifelse(`Yes/Conditional/No`=="No", "No", Usable)) %>% 375 | mutate(Usable=coalesce(Usable, Usable.in.Paper.02.x, Usable.in.Paper.02.y, Usable.in.Paper.02.x.x, Usable.in.Paper.02.y.y, Usable.in.Paper.02)) %>% 376 | dplyr::select(-Usable.in.Paper.02.x, -Usable.in.Paper.02.y, -Usable.in.Paper.02.x.x, -Usable.in.Paper.02.y.y, -Usable.in.Paper.02) %>% 377 | dplyr::select(-`Still in sPlot`, -`Yes/Conditional/No`) %>% 378 | ##join France data from SOPHY 379 | left_join(sophy, by="PlotObservationID") %>% 380 | mutate(Usable=coalesce(Usable, Usable.in.Paper.02)) %>% 381 | mutate(Usable=ifelse(`GIVD ID`=="EU-FR-003" & is.na(Usable), 382 | "No", Usable)) %>% 383 | dplyr::select(-Usable.in.Paper.02) %>% 384 | mutate(Usable=ifelse(is.na(Usable), "Unknown", Usable)) 385 | 386 | 387 | summary.sel.final <- header.sel.final %>% 388 | group_by(`GIVD ID`, Dataset, Custodian, `Deputy custodian`) %>% 389 | summarize(usable=sum(Usable=="Yes"), 390 | not.usable=sum(Usable=="No"), 391 | unknown=sum(Usable=="Unknown")) %>% 392 | arrange(desc(unknown), desc(not.usable), desc(usable), desc(`GIVD ID`)) 393 | 394 | print(summary.sel.final, n=20) 395 | #AS-KG-001# still waiting for plot selection 396 | #00-RU-001# need to be reimported from turboveg (asked Stephan Hennekens) 397 | 398 | write_csv(summary.sel.final, "_output/summary.sel.final.csv") 399 | write_csv(header.sel.final, "_output/header.sel.final.csv") 400 | 401 | 402 | ### Export plot ID of Angola data for Rasmus Revermann. Angola data will be updated. 403 | 404 | angola.sel <- header.sel.TV %>% 405 | filter(PlotObservationID %in% header.sel.final$PlotObservationID) %>% 406 | filter(Dataset=="Angola") %>% 407 | select(1:6, 8) 408 | write_csv(angola.sel, "_output/Angola_sel.csv") 409 | -------------------------------------------------------------------------------- /03_AuthorList.R: -------------------------------------------------------------------------------- 1 | ### Code to administer the author list and their affiliations 2 | ### of sPlotOpen_Manuscript 3 | ### This code also formats authors affiliations to Manubot's standards (yaml) 4 | library(tidyverse) 5 | library(stringr) 6 | filter <- dplyr::filter 7 | 8 | #### 0. 
Ancillary functions #### 9 | # Function 1 - Extract name initials 10 | # credits: https://codereview.stackexchange.com/questions/150624/extracting-initials-with-r 11 | initials <- function(full.name) { 12 | # Returns initials of a full name 13 | # Input will contain only letters (uppercase and/or lowercase) plus 14 | # single spaces between words. Folks like Joseph Gordon-Levitt, 15 | # Conan O’Brien, and David J. Malan won’t be using your program. (If only!) 16 | if (nchar(full.name) == 0) { 17 | stop ("Valid name please") 18 | } 19 | isspace <- integer(0) 20 | fn.split <- unlist(strsplit(full.name, fixed = TRUE, split = "")) 21 | isspace <- which(fn.split == " ") 22 | init <- toupper(fn.split[c(1, (isspace+1))]) 23 | paste(init, collapse = "") 24 | } 25 | 26 | #### Function 2 - format name, orcid, email and affiliation info into the metadata.yaml standard for manubot 27 | create.yaml <- function(x, file.output){ 28 | tmp <- affiliations %>% 29 | # mutate(github="") %>% 30 | filter(name==x) %>% 31 | pivot_longer(!Sequence_affiliations, names_to = "tag") %>% 32 | arrange(Sequence_affiliations) %>% 33 | dplyr::select(-Sequence_affiliations) %>% 34 | mutate(tag=factor(tag, levels=c("github", "name","initials", 35 | "orcid", "twitter", "email", 36 | "affiliations","correspondence", "symbol_str"))) %>% 37 | distinct() %>% 38 | bind_rows(data.frame(tag="affiliations", 39 | value=paste0("\n - ", 40 | paste({.} %>% 41 | filter(tag=="affiliations") %>% 42 | pull(value), 43 | collapse="\n - ")))) %>% 44 | group_by(tag) %>% 45 | slice(n()) %>% 46 | ungroup() %>% 47 | mutate(tag=as.character(tag)) %>% 48 | mutate(tag=ifelse(tag=="github", " - github", paste0(" ", tag))) %>% 49 | filter(!is.na(value)) %>% 50 | unite(tag:value, 51 | sep = ": ", 52 | col = "newtag", 53 | remove = T) %>% 54 | mutate(newtag=str_remove(newtag, pattern = '"')) %>% 55 | mutate(newtag=str_remove(newtag, pattern = '"')) %>% 56 | mutate(newtag=gsub(pattern="†", replacement = '"†"', x=newtag)) %>% 57 | mutate(newtag=str_replace_all(string = newtag, pattern=", , ", replacement = ", ")) %>% 58 | mutate(newtag=str_replace_all(string = newtag, pattern=", , ", replacement = ", ")) %>% 59 | mutate(newtag=str_replace_all(string = newtag, pattern=" NA,", replacement = "")) %>% 60 | mutate(newtag=gsub(pattern=", $", replacement = "", x=newtag)) 61 | 62 | write_lines(tmp[[1]], file = file.output, append=T) 63 | } 64 | 65 | 66 | ## function 3 - ### get affiliation data from google sheet 67 | get.affiliation.gs <- function(name.to.match, aff.gs){ 68 | n.tmp <- which(paste(aff.gs$`First Name`, aff.gs$`Last Name`, sep=" ") == name.to.match) 69 | aff.gs0 <- aff.gs %>% mutate_at(.vars=vars(starts_with("Postal code")), .funs=funs(as.character)) 70 | if(length(n.tmp)==0){stop("No matched name")} 71 | out <- aff.gs0[n.tmp,] %>% 72 | mutate(Name=paste(`First Name`, `Last Name`, sep=" ")) %>% 73 | mutate(Sequence_affiliations=1) %>% 74 | dplyr::select(Surname=`Last Name`, Name, Sequence_affiliations, `Preferred email`, `Alternative email`, `ORCID`, 75 | `Department/Institute/Faculty`, `University/Institution`, Street, `Postal code`, Town, Country) %>% 76 | rename(`E-Mail`=`Preferred email`) %>% 77 | rename(`Second E-Mail`=`Alternative email`) 78 | if(is.na(aff.gs0[n.tmp,]$"Would you like to add another affiliation")){return(out); stop()} 79 | if(aff.gs0[n.tmp,]$"Would you like to add another affiliation"=="Yes"){ 80 | out <- out %>% 81 | bind_rows(aff.gs0[n.tmp,] %>% 82 | mutate(Name=paste(`First Name`, `Last Name`, sep=" ")) %>% 83 | 
mutate(Sequence_affiliations=2) %>% 84 | dplyr::select(Surname=`Last Name`, Name, Sequence_affiliations, `Preferred email`, `Alternative email`, `ORCID`, 85 | `Department/Institute/Faculty_1`, `University/Institution_1`, Street_1, `Postal code_1`, Town_1, Country_1) %>% 86 | rename_at(.vars = vars(ends_with("_1")), 87 | .funs = funs(sub("_1", "", .))) %>% 88 | rename(`E-Mail`=`Preferred email`) %>% 89 | rename(`Second E-Mail`=`Alternative email`))} 90 | if(is.na(aff.gs0[n.tmp,]$"Would you like to add another affiliation_1")){return(out); stop()} 91 | if(aff.gs0[n.tmp,]$"Would you like to add another affiliation_1"=="Yes"){ 92 | out <- out %>% 93 | bind_rows(aff.gs0[n.tmp,] %>% 94 | mutate(Name=paste(`First Name`, `Last Name`, sep=" ")) %>% 95 | mutate(Sequence_affiliations=3) %>% 96 | dplyr::select(Surname=`Last Name`,Name, Sequence_affiliations, `Preferred email`, `Alternative email`, `ORCID`, 97 | `Department/Institute/Faculty_2`, `University/Institution_2`, Street_2, `Postal code_2`, Town_2, Country_2) %>% 98 | rename_at(.vars = vars(ends_with("_2")), 99 | .funs = funs(sub("_2", "", .))) %>% 100 | rename(`E-Mail`=`Preferred email`) %>% 101 | rename(`Second E-Mail`=`Alternative email`))} 102 | return(out) 103 | } 104 | 105 | 106 | #### 1. Import data #### 107 | ##import sPlotOpen data 108 | path <- "_sPlotOpenDB" 109 | load(file = file.path(path, "sPlotOpen.RData")) 110 | table1 <- read_csv("_output/Table1_Databases.csv") 111 | 112 | ## Import affiliation information of sPlot members 113 | allroles <- read_csv("/data/sPlot/users/Francesco/_sPlot_Management/Consortium/roles.csv") 114 | allaffiliations <- read_csv("/data/sPlot/users/Francesco/_sPlot_Management/Consortium/Affiliations.csv") %>% 115 | left_join(allroles %>% 116 | dplyr::select(Name,Surname), 117 | by="Name") 118 | 119 | #### 2. Start coauthor list #### 120 | #### 2.1. First authors, core sPlot + last author #### 121 | first <- allaffiliations %>% 122 | filter(Name %in% c("Francesco Maria Sabatini", "Jonathan Lenoir")) %>% 123 | arrange(Name) 124 | ##third author Tarek Habab 125 | third <- tibble(Name="Tarek Hattab", 126 | `E-Mail`="Tarek.Hattab@ifremer.fr", 127 | ORCID="0000-0002-1420-5758", 128 | `Department/Institute/Faculty`="CNRS, IFREMER and IRD", 129 | `University/Institution`="MARBEC, Univ Montpellier", 130 | Town="Sète", 131 | Country="France", 132 | Surname="Hattab") 133 | 134 | custodians <- unique(table1$Custodian) 135 | # Splot core team 136 | core <- allaffiliations %>% 137 | filter(Name %in% (allroles %>% 138 | filter(`Core team` == T) %>% 139 | filter(!Surname %in% c("Bruelheide", "Sabatini", "Lenoir")) %>% 140 | pull(Name))) %>% 141 | arrange(Surname) 142 | #last author 143 | last <- allaffiliations %>% 144 | filter(Name=="Helge Bruelheide") 145 | 146 | #### 2.2. 
Additional opt-ins #### 147 | #Import affiliation info for additioal opt-in coauthors 148 | #(Update 09 November 2020) 149 | optins <- read_csv("_management/Opt-in - Project #02 (Responses) - Form responses 1.csv") %>% 150 | filter(X3=="I read and understand sPlot rules and would like to opt-in to this project") %>% 151 | filter(!Name %in% c("Alicia Acosta", "Bruno Herault")) %>% #already among custodians, but with name spelled differently 152 | dplyr::select(Surname, Name, `Email address`, Affiliation, Address) %>% 153 | #join affiliations from our records 154 | left_join(allaffiliations, by="Name") %>% 155 | #replace affiliations with opt-in input if record is not in archive 156 | mutate(`University/Institution`=ifelse(is.na(`E-Mail`), Affiliation, `University/Institution`)) %>% 157 | mutate(Street=ifelse(is.na(`E-Mail`), Address, Street)) %>% 158 | mutate(Surname=coalesce(Surname.y, Surname.x)) %>% 159 | dplyr::select(-Surname.x, -Surname.y) %>% 160 | mutate(`E-Mail`=ifelse(is.na(`E-Mail`), `Email address`, `E-Mail`)) %>% 161 | mutate(Sequence_affiliations=ifelse(is.na(Sequence_affiliations), 1, Sequence_affiliations)) %>% 162 | dplyr::select(Name, Surname, Sequence_affiliations:Country) %>% 163 | mutate_if(.predicate = ~is.character(.), 164 | .funs = list(~gsub(pattern = ",$", replacement = "", x = {.}))) %>% 165 | filter(!Name %in% c(first, third, custodians, core)) %>% 166 | mutate(ORCID=replace(ORCID, 167 | list=Name=="Inger Greve Alsos", 168 | values="0000-0002-8610-1085")) %>% 169 | mutate(Town=replace(Town, 170 | list=Name=="John-Arvid Grytnes", 171 | values="Bergen")) %>% 172 | mutate(Country=replace(Country, 173 | list=Name=="John-Arvid Grytnes", 174 | values="Norway")) %>% 175 | mutate(ORCID=ifelse(Surname=="Zobel", "0000-0001-7957-6704", ORCID)) %>% 176 | mutate(ORCID=ifelse(Surname=="Brunet", "0000-0003-2667-4575", ORCID)) %>% 177 | mutate(Street=str_replace(Street, "Czech Repunlic", "Czech Republic")) %>% 178 | mutate(Street=str_replace(Street, "Jr. José Sabogal #913", "Jr. 
José Sabogal 913")) 179 | 180 | 181 | 182 | 183 | ### second batch of opt-ins (mostly from TRY) 184 | ### 14/12/2020 185 | optin2 <- openxlsx::read.xlsx("_management/UpdateAffiliations - sPlot (Responses).xlsx", sheet=1) 186 | colnames(optin2)[16:22] <- paste0(colnames(optin2)[16:22], "_1") 187 | colnames(optin2)[23:28] <- paste0(colnames(optin2)[23:28], "_2") 188 | optin2 <- optin2[,-c(33,34,35)] 189 | optin2 <- optin2 %>% 190 | rename_all(.funs=~gsub(pattern=".", replacement=" ", x=., fixed=T)) 191 | 192 | names_to_import <- c('Meelis Pärtel', 'Sophie Gachet', 'Josep Penuela', 'Dirk Nikolaus Karger', 'Gregory Richard Guerin', 'Attila Lengyel', #20.11.2020 193 | 'Frederic Lens', 'Débora Vanessa Lingner', "Arindam Banerjee", "Farideh Fazayeli", 194 | "Hanhuai Shan") # update 14.12.2020 195 | 196 | 197 | optin2.aff <- NULL 198 | for(n in names_to_import) { 199 | optin2.aff <- optin2.aff %>% 200 | bind_rows(get.affiliation.gs(n, aff.gs = optin2)) 201 | } 202 | 203 | optin2.aff <- optin2.aff %>% 204 | mutate(Name=replace(Name, 205 | list=Name=="Josep Penuela", 206 | values="Josep Peñuelas")) %>% 207 | mutate(Surname=replace(Surname, 208 | list=Surname=="Penuela", 209 | values="Peñuelas")) %>% 210 | mutate(Country=replace(Country, 211 | list=Surname=="Peñuelas", 212 | values="Spain")) %>% 213 | rowwise() %>% 214 | mutate(Town=ifelse(Surname=="Peñuelas", paste0(Town, ", Catalonia"), Town)) %>% 215 | ungroup() %>% 216 | #Add second affiliation Lens 217 | bind_rows( 218 | tibble( 219 | Name = "Frederic Lens", 220 | Surname = "Lens", 221 | Sequence_affiliations=2, 222 | ORCID="0000-0002-5001-0149", 223 | `E-Mail` = "frederic.lens@naturalis.nl", 224 | `Department/Institute/Faculty` = "Institute of Biology Leiden", 225 | `University/Institution` = "Leiden University", 226 | Street = "Sylviusweg 72", 227 | `Postal code` = "2333 BE", 228 | Town = "Leiden", 229 | Country = "The Netherlands" 230 | ) 231 | ) 232 | 233 | 234 | 235 | #### 2.3. 
Merge lists #### 236 | # first + core sPlot [alphabetical] + (custodians + opt-ins + TRY) [alphabetical] + last author 237 | affiliations0 <- first %>% 238 | bind_rows(third) %>% 239 | bind_rows(core) %>% 240 | bind_rows(allaffiliations %>% 241 | filter(Name %in% custodians) %>% 242 | filter(!Name %in% core$Name) %>% 243 | filter(!Name %in% first$Name) %>% 244 | filter(!Name %in% last$Name) %>% 245 | ## add additional coauthors from SOPHY 246 | bind_rows( 247 | tibble( 248 | Name = "Guillermo Hinojos Mendoza", 249 | `E-Mail` = "ghinojos@asessc.net", 250 | `Department/Institute/Faculty` = "Pépinière d’Entreprises l’Espélidou, Parc d’Activités du Vinobre", 251 | `University/Institution` = "ASES Ecological and Sustainable Services", 252 | Street = "555 Chemin des Traverses, Lachapelle-sous-Aubenas", 253 | `Postal code` = "07200", 254 | Town = "Aubenas", 255 | Country = "France", 256 | Surname = "Hinojos Mendoza" 257 | ) 258 | ) %>% 259 | ## add additional coauthors from AF-CD-001 260 | bind_rows(allaffiliations %>% 261 | filter(Name == "Elizabeth Kearsley") %>% 262 | mutate( 263 | `Department/Institute/Faculty` = 264 | replace( 265 | `Department/Institute/Faculty`, 266 | list = Name == "Elizabeth Kearsley", 267 | values = 268 | "Department Environment, Computational and Applied Vegetation Ecology (UGent-CAVELab)" 269 | )) %>% 270 | mutate(Surname = "Kearsley")) %>% 271 | bind_rows( 272 | tibble( 273 | Name = "Wannes Hubau", 274 | Sequence_affiliations = c(1, 2), 275 | `E-Mail` = "wannes.hubau@ugent.be", 276 | `Department/Institute/Faculty` = c( 277 | "Department Environment, Laboratory of Wood Biology (UGent-WoodLab)", 278 | "Service of Wood Biology" 279 | ), 280 | `University/Institution` = c("Ghent University", "Royal Museum for Central Africa"), 281 | Street = c("Coupure Links 653", "Leuvensesteenweg 13"), 282 | `Postal code` = c("9000", "3080"), 283 | Town = c("Ghent", "Tervuren"), 284 | Country = "Belgium", 285 | Surname = "Hubau" 286 | ) 287 | ) %>% 288 | bind_rows( 289 | tibble( 290 | Name = "Marijn Bauters", 291 | Sequence_affiliations = c(1, 2), 292 | `E-Mail` = "marijn.bauters@ugent.be", 293 | `Department/Institute/Faculty` = 294 | c( 295 | "Department Green chemistry and technology, Isotope Bioscience laboratory (UGent-ISOFYS)", 296 | "Department Environment, Computational and Applied Vegetation Ecology (UGent-CAVELab)" 297 | ), 298 | `University/Institution` = "Ghent University", 299 | Street = "Coupure Links 653", 300 | `Postal code` = "9000", 301 | Town = "Ghent", 302 | Country = "Belgium", 303 | Surname = "Bauters" 304 | ) 305 | ) %>% 306 | 307 | ### Authors from RAINFOR - as recommended by Oliver Phillips 308 | bind_rows( 309 | tibble( 310 | Name = "Abel Monteagudo Mendoza", 311 | Sequence_affiliations = 1:2, 312 | ORCID="0000-0002-1047-845X", 313 | `E-Mail` = "amonteagudomendoza@gmail.com", 314 | `Department/Institute/Faculty` = "", 315 | `University/Institution` = c("Jardín Botánico de Missouri Oxapampa", "Universidad Nacional de San Antonio Abad del Cusco"), 316 | Street = c("Bolognesi Mz-E-6", "Av. 
de la Cultura 733"), 317 | `Postal code` = NA, 318 | Town = c("Oxapampa, Pasco", "Cusco"), 319 | Country = "Peru", 320 | Surname = "Monteagudo Mendoza" 321 | ) 322 | ) %>% 323 | bind_rows( 324 | tibble( 325 | Name = "Rodolfo Vásquez Martínez", 326 | Sequence_affiliations = 1, 327 | `E-Mail` = "neotaxon@yahoo.com", 328 | `Department/Institute/Faculty` = "", 329 | `University/Institution` = "Jardín Botánico de Missouri Oxapampa", 330 | Street = "Bolognesi Mz-E-6", 331 | `Postal code` = NA, 332 | Town = "Oxapampa, Pasco", 333 | Country = "Peru", 334 | Surname = "Vásquez Martínez" 335 | ) 336 | ) %>% 337 | bind_rows( 338 | tibble( 339 | Name = "Luzmila Arroyo", 340 | Sequence_affiliations = 1, 341 | `E-Mail` = "luzmilaarroyo@hotmail.com", 342 | `Department/Institute/Faculty` = "Dirección de la Carrera de Biología", 343 | `University/Institution` = "Universidad Autónoma Gabriel René Moreno", 344 | Street = NA, 345 | `Postal code` = NA, 346 | Town = "Santa Cruz de la Sierra", 347 | Country = "Bolivia", 348 | Surname = "Arroyo" 349 | ) 350 | ) %>% 351 | bind_rows( 352 | tibble( 353 | Name = "Timothy Killeen", 354 | Sequence_affiliations = 1, 355 | `E-Mail` = "timothy.j.killeen@gmail.com", 356 | `Department/Institute/Faculty` = 357 | "Museo de Historia Natural Noel Kempff Mercado", 358 | `University/Institution` = "Universidad Autonoma Gabriel Rene Moreno", 359 | Street = NA, 360 | `Postal code` = NA, 361 | Town = "Santa Cruz de la Sierra", 362 | Country = "Bolivia", 363 | Surname = "Killeen" 364 | ) 365 | ) %>% 366 | ## Add possible additional coauthors here 367 | bind_rows(allaffiliations %>% 368 | filter(Name%in% c(#"Anita Smyth", 369 | "Alireza Naqinezhad", 370 | "Sylvia Haider", 371 | "Pavel Shirokikh", 372 | "Alicia T.R. Acosta", 373 | "Bruno Hérault", 374 | 'Petr Petřík', 375 | "Donald M. Waller", 376 | "Yves Bergeron"))) %>% ## was custodian when writing the paper 377 | ## 378 | bind_rows(optins) %>% 379 | bind_rows(optin2.aff) %>% 380 | ## Joop Schaminée is not custodian anymore. Exclude? 
381 | ### 382 | arrange(Surname, Sequence_affiliations)) %>% 383 | bind_rows(last) %>% 384 | dplyr:::select(name=Name, email=`E-Mail`, orcid=ORCID, everything(), -Surname, -`Second E-Mail`) %>% 385 | replace_na(list(`Department/Institute/Faculty`="", Street="", `Postal code`="", Town="", Country="" )) %>% 386 | unite(`University/Institution`, `Department/Institute/Faculty`, Street:Country, sep = ", ", col="affiliations", remove=T) %>% 387 | mutate(affiliations=str_replace_all(string = affiliations, pattern=", , ", replacement = ", ")) %>% 388 | ### Add Github accounts, if required 389 | mutate(github="") %>% 390 | mutate(github=ifelse(name=="Miguel Alvarez", "kamapu", github)) %>% 391 | mutate(github=ifelse(name=="Francesco Maria Sabatini", "fmsabatini", github)) %>% 392 | mutate(github=ifelse(name=="Jonathan Lenoir", "lenjon", github)) %>% 393 | mutate(github=ifelse(name=="Helge Bruelheide", "Bruelheide", github)) %>% 394 | ### Add Twitter account, if required 395 | mutate(twitter=NA) %>% 396 | mutate(twitter=ifelse(name=="Francesco Maria Sabatini", "sPlot_iDiv", twitter)) %>% 397 | mutate(twitter=ifelse(name=="Jonathan Lenoir", "EkoLogIt", twitter)) %>% 398 | mutate(twitter=ifelse(name=="Helge Bruelheide", "HelgeBruelheide", twitter)) %>% 399 | ### Add correspondence 400 | mutate(correspondence=ifelse(name=="Francesco Maria Sabatini", "true", NA)) %>% 401 | ### Add equal contribution 402 | mutate(symbol_str=ifelse(name %in% c("Francesco Maria Sabatini", "Jonathan Lenoir"), 403 | '"†"', NA)) %>% 404 | ### Add initials 405 | rowwise() %>% 406 | mutate(initials=initials(name)) %>% 407 | ungroup() %>% 408 | ## correct typo 409 | mutate(name=replace(name, list=name=="Andraž Carni", values="Andraž Čarni")) 410 | 411 | affiliations <- affiliations0 %>% 412 | ## Update affiliation by Tsipe Aavik & Martin Zobel 413 | mutate(`affiliations`= replace(`affiliations`, 414 | list = name %in% c("Tsipe Aavik", "Martin Zobel"), 415 | values = (affiliations0 %>% filter(name=="Meelis Pärtel") %>% pull(`affiliations`)))) %>% 416 | mutate(orcid= replace(orcid, 417 | list = name=="Tsipe Aavik", 418 | values = "0000-0001-5232-3950")) 419 | 420 | 421 | 422 | #### 2.4 opt-out #### 423 | ## exclude authors who declined offer 424 | affiliations <- affiliations %>% 425 | filter(!name %in% c("Marten Winter", 426 | "Ching-Feng Li", 427 | "Kim Sarah Jacobsen", 428 | "Desalegn Wana", 429 | "Milan Valachovič", 430 | "Philippe Marchand")) #Not yet custodian when writing the paper 431 | 432 | 433 | #### 3. Create metadata.yaml file #### 434 | affi.out <- "../_manuscript/content/metadata.yaml" #create empty affiliation file 435 | ##populate yaml file with affiliation info 436 | write_lines(c( 437 | "---", 438 | 'title: "sPlotOpen – An environmentally-balanced, open-access, global dataset of vegetation plots"', 439 | 'keywords:', 440 | ' - vegetation', 441 | ' - database', 442 | ' - plants', 443 | ' - biodiversity', 444 | ' - functional traits', 445 | ' - big-data', 446 | ' - manubot', 447 | 'lang: en-US', 448 | 'authors:'), file = affi.out) 449 | lapply(affiliations %>% 450 | dplyr::select(name) %>% 451 | distinct() %>% 452 | pull(name), create.yaml, affi.out) 453 | 454 | 455 | 456 | 457 | #### 4. 
Create list of email addresses #### 458 | email <- affiliations %>% 459 | dplyr::select(name, email) %>% 460 | distinct() %>% 461 | mutate(export = paste0(name, " <", email, ">")) %>% 462 | dplyr::select(export) 463 | write_delim(email, file = "_output/Author_email.txt", delim="/t") 464 | 465 | ## Create email list for Opt-ins 466 | #### this list was used to invite additional coauthors, besides the custodians 467 | roles <- read_csv("/data/sPlot/users/Francesco/_sPlot_Management/Consortium/roles.csv") 468 | email.optins <- roles %>% 469 | filter(`Still in sPlot`) %>% 470 | arrange(Surname) %>% 471 | left_join(allaffiliations, by="Name") %>% 472 | dplyr::select(Name, email=`E-Mail`) %>% 473 | distinct() %>% 474 | filter(!Name %in% affiliations$name) %>% 475 | mutate(export = paste0(Name, " <", email, ">")) %>% 476 | dplyr::select(export) 477 | write_delim(email.optins, file = "_output/Optins_email", delim="/t" ) 478 | 479 | 480 | 481 | 482 | #### 5. Create checklist to approve submission #### 483 | approve.checklist <- affiliations %>% 484 | distinct(name) %>% 485 | arrange(name) %>% 486 | mutate(name=paste0(" - [ ] ", name)) 487 | write_delim(approve.checklist, file = "_output/Author_checklist.txt", delim="/t" ) 488 | ## second batch 489 | approve.checklist <- optin2.aff %>% 490 | #filter(Name %in% affiliations$name) %>% 491 | distinct(Name) %>% 492 | arrange(Name) %>% 493 | mutate(Name=paste0(" - [ ] ", Name)) 494 | write_delim(approve.checklist, file = "_output/Author_checklist_batch2.txt", delim="/t" ) 495 | 496 | 497 | 498 | #### 6. Create list of ORCIDs 499 | affiliations %>% 500 | distinct(name, orcid) %>% 501 | unite(name, name, orcid, sep=", ") %>% 502 | mutate(name=str_remove(name, pattern=" NA$")) %>% 503 | mutate(name=paste0(name, "\n")) %>% 504 | #pull(name) %>% 505 | #cat() %>% 506 | View() 507 | 508 | 509 | 510 | 511 | -------------------------------------------------------------------------------- /04_FiguresTables.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Project#02 - Figures and Tables" 3 | author: "Francesco Maria Sabatini" 4 | date: "4/28/2020" 5 | output: 6 | html_document: 7 | toc: true 8 | theme: united 9 | --- 10 | 11 |
12 | ![](https://www.idiv.de/fileadmin/content/Files_sDiv/sDiv_Workshops_Photos_Docs/sDiv_WS_Documents_sPlot/splot-long-rgb.png "sPlot Logo") 13 |
14 | 15 | 16 | 17 | 18 | **Timestamp:** `r date()` 19 | **Drafted:** Francesco Maria Sabatini 20 | **Revised:** 21 | **Version:** 1.0 22 | 23 | This report documents the creation of figures and tables for the sPlotOpen manuscript. 24 | 25 | ```{r results="hide", message=F, warning=F} 26 | library(tidyverse) 27 | #library(openxlsx) 28 | library(bib2df) 29 | library(knitr) 30 | library(kableExtra) 31 | library(viridis) 32 | library(plotbiomes) 33 | library(cowplot) 34 | 35 | library(raster) 36 | library(sp) 37 | library(sf) 38 | library(rgdal) 39 | library(rnaturalearth) 40 | library(dggridR) 41 | # library(rgeos) 42 | 43 | library(Taxonstand) 44 | 45 | #save temporary files 46 | write("TMPDIR = /data/sPlot/users/Francesco/_tmp", file=file.path(Sys.getenv('TMPDIR'), '.Renviron')) 47 | write("R_USER = /data/sPlot/users/Francesco/_tmp", file=file.path(Sys.getenv('R_USER'), '.Renviron')) 48 | #rasterOptions(tmpdir="/data/sPlot/users/Francesco/_tmp") 49 | ``` 50 | 51 | #Load sPlotOpen data and create spatial objects 52 | ```{r} 53 | load(file.path("_sPlotOpenDB", "sPlotOpen.RData")) 54 | #header.oa <- header.oa %>% 55 | # filter(!is.na(SoilClim_PC1)) 56 | ``` 57 | 58 | Data Preparation for spatial plotting 59 | ```{r, cache=T, results="hide", warning=F, message=F} 60 | header.sf <- SpatialPointsDataFrame(coords= header.oa %>% 61 | dplyr::select(Longitude, Latitude), 62 | proj4string = CRS("+init=epsg:4326"), 63 | data=data.frame(PlotObservationID=header.oa$PlotObservationID, 64 | Dataset=header.oa$Dataset)) %>% 65 | st_as_sf() %>% 66 | st_transform(crs = "+proj=eck4") 67 | ``` 68 | 69 | # Load ancillary geographic data and create figure templates 70 | 71 | Country boundaries and world graticule 72 | ```{r, cache=T, results="hide", warning=F, message=F} 73 | #data downloaded from rnaturalearth package 74 | countries <- readOGR("/data/sPlot/users/Francesco/Ancillary_Data/naturalearth/ne_110m_admin_0_countries.shp") %>% 75 | st_as_sf() %>% 76 | st_transform(crs = "+proj=eck4") %>% 77 | st_geometry() 78 | graticules <- readOGR("/data/sPlot/users/Francesco/Ancillary_Data/naturalearth/ne_110m_graticules_15.shp") %>% 79 | st_as_sf() %>% 80 | st_transform(crs = "+proj=eck4") %>% 81 | st_geometry() 82 | 83 | bb <- readOGR("/data/sPlot/users/Francesco/Ancillary_Data/naturalearth/ne_110m_wgs84_bounding_box.shp") %>% 84 | st_as_sf() %>% 85 | st_transform(crs = "+proj=eck4") %>% 86 | st_geometry() 87 | ``` 88 | 89 | 90 | Continent boundaries 91 | ```{r} 92 | sPDF <- rworldmap::getMap(resolution="coarse") 93 | continent <- sPDF[,"continent"] 94 | crs(continent) <- CRS("+init=epsg:4326") 95 | continent@data[243,"continent"] <- "South America" ## Manually correct missing data 96 | # create clipped version of continent to avoid going beyond 180 lON 97 | coords <- data.frame(x=c(-180,180,180,-180), 98 | y=c(-90,-90,90,90)) 99 | bboxc = Polygon(coords) 100 | bboxc = SpatialPolygons(list(Polygons(list(bboxc), ID = "a")), proj4string=crs(continent)) 101 | continent_clipped <- rgeos::gIntersection(continent[-137,], bboxc, byid=T) # polygon 137 gives problems... 
workaround 102 | continent_clipped <- continent_clipped %>% 103 | st_as_sf() 104 | ``` 105 | 106 | Template of Global map - with country borders 107 | ```{r, cache=T, results="hide", warning=F, message=F} 108 | # create ggplot template of the world map 109 | w3a <- ggplot() + 110 | geom_sf(data = bb, col = "grey20", fill = "white") + 111 | geom_sf(data = graticules, col = "grey20", lwd = 0.1) + 112 | geom_sf(data = countries, fill = "grey90", col = NA, lwd = 0.3) + 113 | coord_sf(crs = "+proj=eck4") + 114 | theme_minimal() + 115 | theme(axis.text = element_blank(), 116 | legend.title=element_text(size=12), 117 | legend.text=element_text(size=12), 118 | legend.background = element_rect(size=0.1, linetype="solid", colour = 1), 119 | legend.key.height = unit(1.1, "cm"), 120 | legend.key.width = unit(1.1, "cm")) + 121 | scale_fill_viridis() 122 | ``` 123 | 124 | Create template of Global Map - without country borders 125 | ```{r} 126 | w4a <- ggplot() + 127 | geom_sf(data = bb, col = "grey20", fill = "white") + 128 | geom_sf(data = continent_clipped, fill = "grey90", col = NA, lwd = 0.3) + 129 | geom_sf(data = bb, col = "grey20", fill = NA) + 130 | #geom_sf(data = graticules, col = "grey20", lwd = 0.1) + 131 | coord_sf(crs = "+proj=eck4") + 132 | theme_minimal() + 133 | theme(axis.text = element_blank(), 134 | legend.title=element_text(size=12), 135 | legend.text=element_text(size=12), 136 | legend.background = element_rect(size=0.1, linetype="solid", colour = 1), 137 | legend.key.height = unit(1.1, "cm"), 138 | legend.key.width = unit(1.1, "cm")) 139 | 140 | ``` 141 | 142 | 143 | 144 | # Figures 145 | Figures and tables for the manuscript in the [sPlotOpen_Manuscript](https://fmsabatini.github.io/sPlotOpen_Manuscript/) project. 146 | 147 | ## Figure 1 - Geographic distribution of plots 148 | 149 | Map of plot distribution - Version 1 - Coloured points. 150 | Each colour represents a database. 
Please note there are not enough colours in the palette to represent all 105 datasets 151 | ```{r, fig.width=8, fig.height=6, fig.align="center", warning=F, message=F, cache=T} 152 | Figure1a <- w3a + 153 | geom_sf(data=header.sf, aes(color=Dataset), pch="+", size=1, alpha=0.8) + # aes(col=Dataset), 154 | geom_sf(data = countries, col = "grey20", fill=NA, lwd = 0.3) + 155 | theme(legend.position = "none") 156 | ``` 157 | 158 | Version 2 - hexagons 159 | ```{r, fig.width=8, fig.height=6, fig.align="center", message=F, cache=T} 160 | header2 <- header.oa %>% 161 | dplyr::filter(Resample_1) %>% 162 | dplyr::select(PlotObservationID, Latitude, Longitude) %>% 163 | filter(!(abs(Longitude) >171 & abs(Latitude>70))) 164 | dggs <- dgconstruct(spacing=300, metric=T, resround='down') 165 | 166 | #Get the corresponding grid cells for each plot 167 | header2$cell <- dgGEO_to_SEQNUM(dggs, header2$Longitude, header2$Latitude)$seqnum 168 | 169 | #Calculate number of plots for each cell 170 | header.dggs <- header2 %>% 171 | group_by(cell) %>% 172 | summarise(value.out=log(n(), 10)) 173 | 174 | #Get the grid cell boundaries for cells 175 | grid <- dgcellstogrid(dggs, header.dggs$cell, frame=F) %>% 176 | st_as_sf() %>% 177 | mutate(cell = header.dggs$cell) %>% 178 | mutate(value.out=header.dggs$value.out) %>% 179 | st_transform("+proj=eck4") %>% 180 | st_wrap_dateline(options = c("WRAPDATELINE=YES")) 181 | 182 | ## plotting 183 | Figure1b <- w3a + 184 | geom_sf(data=grid, aes(fill=value.out),lwd=0, alpha=0.9) + 185 | geom_sf(data = countries, col = "grey20", fill=NA, lwd = 0.3) + 186 | scale_fill_viridis( 187 | name="# plots", breaks=0:5, labels = c("1", "10", "100", 188 | "1,000", "10,000", "100,000"), option="viridis") 189 | 190 | #ggsave("_output/figure1.png", plot=Figure1, width=8, height=4, units="in", dpi=300) 191 | ``` 192 | Panel with two versions of figure 1 193 | ```{r, fig.width=5.5, fig.height=5, fig.align="center", cache=T} 194 | fig1.leg <- get_legend(Figure1b + 195 | guides(fill = guide_colourbar(barwidth = 1, barheight = 6)) + 196 | theme(legend.title = element_text(size = 7), 197 | legend.text = element_text(size = 7))) 198 | fig1_panel <- plot_grid(Figure1a, NULL, 199 | Figure1b + 200 | theme(legend.position = "none"), fig1.leg, 201 | nrow=2, ncol=2, byrow = T, #labels = c("a","", "b", ""), 202 | rel_widths = c(0.84,0.16)) 203 | ggsave("_output/figure1.pdf", plot=fig1_panel, width=5.5, height=5, units="in", dpi=300) 204 | fig1_panel 205 | ``` 206 | 207 | 208 | ## Figure 2 - PCA graph + world map of selected plots 209 | Only for Resample draw #1 210 | Import PCA data 211 | ```{r} 212 | ### load PCA ordination of the world 213 | load("_data/pca3.RData") 214 | path.sPlot <- "/data/sPlot2.0/" 215 | load(paste(path.sPlot, "splot.world2.RData", sep="/")) 216 | ``` 217 | 218 | 219 | ```{r} 220 | plot_data <- header.oa %>% 221 | dplyr::filter(Resample_1) %>% 222 | dplyr::select(PlotObservationID, Longitude, Latitude) 223 | ## code adapted from @lenjon's 'resampling_2d_JL.R' 224 | CRSlonlat <- CRS("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs +towgs84=0,0,0") 225 | coords <- cbind(plot_data$Longitude, plot_data$Latitude) 226 | coords <- SpatialPoints(coords, proj4string=CRSlonlat) 227 | plot_data <- SpatialPointsDataFrame(coords, plot_data)#, proj4string=CRSlonlat) 228 | 229 | 230 | # Create world rasters of PCA values and extract plot values by geographic intersection 231 | # raster at half a degree resolution (cf. 
30 arc minute resolution) 232 | rgeo <- raster(nrows=360, ncols=720, xmn=-180, xmx=180, ymn=-90, ymx=90) 233 | rgeo <- disaggregate(rgeo, fact=12) # raster at 2.5 arc minute resolution 234 | splot.world2$cellID <- cellFromXY(rgeo, cbind(splot.world2$RAST_X, splot.world2$RAST_Y)) 235 | 236 | ### create rasters from PCA 237 | posit <- splot.world2$cellID 238 | temp <- getValues(rgeo) 239 | 240 | temp[posit] <- pca3$x[, 1] 241 | PC1_r <- setValues(rgeo, temp) 242 | temp[posit] <- pca3$x[, 2] 243 | PC2_r <- setValues(rgeo, temp) 244 | 245 | # Extract pca valus from PCA rasters 246 | plot_data@data$pc1_val <- extract(PC1_r, coordinates(plot_data)) 247 | plot_data@data$pc2_val <- extract(PC2_r, coordinates(plot_data)) 248 | 249 | 250 | # Compute the density of environmental conditions available at the global scale across the entire bivariate (PC1-PC2) environmental space 251 | res <- 100 # Setting the number of bins per PCA axis to 100 252 | reco <- raster(nrows=res, ncols=res, xmn=min(pca3$x[, 1]), xmx=max(pca3$x[, 1]), 253 | ymn=min(pca3$x[, 2]), ymx=max(pca3$x[, 2])) 254 | PC1_PC2_r <- rasterize(pca3$x[, 1:2], reco, fun="count") 255 | plot_data <- plot_data@data 256 | plot_data$pc_cellID <- cellFromXY(reco, cbind(plot_data$pc1_val, plot_data$pc2_val)) 257 | 258 | 259 | # Compute the sampling effort (number of vegetation plots) per environmental unit (cell) across the entire bivariate (PC1-PC2) environmental space 260 | sPlot_reco <- rasterize(plot_data[, c("pc1_val", "pc2_val")], reco, fun="count") 261 | # Put zero values for the empty cells (cf. there is no vegeteation plots available for those environmental conditions: gaps) 262 | temp1 <- getValues(PC1_PC2_r) 263 | temp1[!is.na(temp1)] <- 0 264 | temp2 <- getValues(sPlot_reco) 265 | temp2[which(temp1==0&is.na(temp2))] <- 0 266 | sPlot_reco <- setValues(reco, temp2) 267 | 268 | plot_data <- plot_data %>% 269 | rename(PC1=pc1_val, PC2=pc2_val) 270 | ``` 271 | 272 | Transform to tibbles 273 | ```{r} 274 | PC1_PC2.tbl <- data.frame(xyFromCell(PC1_PC2_r, cell = 1:10000)) %>% 275 | rename(PC1=x, PC2=y) %>% 276 | mutate(values=getValues(PC1_PC2_r)) %>% 277 | as_tibble() 278 | 279 | sPlot_reco.tbl <- data.frame(xyFromCell(sPlot_reco, cell = 1:10000)) %>% 280 | rename(PC1=x, PC2=y) %>% 281 | mutate(values=getValues(sPlot_reco)) %>% 282 | as_tibble() 283 | ``` 284 | 285 | 286 | 287 | Figure2 - Bottom left. 288 | Make gridded heatmap plot 289 | ```{r} 290 | pca.heatmap <- ggplot() + 291 | geom_tile(data=PC1_PC2.tbl %>% 292 | dplyr::filter(!is.na(values)), aes(x=PC1, y=PC2), 293 | col = gray(0.8), 294 | fill = gray(0.8)) + 295 | geom_tile(data=sPlot_reco.tbl %>% 296 | dplyr::filter(!is.na(values)) %>% 297 | dplyr::filter(values>0), ### !!! 298 | aes(x=PC1, y=PC2, col=values, fill=values)) + 299 | scale_fill_viridis("Number\nof plots", option = "magma") + 300 | scale_color_viridis("Number\nof plots", option = "magma") + 301 | theme_bw() + 302 | coord_equal() + 303 | theme(legend.position = "bottom", 304 | panel.grid.major = element_blank(), 305 | panel.grid.minor = element_blank()) 306 | 307 | ``` 308 | 309 | Figure 2 - Bottom right 310 | Randomly select four pixels with >45 plots, extract center coordinates and add to heatmap. 
311 | ```{r} 312 | set.seed(558) 313 | ABC <- sPlot_reco.tbl %>% 314 | filter(!is.na(values)) %>% 315 | filter(values>45 & values<=50) %>% 316 | mutate(upper=PC2 > 1) %>% 317 | mutate(right=PC1 > 0) %>% 318 | group_by(upper, right) %>% 319 | sample_n(1) %>% 320 | ungroup() %>% 321 | mutate(label=c("D", "C", "A", "B")) 322 | 323 | 324 | ### add labels to heatmap 325 | pca.heatmap2 <- pca.heatmap + 326 | geom_point(data=ABC, aes(x=PC1, y=PC2), pch=21, size=2, col="black", fill="white") + 327 | ggrepel::geom_label_repel(data=ABC, aes(x=PC1, y=PC2, label=label), size = 2.5, fill = alpha(c("white"),0.7)) + 328 | theme(legend.position = "left") 329 | ``` 330 | 331 | For each of the selected grid cells, get all plots belonging to that cell, and show their geographical distribution. 332 | ```{r} 333 | inset.list <- list() 334 | rangex <- max(PC1_PC2.tbl$PC1) - min(PC1_PC2.tbl$PC1) 335 | rangey <- max(PC1_PC2.tbl$PC2) - min(PC1_PC2.tbl$PC2) 336 | for(i in c("A", "B", "C", "D")){ 337 | tmp.ABC <- ABC %>% 338 | filter(label==i) 339 | xmin <- tmp.ABC$PC1 - 0.5*rangex/100 340 | xmax <- tmp.ABC$PC1 + 0.5*rangex/100 341 | ymin <- tmp.ABC$PC2 - 0.5*rangey/100 342 | ymax <- tmp.ABC$PC2 + 0.5*rangey/100 343 | coords.ABC <- plot_data %>% 344 | mutate(sel=(PC1 > xmin & PC1 < xmax & 345 | PC2 > ymin & PC2 < ymax)) %>% 346 | filter(sel) %>% 347 | dplyr::select(Longitude, Latitude) %>% 348 | rowwise() %>% 349 | mutate_at(.vars=vars(Longitude, Latitude), 350 | .funs=~jitter(.)) %>% 351 | ungroup() %>% 352 | SpatialPoints(proj4string = CRS("+init=epsg:4326")) %>% 353 | st_as_sf() %>% 354 | st_transform(crs = "+proj=eck4") 355 | inset.list[[i]] <- w4a + 356 | geom_sf(data=coords.ABC, col=2, pch="+", size=2) 357 | } 358 | ``` 359 | 360 | Figure 2 - Top 361 | 362 | ```{r} 363 | splot.world2.eckert <- SpatialPointsDataFrame(coords=splot.world2 %>% 364 | dplyr::select(RAST_X, RAST_Y), 365 | data = splot.world2 %>% 366 | dplyr::select(RAST_ID), 367 | proj4string = CRS("+init=epsg:4326")) %>% 368 | spTransform(CRS("+proj=eck4")) 369 | 370 | splot.world2.eckert <- data.frame(splot.world2.eckert@coords, 371 | splot.world2.eckert@data) 372 | 373 | PCA_tbl <- as_tibble(splot.world2.eckert) %>% 374 | dplyr::select(RAST_ID, RAST_X, RAST_Y) %>% 375 | left_join(as_tibble(as.data.frame(pca3$x[,1:2]) %>% 376 | rownames_to_column(var="RAST_ID")) %>% 377 | mutate(RAST_ID=as.integer(RAST_ID)), 378 | by="RAST_ID") %>% 379 | mutate(PC0=1:n()) 380 | 381 | ggpc1 <- w4a + 382 | geom_tile(data=PCA_tbl %>% 383 | mutate(PC1=ifelse(PC1> 6, 6, PC1)) %>% 384 | mutate(PC1=ifelse(PC1< -6, -6, PC1)), 385 | aes(x=RAST_X,y=RAST_Y, fill=PC1, color=PC1)) + 386 | geom_sf(data = bb, col = "grey20", fill = NA) + 387 | scale_fill_distiller("PC1", type = "seq", palette = "Spectral", 388 | direction=-1, limits = c(-6.1,6.1), breaks=seq(-6,6, by=3), labels=c("<-6", -3, 0, 3,">6" )) + 389 | scale_color_distiller("PC1", type = "seq", palette = "Spectral", 390 | direction=-1, limits = c(-6.1,6.1), breaks=seq(-6,6, by=3), labels=c("<-6", -3, 0, 3,">6" )) + 391 | theme_bw() + 392 | theme(panel.grid.major = element_blank(), 393 | panel.grid.minor = element_blank(), 394 | axis.text = element_blank(), 395 | panel.border = element_blank(), 396 | axis.title = element_blank()) 397 | 398 | 399 | 400 | ggpc2 <- w4a + 401 | geom_tile(data=PCA_tbl %>% 402 | mutate(PC2=ifelse(PC2> 9, 9, PC2)) %>% 403 | mutate(PC2=ifelse(PC2< -6, -6, PC2)), 404 | aes(x=RAST_X,y=RAST_Y, fill=PC2, color=PC2)) + 405 | geom_sf(data = bb, col = "grey20", fill = NA) + 406 | 
scale_fill_distiller("PC2", type = "seq", palette = "Spectral", 407 | direction=+1, limits = c(-6.1,9.1), breaks=seq(-6,9, length.out = 6), 408 | labels=c("<-6", -3, 0, 3, 6,">9" )) + 409 | scale_color_distiller("PC2", type = "seq", palette = "Spectral", 410 | direction=+1, limits = c(-6.1,9.1), breaks=seq(-6,9, length.out = 6), 411 | labels=c("<-6", -3, 0, 3, 6,">9" )) + 412 | theme_bw() + 413 | theme(panel.grid.major = element_blank(), 414 | panel.grid.minor = element_blank(), 415 | axis.text = element_blank(), 416 | panel.border = element_blank(), 417 | axis.title = element_blank()) 418 | ``` 419 | 420 | 421 | Figure 2 - Panel 422 | ```{r, fig.width=8, fig.height=5.6, fig.align="center", cache=T, cache.lazy = FALSE} 423 | library(cowplot) 424 | varexpl <- round(pca3$sdev^2/sum(pca3$sdev^2)*100,1) 425 | panel.out1 <- 426 | plot_grid(plot_grid(NULL, 427 | ggpc1 + theme(plot.margin=margin(r=.5, unit="cm")), 428 | ggpc2 + theme(plot.margin=margin(r=.5, unit="cm")), 429 | nrow=1, rel_widths = c(0.13,1,1)), 430 | plot_grid( 431 | pca.heatmap2 + 432 | theme(legend.position=c(0.13, 0.68), 433 | legend.background = element_blank()) + 434 | xlab(paste("PC1 (", varexpl[1], "%) - cold/seasonal to hot/stable", sep="")) + 435 | ylab(paste("PC2 (", varexpl[2], "%) - dry to wet", sep="")) + 436 | xlim(c(-15,15)) + ylim(c(-5,22)), 437 | plot_grid(plot_grid(inset.list[[1]], inset.list[[4]], NULL, 438 | nrow=3, labels = c("A", "D", ""), rel_heights = c(1,1,0.1)), 439 | plot_grid(inset.list[[2]], inset.list[[3]], NULL, 440 | nrow=3, labels = c("B", "C", ""), rel_heights = c(1,1,0.1)))), 441 | nrow=2, rel_heights = c(1,1.5), align="v") 442 | 443 | 444 | ggsave(filename = "_output/figure2.tiff", width=8, height=5.6, units = "in", dpi=300, plot=panel.out1) 445 | 446 | panel.out1 447 | ``` 448 | 449 | 450 | ## Figure 3 - Whittaker Biome Graph 451 | Get climatic data 452 | ```{r} 453 | load("/data/sPlot/releases/sPlot2.1/sPlot_header_chelsa_20161124.RData") 454 | climate.oa <- climate %>% 455 | filter(PlotID %in% (header.oa %>% 456 | filter(Resample_1) %>% 457 | pull(PlotObservationID))) %>% 458 | dplyr::select(-POINT_X, -POINT_Y) %>% 459 | rename(PlotObservationID=PlotID) 460 | ``` 461 | 462 | Figure 3 - Left 463 | Create plot of Schultz' biomes 464 | ```{r} 465 | biome.order <- c('Polar and subpolar zone' ,'Alpine' ,'Boreal zone' ,'Temperate midlatitudes' , 466 | 'Dry midlatitudes' ,'Dry tropics and subtropics' ,'Subtropics with year-round rain' , 467 | 'Subtropics with winter rain' ,'Tropics with summer rain' ,'Tropics with year-round rain') 468 | biome.labs <- c('Polar & subpolar' ,'Alpine' ,'Boreal zone' ,'Temperate midlatitudes' , 469 | 'Dry midlatitudes' ,'Dry tropics & subtropics' ,'Subtropics - year-round\n rain' , 470 | 'Subtropics - winter\n rain' ,'Tropics - summer rain' ,'Tropics - year-round\n rain') 471 | 472 | 473 | mypalette <- palette(c('#CAB2D6','#6A3D9A', #violets 474 | '#A6CEE3','#1F78B4', #blues 475 | '#FDBF6F','#FF7F00', #orange 476 | '#B2DF8A','#33A02C', #greens 477 | '#FB9A99','#E31A1C' #reds 478 | )) 479 | 480 | # Plot of Temp vs Prec + sBiomes 481 | biome.schu <- ggplot() + 482 | theme_classic() + 483 | #geom_path(aes(x, y), data=contour_95) + 484 | geom_point(data=climate.oa %>% 485 | left_join(header.oa %>% 486 | dplyr::filter(Resample_1) %>% 487 | dplyr::select(PlotObservationID, Biome), 488 | by="PlotObservationID") %>% 489 | filter(bio12<4500 & bio01>-11) %>% 490 | mutate(Biome=factor(Biome, levels=biome.order, labels=biome.labs)) 491 | # mutate(alpha=ifelse(Biome=="Tropics with 
summer rain", 1, 1/3)) 492 | , #filter out for plotting reasons 493 | aes(x=bio01, y=bio12, col=Biome), 494 | alpha=1/3, 495 | cex=1/15) + 496 | xlab("Temperature (°C)") + 497 | ylab("Precipitation (mm)") + 498 | scale_color_manual(values = c('#CAB2D6','#6A3D9A', #violets 499 | '#A6CEE3','#1F78B4', #blues 500 | '#FDBF6F','#FF7F00', #orange 501 | '#B2DF8A','#33A02C', #greens 502 | '#FB9A99','#E31A1C' #reds 503 | ), name="sBiomes") + 504 | guides(color = guide_legend(override.aes = list(size=5, shape=15, alpha=1))) + 505 | theme(#plot.margin=margin(r=-0.1, unit="cm"), 506 | axis.title = element_text(size=9), 507 | axis.text = element_text(size=9), 508 | legend.text = element_text(size=8), 509 | legend.title = element_text(size=9)) 510 | ``` 511 | 512 | Figure 3 - Right 513 | Create Whittaker plot 514 | ```{r} 515 | whitt.biome <- whittaker_base_plot() + 516 | theme_classic() + 517 | geom_point(data=climate.oa %>% 518 | filter(bio12<4500 & bio01>-11), #filter out for plotting reasons 519 | aes(x=bio01, y=bio12/10), 520 | alpha=1/4, 521 | cex=1/20) + 522 | theme(axis.text.y = element_blank(), 523 | axis.text = element_text(size=9), 524 | axis.title.y = element_blank(), 525 | axis.title = element_text(size=9), 526 | plot.margin=margin(l=-0.2, r=-0.1, unit="cm"), 527 | legend.text = element_text(size=8), 528 | legend.title = element_text(size=9)) 529 | ``` 530 | 531 | Figure 3 - Panel 532 | ```{r, fig.width=8, fig.height=4, fig.align="center", message=F, cache=T} 533 | #Make a panel with the two plots 534 | panel.biomes <- cowplot::plot_grid( biome.schu, whitt.biome, nrow=1, 535 | align = "h", rel_widths = c(0.5, .5)) 536 | ggsave(filename="_output/figure3.tiff", plot = panel.biomes, width = 10, height=4, units="in", dpi=300) 537 | panel.biomes 538 | ``` 539 | 540 | # Supplementary figures 541 | 542 | 543 | 544 | ## Figure S1 - Biplot 545 | ```{r, fig.width=5.7, fig.height=7, fig.align="center", warning=F} 546 | library(ggrepel) 547 | varexpl <- round(pca3$sdev^2/sum(pca3$sdev^2)*100,1) 548 | 549 | mydata <- pca3$x %>% 550 | as_tibble() %>% 551 | dplyr::select(1:2) %>% 552 | mutate(count=1) 553 | myarrows <- pca3$rotation[,1:2] %>% 554 | as.data.frame() %>% 555 | rownames_to_column("mylab") %>% 556 | mutate_at(.vars=vars(-mylab), .funs = list(~.*25)) %>% 557 | filter(!mylab %in% c("T_ANN", "P_ANN")) 558 | 559 | 560 | ggpca3 <- ggplot(data=mydata) + 561 | stat_summary_2d( 562 | aes(x = PC1, y = PC2, z = count), 563 | bins = 100, 564 | fun = function(x) {log10(sum(x)+1)}, 565 | alpha=1/2 566 | ) + 567 | geom_segment(data=myarrows, 568 | aes(x=0, xend=PC1, y=0, yend=PC2), 569 | arrow = arrow(length = unit(0.08, "inches")), alpha=0.8) + 570 | geom_label_repel(data=myarrows, 571 | aes(x=PC1, y=PC2, label=mylab), size=2, 572 | position = position_dodge(2),segment.alpha=0.5, segment.colour=gray(0.8)) + 573 | scale_fill_viridis("Number of 2.5 arcmin\n terrestrial grid cells", option = "magma", alpha=1/2, 574 | limits=c(0,5), breaks=log10(c(9,99,999,9999)), labels=10^(1:4)) + 575 | theme_bw() + 576 | coord_equal() + 577 | theme(legend.position = c(0.2, 0.85), 578 | panel.grid.major = element_blank(), 579 | panel.grid.minor = element_blank()) + 580 | #guides(fill = guide_legend(override.aes = list(alpha=1/3))) + 581 | xlab(paste("PC1 (", varexpl[1], "%)\n (cold and seasonal to hot and stable)", sep="")) + 582 | ylab(paste("PC2 (", varexpl[2], "%)\n (dry to wet)", sep="")) + 583 | ylim(c(-6,22)) 584 | 585 | ggsave("_output/figureS1.pdf", plot = ggpca3, width=5.7, height=7, unit="in", dpi=300) 586 | ggpca3 
587 | ``` 588 | 589 | 590 | # Tables 591 | ## Table 1 - Database level information 592 | Import databases and create reference tags 593 | ```{r, message=F, warning=F} 594 | #load(file.path("_sPlotOpenDB", "sPlotOpen.RData")) 595 | #Import BibTex 596 | bib.db <- bib2df("/data/sPlot/users/Francesco/_sPlot_Management/Consortium/sPlot_References.bib") 597 | #Import database-level information 598 | databases <- read_csv("/data/sPlot/users/Francesco/_sPlot_Management/Consortium/Databases.out.csv") 599 | 600 | # create citation tags that can be picked up by Manubot 601 | databases <- databases %>% 602 | left_join(bib.db %>% 603 | dplyr::select(BIBTEXKEY, DOI, URL), 604 | by="BIBTEXKEY") %>% 605 | mutate(tag=NA) %>% 606 | rowwise() %>% 607 | mutate(tag=ifelse(!is.na(DOI), 608 | paste0("@doi:", DOI), 609 | tag)) %>% 610 | mutate(tag=ifelse( (is.na(tag) & `GIVD ID` %in% unique(header.oa$GIVD_ID) & !is.na(Citation)), 611 | paste0("@", word(Citation, 1)), 612 | tag)) %>% 613 | dplyr::select(-DOI, -URL, -BIBTEXKEY) 614 | 615 | 616 | ``` 617 | 618 | Create Table 1 619 | 620 | ```{r} 621 | table1 <- databases %>% 622 | filter(`Still in sPlot`==T, 623 | Via!="Aggregator") %>% 624 | dplyr::select(-Via, -`Still in sPlot`, -label) %>% 625 | distinct() %>% 626 | left_join(header.oa %>% 627 | group_by(GIVD_ID) %>% 628 | summarize(contributed_plots=n(), .groups = 'drop'), 629 | by=c("GIVD ID"="GIVD_ID")) %>% 630 | filter(!is.na(contributed_plots)) %>% 631 | replace_na(list(tag="", 632 | `Deputy custodian`="")) %>% 633 | dplyr::select(`GIVD ID`, `Dataset name`=`DB_name GIVD`, Custodian, `Deputy custodian`, `Nr. open-access plots` = contributed_plots, Ref=tag) %>% 634 | arrange(`GIVD ID`) %>% 635 | # replace citation rendered wrongly using doi 636 | mutate(Ref=replace(Ref, 637 | list=`GIVD ID`=="AU-AU-002", 638 | values="@isbn:9781315368252")) 639 | 640 | 641 | write_csv(table1, "_output/Table1_Databases.csv") 642 | ``` 643 | 644 | ```{r, echo=F} 645 | knitr::kable(table1%>% 646 | slice(1:20), 647 | caption="Table 1 - Database level information [only first 20 rows shown]") %>% 648 | kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), 649 | full_width = F, position = "center") 650 | ``` 651 | 652 | 653 | ## Table 2 - Metadata and environmental data included in header 654 | Define unit of measurements for columns 655 | ```{r} 656 | um <- c('Latitude'='° (WGS84)', 657 | 'Longitude'='° (WGS84)', 658 | 'Location_uncertainty'='m', 659 | 'Releve_area'='m^2^', 660 | 'Elevation'='m a.s.l.', 661 | 'Aspect'='°', 662 | 'Slope'='°', 663 | 'Date_of_recording' = 'dd-mm-yyyy', 664 | 'Cover_total'='%', 665 | 'Cover_tree_layer'='%', 666 | 'Cover_shrub_layer'='%', 667 | 'Cover_herb_layer'='%', 668 | 'Cover_moss_layer'='%', 669 | 'Cover_lichen_layer'='%', 670 | 'Cover_algae_layer'='%', 671 | 'Cover_litter_layer'='%', 672 | 'Cover_bare_rocks'='%', 673 | 'Cover_cryptogams'='%', 674 | 'Cover_bare_soil'='%', 675 | 'Height_trees_highest'='m', 676 | 'Height_trees_lowest'='m', 677 | 'Height_shrubs_highest'='m', 678 | 'Height_shrubs_lowest'='m', 679 | 'Height_herbs_average'='cm', 680 | 'Height_herbs_lowest'='cm', 681 | 'Height_herbs_highest' = 'cm') 682 | um <- data.frame(Variable=names(um), `Unit of Measurement`=um) 683 | 684 | ``` 685 | 686 | Create table 2 687 | ```{r} 688 | table2 <- header.oa %>% 689 | dplyr::summarize_at(.vars=vars(!starts_with("PlotObservationID")), 690 | .funs = list(xxxNo.records=~sum(!is.na(.)), 691 | xxxType.of.variable=~ifelse("logical" %in% class(.), "b", 692 | ifelse("ordered" 
%in% class(.), 693 | "o", 694 | ifelse(any(class(.) %in% c("character", "factor")), 695 | "n", 696 | ifelse(class(.)=="Date", 697 | "d", 698 | "q")))), 699 | xxxLevels=~(ifelse(is.numeric(.)|lubridate::is.Date(.), 700 | paste(range(., na.rm=T), collapse=" - "), 701 | ifelse(is.ordered(.), 702 | paste(paste(1:nlevels(.), 703 | levels(.), sep=" = "), collapse=", "), 704 | ifelse(is.factor(.), 705 | paste(levels(.), collapse=", "), 706 | ifelse(is.logical(.), 707 | paste(names(table(.)), "=",table(.), 708 | collapse="; "), 709 | ""))))))) %>% 710 | gather(key="Variable") %>% 711 | separate(Variable, into = c("Variable", "feature"), sep="_xxx") %>% 712 | spread(key=feature, value = value) %>% 713 | rename(`Range/Levels`=Levels) %>% 714 | mutate(Variable=factor(Variable, levels=colnames(header.oa))) %>% 715 | arrange(Variable) %>% 716 | left_join(um, by="Variable") %>% 717 | mutate(Unit.of.Measurement=as.character(Unit.of.Measurement)) %>% 718 | replace_na(list(Unit.of.Measurement="")) %>% 719 | dplyr::select(Variable, `Range/Levels`, 720 | `Unit of Measurement`=Unit.of.Measurement, 721 | `Nr. of plots with information`=No.records, `Type`=Type.of.variable) 722 | 723 | write_csv(table2, "_output/Table2_header.csv") 724 | ``` 725 | 726 | ```{r, echo=F} 727 | knitr::kable(table2, 728 | caption="Table 2 - Variables in header") %>% 729 | kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), 730 | full_width = F, position = "center") 731 | ``` 732 | 733 | 734 | ## Sink tables for Manubot 735 | ```{r} 736 | out.file <- "_output/91.ManubotTables.md" 737 | readr::write_lines("## Supplementary Material {.page_break_before}\n", file = out.file) 738 | readr::write_lines("Table: List of databases contributing to sPlotOpen, the environmentally-balanced, open-access, global dataset of vegetation plots. Databases are ordered based on their ID in the Global Index of Vegetation Databases (GVID ID). {#tbl:Table1 tag='1'}\n", file = out.file, append=T) 739 | readr::write_lines("\n \n", file = out.file, append=T) 740 | kable1 <- kable(table1, format = "markdown") 741 | ## fix header table 1 742 | kable1[2] <- "|:------------|:--------------------------------------------|:--------------------|:--------------------|--------:|:--------|" 743 | readr::write_lines(kable1, file = out.file, append=T) 744 | readr::write_lines("\n \n \n", file = out.file, append=T) 745 | readr::write_lines("Table: Description of the variables contained in the ‘header’ matrix, together with their range (if numeric) or possible levels (if nominal or binary) and the number of non-empty (i.e., non NA) records. Variable types can be n - nominal (i.e., qualitative variable), o - ordinal, q - quantitative, or b - binary (i.e., boolean), or d - date {#tbl:Table2 tag='2'}. Additional details on the variables are in Bruelheide et al. (2019) [@doi:10.1111/jvs.12710]. GIVD codes derive from Dengler et al. (2011) [@doi:10.1111/j.1654-1103.2011.01265.x]. Biomes refer to Schultz 2005 [@doi:10.1007/3-540-28527-x], modified to include also the world mountain regions by Körner et al. (2017)[@doi:10.1007/s00035-016-0182-6]. The column ESY refers to the EUNIS Habitat Classification Expert system described in Chytrý et al. 
(2020) [@doi:10.1111/avsc.12519].\n", file = out.file, append=T) 746 | kable2 <- kable(table2, format = "markdown") 747 | ## fix header table 2 748 | kable2[2] <- "|:---------------------------|:-------------------------------------------------------------|:-------------------|:-----------|:-----|" 749 | readr::write_lines(kable2, file = out.file, append=T) 750 | ``` 751 | 752 | 753 | # SessionInfo 754 | ```{r} 755 | sessionInfo() 756 | ``` 757 | 758 | -------------------------------------------------------------------------------- /05_Demo.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Appendix 1 - sPlotOpen - Demo" 3 | author: "Francesco Maria Sabatini, Jonathan Lenoir, Helge Bruelheide" 4 | date: "07/04/2021" 5 | output: html_document 6 | --- 7 | 8 | Appendix to the paper: Sabatini, Lenoir et al., sPlotOpen – An environmentally-balanced, open-access, global dataset of vegetation plots. *Global Ecology and Biogeography*. 9 |
10 | 11 | This demo illustrates how to import and manipulate sPlotOpen data to create some basic graphics or tables, together with a reference list. As a worked example, the code below will: 12 | 13 | 1. select all plots containing at least one *Quercus* species from sPlotOpen's resampled iteration #1 14 | 2. show some summaries at the biome level 15 | 3. graph the distribution of the community weighted mean of a selected functional trait 16 | 4. show the geographical location of all selected plots 17 | 5. create a reference list based on the plots actually selected. 18 | 19 |
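As a side note before running the demo: besides the bundled `.RData` loaded below, the same objects are also exported as tab-delimited text files (`sPlotOpen_header.txt`, `sPlotOpen_DT.txt`, `sPlotOpen_CWM_CWV.txt`; cf. `998_fixSplotOpen.R`). A minimal sketch of an equivalent plain-text import is shown here; the `_sPlotOpenDB/` folder name is an assumption, so adjust the path to your local copy:

```{r, eval=F}
# Hypothetical alternative to load("_sPlotOpenDB/sPlotOpen.RData"):
# read the tab-delimited exports instead (eval=F, so this chunk is skipped when knitting)
library(readr)
header.oa  <- read_delim("_sPlotOpenDB/sPlotOpen_header.txt",  delim = "\t")
DT2.oa     <- read_delim("_sPlotOpenDB/sPlotOpen_DT.txt",      delim = "\t")
CWM_CWV.oa <- read_delim("_sPlotOpenDB/sPlotOpen_CWM_CWV.txt", delim = "\t")
```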
20 | 21 | ```{r, message=F, warning=F} 22 | #load libraries 23 | library(tidyverse) 24 | library(sf) 25 | library(raster) 26 | library(rnaturalearth) 27 | library(RefManageR) 28 | ``` 29 | 30 | ## Import data 31 | ```{r} 32 | load("_sPlotOpenDB/sPlotOpen.RData") 33 | ls() 34 | ``` 35 | ## Extract all plots containing at least a *Quercus* species 36 | Use only the first resampled iteration of sPlotOpen 37 | ```{r} 38 | #select only the first resample 39 | header.oa1 <- header.oa %>% 40 | filter(Resample_1 == T) 41 | DT2.oa1 <- DT2.oa %>% 42 | filter(PlotObservationID %in% header.oa1$PlotObservationID) 43 | CWM_CWV.oa1 <- CWM_CWV.oa %>% 44 | filter(PlotObservationID %in% header.oa1$PlotObservationID) 45 | ``` 46 | 47 | ```{r} 48 | #get all plots containing at least one Quercus species 49 | plotlist.quercus <- DT2.oa1 %>% 50 | filter(str_detect(Species, "^Quercus")) %>% 51 | distinct(PlotObservationID) %>% 52 | pull(PlotObservationID) 53 | 54 | header.quercus <- header.oa1 %>% 55 | filter(PlotObservationID %in% plotlist.quercus & 56 | Resample_1 == T) 57 | 58 | DT2.quercus <- DT2.oa1 %>% 59 | filter(PlotObservationID %in% plotlist.quercus) 60 | 61 | CWM_CWV.quercus <- CWM_CWV.oa1 %>% 62 | mutate(Quercus=ifelse(PlotObservationID %in% plotlist.quercus, T, F)) 63 | 64 | ``` 65 | There are `r length(plotlist.quercus)` plots containing at least a *Quercus* species in sPlotOpen's resampled iteration 1. 66 |
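To put this count in perspective, it can be expressed as a share of all plots in resampling iteration 1. This is a minimal sketch that only uses the objects created above:

```{r}
# percentage of iteration-1 plots that contain at least one Quercus species
round(length(plotlist.quercus) / nrow(header.oa1) * 100, 1)
```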
67 | 68 | ## Number of plots with *Quercus* across biomes 69 | Summarize the number of plots containing at least one *Quercus* species across biomes. 70 | 71 | ```{r} 72 | header.quercus %>% 73 | group_by(Biome) %>% 74 | summarize(n = n()) 75 | ``` 76 | \pagebreak 77 | 78 | ## Compare Community Weighted Means 79 | Compare the distribution of the community weighted mean of stem density between plots containing and not containing a *Quercus* species. 80 | 81 | ```{r, warning=F} 82 | ggplot(data = CWM_CWV.quercus) + 83 | geom_density(aes(x = StemDens_CWM, fill = Quercus), col = NA, alpha = 0.5) + 84 | theme_bw() 85 | ``` 86 | \pagebreak 87 | 88 | ## Geographical distribution of plots containing a *Quercus* species 89 |
90 | Download some spatial data of the world and create a template map using the r package `rnaturalearth`, first. Transform all geographical data to Eckert IV projection. 91 | ```{r, message=F, warning=F, results="hide"} 92 | countries <- ne_countries(returnclass = "sf") %>% 93 | st_transform(crs = "+proj=eck4") %>% 94 | st_geometry() 95 | graticules <- ne_download(type = "graticules_15", category = "physical", 96 | returnclass = "sf") %>% 97 | st_transform(crs = "+proj=eck4") %>% 98 | st_geometry() 99 | bb <- ne_download(type = "wgs84_bounding_box", category = "physical", 100 | returnclass = "sf") %>% 101 | st_transform(crs = "+proj=eck4") %>% 102 | st_geometry() 103 | ``` 104 | 105 | Template of Global map - with country borders 106 | ```{r, results="hide", warning=F, message=F} 107 | w3a <- ggplot() + 108 | geom_sf(data = bb, col = "grey20", fill = "white") + 109 | geom_sf(data = graticules, col = "grey20", lwd = 0.1) + 110 | geom_sf(data = countries, fill = "grey90", col = NA, lwd = 0.3) + 111 | coord_sf(crs = "+proj=eck4") + 112 | theme_minimal() + 113 | theme(axis.text = element_blank(), 114 | legend.title = element_text(size=12), 115 | legend.text = element_text(size=12), 116 | legend.background = element_rect(size = 0.1, linetype = "solid", colour = 1), 117 | legend.key.height = unit(1.1, "cm"), 118 | legend.key.width = unit(1.1, "cm")) 119 | ``` 120 | Project selected plots to Eckert IV and transform them to sf, before plotting. 121 | ```{r, results="hide", warning=F, message=F} 122 | header.quercus.sf <- SpatialPointsDataFrame(coords = header.quercus %>% 123 | dplyr::select(Longitude, Latitude), 124 | proj4string = CRS("+init=epsg:4326"), 125 | data=header.quercus %>% 126 | dplyr::select(-Longitude, -Latitude)) %>% 127 | st_as_sf() %>% 128 | st_transform(crs = "+proj=eck4") 129 | ``` 130 | 131 | Show all plots containing at least one *Quercus* species. Color code based on biomes. 
132 | ```{r, fig.width=8, fig.height=9, fig.align="center", warning=F, message=F} 133 | (Figure1a <- w3a + 134 | geom_sf(data = header.quercus.sf, aes(color = Biome), 135 | pch = 16, size = 0.8, alpha = 0.8) + 136 | geom_sf(data = countries, col = "grey20", fill=NA, lwd = 0.3) + 137 | theme(legend.position = "bottom", 138 | legend.title = element_blank()) + 139 | guides(color = guide_legend(ncol = 2, 140 | override.aes = list(size = 2)))) 141 | ``` 142 | \pagebreak 143 | 144 | # Create a reference list for selected plots 145 | Create reference list as BibText 146 | ```{r} 147 | sPlotOpen_citation(IDs=plotlist.quercus, level = "database", 148 | out.file = "_output/demo.bib") 149 | # show first few lines of output file 150 | read_lines("_output/demo.bib", n_max = 25) 151 | ``` 152 | 153 | Convert to reference list 154 | ```{r, warning=F} 155 | mybib <- RefManageR::ReadBib("_output/demo.bib", check = FALSE) 156 | mybib 157 | ``` 158 | 159 | ## sessionInfo() 160 | ```{r} 161 | sessionInfo() 162 | ``` 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | -------------------------------------------------------------------------------- /998_fixSplotOpen.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | 3 | ## Load sPlotOpen data 4 | load("~/share/groups/sPlot/releases/_sPlotOpenDB/v10/sPlotOpen.RData") 5 | 6 | # Fix DT2 and CWM based on Issue #7 7 | # https://github.com/fmsabatini/sPlotOpen_Code/issues/7 8 | 9 | ## Identify SALVIAS plots in sPlotOpen to be replaced 10 | tofix <- header.oa %>% 11 | filter(Dataset=="Salvias") %>% 12 | dplyr::select(PlotObservationID:Location_uncertainty) %>% 13 | left_join(metadata.oa %>% 14 | dplyr::select(PlotObservationID, Original_nr_in_database), 15 | by="PlotObservationID") 16 | 17 | ## Import SALVIAS corrected data 18 | header.salvias <- read_delim("_data/Update_SALVIAS/SALVIAS_correct_header.csv") %>% 19 | dplyr::select(New_PlotObservationID = PlotObservationID, 20 | Original_nr_in_database = `Original nr in database`) 21 | 22 | ## attach New Plot ID to tofix object 23 | tofix <- tofix %>% 24 | dplyr::select(PlotObservationID, Original_nr_in_database) %>% 25 | left_join(header.salvias, by="Original_nr_in_database") 26 | 27 | ## Import correct salvias DT table, and match new IDs with old IDs 28 | DT2.salvias0 <- read_delim("_data/Update_SALVIAS/SALVIAS_correct_species.csv") %>% 29 | rename(New_PlotObservationID = PlotObservationID) %>% 30 | left_join(tofix, by="New_PlotObservationID") %>% 31 | filter(!is.na(PlotObservationID)) 32 | 33 | ## double check species richness (they should be almost identical) 34 | 35 | checking <- DT2.oa %>% 36 | filter(PlotObservationID %in% tofix$PlotObservationID) %>% 37 | count(PlotObservationID) %>% 38 | left_join(DT2.salvias0 %>% 39 | count(PlotObservationID) %>% 40 | rename(n_new=n), 41 | by="PlotObservationID") 42 | plot(checking[,2:3]) 43 | ## There are minor mismatches, probably due to taxonomic standardization 44 | ## The most serious seems plots 56497 NADUGANI - Unclear why 45 | 46 | 47 | 48 | ### Taxonomic standardization 49 | # ## First Try - backengineer Backbone from sPlotOpen itself 50 | # bb.sPlotOpen <- DT2.oa %>% 51 | # filter(PlotObservationID %in% tofix$PlotObservationID) %>% 52 | # distinct(Species, Original_species) 53 | # # Check wheter all taxa from DT2.salvias can be resolved this way 54 | # DT2.salvias0 %>% 55 | # distinct(`Species name`) %>% 56 | # left_join(bb.sPlotOpen, 57 | # by=c("Species name" = "Original_species")) %>% 58 | # 
filter(is.na(Species)) 59 | # ## Too many unmatched entries --> 22,765 60 | 61 | ## Second Try - Load Backbone from sPlot 3 62 | #load("~/share/groups/sPlot/releases/sPlot2.1/backbone.splot2.1.try3.is.vascular.Rdata") 63 | load("~/share/groups/sPlot/releases/sPlot3.0/Backbone3.0.RData") 64 | 65 | 66 | DT2.salvias <- DT2.salvias0 %>% 67 | # some string cleaning 68 | mutate(Original_species=str_remove(`Species name`, pattern = " species$")) %>% 69 | mutate(Original_species=str_remove(Original_species, " \\[CM$| \\[Derris$| subf\\.$| RS$| SP\\:$| M$| NPZ$| LC$")) %>% 70 | # Try matching both with original species name, and with the clean version of the same string from the Backbone 71 | left_join(Backbone %>% 72 | dplyr::select(Name_sPlot_TRY, Name_short) %>% 73 | distinct(), 74 | by=c("Original_species" = "Name_sPlot_TRY")) %>% 75 | left_join(Backbone %>% 76 | dplyr::select(Name_submitted, Name_short2 = Name_short) %>% 77 | distinct(), 78 | by=c("Original_species" = "Name_submitted")) %>% 79 | mutate(Name_short=coalesce(Name_short, Name_short2)) %>% 80 | # replace all entries returning 'No suitable match' as NAs 81 | mutate(Name_short = replace(Name_short, 82 | list=str_detect(Name_short, "suitable"), 83 | values=NA)) %>% 84 | #distinct(Original_species, Name_short) %>% 85 | #filter(is.na(Name_short)) %>% 86 | #View() 87 | mutate(Relative_cover = NA) %>% 88 | dplyr::select(PlotObservationID, 89 | Species = Name_short, 90 | Original_species = `Species name`, 91 | #Layer, 92 | ## The column names of stem count and the TV2 species number seem reversed! 93 | Original_abundance = `TV2 species nr`, 94 | Abundance_scale = `Cover code`, 95 | Relative_cover, 96 | New_PlotObservationID) 97 | dim(DT2.salvias) 98 | 99 | DT2.salvias %>% 100 | distinct(Species, Original_species) %>% 101 | filter(is.na(Species)) %>% 102 | nrow() 103 | ## Using the backbone from sPlot3.0 only 31 taxa remain unmatched for a total of 931 species x plot obs 104 | 105 | ### Check for non-vascular plants 106 | #Complement taxon group using info from sPlot 3.0 107 | ## Assign genera to taxon group 108 | taxon.groups <- DT2.salvias %>% 109 | as_tibble() %>% 110 | distinct(Species) %>% 111 | mutate(species2=Species) %>% 112 | separate(species2, into=c("Genus"), sep=" ") %>% 113 | distinct(Species, Genus) %>% 114 | left_join(Backbone %>% 115 | distinct(Name_short, `Taxon group`) %>% 116 | rename(species=Name_short, 117 | Taxon.group=`Taxon group`) %>% 118 | separate(species, into=c("Genus"), sep=" ") %>% 119 | mutate(Taxon.group=as.character(Taxon.group)) %>% 120 | mutate(Taxon.group=ifelse(Taxon.group=="Unknown", NA, Taxon.group)) %>% 121 | distinct(Genus, .keep_all=T), 122 | by="Genus") %>% 123 | distinct(Species, .keep_all = T) %>% 124 | dplyr::select(-Genus) 125 | #27 Nas out of 7222 126 | 127 | DT2.salvias <- DT2.salvias %>% 128 | # attach taxon group info from Backbone 3.0 129 | left_join(taxon.groups, by="Species") %>% 130 | # exclue non vascular plants 131 | filter(is.na(Taxon.group) | 132 | !Taxon.group %in% c("Alga", "Lichen", "Moss")) %>% 133 | # calculate relative cover 134 | left_join({.} %>% 135 | group_by(PlotObservationID) %>% 136 | summarize(tot.cover=sum(Original_abundance), .groups = 'drop'), 137 | by=c("PlotObservationID")) %>% 138 | mutate(Relative_cover=Original_abundance/tot.cover) %>% 139 | dplyr::select(-tot.cover, -Taxon.group) 140 | ## Only one moss taxon, with three entries. 
Excluded 141 | dim(DT2.salvias) 142 | 143 | 144 | 145 | ### Delete wrong entries from DT2.oa and replace with correct salvias data 146 | DT2.oa.out <- DT2.oa %>% 147 | filter(!PlotObservationID %in% tofix$PlotObservationID) %>% 148 | bind_rows(DT2.salvias %>% 149 | dplyr::select(all_of(colnames(DT2.oa)))) %>% 150 | arrange(PlotObservationID, Species) 151 | 152 | dim(DT2.oa) 153 | dim(DT2.oa.out) 154 | 155 | 156 | ### Check what the 2 thousand rows were before 157 | #species in old version (with wrong SALVIA plots) but not in the corrected version 158 | setdiff(DT2.oa %>% distinct(Species) %>% arrange(Species) %>% pull(), 159 | DT2.oa.out %>% distinct(Species) %>% arrange(Species) %>% pull()) 160 | #504 species were excluded 161 | 162 | #species in the corrected version that were not present in the old version 163 | setdiff(DT2.oa.out %>% distinct(Species) %>% arrange(Species) %>% pull(), 164 | DT2.oa %>% distinct(Species) %>% arrange(Species) %>% pull()) 165 | #352 new species were included 166 | 167 | 168 | #selecting random tropical species for closer inspection 169 | header.oa %>% filter(PlotObservationID %in% 170 | c(DT2.oa %>% filter(Species == 'Bauhinia divaricata') %>% pull(PlotObservationID))) %>% 171 | View() 172 | 173 | 174 | header.oa %>% filter(PlotObservationID %in% 175 | c(DT2.oa.out %>% filter(Species == 'Tabebuia rosea') %>% pull(PlotObservationID))) %>% 176 | View() 177 | 178 | 179 | ### Compute CWM\CWV for DT2.salvias 180 | #Load species level gap-filled trait data 181 | load("~/share/groups/sPlot/releases/sPlot2.0/TRY.all.mean.sd.3.by.genus.species.Rdata") 182 | TRY <- TRY.all.mean.sd.3.by.genus.species 183 | 184 | #Merge species data table with traits, and calculate species coverage for each plot, both based on relative cover, and number of species having trait info. 185 | CWM_CWV.oa0.salvias <- DT2.oa.out %>% 186 | as.tbl() %>% 187 | dplyr::select(PlotObservationID, Species, Relative_cover) %>% 188 | left_join(TRY %>% 189 | dplyr::rename(Species=StandSpeciesName) %>% 190 | dplyr::select(Species, LeafArea.mean:Wood.vessel.length.mean), 191 | by="Species") %>% 192 | rename_at(.vars=vars(ends_with(".mean")), 193 | .funs=~gsub(pattern=".mean", replacement="", x=.)) 194 | 195 | # number of species with trait information. 
196 | CWM_CWV.oa0.salvias %>% 197 | distinct(Species, .keep_all = T) %>% 198 | filter(!is.na(SLA)) %>% 199 | nrow() 200 | 201 | # Calculate coverage for each trait in each plot 202 | CWM_CWV.oa2.salvias <- CWM_CWV.oa0.salvias %>% 203 | mutate_at(.vars = vars(LeafArea), 204 | .funs = list(~if_else(is.na(.),0,1) * Relative_cover)) %>% 205 | group_by(PlotObservationID) %>% 206 | summarize(TraitCoverage_cover=sum(LeafArea, na.rm=T), 207 | Species_richness=n(), 208 | TraitCoverage_pa=mean(LeafArea>0), 209 | .groups = 'drop') 210 | 211 | 212 | #Calculate CWM and CWV for each trait in each plot 213 | # Ancillary function to calculate CWV 214 | variance2.fun <- function(trait, abu){ 215 | res <- as.double(NA) 216 | #nam <- nam[!is.na(trait)] 217 | abu <- abu[!is.na(trait)] 218 | trait <- trait[!is.na(trait)] 219 | abu <- abu/sum(abu) 220 | if (length(trait)>1){ 221 | # you need more than 1 observation to calculate 222 | # skewness and kurtosis 223 | # for calculation see 224 | # http://r.789695.n4.nabble.com/Weighted-skewness-and-curtosis-td4709956.html 225 | m.trait <- weighted.mean(trait,abu) 226 | res <- sum(abu*(trait-m.trait)^2) 227 | } 228 | res 229 | } 230 | 231 | CWM_CWV.oa1.salvias <- CWM_CWV.oa0.salvias %>% 232 | group_by(PlotObservationID) %>% 233 | summarize_at(.vars= vars(LeafArea:Wood.vessel.length), 234 | .funs = list(CWM=~weighted.mean(., Relative_cover, na.rm=T), 235 | CWV=~variance2.fun(., Relative_cover))) 236 | 237 | #Assemble output 238 | CWM_CWV.oa.salvias <- header.oa %>% 239 | dplyr::select(PlotObservationID) %>% 240 | left_join(CWM_CWV.oa2.salvias, by="PlotObservationID") %>% 241 | left_join(CWM_CWV.oa1.salvias, by="PlotObservationID") %>% 242 | #Rename fields to follow convention 243 | rename_all(.funs=~gsub('\\.', '_', x = .)) 244 | 245 | dim(CWM_CWV.oa.salvias) 246 | dim(header.oa) 247 | 248 | ## Check CWM correlations 249 | plot(CWM_CWV.oa$LeafArea_CWM, 250 | CWM_CWV.oa.salvias$LeafArea_CWM, 251 | col = (CWM_CWV.oa %>% 252 | pull(PlotObservationID) %in% tofix$PlotObservationID + 1)) 253 | ## All identical, except SALVIAS data, marked in pink 254 | 255 | 256 | # Fix header.oa based on Issue #6 257 | # https://github.com/fmsabatini/sPlotOpen_Code/issues/6 258 | header.oa <- header.oa %>% 259 | mutate_at(vars(starts_with('Cover')), 260 | .funs=~(ifelse(. > 100, 100, .))) 261 | 262 | 263 | ### Export 264 | DT2.oa <- DT2.oa.out 265 | CWM_CWV.oa <- CWM_CWV.oa.salvias 266 | path <- "_sPlotOpenDBv20" 267 | save(DT2.oa, header.oa, metadata.oa, CWM_CWV.oa, reference.oa, sPlotOpen_citation, file = file.path(path, "sPlotOpen.RData")) 268 | 269 | ## Export to csv files 270 | write_delim(DT2.oa, file = file.path(path, "sPlotOpen_DT.txt"), delim="\t") 271 | write_delim(CWM_CWV.oa, file = file.path(path, "sPlotOpen_CWM_CWV.txt"), delim="\t") 272 | write_delim(header.oa, file = file.path(path, "sPlotOpen_header.txt"), delim="\t") 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # sPlotOpen_Code 2 | This project documents the construction of the sPlotOpen database. The whole workflow is described in RMarkdown, which contains also code for creating figures and tables of the sister project [sPlotOpen_Manuscript](https://github.com/fmsabatini/sPlotOpen_Manuscript). 3 |
4 | **DATA VERSION 2.0 - All samples from draws 1, 2 and 3.** 5 |
6 | 7 | + **html Report on database construction** at https://htmlpreview.github.io/?https://github.com/fmsabatini/sPlotOpen_Code/blob/Res123/_public/02_BuildDataset.html 8 | 9 | This is an initiative by [sPlot - The Global Vegetation-Plot Database](https://www.idiv.de/en/splot.html) 10 | -------------------------------------------------------------------------------- /_public/05_Demo.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fmsabatini/sPlotOpen_Code/06fa267d1fb0cf7d7a167a2c5bb1e3bb393ec7a1/_public/05_Demo.pdf -------------------------------------------------------------------------------- /_resampling/01_running_a_global_PCA_on_bioclimatic_and_soil_variables.R: -------------------------------------------------------------------------------- 1 | # 2 | # 27 January 2017 3 | # 4 | library(raster) 5 | library(fBasics) 6 | library(factoextra) 7 | library(maptools) 8 | library(RColorBrewer) 9 | library(gridExtra) 10 | # 11 | data(wrld_simpl) 12 | # 13 | # Load all bioclimatic and soil variables at 2.5 arc minute resolution across terrestrial Earth 14 | # 15 | load("splot.world2.RData") 16 | ls() # splot.world2 17 | # 18 | # Run the PCA on the entire matrix of environmental variables (climate + soil) 19 | # 20 | pca3 <- prcomp(splot.world2[, c(5:15,17:35)], center=T, scale=T) 21 | # 22 | save(pca3, file="pca3.RData") 23 | # 24 | # Check summary statistics and interpret the PCA axes 25 | # 26 | summary(pca3) # Three first PCA axes account for 75% of the total inertia with 47% and 23% loadings on the first and second PCA axes, respectively 27 | var <- get_pca_var(pca3) 28 | var$coord[, 1:3] 29 | var_cor_func <- function(var.loadings, comp.sdev){var.loadings*comp.sdev} 30 | loadings <- pca3$rotation 31 | sdev <- pca3$sdev 32 | var.cor <- t(apply(loadings, 1, var_cor_func, sdev)) 33 | var.cor[, 1:3] 34 | var.cos2 <- var.cor^2 35 | comp.cos2 <- apply(var.cos2, 2, sum) 36 | contrib <- function(var.cos2, comp.cos2){var.cos2*100/comp.cos2} 37 | var.contrib <- t(apply(var.cos2, 1, contrib, comp.cos2)) 38 | var.contrib[, 1:3] 39 | # 40 | # Map the first PCA axes in space at 2.5 arc minute resolution 41 | # 42 | rgeo <- raster(nrows=360, ncols=720, xmn=-180, xmx=180, ymn=-90, ymx=90) # raster at half a degree resolution (cf. 
30 arc minute resolution) 43 | rgeo <- disaggregate(rgeo, fact=12) # raster at 2.5 arc minute resolution 44 | splot.world2$cellID <- cellFromXY(rgeo, cbind(splot.world2$RAST_X, splot.world2$RAST_Y)) 45 | # 46 | posit <- splot.world2$cellID 47 | temp <- getValues(rgeo) 48 | # 49 | temp[posit] <- pca3$x[, 1] 50 | PC1_r <- setValues(rgeo, temp) 51 | ramp_PC1 <- colorRampPalette(brewer.pal(11, "Spectral")) 52 | breaks_PC1 <- quantile(PC1_r, probs=seq(0, 1, 0.01)) 53 | # 54 | save(PC1_r, file="PC1_r.RData") 55 | # 56 | temp[posit] <- pca3$x[, 2] 57 | PC2_r <- setValues(rgeo, temp) 58 | ramp_PC2 <- colorRampPalette(brewer.pal(11, "Spectral")) 59 | breaks_PC2 <- quantile(PC2_r, probs=seq(0, 1, 0.01)) 60 | # 61 | save(PC2_r, file="PC2_r.RData") 62 | # 63 | temp[posit] <- pca3$x[, 3] 64 | PC3_r <- setValues(rgeo, temp) 65 | ramp_PC3 <- colorRampPalette(brewer.pal(11, "Spectral")) 66 | breaks_PC3 <- quantile(PC3_r, probs=seq(0, 1, 0.01)) 67 | # 68 | save(PC3_r, file="PC3_r.RData") 69 | # 70 | # Plot PCA outputs 71 | # 72 | tiff(filename="PCA_outputs.tiff", width=24, height=8, res=300, unit="cm") 73 | p1 <- fviz_screeplot(pca3, ncp=10) 74 | p2 <- fviz_pca_var(pca3, axes=c(1, 2)) 75 | p3 <- fviz_pca_var(pca3, axes=c(2, 3)) 76 | grid.arrange(p1, p2, p3, ncol=3, nrow=1) 77 | multiplot(p1, p2, p3, cols=3) 78 | dev.off() 79 | # 80 | # Plot the global distribution of PCA axis 1 at 2.5 arc minute resolution 81 | # 82 | tiff(filename="PCA_axis1_at_2.5_arc_minute.tiff", width=20, height=12, res=300, unit="cm") 83 | par(mar=c(4, 4, 4, 1)) 84 | plot(PC1_r, breaks=breaks_PC1, col=rev(ramp_PC1(100)), legend=FALSE) 85 | plot(PC1_r, breaks=breaks_PC1, col=rev(ramp_PC1(100)), legend.only=TRUE, legend.width=1, legend.shrink=0.75, axis.args=list(at=round(seq(min(breaks_PC1), max(breaks_PC1), length.out=10)), cex.axis=0.6), legend.args=list(text="PC1", side=3, font=2, line=0.4, cex=0.8)) 86 | plot(wrld_simpl, add=T, border="darkgrey", lwd=0.1) 87 | title(main="First PCA axis (cold and seasonal to hot and stable) \nat 2.5 arc minute resolution") 88 | dev.off() 89 | # 90 | # Plot the global distribution of PCA axis 2 at 2.5 arc minute resolution 91 | # 92 | tiff(filename="PCA_axis2_at_2.5_arc_minute.tiff", width=20, height=12, res=300, unit="cm") 93 | par(mar=c(4, 4, 4, 1)) 94 | plot(PC2_r, breaks=breaks_PC2, col=ramp_PC2(100), legend=FALSE) 95 | plot(PC2_r, breaks=breaks_PC2, col=ramp_PC2(100), legend.only=TRUE, legend.width=1, legend.shrink=0.75, axis.args=list(at=round(seq(min(breaks_PC2), max(breaks_PC2), length.out=10)), cex.axis=0.6), legend.args=list(text="PC2", side=3, font=2, line=0.4, cex=0.8)) 96 | plot(wrld_simpl, add=T, border="darkgrey", lwd=0.1) 97 | title(main="Second PCA axis (dry to wet) \nat 2.5 arc minute resolution") 98 | dev.off() 99 | # 100 | # Plot the global distribution of PCA axis 3 at 2.5 arc minute resolution 101 | # 102 | tiff(filename="PCA_axis3_at_2.5_arc_minute.tiff", width=20, height=12, res=300, unit="cm") 103 | par(mar=c(4, 4, 4, 1)) 104 | plot(PC3_r, breaks=breaks_PC3, col=ramp_PC3(100), legend=FALSE) 105 | plot(PC3_r, breaks=breaks_PC3, col=ramp_PC3(100), legend.only=TRUE, legend.width=1, legend.shrink=0.75, axis.args=list(at=round(seq(min(breaks_PC3), max(breaks_PC3), length.out=10)), cex.axis=0.6), legend.args=list(text="PC2", side=3, font=2, line=0.4, cex=0.8)) 106 | plot(wrld_simpl, add=T, border="darkgrey", lwd=0.1) 107 | title(main="Third PCA axis (sandy to clayey) \nat 2.5 arc minute resolution") 108 | dev.off() 109 | # 110 | 
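# Optional check (sketch): the share of total variance captured by each PCA axis
# can be read directly from pca3$sdev; the same computation is used in
# 04_FiguresTables.Rmd to label the PC1/PC2 axes of Figure 2.
varexpl <- round(pca3$sdev^2 / sum(pca3$sdev^2) * 100, 1)
varexpl[1:3]  # percentage of total inertia on PC1, PC2 and PC3
#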
-------------------------------------------------------------------------------- /_resampling/02_resampling_sPlot_within_the_PC1-PC2_environmental_space.R: -------------------------------------------------------------------------------- 1 | # 2 | # 01 February 2017 3 | # 4 | library(spdep) 5 | library(raster) 6 | library(maptools) 7 | library(fBasics) 8 | library(devtools) 9 | library(parallelsugar) 10 | library(colorRamps) 11 | library(Rcpp) 12 | library(bigmemory) 13 | library(RcppArmadillo) 14 | library(RcppParallel) 15 | # 16 | data(wrld_simpl) 17 | # 18 | # Loading the header data from the sPlot database v2.1 19 | # 20 | load("sPlot_header_20161124.RData") 21 | ls() # header 22 | dim(header) # 1121244 relevés and 51 variables 23 | length(which(is.na(header$Longitude))) # 558 relevés without longitude coordinates 24 | length(which(is.na(header$Latitude))) # 558 relevés without latitude coordinates 25 | posit <- which(is.na(header$Longitude)&is.na(header$Latitude)) 26 | plot_data <- header[-posit, ] 27 | dim(plot_data) # 1120686 relevés and 51 variables 28 | rm(header) 29 | # 30 | # Make plot_data as a spatial dataframe 31 | # 32 | CRSlonlat <- CRS("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs +towgs84=0,0,0") 33 | coords <- cbind(plot_data$Longitude, plot_data$Latitude) 34 | coords <- SpatialPoints(coords, proj4string=CRSlonlat) 35 | plot_data <- SpatialPointsDataFrame(coords, plot_data, proj4string=CRSlonlat) 36 | class(plot_data) 37 | # 38 | # Compute the initial sampling effort across the geographical space per spatial unit of 2.5 arc minute 39 | # 40 | rgeo <- raster(nrows=360, ncols=720, xmn=-180, xmx=180, ymn=-90, ymx=90) # raster at half a degree resolution (cf. 30 arc minute resolution) 41 | rgeo <- disaggregate(rgeo, fact=12) # raster at 2.5 arc minute resolution hich is about 5 km at the equator (25 km2 is the approximate area of a spatial unit at the equator) 42 | init_seff_rgeo <- rasterize(plot_data@data[, c("POINT_X", "POINT_Y")], rgeo, fun="count") 43 | sum(getValues(init_seff_rgeo), na.rm=TRUE) # 1120686 relevés 44 | # 45 | # Remove plots for which location accuracy is above 2821 m (cf. 
radius of a buffer circle of 25 km2 around the plot location) 46 | # 47 | plot_data <- plot_data@data 48 | buffer <- round(sqrt(25/pi)*1000) 49 | posit <- which(plot_data[, "Location uncertainty (m)"]>buffer) # Be careful, there are 261 plots without location uncertainty but with coordinates that we will keep in the end 50 | length(posit) # 280224 relevés for which location accuracy exceeds a radius of 2821 m 51 | plot_data <- plot_data[-posit, ] 52 | dim(plot_data) # 840462 relevés and 51 variables 53 | # 54 | # Remove plots from wetlands 55 | # 56 | posit <- which(is.na(plot_data$Grassland)&is.na(plot_data$Forest)&is.na(plot_data$Shrubland)&is.na(plot_data$Sparse.vegetation)&plot_data$Wetland==1) 57 | length(posit) # 8323 relevés that are pure wetlands 58 | plot_data <- plot_data[-posit, ] 59 | dim(plot_data) # 832139 relevés and 51 variables 60 | # 61 | # Remove plots from anthropogenic vegetation types 62 | # 63 | posit <- which(plot_data$Naturalness==3) 64 | length(posit) # 30942 relevés from anthropogenic vegetation types 65 | plot_data <- plot_data[-posit, ] 66 | dim(plot_data) # 801197 relevés and 51 variables 67 | # 68 | # Import DT2 which is the full dataset (free of non-vascular plants) with species composition 69 | # 70 | load("DT2_20161025.RData") 71 | dim(DT2) # 22195966 species occurrences and 7 variables 72 | # 73 | # Match it with the plot_data dataframe 74 | # 75 | length(unique(DT2$PlotObservationID)) # 1117369 relevés 76 | length(unique(DT2$PlotObservationID))-length(unique(plot_data$PlotObservationID)) # 316172 relevés in DT2 but missing from plot_data 77 | posit <- match(DT2$PlotObservationID, plot_data$PlotObservationID) 78 | any(is.na(posit)) # TRUE: some relevés (n = 316172) in DT2 are not in plot_data 79 | length(DT2$PlotObservationID[is.na(posit)]) # 6168698 rows in DT2 corresponding to the species lists of the relevés missing from plot_data 80 | DT2 <- DT2[is.finite(posit), ] 81 | length(unique(DT2$PlotObservationID)) # 799400 relevés 82 | posit <- match(plot_data$PlotObservationID, DT2$PlotObservationID) 83 | any(is.na(posit)) # TRUE: some relevés (n = 1797) in plot_data are not in DT2 (cf. plots with only non-vascular plants?) 84 | length(plot_data$PlotObservationID[is.na(posit)]) # 1797 relevés in plot_data are not in DT2 (cf. plots with only non-vascular plants?) 
85 | plot_data <- plot_data[is.finite(posit), ] 86 | length(unique(plot_data$PlotObservationID)) # 799400 relevés which matches with DT2 87 | # 88 | save(plot_data, file="plot_data.RData") 89 | # 90 | # Make plot_data as a spatial dataframe again 91 | # 92 | coords <- cbind(plot_data$Longitude, plot_data$Latitude) 93 | coords <- SpatialPoints(coords, proj4string=CRSlonlat) 94 | plot_data <- SpatialPointsDataFrame(coords, plot_data, proj4string=CRSlonlat) 95 | class(plot_data) 96 | # 97 | # Check for relevés with identical spatial coordinates (just for information) 98 | # 99 | coordID <- paste(plot_data@data$Longitude, plot_data@data$Latitude, sep=":") 100 | length(coordID) # 799400 relevés 101 | length(unique(coordID)) # 509977 relevés with unique coordinates (about 64% of the relevés have unique coordinates) 102 | # 103 | # Plot the global sampling effort per spatial unit of 2.5 arc minute 104 | # 105 | seff_rgeo <- rasterize(plot_data@data[, c("POINT_X", "POINT_Y")], rgeo, fun="count") 106 | sum(getValues(seff_rgeo), na.rm=TRUE) # 799400 relevés 107 | tiff(filename="Sampling_effort_at_2.5_arc_minute.tiff", width=20, height=12, res=300, unit="cm") 108 | par(mar=c(4, 4, 4, 1)) 109 | plot(log(seff_rgeo), legend=FALSE, asp=0, col=rev(divPalette(n=100, name="Spectral")), xlab="Longitude", ylab="Latitude") 110 | plot(log(seff_rgeo), legend.only=TRUE, col=rev(divPalette(n=100, name="Spectral")), legend.width=1, legend.shrink=0.75, axis.args=list(at=seq(log(minValue(seff_rgeo)), log(maxValue(seff_rgeo)), length.out=5), labels=round(exp(seq(log(minValue(seff_rgeo)), log(maxValue(seff_rgeo)), length.out=5))), cex.axis=0.6), legend.args=list(text="N", side=3, font=2, line=0, cex=0.8)) 111 | plot(wrld_simpl, add=T, border="darkgrey", lwd=0.1) 112 | title(main="Number (log-scale) of plots \nper 2.5 arc-minute spatial unit") 113 | dev.off() 114 | # 115 | # Plot the global sampling effort per spatial unit of 0.5 degree 116 | # 117 | seff_rgeo <- aggregate(seff_rgeo, fact=12, fun=sum) 118 | sum(getValues(seff_rgeo), na.rm=TRUE) # 799400 relevés 119 | tiff(filename="Sampling_effort_at_0.5_degree.tiff", width=20, height=12, res=300, unit="cm") 120 | par(mar=c(4, 4, 4, 1)) 121 | plot(log(seff_rgeo), legend=FALSE, asp=0, col=rev(divPalette(n=100, name="Spectral")), xlab="Longitude", ylab="Latitude") 122 | plot(log(seff_rgeo), legend.only=TRUE, col=rev(divPalette(n=100, name="Spectral")), legend.width=1, legend.shrink=0.75, axis.args=list(at=seq(log(minValue(seff_rgeo)), log(maxValue(seff_rgeo)), length.out=5), labels=round(exp(seq(log(minValue(seff_rgeo)), log(maxValue(seff_rgeo)), length.out=5))), cex.axis=0.6), legend.args=list(text="N", side=3, font=2, line=0, cex=0.8)) 123 | plot(wrld_simpl, add=T, border="darkgrey", lwd=0.1) 124 | title(main="Number (log-scale) of plots \nper 30 arc-minute spatial unit") 125 | dev.off() 126 | # 127 | # Plot the global sampling effort per spatial unit of 1 degree 128 | # 129 | seff_rgeo <- aggregate(seff_rgeo, fact=2, fun=sum) 130 | sum(getValues(seff_rgeo), na.rm=TRUE) # 799400 relevés 131 | tiff(filename="Sampling_effort_at_1_degree.tiff", width=20, height=12, res=300, unit="cm") 132 | par(mar=c(4, 4, 4, 1)) 133 | plot(log(seff_rgeo), legend=FALSE, asp=0, col=rev(divPalette(n=100, name="Spectral")), xlab="Longitude", ylab="Latitude") 134 | plot(log(seff_rgeo), legend.only=TRUE, col=rev(divPalette(n=100, name="Spectral")), legend.width=1, legend.shrink=0.75, axis.args=list(at=seq(log(minValue(seff_rgeo)), log(maxValue(seff_rgeo)), length.out=5), 
labels=round(exp(seq(log(minValue(seff_rgeo)), log(maxValue(seff_rgeo)), length.out=5))), cex.axis=0.6), legend.args=list(text="N", side=3, font=2, line=0, cex=0.8)) 135 | plot(wrld_simpl, add=T, border="darkgrey", lwd=0.1) 136 | title(main="Number (log-scale) of plots \nper 1 degree spatial unit") 137 | dev.off() 138 | # 139 | # Plot the global sampling effort per spatial unit of 2 degrees 140 | # 141 | seff_rgeo <- aggregate(seff_rgeo, fact=2, fun=sum) 142 | sum(getValues(seff_rgeo), na.rm=TRUE) # 799400 relevés 143 | tiff(filename="Sampling_effort_at_2_degrees.tiff", width=20, height=12, res=300, unit="cm") 144 | par(mar=c(4, 4, 4, 1)) 145 | plot(log(seff_rgeo), legend=FALSE, asp=0, col=rev(divPalette(n=100, name="Spectral")), xlab="Longitude", ylab="Latitude") 146 | plot(log(seff_rgeo), legend.only=TRUE, col=rev(divPalette(n=100, name="Spectral")), legend.width=1, legend.shrink=0.75, axis.args=list(at=seq(log(minValue(seff_rgeo)), log(maxValue(seff_rgeo)), length.out=5), labels=round(exp(seq(log(minValue(seff_rgeo)), log(maxValue(seff_rgeo)), length.out=5))), cex.axis=0.6), legend.args=list(text="N", side=3, font=2, line=0, cex=0.8)) 147 | plot(wrld_simpl, add=T, border="darkgrey", lwd=0.1) 148 | title(main="Number (log-scale) of plots \nper 2 degrees spatial unit") 149 | dev.off() 150 | # 151 | # Plot the difference with the initial sampling effort per spatial unit of 2 degrees 152 | # 153 | init_seff_rgeo <- aggregate(init_seff_rgeo, fact=48, fun=sum) 154 | sum(getValues(init_seff_rgeo), na.rm=TRUE) # 1120686 relevés 155 | diff_seff_rgeo <- init_seff_rgeo-seff_rgeo 156 | sum(getValues(diff_seff_rgeo), na.rm=TRUE) # 316728 relevés lost after data filtering 157 | tiff(filename="Plot_loss_at_2_degrees.tiff", width=20, height=12, res=300, unit="cm") 158 | par(mar=c(4, 4, 4, 1)) 159 | plot(log(diff_seff_rgeo+1), legend=FALSE, asp=0, col=c("grey", rev(divPalette(n=99, name="Spectral"))), xlab="Longitude", ylab="Latitude") 160 | plot(log(diff_seff_rgeo+1), legend.only=TRUE, col=c("grey", rev(divPalette(n=99, name="Spectral"))), legend.width=1, legend.shrink=0.75, axis.args=list(at=seq(0, log(maxValue(diff_seff_rgeo)+1), length.out=5), labels=round(seq(0, exp(log(maxValue(diff_seff_rgeo)+1)), length.out=5)), cex.axis=0.6), legend.args=list(text="N", side=3, font=2, line=0, cex=0.8)) 161 | plot(wrld_simpl, add=T, border="darkgrey", lwd=0.1) 162 | title(main="Number (log-scale) of plots \nper 2 degrees spatial unit") 163 | dev.off() 164 | # 165 | # Compute the global sampling effort across the bivariate (PC1-PC2) environmental space (not the geographical space) 166 | # 167 | load("PC1_r.RData") 168 | load("PC2_r.RData") 169 | plot_data@data$cellID <- cellFromXY(rgeo, cbind(plot_data@data$POINT_X, plot_data@data$POINT_Y)) 170 | plot_data@data$pc1_val <- extract(PC1_r, coordinates(plot_data)) 171 | plot_data@data$pc2_val <- extract(PC2_r, coordinates(plot_data)) 172 | load("pca3.RData") 173 | res <- 100 # Setting the number of bins per PCA axis to 100 174 | reco <- raster(nrows=res, ncols=res, xmn=min(pca3$x[, 1]), xmx=max(pca3$x[, 1]), ymn=min(pca3$x[, 2]), ymx=max(pca3$x[, 2])) 175 | PC1_PC2_r <- rasterize(pca3$x[, 1:2], reco, fun="count") # Compute the density of geographic grid cells across the entire bivariate (PC1-PC2) environmental space 176 | sPlot_reco <- rasterize(plot_data@data[, c("pc1_val", "pc2_val")], reco, fun="count") # Compute the sampling effort (number of vegetation plots) per environmental unit (cell) across the entire bivariate (PC1-PC2) environmental space 177 | temp1 <- 
getValues(PC1_PC2_r) 178 | temp1[!is.na(temp1)] <- 0 # Put zero values for the empty cells (cf. there is no existing terrestrial grid cell available on Earth for the focal PC1-PC2 grid cell condition) 179 | temp2 <- getValues(sPlot_reco) 180 | temp2[which(temp1==0&is.na(temp2))] <- 0 # Put zero values for the empty cells (cf. there is no vegeteation plots available for those environmental conditions: gaps) 181 | sPlot_reco <- setValues(reco, temp2) 182 | # 183 | # Plot the number of 2.5 arc-minute cells for each cell of the PC1-PC2 space 184 | # 185 | tiff(filename="Global_availability_PC1-PC2.tiff", width=12, height=12, res=300, unit="cm") 186 | par(mar=c(4, 4, 4, 1)) 187 | plot(log(PC1_PC2_r), asp=0, col=rev(divPalette(n=100, name="RdBu")), xlab="PC1 (cold and seasonal to hot and stable)", ylab="PC2 (dry to wet)", legend=FALSE) 188 | plot(log(PC1_PC2_r), asp=0, col=rev(divPalette(n=100, name="RdBu")), legend.only=TRUE, legend.width=1, legend.shrink=0.75, axis.args=list(at=seq(log(minValue(PC1_PC2_r)), log(maxValue(PC1_PC2_r)), length.out=5), labels=round(seq(exp(log(minValue(PC1_PC2_r))), exp(log(maxValue(PC1_PC2_r))), length.out=5)), cex.axis=0.6), legend.args=list(text="N", side=3, font=2, line=0, cex=0.8)) 189 | title(main="Number of 2.5 arc-minute spatial units \nper environmental cell (log scale)") 190 | dev.off() 191 | # 192 | # Plot the number of sPlot relevés for each cell of the PC1-PC2 space 193 | # 194 | tiff(filename="Sampling_effort_PC1-PC2.tiff", width=12, height=12, res=300, unit="cm") 195 | par(mar=c(4, 4, 4, 1)) 196 | plot(log(sPlot_reco+1), asp=0, col=c("grey", rev(divPalette(n=99, name="RdBu"))), xlab="PC1 (cold and seasonal to hot and stable)", ylab="PC2 (dry to wet)", legend=FALSE) 197 | plot(log(sPlot_reco+1), asp=0, col=c("grey", rev(divPalette(n=99, name="RdBu"))), legend.only=TRUE, legend.width=1, legend.shrink=0.75, axis.args=list(at=seq(0, log(maxValue(sPlot_reco)+1), length.out=5), labels=round(seq(0, exp(log(maxValue(sPlot_reco)+1)), length.out=5)), cex.axis=0.6), legend.args=list(text="N", side=3, font=2, line=0, cex=0.8)) 198 | title(main="Number of sPlot relevés \nper environmental cell (log scale)") 199 | dev.off() 200 | # 201 | # Plot for each cell of the PC1-PC2 space the ratio between the relative proportion of sPlot relevés and the relative proportion of spatial units available worldwide 202 | # 203 | tiff(filename="Sampling_effort_ratio_PC1-PC2.tiff", width=12, height=12, res=300, unit="cm") 204 | par(mar=c(4, 4, 4, 1)) 205 | ratio_reco <- (sPlot_reco/max(getValues(sPlot_reco), na.rm=T))/(PC1_PC2_r/max(getValues(PC1_PC2_r), na.rm=T)) 206 | plot(log(ratio_reco+1), asp=0, col=c("grey", rev(divPalette(n=99, name="Spectral"))), xlab="PC1 cold and seasonal to hot and stable)", ylab="PC2 (dry to wet)") 207 | title(main="Oversampled (>0.69) versus \nundersampled (<0.69) PC1-PC2 cells") 208 | dev.off() 209 | # 210 | # Run a sensitivity analysis to define the most appropriate resolution of the bivariate (PC1-PC2) environmental space 211 | # 212 | res <- seq(10, 500, 10) 213 | ncell_disp <- c() 214 | ncell_samp <- c() 215 | seff_med <- c() 216 | seff_mean <- c() 217 | seff_max <- c() 218 | seff_min <- c() 219 | nbrel_sel <- c() 220 | for (i in 1:length(res)) { 221 | print(paste(i, "of", length(res), sep=" ")) 222 | r <- raster(nrows=res[i], ncols=res[i], xmn=min(pca3$x[, 1]), xmx=max(pca3$x[, 1]), ymn=min(pca3$x[, 2]), ymx=max(pca3$x[, 2])) 223 | temp <- rasterize(pca3$x[, 1:2], r, fun="count") 224 | ncell_disp <- c(ncell_disp, 
length(which(getValues(temp)>0))) 225 | temp <- rasterize(plot_data@data[, c("pc1_val", "pc2_val")], r, fun="count") 226 | temp <- getValues(temp) 227 | temp <- na.omit(temp) 228 | ncell_samp <- c(ncell_samp, length(which(temp)>0)) 229 | seff_med <- c(seff_med, median(temp)) 230 | seff_mean <- c(seff_mean, mean(temp)) 231 | seff_max <- c(seff_max, max(temp)) 232 | seff_min <- c(seff_min, min(temp)) 233 | nbrel_sel <- c(nbrel_sel, length(temp[which(temp>median(temp)), ])*median(temp)+sum(temp[which(temp<=median(temp)), ])) 234 | } 235 | plot(res, seff_med) 236 | plot(res, seff_max) 237 | plot(res, seff_mean) 238 | plot(res, nbrel_sel) 239 | plot(res, ncell_samp/ncell_disp, ylim=c(0, 1)) 240 | # 241 | # Resample sPlot within the PC1-PC2 environmental space to get an environmentally-balanced subset 242 | # 243 | plot_data <- plot_data@data 244 | save(plot_data, file="plot_data.RData") # Save the latest version of plot_data 245 | Sys.setenv("PKG_CXXFLAGS"="-fopenmp") # Set environment variables for other processes called from within R 246 | sourceCpp("/_functions_TH/bray.part.OpenMP.cpp") # Source C++ fonctions written by Tarek Hattab from folder "_functions_TH" 247 | sourceCpp("/_functions_TH/bray.part.C_RcppParallel.cpp") # Source C++ fonctions written by Tarek Hattab from folder "_functions_TH" 248 | sourceCpp("/_functions_TH/hcr.C.cpp") # Source C++ fonctions written by Tarek Hattab from folder "_functions_TH" 249 | sourceCpp("/_functions_TH/cast_binary.cpp") # Source C++ fonctions written by Tarek Hattab from folder "_functions_TH" 250 | BigBrayPart <- function(bigMat) { 251 | zeros <- big.matrix(nrow=nrow(bigMat), ncol=nrow(bigMat), init=0, type=typeof(bigMat), shared=FALSE, backingfile=paste("BrayMatrix_",i,sep=""), backingpath=getwd(), descriptorfile=paste("BrayMatrix_",i,".desc",sep="")) 252 | bray_distance_OpenMP(bigMat@address, zeros@address) 253 | return(zeros) 254 | } 255 | res <- 100 # Set the resolution of the environmental space based on the sensitivity analysis 256 | r <- raster(nrows=res, ncols=res, xmn=min(pca3$x[, 1]), xmx=max(pca3$x[, 1]), ymn=min(pca3$x[, 2]), ymx=max(pca3$x[, 2])) # Prepare the environmental space restricted to sPlot relevés only (not the entire environmental space available at a global extent) 257 | pca_sPlot_r <- rasterize(plot_data[, c("pc1_val", "pc2_val")], r, fun="count") 258 | cutoff <- median(values(pca_sPlot_r), na.rm=TRUE) # Compute the cutoff value above which relevés have to be resampled for a given cell 259 | tempZoneOut <- coordinates(pca_sPlot_r)[which(values(pca_sPlot_r)>cutoff), ] # Select only the coordinates of the environmental cells for which the total number of sPlot relevés available exceeds the cutoff value 260 | repet <- 100 # Set the number of repetitions for the HCR function 261 | sp_data <- DT2[, c(1, 2, 7)] # Prepare the species data table that will be used by the HCR approach 262 | names(sp_data) <- c("plot_id", "sp_name", "rel_cov") 263 | save(sp_data, file="sp_data.RData") # Save the latest version of sp_data 264 | plotToRemove <- as.list(rep(NA, repet)) # Prepare an empty object to store the IDs of the relevés to be removed 265 | for (i in 1:nrow(tempZoneOut)) { 266 | print("--------") 267 | print(paste(i, "out of", nrow(tempZoneOut), "cells", sep=" ")) 268 | plot(pca_sPlot_r, asp=0, xlab="PC1 (cold and seasonal to hot and stable)", ylab="PC2 (dry to wet)") 269 | points(tempZoneOut[, 1], tempZoneOut[, 2], cex=0.5) 270 | points(tempZoneOut[i, 1], tempZoneOut[i, 2], col="red", pch=19) 271 | sel.plot <- 
which(plot_data$pc1_val > tempZoneOut[i, 1]-(res(r)[1]/2) & 272 | plot_data$pc1_val < tempZoneOut[i, 1]+(res(r)[1]/2) & 273 | plot_data$pc2_val > tempZoneOut[i, 2]-(res(r)[2]/2) & 274 | plot_data$pc2_val < tempZoneOut[i, 2]+(res(r)[2]/2)) 275 | print(paste("This cell contains", length(sel.plot), "relevés", sep=" ")) 276 | idZoneOut <- plot_data[sel.plot, "PlotID"] 277 | sel.comm <- sp_data[which(sp_data$plot_id%in%idZoneOut), c("plot_id", "sp_name", "rel_cov")] 278 | sel.comm <- na.omit(sel.comm) 279 | sel.comm [, 2]<- factor(sel.comm[, 2], labels=seq(1:length(unique(sel.comm[, 2])))) 280 | sel.comm [, 2]<- as.numeric(sel.comm[, 2]) 281 | comm.data <- castC(iD=sel.comm[, 1], sp=sel.comm[, 2], cov=sel.comm[, 3]) 282 | rowNames <- comm.data[, 1] 283 | comm.data <- comm.data[, -1] 284 | print(paste("The total number of species is", dim(comm.data)[[2]], sep=" ")) 285 | gc() 286 | if (nrow(comm.data) > 20000) { 287 | bigComMatrix <- as.big.matrix(comm.data,shared=FALSE, backingfile=paste("Matrix_",i,sep=""), backingpath=getwd(), descriptorfile=paste("Matrix_", i, ".desc", sep="")) ; brayBalDist <- BigBrayPart(bigComMatrix) 288 | } else { 289 | brayBalDist <- bray_distance_RcppParallel(comm.data); brayBalDist <- as.big.matrix(brayBalDist) 290 | } 291 | for (j in 1:repet) { 292 | selectedPlot <- HcrCPP(brayBalDist@address, nout=cutoff, nsampl=1000) 293 | selectedPlot <- rowNames[selectedPlot] 294 | selectedPlotIndex <- which(idZoneOut%in%selectedPlot) 295 | plotToRemove[[j]] <- c(plotToRemove[[j]], idZoneOut[-selectedPlotIndex]) 296 | } 297 | output <- list(i, plotToRemove) 298 | save(output, file="plotToRemove.RData") 299 | } 300 | # 301 | -------------------------------------------------------------------------------- /_resampling/03_extracting_selected_plots_from_the_sPlot_database.R: -------------------------------------------------------------------------------- 1 | # 2 | # 01 February 2017, updated on 11 December 2020 3 | # 4 | library(raster) 5 | library(fBasics) 6 | library(maptools) 7 | # 8 | data(wrld_simpl) 9 | # 10 | load("plot_data.RData") 11 | load("plotToRemove.RData") 12 | load("pca3.RData") 13 | # 14 | ls() 15 | # 16 | output[[1]] 17 | # 18 | # 858 cells from the PC1-PC2 space have been resampled with a cutoff value of 19 | # 50 plots maximum per cell with 50 being the median value of the total number 20 | # of plots across all grid cells of the PC1-PC2 space thus also being a good 21 | # compromise between quantity and quality in terms of extracting a subset of 22 | # sPlot that has a balanced sampling effort across the PC1-PC2 space 23 | # 24 | plotToRemove <- output[[2]] 25 | rm(output) 26 | class(plotToRemove) 27 | # 28 | # A list of 100 vectors (100 different resampling iterations) that contains the 29 | # IDs of sPlot releves to remove from the plot_data object 30 | # 31 | length(plotToRemove[[1]]) 32 | # 33 | # First iteration containing 700037 IDs 34 | # 35 | length(plotToRemove[[2]]) 36 | # 37 | # Second iteration containing 700022 IDs 38 | # 39 | head(plotToRemove[[1]]) 40 | head(plotToRemove[[2]]) 41 | # 42 | # First ID is NA for each vector in the list which is normal (cf. 
a property of 43 | # the resampling loop); we have to clean that up (see below) 44 | # 45 | for (i in 1:100) { 46 | plotToRemove[[i]] <- na.omit(plotToRemove[[i]]) 47 | } 48 | # 49 | # One extraction example from the first vector in the plotToRemove list object 50 | # 51 | posit <- match(plotToRemove[[1]], plot_data$PlotID) 52 | plot_sel <- plot_data[-posit, c("PlotID")] 53 | length(plot_sel) 54 | # 55 | # A total of 99364 plots seem to be selected, which is too many given that 56 | # 50*858 grid cells give only 42900 plots; even if some grid cells have fewer 57 | # than 50 plots, the total should not reach 99364, so 58 | # something is wrong 59 | # 60 | length(which(is.na(plot_data$pc1_val))) 61 | # 62 | # It seems that 42878 plots in the plot_data object have NAs for PC1 and these 63 | # mostly correspond to coastal pixels where data from SoilGrids are unavailable 64 | # 65 | posit <- which(is.na(plot_data$pc1_val)) 66 | plot_data <- plot_data[-posit, ] 67 | dim(plot_data)[[1]] 68 | # 69 | # After removing rows with NAs for PC1, the plot_data object has 756522 plots 70 | # instead of 799400 plots 71 | # 72 | posit <- match(plotToRemove[[1]], plot_data$PlotID) 73 | plot_sel <- plot_data[-posit, c("PlotID")] 74 | length(plot_sel) 75 | # 76 | # In the end, the true selection from the first resampling iteration is 56486 77 | # plots instead of 99364 plots 78 | # 79 | plot_sel <- list() 80 | for (i in 1:100) { 81 | posit <- match(plotToRemove[[i]], plot_data$PlotID) 82 | plot_sel[[i]] <- plot_data[-posit, c("PlotID")] 83 | } 84 | save(plot_sel, file="plot_sel.RData") 85 | # 86 | -------------------------------------------------------------------------------- /_resampling/README.md: -------------------------------------------------------------------------------- 1 | # Documentation on the code to resample sPlot 2 | 3 | This is the open-source R code (CC-BY Jonathan Lenoir & Tarek Hattab) that implements the resampling of the original sPlot database v2.1 into environmentally-balanced subsets. 4 | 5 | There are three main files, containing the code to: 6 | 7 | **1.** Run a global principal component analysis (PCA) encapsulating the entire environmental space covered by terrestrial systems ["*01_running_a_global_PCA_on_bioclimatic_and_soil_variables.R*"] 8 | 9 | **2.** Resample the sPlot vegetation database in a systematic manner across the 2D-space covered by the first two principal components (PCs) of the PCA from step 1 [*02_resampling_sPlot_within_the_PC1-PC2_environmental_space.R*] 10 | 11 | **3.** Extract the set of selected vegetation plots from the sPlot database using the outputs produced by the resampling algorithm in step 2 [*03_extracting_selected_plots_from_the_sPlot_database.R*] 12 | 13 | # 1. The global PCA 14 | 15 | The R script file "*01_running_a_global_PCA_on_bioclimatic_and_soil_variables.R*" contains the code to run the PCA. It first loads a large dataset of 30 variables reflecting bioclimatic and soil conditions. All these variables can be downloaded from the original open-access archives storing the global raster grids (CHELSA, CGIAR-CSI & SoilGrids). After running the global PCA and interpreting the meaning of the first two to three PCs, the script makes it possible to: (1) extract the scores of the first three PCs (PC1, PC2 & PC3); (2) store these data in global raster grids, one for each PC; and (3) map the outputs at the global extent. 16 | 17 | # 2. 
The resampling algorithm 18 | 19 | The R script file "*02_resampling_sPlot_within_the_PC1-PC2_environmental_space.R*" contains the code that: (1) loads the original sPlot database v2.1 (both the header data and the species composition data stored into two different dataframes); (2) cleans sPlot v2.1 to extract the set of vegetation plots to be used for the resampling procedure; (3) runs a sensitivity analysis to define the most appropriate grid resolution to define the bivariate (PC1-PC2) environmental space to be used to perform the resampling; and (4) resamples sPlot within the PC1-PC2 environmental space using the heterogeneity‐constrained random (HCR) reseamping algorithm (several times to get several possible iterations: 100). 20 | 21 | The HCR resampling algorithm was initially written by Attila Lengyel. We adapted the original algorithm and code to our own need for the resampling of sPlot. Due to the large amount of data in sPlot, the large dissimilarity matrices generated by the HCR algorithm (pairwise distance matrices based on plant community composition) and to speed up the processing time to resample sPlot, Tarek Hattab wrote several useful functions in C++ (see the folder entitled "*_functions_TH*" that contains all the functions written by Tarek and that are then sourced from R using the sourceCpp function from the Rcpp package). Note that you need to install Rtools on Windows machines to be able to run this set of functions. Also, we do not take any responsibility if our code makes your computer explode... 22 | 23 | # 3. The result 24 | 25 | The actual output of the resampling algorithm does not list the set of vegetation plot IDs selected by the resampling apporach for each of the 100 HCR runs but rather the list of vegetation plot IDs not selected by the resampling approach, i.e., the garbage. This is the reason why the main output of the resampling algorithm is stored in a list object called "plotToRemove". The last R script file "*03_extracting_selected_plots_from_the_sPlot_database.R*" thus contains the code to remove (for each of the 100 HCR runs) the set of non-selected plots from the sPlot 2.1 database so that we get in the end several possible realisations of environmentally-balanced datasets representative of sPlot. 26 | -------------------------------------------------------------------------------- /_resampling/_functions_TH/BetaJtu_OpenMP.cpp: -------------------------------------------------------------------------------- 1 | // To enable the functionality provided by Armadillo's various macros, 2 | 3 | // simply include them before you include the RcppArmadillo headers. 
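// NOTE: this file fills a pre-allocated big.matrix with the pairwise turnover component of
// Jaccard dissimilarity for presence/absence data. For two plots with a shared species count a
// and species unique to each plot b and c, JacElement() below returns
// 2*min(b, c) / (a + 2*min(b, c)), i.e. the quantity used as the turnover partition of Jaccard
// dissimilarity in beta-diversity partitioning. The exported betaJtu() takes two big.matrix
// external pointers (an integer 0/1 input and a double output with the same number of rows) and
// writes the result in place on a hard-coded 10 OpenMP threads. It is not sourced by the
// resampling script above, which relies on the abundance-based Bray-Curtis variants instead.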
4 | 5 | #define ARMA_NO_DEBUG 6 | 7 | // [[Rcpp::depends(RcppArmadillo, BH, bigmemory)]] 8 | 9 | // [[Rcpp::plugins(cpp11)]] 10 | 11 | // [[Rcpp::plugins(openmp)]] 12 | 13 | #include 14 | 15 | #include 16 | 17 | #include 18 | 19 | #include 20 | 21 | 22 | using namespace Rcpp; 23 | 24 | using namespace arma; 25 | 26 | template 27 | 28 | 29 | inline double JacElement(InputIterator1 begin1, InputIterator1 end1, InputIterator2 begin2){ 30 | 31 | // value to return 32 | 33 | double rval = 0; 34 | 35 | int compA = 0; 36 | 37 | int compB = 0; 38 | 39 | int compC = 0; 40 | 41 | // set iterators to beginning of ranges 42 | 43 | InputIterator1 it1 = begin1; 44 | 45 | InputIterator2 it2 = begin2; 46 | 47 | // for each input item 48 | 49 | while (it1 != end1) { 50 | 51 | // take the value and increment the iterator 52 | 53 | double d1 = *it1++; 54 | 55 | double d2 = *it2++; 56 | 57 | if (d1 == 1 && d2 ==1) { 58 | 59 | compA += 1; 60 | 61 | } 62 | 63 | if (d1 > d2) { 64 | 65 | compB += 1; 66 | 67 | } 68 | 69 | if (d1 < d2) { 70 | 71 | compC += 1; 72 | 73 | } 74 | 75 | } 76 | 77 | if (compB < compC) { 78 | 79 | double numerator = 2 * compB; 80 | 81 | double denominator = compA + numerator; 82 | 83 | rval = numerator / denominator; 84 | 85 | } 86 | 87 | else { 88 | 89 | double numerator = 2 * compC; 90 | 91 | double denominator = compA + numerator; 92 | 93 | rval = numerator / denominator; 94 | 95 | } 96 | 97 | return rval; 98 | 99 | } 100 | 101 | void BetaJtu (const arma::Mat& mat, arma::Mat rmat){ 102 | 103 | int nRows = mat.n_rows; 104 | 105 | omp_set_num_threads(10); 106 | 107 | #pragma omp parallel for 108 | 109 | for (int i = 0; i < nRows; i++) { 110 | 111 | for (int j = 0; j < nRows; j++) { 112 | 113 | arma::Row row1 = mat.row(i); 114 | 115 | arma::Row row2 = mat.row(j); 116 | 117 | rmat(i,j) = JacElement(row1.begin(), row1.end(), row2.begin()); 118 | 119 | } 120 | 121 | } 122 | 123 | } 124 | 125 | 126 | // [[Rcpp::export]] 127 | 128 | void betaJtu(SEXP mat, SEXP rmat) { 129 | 130 | XPtr xpMat(mat); 131 | 132 | XPtr xpOutMat(rmat); 133 | 134 | // create the worker 135 | BetaJtu(arma::Mat((int *)xpMat->matrix(), xpMat->nrow(), xpMat->ncol(), false), 136 | 137 | arma::Mat((double *)xpOutMat->matrix(), xpOutMat->nrow(), xpOutMat->ncol(), false)); 138 | 139 | } 140 | -------------------------------------------------------------------------------- /_resampling/_functions_TH/BetaJtu_RcppParallel.cpp: -------------------------------------------------------------------------------- 1 | // To enable the functionality provided by Armadillo's various macros, 2 | 3 | // simply include them before you include the RcppArmadillo headers. 
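// NOTE: RcppParallel counterpart of BetaJtu_OpenMP.cpp. The exported betaJtu_RcppParallel()
// takes an ordinary integer presence/absence matrix (plots in rows), computes the same pairwise
// turnover index over the lower triangle with a RcppParallel::Worker, mirrors it to the upper
// triangle, and returns the full plot-by-plot dissimilarity matrix to R. It is therefore meant
// for community matrices small enough to be handled as regular R matrices; the big.matrix/OpenMP
// version above is the one intended for very large cells.
// Sketch of a call from R (assuming the file has been compiled with Rcpp::sourceCpp()):
//   d <- betaJtu_RcppParallel(comm01)   # comm01: plots x species 0/1 integer matrix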
4 | 5 | #define ARMA_NO_DEBUG 6 | 7 | // [[Rcpp::depends(RcppArmadillo, BH, bigmemory,RcppParallel)]] 8 | 9 | // [[Rcpp::plugins(cpp11)]] 10 | 11 | #include 12 | 13 | #include 14 | 15 | #include 16 | 17 | #include 18 | 19 | using namespace Rcpp; 20 | 21 | using namespace arma; 22 | 23 | using namespace RcppParallel; 24 | 25 | 26 | template 27 | 28 | inline double JacElement(InputIterator1 begin1, InputIterator1 end1, InputIterator2 begin2){ 29 | 30 | // value to return 31 | 32 | double rval = 0; 33 | 34 | int compA = 0; 35 | 36 | int compB = 0; 37 | 38 | int compC = 0; 39 | 40 | // set iterators to beginning of ranges 41 | 42 | InputIterator1 it1 = begin1; 43 | 44 | InputIterator2 it2 = begin2; 45 | 46 | // for each input item 47 | 48 | while (it1 != end1) { 49 | 50 | // take the value and increment the iterator 51 | 52 | double d1 = *it1++; 53 | 54 | double d2 = *it2++; 55 | 56 | if (d1 == 1 && d2 ==1) { 57 | 58 | compA += 1; 59 | 60 | } 61 | 62 | if (d1 > d2) { 63 | 64 | compB += 1; 65 | 66 | } 67 | 68 | if (d1 < d2) { 69 | 70 | compC += 1; 71 | 72 | } 73 | 74 | } 75 | 76 | if (compB < compC) { 77 | 78 | double numerator = 2 * compB; 79 | 80 | double denominator = compA + numerator; 81 | 82 | rval = numerator / denominator; 83 | 84 | } 85 | 86 | else { 87 | 88 | double numerator = 2 * compC; 89 | 90 | double denominator = compA + numerator; 91 | 92 | rval = numerator / denominator; 93 | 94 | } 95 | 96 | return rval; 97 | 98 | } 99 | 100 | struct JtuDistance : public Worker { 101 | 102 | // input matrix to read from 103 | 104 | const RMatrix mat; 105 | 106 | // output matrix to write to 107 | 108 | RMatrix rmat; 109 | 110 | // initialize from Rcpp input and output matrixes (the RMatrix class 111 | 112 | // can be automatically converted to from the Rcpp matrix type) 113 | 114 | JtuDistance(const IntegerMatrix mat, NumericMatrix rmat) 115 | : mat(mat), rmat(rmat) {} 116 | 117 | // function call operator that work for the specified range (begin/end) 118 | 119 | void operator()(std::size_t begin, std::size_t end) { 120 | 121 | for (std::size_t i = begin; i < end; i++) { 122 | 123 | for (std::size_t j = 0; j < i; j++) { 124 | 125 | // rows we will operate on 126 | 127 | RMatrix::Row row1 = mat.row(i); 128 | 129 | RMatrix::Row row2 = mat.row(j); 130 | 131 | rmat(j,i) = JacElement(row1.begin(), row1.end(), row2.begin()); 132 | 133 | rmat(i,j) = rmat(j,i); 134 | 135 | } 136 | 137 | } 138 | 139 | } 140 | 141 | }; 142 | 143 | // [[Rcpp::export]] 144 | 145 | NumericMatrix betaJtu_RcppParallel(IntegerMatrix mat) { 146 | 147 | // allocate the matrix we will return 148 | NumericMatrix rmat(mat.nrow(), mat.nrow()); 149 | 150 | // create the worker 151 | JtuDistance JtuDistance(mat, rmat); 152 | 153 | // call it with parallelFor 154 | parallelFor(0, mat.nrow(), JtuDistance); 155 | 156 | 157 | return rmat; 158 | } -------------------------------------------------------------------------------- /_resampling/_functions_TH/bray.part.C_RcppParallel.cpp: -------------------------------------------------------------------------------- 1 | // To enable the functionality provided by Armadillo's various macros, 2 | 3 | // simply include them before you include the RcppArmadillo headers. 
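// NOTE: BrayElement() below computes, for two abundance (or relative-cover) rows, the
// balanced-variation component of Bray-Curtis dissimilarity, min(B, C) / (A + min(B, C)), where
// A is the sum of species-wise minima and B, C are the abundance totals unique to each of the
// two plots. The exported bray_distance_RcppParallel() returns the full symmetric plot-by-plot
// matrix and is the routine the resampling script calls for cells holding up to 20,000 plots;
// larger cells are handled by the big.matrix/OpenMP version in bray.part.OpenMP.cpp
// (see BigBrayPart() in the resampling script).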
4 | 5 | #define ARMA_NO_DEBUG 6 | 7 | // [[Rcpp::depends(RcppArmadillo, BH, bigmemory,RcppParallel)]] 8 | 9 | // [[Rcpp::plugins(cpp11)]] 10 | 11 | #include 12 | 13 | #include 14 | 15 | #include 16 | 17 | #include 18 | 19 | using namespace Rcpp; 20 | 21 | using namespace arma; 22 | 23 | using namespace RcppParallel; 24 | 25 | 26 | template 27 | 28 | inline double BrayElement(InputIterator1 begin1, InputIterator1 end1, InputIterator2 begin2){ 29 | 30 | // value to return 31 | 32 | double rval = 0; 33 | 34 | double sumPi = 0; 35 | 36 | double sumPj = 0; 37 | 38 | double aComp = 0; 39 | 40 | // set iterators to beginning of ranges 41 | 42 | InputIterator1 it1 = begin1; 43 | 44 | InputIterator2 it2 = begin2; 45 | 46 | // for each input item 47 | 48 | while (it1 != end1) { 49 | 50 | // take the value and increment the iterator 51 | 52 | double d1 = *it1++; 53 | 54 | double d2 = *it2++; 55 | 56 | sumPi += d1; 57 | 58 | sumPj += d2; 59 | 60 | aComp += std::min(d1,d2); 61 | 62 | } 63 | 64 | double bComp = sumPi - aComp; 65 | 66 | double cComp = sumPj - aComp; 67 | 68 | double minBC = 0.0; 69 | 70 | if (bComp < cComp) { 71 | 72 | minBC = bComp; 73 | 74 | } 75 | 76 | else { 77 | 78 | minBC = cComp; 79 | 80 | } 81 | 82 | rval = minBC / (aComp + minBC); 83 | 84 | return rval; 85 | } 86 | 87 | struct BrayDistance : public Worker { 88 | 89 | // input matrix to read from 90 | 91 | const RMatrix mat; 92 | 93 | // output matrix to write to 94 | 95 | RMatrix rmat; 96 | 97 | // initialize from Rcpp input and output matrixes (the RMatrix class 98 | 99 | // can be automatically converted to from the Rcpp matrix type) 100 | 101 | BrayDistance(const NumericMatrix mat, NumericMatrix rmat) 102 | : mat(mat), rmat(rmat) {} 103 | 104 | // function call operator that work for the specified range (begin/end) 105 | 106 | void operator()(std::size_t begin, std::size_t end) { 107 | 108 | for (std::size_t i = begin; i < end; i++) { 109 | 110 | for (std::size_t j = 0; j < i; j++) { 111 | 112 | // rows we will operate on 113 | 114 | RMatrix::Row row1 = mat.row(i); 115 | 116 | RMatrix::Row row2 = mat.row(j); 117 | 118 | rmat(j,i) = BrayElement(row1.begin(), row1.end(), row2.begin()); 119 | 120 | rmat(i,j) = rmat(j,i); 121 | 122 | } 123 | 124 | } 125 | 126 | } 127 | 128 | }; 129 | 130 | // [[Rcpp::export]] 131 | 132 | NumericMatrix bray_distance_RcppParallel(NumericMatrix mat) { 133 | 134 | // allocate the matrix we will return 135 | NumericMatrix rmat(mat.nrow(), mat.nrow()); 136 | 137 | // create the worker 138 | BrayDistance BrayDistance(mat, rmat); 139 | 140 | // call it with parallelFor 141 | parallelFor(0, mat.nrow(), BrayDistance); 142 | 143 | 144 | return rmat; 145 | } 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | -------------------------------------------------------------------------------- /_resampling/_functions_TH/bray.part.OpenMP.cpp: -------------------------------------------------------------------------------- 1 | // To enable the functionality provided by Armadillo's various macros, 2 | 3 | // simply include them before you include the RcppArmadillo headers. 
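// NOTE: same balanced-variation Bray-Curtis index as bray.part.C_RcppParallel.cpp, but written
// for very large community matrices: both the input and the pre-allocated output are passed as
// big.matrix external pointers, bray_distance_OpenMP() fills the output in place and returns
// nothing, and the double loop runs on a hard-coded 10 OpenMP threads. Both matrices are read
// and written as doubles here, so the big.matrix objects created on the R side (see
// BigBrayPart() in the resampling script) should be of type "double".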
4 | 5 | #define ARMA_NO_DEBUG 6 | 7 | // [[Rcpp::depends(RcppArmadillo, BH, bigmemory)]] 8 | 9 | // [[Rcpp::plugins(cpp11)]] 10 | 11 | // [[Rcpp::plugins(openmp)]] 12 | 13 | #include 14 | 15 | #include 16 | 17 | #include 18 | 19 | #include 20 | 21 | 22 | using namespace Rcpp; 23 | 24 | using namespace arma; 25 | 26 | template 27 | 28 | inline double BrayElement(InputIterator1 begin1, InputIterator1 end1, InputIterator2 begin2){ 29 | 30 | // value to return 31 | 32 | double rval = 0; 33 | 34 | double sumPi = 0; 35 | 36 | double sumPj = 0; 37 | 38 | double aComp = 0; 39 | 40 | // set iterators to beginning of ranges 41 | 42 | InputIterator1 it1 = begin1; 43 | 44 | InputIterator2 it2 = begin2; 45 | 46 | // for each input item 47 | 48 | while (it1 != end1) { 49 | 50 | // take the value and increment the iterator 51 | 52 | double d1 = *it1++; 53 | 54 | double d2 = *it2++; 55 | 56 | sumPi += d1; 57 | 58 | sumPj += d2; 59 | 60 | aComp += std::min(d1,d2); 61 | 62 | } 63 | 64 | double bComp = sumPi - aComp; 65 | 66 | double cComp = sumPj - aComp; 67 | 68 | double minBC = 0.0; 69 | 70 | if (bComp < cComp) { 71 | 72 | minBC = bComp; 73 | 74 | } 75 | 76 | else { 77 | 78 | minBC = cComp; 79 | 80 | } 81 | 82 | rval = minBC / (aComp + minBC); 83 | 84 | return rval; 85 | } 86 | 87 | 88 | void BrayDistance(const arma::Mat& mat, arma::Mat rmat){ 89 | 90 | int nRows = mat.n_rows; 91 | 92 | omp_set_num_threads(10); 93 | 94 | #pragma omp parallel for 95 | 96 | for (int i = 0; i < nRows; i++) { 97 | 98 | for (int j = 0; j < nRows; j++) { 99 | 100 | arma::Row row1 = mat.row(i); 101 | 102 | arma::Row row2 = mat.row(j); 103 | 104 | rmat(i,j) = BrayElement(row1.begin(), row1.end(), row2.begin()); 105 | 106 | } 107 | 108 | } 109 | 110 | rmat = rmat.t(); 111 | 112 | } 113 | 114 | // [[Rcpp::export]] 115 | 116 | void bray_distance_OpenMP(SEXP mat, SEXP rmat) { 117 | 118 | XPtr xpMat(mat); 119 | 120 | XPtr xpOutMat(rmat); 121 | 122 | // create the worker 123 | BrayDistance(arma::Mat((double *)xpMat->matrix(), xpMat->nrow(), xpMat->ncol(), false), 124 | arma::Mat((double *)xpOutMat->matrix(), xpOutMat->nrow(), xpOutMat->ncol(), false)); 125 | 126 | } 127 | -------------------------------------------------------------------------------- /_resampling/_functions_TH/cast.cpp: -------------------------------------------------------------------------------- 1 | // To enable the functionality provided by Armadillo's various macros, 2 | 3 | // simply include them before you include the RcppArmadillo headers. 
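// NOTE: castC() below turns a long-format table (plot ID, species ID, cover) into a wide
// plots x species matrix, with the plot IDs stored in the first column and covers summed when a
// species occurs more than once in a plot. cast_binary.cpp (next file) exports a function with
// the same name that writes presence (1) instead of the summed cover; that binary version is the
// one sourced by the resampling script, and sourcing both in one session simply makes the later
// definition mask the earlier one. In both versions the species IDs are used directly as column
// indices, which is why the resampling script recodes species names to consecutive integers
// (1..n) before calling castC().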
4 | 5 | #define ARMA_NO_DEBUG 6 | 7 | // [[Rcpp::depends(RcppArmadillo, BH, bigmemory)]] 8 | 9 | // [[Rcpp::plugins(cpp11)]] 10 | 11 | #include 12 | 13 | #include 14 | 15 | #include 16 | 17 | #include 18 | 19 | using namespace Rcpp; 20 | 21 | using namespace arma; 22 | 23 | IntegerVector whichEq( IntegerVector x, int value) { 24 | 25 | int nx = x.size(); 26 | std::vector y; 27 | y.reserve(nx); 28 | 29 | for(int i = 0; i < nx; i++) { 30 | if (x[i] == value) y.push_back(i); 31 | } 32 | 33 | return wrap(y); 34 | } 35 | 36 | 37 | // [[Rcpp::export]] 38 | 39 | NumericMatrix castC(IntegerVector iD, IntegerVector sp , NumericVector cov){ 40 | 41 | int sizeI = iD.size(); 42 | 43 | IntegerVector plotNb = unique(iD); 44 | 45 | IntegerVector spNb = unique(sp); 46 | 47 | int rowSize = plotNb.size(); 48 | 49 | int colSize= spNb.size() ; 50 | 51 | NumericMatrix comMat(rowSize, colSize +1); 52 | 53 | comMat(_ , 0) = plotNb; 54 | 55 | for (int i = 0; i < sizeI ; i++) { 56 | 57 | IntegerVector rowI = whichEq (plotNb,iD[i]); 58 | 59 | int plotID = rowI[0]; 60 | 61 | int species = sp[i]; 62 | 63 | comMat(plotID, species) += cov[i]; 64 | 65 | } 66 | 67 | return comMat; 68 | 69 | } 70 | -------------------------------------------------------------------------------- /_resampling/_functions_TH/cast_binary.cpp: -------------------------------------------------------------------------------- 1 | // To enable the functionality provided by Armadillo's various macros, 2 | 3 | // simply include them before you include the RcppArmadillo headers. 4 | 5 | #define ARMA_NO_DEBUG 6 | 7 | // [[Rcpp::depends(RcppArmadillo, BH, bigmemory)]] 8 | 9 | // [[Rcpp::plugins(cpp11)]] 10 | 11 | #include 12 | 13 | #include 14 | 15 | #include 16 | 17 | #include 18 | 19 | using namespace Rcpp; 20 | 21 | using namespace arma; 22 | 23 | IntegerVector whichEq( IntegerVector x, int value) { 24 | 25 | int nx = x.size(); 26 | std::vector y; 27 | y.reserve(nx); 28 | 29 | for(int i = 0; i < nx; i++) { 30 | if (x[i] == value) y.push_back(i); 31 | } 32 | 33 | return wrap(y); 34 | } 35 | 36 | 37 | // [[Rcpp::export]] 38 | 39 | IntegerMatrix castC(IntegerVector iD, IntegerVector sp , NumericVector cov){ 40 | 41 | int sizeI = iD.size(); 42 | 43 | IntegerVector plotNb = unique(iD); 44 | 45 | IntegerVector spNb = unique(sp); 46 | 47 | int rowSize = plotNb.size(); 48 | 49 | int colSize= spNb.size() ; 50 | 51 | IntegerMatrix comMat(rowSize, colSize +1); 52 | 53 | comMat(_ , 0) = plotNb; 54 | 55 | for (int i = 0; i < sizeI ; i++) { 56 | 57 | IntegerVector rowI = whichEq (plotNb,iD[i]); 58 | 59 | int plotID = rowI[0]; 60 | 61 | int species = sp[i]; 62 | 63 | comMat(plotID, species) = 1; 64 | 65 | } 66 | 67 | return comMat; 68 | 69 | } 70 | -------------------------------------------------------------------------------- /_resampling/_functions_TH/hcr.C.cpp: -------------------------------------------------------------------------------- 1 | // To enable the functionality provided by Armadillo's various macros, 2 | 3 | // simply include them before you include the RcppArmadillo headers. 
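// NOTE: HcrCPP() implements the heterogeneity-constrained random (HCR) selection on a
// big.matrix of pairwise dissimilarities. It draws nsampl random subsets of nout plots, computes
// the mean and the variance of the within-subset dissimilarities for each draw, ranks the draws
// by decreasing mean and by increasing variance, and returns the draw with the smallest summed
// rank, i.e. the subset that is as heterogeneous as possible while keeping its pairwise
// dissimilarities evenly spread. In the resampling script it is called for every over-sampled
// cell as HcrCPP(brayBalDist@address, nout=cutoff, nsampl=1000).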
4 | 5 | #define ARMA_NO_DEBUG 6 | 7 | // [[Rcpp::depends(RcppArmadillo, BH, bigmemory)]] 8 | 9 | // [[Rcpp::plugins(cpp11)]] 10 | 11 | #include 12 | 13 | #include 14 | 15 | #include 16 | 17 | #include 18 | 19 | using namespace Rcpp; 20 | 21 | using namespace arma; 22 | 23 | //seqInt function 24 | 25 | IntegerVector seqInt(double x, double y, double by) { 26 | 27 | IntegerVector anOut(1); 28 | 29 | // compute sequence 30 | 31 | double min_by = 1.e-8; 32 | 33 | if (by < min_by) min_by = by/100; 34 | 35 | double i = x + by; 36 | 37 | anOut(0) = x; 38 | 39 | while(i/min_by < y/min_by + 1) { 40 | 41 | anOut.push_back(i); 42 | 43 | i += by; 44 | 45 | } 46 | 47 | return anOut; 48 | 49 | } 50 | 51 | // stl_sort 52 | 53 | NumericVector stl_sort(NumericVector x) { 54 | 55 | NumericVector y = clone(x); 56 | 57 | std::sort(y.begin(), y.end()); 58 | 59 | return y; 60 | } 61 | 62 | // stl_sort_int 63 | 64 | IntegerVector stl_sort_int( IntegerVector x) { 65 | 66 | IntegerVector y = clone(x); 67 | 68 | std::sort(y.begin(), y.end()); 69 | 70 | return y; 71 | } 72 | 73 | // HCR function 74 | 75 | IntegerVector Hcr(arma::Mat dMat , int nout , int nsampl) { 76 | 77 | int nplots = dMat.n_rows; 78 | 79 | NumericVector meand( nsampl, 0.0 ); 80 | 81 | NumericVector vard( nsampl, 0.0 ); 82 | 83 | IntegerMatrix sel( nout, nsampl ); 84 | 85 | arma::Mat selMat( nout, nout ); 86 | 87 | IntegerVector plotID = seqInt( 0 , nplots -1 , 1 ); 88 | 89 | for (int i = 0 ; i < nsampl ; i++ ) { 90 | 91 | IntegerVector sampleID = Rcpp::RcppArmadillo::sample(plotID, nout, false , NumericVector::create()); 92 | 93 | sel( _ , i) = sampleID; 94 | 95 | arma::uvec permuID = as(sampleID); 96 | 97 | selMat = dMat.submat( permuID , permuID ); 98 | 99 | double sum = 0; 100 | 101 | int n = 0; 102 | 103 | for (int j = 0 ; j < nout ; j++){ 104 | 105 | for (int k = 0 ; k < nout ; k++){ 106 | 107 | if (j > k ) { 108 | 109 | sum += selMat(j,k); 110 | 111 | n += 1 ; 112 | 113 | } 114 | 115 | } 116 | } 117 | 118 | double tmpMean = sum/n ; 119 | 120 | double sce = 0; 121 | 122 | for (int j = 0 ; j < nout; j++){ 123 | 124 | for (int k = 0 ; k < nout; k++){ 125 | 126 | if (j > k ) { 127 | 128 | sce += pow(selMat(j,k) - tmpMean, 2.0); 129 | 130 | } 131 | 132 | } 133 | } 134 | 135 | double tmpVar = sce / (n - 1) ; 136 | 137 | meand[i] = tmpMean; 138 | 139 | vard[i] = tmpVar; 140 | 141 | } 142 | 143 | NumericVector sortdemean = stl_sort(-meand); 144 | 145 | NumericVector sortdevar = stl_sort(vard); 146 | 147 | IntegerVector rankdecmean = match(-meand, sortdemean); 148 | 149 | IntegerVector rankdevar = match(vard, sortdevar); 150 | 151 | IntegerVector sumVarMean = rankdecmean + rankdevar; 152 | 153 | IntegerVector sorVarMean = stl_sort_int(sumVarMean); 154 | 155 | IntegerVector rankFinal = match(sumVarMean, sorVarMean); 156 | 157 | int selecSample = which_min(rankFinal); 158 | 159 | IntegerVector res = sel(_, selecSample); 160 | 161 | return res; 162 | 163 | } 164 | 165 | // [[Rcpp::export]] 166 | 167 | IntegerVector HcrCPP(SEXP pInBigMat, int nout , int nsampl) { 168 | 169 | // First we tell Rcpp that the object we've been given is an external pointer. 
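// (On the R side this is the @address slot of the big.matrix, e.g. brayBalDist@address in the
// resampling script.) Note that Hcr() samples positions from seqInt(0, nplots-1, 1), so the
// vector returned to R holds 0-based row positions; when these are used directly as R
// subscripts (as in rowNames[selectedPlot]), a value of 0 is silently dropped and the remaining
// positions are shifted by one, so adding 1 to the returned indices before subscripting is
// probably what is intended.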
170 | 171 | XPtr xpMat(pInBigMat); 172 | 173 | IntegerVector res = Hcr(arma::Mat((double *)xpMat->matrix(), xpMat->nrow(), xpMat->ncol(), false), nout , nsampl); 174 | 175 | return wrap(res); 176 | } -------------------------------------------------------------------------------- /sPlot2_PlotDistribution.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(viridis) 3 | 4 | #library(raster) 5 | library(sp) 6 | library(sf) 7 | library(rgdal) 8 | library(rnaturalearth) 9 | library(dggridR) 10 | # library(rgeos) 11 | 12 | #save temporary files 13 | write("TMPDIR = /data/sPlot/users/Francesco/_tmp", file=file.path(Sys.getenv('TMPDIR'), '.Renviron')) 14 | write("R_USER = /data/sPlot/users/Francesco/_tmp", file=file.path(Sys.getenv('R_USER'), '.Renviron')) 15 | #rasterOptions(tmpdir="/data/sPlot/users/Francesco/_tmp") 16 | 17 | ## import data 18 | load(file = "_output/sPlot_OPEN.RData") 19 | load("/data/sPlot/releases/sPlot3.0/header_sPlot3.0.RData") 20 | 21 | ## Data Preparation for spatial plotting 22 | header.oa.sf <- SpatialPointsDataFrame(coords= header.oa %>% 23 | select(POINT_X, POINT_Y), 24 | proj4string = CRS("+init=epsg:4326"), 25 | data=data.frame(PlotObservationID=header.oa$PlotObservationID, 26 | Dataset=header.oa$Dataset)) %>% 27 | st_as_sf() %>% 28 | st_transform(crs = "+proj=eck4") 29 | 30 | header <- header %>% 31 | filter(!is.na(Latitude)) 32 | header.sf <- SpatialPointsDataFrame(coords= header %>% 33 | 34 | select(Longitude, Latitude), 35 | proj4string = CRS("+init=epsg:4326"), 36 | data=data.frame(PlotObservationID=header$PlotObservationID, 37 | Dataset=header$Dataset)) %>% 38 | st_as_sf() %>% 39 | st_transform(crs = "+proj=eck4") 40 | 41 | 42 | 43 | ### Template of Global map 44 | #download data from rnaturalearth package 45 | countries <- ne_countries(returnclass = "sf") %>% 46 | st_transform(crs = "+proj=eck4") %>% 47 | st_geometry() 48 | graticules <- ne_download(type = "graticules_15", category = "physical", 49 | returnclass = "sf") %>% 50 | st_transform(crs = "+proj=eck4") %>% 51 | st_geometry() 52 | bb <- ne_download(type = "wgs84_bounding_box", category = "physical", 53 | returnclass = "sf") %>% 54 | st_transform(crs = "+proj=eck4") %>% 55 | st_geometry() 56 | 57 | # create ggplot template of the world map 58 | w3a <- ggplot() + 59 | geom_sf(data = bb, col = "grey20", fill = "white") + 60 | geom_sf(data = graticules, col = "grey20", lwd = 0.1) + 61 | #geom_sf(data = countries, fill = "grey90", col = NA, lwd = 0.3) + 62 | coord_sf(crs = "+proj=eck4") + 63 | theme_minimal() + 64 | theme(axis.text = element_blank(), 65 | legend.title=element_text(size=12), 66 | legend.text=element_text(size=12), 67 | legend.background = element_rect(size=0.1, linetype="solid", colour = 1), 68 | legend.key.height = unit(1.1, "cm"), 69 | legend.key.width = unit(1.1, "cm")) + 70 | scale_fill_viridis() 71 | 72 | 73 | ### Map of plot distribution 74 | ### all plots from sPlot 3.0 75 | w3all <- w3a + 76 | geom_sf(data=header.sf, color=gray(0.5), pch="+", size=1, alpha=1/5) + # aes(col=Dataset), 77 | geom_sf(data = countries, col = "grey10", fill=NA, lwd = 0.3) + 78 | theme(legend.position = "none") 79 | 80 | ### open.access plots from paper #02 81 | w3oa <- w3all + 82 | geom_sf(data=header.oa.sf, color="red", pch="+", size=1, alpha=1/4) + # aes(col=Dataset), 83 | geom_sf(data = countries, col = "grey10", fill=NA, lwd = 0.3) + 84 | theme(legend.position = "none") 85 | 86 | ggsave(filename="_output/sPlot3_distribution.png", 
device="png", width=8, height=5, dpi = 300, plot = w3all) 87 | ggsave(filename="_output/sPlot21_oa_distribution.png", device="png", width=8, height=5, dpi = 300, plot = w3oa) 88 | 89 | 90 | 91 | 92 | ### Version 2 - hexagons 93 | header2 <- header.oa %>% 94 | select(PlotObservationID, POINT_Y, POINT_X) %>% 95 | filter(!(abs(POINT_X) > 171 & abs(POINT_Y) > 70)) 96 | dggs <- dgconstruct(spacing=300, metric=T, resround='down') 97 | 98 | #Get the corresponding grid cells for each plot 99 | header2$cell <- dgGEO_to_SEQNUM(dggs, header2$POINT_X, header2$POINT_Y)$seqnum 100 | 101 | #Calculate the number of plots for each cell (log10 scale) 102 | header.dggs <- header2 %>% 103 | group_by(cell) %>% 104 | summarise(value.out=log(n(), 10)) 105 | 106 | #Get the grid cell boundaries for cells 107 | grid <- dgcellstogrid(dggs, header.dggs$cell, frame=F) %>% 108 | st_as_sf() %>% 109 | mutate(cell = header.dggs$cell) %>% 110 | mutate(value.out=header.dggs$value.out) %>% 111 | st_transform("+proj=eck4") %>% 112 | st_wrap_dateline(options = c("WRAPDATELINE=YES")) 113 | 114 | ## plotting 115 | w3a + 116 | geom_sf(data=grid, aes(fill=value.out), lwd=0, alpha=0.9) + 117 | geom_sf(data = countries, col = "grey10", fill=NA, lwd = 0.3) + 118 | scale_fill_viridis( 119 | name="# plots", breaks=0:5, labels = c("1", "10", "100", 120 | "1,000", "10,000", "100,000"), option="viridis" ) 121 | 122 | --------------------------------------------------------------------------------
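The hexagon map in the last block above is displayed but never written to disk. A minimal sketch of how it could be saved, mirroring the ggsave() calls used for the point maps earlier in the script (the object name w3hex and the output filename are illustrative and not part of the original code):

w3hex <- w3a +
  geom_sf(data=grid, aes(fill=value.out), lwd=0, alpha=0.9) +
  geom_sf(data = countries, col = "grey10", fill=NA, lwd = 0.3) +
  scale_fill_viridis(name="# plots", breaks=0:5,
                     labels = c("1", "10", "100", "1,000", "10,000", "100,000"),
                     option="viridis")
ggsave(filename="_output/sPlot_oa_hex_distribution.png", device="png",
       width=8, height=5, dpi=300, plot=w3hex)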