Functional

We used cleaned taxonomy to retrieve data from TRY (Kattge et al. 2019) and BIOMASS (Réjou-Méchain et al. 2017). This corresponds to version 6.

Wood density

Using BIOMASS (Réjou-Méchain et al. 2017), we retrieved wood density for most species, with little differences between sites means.

Code
taxo <- read_tsv(taxonomy_file)
wd <- getWoodDensity(taxo$genus_cleaned, taxo$species_cleaned) %>%
  group_by(genus, species) %>%
  summarise(wd = mean(meanWD)) %>%
  na.omit() %>%
  rename(genus_cleaned = genus, species_cleaned = species)
taxo_wd <- left_join(taxo, wd)
g1 <- ggplot(taxo_wd, aes(wd, col = site)) +
  geom_density() +
  theme_bw() +
  xlab(expression("Wood Density [" ~ g / cm^3 ~ "]")) +
  theme(legend.key.size = unit(.5, "line"))
g2 <- ggplot(taxo_wd, aes(site, wd, col = site)) +
  geom_violin() +
  geom_boxplot(width = .2) +
  coord_flip() +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  ylab(expression("Wood Density [" ~ g / cm^3 ~ "]"))
cowplot::plot_grid(g1, g2, nrow = 2)

Wood density extracted from Réjou-Méchain et al. (2017).

TRY

In order to make a request for data on TRY (Kattge et al. 2019), we first retrieved the total species list available from TRY and matched our species to their species list to obtain corresponding species ID on TRY. By chance TRY is also using WordFlora reference. But they limit the requests to 200 species. We thus retrieved available species data by batch of 200 (see example below).

Code
try_sp <- read_tsv(taxonomy_file) %>%
  select(scientific_cleaned) %>%
  na.omit() %>%
  unique() %>%
  left_join(read_tsv("data/raw_data/TryAccSpecies.txt"),
    by = c("scientific_cleaned" = "AccSpeciesName")
  )
sp_req <- try_sp %>%
  select(AccSpeciesID) %>%
  na.omit() %>%
  unique() %>%
  arrange() %>%
  mutate(request = rep(1:100, each = 200)[seq_len(n())]) %>%
  group_by(request) %>%
  summarise(species_list = paste0(AccSpeciesID, collapse = ", "))
write_tsv(sp_req, "data/derived_data/try_species_requests.tsv")
Code
read_tsv("data/derived_data/try_species_requests.tsv", n_max = 1) %>%
  kable(caption = "Example of species ID list for TRY Data Explorer requests (limited to 200) for the first chunk.") # nolint
Example of species ID list for TRY Data Explorer requests (limited to 200) for the first chunk.
request species_list
1 53035, 8582, 14866, 53654, 14156, 15557, 27269, 56630, 29126, 52505, 31260, 30552, 4136, 17195, 8580, 32629, 39236, 8980, 3147, 86888, 52969, 38544, 30609, 27124, 36681, 39724, 26812, 40311, 36684, 37540, 46572, 34387, 44223, 22256, 49908, 52432, 13692, 430788, 33337, 44232, 5583, 30895, 12681, 43766, 54284, 30632, 33380, 52899, 246054, 56788, 12678, 43849, 11219, 14450, 18719, 14882, 56617, 33288, 24884, 38616, 34870, 20736, 408207, 50321, 54536, 54783, 103114, 40645, 35216, 30904, 44986, 25648, 52844, 40368, 41051, 22281, 32634, 430797, 30624, 52895, 44853, 37333, 43959, 43542, 33334, 21888, 26519, 38552, 45280, 29808, 20479, 15619, 40831, 19480, 55518, 47684, 19696, 55524, 36897, 44855, 55866, 56813, 52968, 92496, 43763, 4160, 7610, 10273, 17778, 40357, 11353, 56697, 27177, 57266, 38516, 53047, 29162, 22272, 50499, 46274, 54663, 43873, 57627, 57282, 52436, 30575, 14860, 43767, 39875, 32613, 14890, 21873, 99866, 29834, 74946, 5605, 52900, 12272, 33343, 27264, 8569, 39234, 42597, 84130, 10716, 40823, 40360, 11365, 40231, 11345, 44590, 43958, 37888, 8576, 18800, 101052, 53039, 35441, 32627, 3383, 50390, 39604, 29655, 57424, 56781, 35358, 55852, 35649, 54098, 11952, 3878, 34856, 51769, 36686, 38676, 38574, 3877, 50523, 33335, 5937, 6614, 19578, 22258, 52390, 3446, 51785, 22261, 38551, 43836, 33295, 43828, 13282, 32615, 50507, 54280, 29811, 43810, 19522, 30614, 2458

Next we could investigate trait data available for more than 40% of species, which were not many.

Code
list.files("data/derived_data/try_req/", full.names = TRUE) %>%
  lapply(read_tsv, skip = 3) %>%
  lapply(gather, species, N, -Trait, -TraitID) %>%
  bind_rows() %>%
  filter(N > 0) %>%
  group_by(Trait, TraitID) %>%
  summarise(N = n()) %>%
  filter(N > 0.4 * 1901) %>%
  ggplot(aes(fct_rev(fct_infreq(Trait, N)), N)) +
  geom_point() +
  theme_bw() +
  coord_flip() +
  theme(axis.title = element_blank())

Traits available for at least 50% of species available on TRY.

For the first test we thus used:

  • 14: Leaf nitrogen (N) content per leaf dry mass, mg/g
  • 3117: Leaf area per leaf dry mass (specific leaf area, SLA or 1/LMA): undefined if petiole is in- or excluded, mm2 mg-1
  • 4: Stem specific density (SSD, stem dry mass per stem fresh volume) or wood density, g/cm3
  • 3114: Leaf area (in case of compound leaves: leaflet, undefined if petiole is in- or excluded), mm2

The request takes times to be ready. Moreover, we should try to minimise the number of requests. This was a first test. But future tests should be discussed before.

Code
taxo <- read_tsv(taxonomy_file)
traits <- c(
  "Stem specific density (SSD, stem dry mass per stem fresh volume) or wood density", # nolint
  "Leaf area per leaf dry mass (specific leaf area, SLA or 1/LMA): undefined if petiole is in- or excluded", # nolint
  "Leaf nitrogen (N) content per leaf dry mass",
  "Leaf area (in case of compound leaves undefined if leaf or leaflet, undefined if petiole is in- or excluded)" # nolint
)
try <- read_tsv("data/raw_data/try/38775.txt") %>%
  filter(TraitName %in% traits) %>%
  filter(AccSpeciesName %in% taxo$scientific_cleaned) %>%
  select(AccSpeciesName, TraitName, StdValue) %>%
  rename(
    species = AccSpeciesName,
    trait = TraitName,
    value = StdValue
  ) %>%
  mutate(trait = recode(trait,
    "Stem specific density (SSD, stem dry mass per stem fresh volume) or wood density" = "wd", # nolint
    "Leaf area per leaf dry mass (specific leaf area, SLA or 1/LMA): undefined if petiole is in- or excluded" = "sla", # nolint
    "Leaf nitrogen (N) content per leaf dry mass" = "n",
    "Leaf area (in case of compound leaves undefined if leaf or leaflet, undefined if petiole is in- or excluded)" = "la" # nolint
  )) %>%
  group_by(species, trait) %>%
  summarise(value = mean(as.numeric(value), na.rm = TRUE)) %>%
  pivot_wider(values_from = "value", names_from = "trait")
write_tsv(try, "data/derived_data/try_extract.tsv")
Code
try <- read_tsv("data/derived_data/try_extract.tsv") %>%
  rename(scientific_cleaned = species) %>%
  rename(wd_try = wd)
taxo <- read_tsv(taxonomy_file)
taxo_try <- left_join(taxo, try)

We obtained numerous data across species for SLA, WD, and N:

Code
g1 <- ggplot(taxo_try, aes(wd_try, col = site)) +
  geom_density() +
  theme_bw() +
  xlab(expression("Wood Density [" ~ g / cm^3 ~ "]")) +
  theme(legend.key.size = unit(.5, "line"))
g2 <- ggplot(taxo_try, aes(site, wd_try, col = site)) +
  geom_violin() +
  geom_boxplot(width = .2) +
  coord_flip() +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  ylab(expression("Wood Density [" ~ g / cm^3 ~ "]"))
cowplot::plot_grid(g1, g2, nrow = 2)

Wood density extracted from Kattge et al. (2019).
Code
g1 <- ggplot(taxo_try, aes(sla, col = site)) +
  geom_density() +
  theme_bw() +
  xlab(expression("Specific Leaf Area [" ~ mm^2 ~ mg^{
    -1
  } ~ "]")) +
  scale_x_log10() +
  theme(legend.key.size = unit(.5, "line"))
g2 <- ggplot(taxo_try, aes(site, sla, col = site)) +
  geom_violin() +
  geom_boxplot(width = .2) +
  coord_flip() +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  ylab(expression("Specific Leaf Area [" ~ mm^2 ~ mg^{
    -1
  } ~ "]")) +
  scale_y_log10()
cowplot::plot_grid(g1, g2, nrow = 2)

Specific leaf area extracted from Kattge et al. (2019).
Code
g1 <- ggplot(taxo_try, aes(la, col = site)) +
  geom_density() +
  theme_bw() +
  xlab(expression("Leaf Area [" ~ mm^2 ~ "]")) +
  scale_x_log10() +
  theme(legend.key.size = unit(.5, "line"))
g2 <- ggplot(taxo_try, aes(site, la, col = site)) +
  geom_violin() +
  geom_boxplot(width = .2) +
  coord_flip() +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  ylab(expression("Leaf Area [" ~ mm^2 ~ "]")) +
  scale_y_log10()
cowplot::plot_grid(g1, g2, nrow = 2)

Leaf area extracted from Kattge et al. (2019).
Code
g1 <- ggplot(taxo_try, aes(n, col = site)) +
  geom_density() +
  theme_bw() +
  xlab(expression("Nitrogen [" ~ mg / g ~ "]")) +
  theme(legend.key.size = unit(.5, "line"))
g2 <- ggplot(taxo_try, aes(site, n, col = site)) +
  geom_violin() +
  geom_boxplot(width = .2) +
  coord_flip() +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  ylab(expression("Nitrogen [" ~ mg / g ~ "]"))
cowplot::plot_grid(g1, g2, nrow = 2)

Leaf nitrogen extracted from Kattge et al. (2019).

All raw

We assembled all traits data from TRY (Kattge et al. 2019), BIOMASS (Réjou-Méchain et al. 2017), and TALLO (Jucker et al. 2022), and obtained the following correlations:

Code
taxo_traits <- taxo %>%
  left_join(wd) %>%
  left_join(try)
Code
taxo_traits %>%
  select(site, scientific_cleaned, wd, la, sla, n, wd_try) %>%
  gather(trait, value, -site, -scientific_cleaned) %>%
  na.omit() %>%
  group_by(site, trait, scientific_cleaned) %>%
  summarise(mean = mean(value)) %>%
  group_by(site, trait) %>%
  summarise(n = n()) %>%
  pivot_wider(names_from = trait, values_from = n) %>%
  kable()
site la n sla wd wd_try
Antimary APU2 131 125 118 161 129
Antimary APU3 108 102 100 139 110
BAFOG 224 230 232 311 244
Corinto 163 175 196 262 174
Ecosilva 122 112 106 160 122
Embrapa Acre 122 118 113 158 121
Gola 20 68 31 240 132
Iracema 159 142 133 200 156
Iwokrama 90 73 73 113 93
Jari 282 237 219 390 270
Jenaro Herrera 230 182 169 331 191
Kabo 181 153 158 240 184
Kibale 7 26 12 75 43
Lesong 4 21 25 76 57
Malinau 15 101 146 497 316
Manare 153 181 187 226 150
Mbaiki 20 93 34 369 169
Mil 40 29 21 48 34
Misiones 24 38 57 115 74
Moju 258 217 200 325 247
Montagne Tortue 181 202 212 249 177
Nelliyampathy 6 12 11 69 38
PAD Limoeiro Tabocal 150 140 132 189 146
Pad Limo 2 Barracos 84 75 70 103 81
Pad Limo Chico Bocao 133 125 119 166 133
Pad Limo Cumaru 146 132 126 183 144
Pad Limo Jatoba 135 123 118 166 136
Pad Limo Pocao 91 84 81 114 88
Pad Limo STCP 130 121 116 158 127
Paracou 352 370 382 538 358
Peixoto 127 117 111 174 123
Peteco 221 199 187 295 210
STREK 9 76 111 314 211
SUAS 10 52 75 194 125
Sao Nicolau 95 90 82 117 96
Ser Filipinas 99 99 92 132 102
Sungai Lalang 3 18 23 68 50
Tapajos Km114 286 252 224 373 264
Tapajos Km67 259 232 210 367 252
Tene 25 59 36 196 105
Tirimbina 185 199 218 291 195
Ulu Muda 1 13 14 43 30
Uppangala 9 21 19 121 78
Code
taxo_traits %>%
  select(wd, wd_try, sla, la, n) %>%
  cor(use = "pairwise.complete.obs") %>%
  corrplot::corrplot(type = "upper", diag = FALSE)

All imputed

We used missing data imputation with random forest for with the missForest R package to further impute missing trait values (Gendre et al. 2024). We first aggregated data at the genus level with median trait value to inform genus lacking traits data or species. We obtained for imputation a normalised root mean square error of NRMSE=1.007. We obtained the global PCA below showed globally and per site.

Code
raw <- taxo_traits %>%
  filter(!is.na(genus_cleaned)) %>%
  select(genus_cleaned, wd, sla, la, n) %>%
  group_by(genus_cleaned) %>%
  summarise_all(median, na.rm = TRUE) %>%
  filter(!(is.na(wd) & is.na(sla) & is.na(la) & is.na(n)))
imputation <- missForest::missForest(
  raw %>%
    select(-genus_cleaned) %>%
    as.data.frame(),
  verbose = TRUE
)
imputation$OOBerror
imputation$ximp %>%
  mutate(genus_cleaned = raw$genus_cleaned, .before = 1) %>%
  write_tsv("data/derived_data/imputed_traits.tsv")
Code
imputed <- read_tsv("data/derived_data/imputed_traits.tsv")
autoplot(
  princomp(select(imputed, wd, sla, la, n) %>%
    mutate(sla = log(sla), la = log(la)), cor = TRUE),
  data = imputed, alpha = 0.25, col = "lightgrey",
  loadings.label.size = 6,
  loadings.label.colour = "red", loadings.label.vjust = 1.1,
  loadings.label.repel = TRUE,
  loadings = TRUE, loadings.label = TRUE, loadings.colour = "red"
) +
  coord_equal() +
  geom_hline(aes(yintercept = 0), col = "black", linetype = "dotted") +
  geom_vline(aes(xintercept = 0), col = "black", linetype = "dotted") +
  theme_bw() +
  geom_density_2d_filled(alpha = .5) +
  theme(legend.position = "none")

Imputed traits global principal component analysis.

Traits data

All traits data are saved in outputs/traits_vX.tsv with the following columns:

  • scientific_cleaned: the cleaned scientific name for junction with the taxonomic data
  • trait: the trait acronym or code
  • species_trait_value_raw: the species median trait raw value from TRY
  • genus_trait_value_imputed: the genus median trait imputed value in case of missing raw values
  • definition: the trait definition
  • unit: the trait unit
Code
all_traits <- taxo_traits %>%
  select(scientific_cleaned, genus_cleaned, wd, sla, la, n) %>%
  gather(
    trait, species_trait_value_raw,
    -scientific_cleaned, -genus_cleaned
  ) %>%
  filter(!is.na(scientific_cleaned)) %>%
  unique() %>%
  left_join(imputed %>%
    select(genus_cleaned, wd, sla, la, n) %>%
    gather(trait, genus_trait_value_imputed, -genus_cleaned)) %>%
  mutate(trait = recode(trait,
    "wd" = "WD", "sla" = "SLA",
    "la" = "LA", "n" = "N"
  )) %>%
  mutate(definition = recode(trait,
    "WD" = "Stem specific wood density",
    "SLA" = "Specific leaf area",
    "LA" = "Leaf area",
    "N" = "Leaf nitrogen content per leaf dry mass",
    "hmax" = "Asymptotic height"
  )) %>%
  mutate(unit = recode(trait,
    "WD" = "g cm-3",
    "SLA" = "mm2 mg-1",
    "LA" = "mm2",
    "N" = "mg g-1",
    "hmax" = "m"
  ))
write_tsv(all_traits, traits_file)
Code
traits <- read_tsv(traits_file)
taxo <- read_tsv(taxonomy_file)
taxo %>%
  filter(
    !is.na(scientific_cleaned),
    !is.na(species_cleaned)
  ) %>%
  select(scientific_cleaned) %>%
  unique() %>%
  mutate(trait = scientific_cleaned %in% traits$scientific_cleaned) %>%
  filter(!trait)