Cleaning

We then cleaned the content of the raw species data. This step corresponds to version 6.

Names corrections

First, we cleaned family, genus, and species names using contingency tables built from the dubious names identified in the previous chapter.

Code
# Raw family names treated as "not a real family" (placeholders, unknowns,
# spreadsheet artefacts). They are used as the `family_raw` keys of
# `family_tab` below.
fam_wrong <- c(
  "#N/A", "+", "muerto", "Indet.", "Not in DB", "unid.",
  "Unidentified", "unidentified", "ND", "No determinado", "Indet"
)
# Raw genus names treated as dubious (empty strings, "nd*" codes, vernacular
# timber groups, single letters, ...). Used as the `genus_raw` keys of
# `genus_tab` below.
gen_wrong <- c(
  "", "#N/A", "A", "Indet", "nd", "sp", "sp1", "sp5", "Indetermined",
  "KERUING", "MERANTI", "MERSAWA", "BALAU", "muerto", "Unidentified",
  "Nd", "nd10", "nd11", "nd14", "nd15", "nd16", "nd17", "nd2", "nd22",
  "nd23", "nd25", "nd26", "nd27", "nd28", "nd29", "nd33", "nd34",
  "nd4", "nd6", "nd7", "nd9", "N", "F", "cf", "No", "UN", "Un", "Spt",
  "Unident", "Unknow", "unknown", "Unknown", "ni", "n", "OT"
)
# Raw species epithets treated as dubious (morphospecies codes, bare numbers,
# author abbreviations, collector codes, ...). Used as the `species_raw` keys
# of `species_tab` below.
# NOTE(review): several values are listed more than once (e.g. "spp", "sp2",
# "X", "XIV", "1", "2"); keep that in mind wherever this vector is used as a
# join key, because duplicated keys multiply the matching rows.
# NOTE(review): "jariensis", "jauari", "javanica" and "javanicum" look like
# genuine epithets rather than codes -- confirm they belong in this list.
sp_wrong <- c(
  "Indet", "Indet.", "sp", "sp.", "Schaeff", "Schaeff.", "Mast", "Welw",
  "Stapf", "Hook", "Gilg", "Benth", "flacouirtiaceae", "Determinado",
  "BERMINYAK", "TIDAK", "KUNING", "spp", "", "1", "10", "11", "12", "13",
  "14", "15", "16", "17", "18", "19", "2", "20", "21", "22", "23", "25",
  "27", "28", "3", "4", "5", "6", "7", "8", "9", "a", "A", "sp1", "sp5",
  "L", "muerto", "JAP1001", "JAP1028", "JAP1036", "JAP1291", "JAP1398",
  "JAP1457", "JAP1627", "JAP1725", "JAP1799", "JAP1856", "JAP1873",
  "JAP1882", "JAP1891", "jariensis", "jauari", "javanica", "javanicum",
  "JCO355", "JCO407", "JCO416", "JCO578", "JCO597", "JCO670", "JCO696",
  "JCO710", "JCO736", "GFL1876", "GFL1997", "GFL2022", "GFL2211",
  "GFL2386", "GFL2411", "GFL2488", "GFL2606", "GFL2703", "GFL2772",
  "GFL2941", "GFL3093", "GFL3170", "GFL3182", "GFL3224", "GFL3284",
  "GFL3295", "GFL3481", "GFL3661", "GFL3686", "GFL6037", "GHP102",
  "JAP1000", "id", "indet", "Ridl", "sp13", "sp2", "sp3", "sp53",
  "species", "X", "XIV", "XVI", "spp", "n", "sp2", "a", "int",
  "L", "sur", "P", "4", "sp3", "id", "X", "2", "23", "1", "XVI",
  "19", "17", "5", "7", "22", "21", "3", "10", "11", "18", "16", "8",
  "XIV", "20", "14", "dao", "13", "6", "9", "25", "12", "28", "15",
  "27", "pin", "cf", "nd", "or", "DC"
)
# Lookup tables mapping each dubious raw name to its correction.
# Columns, per taxonomic level:
#   *_corrected  : the replacement value (always NA: dubious names are blanked)
#   *_raw        : the dubious raw name, used as the join key
#   *_correction : flag set to 1 so that joined rows can be recognised
# The keys are de-duplicated with unique(): the source vectors contain
# repeated values (e.g. "spp", "sp2", "X" in `sp_wrong`), and a duplicated key
# would duplicate every matching inventory row in the subsequent left joins,
# inflating both the data and the correction counts.
family_tab <- data.frame(
  family_corrected = NA_character_,
  family_raw = unique(fam_wrong),
  family_correction = 1
)
genus_tab <- data.frame(
  genus_corrected = NA_character_,
  genus_raw = unique(gen_wrong),
  genus_correction = 1
)
species_tab <- data.frame(
  species_corrected = NA_character_,
  species_raw = unique(sp_wrong),
  species_correction = 1
)
# Apply the dubious-name lookup tables to the prepared inventory data.
# For each level (family, genus, species):
#   1. the lookup table is left-joined on the `*_raw` column, which adds
#      `*_corrected` (NA) and `*_correction` (1) to rows holding a dubious name;
#   2. rows without a correction flag keep their raw name as `*_corrected`,
#      rows with a flag keep the NA coming from the lookup table.
# Join keys are spelled out so the join never silently picks up another
# column that happens to share a name with the lookup table.
corrected <- read_tsv(preped_file) %>%
  rename(
    scientific_raw = scientific, vernacular_raw = vernacular,
    family_raw = family, genus_raw = genus, species_raw = species
  ) %>%
  left_join(family_tab, by = "family_raw") %>%
  mutate(
    family_corrected = ifelse(
      is.na(family_correction), family_raw, family_corrected
    )
  ) %>%
  left_join(genus_tab, by = "genus_raw") %>%
  mutate(
    genus_corrected = ifelse(
      is.na(genus_correction), genus_raw, genus_corrected
    )
  ) %>%
  left_join(species_tab, by = "species_raw") %>%
  mutate(
    species_corrected = ifelse(
      is.na(species_correction), species_raw, species_corrected
    )
  )
# Persist the corrected table for the following steps.
write_tsv(corrected, corrected_file)
Code
# Count, per taxonomic level, how many rows carry a correction flag.
# pivot_longer() replaces the superseded gather(); values_drop_na = TRUE
# drops the rows without a flag, and count() tallies the remaining flags per
# level (rows sorted by level).
read_tsv(corrected_file) %>%
  select(family_correction, genus_correction, species_correction) %>%
  pivot_longer(
    everything(),
    names_to = "level",
    values_to = "value",
    values_drop_na = TRUE
  ) %>%
  count(level, name = "corrections") %>%
  kable(
    format.args = list(big.mark = ","),
    caption = "Number of corrections per level."
  )
Number of corrections per level.
level corrections
family_correction 1,038
genus_correction 2,500
species_correction 9,889

World Flora Online (WFO)

We used WorldFlora and fuzzyjoin for synonymy and misspelling corrections. Beware: the R package is very memory-intensive, so the code processes the species names in batches. You might want to adapt the batch size to the capacity of your computer. I also subsampled the backbone to the list of families found in the data to increase speed, but this might not be optimal.

Code
library(tidyverse)
library(WorldFlora)
library(fuzzyjoin)
# Match the corrected names against the World Flora Online backbone in
# batches, writing intermediate files to a scratch directory that is removed
# at the end.
batch_size <- 1000 # adjust to your RAM
wfo_dir <- "data/derived_data/wfo"
backbone_path <- file.path(wfo_dir, "backbone.tsv")
taxo_path <- file.path(wfo_dir, "taxo.tsv")
# Do not warn when the scratch directory is left over from a previous run.
dir.create(wfo_dir, recursive = TRUE, showWarnings = FALSE)

# Restrict the backbone to the families present in the data, plus a few
# families added by hand.
families <- read_tsv(corrected_file) %>%
  distinct(family_corrected)
wfo <- read_tsv("data/raw_data/WFO_Backbone.zip")
wfo_sub <- wfo %>%
  filter(family %in% c(
    families$family_corrected,
    # "Zygophyllaceae" previously carried a leading space and therefore never
    # matched any backbone family.
    "Huaceae", "Zygophyllaceae", "Ctenolophonaceae",
    "Lepidobotryaceae", "Thomandersiaceae", "Connaraceae",
    "Anisophylleaceae", "Tetramelaceae", "Picramniaceae",
    "Escalloniaceae"
  ))
write_tsv(wfo_sub, backbone_path)

# Unique "genus species" strings to match (species left empty when missing),
# sorted and split into consecutive batches of `batch_size` names.
taxo <- read_tsv(corrected_file) %>%
  filter(!is.na(genus_corrected)) %>%
  mutate(species_corrected = ifelse(is.na(species_corrected),
    "", species_corrected
  )) %>%
  transmute(scientific = paste(genus_corrected, species_corrected)) %>%
  arrange(scientific) %>%
  unique()
n_batch <- ceiling(nrow(taxo) / batch_size)
taxo <- taxo %>%
  mutate(batch = ceiling(row_number() / batch_size))
write_tsv(taxo, taxo_path)

# Free memory before the matching, which is the memory-hungry step.
rm(taxo, wfo, wfo_sub, families)
gc()

# seq_len() keeps the loop empty when there is nothing to match (1:0 would
# iterate over c(1, 0)).
for (i in seq_len(n_batch)) {
  taxo <- read_tsv(taxo_path) %>%
    filter(batch == i)
  cleaned <- WFO.match.fuzzyjoin(
    spec.data = taxo$scientific,
    WFO.file = backbone_path
  )
  write_tsv(
    cleaned,
    paste0("data/derived_data/wfo/wfo_out_part", i, ".tsv")
  )
  # Release the batch at the end of the iteration: removing the objects at
  # the top of the loop warned on the first pass, when neither existed yet.
  rm(taxo, cleaned)
  gc()
}
unlink(taxo_path)
unlink(backbone_path)

# Gather the per-batch outputs (only the part files, whatever else may sit in
# the scratch directory) into the final WFO file, then drop the scratch
# directory.
part_files <- list.files(
  wfo_dir,
  pattern = "^wfo_out_part[0-9]+\\.tsv$",
  full.names = TRUE
)
cleaned <- read_tsv(part_files)
write_tsv(cleaned, wfo_file)
unlink(wfo_dir, recursive = TRUE)

4,677 species matched a species from World Flora Online (WFO); the remaining 41 did not and can be curated manually:

Code
# Number of queried names for which every WFO result row is flagged as
# matched.
wfo <- read_tsv(wfo_file)
wfo %>%
  group_by(spec.name) %>%
  summarise(matched = all(Matched)) %>%
  # count() replaces a summarise() that returned one row per name (deprecated
  # multi-row summarise) and was then collapsed again with unique().
  count(matched, name = "N") %>%
  filter(matched) %>%
  # Only the matched row is kept, for which the `names` column is always
  # empty; it is retained so the printed table keeps its three columns.
  mutate(names = "") %>%
  kable(
    format.args = list(big.mark = ","),
    caption = "Number of matched and unmatched species."
  )
Number of matched and unmatched species.
matched N names
TRUE 4,677

Among matched species, most were exact matches (i.e. not misspelled) with a single match in WFO. Still, 257 exact matches had multiple matches in WFO and 444 single matches were fuzzy (misspelled), both of which need to be investigated:

Code
# Cross-tabulate matched names by match quality (exact vs fuzzy) and by
# number of WFO matches (unique vs multiple).
per_name <- wfo %>%
  filter(Matched) %>%
  group_by(spec.name) %>%
  summarise(fuzzy = all(Fuzzy), unique = all(Unique))
per_name %>%
  count(fuzzy, unique, name = "N") %>%
  mutate(
    fuzzy = ifelse(fuzzy, "fuzzy", "exact"),
    unique = ifelse(unique, "unique", "multiple")
  ) %>%
  pivot_wider(names_from = "unique", values_from = "N") %>%
  kable(
    format.args = list(big.mark = ","),
    caption = "Number of fuzzy and unique species."
  )
Number of fuzzy and unique species.
fuzzy multiple unique
exact 257 3,868
fuzzy 108 444

Manual curation

The 38 species that did not match WFO were manually curated by searching WFO and the web. Some were missed because their family was absent from the WFO backbone subset, some did not match any species name, some used subspecies names, and others were genuinely misspelled.

Code
# Names for which not every WFO result row is flagged as matched.
# NOTE(review): `t` masks base::t() for the rest of the session; a more
# descriptive name would be safer if nothing else depends on it.
t <- wfo %>%
  group_by(spec.name) %>%
  summarise(matched = all(Matched)) %>%
  filter(!matched)
# Manually curated taxonomy, entered in three successive tables. Each row maps
# a corrected scientific name (the join key used later) to the family, genus
# and species to retain; NA means the level could not be resolved.
manual_curration_1 <- tribble(
  ~scientific_corrected, ~family_cleaned, ~genus_cleaned, ~species_cleaned,
  "Alseodaphne ceratoxylon", "Lauraceae", NA, NA,
  "Alseodaphne dewildei", "Lauraceae", NA, NA,
  "Alseodaphne havilandii", "Lauraceae", NA, NA,
  "Aniba puchury", "Lauraceae", "Aniba", "puchury-minor",
  "Aniba squarenses", "Lauraceae", "Aniba", NA,
  "Balanites wilsoniana", "Zygophyllaceae", "Balanites", "obovata",
  "Brosimum obovata", "Moraceae", "Brosimum", "acutifolium",
  "Castanopsis cantleyanus", "Fagaceae", "Castanopsis", NA,
  "Chrysophyllum pachycarpa", "Sapotaceae", "Chrysophyllum", "lucentifolium",
  "Coordercinodendron annatum", NA, NA, NA,
  "Dacryodes pachyphyllus", "Burseraceae", "Dacryodes", NA,
  "Dehaassia microcephala", "Lauraceae", "Dehaasia", NA,
  "Dipterocarpus palemb", "Dipterocarpaceae", "Dipterocarpus", NA,
  "Fernandoa adolfi", "Bignoniaceae", "Fernandoa", "adolfi-friderici",
  "Handroanthus campestris", "Bignoniaceae", "Handroanthus", NA,
  "Ichthyophora elegans", "Sapotaceae", "Ichthyophora", NA,
  "Lacunaria folha", "Ochnaceae", "Lacunaria", NA,
  "Mabia cuspitada", NA, NA, NA,
  "Madhuca lanateramula", "Sapotaceae", "Madhuca", NA,
  "Mezilaurus mezilaurus", "Lauraceae", "Mezilaurus", NA,
  "Micropholis guyanensis subsp", "Sapotaceae", "Micropholis", "guyanensis",
  "Myrciaria goiabinha", "Myrtaceae", "Myrciaria", NA,
  "Parinari impressa", "Rosaceae", NA, NA,
  "Spoctubilis", NA, NA, NA,
  "Syzygiopsis pachycarpa", "Sapotaceae", "Pouteria", NA,
  "Syzygium clavimyrtus", "Myrtaceae", "Syzygium", NA,
  "Teijsmanniodendron simpli", "Lamiaceae", "Teijsmanniodendron", NA,
  "Xanoquito", NA, NA, NA,
) %>% mutate(cleaning_type = "manual")
manual_curration_2 <- tribble(
  ~scientific_corrected, ~family_cleaned, ~genus_cleaned, ~species_cleaned,
  "Desconocido", NA, NA, NA,
  "Desconocido Desconocido", NA, NA, NA,
  "Hedyosmum bonplandianum", "Chloranthaceae", "Hedyosmum", "bonplandianum",
  "Liana NULL", NA, NA, NA,
  "Ndengbeyawie", NA, NA, NA,
  "Piptocarpha poeppigiana", "Asteraceae", "Piptocarpha", "poeppigiana",
  "Pouteria neoxythece", "Sapotaceae", "Pouteria", NA,
  "Pseudobombax coriacea", "Malvaceae", "Pseudobombax", NA,
  "Smeathmannia laevigata", "Passifloraceae", "Smeathmannia", "laevigata",
  "Tabernaemontana heptanphyllum", NA, NA, NA,
  "Achatocarpus praecox", "Achatocarpaceae", "Achatocarpus", "praecox"
) %>% mutate(cleaning_type = "manual")
manual_curration_3 <- tribble(
  ~scientific_corrected, ~family_cleaned, ~genus_cleaned, ~species_cleaned,
  "Anthoshorea group", "Dipterocarpaceae", "Anthoshorea", NA,
  "Hopea nervosaHopea", "Dipterocarpaceae", "Hopea", "nervosa",
  "Keruing putih", NA, NA, NA
) %>% mutate(cleaning_type = "manual")

# Stack the three tables into the single curation table.
manual_curration <- bind_rows(manual_curration_1, manual_curration_2, manual_curration_3)
# Unmatched names that are not covered by the curation table yet.
# NOTE(review): `missing` masks base::missing() at top level.
missing <- t %>% filter(!(spec.name %in% manual_curration$scientific_corrected))
kable(manual_curration, caption = "Manually curated taxonomy that did not match WFO.") # nolint
Manually curated taxonomy that did not match WFO.
scientific_corrected family_cleaned genus_cleaned species_cleaned cleaning_type
Alseodaphne ceratoxylon Lauraceae NA NA manual
Alseodaphne dewildei Lauraceae NA NA manual
Alseodaphne havilandii Lauraceae NA NA manual
Aniba puchury Lauraceae Aniba puchury-minor manual
Aniba squarenses Lauraceae Aniba NA manual
Balanites wilsoniana Zygophyllaceae Balanites obovata manual
Brosimum obovata Moraceae Brosimum acutifolium manual
Castanopsis cantleyanus Fagaceae Castanopsis NA manual
Chrysophyllum pachycarpa Sapotaceae Chrysophyllum lucentifolium manual
Coordercinodendron annatum NA NA NA manual
Dacryodes pachyphyllus Burseraceae Dacryodes NA manual
Dehaassia microcephala Lauraceae Dehaasia NA manual
Dipterocarpus palemb Dipterocarpaceae Dipterocarpus NA manual
Fernandoa adolfi Bignoniaceae Fernandoa adolfi-friderici manual
Handroanthus campestris Bignoniaceae Handroanthus NA manual
Ichthyophora elegans Sapotaceae Ichthyophora NA manual
Lacunaria folha Ochnaceae Lacunaria NA manual
Mabia cuspitada NA NA NA manual
Madhuca lanateramula Sapotaceae Madhuca NA manual
Mezilaurus mezilaurus Lauraceae Mezilaurus NA manual
Micropholis guyanensis subsp Sapotaceae Micropholis guyanensis manual
Myrciaria goiabinha Myrtaceae Myrciaria NA manual
Parinari impressa Rosaceae NA NA manual
Spoctubilis NA NA NA manual
Syzygiopsis pachycarpa Sapotaceae Pouteria NA manual
Syzygium clavimyrtus Myrtaceae Syzygium NA manual
Teijsmanniodendron simpli Lamiaceae Teijsmanniodendron NA manual
Xanoquito NA NA NA manual
Desconocido NA NA NA manual
Desconocido Desconocido NA NA NA manual
Hedyosmum bonplandianum Chloranthaceae Hedyosmum bonplandianum manual
Liana NULL NA NA NA manual
Ndengbeyawie NA NA NA manual
Piptocarpha poeppigiana Asteraceae Piptocarpha poeppigiana manual
Pouteria neoxythece Sapotaceae Pouteria NA manual
Pseudobombax coriacea Malvaceae Pseudobombax NA manual
Smeathmannia laevigata Passifloraceae Smeathmannia laevigata manual
Tabernaemontana heptanphyllum NA NA NA manual
Achatocarpus praecox Achatocarpaceae Achatocarpus praecox manual
Anthoshorea group Dipterocarpaceae Anthoshorea NA manual
Hopea nervosaHopea Dipterocarpaceae Hopea nervosa manual
Keruing putih NA NA NA manual

Misspelling (fuzzy)

By default WorldFlora limits fuzzy distance to 4, which seems more than acceptable and could be increased, but this can also be dealt with manually.

Code
# Distribution of fuzzy distances among fuzzily matched names.
# distinct() keeps one row per (name, distance) pair; it replaces a
# summarise(unique(...)) call, which returns several rows per group whenever a
# name has more than one distance (deprecated behaviour of summarise()).
wfo %>%
  filter(Matched, Fuzzy) %>%
  distinct(spec.name, Fuzzy.dist) %>%
  count(fuzzy_dist = Fuzzy.dist, name = "N") %>%
  kable(caption = "Number of misspelled species by the number of misspelled letters.") # nolint
Number of misspelled species by the number of misspelled letters.
fuzzy_dist N
1 247
2 181
3 75
4 49

However, among the misspelled species with multiple matches, 43 have multiple newly accepted names:

Code
# Fuzzily matched names with several WFO matches that resolve to more than
# one distinct newly accepted name; the candidates are listed comma-separated.
fuzzy_candidates <- wfo %>%
  filter(Matched, Fuzzy, !Unique, New.accepted) %>%
  distinct(spec.name, scientificName)
fuzzy_candidates %>%
  group_by(spec.name) %>%
  filter(n() > 1) %>%
  summarise(scientificName = toString(scientificName)) %>%
  kable(caption = "Fuzzy match with multiple newly accepted names.")
Fuzzy match with multiple newly accepted names.
spec.name scientificName
Acioa pallens Senegalia nigrescens, Dactyladenia pallescens, Aulacocalyx pallens, Viola macloskeyi
Aglaia cordifolia Acacia hastulata, Tilia platyphyllos subsp. cordifolia, Aglaia exstipulata, Aglaia elaeagnoidea, Macaranga cordifolia
Aglaia polyandra Aglaia tomentosa, Aglaia simplicifolia
Alangium Rhizophora, Forsythia
Alsophila Rinorea, Machaonia, Gentiana, Paxistima, Procris
Aniba albescens Manilkara valenzuelana, Vicia ochroleuca subsp. ochroleuca, Vachellia tortuosa, Carissa spinarum, Richetia hopeifolia, Scilla nana, Adina eurhyncha, Dactyladenia pallescens, Griffitharia pallescens, Nolana albescens, Hopea sangal
Bridelia airy Bridelia scleroneura subsp. scleroneura, Cleistanthus rufus
Celtis adolfi Celtis jessoensis, Trema micranthum, Celtis tetrandra, Celtis iguanaea
Chionanthus pauciflora Chionanthus ramiflorus, Chionanthus parviflorus
Colubrina acreana Colubrina arborescens, Colubrina beccariana, Emmenosperma papuanum
Crogbei Medicosma, Hypericum, Barleria
Cyathea Aphelandra, Celastrus, Cynanchum, Synechanthus
Drypetes Vahl Drypetes venusta, Drypetes natalensis var. natalensis, Drypetes floribunda
Enademia Grevillea, Crateva
Ficus donnell Ficus insculpta, Ficus velutina, Ficus thonningii, Ficus glumosa
Fragrea Potentilla, Parietaria, Boucerosia
Gargium Rhizophora, Forsythia, Galium
Grewia crassifolia Microlicia crassifolia, Pseudopanax crassifolius, Microcos grandifolia, Gyrotaenia crassifolia
Heritiera acuminata Tabernaemontana catharinensis, Alibertia edulis var. edulis, Gaertnera junghuhniana, Polyscias acuminata
Indora Maianthemum, Helicteres
Inga laterifolia Campsiandra laurifolia, Zygia latifolia
Kafer Symplocos, Coffea
Knema tomentosa Desmos cochinchinensis, Maesa macrophylla, Pergularia tomentosa, Palicourea tomentosa, Urena lobata subsp. lobata, Urena lobata subsp. sinuata
Magnolia candollii Magnolia liliifera, Magnolia acuminata var. acuminata
Maltanranthos Caryopteris, Catanthera
Measopsis Marrubium, Fraxinus
Meranti Evolvulus, Croton, Cyphokentia, Ceratonia, Myrsine, Pentapetes
Mesua congestiflora Macroscepis pleistantha, Ternstroemia congestiflora, Miconia leacongestiflora
Mesua cornata Kayea kunstleri, Kayea coriacea
Paropsia guineensis Chamaecrista mimosoides, Carapa procera, Garcinia smeathmanii
Pelawan Myristica, Pterospermum, Pellionia
Perospermum Bonamia, Bhesa
Pouteria eyma Pouteria cuspidata subsp. dura, Pleioluma firma, Elaeoluma nuda, Micropholis guyanensis subsp. guyanensis
Pouteria pouteria Pleioluma baueri, Planchonella contermina, Pouteria torta subsp. tuberculata, Xantolis hookeri, Pouteria glomerata subsp. glomerata, Chrysophyllum pomiferum, Pouteria durlandii subsp. durlandii, Chrysophyllum sanguinolentum subsp. spurium
Protium protium Protium heptaphyllum subsp. heptaphyllum, Protium paniculatum var. riedelianum
Ragala balata Rotala rubra, Myristica bialata var. bialata, Erythrophysa alata
Randia Longflora Oxyceros longiflorus, Euclinia longiflora, Rothmannia longiflora
Shorea leporosula Rubroshorea leprosula, Rubroshorea platycarpa
Shorea lepranda Anthoshorea henryana, Rubroshorea leprosula, Rubroshorea lepidota, Rubroshorea platycarpa
Solanum granulosum Lycianthes pauciflora, Solanum, Lycianthes glandulosa
Solanum sanctae Solanum incanum, Solanum, Solanum torvum
Zeylanica Crotalaria, Casearia
shorea parvifolia Rubroshorea parvifolia, Rubroshorea ovata

Synonymy (multiple matches)

Similarly, among the correctly spelled species with multiple matches, 40 have multiple newly accepted names:

Code
# Exactly matched names with several WFO matches that resolve to more than
# one distinct newly accepted name; the candidates are listed comma-separated.
exact_candidates <- wfo %>%
  filter(Matched, !Fuzzy, !Unique, New.accepted) %>%
  distinct(spec.name, scientificName)
exact_candidates %>%
  group_by(spec.name) %>%
  filter(n() > 1) %>%
  summarise(scientificName = toString(scientificName)) %>%
  kable(caption = "Exact match with multiple newly accepted names.")
Exact match with multiple newly accepted names.
spec.name scientificName
Acacia polyphylla Senegalia polyphylla, Parasenegalia visco
Aglaia edulis Aglaia samoensis, Aglaia saltatorum
Anthocephalus chinensis Breonia chinensis, Neonauclea purpurea
Antidesma menasu Antidesma montanum var. montanum, Antidesma montanum var. wallichii
Astrocaryum aculeatum Astrocaryum rodriguesii, Bactris balanophora
Byrsonima crassifolia Byrsonima verbascifolia, Byrsonima cinerea
Callicarpa tomentosa Callicarpa kochiana var. kochiana, Callicarpa macrophylla, Callicarpa candicans var. candicans
Cinnamomum cassia Neolitsea cassia, Cinnamomum burmanni
Diospyros vestita Diospyros sericea, Diospyros sumatrana
Elaeocarpus longipetiolatus Elaeocarpus stipularis var. longipetiolatus, Elaeocarpus culminicola
Eugenia racemosa Eugenia biflora, Barringtonia racemosa
Ficus anthelminthica Ficus maxima, Ficus insipida, Ficus obtusiuscula
Ficus citrifolia Ficus maxima, Ficus drupacea
Ficus clusiifolia Ficus obtusifolia, Ficus elastica
Ficus frondosa Ficus polyantha, Ficus albert-smithii
Gustavia angusta Gustavia angustifolia, Gustavia augusta
Inga macrophylla Inga pilosula, Inga portobellensis
Inga pavoniana Inga sapindoides, Inga nobilis subsp. nobilis
Lasianthus acuminatus Lasianthus acuminatus, Polyspora excelsa
Licania guianensis Moquilea guianensis, Hymenopus heteromorphus var. heteromorphus
Licania macrophylla Hymenopus macrophyllus, Licania laxiflora
Licania pallida Leptobalanus octandrus subsp. pallidus, Moquilea brittoniana
Magnolia candollei Magnolia liliifera, Magnolia acuminata var. acuminata
Marlierea subulata Myrcia subulata, Myrcia skortzoviana
Miconia candolleana Miconia cinnamomifolia, Miconia cornifolia
Myristica tomentosa Horsfieldia tomentosa, Myristica fatua subsp. fatua, Myristica malabarica, Knema laurina var. laurina
Nectandra grandis Rhodostemonodaphne grandis, Nectandra mollis
Piptadenia suaveolens Piptadenia retusa, Marlimorimia psilostachya
Plectronia didyma Psydrax dicoccos var. dicoccos, Psydrax dicoccos
Prunus dulcis Prunus amygdalus, Prunus avium
Psychotria brachiata Rudgea citrifolia, Palicourea brachiata
Psychotria marginata Psychotria punctata var. punctata, Rudgea coronata subsp. leiocarpoides
Pterocarpus Derris, Pterocarpus, Dalbergia
Salacia multiflora Salacia polyantha, Hippocratea volubilis
Shorea leprosula Rubroshorea leprosula, Rubroshorea platycarpa
Shorea parvifolia Rubroshorea parvifolia, Rubroshorea ovata
Sickingia tinctoria Simira tinctoria, Simira rubescens
Sloanea obtusa Sloanea obtusifolia, Sloanea parviflora
Xylopia frutescens Annona squamosa, Xylopia muricata
Zygia ramiflora Archidendron ramiflorum, Zygia dinizii

Synonymy & misspelling corrections

⚠️ Multiple newly accepted names should be discussed.

However, for the moment we will arbitrarily keep a single newly accepted name in this case. We will thus correct synonymy and misspelling by:

  1. Using closest correct spelling
  2. Accepting single exact matches
  3. Accepting single newly accepted names for multiple matches
  4. Accepting the first newly accepted names for multiple matches in case of multiples (to be changed later)
  5. Using manual curation for species not matched
Code
# Build the final taxonomy: one WFO (or manually curated) record per
# corrected scientific name, joined back onto the corrected inventory table.
wfo <- read_tsv(wfo_file) %>%
  filter(Matched)
# Names with a single WFO match are taken as they are.
wfo_exact <- wfo %>%
  filter(Unique)
# Names with several matches: keep the first newly accepted name.
# slice(1) replaces sample_n(1), whose unseeded random pick made the output
# change from one run to the next; taking the first row is deterministic and
# is the rule stated in the text.
wfo_mult <- wfo %>%
  filter(!Unique, New.accepted) %>%
  group_by(spec.name) %>%
  slice(1) %>%
  ungroup()
# Names with several matches, none of them newly accepted: keep the first.
wfo_mult_non_new <- wfo %>%
  filter(!Unique) %>%
  group_by(spec.name) %>%
  filter(all(!New.accepted)) %>%
  slice(1) %>%
  ungroup()
wfo_clean <- bind_rows(wfo_exact, wfo_mult, wfo_mult_non_new) %>%
  select(
    spec.name, family, genus, specificEpithet,
    scientificName
  ) %>%
  rename(
    scientific_corrected = spec.name,
    family_cleaned = family,
    genus_cleaned = genus,
    species_cleaned = specificEpithet,
    scientific_cleaned = scientificName
  ) %>%
  mutate(
    cleaning_type = "wfo"
  ) %>%
  # The manual table has no `scientific_cleaned` column, so bind_rows() fills
  # it with NA for the manually curated rows.
  bind_rows(manual_curration) %>%
  # A record without genus and species carries no usable scientific name.
  mutate(
    scientific_cleaned = ifelse(
      is.na(genus_cleaned) & is.na(species_cleaned),
      NA_character_,
      scientific_cleaned
    )
  )
# Rebuild the "genus species" key (genus alone when the species is missing,
# NA when the genus is missing), squish whitespace, and attach the cleaned
# taxonomy on that key only.
taxo_cleaned <- read_tsv(corrected_file) %>%
  mutate(scientific_corrected = ifelse(!is.na(genus_corrected),
    paste(
      genus_corrected,
      ifelse(!is.na(species_corrected),
        species_corrected,
        ""
      )
    ), NA
  )) %>%
  mutate(scientific_corrected = str_squish(scientific_corrected)) %>%
  left_join(wfo_clean, by = "scientific_corrected")
write_tsv(taxo_cleaned, taxonomy_file)
Code
# Count, per taxonomic level, the distinct (corrected, cleaned) name pairs in
# which both names are known and differ.
clean <- read_tsv(taxonomy_file)
fam_cor <- clean %>%
  select(family_corrected, family_cleaned) %>%
  unique() %>%
  na.omit() %>%
  filter(family_corrected != family_cleaned)
gen_cor <- clean %>%
  select(genus_corrected, genus_cleaned) %>%
  unique() %>%
  na.omit() %>%
  filter(genus_corrected != genus_cleaned)
sp_cor <- clean %>%
  select(species_corrected, species_cleaned) %>%
  unique() %>%
  na.omit() %>%
  filter(species_corrected != species_cleaned)
changes <- list(family = fam_cor, genus = gen_cor, species = sp_cor)
data.frame(
  level = names(changes),
  N = vapply(changes, nrow, integer(1), USE.NAMES = FALSE)
) %>%
  kable(caption = "Number of synonymy, misspelling, and update corrections per taxonomic level") # nolint
Number of synonymy, misspelling, and update corrections per taxonomic level
level N
family 266
genus 463
species 819

Taxonomic data

All taxonomic data are saved in outputs/taxonomy_vX.tsv with the following columns:

  • site: the site name
  • scientific_*: all columns with the scientific name
  • vernacular_*: all columns with the vernacular name
  • family_*: all columns with the family name
  • genus_*: all columns with the genus name
  • species_*: all columns with the species name
  • *_raw: all columns with the raw information as read in the inventories, see analyses
  • *_corrected: all columns with the manual corrections of dubious names, see analyses
  • *_cleaned: all columns with the synonymy and misspelling corrections with WFO and manual corrections, see analyses, this is the final taxonomic classification that should be used in further analyses
  • *_correction: whether a manual correction of a dubious name occurred, see analyses
  • cleaning_type: WFO automatic cleaning or using manual curation