#---------------------------------------------------------------------------------------------------------
#This file provides general suggestions on how to manage BENFEP_v1_short in R:

#1. Load BENFEP_v1_short

#2. Create a percentage file

#3. Create a file of species presence
#----------------------------------------------------------------------------------------------------------

# Attach the tidyverse. library() stops with an error when the package is not
# installed, whereas require() only returns FALSE and lets the script fail
# later with a less helpful message.
library(tidyverse)

#Set working directory

# Uncomment the next line and insert the path of the folder that contains
# "BENFEP_v1_short.txt". Calling setwd() without a path stops with an error.
# setwd("path/to/folder")

#----------------------------------------------------------------------------------------------------------
#1.-Load BENFEP_v1_short

#BENFEP_v1_short contains 1565 columns, column 1 to 23 and 1556 to 1565 are metadata
#columns 24 to 1554 are taxa abundances and column 1555 is the sum of species abundances
#per sample. Please, see Table C1 in the main text for column names and column codes.

#Some datasets contain non-numerical data ("x", "<1") in some species abundances (columns 24 to 1554).
#The presence of those non-numerical data requires some adjustments:

#Our suggestion is to load BENFEP_v1_short setting the format of the columns as "character"
#otherwise the columns with non-numerical data might be read as "logical" and the
#information might be lost in a later transformation to "numeric".
#The users can change the format to "numeric" later on. Some suggestions are provided below.

# Read every column as character so that the non-numerical entries
# ("x", "<1") are kept exactly as they appear in the file.
short <- read_delim(
  "BENFEP_v1_short.txt",
  col_types = cols(.default = col_character())
)

# Plain data frame copy of the table, used in the following steps
db <- as.data.frame(short)
#----------------------------------------------------------------------------------------------------------


#----------------------------------------------------------------------------------------------------------
#2.-Create a percentage file

#2.1.PRELIMINARY STEPS

#2.1.1-Separate metadata
# Metadata are stored in columns 1 to 23 and 1556 to 1565
metadata <- select(db, 1:23, 1556:1565)


#2.1.2-Separate species abundance data
# Columns 24 to 1554 are the taxa; column 1555 ("total") is kept as well
sp_1 <- select(db, 24:1555)

#2.1.3.-Replace non-numerical data by NA
# The symbols "x" and "<1" cannot be translated into a definite value,
# so they are turned into missing values.

# Assigning into sp_1[] keeps sp_1 a data frame with the same column names
sp_1[] <- lapply(
  sp_1,
  function(column) replace(column, column %in% c("x", "<1"), NA)
)

#2.1.4.-Optional step: Check that sum of rows after transformation (step 2.1.3) equals the column "total"

# Convert the character columns to numeric and add a row identifier ("id")
sp_3 <- sp_1 %>%
  mutate(across(where(is.character), as.numeric)) %>%
  rowid_to_column(var = "id")

# Sum the taxa of every sample. rowSums() is vectorised, so it is much faster
# than a rowwise() sum over all taxa columns; the taxa are selected by
# excluding "id" and "total" instead of relying on a fixed number of columns.
sp_3$totalcounts <- rowSums(select(sp_3, -id, -total), na.rm = TRUE)

# TRUE = the recomputed sum matches "total" (both rounded to two decimals).
# useNA = "ifany" also reports the samples for which "total" is missing,
# which would otherwise be dropped from the table without notice.
table(round(sp_3$total, 2) == round(sp_3$totalcounts, 2), useNA = "ifany")

#2.1.5.-Optional step: substitute the spaces between genus, species, varieties and subspecies by underscores

# str_replace_all() replaces every space of a column name in a single call,
# so names with any number of words (genus, species, variety, subspecies, ...)
# are handled; str_replace() only replaces the first space it finds.
colnames(sp_1) <- str_replace_all(colnames(sp_1), " ", "_")

#2.1.6.-Transform columns in sp_1 (step 2.1.3) from "character" to "numeric"

sp_2 <- mutate(sp_1, across(where(is.character), as.numeric))

#2.1.7.-Bind metadata (step 2.1.1) and species (step 2.1.6)

# Metadata columns come first, followed by the species columns and "total"
spp <- cbind(metadata, sp_2)


#2.2.-Step_by_step suggestions to build a percentage file

#We will start from the dataframe "spp" (step 2.1.7)
#metadata will allow the user to filter by the type of assemblage, fraction, etc (see Table C1 in the main text)

#Below we provide a worked example for building a percentage file with the following conditions: 
#Assemblage: Dead (column "assemblage")
#Fraction: foraminifera analysed in >125, >149 and >150 micrometers fraction (column "fraction")
#coming from assemblages where the number of individuals in the assemblage was higher
#than 100 individuals (column "n100")
#Because the data are provided in different formats (Percent, Counts, Density), 
#we might need to split and join data

#2.2.1.-Filter data with the conditions specified above for data sourced in PERCENT

# All conditions must hold for a sample to be kept
perc_1 <- spp %>%
  filter(
    n100 %in% c("Yes", "NC"),
    format == "Percent",
    fraction %in% c("125", "149", "150"),
    assemblage == "D"
  )


#2.2.2.-Filter data with the conditions specified above for data sourced in COUNTS AND DENSITIES

# Same conditions as in step 2.2.1, except for the format of the source data
count_1 <- spp %>%
  filter(
    n100 %in% c("Yes", "NC"),
    format %in% c("Counts", "Density"),
    fraction %in% c("125", "149", "150"),
    assemblage == "D"
  )

# Inspect the selected samples in the data viewer
View(count_1)

#2.2.3.-Calculate the percent from data sourced from COUNTS AND DENSITIES

#split the metadata and species of dataframe count_1

metadatacount_1 <- count_1 %>% #metadata of count_1
  select(1:33)

sppcount_1 <- count_1 %>% #species of count_1
  select(34:1564) #"total" is not selected because it is recalculated below

#calculating percent: every abundance divided by the sum of its sample (row)
per_count_1 <- sppcount_1 / rowSums(sppcount_1, na.rm = TRUE) * 100

#calculating the column "total" as the sum of the percentages of each sample
# rowSums() is vectorised and therefore much faster than a rowwise() sum over
# all taxa columns; the resulting values are the same.
count_2 <- rowid_to_column(per_count_1, var = "id")
count_2$total <- rowSums(per_count_1, na.rm = TRUE)

#bind the metadata and the percent, then remove the helper variable (id)
perc_2 <- cbind(metadatacount_1, count_2) %>%
  select(-id)


#2.2.4.-Join perc_1 and perc_2 

# Stack the samples sourced in percent on top of the recalculated ones
Percent <- perc_1 %>%
  bind_rows(perc_2)

# The percent matrix created with the conditions indicated above
View(Percent)

#----------------------------------------------------------------------------------------------------------

#----------------------------------------------------------------------------------------------------------
#3. Create a file of species presence

#Below we provide a worked example for building a file of species presence.
#For the example, we consider that both numerical and non-numerical data
#in a species cell indicate species presence.
#Users should check the column "remark_1" for specifics about the
#meaning of the non-numerical data

#3.1.-Go to step 1 

# Start again from the table loaded in step 1 (all columns as character)
db <- as.data.frame(short)

# Metadata: columns 1 to 23 and 1556 to 1565
metadata <- select(db, 1:23, 1556:1565)

# Taxa only: the column "total" is excluded this time
sp_4 <- select(db, 24:1554)

#3.2.-Replace non-numerical data in sp_4 by an arbitrary number, for example, 1000

# 1000 is an artifice: it only marks the cell as "present"
spx1 <- sp_4 %>%
  lapply(function(column) gsub("<1", "1000", column, fixed = TRUE)) %>%
  bind_rows()

spx2 <- spx1 %>%
  lapply(function(column) gsub("x", "1000", column, fixed = TRUE)) %>%
  bind_rows()

# File ready for the next steps
spx <- as.data.frame(spx2)


#3.3.-Create a file of presence-non presence ("0 and 1")
# Convert to numeric, treat empty cells as 0, then code every non-zero value as 1
pnp <- spx %>%
  mutate(across(where(is.character), as.numeric)) %>%
  mutate(across(everything(), ~ replace_na(.x, 0))) %>%
  mutate(across(where(is.numeric), ~ 1 * (.x != 0)))

# Bind metadata and the presence-non presence file
presence <- cbind(metadata, pnp)

#3.4.-Create a file of presence -non presence and a column indicating the sum of taxa 

# Presence-non presence coding: convert to numeric, treat empty cells as 0,
# then code every non-zero value as 1
taxa <- spx %>%
  mutate(across(where(is.character), as.numeric)) %>%
  mutate(across(everything(), ~ replace_na(.x, 0))) %>%
  mutate(across(where(is.numeric), ~ 1 * (.x != 0)))

# Number of taxa present in each sample. rowSums() is vectorised and therefore
# much faster than a rowwise() sum over all taxa columns. It is computed
# before "id" is added, so that only taxa columns are summed.
taxa$richness <- rowSums(taxa, na.rm = TRUE)
taxa <- rowid_to_column(taxa, var = "id")

#bind the metadata and the presence-non presence file with its "richness"
#column, then remove the helper variable (id)
ntaxa <- cbind(metadata, taxa) %>%
  select(-id)
#----------------------------------------------------------------------------------------------------------
