## -----------------------------------------------------------------------------
## Alaska Permafrost Thaw Database - Data Cleaning and Compilation
## Created by Hailey Webb (hailey.webb@colorado.edu)
## This script reads in permafrost thaw datasets, cleans them, classifies features,
## combines them into a standardized database, and exports CSV/shapefile for use.
##------------------------------------------------------------------------------

#-------------------------------------------------------------------------------
# Load required packages
library(dplyr)
library(stringr)
library(sf)
library(tidyverse)
library(sf)
library(reshape2)
library(ggnewscale)
library(jsonlite)
library(terra)
library(ggplot2)
library(tidyr)
library(tigris)
library(RColorBrewer)
library(patchwork) 
library(rnaturalearth)
library(rnaturalearthhires)
library(ggspatial)

#-------------------------------------------------------------------------------
#
#
#          Section 1: Post-processing of input data
#
#
#-------------------------------------------------------------------------------

#-------------------------------------------------------------------------------
# Step 1: Define Helper Functions
#-------------------------------------------------------------------------------

# Function to add Latitude and Longitude columns from sf object
add_latlon <- function(sf_object) {
  coords <- st_coordinates(sf_object)
  sf_object$Longitude <- coords[,1]
  sf_object$Latitude  <- coords[,2]
  return(sf_object)
}

# Function to count removed features after filtering overlaps
count_removed <- function(original, unique) {
  nrow(original) - nrow(unique)
}

# Function to remove overlapping polygons from a dataset
remove_overlaps <- function(x, others) {
  other_union <- st_union(do.call(rbind, others))      
  overlaps <- st_intersects(x, other_union, sparse = FALSE)[,1]  
  x[!overlaps, ]                                     
}

#-------------------------------------------------------------------------------
# Step 2: Import input data and perform necessary post-processing steps
#-------------------------------------------------------------------------------

#-------------------------------------------------------------------------------
# 1. Jorgenson et al. (2022)
#-------------------------------------------------------------------------------

# Read raw datasets
jorgenson <- read.csv("") # Jorgenson et al. (2002) supplementary data - Table SI8
codes <- read.csv("") # Jorgenson et al. (2002) supplementary data - Table SI1

# Standardize column names for joining
colnames(codes) <- c("PF.Status", "Permafrost.Status")
jorgenson$PF.Status <- toupper(jorgenson$PF.Status)
codes$PF.Status <- toupper(codes$PF.Status)

# Select out abrupt thaw processes from the change code
jorgenson <- jorgenson %>%
  mutate(
    ThawType_Code   = if_else(str_detect(Change.code, "ftk"), "Abrupt", "Non-abrupt"),
    ThawType_Status = if_else(str_detect(PF.Status, "cp|ctm|ds|dtl|gl|gtl|itm|stl|tbd|tb|teg|tf|ths|tlb|tp|tsk|tsl|ttp|twt"), "Abrupt", "Non-abrupt"),
    FinalThawType   = if_else(ThawType_Code == "Abrupt" | ThawType_Status == "Abrupt", "Abrupt", "Non-abrupt")
  ) %>%
  left_join(codes %>% select(PF.Status, Permafrost.Status), by = "PF.Status")


# Classify thaw type based on code or PF.Status
jorgenson <- jorgenson %>%
  mutate(
    ThawType_Code   = if_else(str_detect(Change.code, "tk|ftk"), "Abrupt", "Non-abrupt"),
    ThawType_Status = if_else(str_detect(PF.Status, "cp|ctm|ds|dtl|gl|gtl|itm|stl|tbd|tb|teg|tf|ths|tlb|tp|tsk|tsl|ttp|twt"), "Abrupt", "Non-abrupt"),
    FinalThawType   = if_else(ThawType_Code == "Abrupt" | ThawType_Status == "Abrupt", "Abrupt", "Non-abrupt")
  ) %>%
  left_join(codes %>% select(PF.Status, Permafrost.Status), by = "PF.Status")

jorgenson <- jorgenson[,-12]
jorgenson <- jorgenson %>%
  rename(Permafrost.Status = 11)

jorgenson <- jorgenson %>%
  mutate(
    parts = str_split(Change.code, "-", simplify = TRUE),
    Permafrost.Status = if_else(
      (str_detect(parts[,1], "f") | str_detect(parts[,2], "f")) &
        (str_detect(parts[,2], "t") | str_detect(parts[,3], "t")),
      "Wildfire-induced thaw",
      Permafrost.Status  # keep original value otherwise
    )
  ) %>%
  select(-parts)  # remove temporary column



# Rename column and clean feature type labels
colnames(jorgenson)[which(names(jorgenson) == "Permafrost.Status")] <- "FeatureType"

# Leave FeatureType column blank for non-abrupt thaw features
jorgenson$FeatureType[grepl("Permafrost|Unfrozen", jorgenson$FeatureType, ignore.case = TRUE)] <- ""

# Categorize features and assign thaw type
jorgenson <- jorgenson %>%
  mutate(
    FeatureCategory = case_when(
      FeatureType %in% c("Thermokarst Bog", "Thermokarst Fen") ~ "Thermokarst wetland",
      FeatureType %in% c("Thermokarst Water Track", "Thaw Slump") ~ "Thermokarst",
      FeatureType == "" ~ "",
      FeatureType == "Wildfire-induced thaw" ~ "Wildfire-induced thaw",
      TRUE ~ NA_character_
    ),
    ThawType = case_when(
      FeatureType %in% c("Thermokarst Bog", "Thermokarst Fen", "Thermokarst Water Track", "Thaw Slump", "Wildfire-induced thaw") ~ "Abrupt",
      FeatureType == "" ~ "Non-abrupt",
      TRUE ~ NA_character_
    )
  )

# Remove duplicate coordinates, prioritizing Abrupt
jorgenson <- jorgenson %>%
  group_by(Latitude, Longitude) %>%
  arrange(desc(FinalThawType == "Abrupt"), .by_group = TRUE) %>%
  slice(1) %>%
  ungroup()

# Keep relevant columns and add metadata
jorgenson <- jorgenson[,c(1,2,3,4,11,12,13)]
jorgenson$DataSourceType <- "Remote sensing/Photo-interpretation - published"
jorgenson$DOI <- "https://doi.org/10.1088/1748-9326/ac5c0d"
jorgenson$Imagery <- "Photo-interpretation"
jorgenson$ImageryDates <- ""
jorgenson$ImageryResolution_meters <- ""

rm(codes)  # clean environment

#-------------------------------------------------------------------------------
# 2. Jones & Zuck (2016)
#-------------------------------------------------------------------------------

# Read in Jones & Zuck 2016 data but only keep specific lake types
fcw <- read.csv("") %>% # Jones & Zuck (2016) CSV file
  filter(Lake_type %in% c(1,2,6)) %>%  
  mutate(
    Authors = "Jones & Zuck 2016",
    FeatureType = case_when(
      Lake_type == 1 ~ "Remnant drained lake basin",
      Lake_type == 2 ~ "Thermokarst lake",
      Lake_type == 6 ~ "Collapsed pingo pond"
    ),
    FeatureCategory = case_when(
      Lake_type %in% c(1,2) ~ "Thermokarst lake",
      Lake_type == 6 ~ "Thaw pond"
    ),
    ThawType = "Abrupt", # Add in metadata
    DataSourceType = "Remote sensing - published",
    DOI = "https://doi.org/10.5066/F7H70CXB",
    Imagery = "SAR; Landsat TM; ETM+; OLI",
    ImageryDates = "1985 through 2015",
    ImageryResolution_meters = "30"
  )

# Remove columns to match format of other input data
fcw <- fcw[,c(3,4,25,26,27,28,29,30,31,32,33)]

#-------------------------------------------------------------------------------
# 3. Jones et al. (2019)
#-------------------------------------------------------------------------------

jones <- read.csv("") %>% # Jones et al. (2019) lake drainage CSV file
  filter(Drain_Mech %in% c("Bank Overtopping or Headward Erosion", "Lake Expansion")) %>%
  mutate(
    FeatureType = "Drained thermokarst lake",
    FeatureCategory = "Thermokarst lake",
    ThawType = "Abrupt",
    Authors = "Jones et al. 2019",
    DataSourceType = "Remote Sensing/Photo-interpretation - published",
    DOI = "https://doi.org/10.18739/A2DR2P85H",
    Imagery = "USGS topo maps; aerial photography; Landsat TM; ETM+; OLI",
    ImageryDates = "2000 through 2017",
    ImageryResolution_meters = "30"
  )


jones <- jones[,c(2,3,9,10,11,12,13,14,15,16,17)]
colnames(jones) <- c("Latitude","Longitude","FeatureType","FeatureCategory","ThawType","Authors", 
                     "DataSourceType", "DOI", "Imagery", "ImageryDates", "ImageryResolution_meters")

#-------------------------------------------------------------------------------
# 4. Nitze et al. (2018)
#-------------------------------------------------------------------------------

nitze2018 <- st_read("") %>% # T3_AK_RTS.shp
  st_transform(4326) %>%             # WGS84
  st_point_on_surface()               # points inside polygons

nitze2018 <- nitze2018 %>%
  mutate(
    Longitude = st_coordinates(geometry)[,1],
    Latitude  = st_coordinates(geometry)[,2],
    FeatureType = "Retrogressive thaw slump",
    FeatureCategory = "Retrogressive thaw slump",
    ThawType = "Abrupt",
    Authors = "Nitze et al. 2018",
    DataSourceType = "Remote Sensing - published",
    DOI = "https://doi.pangaea.de/10.1594/PANGAEA.894755",
    Imagery = "Landsat",
    ImageryDates = "1999 through 2014",
    ImageryResolution_meters = "30"
  ) %>%
  st_set_geometry(NULL) %>%
  select(Latitude, Longitude, FeatureType, FeatureCategory, ThawType, Authors, DataSourceType, DOI, Imagery, ImageryDates, ImageryResolution_meters)

#-------------------------------------------------------------------------------
# 5. Nitze et al. (2020) - Landsat/Sentinel + Planet
#-------------------------------------------------------------------------------

# Read polygons
nitze2020_ls <- st_read("") # Nitze_etal_2020_lakechange_landsat_S1.shp
nitze2020_planet <- st_read("") # Nitze_etal_2020_lakechange_planet_lake_area.shp

# Fix invalid geometries 
nitze2020_ls <- st_make_valid(nitze2020_ls)

# Remove overlapping Landsat polygons
overlap_logical <- lengths(st_intersects(nitze2020_ls, nitze2020_planet)) > 0
nitze2020_ls_filtered <- nitze2020_ls[!overlap_logical, ]

# Keep only geometries and combine
combined_polygons <- st_sfc(c(st_geometry(nitze2020_ls_filtered), st_geometry(nitze2020_planet)), crs = st_crs(nitze2020_ls_filtered))
nitze2020 <- st_point_on_surface(st_as_sf(combined_polygons)) %>% st_transform(4326)
nitze2020 <- add_latlon(nitze2020)

# Add metadata
nitze2020 <- nitze2020 %>%
  mutate(
    Authors = "Nitze et al. 2020",
    FeatureType = "Thermokarst lake",
    FeatureCategory = "Thermokarst lake",
    ThawType = "Abrupt",
    DataSourceType = "Remote Sensing - published",
    DOI = "https://doi.org/10.1594/PANGAEA.922808",
    Imagery = "Landsat; Sentinel-1; PlanetScope",
    ImageryDates = "1999 through 2018",
    ImageryResolution_meters = "30; 3.125"
  ) %>%
  as.data.frame() %>%
  select(-x)

# Clean up environment
rm(list = c(
  "combined_polygons", "nitze2020_ls", "nitze2020_planet",
  "nitze2020_ls_filtered", "overlap_logical"
))

#-------------------------------------------------------------------------------
# 6. Swanson et al. (2021)
#-------------------------------------------------------------------------------

# Read multiple polygon sources
ikonos <- st_read("") # ErosionPolysIKONOS.shp
ahap <- st_read("") # ErosionPolysAHAP.shp
spot67 <- st_read("") # ErosionPolysSPOT67.shp
wv23 <- st_read("") # ErosionPolysWV23.shp

# Transform all to same coordinate system 
crs_target <- st_crs(ikonos)
ahap <- st_transform(ahap, crs_target)
spot67 <- st_transform(spot67, crs_target)
wv23 <- st_transform(wv23, crs_target)

# Remove overlaps
ikonos_unique <- remove_overlaps(ikonos, list(ahap, spot67, wv23))
ahap_unique   <- remove_overlaps(ahap, list(ikonos, spot67, wv23))
spot67_unique <- remove_overlaps(spot67, list(ikonos, ahap, wv23))
wv23_unique   <- remove_overlaps(wv23, list(ikonos, ahap, spot67))

# Convert to points and WGS84
ikonos_pts <- add_latlon(st_transform(st_point_on_surface(ikonos_unique), 4326))
ahap_pts   <- add_latlon(st_transform(st_point_on_surface(ahap_unique), 4326))
spot67_pts <- add_latlon(st_transform(st_point_on_surface(spot67_unique), 4326))
wv23_pts   <- add_latlon(st_transform(st_point_on_surface(wv23_unique), 4326))

# Count removals
removed_ikonos <- count_removed(ikonos, ikonos_unique)
removed_ahap   <- count_removed(ahap, ahap_unique)
removed_spot67 <- count_removed(spot67, spot67_unique)
removed_wv23   <- count_removed(wv23, wv23_unique)

cat("Polygons removed:\nIKONOS:", removed_ikonos,
    "\nAHAP:", removed_ahap,
    "\nSPOT67:", removed_spot67,
    "\nWV23:", removed_wv23,
    "\nTOTAL:", sum(removed_ikonos, removed_ahap, removed_spot67, removed_wv23), "\n")

# Combine all points and assign metadata
swanson_points <- bind_rows(ikonos_pts, ahap_pts, spot67_pts, wv23_pts) %>%
  filter(ErsnFtType %in% c("A","R")) %>%
  mutate(
    FeatureType     = if_else(ErsnFtType=="A","Active layer detachment","Retrogressive thaw slump"),
    FeatureCategory = if_else(ErsnFtType=="A","Active layer detachment","Retrogressive thaw slump"),
    ThawType        = "Abrupt",
    Authors         = "Swanson et al. 2021",
    DataSourceType  = "Remote Sensing - published",
    DOI             = "https://doi.org/10.1002/ppp.2098",
    Imagery = "Alaska high altitude aerial photographs; IKONOS; SPOT67; WV23",
    ImageryDates = "1977 through 2019",
    ImageryResolution_meters = "1.5; 4; 2.2; 6"
  ) %>%
  as.data.frame() %>%
  select(Latitude, Longitude, FeatureType, FeatureCategory, ThawType, Authors, DataSourceType, DOI, Imagery, ImageryDates, ImageryResolution_meters)

# Clean environment
rm(list = c(
  "ahap", "ahap_pts", "ahap_unique",
  "crs_target",
  "ikonos", "ikonos_pts", "ikonos_unique",
  "spot67", "spot67_pts", "spot67_unique",
  "wv23", "wv23_pts", "wv23_unique",
  "removed_ahap", "removed_ikonos", "removed_spot67",
  "removed_wv23"
))

#-------------------------------------------------------------------------------
# 7. Whitley (2018)
#-------------------------------------------------------------------------------

whitley <- read.csv("") # PermafrostDepth_Fielddata_2016.csv

# Convert UTM (NAD83 Zone 3N, EPSG:26903) to WGS84 (EPSG:4326)
whitley_sf <- st_as_sf(whitley, coords = c("easting", "northing"), crs = 26903)
whitley_wgs84 <- st_transform(whitley_sf, 4326)

# Extract Latitude and Longitude back to dataframe
coords <- st_coordinates(whitley_wgs84)
whitley$Longitude <- coords[,1]
whitley$Latitude  <- coords[,2]

# Keep only locations with permafrost
whitley <- whitley %>%
  filter(permafrost == 1)

# Remove incorrect ecotype entries
whitley <- whitley %>%
  filter(!ecotype %in% c("RackLine", "Rackline"))

# Assign FeatureType, FeatureCategory, and ThawType based on ecotype
whitley <- whitley %>%
  mutate(
    FeatureType = case_when(
      ecotype %in% c("Lowland Moist Graminoid Shrub Meadow",
                     "Lowland Moist Graminoid Shrub Meadow?",
                     "Lowland Wet Sedge Meadow",
                     "Lowland Moist Grmainoid Shrub Meadow",
                     "Lowland Wet Sedge Meadow?",
                     "Lowland Moist Graminoid Sedge Meadow") ~ "",
      ecotype == "Thermokarst Pit" ~ "Thermokarst Pit",
      ecotype == "Thermoerosional gully" ~ "Thermoerosional gully",
      TRUE ~ NA_character_
    ),
    FeatureCategory = case_when(
      FeatureType == "Thermokarst Pit" ~ "Thermokarst",
      FeatureType == "Thermoerosional gully" ~ "Thermoerosional gully",
      FeatureType == "" ~ "",
      TRUE ~ NA_character_
    ),
    ThawType = case_when(
      FeatureType %in% c("Thermokarst Pit","Thermoerosional gully") ~ "Abrupt",
      FeatureType == "" ~ "Non-abrupt",
      TRUE ~ NA_character_
    ),
    Authors = "Whitley et al. 2018",
    DataSourceType = "Field - published",
    DOI = "https://doi.org/10.3334/ORNLDAAC/1598",
    Imagery = "",
    ImageryDates = "",
    ImageryResolution_meters = ""
  ) %>%
  select(Latitude, Longitude, FeatureType, FeatureCategory, ThawType, Authors, DataSourceType, DOI, Imagery, ImageryDates, ImageryResolution_meters)

# Clean environment
rm(list = c(
  "coords", "whitley_sf", "whitley_wgs84"
))

#-------------------------------------------------------------------------------
# Step 3: Combine all cleaned datasets
#-------------------------------------------------------------------------------

all_data <- list(
  whitley, jones, jorgenson, nitze2018, nitze2020,
  swanson_points, fcw
)

# Use the column order of the first dataframe as standard
col_order <- names(all_data[[1]])

# Reorder columns for all datasets and combine
combined <- all_data %>%
  lapply(function(x) x[, col_order]) %>%
  bind_rows()

rm(list = c(
  "col_order", "all_data", "whitley", "jones", "jorgenson", "nitze2018",
  "nitze2020", "swanson_points", "fcw", "add_latlon", "count_removed", "remove_overlaps"
))

#-------------------------------------------------------------------------------
# Step 4: Combine post-processed data with original data

#The 'combined' dataframe now contains all datasets that required post-
# processing before being added to the final dataframe. Let's combine these
# points with the rest of our thaw database
#-------------------------------------------------------------------------------

# Read in raw Alaska permafrost thaw database points
db <- read.csv("") #AlaskaPermafrostThawDatabase_v2.0.0_notclean.csv

# Combine with newly cleaned 'combined' data from previous section
data <- bind_rows(db, combined)

# Clean up environment
rm(db)
rm(combined)

#-------------------------------------------------------------------------------
# Step 5: Clean database
#-------------------------------------------------------------------------------

# Standardize thaw types; change gradual to non-abrupt
data <- data %>%
  mutate(ThawType = ifelse(ThawType == "Gradual", "Non-abrupt", ThawType))

# Remove duplicates
# Keep all Turetsky et al. entries (as these are correct); 
# otherwise remove duplicate FeatureName and coordinates
data_clean <- data %>%
  filter(
    Authors == "Turetsky et al. " |
      is.na(FeatureName) | FeatureName == "" | !duplicated(FeatureName)
  ) %>%
  filter(
    Authors == "Turetsky et al. " |
      !duplicated(select(., Latitude, Longitude))
  )

rm(data)

# Standardize FeatureType names & general cleanup
data_clean$FeatureType[grepl("lake", data_clean$FeatureType, ignore.case = TRUE)] <- "Thermokarst lake"
data_clean$FeatureType[grepl("gull[ey]", data_clean$FeatureType, ignore.case = TRUE)] <- "Thermokarst/thermoerosional gully"
data_clean$FeatureType[grepl("thaw slump|retrogressive thaw slump", data_clean$FeatureType, ignore.case = TRUE)] <- "Retrogressive thaw slump"
data_clean$FeatureType[data_clean$FeatureType %in% c("ALD")] <- "Active layer detachment"
data_clean$FeatureType[data_clean$FeatureType %in% c("GLY")] <- "Thermo-erosion gully"
data_clean$FeatureType[data_clean$FeatureType %in% c("TS")] <- "Retrogressive thaw slump"

# Capitalize first word only
data_clean$FeatureType <- sapply(data_clean$FeatureType, function(x) {
  words <- strsplit(tolower(x), " ")[[1]]
  words[1] <- paste0(toupper(substring(words[1], 1, 1)), substring(words[1], 2))
  paste(words, collapse = " ")
})

# Consolidate naming conventions for thaw features
data_clean <- data_clean %>%
  mutate(FeatureType = case_when(
    str_detect(FeatureType, regex("bog", ignore_case = TRUE)) ~ "Thermokarst bog",
    str_detect(FeatureType, regex("fen", ignore_case = TRUE)) ~ "Thermokarst fen",
    str_detect(FeatureType, regex("wetland|bog/fen", ignore_case = TRUE)) ~ "Thermokarst wetland",
    str_detect(FeatureType, regex("tk|glacial", ignore_case = TRUE)) ~ "Thermokarst",
    str_detect(FeatureType, regex("pit|ice|polygonal|lowland|tundra|upland", ignore_case = TRUE)) ~ "Thermokarst",
    str_detect(FeatureType, regex("detachment", ignore_case = TRUE)) ~ "Active layer detachment",
    TRUE ~ FeatureType
  ))

# Replace placeholder "NANA" with Non-abrupt
data_clean$FeatureType[data_clean$FeatureType == "NANA"] <- "Non-abrupt"

# Define FeatureCategory based on FeatureType
data_clean <- data_clean %>%
  mutate(FeatureCategory = case_when(
    str_detect(FeatureType, regex("wildfire|fire", ignore_case = TRUE)) ~ "Wildfire-induced thaw",
    str_detect(FeatureType, regex("gully", ignore_case = TRUE)) ~ "Thermoerosional gully",
    str_detect(FeatureType, regex("lake", ignore_case = TRUE)) ~ "Thermokarst lake",
    str_detect(FeatureType, regex("peat", ignore_case = TRUE)) ~ "Non-abrupt",
    str_detect(FeatureType, regex("monitoring|network", ignore_case = TRUE)) ~ "Non-abrupt",
    str_detect(FeatureType, regex("feature", ignore_case = TRUE)) ~ "Thermokarst",
    str_detect(FeatureType, regex("pond", ignore_case = TRUE)) ~ "Thaw pond",
    str_detect(FeatureType, regex("fen|bog|wetland", ignore_case = TRUE)) ~ "Thermokarst wetland",
    str_detect(FeatureType, regex("basin|track", ignore_case = TRUE)) ~ "Thermokarst",
    TRUE ~ FeatureType
  ))

#-------------------------------------------------------------------------------
# Step 6: Clip database to Alaska
#-------------------------------------------------------------------------------

# Convert to sf object using WGS84
data_sf <- st_as_sf(data_clean, coords = c("Longitude", "Latitude"), crs = 4326)

# Import Alaska boundary
alaska <- st_read("") # GU_StateOrTerritory.shp

# Ensure matching CRS
if (st_crs(data_sf) != st_crs(alaska)) {
  alaska <- st_transform(alaska, st_crs(data_sf))
}

# Clip points to Alaska
clipped_data <- st_intersection(data_sf, alaska)

# Drop unnecessary columns from shapefile attributes
clipped_data <- clipped_data[,-c(11:27)]

# Extract coordinates back to columns
coords <- st_coordinates(clipped_data)
clipped_data$Longitude <- coords[,1]
clipped_data$Latitude <- coords[,2]

# Convert to data.frame for export
clipped_df <- as.data.frame(st_drop_geometry(clipped_data))

desired_order <- c("Authors", 
                   "DOI", 
                   "DataSourceType", 
                   "FeatureName", 
                   "Latitude", 
                   "Longitude", 
                   "FeatureType", 
                   "FeatureCategory", 
                   "ThawType", 
                   "Imagery", 
                   "ImageryDates", 
                   "ImageryResolution_meters")

clipped_df <- clipped_df[ , desired_order]

rm(list = c(
  "coords", "data_sf", "data_clean", "desired_order", "clipped_data"
))

#-------------------------------------------------------------------------------
# Step 7: Look at summary data
#-------------------------------------------------------------------------------

# Get number of features per unique source
features_per_source <- clipped_df %>%
  group_by(Authors, DOI, .drop = FALSE) %>%
  summarise(n = n(), .groups = "drop")

# Get total number of each unique feature category
total_feature_categories <- as.data.frame(table(clipped_df$FeatureCategory))
sum(total_feature_categories$Freq)

#-------------------------------------------------------------------------------
# Step 8: Sort data and Export as csv and shapefile
#-------------------------------------------------------------------------------
## Format Authors and sort

# Add parentheses around years
clipped_df$Authors <- gsub("(\\d{4})", "(\\1)", clipped_df$Authors)

# Sort alphabetically by Authors
clipped_df <- clipped_df[order(clipped_df$Authors), ]

# Export as CSV
write.csv(clipped_df, ".csv", row.names = FALSE) # Name file

# Convert back to sf for shapefile
clipped_sf <- st_as_sf(clipped_df,
                       coords = c("Longitude", "Latitude"),
                       crs = 4326)

# Export shapefile
st_write(clipped_sf, ".shp", delete_dsn = TRUE) # Name file



#-------------------------------------------------------------------------------
#
#
# Section 2: Upload shapefile to GEE and run the script linked in the manuscript
#       The output file will include appended topographic variables
#
#
#-------------------------------------------------------------------------------





#-------------------------------------------------------------------------------
#
#
#      Section 3: Topographic Analysis and Ecoregion Visualization
#
#
#-------------------------------------------------------------------------------

#-------------------------------------------------------------------------------
# Step 1: Read in topographic database
#-------------------------------------------------------------------------------
topo <- read.csv("") # CSV file with GEE data

# Rename columns for clarity
colnames(topo) <- c("index", "Authors", "DOI", "DataSourceType",
                    "FeatureName", "FeatureType", 
                    "FeatureCategory", 
                    "Imagery", "ImageryDate", "ImageryResolution_meters",
                    "ThawType", "Aspect", 
                    "Elevation", "MeanElevation", "RelativeElevation", "Slope", 
                    "SolarRadiationIndex", "Geo")

#-------------------------------------------------------------------------------
# Step 2: Check normality (QQ plots)
#-------------------------------------------------------------------------------
qqnorm(topo$Slope[topo$ThawType == "Non-abrupt"])
qqline(topo$Slope[topo$ThawType == "Non-abrupt"], col = "blue")

qqnorm(topo$SolarRadiationIndex[topo$ThawType == "Abrupt"])
qqline(topo$SolarRadiationIndex[topo$ThawType == "Abrupt"], col = "blue")

qqnorm(topo$RelativeElevation[topo$ThawType == "Non-abrupt"])
qqline(topo$RelativeElevation[topo$ThawType == "Non-abrupt"], col = "blue")

# Data are not normally distributed; use Wilcoxon rank-sum test

set.seed(123)  # reproducibility

# Variables for testing
variables <- c("Slope", "RelativeElevation", "SolarRadiationIndex")

topo <- topo %>%
  filter(ThawType %in% c("Abrupt", "Non-abrupt")) %>%
  mutate(ThawType = factor(ThawType))

# Initialize results list
results <- list()

# Bootstrap Wilcoxon test for each variable
for (var in variables) {
  abrupt_vals <- topo %>% filter(ThawType == "Abrupt") %>% pull(var)
  nonabrupt_vals <- topo %>% filter(ThawType == "Non-abrupt") %>% pull(var)
  
  mean_abrupt <- mean(abrupt_vals, na.rm = TRUE)
  mean_nonabrupt <- mean(nonabrupt_vals, na.rm = TRUE)
  
  # Bootstrapping
  p_vals <- numeric(1000)
  ci_abrupt <- numeric(1000)
  ci_nonabrupt <- numeric(1000)
  
  for (i in 1:1000) {
    samp_abrupt <- sample(abrupt_vals, 500, replace = TRUE)
    samp_nonabrupt <- sample(nonabrupt_vals, 500, replace = TRUE)
    
    test <- wilcox.test(samp_abrupt, samp_nonabrupt)
    p_vals[i] <- test$p.value
    ci_abrupt[i] <- mean(samp_abrupt)
    ci_nonabrupt[i] <- mean(samp_nonabrupt)
  }
  
  # 95% confidence intervals
  ci_abrupt_95 <- quantile(ci_abrupt, probs = c(0.025, 0.975))
  ci_nonabrupt_95 <- quantile(ci_nonabrupt, probs = c(0.025, 0.975))
  
  results[[var]] <- tibble(
    Variable = var,
    Mean_Abrupt = mean_abrupt,
    CI_Abrupt_Lower = ci_abrupt_95[1],
    CI_Abrupt_Upper = ci_abrupt_95[2],
    Mean_NonAbrupt = mean_nonabrupt,
    CI_NonAbrupt_Lower = ci_nonabrupt_95[1],
    CI_NonAbrupt_Upper = ci_nonabrupt_95[2],
    Bootstrap_P = mean(p_vals)
  )
}

# Get final results
final_results <- bind_rows(results)

# Clean environment
rm(list = c(
  "abrupt_vals", "ci_abrupt", "ci_abrupt_95", "ci_nonabrupt", "ci_nonabrupt_95",
  "i", "mean_abrupt", "mean_nonabrupt", "nonabrupt_vals", "p_vals", "samp_abrupt",
  "samp_nonabrupt", "var", "variables", "test", "results"
))

#-------------------------------------------------------------------------------
# Step 3: Visualize topographic variables
#-------------------------------------------------------------------------------
plot_density <- function(df, var, xlim_vals = NULL, vline = NULL, colors = c("Abrupt"="#cc6600","Non-abrupt"="#6699cc")) {
  ggplot(df, aes_string(x = var, fill = "ThawType")) +
    geom_density(alpha = 0.6, color = "black") +
    scale_fill_manual(values = colors) +
    labs(title = paste(var, "Distribution"), x = var, y = "Relative Frequency", fill = "Thaw Type") +
    theme_bw() +
    theme(
      plot.title = element_text(size = 20, face = "bold", hjust = 0.5),
      axis.title = element_text(size = 18),
      axis.text = element_text(size = 16),
      legend.title = element_text(size = 18),
      legend.text = element_text(size = 16)
    ) +
    {if(!is.null(xlim_vals)) coord_cartesian(xlim = xlim_vals)} +
    {if(!is.null(vline)) geom_vline(xintercept = vline, linetype = "dashed", color = "black", size = 1)}
}

# Plot density curve for Relative elevation and solar radiation index
plot_density(topo, "RelativeElevation", xlim_vals = c(-7,7), vline = 0)
plot_density(topo, "SolarRadiationIndex", xlim_vals = c(0.5,1), vline = 0.75)

# Clean environment
rm(plot_density)

#-------------------------------------------------------------------------------
# Step 4: Ecoregion assignment
#-------------------------------------------------------------------------------
eco <- st_read("") # akecoregions.shp

# Make sure ecoregions and thaw database are in same projection
clipped_sf <- st_transform(clipped_sf, crs = 3338)
eco <- st_transform(eco, crs = 3338)

# Modify ecoregion names
eco$LEVEL_2_REVISED <- case_when(
  eco$COMMONER == "Brooks Range" ~ "Brooks Range",
  eco$LEVEL_1 == "Maritime" ~ "Maritime",
  eco$LEVEL_2 %in% c("Alaska Range Transition", "Coast Mountains Transition", "Pacific Mountains Transition") ~ "Southern Mountains",
  TRUE ~ eco$LEVEL_2
)

# Dissolve by ecoregions for visually improved maps
eco_dissolved <- eco %>%
  group_by(LEVEL_2_REVISED) %>%
  summarise(geometry = st_union(geometry), .groups = "drop") %>%
  st_as_sf()

# Spatial join: assign each thaw point to an ecoregion
db_joined <- st_join(clipped_sf, eco["LEVEL_2_REVISED"], left = FALSE)

#-------------------------------------------------------------------------------
# Step 5: Map of thaw locations and ecoregions
#-------------------------------------------------------------------------------
feature_colors <- c("Abrupt"="#cc6600","Non-abrupt"="#6699cc") # colorblind friendly

# Set color palette (colorblind friendly)
earth_tones <- brewer.pal(7, "Set2")

# Split up by thaw type for ease of viewing
non_abrupt <- clipped_sf[clipped_sf$ThawType == "Non-abrupt", ]
abrupt <- clipped_sf[clipped_sf$ThawType == "Abrupt", ]

# Non-abrupt thaw with ecoregions map
ggplot() +
  geom_sf(data = eco_dissolved, aes(fill = LEVEL_2_REVISED),
          color = "black", size = 0.2, alpha = 0.5) +
  scale_fill_manual(values = earth_tones, name = "Ecoregion") +
  ggnewscale::new_scale_fill() +
  geom_sf(data = non_abrupt, aes(fill = ThawType),
          shape = 21, color = "black", size = 4.5) +
  scale_fill_manual(values = feature_colors, name = "Thaw Type") +
  labs(
    title = "Non-abrupt thaw locations across Alaska",
    x = NULL, y = NULL
  ) +
  annotation_scale(
    location = "br",
    width_hint = 0.25,
    unit_category = "metric",
    text_cex = 1.5,
    height = unit(0.5, "cm"),
    bar_cols = c("black", "white")
  ) +
  theme_minimal() +
  theme(
    panel.grid.major = element_line(color = "gray80"),
    legend.position = c(0.05, 0.95),
    legend.justification = c(0,1),
    legend.background = element_rect(fill = "white", color = "black", size = 0.5),
    plot.title = element_text(hjust = 0.5, size = 22),
    legend.title = element_text(size = 14, face = "bold"),
    legend.text = element_text(size = 12),
    axis.text = element_text(size = 18),
    axis.title = element_text(size = 20)
  )

# Abrupt thaw map with ecoregions map and flipped legend order
ggplot() +
  geom_sf(data = eco_dissolved, aes(fill = LEVEL_2_REVISED),
          color = "black", size = 0.2, alpha = 0.5) +
  scale_fill_manual(
    values = earth_tones,
    name = "Ecoregion",
    guide = guide_legend(order = 1)   # Ecoregion legend first
  ) +
  ggnewscale::new_scale_fill() +
  geom_sf(data = abrupt, aes(fill = ThawType),
          shape = 21, color = "black", size = 4.5) +
  scale_fill_manual(
    values = feature_colors,
    name = "Thaw Type",
    guide = guide_legend(order = 2)   # Thaw Type legend second
  ) +
  labs(
    title = "Abrupt thaw locations across Alaska",
    x = NULL, y = NULL
  ) +
  annotation_scale(
    location = "br",
    width_hint = 0.25,
    unit_category = "metric",
    text_cex = 1.5,
    height = unit(0.5, "cm"),
    bar_cols = c("black", "white")
  ) +
  theme_minimal() +
  theme(
    legend.position = c(0.05, 0.95),
    legend.justification = c(0,1),
    legend.background = element_rect(fill = "white", color = "black", size = 0.5),
    plot.title = element_text(hjust = 0.5, size = 22),
    legend.title = element_text(size = 14, face = "bold"),
    legend.text = element_text(size = 12),
    axis.text = element_text(size = 18),
    axis.title = element_text(size = 20)
  )


#-------------------------------------------------------------------------------
# Step 6: Pie chart and bar chart of thaw feature types per ecoregion
#-------------------------------------------------------------------------------

abrupt_joined <- st_join(abrupt, eco_dissolved %>% select(LEVEL_2_REVISED)) %>%
  mutate(LEVEL_2_REVISED = ifelse(is.na(LEVEL_2_REVISED), "Bering Tundra", LEVEL_2_REVISED))

# Get number of unique thaw features for each ecoregion
thaw_counts <- abrupt_joined %>%
  st_drop_geometry() %>%
  group_by(LEVEL_2_REVISED, FeatureCategory) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(LEVEL_2_REVISED) %>%
  mutate(percent = count / sum(count) * 100)

# Join ecoregions with the database
# Set NA points to Bering Tundra. This is because some points fall outside
# of the ecoregion map. When I checked the physical location of these points,
# I realized they were all along the coastline in the Bering Tundra region
eco_joined <- st_join(clipped_sf, eco_dissolved %>% select(LEVEL_2_REVISED)) %>%
  mutate(LEVEL_2_REVISED = ifelse(is.na(LEVEL_2_REVISED), "Bering Tundra", LEVEL_2_REVISED))

# Summarize total abrupt thaw feature counts per ecoregion and proportions
eco_summary <- eco_joined %>%
  group_by(LEVEL_2_REVISED) %>%
  summarise(
    total_points = n(),
    abrupt_points = sum(ThawType == "Abrupt", na.rm = TRUE),
    abrupt_proportion = abrupt_points / total_points
  )

# Define pie chart colors (colorblind friendly)
feature_colors_pie <- c(
  "Wildfire-induced thaw" = "#FC8D62",
  "Thermokarst lake" = "#0072B2",
  "Thermokarst wetland" = "#A6D854",
  "Thermoerosional gully" = "#B3B3B3",
  "Thaw pond" = "#80B1D3",
  "Retrogressive thaw slump" = "#E78AC3", 
  "Active layer detachment" = "#FFD92F",
  "Thermokarst" = "#E5C494" #80B1D3
)


# Pie chart faceted by ecoregion
ggplot(thaw_counts, aes(x = "", y = percent, fill = FeatureCategory)) +
  geom_col(color = "black", width = 1) +
  coord_polar(theta = "y") +
  facet_wrap(~LEVEL_2_REVISED, scales = "free") +
  scale_fill_manual(values = feature_colors_pie) +
  theme_void() +
  theme(strip.text = element_text(size=15),
        legend.position="bottom",
        legend.title = element_text(size=14),
        legend.text = element_text(size=12),
        plot.title = element_text(size=18, face="bold", hjust=0.5, margin=margin(b=15))) +
  labs(title="Thaw Feature Distribution by Ecoregion", fill="Feature Type")

rm(list = c(
  "earth_tones", "feature_colors", "db_joined", "abrupt_joined", "eco",
  "eco_dissolved", "feature_colors_pie"
))


#-------------------------------------------------------------------------------
# Step 7: Chi-square test for ecoregion vs feature category
#-------------------------------------------------------------------------------
# Number of thaw feature type per ecoregion - long format
thaw_summary <- thaw_counts %>%
  select(LEVEL_2_REVISED, FeatureCategory, count) %>%
  pivot_wider(names_from=FeatureCategory, values_from=count, values_fill=0)

# Transpose the table
colnames(thaw_summary)[1] <- "Ecoregion"
transposed_table <- as.data.frame(t(thaw_summary))
colnames(transposed_table) <- as.character(transposed_table[1, ])
transposed_table <- transposed_table[-1, ]
transposed_table[, 1:ncol(transposed_table)] <- lapply(transposed_table[, 1:ncol(transposed_table)], as.numeric)

# Chi-square test with 1000 replicates because of sparse table
chisq_result <- chisq.test(transposed_table, simulate.p.value = TRUE, B=10000)
chisq_result



#-------------------------------------------------------------------------------
#
#
#                 Section 4: Ground Ice Map Comparison
#
#
#-------------------------------------------------------------------------------


#-------------------------------------------------------------------------------
# Step 1: Prep and import data
#-------------------------------------------------------------------------------

# Extract coordinate information
topo <- topo %>%
  rowwise() %>%
  mutate(
    coords = list(fromJSON(Geo)$coordinates),
    Longitude = coords[1],
    Latitude  = coords[2]
  ) %>%
  select(-coords)

topo.sf <- st_as_sf(topo, coords = c("Longitude", "Latitude"), crs = 4326)


# Select only ice-dependent thaw features
unique(topo.sf$FeatureCategory)
ice.thaw <- topo.sf %>%
  filter(FeatureCategory %in% c("Thermokarst",
                                "Thermokarst lake",
                                "Thermokarst wetland",
                                "Retrogressive thaw slump",
                                "Thaw Pond")) 


# Convert to SpatVector file
data_vec <- vect(ice.thaw)

# Import ground ice maps
ground_ice_jorg <- vect("") # Permafrost_AK_2008_AlbAK83.shp
ground_ice_nsidc <- vect("") # permaice.shp

# Reproject ground ice maps to match thaw points
ground_ice_jorg <- project(ground_ice_jorg, crs(data_vec))
ground_ice_nsidc <- project(ground_ice_nsidc, crs(data_vec))

#-------------------------------------------------------------------------------
# Step 2: Extract ground ice info to thaw points
#-------------------------------------------------------------------------------

data_vec$ice_class_jorgenson <- terra::extract(ground_ice_jorg["ICECLOWASS"], data_vec)$ICECLOWASS
data_vec$ice_class_nsidc  <- terra::extract(ground_ice_nsidc["CONTENT"], data_vec)$CONTENT

# Convert back to sf
ice.thaw <- st_as_sf(data_vec)

unique(data_vec$ice_class_jorgenson)
unique(data_vec$ice_class_nsidc)

#-------------------------------------------------------------------------------
# Step 3: Check agreement between ice maps
#-------------------------------------------------------------------------------

# Change NSIDC labels
# Combine High and moderate classes for better accuracy
ground_ice_nsidc$CONTENT <- case_when(
  ground_ice_nsidc$CONTENT %in% "l" ~ "Low",
  ground_ice_nsidc$CONTENT %in% "m" ~ "High/Moderate",
  ground_ice_nsidc$CONTENT %in% "h" ~ "High/Moderate",
  TRUE ~ NA_character_
)

# Change Jorgenson labels
# Combine High and moderate classes for better accuracy
ground_ice_jorg$ICECLOWASS <- case_when(
  ground_ice_jorg$ICECLOWASS %in% "Low" ~ "Low",
  ground_ice_jorg$ICECLOWASS %in% "Moderate" ~ "High/Moderate",
  ground_ice_jorg$ICECLOWASS %in% "High" ~ "High/Moderate",
  TRUE ~ NA_character_
)

# Reproject both vectors
ground_ice_jorg_proj <- project(ground_ice_jorg, "EPSG:3338")
ground_ice_nsidc_proj <- project(ground_ice_nsidc, "EPSG:3338")

# Get the extent of the Jorgenson map
jorg_ext <- ext(ground_ice_jorg_proj)
nsidc_crop <- intersect(ground_ice_nsidc_proj, ground_ice_jorg_proj)
plot(nsidc_crop)

# Convert to sf
jorg_sf <- st_as_sf(ground_ice_jorg_proj)
nsidc_sf <- st_as_sf(nsidc_crop)

# Fix invalid geometries
jorg_sf <- st_make_valid(jorg_sf)
nsidc_sf <- st_make_valid(nsidc_sf)

# Define raster resolution
res_m <- 1000  # meters

# Create an empty raster covering the area of both datasets
r_template <- rast(ground_ice_jorg_proj)  
res(r_template) <- res_m
crs(r_template) <- crs(ground_ice_jorg_proj)

# Rasterize Jorgenson
r_jorg <- rasterize(ground_ice_jorg_proj, r_template, field = "ICECLOWASS")
plot(r_jorg)

# Rasterize NSIDC
r_nsidc <- rasterize(nsidc_crop, r_template, field = "CONTENT")
plot(r_nsidc)

# Convert to data frame for plotting
df <- as.data.frame(c(r_jorg, r_nsidc), xy = TRUE)
names(df) <- c("x", "y", "Jorgenson", "NSIDC")

# Convert to character
df$Jorgenson <- as.character(df$Jorgenson)
df$NSIDC <- as.character(df$NSIDC)

# Create agreement column
df$agreement <- dplyr::case_when(
  is.na(df$Jorgenson) | is.na(df$NSIDC) ~ "Missing data",
  df$Jorgenson == df$NSIDC ~ paste0("Agree"),
  TRUE ~ "Disagree"
)

# Make factor with all three levels
all_levels <- c("Agree", "Disagree", "Other")

df$agree_map <- factor(ifelse(df$agreement == "Agree", "Agree", "Other"), levels = all_levels)
df$disagree_map <- factor(ifelse(df$agreement == "Disagree", "Disagree", "Other"), levels = all_levels)

plot_colors <- c(
  "Agree" = "#1B9E77",      # green
  "Disagree" = "#D95F02",   # red/orange
  "Other" = "#E5E5E5"       # light gray
)

# Get US states
us_states <- ne_states(country = "United States of America", returnclass = "sf")

# Subset Alaska
ak_sf <- us_states[us_states$name == "Alaska", ]

ak_sf <- st_transform(ak_sf, crs = 3338)


# Ice class agreement map
ggplot(df) +
  geom_sf(data = ak_sf, fill = "#E5E5E5", color = "black", size = 0.8) +
  geom_raster(aes(x = x, y = y, fill = agree_map)) +
  geom_sf(data = ak_sf, fill = NA, color = "black", size = 0.8) +
  scale_fill_manual(values = plot_colors, name = "Ground Ice Agreement") +
  coord_sf() +
  theme_minimal() +
  labs(title = "Ground Ice Distribution Agreement") +
  annotation_scale(
    location = "br",        # bottom left
    width_hint = 0.25,      # fraction of plot width
    unit_category = "metric", # use kilometers/meters
    text_cex = 1.5,
    height = unit(0.5, "cm"),
    bar_cols = c("black", "white")
  ) +
  theme(legend.position = "right",
        axis.title = element_blank(),
        panel.grid.major = element_line(color = "gray85"),
        axis.text = element_text(size=20),
        legend.title = element_text(size=14)# remove tick marks
  )

# Ice class disagreement map
ggplot(df) +
  geom_sf(data = ak_sf, fill = "#E5E5E5", color = "black", size = 0.8) +
  geom_raster(aes(x = x, y = y, fill = disagree_map)) +
  geom_sf(data = ak_sf, fill = NA, color = "black", size = 0.8) +
  scale_fill_manual(values = plot_colors, name = "Ground Ice Agreement") +
  coord_sf() +
  theme_minimal() +
  labs(title = "Ground Ice Distribution Disagreement") +
  annotation_scale(
    location = "br",        # bottom left
    width_hint = 0.25,      # fraction of plot width
    unit_category = "metric", # use kilometers/meters
    text_cex = 1.5,
    height = unit(0.5, "cm"),
    bar_cols = c("black", "white")
  ) +
  theme(legend.position = "right",
        axis.title = element_blank(),
        panel.grid.major = element_line(color = "gray85"),
        axis.text = element_text(size=20),
        legend.title = element_text(size=14)    # remove tick marks
  )

# Calculate percernt of each state that is in agreement or disagreement
# Convert df to sf object
df_sf <- st_as_sf(df, coords = c("x", "y"), crs = 3338)  

# Clip to Alaska 
df_sf <- df_sf[ak_sf, ]

# Remove classes that are not High/moderate or Low
df_modified <- df[df$agreement != "Missing data",]

# Count pixels by agreement class
area_summary <- df_modified %>%
  st_drop_geometry() %>%
  group_by(agreement) %>%
  summarise(count = n()) %>%
  mutate(percent = 100 * count / sum(count))

area_summary

#-------------------------------------------------------------------------------
# Step 4: Calculate proportions of thaw types in each ground ice class
#-------------------------------------------------------------------------------

# Define function to calculate proportions
proportions <- function(df, ice_col) {
  df %>%
    filter(!is.na(.data[[ice_col]])) %>%
    group_by(ThawType, .data[[ice_col]]) %>%
    summarise(n = n(), .groups = "drop") %>%
    group_by(ThawType) %>%
    mutate(prop = n / sum(n),
           perc = prop * 100)
}

# Calculate proportions
proportions_jorg <- proportions(ice.thaw, "ice_class_jorgenson")
proportions_nsidc <- proportions(ice.thaw, "ice_class_nsidc")


#-------------------------------------------------------------------------------
# Step 5: Highlight discrepancies with ice-dependent abrupt thaw points
#-------------------------------------------------------------------------------
ice.thaw <- ice.thaw %>%
  mutate(
    highlight_jorg = ifelse(
      ThawType == "Abrupt" & ice_class_jorgenson %in% c("Unfrozen", "Low", "Variable"),
      "Highlight", "Other"
    ),
    highlight_nsidc = ifelse(
      ThawType == "Abrupt" & ice_class_nsidc %in% c("l"),  # adjust levels if needed
      "Highlight", "Other"
    )
  )

#-------------------------------------------------------------------------------
# Step 6: Plot discrepancies
#-------------------------------------------------------------------------------

# Import Alaska shapefile
alaska_sf <- states(cb = TRUE) %>% 
  filter(STUSPS == "AK")

# Reproject to Alaska Albers Equal Area (EPSG:3338)
alaska_proj   <- st_transform(alaska_sf, 3338)
ice.thaw <- st_transform(ice.thaw, 3338)

# Extract coordinates
ice.thaw_df <- ice.thaw %>%
  mutate(X = st_coordinates(.)[,1],
         Y = st_coordinates(.)[,2])

# Jorgenson map
ggplot() +
  geom_sf(data = alaska_proj, fill = "#E5E5E5", color = "black", size = 0.8) +
  geom_point(data = ice.thaw_df %>% filter(highlight_jorg == "Other"),
             aes(x = X, y = Y),
             fill = "#80B1D3",
             color = "#0072B2",
             shape = 21,
             size = 5) +
  geom_point(data = ice.thaw_df %>% filter(highlight_jorg == "Highlight"),
             aes(X, Y),
             fill = "#FFCC99",
             color = "#D55E00",
             shape = 21,
             size = 5,
             stroke = 0.3) +
  annotation_scale(
    location = "br",
    width_hint = 0.25,
    unit_category = "metric",
    text_cex = 1.5,
    height = unit(0.5, "cm"),
    bar_cols = c("black", "white")
  ) +
  labs(
    title = "",
    x = NULL, y = NULL
  ) +
  theme_minimal() +
  theme(
    panel.grid.major = element_line(color = "gray85"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 20),
    legend.position = "none"
  )


# NSIDC map
ggplot() +
  geom_sf(data = alaska_proj, fill = "#E5E5E5", color = "black", size = 0.8) +
  geom_point(data = ice.thaw_df %>% filter(highlight_nsidc == "Other"),
             aes(x = X, y = Y),
             fill = "#80B1D3",
             color = "#0072B2",
             shape = 21,
             size = 5) +
  geom_point(data = ice.thaw_df %>% filter(highlight_nsidc == "Highlight"),
             aes(X, Y),
             fill = "#FFCC99",
             color = "#D55E00",
             shape = 21,
             size = 5,
             stroke = 0.3) +
  annotation_scale(
    location = "br",
    width_hint = 0.25,
    unit_category = "metric",
    text_cex = 1.5,
    height = unit(0.5, "cm"),
    bar_cols = c("black", "white")
  ) +
  labs(
    title = "",
    x = NULL, y = NULL
  ) +
  theme_minimal() +
  theme(
    panel.grid.major = element_line(color = "gray85"),
    plot.title = element_text(hjust = 0.5, size = 20),
    axis.text = element_text(size = 20),
    legend.position = "none"
  )

#-------------------------------------------------------------------------------
#
#
#                 Section 5: Yedoma Map Comparison
#
#
#-------------------------------------------------------------------------------

yedoma <- st_read("") # IRYP_v2_yedoma_domain.shp

# Import Alaska shapefile
alaska_sf <- states(cb = TRUE) %>% 
  filter(STUSPS == "AK")

# Reproject both to Alaska Albers Equal Area (EPSG:3338)
yedoma_proj   <- st_transform(yedoma, 3338)
alaska_proj   <- st_transform(alaska_sf, 3338)

# Fix invalid geometries just in case
yedoma_proj  <- st_make_valid(yedoma_proj)
alaska_proj  <- st_make_valid(alaska_proj)

# Do the intersection in projected space
yedoma_clipped <- st_intersection(yedoma_proj, alaska_proj)

# Dissolve ecoregions
yedoma_dissolved <- yedoma_clipped %>%
  group_by(STATEFP) %>%
  summarise(geometry = st_union(geometry), .groups = "drop") %>%
  st_as_sf()

topo.sf <- topo.sf %>%
  mutate(
    ice.thaw = case_when(
      FeatureCategory %in% c("Thermokarst", 
                             "Thermokarst lake", 
                             "Thermokarst wetland", 
                             "Retrogressive thaw slump",
                             "Thaw Pond") ~ "Ice-dependent thaw process",
      TRUE ~ "Not ice-dependent thaw process"
    )
  )

topo.sf   <- st_transform(topo.sf, 3338)

n.i <- topo.sf[topo.sf$ice.thaw == "Not ice-dependent thaw process",]
i <- topo.sf[topo.sf$ice.thaw == "Ice-dependent thaw process",]

ni_coords <- st_coordinates(n.i)
ni_df <- cbind(as.data.frame(ni_coords), n.i)  # keep the attributes

i_coords <- st_coordinates(i)
i_df <- cbind(as.data.frame(i_coords), i)  # keep the attributes


alaska_sf <- st_transform(alaska_sf, 3338)
yedoma_dissolved <- st_transform(yedoma_dissolved, 3338)
ni_df <- cbind(st_coordinates(st_transform(n.i, 3338)), n.i)
i_df <- cbind(st_coordinates(st_transform(i, 3338)), i)

ggplot() +
  geom_sf(data = alaska_proj, fill = "#E5E5E5", color = "black", size = 0.7) +
  geom_sf(data = yedoma_dissolved, fill = "darkgoldenrod2", color = NA) +
  geom_point(
    data = ni_df,
    aes(X, Y, fill = ice.thaw),
    shape = 21,
    color = "deeppink4",
    size = 5
  ) +
  geom_sf(data = yedoma_dissolved, fill = NA, color = "white", size = 3) +
  scale_fill_manual(values = c("Not ice-dependent thaw process" = "#FBB4B4"), name = "Thaw Type") +
  annotation_scale(
    location = "br",
    width_hint = 0.25,
    unit_category = "metric",
    text_cex = 1.5,
    height = unit(0.5, "cm"),
    bar_cols = c("black", "white")
  ) +
  theme_minimal() +
  theme(
    panel.grid.major = element_line(color = "gray85"),
    axis.title = element_blank(),
    axis.text = element_text(size = 20),
    plot.title = element_text(hjust = 0.5, size = 18),
    legend.title = element_text(size = 14),
    legend.position = "right"
  )

ggplot() +
  geom_sf(data = alaska_proj, fill = "#E5E5E5", color = "black", size = 0.7) +
  geom_sf(data = yedoma_dissolved, fill = "darkgoldenrod2", color = NA) +
  geom_point(
    data = i_df,
    aes(X, Y, fill = ice.thaw),
    shape = 21,
    color = "#0072B2",
    size = 5
  ) +
  geom_sf(data = yedoma_dissolved, fill = NA, color = "white", size = 3) +
  scale_fill_manual(values = c("Ice-dependent thaw process" = "#80B1D3"), name = "Thaw Type") +
  annotation_scale(
    location = "br",
    width_hint = 0.25,
    unit_category = "metric",
    text_cex = 1.5,
    height = unit(0.5, "cm"),
    bar_cols = c("black", "white")
  ) +
  theme_minimal() +
  theme(
    axis.title = element_blank(),
    axis.text = element_text(size = 20),
    plot.title = element_text(hjust = 0.5, size = 18),
    legend.title = element_text(size = 14),
    legend.position = "right"
  )

# Reproject if needed (same trick as before)
ice.thaw <- st_transform(ice.thaw, st_crs(yedoma_dissolved))

# This adds yedoma attributes to thaw features that fall inside
thaw_yedoma <- st_join(ice.thaw, yedoma_dissolved, join = st_intersects)

sum(!is.na(thaw_yedoma$STATEFP))   # number overlapping
nrow(thaw_yedoma)    

prop_overlap <- mean(!is.na(thaw_yedoma$STATEFP))
prop_overlap

# Recode overlap column
thaw_yedoma$yedoma_overlap <- "Yedoma"

thaw_yedoma <- thaw_yedoma %>%
  mutate(yedoma_overlap = ifelse(is.na(STATEFP), "Not Yedoma", yedoma_overlap))

# Make sure Alaska outline is in same CRS
alaska_outline <- st_transform(alaska_sf, st_crs(thaw_yedoma))

# Calculate percentage of Alaska that is Yedoma (sq meters)
alaska_area <- st_area(alaska_proj) %>% sum()
yedoma_area <- st_area(yedoma_dissolved) %>% sum()

# Percent of Alaska covered by Yedoma
percent_yedoma <- as.numeric(yedoma_area / alaska_area * 100)
percent_yedoma

# Percent of total thaw points in Yedoma domain
topo_in_yedoma <- st_within(topo.sf, yedoma_dissolved, sparse = FALSE)

# Count how many points are inside
n_inside <- sum(topo_in_yedoma)
n_total <- nrow(topo.sf)

# Calculate percentage
percent_inside <- (n_inside / n_total) * 100

cat(round(percent_inside, 1), "% of topo.sf points fall within the Yedoma domain.\n")
