A word about classifications
In this course, we focused on and made use of regression models. We used Random Forest to predict a continuous variable (the mean vegetation height) and also used a continuous error metric, the root mean squared error. If we do not have a continuous variable to predict (e.g. forest types or, more broadly, the land cover), we speak of classification models. No worries! Apart from some minor tweaks, the workflow is exactly the same as with regression models (we can even use the same model type).
# packages: terra/sf for raster and vector data, caret/ranger for modelling
library(terra)
library(sf)
library(caret)
library(ranger)

setwd("~/casestudies/course_sen_lidar/")

# the Sentinel bands + NDVI serve as the predictor stack again
sen <- rast("../data/sentinel/lahntal_sentinel_NDVI.tif")
names(sen) <- c("blue", "green", "red", "nir", "ndvi")

# digitised training polygons; column "Klasse" holds the land-cover class
lc <- st_read("../data/classification/trainingsgebiete.gpkg")

# quick visual check: the classes alone, then overlaid on the imagery
plot(lc["Klasse"])
plotRGB(sen, stretch = "lin"); plot(lc["Klasse"], add = TRUE)
Data cleanup
# Remove classes with too few training polygons to learn from.
table(lc$Klasse)
lc <- lc[!(lc$Klasse %in% c("Baum", "kuenstliche flaechen")), ]
table(lc$Klasse)

# The raster and the polygons must carry the same CRS description, otherwise
# extract() complains even though the coordinates match (proj4 vs. WKT2
# representations of the same CRS).
# NOTE: projection() belongs to the retired 'raster' package, which is not
# loaded here — with terra/sf objects use crs() / st_crs() instead.
crs(sen)
st_crs(lc)
# relabel the raster's CRS with the vector's WKT — no reprojection happens,
# the pixel coordinates themselves are unchanged
crs(sen) <- st_crs(lc)$wkt
Create data frame
(see: The Modelling workflow)
# Get the pixel values underneath every training polygon.
# terra::extract() already returns a data.frame with an ID column linking
# each pixel to its source polygon (the raster-era `df = TRUE` argument is
# not part of terra's interface).
val <- extract(sen, lc)

# Look-up table: polygon ID -> class label.
# Use seq_len(nrow(lc)) rather than a hard-coded polygon count (465) so the
# code still works after classes were dropped above.
poly_class <- data.frame(ID = seq_len(nrow(lc)), Klasse = lc$Klasse)

# Attach the class label to every extracted pixel. merge() is base R —
# the original join() would require plyr, which is not loaded.
val <- merge(val, poly_class, by = "ID")

# save to the same location the modelling section reads from
saveRDS(val, file = "../data/classification/training_set.RDS")
Modelling
df <- readRDS("../data/classification/training_set.RDS")
head(df)

# ranger/caret need the response as a factor. st_read() may have returned
# the class column as character (droplevels() would then error), so coerce
# first — as.factor() is a no-op on an existing factor — and drop the empty
# levels left over from the classes we removed earlier.
df$Klasse <- droplevels(as.factor(df$Klasse))

# from here on this is the exact same workflow as with the regression model
head(df)

# drop the polygon ID — it must not be used as a predictor (for now!)
df$ID <- NULL

# reproducible 60/40 train/test split, stratified by class
set.seed(1)
train_id <- caret::createDataPartition(y = df$Klasse, times = 1, p = 0.6, list = FALSE)
train_df <- df[train_id, ]
test_df <- df[-train_id, ]
# Hyper-parameter grid for ranger.
# For classification the split rule is "gini" (impurity) instead of the
# variance-based rule used for regression.
tgrid <- expand.grid(
  .mtry = 1:5,
  .splitrule = "gini",
  .min.node.size = c(5, 10, 15, 20)
)

# Fit the random forest via caret, cross-validating over the grid.
rfmodel <- caret::train(
  Klasse ~ .,
  data = train_df,
  method = "ranger",
  tuneGrid = tgrid,
  trControl = trainControl(method = "cv"),
  num.trees = 200
)

# persist the fitted model for the visualisation/validation sections
saveRDS(rfmodel, "../data/classification/lcc_ranger.RDS")
Visuals
library(ggplot2)

rfmodel <- readRDS("../data/classification/lcc_ranger.RDS")
rfmodel

# Predict the whole scene. `sen` is a terra SpatRaster, so use
# terra::predict() — raster::predict() targets the retired 'raster'
# package, which is never loaded in this script.
p <- terra::predict(object = sen, model = rfmodel)
plot(p)

# inspect the category table of the classified raster
# (terra's equivalent of the old raster-package p@data@attributes slot)
levels(p)

# themed map of the classification result
RStoolbox::ggR(p, geom_raster = TRUE)+
  scale_fill_manual(name = "Classes", values = c("grey", "gold1", "yellow3", "red", "lightgreen", "darkgreen", "blue"))+
  theme(axis.text.y = element_text(angle = 90, hjust = 0.5),
        axis.title = element_blank(), legend.position = "bottom", panel.background = element_blank(),
        panel.grid = element_line(color = "grey50"))
Validation
# predict the held-out test set
test_df$Pred <- stats::predict(object = rfmodel, test_df)

# contingency (confusion) table: reference classes vs. predictions
ct <- table(test_df$Klasse, test_df$Pred)
ct

# overall accuracy
# sum of all correct classifications / n
table(test_df$Klasse == test_df$Pred)
sum(test_df$Klasse == test_df$Pred) / nrow(test_df)

# Cohen's kappa
# - takes class sizes and chance agreement into account
# - kappa of 0: no better than a random classification
# - kappa of 1: perfect agreement
# BUG FIX: base R's kappa() estimates the *condition number* of a matrix —
# it has nothing to do with Cohen's kappa. caret::confusionMatrix() reports
# overall accuracy and Cohen's kappa (plus per-class statistics) correctly.
caret::confusionMatrix(ct)