Collection of training data for remote sensing model building

Tutorial: EON Summer School 2024

Author

Paul Magdon, University of Applied Sciences and Arts (HAWK)

Published

August 29, 2023

#install.packages("devtools")
devtools::install_github("bleutner/RStoolbox")
Skipping install of 'RStoolbox' from a github remote, the SHA1 (4abdd691) has not changed since last install.
  Use `force = TRUE` to force installation
library(sf)
Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.3.1; sf_use_s2() is TRUE
library(RStoolbox)
This is version 0.4.0 of RStoolbox

        ____       __              ____
       / __ \_____/ /_____  ____  / / /_  ____  _  __
      / /_/ / ___/ __/ __ \/ __ \/ / __ \/ __ \| |/_/
     / _, _(__  ) /_/ /_/ / /_/ / / /_/ / /_/ />  <
    /_/ |_/____/\__/\____/\____/_/_.___/\____/_/|_|
  
library(raster)
Lade nötiges Paket: sp
library(knitr)
library(ggplot2)
library(rmarkdown)
library(patchwork)

Attache Paket: 'patchwork'
Das folgende Objekt ist maskiert 'package:raster':

    area
library(mapview)
library(kableExtra)
library(rmarkdown)
library(rprojroot)

Data sets

In this tutorial we will work with a Sentinel-2 scene from 18/06/2022 from the National Park, Harz. We will also use the boundary of the National Park to define our study area. Before we can start you may download the S2 Scene from the following link: S2-download. Place this file into the data folder of this tutorial.

# create a string containing the current working directory
wd=paste0(find_rstudio_root_file(),"/tdv_session/data/")

#Import the boundary of the n
np_boundary = st_transform(st_read(paste0(wd,"nlp-harz_aussengrenze.gpkg")),25832)
Reading layer `nlp-harz_aussengrenze' from data source 
  `/home/creu/edu/courses/EON/EON2024/tdv_session/data/nlp-harz_aussengrenze.gpkg' 
  using driver `GPKG'
Simple feature collection with 1 feature and 3 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: 591196.6 ymin: 5725081 xmax: 619212.6 ymax: 5751232
Projected CRS: WGS 84 / UTM zone 32N
s2  <- raster::brick(paste0(wd,"S2B_MSIL2A_20220618T102559_N0400_R10_resampled_harz_np.tif"))

names(s2)<-c('blue','green','red','vnir1','vnir2','vnir3','nir1','nir2','swir')

s2 <-raster::mask(s2,np_boundary)

Anaylsing the spectral variablity within the study area

If we have no access to prior information on our target variable in the study area we can use the spectral variability as a proxy for the variability of the target variable. By using the spectral variability as a sampling criterion we also ensure, that we cover the spectral range with our sampling.

Dimension reduction (PCA)

In a fist step we reduce the dimensions of the 9 Sentinel-2 bands while maintaining most of the information, using a principal component analysis (PCA).

#PCA
pca<-RStoolbox::rasterPCA(s2,nSamples = 5000, spca=TRUE )

|---------|---------|---------|---------|
=========================================
                                          
ggRGB(pca$map,1,2,3, stretch="lin", q=0)
Warning in matrix(z, nrow = nrow(rr), ncol = ncol(rr), byrow = TRUE):
Datenlänge [168814] ist kein Teiler oder Vielfaches der Anzahl der Zeilen [684]

From the output of the PCA we see that we can capture 92% of the variability with the first two components. Thus we will only use the PC1 and PC2 for the subsequent analysis.

Unsupervised clustering

In the next step we run an unsupervised classification of the PC1 and PC2 to get a clustered map. For the unsupervised classification we need to take a decision on the number of classes/clusters to be created. Here we will take n=5 classes. However, depending on the target variable this value need to be adjusted.

set.seed(2222)
cluster <- unsuperClass(pca$map[[c('PC1','PC2')]], nSamples = 100, nClasses = 5, nStarts = 5)


## Plots
colors <- rainbow(5)
plot(cluster$map, col = colors, legend = TRUE, axes = TRUE, box =TRUE)

The map shows a clear spatial patterns related to the elevation, tree species and vitality status of the Nationalpark forests.

Create a stratified sample

In the next step we take a stratified random sample with \(n=10\) points from each of the 5 spectral classes.

samples <- sampleStratified(raster(cluster$map),size = 10,na.rm=T,xy=T,sp=T)
samples$class_unsupervised <- as.factor(samples$class_unsupervised)
my.palette <- rainbow(5)
point.size <- 0.5

np.layer <- list("sp.polygons", as_Spatial(np_boundary))


spplot(samples,'class_unsupervised', col.regions = my.palette, sp.layout = np.layer,
    cex = point.size, main = "Stratified random sample for training")

We can now print the sample plot list as following:

kable(samples@data[c('x','y','class_unsupervised')], caption='Training plot list') %>%
  kable_styling(fixed_thead = T) %>% scroll_box(height = "400px")
Training plot list
x y class_unsupervised
609575 5733215 1
612875 5737945 1
609055 5741925 1
611945 5745525 1
607895 5742735 1
613125 5745475 1
610605 5746575 1
615495 5739075 1
606645 5735815 1
610505 5737485 1
602355 5735115 2
610315 5736295 2
606945 5731415 2
608305 5736795 2
596535 5731695 2
605055 5733975 2
593735 5730215 2
608655 5741055 2
613595 5739115 2
617625 5736975 2
593095 5727625 3
597765 5732435 3
593425 5725375 3
614425 5749695 3
596125 5727055 3
608775 5748725 3
608715 5745335 3
615945 5741135 3
598975 5733895 3
593055 5725865 3
612165 5747125 4
599005 5734325 4
612835 5748025 4
616925 5738205 4
600815 5739035 4
607915 5739665 4
611225 5742515 4
608325 5743895 4
614725 5740365 4
610485 5736815 4
602085 5733055 5
618485 5738035 5
612315 5747205 5
602575 5737435 5
608225 5735605 5
602625 5733495 5
606215 5740715 5
608845 5745255 5
607085 5734795 5
608175 5736215 5

Implement a plot design

# Create a training data set by extracting the mean value of all pixels touching
# a buffered area with 13m around the plot center
train<-raster::extract(s2,samples,sp=T,buffer=13,fun='mean')
mapview(train, zcol="class_unsupervised",
        map.types = c("Esri.WorldShadedRelief", "OpenStreetMap.DE"))+
  mapview(np_boundary,alpha.regions = 0.2, aplha = 1)

Compare the pixel value range between the sample and the image

image.sample<-raster::sampleRandom(s2,size=100000)
image.sample<- as.data.frame(image.sample)
image.sample$group<-'image'

train.df<- train@data[,names(s2)]
train.df$group<-'train'

df <- rbind(image.sample,train.df)

blue <-ggplot(df, aes(blue,fill=group)) + theme_classic()+
        geom_histogram(
        aes(y=after_stat(density)),alpha=0.2, color='gray80',
        position='identity',bins=30)

green <-ggplot(df, aes(green,fill=group)) + theme_classic()+
        geom_histogram(
        aes(y=after_stat(density)),alpha=0.2, color='gray80',
        position='identity',bins=30)

nir1<-ggplot(df, aes(nir1,fill=group)) + theme_classic()+
      geom_histogram(
      aes(y=after_stat(density)),alpha=0.2, color='gray80',
      position='identity',bins=30)

swir<-ggplot(df, aes(swir,fill=group)) + theme_classic()+
      geom_histogram(
      aes(y=after_stat(density)),alpha=0.2, color='gray80',
      position='identity',bins=30)

blue+green+nir1+swir+plot_layout(ncol=2)