Collection of training data for remote sensing model building

Tutorial: EON Summer School 2024

Author

Paul Magdon, University of Applied Sciences and Arts (HAWK)

Published

August 29, 2023

#install.packages("devtools")
devtools::install_github("bleutner/RStoolbox")

Skipping install of 'RStoolbox' from a github remote, the SHA1 (4abdd691) has not changed since last install.
  Use `force = TRUE` to force installation

library(sf)

Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.3.1; sf_use_s2() is TRUE

library(RStoolbox)

This is version 0.4.0 of RStoolbox


        ____       __              ____
       / __ \_____/ /_____  ____  / / /_  ____  _  __
      / /_/ / ___/ __/ __ \/ __ \/ / __ \/ __ \| |/_/
     / _, _(__  ) /_/ /_/ / /_/ / / /_/ / /_/ />  <
    /_/ |_/____/\__/\____/\____/_/_.___/\____/_/|_|

library(raster)

Lade nötiges Paket: sp

library(knitr)
library(ggplot2)
library(rmarkdown)
library(patchwork)


Attache Paket: 'patchwork'

Das folgende Objekt ist maskiert 'package:raster':

    area

library(mapview)
library(kableExtra)
library(rmarkdown)
library(rprojroot)

Data sets

In this tutorial we will work with a Sentinel-2 scene from 18/06/2022 from the National Park, Harz. We will also use the boundary of the National Park to define our study area. Before we can start you may download the S2 Scene from the following link: S2-download. Place this file into the data folder of this tutorial.

# create a string containing the current working directory
wd=paste0(find_rstudio_root_file(),"/tdv_session/data/")

#Import the boundary of the n
np_boundary = st_transform(st_read(paste0(wd,"nlp-harz_aussengrenze.gpkg")),25832)

Reading layer `nlp-harz_aussengrenze' from data source 
  `/home/creu/edu/courses/EON/EON2024/tdv_session/data/nlp-harz_aussengrenze.gpkg' 
  using driver `GPKG'
Simple feature collection with 1 feature and 3 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: 591196.6 ymin: 5725081 xmax: 619212.6 ymax: 5751232
Projected CRS: WGS 84 / UTM zone 32N

s2  <- raster::brick(paste0(wd,"S2B_MSIL2A_20220618T102559_N0400_R10_resampled_harz_np.tif"))

names(s2)<-c('blue','green','red','vnir1','vnir2','vnir3','nir1','nir2','swir')

s2 <-raster::mask(s2,np_boundary)

Anaylsing the spectral variablity within the study area

If we have no access to prior information on our target variable in the study area we can use the spectral variability as a proxy for the variability of the target variable. By using the spectral variability as a sampling criterion we also ensure, that we cover the spectral range with our sampling.

Dimension reduction (PCA)

In a fist step we reduce the dimensions of the 9 Sentinel-2 bands while maintaining most of the information, using a principal component analysis (PCA).

#PCA
pca<-RStoolbox::rasterPCA(s2,nSamples = 5000, spca=TRUE )


|---------|---------|---------|---------|
=========================================

ggRGB(pca$map,1,2,3, stretch="lin", q=0)

Warning in matrix(z, nrow = nrow(rr), ncol = ncol(rr), byrow = TRUE):
Datenlänge [168814] ist kein Teiler oder Vielfaches der Anzahl der Zeilen [684]

From the output of the PCA we see that we can capture 92% of the variability with the first two components. Thus we will only use the PC1 and PC2 for the subsequent analysis.

Unsupervised clustering

In the next step we run an unsupervised classification of the PC1 and PC2 to get a clustered map. For the unsupervised classification we need to take a decision on the number of classes/clusters to be created. Here we will take n=5 classes. However, depending on the target variable this value need to be adjusted.

set.seed(2222)
cluster <- unsuperClass(pca$map[[c('PC1','PC2')]], nSamples = 100, nClasses = 5, nStarts = 5)


## Plots
colors <- rainbow(5)
plot(cluster$map, col = colors, legend = TRUE, axes = TRUE, box =TRUE)

The map shows a clear spatial patterns related to the elevation, tree species and vitality status of the Nationalpark forests.

Create a stratified sample

In the next step we take a stratified random sample with \(n=10\) points from each of the 5 spectral classes.

samples <- sampleStratified(raster(cluster$map),size = 10,na.rm=T,xy=T,sp=T)
samples$class_unsupervised <- as.factor(samples$class_unsupervised)
my.palette <- rainbow(5)
point.size <- 0.5

np.layer <- list("sp.polygons", as_Spatial(np_boundary))


spplot(samples,'class_unsupervised', col.regions = my.palette, sp.layout = np.layer,
    cex = point.size, main = "Stratified random sample for training")

We can now print the sample plot list as following:

kable(samples@data[c('x','y','class_unsupervised')], caption='Training plot list') %>%
  kable_styling(fixed_thead = T) %>% scroll_box(height = "400px")

Training plot list
x	y	class_unsupervised
609575	5733215	1
612875	5737945	1
609055	5741925	1
611945	5745525	1
607895	5742735	1
613125	5745475	1
610605	5746575	1
615495	5739075	1
606645	5735815	1
610505	5737485	1
602355	5735115	2
610315	5736295	2
606945	5731415	2
608305	5736795	2
596535	5731695	2
605055	5733975	2
593735	5730215	2
608655	5741055	2
613595	5739115	2
617625	5736975	2
593095	5727625	3
597765	5732435	3
593425	5725375	3
614425	5749695	3
596125	5727055	3
608775	5748725	3
608715	5745335	3
615945	5741135	3
598975	5733895	3
593055	5725865	3
612165	5747125	4
599005	5734325	4
612835	5748025	4
616925	5738205	4
600815	5739035	4
607915	5739665	4
611225	5742515	4
608325	5743895	4
614725	5740365	4
610485	5736815	4
602085	5733055	5
618485	5738035	5
612315	5747205	5
602575	5737435	5
608225	5735605	5
602625	5733495	5
606215	5740715	5
608845	5745255	5
607085	5734795	5
608175	5736215	5

Implement a plot design

# Create a training data set by extracting the mean value of all pixels touching
# a buffered area with 13m around the plot center
train<-raster::extract(s2,samples,sp=T,buffer=13,fun='mean')
mapview(train, zcol="class_unsupervised",
        map.types = c("Esri.WorldShadedRelief", "OpenStreetMap.DE"))+
  mapview(np_boundary,alpha.regions = 0.2, aplha = 1)

Compare the pixel value range between the sample and the image

image.sample<-raster::sampleRandom(s2,size=100000)
image.sample<- as.data.frame(image.sample)
image.sample$group<-'image'

train.df<- train@data[,names(s2)]
train.df$group<-'train'

df <- rbind(image.sample,train.df)

blue <-ggplot(df, aes(blue,fill=group)) + theme_classic()+
        geom_histogram(
        aes(y=after_stat(density)),alpha=0.2, color='gray80',
        position='identity',bins=30)

green <-ggplot(df, aes(green,fill=group)) + theme_classic()+
        geom_histogram(
        aes(y=after_stat(density)),alpha=0.2, color='gray80',
        position='identity',bins=30)

nir1<-ggplot(df, aes(nir1,fill=group)) + theme_classic()+
      geom_histogram(
      aes(y=after_stat(density)),alpha=0.2, color='gray80',
      position='identity',bins=30)

swir<-ggplot(df, aes(swir,fill=group)) + theme_classic()+
      geom_histogram(
      aes(y=after_stat(density)),alpha=0.2, color='gray80',
      position='identity',bins=30)

blue+green+nir1+swir+plot_layout(ncol=2)