Merge pull request #1181 from M3nin0/fix/segs-base-classify

gilbertocamara · web-flow · commit ea05a26ed75a · 2024-07-26T15:50:25.000+02:00
diff --git a/R/api_classify.R b/R/api_classify.R
@@ -227,6 +227,8 @@
 #' in the classified images for each corresponding year.
 #'
 #' @param  tile       Single tile of a data cube.
+#' @param  bands      Bands to extract time series
+#' @param  base_bands Base bands to extract values
 #' @param  ml_model   Model trained by \code{\link[sits]{sits_train}}.
 #' @param  block      Optimized block to be read into memory.
 #' @param  roi        Region of interest.
@@ -241,6 +243,8 @@
 #' @param  progress   Show progress bar?
 #' @return List of the classified raster layers.
 .classify_vector_tile <- function(tile,
+                                  bands,
+                                  base_bands,
                                   ml_model,
                                   block,
                                   roi,
@@ -322,6 +326,8 @@
         # Extract segments time series
         segments_ts <- .segments_poly_read(
             tile = tile,
+            bands = bands,
+            base_bands = base_bands,
             chunk = chunk,
             n_sam_pol = n_sam_pol,
             impute_fn = impute_fn
diff --git a/R/api_segments.R b/R/api_segments.R
@@ -264,7 +264,7 @@
         y = data,
         by = c(pol_id = "polygon_id")
     ) |>
-    dplyr::filter(.data[["pol_id"]] %in% unique(data[["polygon_id"]]))
+        dplyr::filter(.data[["pol_id"]] %in% unique(data[["polygon_id"]]))
 }
 #'
 #' @name .segments_data_read
@@ -273,78 +273,45 @@
 #' @description     Using the segments as polygons, get all time series
 #'
 #' @param tile       tile of regular data cube
+#' @param bands      Bands to extract time series
+#' @param base_bands Base bands to extract values
 #' @param chunk      A chunk to be read.
 #' @param n_sam_pol  Number of samples per polygon to be read.
 #' @param impute_fn  Imputation function to remove NA
 #'
 #' @return  samples associated to segments
-.segments_poly_read <- function(tile, chunk, n_sam_pol, impute_fn) {
+.segments_poly_read <- function(
+        tile, bands, base_bands, chunk, n_sam_pol, impute_fn
+) {
+    # define bands variables
+    ts_bands <- NULL
+    ts_bands_base <- NULL
     # For cubes that have a time limit to expire (MPC cubes only)
     tile <- .cube_token_generator(cube = tile)
-    # Read and preprocess values of cloud
-    # Get tile bands
-    tile_bands <- .tile_bands(
-        tile = tile,
-        add_cloud = FALSE
-    )
     # Read and preprocess values of each band
-    ts_bands <- purrr::map(tile_bands, function(band) {
+    ts_bands <- purrr::map(bands, function(band) {
         # extract band values
-        values <- .tile_extract_segments(
+        .tile_read_segments(
             tile = tile,
             band = band,
-            chunk = chunk
-        )
-        pol_id <- values[, "pol_id"]
-        values <- values[, -1:0]
-        # Correct missing, minimum, and maximum values and
-        # apply scale and offset.
-        band_conf <- .tile_band_conf(
-            tile = tile,
-            band = band
+            chunk = chunk,
+            impute_fn = impute_fn
         )
-        miss_value <- .miss_value(band_conf)
-        if (.has(miss_value)) {
-            values[values == miss_value] <- NA
-        }
-        min_value <- .min_value(band_conf)
-        if (.has(min_value)) {
-            values[values < min_value] <- NA
-        }
-        max_value <- .max_value(band_conf)
-        if (.has(max_value)) {
-            values[values > max_value] <- NA
-        }
-        scale <- .scale(band_conf)
-        if (.has(scale) && scale != 1) {
-            values <- values * scale
-        }
-        offset <- .offset(band_conf)
-        if (.has(offset) && offset != 0) {
-            values <- values + offset
-        }
-        # are there NA values? interpolate them
-        if (anyNA(values)) {
-            values <- impute_fn(values)
-        }
-        # Returning extracted time series
-        return(list(pol_id, c(t(unname(values)))))
     })
     # extract the pol_id information from the first element of the list
     pol_id <- ts_bands[[1]][[1]]
     # remove the first element of the each list and retain the second
     ts_bands <- purrr::map(ts_bands, function(ts_band) ts_band[[2]])
     # rename the resulting list
-    names(ts_bands) <- tile_bands
+    names(ts_bands) <- bands
     # transform the list to a tibble
     ts_bands <-  tibble::as_tibble(ts_bands)
     # retrieve the dates of the tile
     n_dates <- length(.tile_timeline(tile))
     # find how many samples have been extracted from the tile
     n_samples <- nrow(ts_bands) / n_dates
     # include sample_id information
-    ts_bands[["sample_id"]] <- rep(seq_len(n_samples),
-                                   each = n_dates)
+    ts_bands[["sample_id"]] <- rep(seq_len(n_samples), each = n_dates)
     # include timeline
     ts_bands[["Index"]] <- rep(
         .tile_timeline(tile),
@@ -353,23 +320,70 @@
     # nest the values by bands
     ts_bands <- tidyr::nest(
         ts_bands,
-        time_series = c("Index", dplyr::all_of(tile_bands))
+        time_series = c("Index", dplyr::all_of(bands))
     )
+    # if `base_bands` is available, transform it to the same structure as
+    # `time_series`
+    if (.has(base_bands)) {
+        # read base data values
+        ts_bands_base <- purrr::map(base_bands, function(band) {
+            .tile_read_segments(
+                tile = .tile_base_info(tile),
+                band = band,
+                chunk = chunk,
+                impute_fn = impute_fn
+            )
+        })
+        # remove polygon ids
+        ts_bands_base <- purrr::map(ts_bands_base,
+                                    function(ts_band) ts_band[[2]])
+        # name band values
+        names(ts_bands_base) <- base_bands
+        # merge band values
+        ts_bands_base <- dplyr::bind_cols(ts_bands_base)
+        # include time reference in the data
+        ts_bands_base[["Index"]] <- rep(
+            .tile_timeline(.tile_base_info(tile)),
+            times = n_samples
+        )
+        # include base bands data
+        ts_bands <- tibble::add_column(ts_bands, ts_bands_base)
+        # nest base data
+        ts_bands <- tidyr::nest(
+            ts_bands,
+            base_data = c("Index", dplyr::all_of(base_bands))
+        )
+    }
     # include the ids of the polygons
     ts_bands[["polygon_id"]] <- pol_id
-    # we do the unnest again because we do not know the polygon id index
-    ts_bands <- tidyr::unnest(ts_bands, "time_series")
-    # remove pixels where all timeline was NA
-    ts_bands <-  tidyr::drop_na(ts_bands)
-    # nest the values by bands
-    ts_bands <- tidyr::nest(
-        ts_bands,
-        time_series = c("Index", dplyr::all_of(tile_bands))
-    )
+    # define which columns must be checked to drop na values
+    drop_na_colums <- list("time_series" = bands)
+    # if `base_bands` is available, to `base_data` column is used
+    if (.has(base_bands)) {
+        drop_na_colums[["base_data"]] <- base_bands
+    }
+    # drop na values
+    for (colname in names(drop_na_colums)) {
+        # we do the unnest again because we do not know the polygon id index
+        ts_bands <- tidyr::unnest(ts_bands, colname)
+        # remove pixels where all timeline was NA
+        ts_bands <-  tidyr::drop_na(ts_bands)
+        # nest the values by bands
+        ts_bands <- tidyr::nest(
+            ts_bands,
+            !!colname := c("Index", dplyr::all_of(drop_na_colums[[colname]]))
+        )
+    }
+    # define columns used in the points nest
+    points_nest <- c("sample_id", "time_series")
+    # if `base_bands` is available, include it in the nest operation
+    if (.has(base_bands)) {
+        points_nest <- c(points_nest, "base_data")
+    }
     # nest the values by sample_id and time_series
     ts_bands <- tidyr::nest(
         ts_bands,
-        points = c("sample_id", "time_series")
+        points = points_nest
     )
     # retrieve the segments
     segments <- .vector_read_vec(chunk[["segments"]][[1]])
@@ -404,5 +418,10 @@
     samples <- .discard(samples, "sample_id")
     # set sits class
     class(samples) <- c("sits", class(samples))
+    # define `sits_base` if applicable
+    if (.has(base_bands)) {
+        class(samples) <- c("sits_base", class(samples))
+    }
+    # return!
     return(samples)
 }
diff --git a/R/api_tile.R b/R/api_tile.R
@@ -1443,11 +1443,11 @@ NULL
 #' @keywords internal
 #' @author Gilberto Camara, \email{gilberto.camara@@inpe.br}
 #'
-#' @description Given a data cube, retrieve the time series of XY locations
+#' @description Given a tile and a band, return a set of values for segments
 #'
-#' @param tile ... TODO: document
-#' @param band ...
-#' @param chunk ...
+#' @param tile        Metadata about a data cube (one tile)
+#' @param band        Name of the band to the retrieved
+#' @param chunk       Chunk from where segments data will be extracted
 #'
 #' @return Data.frame with values per polygon.
 .tile_extract_segments <- function(tile, band, chunk) {
@@ -1472,6 +1472,63 @@ NULL
     # Return values
     return(as.matrix(values))
 }
+#' @title Given a tile and a band, return a set of values for segments ready to
+#' be used
+#' @name .tile_extract_segments
+#' @noRd
+#' @keywords internal
+#' @author Gilberto Camara, \email{gilberto.camara@@inpe.br}
+#'
+#' @description Given a tile and a band, return a set of values for segments
+#' ready to be used (e.g., scale transformation, offset, and so on).
+#'
+#' @param tile        Metadata about a data cube (one tile)
+#' @param band        Name of the band to the retrieved
+#' @param chunk       Chunk from where segments data will be extracted
+#' @param impute_fn  Imputation function to remove NA
+#'
+#' @return Data.frame with values per polygon.
+.tile_read_segments <- function(tile, band, chunk, impute_fn) {
+    values <- .tile_extract_segments(
+        tile = tile,
+        band = band,
+        chunk = chunk
+    )
+    pol_id <- values[, "pol_id"]
+    values <- values[, -1:0]
+    # Correct missing, minimum, and maximum values and
+    # apply scale and offset.
+    band_conf <- .tile_band_conf(
+        tile = tile,
+        band = band
+    )
+    miss_value <- .miss_value(band_conf)
+    if (.has(miss_value)) {
+        values[values == miss_value] <- NA
+    }
+    min_value <- .min_value(band_conf)
+    if (.has(min_value)) {
+        values[values < min_value] <- NA
+    }
+    max_value <- .max_value(band_conf)
+    if (.has(max_value)) {
+        values[values > max_value] <- NA
+    }
+    scale <- .scale(band_conf)
+    if (.has(scale) && scale != 1) {
+        values <- values * scale
+    }
+    offset <- .offset(band_conf)
+    if (.has(offset) && offset != 0) {
+        values <- values + offset
+    }
+    # are there NA values? interpolate them
+    if (anyNA(values)) {
+        values <- impute_fn(values)
+    }
+    # Returning extracted time series
+    return(list(pol_id, c(t(unname(values)))))
+}
 #' @title Check if tile contains cloud band
 #' @keywords internal
 #' @noRd
diff --git a/R/sits_classify.R b/R/sits_classify.R
@@ -429,6 +429,16 @@ sits_classify.segs_cube <- function(data,
     }
     if (.has(filter_fn))
         .check_filter_fn(filter_fn)
+    # By default, base bands is null.
+    base_bands <- NULL
+    if (.cube_is_base(data)) {
+        # Get base bands
+        base_bands <- intersect(
+            .ml_bands(ml_model), .cube_bands(.cube_base_info(data))
+        )
+    }
+    # get non-base bands
+    bands <- setdiff(.ml_bands(ml_model), base_bands)
     # Check memory and multicores
     # Get block size
     block <- .raster_file_blocksize(.raster_open_rast(.tile_path(data)))
@@ -471,6 +481,8 @@ sits_classify.segs_cube <- function(data,
         # Classify all the segments for each tile
         class_vector <- .classify_vector_tile(
             tile = tile,
+            bands = bands,
+            base_bands = base_bands,
             ml_model = ml_model,
             block = block,
             roi = roi,