From 44b7098fa117e2ae30e1628f7b389c15aa6aba98 Mon Sep 17 00:00:00 2001
From: Beni Stocker <benjamin.stocker@gmail.com>
Date: Fri, 29 Mar 2024 00:12:51 +0100
Subject: [PATCH] added two functions

---
 R/align_events.R              | 117 ++++++++++++++++++++++++++++++++++
 R/get_consecutive.R           |  98 ++++++++++++++++++++++++++++
 vignettes/event_detection.Rmd |   2 +-
 3 files changed, 216 insertions(+), 1 deletion(-)
 create mode 100644 R/align_events.R
 create mode 100644 R/get_consecutive.R

diff --git a/R/align_events.R b/R/align_events.R
new file mode 100644
index 0000000..62c227b
--- /dev/null
+++ b/R/align_events.R
@@ -0,0 +1,117 @@
+#' Aligns data by events
+#'
+#' Uses a vectory specifying whether data falls into an event to reshape data, aligning by the onset of the event
+#'
+#' @param df A data frame containing all data continuously along time, required columns: \code{"site", "date"}.
+#' @param events A data frame with columns \code{idx_start} and \code{len}, specifying event start and length, referring to the row index of \code{df}.
+#' \code{events} is the output of a function call to \code{get_consecutive}.
+#' @param dovars A vector of character strings specifying which columns (by column name) of \code{df} to re-arrange.
+#' @param leng_threshold An integer specifying the minum number of consecutive dates required to define an event.
+#' All events of length lower than \code{leng_threshold} are dropped.
+#' @param before An integer specifying the number of days before the event onset to be retained in re-arranged data
+#' @param after An integer specifying the number of days after the event onset to be retained in re-arranged data
+#' @param do_norm A logical specifying whether re-arranged data is to be normalised by the median value of the bin
+#' (number of bins given by argument \code{nbins}) before the event onset, given by argument \code{normbin}. Defaults to \code{FALSE}.
+#' @param nbins An integer, specifying the number of bins used to determine median values before event onset. Only used when code{do_norm=TRUE}. Defaults to 6.
+#' @param normbin An integer, specifying the bin number just before the event onset, used for normalisation. Only used when code{do_norm=TRUE}. Defaults to 2.
+#'
+#' @return A named list of data frames (\code{list( "df_idx_event", "df_idx_event_aggbyidx_event")}) containing data from all events and \code{before + after}
+#' dates (relative to event onset) with additional columns named \code{"inst"}, defining the event number (instance), and \code{"idx_event"}, defining
+#' the date relative to the respective event onset. The data frame \code{"df_idx_event"} contains rearranged, but otherwise unchanged data (unless
+#' \code{do_norm}=TRUE). The data frame \code{"df_idx_event_aggbyidx_event"} containes data aggregated across events with the mean and quantiles given for each
+#' \code{"idx_event"}.
+#' @export
+#'
+#' @examples df_alg <- align_events( df, truefalse, before=30, after=300 )
+#'
+align_events <- function(
+    df,
+    events,
+    dovars = names(df),
+    leng_threshold,
+    before,
+    after,
+    do_norm=FALSE,
+    nbins=6,
+    normbin=2
+    ){
+  ##--------------------------------------------------------
+  ## Re-arrange data, aligning by beginning of events
+  ## Creates data frame where not all rows are retained from df
+  ## and columns added for 'idx_event' (number of day relative to onset of event)
+  ## and 'iinst' number of event to which row belongs.
+  ##--------------------------------------------------------
+  if (nrow(events) > 1){
+
+    df_idx_event <- tibble()
+    for ( iinst in 1:nrow(events) ){
+      # idx_event = 0 is beginning of event
+      idx_event <- seq( from = -before,
+                   to = events$len[iinst],
+                   by = 1
+                   )
+      idxs <- idx_event + events$idx_start[iinst]
+
+      # avoid negative row indexes (possible consequence of using 'before > 0')
+      drophead <- which( idxs < 1 )
+      if (length(drophead) > 0){
+        idxs <- idxs[ -drophead ]
+        idx_event <- idx_event[ -drophead ]
+      }
+      addrows <- df |>
+        slice( idxs ) |>
+        mutate( idx_event = idx_event,
+                inst = iinst
+                )
+      df_idx_event <- df_idx_event |>
+        bind_rows( addrows )
+    }
+
+    ##--------------------------------------------------------
+    ## Normalise re-arranged data relative to a certain bin's median
+    ##--------------------------------------------------------
+    if (do_norm){
+      ## Bins for different variables
+      bins  <- seq(
+        from = -before,
+        to = after,
+        by = (after + before + 1)/nbins )
+
+      ## add bin information based on idx_event to expanded df
+      df_idx_event <- df_idx_event |>
+        mutate(
+          inbin  = cut( as.numeric(idx_event), breaks = bins )
+          )
+
+      tmp <- df_idx_event |>
+        dplyr::filter(!is.na(inbin)) |>
+        group_by( inbin ) |>
+        summarise_at( vars(one_of(dovars)), funs(median( ., na.rm=TRUE )) )
+
+      norm <- slice(tmp, normbin)
+
+      ## subtract from all values
+      df_idx_event <- df_idx_event |> mutate_at( vars(one_of(dovars)), funs(. - norm$.) )
+
+    }
+
+  } else {
+
+    df_idx_event <- NULL
+
+  }
+
+  return( df_idx_event )
+
+}
+
+q33 <- function( vec, ... ){
+  quantile( vec, 0.33, ...)
+}
+
+q66 <- function( vec, ... ){
+  quantile( vec, 0.66, ...)
+}
+
+
+
diff --git a/R/get_consecutive.R b/R/get_consecutive.R
new file mode 100644
index 0000000..57a7282
--- /dev/null
+++ b/R/get_consecutive.R
@@ -0,0 +1,98 @@
+#' Identify events
+#'
+#' Identifies events as periods where of consecutively TRUE values in a boolean
+#' vector.
+#'
+#' @param vec A vector of boolean values. Consecutive TRUE vakues designate an event.
+#' @param merge_threshold An integer value specifying the threshold of the gap
+#' length below in units of time steps which
+#' gaps between events are ignored and the two events on either side of the gap
+#' are merged into a single events. Defaults to NA (ignored). Is ignored if
+#' \code{do_merge=FALSE}
+#' @param leng_threshold An integer specifying the minimum length required for
+#' creating an event. Defaults to 3.
+#' @param do_merge A logical specifying whether to merge events if the gap between
+#' them is small (smaller than \code{merge_threshold}).
+#'
+#' @return A data frame containing information about the start date and length
+#' of each detected event
+#' @export
+#'
+get_consecutive <- function(
+    vec,
+    merge_threshold = NA,
+    leng_threshold = 3,
+    do_merge = FALSE
+){
+
+  ## replace NAs with FALSE (no drought). This is needed because of NAs at head or tail
+  vec[ which(is.na(vec)) ] <- FALSE
+
+  ## identifies periods where 'vec' true for consecutive days of length>leng_threshold and
+  ## creates data frame holding each instance's info: start of drought by index
+  ## in 'vec' and length (number of days thereafter)
+  instances <- data.frame( idx_start=c(), len=c() )
+  consecutive_vec <- rep( NA, length( vec ) )
+  nvec  <- 0
+  ninst <- 0
+  for ( idx in 1:length( vec ) ){
+    if (vec[idx]){
+      nvec <- nvec + 1
+    } else {
+      if (nvec >= leng_threshold) {
+        ## create instance
+        ninst <- ninst + 1
+        addrow <- data.frame( idx_start = idx-(nvec), len = nvec )
+        instances <- rbind( instances, addrow )
+      }
+      nvec <- 0
+    }
+    consecutive_vec[idx] <- nvec
+  }
+  if (nvec > leng_threshold){
+    ## create a last instance if the last vec period extends to the end of the time series
+    ninst <- ninst + 1
+    addrow <- data.frame( idx_start=idx-(nvec), len=nvec )
+    instances <- rbind( instances, addrow )
+  }
+
+  # get info about gap between events
+  instances <- instances |>
+    mutate(gap_before = idx_start - (lag(idx_start) + lag(len)))
+
+  if (nrow(instances) > 0){
+    if (do_merge && nrow(instances) > 1){
+
+      instances_merged <- instances[1,]
+      idx_merged <- 1
+      for (idx in 2:nrow(instances)){
+        if (instances$gap_before[idx] > merge_threshold){
+
+          # create new sequence
+          idx_merged <- idx_merged + 1
+          instances_merged <- bind_rows(
+            instances_merged,
+            instances[idx,]
+          )
+
+          # edit length of previously recorded instance
+          instances_merged$len[idx_merged - 1] <- instances$idx_start[idx - 1] + instances$len[idx - 1] - instances_merged$idx_start[idx_merged - 1]
+        }
+      }
+
+      # if all is merged until the end
+      instances_merged$len[idx_merged] <- instances$idx_start[idx] + instances$len[idx] - instances_merged$idx_start[idx_merged]
+
+      instances <- instances_merged[,c("idx_start", "len")]
+    } else {
+      instances <- instances[,c("idx_start", "len")]
+      if (nrow(instances) == 1){
+        if (instances$idx_start == 0)
+          instances$idx_start <- 1
+      }
+    }
+
+  }
+
+  return( instances )
+}
diff --git a/vignettes/event_detection.Rmd b/vignettes/event_detection.Rmd
index cd21ba6..e774c60 100644
--- a/vignettes/event_detection.Rmd
+++ b/vignettes/event_detection.Rmd
@@ -16,7 +16,7 @@ knitr::opts_chunk$set(echo = TRUE)
 ```{r message=FALSE}
 library(dplyr)
 library(ggplot2)
-library(GECOr)
+library(rgeco)
 library(here)
 ```