From 44b7098fa117e2ae30e1628f7b389c15aa6aba98 Mon Sep 17 00:00:00 2001 From: Beni Stocker Date: Fri, 29 Mar 2024 00:12:51 +0100 Subject: [PATCH] added two functions --- R/align_events.R | 117 ++++++++++++++++++++++++++++++++++ R/get_consecutive.R | 98 ++++++++++++++++++++++++++++ vignettes/event_detection.Rmd | 2 +- 3 files changed, 216 insertions(+), 1 deletion(-) create mode 100644 R/align_events.R create mode 100644 R/get_consecutive.R diff --git a/R/align_events.R b/R/align_events.R new file mode 100644 index 0000000..62c227b --- /dev/null +++ b/R/align_events.R @@ -0,0 +1,117 @@ +#' Aligns data by events +#' +#' Uses a vectory specifying whether data falls into an event to reshape data, aligning by the onset of the event +#' +#' @param df A data frame containing all data continuously along time, required columns: \code{"site", "date"}. +#' @param events A data frame with columns \code{idx_start} and \code{len}, specifying event start and length, referring to the row index of \code{df}. +#' \code{events} is the output of a function call to \code{get_consecutive}. +#' @param dovars A vector of character strings specifying which columns (by column name) of \code{df} to re-arrange. +#' @param leng_threshold An integer specifying the minum number of consecutive dates required to define an event. +#' All events of length lower than \code{leng_threshold} are dropped. +#' @param before An integer specifying the number of days before the event onset to be retained in re-arranged data +#' @param after An integer specifying the number of days after the event onset to be retained in re-arranged data +#' @param do_norm A logical specifying whether re-arranged data is to be normalised by the median value of the bin +#' (number of bins given by argument \code{nbins}) before the event onset, given by argument \code{normbin}. Defaults to \code{FALSE}. +#' @param nbins An integer, specifying the number of bins used to determine median values before event onset. Only used when code{do_norm=TRUE}. Defaults to 6. +#' @param normbin An integer, specifying the bin number just before the event onset, used for normalisation. Only used when code{do_norm=TRUE}. Defaults to 2. +#' +#' @return A named list of data frames (\code{list( "df_idx_event", "df_idx_event_aggbyidx_event")}) containing data from all events and \code{before + after} +#' dates (relative to event onset) with additional columns named \code{"inst"}, defining the event number (instance), and \code{"idx_event"}, defining +#' the date relative to the respective event onset. The data frame \code{"df_idx_event"} contains rearranged, but otherwise unchanged data (unless +#' \code{do_norm}=TRUE). The data frame \code{"df_idx_event_aggbyidx_event"} containes data aggregated across events with the mean and quantiles given for each +#' \code{"idx_event"}. +#' @export +#' +#' @examples df_alg <- align_events( df, truefalse, before=30, after=300 ) +#' +align_events <- function( + df, + events, + dovars = names(df), + leng_threshold, + before, + after, + do_norm=FALSE, + nbins=6, + normbin=2 + ){ + ##-------------------------------------------------------- + ## Re-arrange data, aligning by beginning of events + ## Creates data frame where not all rows are retained from df + ## and columns added for 'idx_event' (number of day relative to onset of event) + ## and 'iinst' number of event to which row belongs. + ##-------------------------------------------------------- + if (nrow(events) > 1){ + + df_idx_event <- tibble() + for ( iinst in 1:nrow(events) ){ + # idx_event = 0 is beginning of event + idx_event <- seq( from = -before, + to = events$len[iinst], + by = 1 + ) + idxs <- idx_event + events$idx_start[iinst] + + # avoid negative row indexes (possible consequence of using 'before > 0') + drophead <- which( idxs < 1 ) + if (length(drophead) > 0){ + idxs <- idxs[ -drophead ] + idx_event <- idx_event[ -drophead ] + } + addrows <- df |> + slice( idxs ) |> + mutate( idx_event = idx_event, + inst = iinst + ) + df_idx_event <- df_idx_event |> + bind_rows( addrows ) + } + + ##-------------------------------------------------------- + ## Normalise re-arranged data relative to a certain bin's median + ##-------------------------------------------------------- + if (do_norm){ + ## Bins for different variables + bins <- seq( + from = -before, + to = after, + by = (after + before + 1)/nbins ) + + ## add bin information based on idx_event to expanded df + df_idx_event <- df_idx_event |> + mutate( + inbin = cut( as.numeric(idx_event), breaks = bins ) + ) + + tmp <- df_idx_event |> + dplyr::filter(!is.na(inbin)) |> + group_by( inbin ) |> + summarise_at( vars(one_of(dovars)), funs(median( ., na.rm=TRUE )) ) + + norm <- slice(tmp, normbin) + + ## subtract from all values + df_idx_event <- df_idx_event |> mutate_at( vars(one_of(dovars)), funs(. - norm$.) ) + + } + + } else { + + df_idx_event <- NULL + + } + + return( df_idx_event ) + +} + +q33 <- function( vec, ... ){ + quantile( vec, 0.33, ...) +} + +q66 <- function( vec, ... ){ + quantile( vec, 0.66, ...) +} + + + diff --git a/R/get_consecutive.R b/R/get_consecutive.R new file mode 100644 index 0000000..57a7282 --- /dev/null +++ b/R/get_consecutive.R @@ -0,0 +1,98 @@ +#' Identify events +#' +#' Identifies events as periods where of consecutively TRUE values in a boolean +#' vector. +#' +#' @param vec A vector of boolean values. Consecutive TRUE vakues designate an event. +#' @param merge_threshold An integer value specifying the threshold of the gap +#' length below in units of time steps which +#' gaps between events are ignored and the two events on either side of the gap +#' are merged into a single events. Defaults to NA (ignored). Is ignored if +#' \code{do_merge=FALSE} +#' @param leng_threshold An integer specifying the minimum length required for +#' creating an event. Defaults to 3. +#' @param do_merge A logical specifying whether to merge events if the gap between +#' them is small (smaller than \code{merge_threshold}). +#' +#' @return A data frame containing information about the start date and length +#' of each detected event +#' @export +#' +get_consecutive <- function( + vec, + merge_threshold = NA, + leng_threshold = 3, + do_merge = FALSE +){ + + ## replace NAs with FALSE (no drought). This is needed because of NAs at head or tail + vec[ which(is.na(vec)) ] <- FALSE + + ## identifies periods where 'vec' true for consecutive days of length>leng_threshold and + ## creates data frame holding each instance's info: start of drought by index + ## in 'vec' and length (number of days thereafter) + instances <- data.frame( idx_start=c(), len=c() ) + consecutive_vec <- rep( NA, length( vec ) ) + nvec <- 0 + ninst <- 0 + for ( idx in 1:length( vec ) ){ + if (vec[idx]){ + nvec <- nvec + 1 + } else { + if (nvec >= leng_threshold) { + ## create instance + ninst <- ninst + 1 + addrow <- data.frame( idx_start = idx-(nvec), len = nvec ) + instances <- rbind( instances, addrow ) + } + nvec <- 0 + } + consecutive_vec[idx] <- nvec + } + if (nvec > leng_threshold){ + ## create a last instance if the last vec period extends to the end of the time series + ninst <- ninst + 1 + addrow <- data.frame( idx_start=idx-(nvec), len=nvec ) + instances <- rbind( instances, addrow ) + } + + # get info about gap between events + instances <- instances |> + mutate(gap_before = idx_start - (lag(idx_start) + lag(len))) + + if (nrow(instances) > 0){ + if (do_merge && nrow(instances) > 1){ + + instances_merged <- instances[1,] + idx_merged <- 1 + for (idx in 2:nrow(instances)){ + if (instances$gap_before[idx] > merge_threshold){ + + # create new sequence + idx_merged <- idx_merged + 1 + instances_merged <- bind_rows( + instances_merged, + instances[idx,] + ) + + # edit length of previously recorded instance + instances_merged$len[idx_merged - 1] <- instances$idx_start[idx - 1] + instances$len[idx - 1] - instances_merged$idx_start[idx_merged - 1] + } + } + + # if all is merged until the end + instances_merged$len[idx_merged] <- instances$idx_start[idx] + instances$len[idx] - instances_merged$idx_start[idx_merged] + + instances <- instances_merged[,c("idx_start", "len")] + } else { + instances <- instances[,c("idx_start", "len")] + if (nrow(instances) == 1){ + if (instances$idx_start == 0) + instances$idx_start <- 1 + } + } + + } + + return( instances ) +} diff --git a/vignettes/event_detection.Rmd b/vignettes/event_detection.Rmd index cd21ba6..e774c60 100644 --- a/vignettes/event_detection.Rmd +++ b/vignettes/event_detection.Rmd @@ -16,7 +16,7 @@ knitr::opts_chunk$set(echo = TRUE) ```{r message=FALSE} library(dplyr) library(ggplot2) -library(GECOr) +library(rgeco) library(here) ```