Merge pull request #9 from r-causal/ess

Add `ess()`
r-causal · Jan 9, 2025 · 3d0c933 · 3d0c933
2 parents c093f22 + 64f7722
commit 3d0c933
Show file tree

Hide file tree

Showing 9 changed files with 225 additions and 22 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -28,4 +28,4 @@ Config/testthat/edition: 3
 Encoding: UTF-8
 LazyData: true
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.1
diff --git a/NAMESPACE b/NAMESPACE
@@ -3,6 +3,7 @@
 export(bind_matches)
 export(contains)
 export(ends_with)
+export(ess)
 export(everything)
 export(geom_ecdf)
 export(geom_love)

diff --git a/R/ess.R b/R/ess.R
@@ -0,0 +1,48 @@
+#' Calculate the Effective Sample Size (ESS)
+#'
+#' This function computes the effective sample size (ESS) given a vector of
+#' weights, using the classical \eqn{(\sum w)^2 / \sum(w^2)} formula (sometimes
+#' referred to as "Kish's effective sample size").
+#'
+#' @param wts A numeric vector of weights (e.g., from survey or
+#'   inverse-probability weighting).
+#'
+#' @return A single numeric value representing the effective sample size.
+#'
+#' @details The effective sample size (ESS) reflects how many observations you
+#'   would have if all were equally weighted. If the weights vary substantially,
+#'   the ESS can be much smaller than the actual number of observations.
+#'   Formally:
+#'
+#' \deqn{
+#'   \mathrm{ESS} = \frac{\left(\sum_i w_i\right)^2}{\sum_i w_i^2}.
+#' }
+#'
+#' **Diagnostic Value**:
+#' * **Indicator of Weight Concentration**: A large discrepancy between ESS
+#'   and the actual sample size indicates that a few observations carry
+#'   disproportionately large weights, effectively reducing the usable
+#'   information in the dataset.
+#' * **Variance Inflation**: A small ESS signals that weighted estimates are
+#'   more sensitive to a handful of observations, inflating the variance and
+#'   standard errors.
+#' * **Practical Guidance**: If ESS is much lower than the total sample
+#'   size, it is advisable to investigate why some weights are extremely large
+#'   or small. Techniques like weight trimming or stabilized weights might be
+#'   employed to mitigate the issue.
+#'
+#' @examples
+#' # Suppose we have five observations with equal weights
+#' wts1 <- rep(1.2, 5)
+#' # returns 5, because all weights are equal
+#' ess(wts1)
+#'
+#' # If the weights vary, the ESS is smaller than 5
+#' wts2 <- c(0.5, 2, 2, 0.1, 0.8)
+#' ess(wts2)
+#'
+#' @export
+ess <- function(wts) {
+  # Kish's formula: squared total weight over the total of squared weights.
+  total_weight <- sum(wts)
+  total_squared_weight <- sum(wts^2)
+  total_weight^2 / total_squared_weight
+}
+
diff --git a/man/ess.Rd b/man/ess.Rd
diff --git a/man/geom_ecdf.Rd b/man/geom_ecdf.Rd
diff --git a/man/geom_mirror_histogram.Rd b/man/geom_mirror_histogram.Rd
diff --git a/tests/testthat/Rplots.pdf b/tests/testthat/Rplots.pdf
diff --git a/tests/testthat/_snaps/geom_mirrored_histogram.md b/tests/testthat/_snaps/geom_mirrored_histogram.md
@@ -1,12 +1,12 @@
 # geom_mirrored_histogram errors/warns correctly
 
-    Computation failed in `stat_mirror_count()`
+    Computation failed in `stat_mirror_count()`.
     Caused by error in `abort()`:
     ! Groups of three or greater not supported in `geom_mirror_histogram()`
 
 ---
 
-    Computation failed in `stat_mirror_count()`
+    Computation failed in `stat_mirror_count()`.
     Caused by error in `abort()`:
     ! No group detected.
     * Do you need to use `aes(group = ...)` with your grouping variable?

diff --git a/tests/testthat/test-ess.R b/tests/testthat/test-ess.R
@@ -0,0 +1,27 @@
+test_that("ess returns correct result for equal weights", {
+  # Five identical weights carry equal information, so the ESS equals the
+  # number of observations (5), whatever the common weight value is.
+  n_obs <- 5
+  equal_wts <- rep(2, times = n_obs)
+  expect_equal(ess(equal_wts), 5)
+})
+
+test_that("ess returns correct result for varied weights", {
+  # Fixed, unequal weights keep the test reproducible (no RNG involved)
+  wts_varied <- c(0.5, 2, 2, 0.1, 0.8)
+  # sum(w) = 5.4 and sum(w^2) = 8.9, so ESS = 5.4^2 / 8.9
+  expect_equal(ess(wts_varied), 5.4^2 / 8.9)
+  # With unequal weights the ESS is strictly below the sample size of 5
+  expect_lt(ess(wts_varied), 5)
+})
+
+test_that("ess handles one large weight", {
+  # All of the weight sits on a single observation; the other four are 0.
+  # sum(w) = 1000 and sum(w^2) = 1000^2 = 1e6, so ESS = 1e6 / 1e6 = 1.
+  dominated_wts <- c(1000, 0, 0, 0, 0)
+  expect_equal(ess(dominated_wts), 1)
+})
+
+test_that("ess gives `NaN` if all weights are 0", {
+  # With all-zero weights both sums are 0, and 0 / 0 evaluates to NaN in R.
+  all_zero <- numeric(5)
+  expect_true(is.nan(ess(all_zero)))
+})