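Update _loss_value algorithm: compute best, actual and worst AUC as step-curve sums instead of the trapezoidal approximation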

jteijema · jteijema · commit 71c6bdd7d748 · 2024-10-30T14:24:32.000+01:00
diff --git a/asreviewcontrib/insights/algorithms.py b/asreviewcontrib/insights/algorithms.py
@@ -24,24 +24,19 @@ def _loss_value(labels):
     Nx = len(labels)
 
     # The best AUC represents the entire area under the perfect curve, which is
-    # the total area Nx * Ny, minus the area above the perfect curve (which is
-    # the sum of a series with a formula (Ny * Ny) / 2) plus 0.5 to account for
-    # the boundary.
-    best_auc = Nx * Ny - (((Ny * Ny) / 2) + 0.5)
+    # the total area Nx * Ny, minus the area above the perfect curve.
+    best_auc = Nx * Ny - ((Ny * (Ny - 1)) / 2)
 
-    # Compute recall values (y) based on the provided labels. We don't need x
-    # values because the points are uniformly spaced.
+    # Compute recall values (y) based on the provided labels.
     y = np.array(_recall_values(labels, x_absolute=True, y_absolute=True)[1])
 
-    # The actual AUC is calculated by approximating the area under the curve
-    # using the trapezoidal rule. (y[1:] + y[:-1]) / 2 takes the average height
-    # between consecutive y values, and we sum them up.
-    actual_auc = np.sum((y[1:] + y[:-1]) / 2)
+    # The actual AUC is the sum of the recall curve.
+    actual_auc = np.sum(y)
 
     # The worst AUC represents the area under the worst-case step curve, which
-    # is simply the area under the recall curve where all positive labels are
-    # clumped at the end, calculated as (Ny * Ny) / 2.
-    worst_auc = ((Ny * Ny) / 2)
+    # is the area under the recall curve where all positive labels are clumped
+    # at the end.
+    worst_auc = (Ny * (Ny + 1)) / 2
 
     # The normalized loss is the difference between the best AUC and the actual
     # AUC, normalized by the range between the best and worst AUCs.