@@ -20,20 +20,31 @@ def _recall_values(labels, x_absolute=False, y_absolute=False):
 
 
 def _loss_value(labels):
-    def _auc_trapezoidal(x, y):
-        x = np.array(x)
-        y = np.array(y)
-        return np.sum((y[1:] + y[:-1]) / 2 * np.diff(x))
-
     Ny = sum(labels)
     Nx = len(labels)
 
-    best_auc = Nx * Ny - 0.5 - ((Ny * Ny) / 2)
-    actual_auc = _auc_trapezoidal(*_recall_values(labels,
-                                                  x_absolute=True,
-                                                  y_absolute=True))
+    # The best AUC represents the entire area under the perfect curve: the
+    # total area Nx * Ny minus the area above the perfect curve, which is the
+    # sum of a series given by (Ny * Ny) / 2 plus 0.5 to account for the
+    # boundary.
+    best_auc = Nx * Ny - (((Ny * Ny) / 2) + 0.5)
+
+    # Compute recall values (y) based on the provided labels. We don't need x
+    # values because the points are uniformly spaced.
+    y = np.array(_recall_values(labels, x_absolute=True, y_absolute=True)[1])
+
+    # The actual AUC is calculated by approximating the area under the curve
+    # using the trapezoidal rule: (y[1:] + y[:-1]) / 2 takes the average height
+    # between consecutive y values, and we sum them up.
+    actual_auc = np.sum((y[1:] + y[:-1]) / 2)
+
+    # The worst AUC represents the area under the worst-case step curve, which
+    # is simply the area under the recall curve where all positive labels are
+    # clumped at the end, calculated as (Ny * Ny) / 2.
     worst_auc = ((Ny * Ny) / 2)
 
+    # The normalized loss is the difference between the best AUC and the actual
+    # AUC, normalized by the range between the best and worst AUCs.
     normalized_loss = (best_auc - actual_auc) / (best_auc - worst_auc) if best_auc != worst_auc else 0  # noqa: E501
 
     return normalized_loss
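
As a sanity check on the arithmetic above, here is a minimal, self-contained sketch of the same normalized-loss computation. It assumes that `_recall_values(labels, x_absolute=True, y_absolute=True)[1]` is equivalent to the cumulative count of relevant records, so `np.cumsum` is used as a stand-in; the function name `normalized_loss_sketch` is illustrative and not part of the codebase.

```python
import numpy as np


def normalized_loss_sketch(labels):
    """Illustrative re-implementation of the normalized loss.

    Assumes the absolute recall curve is the cumulative count of relevant
    records (a stand-in for what _recall_values is expected to return).
    """
    labels = np.asarray(labels)
    Ny = labels.sum()   # number of relevant records
    Nx = len(labels)    # total number of records

    # Recall curve in absolute terms: relevant records found after each step.
    y = np.cumsum(labels)

    # Trapezoidal area under the recall curve (unit spacing on the x axis).
    actual_auc = np.sum((y[1:] + y[:-1]) / 2)

    # Perfect ranking: all relevant records screened first.
    best_auc = Nx * Ny - (((Ny * Ny) / 2) + 0.5)
    # Worst ranking: all relevant records screened last.
    worst_auc = (Ny * Ny) / 2

    if best_auc == worst_auc:
        return 0
    return (best_auc - actual_auc) / (best_auc - worst_auc)


print(normalized_loss_sketch([1, 1, 0, 0]))  # perfect ranking -> 0.0
print(normalized_loss_sketch([0, 0, 1, 1]))  # worst ranking  -> 1.0
```

On a perfect ranking the loss comes out as 0 and on the worst ranking as 1, which matches the role of the best/worst AUC bounds in the diff.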