diff --git a/lite/benchmarks/benchmark_objdet.py b/lite/benchmarks/benchmark_objdet.py
index 1e307f8f4..946156e25 100644
--- a/lite/benchmarks/benchmark_objdet.py
+++ b/lite/benchmarks/benchmark_objdet.py
@@ -8,7 +8,7 @@
 import requests
 from tqdm import tqdm

-from valor_lite.detection import DataLoader
+from valor_lite.detection import DataLoader, MetricType


 class AnnotationType(str, Enum):
@@ -258,24 +258,48 @@ def run_benchmarking_analysis(
                 f"Base precomputation timed out with limit of {limit}."
             )

-        # test detailed counts with no samples
-        detailed_counts_time_no_samples, _ = time_it(
-            evaluator.compute_detailed_counts
-        )()
-
-        # test detailed counts with 3 samples
-        detailed_counts_time_three_samples, _ = time_it(
-            evaluator.compute_detailed_counts
-        )(n_samples=3)
-
-        # evaluate
+        # evaluate - base metrics only
         eval_time, metrics = time_it(evaluator.evaluate)()
-        # print(metrics)
         if eval_time > evaluation_timeout and evaluation_timeout != -1:
             raise TimeoutError(
                 f"Base evaluation timed out with {evaluator.n_datums} datums."
             )

+        # evaluate - base metrics + detailed counts with no samples
+        detailed_counts_time_no_samples, metrics = time_it(
+            evaluator.evaluate
+        )(
+            [
+                MetricType.DetailedCounts,
+                *MetricType.base_metrics(),
+            ]
+        )
+        if (
+            detailed_counts_time_no_samples > evaluation_timeout
+            and evaluation_timeout != -1
+        ):
+            raise TimeoutError(
+                f"Detailed evaluation w/ no samples timed out with {evaluator.n_datums} datums."
+            )
+
+        # evaluate - base metrics + detailed counts with 3 samples
+        detailed_counts_time_three_samples, metrics = time_it(
+            evaluator.evaluate
+        )(
+            [
+                MetricType.DetailedCounts,
+                *MetricType.base_metrics(),
+            ],
+            number_of_examples=3,
+        )
+        if (
+            detailed_counts_time_three_samples > evaluation_timeout
+            and evaluation_timeout != -1
+        ):
+            raise TimeoutError(
+                f"Detailed w/ 3 samples evaluation timed out with {evaluator.n_datums} datums."
+            )
+
         results.append(
             Benchmark(
                 limit=limit,
diff --git a/lite/tests/detection/test_detailed_counts.py b/lite/tests/detection/test_detailed_counts.py
index 6552808fc..f29aa3601 100644
--- a/lite/tests/detection/test_detailed_counts.py
+++ b/lite/tests/detection/test_detailed_counts.py
@@ -1,11 +1,11 @@
 import numpy as np
-from valor_lite.detection import DataLoader, Detection, Evaluator
+from valor_lite.detection import DataLoader, Detection, Evaluator, MetricType
 from valor_lite.detection.computation import compute_detailed_counts


 def test_detailed_counts_no_data():
     evaluator = Evaluator()
-    curves = evaluator.compute_detailed_counts()
+    curves = evaluator._compute_detailed_counts()
     assert isinstance(curves, list)
     assert len(curves) == 0

@@ -278,10 +278,11 @@ def test_detailed_counts(
     assert evaluator.n_groundtruths == 4
     assert evaluator.n_predictions == 4

-    metrics = evaluator.compute_detailed_counts(
+    metrics = evaluator.evaluate(
         iou_thresholds=[0.5],
         score_thresholds=[0.05, 0.3, 0.35, 0.45, 0.55, 0.95],
-        n_samples=1,
+        number_of_examples=1,
+        metrics_to_return=[MetricType.DetailedCounts],
     )

     uid1_rect1 = ("uid1", rect1)
@@ -293,7 +294,7 @@ def test_detailed_counts(
     uid2_rect2 = ("uid2", rect2)

     # test DetailedCounts
-    actual_metrics = [mm.to_dict() for m in metrics for mm in m]
+    actual_metrics = [m.to_dict() for m in metrics[MetricType.DetailedCounts]]
     expected_metrics = [
         {
             "type": "DetailedCounts",
@@ -521,14 +522,15 @@ def test_detailed_counts(

     # test at lower IoU threshold

-    metrics = evaluator.compute_detailed_counts(
+    metrics = evaluator.evaluate(
         iou_thresholds=[0.45],
         score_thresholds=[0.05, 0.3, 0.35, 0.45, 0.55, 0.95],
-        n_samples=1,
+        number_of_examples=1,
+        metrics_to_return=[MetricType.DetailedCounts],
     )

     # test DetailedCounts
-    actual_metrics = [mm.to_dict() for m in metrics for mm in m]
+    actual_metrics = [m.to_dict() for m in metrics[MetricType.DetailedCounts]]
     expected_metrics = [
         {
             "type": "DetailedCounts",
@@ -781,13 +783,14 @@ def test_detailed_counts_using_torch_metrics_example(
     assert evaluator.n_groundtruths == 20
     assert evaluator.n_predictions == 19

-    metrics = evaluator.compute_detailed_counts(
+    metrics = evaluator.evaluate(
         iou_thresholds=[0.5, 0.9],
         score_thresholds=[0.05, 0.25, 0.35, 0.55, 0.75, 0.8, 0.85, 0.95],
-        n_samples=1,
+        number_of_examples=1,
+        metrics_to_return=[MetricType.DetailedCounts],
     )

-    assert len(metrics) == 6
+    assert len(metrics[MetricType.DetailedCounts]) == 12

     uid0_gt_0 = ("0", (214.125, 562.5, 41.28125, 285.0))
     uid1_gt_0 = ("1", (13.0, 549.0, 22.75, 632.5))
@@ -814,7 +817,7 @@ def test_detailed_counts_using_torch_metrics_example(
     uid3_pd_4 = ("3", (75.3125, 91.875, 23.015625, 50.84375))

     # test DetailedCounts
-    actual_metrics = [mm.to_dict() for m in metrics for mm in m]
+    actual_metrics = [m.to_dict() for m in metrics[MetricType.DetailedCounts]]
     expected_metrics = [
         {
             "type": "DetailedCounts",
@@ -1628,16 +1631,17 @@ def test_detailed_counts_fp_hallucination_edge_case(
     assert evaluator.n_groundtruths == 2
     assert evaluator.n_predictions == 2

-    metrics = evaluator.compute_detailed_counts(
+    metrics = evaluator.evaluate(
         iou_thresholds=[0.5],
         score_thresholds=[0.5, 0.85],
-        n_samples=1,
+        number_of_examples=1,
+        metrics_to_return=[MetricType.DetailedCounts],
     )

-    assert len(metrics) == 1
+    assert len(metrics[MetricType.DetailedCounts]) == 1

     # test DetailedCounts
-    actual_metrics = [mm.to_dict() for m in metrics for mm in m]
+    actual_metrics = [m.to_dict() for m in metrics[MetricType.DetailedCounts]]
     expected_metrics = [
         {
             "type": "DetailedCounts",
@@ -1691,13 +1695,14 @@ def test_detailed_counts_ranked_pair_ordering(
         "n_predictions": 4,
     }

-    metrics = evaluator.compute_detailed_counts(
+    metrics = evaluator.evaluate(
         iou_thresholds=[0.5],
         score_thresholds=[0.0],
-        n_samples=0,
+        number_of_examples=0,
+        metrics_to_return=[MetricType.DetailedCounts],
     )

-    actual_metrics = [mm.to_dict() for m in metrics for mm in m]
+    actual_metrics = [m.to_dict() for m in metrics[MetricType.DetailedCounts]]
     expected_metrics = [
         {
             "type": "DetailedCounts",
diff --git a/lite/tests/detection/test_evaluator.py b/lite/tests/detection/test_evaluator.py
index d79d168e1..93185f13f 100644
--- a/lite/tests/detection/test_evaluator.py
+++ b/lite/tests/detection/test_evaluator.py
@@ -96,3 +96,25 @@ def test_no_predictions(detections_no_predictions):
         assert m in expected_metrics
     for m in expected_metrics:
         assert m in actual_metrics
+
+
+def test_metrics_to_return(basic_detections: list[Detection]):
+
+    loader = DataLoader()
+    loader.add_data(basic_detections)
+    evaluator = loader.finalize()
+
+    metrics_to_return = [
+        MetricType.AP,
+        MetricType.AR,
+    ]
+    metrics = evaluator.evaluate(metrics_to_return)
+    assert metrics.keys() == set(metrics_to_return)
+
+    metrics_to_return = [
+        MetricType.AP,
+        MetricType.AR,
+        MetricType.DetailedCounts,
+    ]
+    metrics = evaluator.evaluate(metrics_to_return)
+    assert metrics.keys() == set(metrics_to_return)
diff --git a/lite/valor_lite/detection/manager.py b/lite/valor_lite/detection/manager.py
index 7869273b0..f5705be13 100644
--- a/lite/valor_lite/detection/manager.py
+++ b/lite/valor_lite/detection/manager.py
@@ -228,8 +228,10 @@ def create_filter(

     def evaluate(
         self,
+        metrics_to_return: list[MetricType] = MetricType.base_metrics(),
         iou_thresholds: list[float] = [0.5, 0.75, 0.9],
         score_thresholds: list[float] = [0.5],
+        number_of_examples: int = 0,
         filter_: Filter | None = None,
     ) -> dict[MetricType, list]:
         """
@@ -237,10 +239,14 @@ def evaluate(

         Parameters
         ----------
+        metrics_to_return : list[MetricType]
+            A list of metrics to return in the results.
         iou_thresholds : list[float]
             A list of IoU thresholds to compute metrics over.
         score_thresholds : list[float]
             A list of score thresholds to compute metrics over.
+        number_of_examples : int, default=0
+            Number of annotation examples to return in DetailedCounts.
         filter_ : Filter, optional
             An optional filter object.

@@ -426,16 +432,27 @@ def evaluate(
                 )
             )

+        if MetricType.DetailedCounts in metrics_to_return:
+            metrics[MetricType.DetailedCounts] = self._compute_detailed_counts(
+                iou_thresholds=iou_thresholds,
+                score_thresholds=score_thresholds,
+                n_samples=number_of_examples,
+            )
+
+        for metric in set(metrics.keys()):
+            if metric not in metrics_to_return:
+                del metrics[metric]
+
         return metrics

-    def compute_detailed_counts(
+    def _compute_detailed_counts(
         self,
         iou_thresholds: list[float] = [0.5],
         score_thresholds: list[float] = [
             score / 10.0 for score in range(1, 11)
         ],
         n_samples: int = 0,
-    ) -> list[list[DetailedCounts]]:
+    ) -> list[DetailedCounts]:
         """
         Computes detailed counting metrics.

@@ -515,68 +532,62 @@ def _unpack_examples(

         n_ious, n_scores, n_labels, _ = metrics.shape
         return [
-            [
-                DetailedCounts(
-                    iou_threshold=iou_thresholds[iou_idx],
-                    label=self.index_to_label[label_idx],
-                    score_thresholds=score_thresholds,
-                    tp=metrics[iou_idx, :, label_idx, tp_idx]
-                    .astype(int)
-                    .tolist(),
-                    fp_misclassification=metrics[
-                        iou_idx, :, label_idx, fp_misclf_idx
-                    ]
-                    .astype(int)
-                    .tolist(),
-                    fp_hallucination=metrics[
-                        iou_idx, :, label_idx, fp_halluc_idx
-                    ]
-                    .astype(int)
-                    .tolist(),
-                    fn_misclassification=metrics[
-                        iou_idx, :, label_idx, fn_misclf_idx
-                    ]
-                    .astype(int)
-                    .tolist(),
-                    fn_missing_prediction=metrics[
-                        iou_idx, :, label_idx, fn_misprd_idx
-                    ]
-                    .astype(int)
-                    .tolist(),
-                    tp_examples=_unpack_examples(
-                        iou_idx=iou_idx,
-                        label_idx=label_idx,
-                        type_idx=tp_idx,
-                        example_source=self.prediction_examples,
-                    ),
-                    fp_misclassification_examples=_unpack_examples(
-                        iou_idx=iou_idx,
-                        label_idx=label_idx,
-                        type_idx=fp_misclf_idx,
-                        example_source=self.prediction_examples,
-                    ),
-                    fp_hallucination_examples=_unpack_examples(
-                        iou_idx=iou_idx,
-                        label_idx=label_idx,
-                        type_idx=fp_halluc_idx,
-                        example_source=self.prediction_examples,
-                    ),
-                    fn_misclassification_examples=_unpack_examples(
-                        iou_idx=iou_idx,
-                        label_idx=label_idx,
-                        type_idx=fn_misclf_idx,
-                        example_source=self.groundtruth_examples,
-                    ),
-                    fn_missing_prediction_examples=_unpack_examples(
-                        iou_idx=iou_idx,
-                        label_idx=label_idx,
-                        type_idx=fn_misprd_idx,
-                        example_source=self.groundtruth_examples,
-                    ),
-                )
-                for iou_idx in range(n_ious)
-            ]
+            DetailedCounts(
+                iou_threshold=iou_thresholds[iou_idx],
+                label=self.index_to_label[label_idx],
+                score_thresholds=score_thresholds,
+                tp=metrics[iou_idx, :, label_idx, tp_idx].astype(int).tolist(),
+                fp_misclassification=metrics[
+                    iou_idx, :, label_idx, fp_misclf_idx
+                ]
+                .astype(int)
+                .tolist(),
+                fp_hallucination=metrics[iou_idx, :, label_idx, fp_halluc_idx]
+                .astype(int)
+                .tolist(),
+                fn_misclassification=metrics[
+                    iou_idx, :, label_idx, fn_misclf_idx
+                ]
+                .astype(int)
+                .tolist(),
+                fn_missing_prediction=metrics[
+                    iou_idx, :, label_idx, fn_misprd_idx
+                ]
+                .astype(int)
+                .tolist(),
+                tp_examples=_unpack_examples(
+                    iou_idx=iou_idx,
+                    label_idx=label_idx,
+                    type_idx=tp_idx,
+                    example_source=self.prediction_examples,
+                ),
+                fp_misclassification_examples=_unpack_examples(
+                    iou_idx=iou_idx,
+                    label_idx=label_idx,
+                    type_idx=fp_misclf_idx,
+                    example_source=self.prediction_examples,
+                ),
+                fp_hallucination_examples=_unpack_examples(
+                    iou_idx=iou_idx,
+                    label_idx=label_idx,
+                    type_idx=fp_halluc_idx,
+                    example_source=self.prediction_examples,
+                ),
+                fn_misclassification_examples=_unpack_examples(
+                    iou_idx=iou_idx,
+                    label_idx=label_idx,
+                    type_idx=fn_misclf_idx,
+                    example_source=self.groundtruth_examples,
+                ),
+                fn_missing_prediction_examples=_unpack_examples(
+                    iou_idx=iou_idx,
+                    label_idx=label_idx,
+                    type_idx=fn_misprd_idx,
+                    example_source=self.groundtruth_examples,
+                ),
+            )
             for label_idx in range(n_labels)
+            for iou_idx in range(n_ious)
         ]


diff --git a/lite/valor_lite/detection/metric.py b/lite/valor_lite/detection/metric.py
index 4a95720e9..8b8633bd5 100644
--- a/lite/valor_lite/detection/metric.py
+++ b/lite/valor_lite/detection/metric.py
@@ -21,6 +21,25 @@ class MetricType(str, Enum):
     PrecisionRecallCurve = "PrecisionRecallCurve"
     DetailedCounts = "DetailedCounts"

+    @classmethod
+    def base_metrics(cls):
+        return [
+            cls.Counts,
+            cls.Accuracy,
+            cls.Precision,
+            cls.Recall,
+            cls.F1,
+            cls.AP,
+            cls.AR,
+            cls.mAP,
+            cls.mAR,
+            cls.APAveragedOverIOUs,
+            cls.mAPAveragedOverIOUs,
+            cls.ARAveragedOverScores,
+            cls.mARAveragedOverScores,
+            cls.PrecisionRecallCurve,
+        ]
+

 @dataclass
 class Counts:
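
For reference, a minimal usage sketch of the reworked evaluate API exercised by this patch. It is an illustration only, not part of the diff: DataLoader, Detection, MetricType, MetricType.base_metrics, Evaluator.evaluate, number_of_examples, and the MetricType.DetailedCounts result key are taken from the changes above, while the helper name summarize_detailed_counts, the detections argument, and the chosen thresholds are assumed placeholders.

from valor_lite.detection import DataLoader, Detection, MetricType


def summarize_detailed_counts(detections: list[Detection]) -> list[dict]:
    # Build an evaluator from pre-constructed Detection objects
    # (hypothetical input; constructing Detection objects is out of scope here).
    loader = DataLoader()
    loader.add_data(detections)
    evaluator = loader.finalize()

    # Request the base metrics plus DetailedCounts, keeping one example
    # annotation per count, mirroring the benchmark changes above.
    metrics = evaluator.evaluate(
        [MetricType.DetailedCounts, *MetricType.base_metrics()],
        iou_thresholds=[0.5, 0.75],
        score_thresholds=[0.5],
        number_of_examples=1,
    )

    # DetailedCounts results come back as a flat list keyed by MetricType.
    return [m.to_dict() for m in metrics[MetricType.DetailedCounts]]

Note that evaluate drops any computed metric not listed in metrics_to_return, so passing only [MetricType.DetailedCounts] (as the updated tests do) yields a dictionary with that single key.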