Add Metrics to return to Lite (#765)
czaloom authored Sep 26, 2024
1 parent 7090cb8 commit 50f421d
Showing 5 changed files with 176 additions and 95 deletions.
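
For context, this change folds detailed-count computation into evaluate() behind a new metrics_to_return argument. A minimal before/after sketch of the call pattern, assuming an already-finalized Evaluator bound to the illustrative name evaluator:

from valor_lite.detection import MetricType

# Before this commit: detailed counts came from a separate call.
# detailed_counts = evaluator.compute_detailed_counts(
#     iou_thresholds=[0.5],
#     score_thresholds=[0.5],
#     n_samples=3,
# )

# After this commit: request DetailedCounts alongside the base metrics.
metrics = evaluator.evaluate(
    metrics_to_return=[
        MetricType.DetailedCounts,
        *MetricType.base_metrics(),
    ],
    iou_thresholds=[0.5],
    score_thresholds=[0.5],
    number_of_examples=3,
)

# Results come back as a dict keyed by MetricType.
detailed_counts = metrics[MetricType.DetailedCounts]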
50 changes: 37 additions & 13 deletions lite/benchmarks/benchmark_objdet.py
@@ -8,7 +8,7 @@

import requests
from tqdm import tqdm
from valor_lite.detection import DataLoader
from valor_lite.detection import DataLoader, MetricType


class AnnotationType(str, Enum):
@@ -258,24 +258,48 @@ def run_benchmarking_analysis(
f"Base precomputation timed out with limit of {limit}."
)

# test detailed counts with no samples
detailed_counts_time_no_samples, _ = time_it(
evaluator.compute_detailed_counts
)()

# test detailed counts with 3 samples
detailed_counts_time_three_samples, _ = time_it(
evaluator.compute_detailed_counts
)(n_samples=3)

# evaluate
# evaluate - base metrics only
eval_time, metrics = time_it(evaluator.evaluate)()
# print(metrics)
if eval_time > evaluation_timeout and evaluation_timeout != -1:
raise TimeoutError(
f"Base evaluation timed out with {evaluator.n_datums} datums."
)

# evaluate - base metrics + detailed counts with no samples
detailed_counts_time_no_samples, metrics = time_it(
evaluator.evaluate
)(
[
MetricType.DetailedCounts,
*MetricType.base_metrics(),
]
)
if (
detailed_counts_time_no_samples > evaluation_timeout
and evaluation_timeout != -1
):
raise TimeoutError(
f"Detailed evaluation w/ no samples timed out with {evaluator.n_datums} datums."
)

# evaluate - base metrics + detailed counts with 3 samples
detailed_counts_time_three_samples, metrics = time_it(
evaluator.evaluate
)(
[
MetricType.DetailedCounts,
*MetricType.base_metrics(),
],
number_of_examples=3,
)
if (
detailed_counts_time_three_samples > evaluation_timeout
and evaluation_timeout != -1
):
raise TimeoutError(
f"Detailed w/ 3 samples evaluation timed out with {evaluator.n_datums} datums."
)

results.append(
Benchmark(
limit=limit,
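
The benchmark above now times the combined evaluate() call instead of a separate compute_detailed_counts pass. A rough, self-contained sketch of that timing pattern; the time_it helper below is an illustrative stand-in rather than the repository's exact implementation:

import time

from valor_lite.detection import Evaluator, MetricType


def time_it(fn):
    # Illustrative timing wrapper: returns (elapsed_seconds, result) for one call.
    def wrapper(*args, **kwargs):
        start = time.perf_counter()
        result = fn(*args, **kwargs)
        return time.perf_counter() - start, result
    return wrapper


def timed_detailed_evaluation(evaluator: Evaluator, evaluation_timeout: float):
    # Time base metrics plus DetailedCounts in a single evaluate() call.
    elapsed, metrics = time_it(evaluator.evaluate)(
        [
            MetricType.DetailedCounts,
            *MetricType.base_metrics(),
        ],
        number_of_examples=3,
    )
    if elapsed > evaluation_timeout and evaluation_timeout != -1:
        raise TimeoutError(
            f"Detailed evaluation timed out with {evaluator.n_datums} datums."
        )
    return elapsed, metrics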
43 changes: 24 additions & 19 deletions lite/tests/detection/test_detailed_counts.py
@@ -1,11 +1,11 @@
import numpy as np
from valor_lite.detection import DataLoader, Detection, Evaluator
from valor_lite.detection import DataLoader, Detection, Evaluator, MetricType
from valor_lite.detection.computation import compute_detailed_counts


def test_detailed_counts_no_data():
evaluator = Evaluator()
curves = evaluator.compute_detailed_counts()
curves = evaluator._compute_detailed_counts()
assert isinstance(curves, list)
assert len(curves) == 0

@@ -278,10 +278,11 @@ def test_detailed_counts(
assert evaluator.n_groundtruths == 4
assert evaluator.n_predictions == 4

metrics = evaluator.compute_detailed_counts(
metrics = evaluator.evaluate(
iou_thresholds=[0.5],
score_thresholds=[0.05, 0.3, 0.35, 0.45, 0.55, 0.95],
n_samples=1,
number_of_examples=1,
metrics_to_return=[MetricType.DetailedCounts],
)

uid1_rect1 = ("uid1", rect1)
@@ -293,7 +294,7 @@ def test_detailed_counts(
uid2_rect2 = ("uid2", rect2)

# test DetailedCounts
actual_metrics = [mm.to_dict() for m in metrics for mm in m]
actual_metrics = [m.to_dict() for m in metrics[MetricType.DetailedCounts]]
expected_metrics = [
{
"type": "DetailedCounts",
@@ -521,14 +522,15 @@ def test_detailed_counts(

# test at lower IoU threshold

metrics = evaluator.compute_detailed_counts(
metrics = evaluator.evaluate(
iou_thresholds=[0.45],
score_thresholds=[0.05, 0.3, 0.35, 0.45, 0.55, 0.95],
n_samples=1,
number_of_examples=1,
metrics_to_return=[MetricType.DetailedCounts],
)

# test DetailedCounts
actual_metrics = [mm.to_dict() for m in metrics for mm in m]
actual_metrics = [m.to_dict() for m in metrics[MetricType.DetailedCounts]]
expected_metrics = [
{
"type": "DetailedCounts",
@@ -781,13 +783,14 @@ def test_detailed_counts_using_torch_metrics_example(
assert evaluator.n_groundtruths == 20
assert evaluator.n_predictions == 19

metrics = evaluator.compute_detailed_counts(
metrics = evaluator.evaluate(
iou_thresholds=[0.5, 0.9],
score_thresholds=[0.05, 0.25, 0.35, 0.55, 0.75, 0.8, 0.85, 0.95],
n_samples=1,
number_of_examples=1,
metrics_to_return=[MetricType.DetailedCounts],
)

assert len(metrics) == 6
assert len(metrics[MetricType.DetailedCounts]) == 12

uid0_gt_0 = ("0", (214.125, 562.5, 41.28125, 285.0))
uid1_gt_0 = ("1", (13.0, 549.0, 22.75, 632.5))
@@ -814,7 +817,7 @@ def test_detailed_counts_using_torch_metrics_example(
uid3_pd_4 = ("3", (75.3125, 91.875, 23.015625, 50.84375))

# test DetailedCounts
actual_metrics = [mm.to_dict() for m in metrics for mm in m]
actual_metrics = [m.to_dict() for m in metrics[MetricType.DetailedCounts]]
expected_metrics = [
{
"type": "DetailedCounts",
@@ -1628,16 +1631,17 @@ def test_detailed_counts_fp_hallucination_edge_case(
assert evaluator.n_groundtruths == 2
assert evaluator.n_predictions == 2

metrics = evaluator.compute_detailed_counts(
metrics = evaluator.evaluate(
iou_thresholds=[0.5],
score_thresholds=[0.5, 0.85],
n_samples=1,
number_of_examples=1,
metrics_to_return=[MetricType.DetailedCounts],
)

assert len(metrics) == 1
assert len(metrics[MetricType.DetailedCounts]) == 1

# test DetailedCounts
actual_metrics = [mm.to_dict() for m in metrics for mm in m]
actual_metrics = [m.to_dict() for m in metrics[MetricType.DetailedCounts]]
expected_metrics = [
{
"type": "DetailedCounts",
@@ -1691,13 +1695,14 @@ def test_detailed_counts_ranked_pair_ordering(
"n_predictions": 4,
}

metrics = evaluator.compute_detailed_counts(
metrics = evaluator.evaluate(
iou_thresholds=[0.5],
score_thresholds=[0.0],
n_samples=0,
number_of_examples=0,
metrics_to_return=[MetricType.DetailedCounts],
)

actual_metrics = [mm.to_dict() for m in metrics for mm in m]
actual_metrics = [m.to_dict() for m in metrics[MetricType.DetailedCounts]]
expected_metrics = [
{
"type": "DetailedCounts",
22 changes: 22 additions & 0 deletions lite/tests/detection/test_evaluator.py
@@ -96,3 +96,25 @@ def test_no_predictions(detections_no_predictions):
assert m in expected_metrics
for m in expected_metrics:
assert m in actual_metrics


def test_metrics_to_return(basic_detections: list[Detection]):

loader = DataLoader()
loader.add_data(basic_detections)
evaluator = loader.finalize()

metrics_to_return = [
MetricType.AP,
MetricType.AR,
]
metrics = evaluator.evaluate(metrics_to_return)
assert metrics.keys() == set(metrics_to_return)

metrics_to_return = [
MetricType.AP,
MetricType.AR,
MetricType.DetailedCounts,
]
metrics = evaluator.evaluate(metrics_to_return)
assert metrics.keys() == set(metrics_to_return)
137 changes: 74 additions & 63 deletions lite/valor_lite/detection/manager.py
@@ -228,19 +228,25 @@ def create_filter(

def evaluate(
self,
metrics_to_return: list[MetricType] = MetricType.base_metrics(),
iou_thresholds: list[float] = [0.5, 0.75, 0.9],
score_thresholds: list[float] = [0.5],
number_of_examples: int = 0,
filter_: Filter | None = None,
) -> dict[MetricType, list]:
"""
Performs an evaluation and returns metrics.
Parameters
----------
metrics_to_return : list[MetricType]
A list of metrics to return in the results.
iou_thresholds : list[float]
A list of IoU thresholds to compute metrics over.
score_thresholds : list[float]
A list of score thresholds to compute metrics over.
number_of_examples : int, default=0
Number of annotation examples to return in DetailedCounts.
filter_ : Filter, optional
An optional filter object.
@@ -426,16 +432,27 @@ def evaluate(
)
)

if MetricType.DetailedCounts in metrics_to_return:
metrics[MetricType.DetailedCounts] = self._compute_detailed_counts(
iou_thresholds=iou_thresholds,
score_thresholds=score_thresholds,
n_samples=number_of_examples,
)

for metric in set(metrics.keys()):
if metric not in metrics_to_return:
del metrics[metric]

return metrics

def compute_detailed_counts(
def _compute_detailed_counts(
self,
iou_thresholds: list[float] = [0.5],
score_thresholds: list[float] = [
score / 10.0 for score in range(1, 11)
],
n_samples: int = 0,
) -> list[list[DetailedCounts]]:
) -> list[DetailedCounts]:
"""
Computes detailed counting metrics.
@@ -515,68 +532,62 @@ def _unpack_examples(

n_ious, n_scores, n_labels, _ = metrics.shape
return [
[
DetailedCounts(
iou_threshold=iou_thresholds[iou_idx],
label=self.index_to_label[label_idx],
score_thresholds=score_thresholds,
tp=metrics[iou_idx, :, label_idx, tp_idx]
.astype(int)
.tolist(),
fp_misclassification=metrics[
iou_idx, :, label_idx, fp_misclf_idx
]
.astype(int)
.tolist(),
fp_hallucination=metrics[
iou_idx, :, label_idx, fp_halluc_idx
]
.astype(int)
.tolist(),
fn_misclassification=metrics[
iou_idx, :, label_idx, fn_misclf_idx
]
.astype(int)
.tolist(),
fn_missing_prediction=metrics[
iou_idx, :, label_idx, fn_misprd_idx
]
.astype(int)
.tolist(),
tp_examples=_unpack_examples(
iou_idx=iou_idx,
label_idx=label_idx,
type_idx=tp_idx,
example_source=self.prediction_examples,
),
fp_misclassification_examples=_unpack_examples(
iou_idx=iou_idx,
label_idx=label_idx,
type_idx=fp_misclf_idx,
example_source=self.prediction_examples,
),
fp_hallucination_examples=_unpack_examples(
iou_idx=iou_idx,
label_idx=label_idx,
type_idx=fp_halluc_idx,
example_source=self.prediction_examples,
),
fn_misclassification_examples=_unpack_examples(
iou_idx=iou_idx,
label_idx=label_idx,
type_idx=fn_misclf_idx,
example_source=self.groundtruth_examples,
),
fn_missing_prediction_examples=_unpack_examples(
iou_idx=iou_idx,
label_idx=label_idx,
type_idx=fn_misprd_idx,
example_source=self.groundtruth_examples,
),
)
for iou_idx in range(n_ious)
]
DetailedCounts(
iou_threshold=iou_thresholds[iou_idx],
label=self.index_to_label[label_idx],
score_thresholds=score_thresholds,
tp=metrics[iou_idx, :, label_idx, tp_idx].astype(int).tolist(),
fp_misclassification=metrics[
iou_idx, :, label_idx, fp_misclf_idx
]
.astype(int)
.tolist(),
fp_hallucination=metrics[iou_idx, :, label_idx, fp_halluc_idx]
.astype(int)
.tolist(),
fn_misclassification=metrics[
iou_idx, :, label_idx, fn_misclf_idx
]
.astype(int)
.tolist(),
fn_missing_prediction=metrics[
iou_idx, :, label_idx, fn_misprd_idx
]
.astype(int)
.tolist(),
tp_examples=_unpack_examples(
iou_idx=iou_idx,
label_idx=label_idx,
type_idx=tp_idx,
example_source=self.prediction_examples,
),
fp_misclassification_examples=_unpack_examples(
iou_idx=iou_idx,
label_idx=label_idx,
type_idx=fp_misclf_idx,
example_source=self.prediction_examples,
),
fp_hallucination_examples=_unpack_examples(
iou_idx=iou_idx,
label_idx=label_idx,
type_idx=fp_halluc_idx,
example_source=self.prediction_examples,
),
fn_misclassification_examples=_unpack_examples(
iou_idx=iou_idx,
label_idx=label_idx,
type_idx=fn_misclf_idx,
example_source=self.groundtruth_examples,
),
fn_missing_prediction_examples=_unpack_examples(
iou_idx=iou_idx,
label_idx=label_idx,
type_idx=fn_misprd_idx,
example_source=self.groundtruth_examples,
),
)
for label_idx in range(n_labels)
for iou_idx in range(n_ious)
]
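
Internally, the reworked evaluate() computes DetailedCounts only when it is requested and prunes any metric types the caller did not ask for before returning (see the evaluate() hunk earlier in this file). A stripped-down sketch of that selection pattern; the compute_base_metrics and compute_detailed_counts callables below are hypothetical placeholders, not valor_lite APIs:

from enum import Enum


class MetricType(str, Enum):
    # Simplified stand-in for valor_lite's MetricType enum.
    AP = "AP"
    AR = "AR"
    DetailedCounts = "DetailedCounts"


def evaluate(metrics_to_return, compute_base_metrics, compute_detailed_counts):
    # Base metrics are always computed; DetailedCounts is added only on request.
    metrics = compute_base_metrics()  # dict[MetricType, list]
    if MetricType.DetailedCounts in metrics_to_return:
        metrics[MetricType.DetailedCounts] = compute_detailed_counts()
    # Drop anything the caller did not ask for.
    for metric in set(metrics.keys()):
        if metric not in metrics_to_return:
            del metrics[metric]
    return metrics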


