
Commit 27f0b3d

roypat authored and zulinx86 committed
test: cause failure if FC emits an error metric
When flushing Firecracker metrics to CloudWatch, explicitly assert that metrics containing the keyphrases "err", "fail" or "panic" are all 0. While doing this, clean up the flushing code a bit by recognizing that if we separate the flattening of the metrics JSON and the actual emitting, we can treat the flattening as a tree-walk (since we are converting a tree to a set of (path, leaf) pairs).

Signed-off-by: Patrick Roy <[email protected]>
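As a rough illustration of the tree-walk described above, a minimal sketch (the sample input and the helper name "flatten" are invented for this example; the actual helper added by this commit is flatten_dict in the diff below):

# Illustrative only: a nested metrics dict becomes a flat dict of (dot-path, leaf) pairs.
def flatten(node, prefix):
    # Pre-order walk: dict nodes recurse, leaves yield a single (path, value) entry.
    if not isinstance(node, dict):
        return {prefix: node}
    flat = {}
    for name, child in node.items():
        flat.update(flatten(child, f"{prefix}.{name}"))
    return flat

sample = {"net": {"rx_bytes_count": 4096, "tx_fails": 0}, "seccomp": {"num_faults": 0}}
print(flatten(sample, "fc_metrics"))
# {'fc_metrics.net.rx_bytes_count': 4096, 'fc_metrics.net.tx_fails': 0, 'fc_metrics.seccomp.num_faults': 0}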
1 parent 3eeeaa5 commit 27f0b3d

File tree: 1 file changed (+29 −43 lines)


tests/host_tools/fcmetrics.py

Lines changed: 29 additions & 43 deletions
@@ -437,53 +437,39 @@ def get_emf_unit_for_fc_metrics(full_key):
 
 def flush_fc_metrics_to_cw(fc_metrics, metrics):
     """
-    Flush Firecracker metrics to CloudWatch
-    Use an existing metrics logger with existing dimensions so
-    that its easier to corelate the metrics with the test calling it.
-    Add a prefix "fc_metrics." to differentiate these metrics, this
-    also helps to avoid using this metrics in AB tests.
+    Flush Firecracker metrics to CloudWatch. Use an existing metrics logger with existing dimensions so that it is
+    easier to correlate the metrics with the test calling it. Add a prefix "fc_metrics." to differentiate these metrics,
+    this also helps to avoid using this metrics in A/B tests.
     NOTE:
-    There are metrics with keywords "fail", "err",
-    "num_faults", "panic" in their name and represent
-    some kind of failure in Firecracker.
-    This function `does not` assert on these failure metrics
-    since some tests might not want to assert on them while
-    some tests might want to assert on some but not others.
+    There are metrics with keywords "fail", "err", "num_faults", "panic" in their name and represent some kind of
+    failure in Firecracker. We assert that all these are zero, to catch potentially silent failure modes. This
+    means the FcMonitor cannot be used in negative tests that might cause such metrics to be emitted.
     """
 
-    def walk_key(full_key, keys):
-        for key, value in keys.items():
-            final_full_key = full_key + "." + key
-            if isinstance(value, dict):
-                walk_key(final_full_key, value)
-            else:
-                # values are 0 when:
-                # - there is no update
-                # - device is not used
-                # - SharedIncMetric reset to 0 on flush so if
-                #   there is no change metric the values remain 0.
-                # We can save the amount of bytes we export to
-                # CloudWatch in these cases.
-                # however it is difficult to differentiate if a 0
-                # should be skipped or upload because it could be
-                # an expected value in some cases so we upload
-                # all the metrics even if data is 0.
-                unit = get_emf_unit_for_fc_metrics(final_full_key)
-                metrics.put_metric(f"fc_metrics.{final_full_key}", value, unit=unit)
-
-    # List of SharedStoreMetric that once updated have the same value thoughout the life of vm
-    metrics_to_export_once = {
-        "api_server",
-        "latencies_us",
-    }
-    skip = set()
-    for group, keys in fc_metrics.items():
-        if group == "utc_timestamp_ms":
+    # Pre-order tree traversal to convert a tree into its list of paths with dot separate segments
+    def flatten_dict(node, prefix: str):
+        if not isinstance(node, dict):
+            return {prefix: node}
+
+        result = {}
+        for child_metric_name, child_metrics in node.items():
+            result.update(flatten_dict(child_metrics, f"{prefix}.{child_metric_name}"))
+        return result
+
+    flattened_metrics = flatten_dict(fc_metrics, "fc_metrics")
+
+    for key, value in flattened_metrics.items():
+        if ".utc_timestamp_ms." in key:
             continue
-        if group not in skip:
-            walk_key(group, keys)
-        if group in metrics_to_export_once:
-            skip.add(group)
+        metrics.put_metric(key, value, get_emf_unit_for_fc_metrics(key))
+
+    assert not {
+        key: value
+        for key, value in flattened_metrics.items()
+        if "err" in key or "fail" in key or "panic" in key or "num_faults" in key
+        if value
+    }
+
     metrics.flush()
 
 
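To see what the new assertion catches, a minimal sketch of the check against a hypothetical flattened result (the metric names and the non-zero value below are made up, not real Firecracker output):

# Hypothetical flattened metrics; the non-zero panic counter is the failure case.
flattened_metrics = {
    "fc_metrics.net.rx_bytes_count": 4096,
    "fc_metrics.vmm.panic_count": 1,
}
offenders = {
    key: value
    for key, value in flattened_metrics.items()
    if "err" in key or "fail" in key or "panic" in key or "num_faults" in key
    if value
}
# Mirrors the assert in the diff: fails here because panic_count is non-zero,
# which is how a test run would now surface an otherwise silent Firecracker error.
assert not offenders, f"non-zero failure metrics: {offenders}"

Keys that merely contain a failure keyphrase but stay at 0 (for example a "fail_count" that never increments) pass the check, so normal test runs are unaffected.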