Commit 647757d

Kyle-Verhoog and ncybul authored and committed
chore(llmobs): [MLOB-1944] generalize helper for extracting token metrics (#12223)
[applying #12026 to 3.x-staging]

This PR generalizes the helper method used to extract token metrics from an APM span so they can be attached to an LLMObs span. Previously, the Anthropic, Bedrock, and OpenAI integrations each had their own method on their integration classes to accomplish this. Now there is a single `get_llmobs_metrics_tags` utils function, adapted from the Google-specific `get_llmobs_metrics_tags_google` function, which is reused across these integrations as well as Vertex AI and Gemini. The LangChain integration was excluded from this change since its logic for extracting token metrics differs significantly from the other integrations.

## Checklist
- [x] PR author has checked that all the criteria below are met
  - The PR description includes an overview of the change
  - The PR description articulates the motivation for the change
  - The change includes tests OR the PR description describes a testing strategy
  - The PR description notes risks associated with the change, if any
  - Newly-added code is easy to change
  - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html)
  - The change includes or references documentation updates if necessary
  - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting))

## Reviewer Checklist
- [x] Reviewer has checked that all the criteria below are met
  - Title is accurate
  - All changes are related to the pull request's stated goal
  - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes
  - Testing strategy adequately addresses listed risks
  - Newly-added code is easy to change
  - Release note makes sense to a user of the library
  - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment
  - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)

---------

Co-authored-by: Nicole Cybul <[email protected]>
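As a quick illustration (not part of this commit's diff), here is a minimal sketch of how the consolidated helper behaves once this change is in place. `_StubSpan` is a hypothetical stand-in for `ddtrace.trace.Span` exposing only the two accessors the helper reads, and the expected outputs assume the token-metric key constants resolve to `"input_tokens"`, `"output_tokens"`, and `"total_tokens"`:

```python
# Illustrative sketch only; requires a ddtrace build that includes this change.
from ddtrace.llmobs._integrations.utils import get_llmobs_metrics_tags


class _StubSpan:
    """Hypothetical stand-in exposing only the accessors the helper uses."""

    def __init__(self, metrics=None, tags=None):
        self._metrics = dict(metrics or {})
        self._tags = dict(tags or {})

    def get_metric(self, key):
        return self._metrics.get(key)

    def get_tag(self, key):
        return self._tags.get(key)


# OpenAI-style integrations record token usage as APM span metrics...
openai_span = _StubSpan(
    metrics={
        "openai.response.usage.prompt_tokens": 10,
        "openai.response.usage.completion_tokens": 25,
        "openai.response.usage.total_tokens": 35,
    }
)
print(get_llmobs_metrics_tags("openai", openai_span))
# e.g. {'input_tokens': 10, 'output_tokens': 25, 'total_tokens': 35}

# ...while Bedrock stores usage as span tags, which the helper special-cases.
bedrock_span = _StubSpan(
    tags={"bedrock.usage.prompt_tokens": "10", "bedrock.usage.completion_tokens": "25"}
)
print(get_llmobs_metrics_tags("bedrock", bedrock_span))
# e.g. {'input_tokens': 10, 'output_tokens': 25, 'total_tokens': 35}
```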
1 parent 91ef4ee commit 647757d

6 files changed: 32 additions & 50 deletions


ddtrace/llmobs/_integrations/anthropic.py

Lines changed: 2 additions & 19 deletions
@@ -7,16 +7,14 @@
 
 from ddtrace.internal.logger import get_logger
 from ddtrace.llmobs._constants import INPUT_MESSAGES
-from ddtrace.llmobs._constants import INPUT_TOKENS_METRIC_KEY
 from ddtrace.llmobs._constants import METADATA
 from ddtrace.llmobs._constants import METRICS
 from ddtrace.llmobs._constants import MODEL_NAME
 from ddtrace.llmobs._constants import MODEL_PROVIDER
 from ddtrace.llmobs._constants import OUTPUT_MESSAGES
-from ddtrace.llmobs._constants import OUTPUT_TOKENS_METRIC_KEY
 from ddtrace.llmobs._constants import SPAN_KIND
-from ddtrace.llmobs._constants import TOTAL_TOKENS_METRIC_KEY
 from ddtrace.llmobs._integrations.base import BaseLLMIntegration
+from ddtrace.llmobs._integrations.utils import get_llmobs_metrics_tags
 from ddtrace.llmobs._utils import _get_attr
 from ddtrace.trace import Span

@@ -77,7 +75,7 @@ def _llmobs_set_tags(
                 INPUT_MESSAGES: input_messages,
                 METADATA: parameters,
                 OUTPUT_MESSAGES: output_messages,
-                METRICS: self._get_llmobs_metrics_tags(span),
+                METRICS: get_llmobs_metrics_tags("anthropic", span),
             }
         )

@@ -188,18 +186,3 @@ def record_usage(self, span: Span, usage: Dict[str, Any]) -> None:
             span.set_metric("anthropic.response.usage.output_tokens", output_tokens)
         if input_tokens is not None and output_tokens is not None:
             span.set_metric("anthropic.response.usage.total_tokens", input_tokens + output_tokens)
-
-    @staticmethod
-    def _get_llmobs_metrics_tags(span):
-        usage = {}
-        input_tokens = span.get_metric("anthropic.response.usage.input_tokens")
-        output_tokens = span.get_metric("anthropic.response.usage.output_tokens")
-        total_tokens = span.get_metric("anthropic.response.usage.total_tokens")
-
-        if input_tokens is not None:
-            usage[INPUT_TOKENS_METRIC_KEY] = input_tokens
-        if output_tokens is not None:
-            usage[OUTPUT_TOKENS_METRIC_KEY] = output_tokens
-        if total_tokens is not None:
-            usage[TOTAL_TOKENS_METRIC_KEY] = total_tokens
-        return usage

ddtrace/llmobs/_integrations/bedrock.py

Lines changed: 2 additions & 15 deletions
@@ -5,18 +5,16 @@
 
 from ddtrace.internal.logger import get_logger
 from ddtrace.llmobs._constants import INPUT_MESSAGES
-from ddtrace.llmobs._constants import INPUT_TOKENS_METRIC_KEY
 from ddtrace.llmobs._constants import METADATA
 from ddtrace.llmobs._constants import METRICS
 from ddtrace.llmobs._constants import MODEL_NAME
 from ddtrace.llmobs._constants import MODEL_PROVIDER
 from ddtrace.llmobs._constants import OUTPUT_MESSAGES
-from ddtrace.llmobs._constants import OUTPUT_TOKENS_METRIC_KEY
 from ddtrace.llmobs._constants import PARENT_ID_KEY
 from ddtrace.llmobs._constants import PROPAGATED_PARENT_ID_KEY
 from ddtrace.llmobs._constants import SPAN_KIND
-from ddtrace.llmobs._constants import TOTAL_TOKENS_METRIC_KEY
 from ddtrace.llmobs._integrations import BaseLLMIntegration
+from ddtrace.llmobs._integrations.utils import get_llmobs_metrics_tags
 from ddtrace.llmobs._utils import _get_llmobs_parent_id
 from ddtrace.trace import Span

@@ -57,22 +55,11 @@ def _llmobs_set_tags(
                 MODEL_PROVIDER: span.get_tag("bedrock.request.model_provider") or "",
                 INPUT_MESSAGES: input_messages,
                 METADATA: parameters,
-                METRICS: self._llmobs_metrics(span, response),
+                METRICS: get_llmobs_metrics_tags("bedrock", span),
                 OUTPUT_MESSAGES: output_messages,
             }
         )

-    @staticmethod
-    def _llmobs_metrics(span: Span, response: Optional[Dict[str, Any]]) -> Dict[str, Any]:
-        metrics = {}
-        if response and response.get("text"):
-            prompt_tokens = int(span.get_tag("bedrock.usage.prompt_tokens") or 0)
-            completion_tokens = int(span.get_tag("bedrock.usage.completion_tokens") or 0)
-            metrics[INPUT_TOKENS_METRIC_KEY] = prompt_tokens
-            metrics[OUTPUT_TOKENS_METRIC_KEY] = completion_tokens
-            metrics[TOTAL_TOKENS_METRIC_KEY] = prompt_tokens + completion_tokens
-        return metrics
-
     @staticmethod
     def _extract_input_message(prompt):
         """Extract input messages from the stored prompt.

ddtrace/llmobs/_integrations/gemini.py

Lines changed: 2 additions & 2 deletions
@@ -14,7 +14,7 @@
 from ddtrace.llmobs._constants import SPAN_KIND
 from ddtrace.llmobs._integrations.base import BaseLLMIntegration
 from ddtrace.llmobs._integrations.utils import extract_message_from_part_google
-from ddtrace.llmobs._integrations.utils import get_llmobs_metrics_tags_google
+from ddtrace.llmobs._integrations.utils import get_llmobs_metrics_tags
 from ddtrace.llmobs._integrations.utils import get_system_instructions_from_google_model
 from ddtrace.llmobs._integrations.utils import llmobs_get_metadata_google
 from ddtrace.llmobs._utils import _get_attr

@@ -59,7 +59,7 @@ def _llmobs_set_tags(
                 METADATA: metadata,
                 INPUT_MESSAGES: input_messages,
                 OUTPUT_MESSAGES: output_messages,
-                METRICS: get_llmobs_metrics_tags_google("google_generativeai", span),
+                METRICS: get_llmobs_metrics_tags("google_generativeai", span),
             }
         )

ddtrace/llmobs/_integrations/openai.py

Lines changed: 2 additions & 9 deletions
@@ -20,6 +20,7 @@
 from ddtrace.llmobs._constants import SPAN_KIND
 from ddtrace.llmobs._constants import TOTAL_TOKENS_METRIC_KEY
 from ddtrace.llmobs._integrations.base import BaseLLMIntegration
+from ddtrace.llmobs._integrations.utils import get_llmobs_metrics_tags
 from ddtrace.llmobs._utils import _get_attr
 from ddtrace.llmobs.utils import Document
 from ddtrace.trace import Pin

@@ -234,12 +235,4 @@ def _extract_llmobs_metrics_tags(span: Span, resp: Any) -> Dict[str, Any]:
                 OUTPUT_TOKENS_METRIC_KEY: completion_tokens,
                 TOTAL_TOKENS_METRIC_KEY: prompt_tokens + completion_tokens,
             }
-        prompt_tokens = span.get_metric("openai.response.usage.prompt_tokens")
-        completion_tokens = span.get_metric("openai.response.usage.completion_tokens")
-        if prompt_tokens is None or completion_tokens is None:
-            return {}
-        return {
-            INPUT_TOKENS_METRIC_KEY: prompt_tokens,
-            OUTPUT_TOKENS_METRIC_KEY: completion_tokens,
-            TOTAL_TOKENS_METRIC_KEY: prompt_tokens + completion_tokens,
-        }
+        return get_llmobs_metrics_tags("openai", span)

ddtrace/llmobs/_integrations/utils.py

Lines changed: 22 additions & 3 deletions
@@ -118,10 +118,29 @@ def extract_message_from_part_google(part, role=None):
     return message
 
 
-def get_llmobs_metrics_tags_google(integration_name, span):
+def get_llmobs_metrics_tags(integration_name, span):
     usage = {}
-    input_tokens = span.get_metric("%s.response.usage.prompt_tokens" % integration_name)
-    output_tokens = span.get_metric("%s.response.usage.completion_tokens" % integration_name)
+
+    # bedrock integration tags usage under meta instead of metrics
+    if integration_name == "bedrock":
+        input_tokens = int(span.get_tag("bedrock.usage.prompt_tokens") or 0)
+        output_tokens = int(span.get_tag("bedrock.usage.completion_tokens") or 0)
+        total_tokens = input_tokens + output_tokens
+        if input_tokens:
+            usage[INPUT_TOKENS_METRIC_KEY] = input_tokens
+        if output_tokens:
+            usage[OUTPUT_TOKENS_METRIC_KEY] = output_tokens
+        if total_tokens:
+            usage[TOTAL_TOKENS_METRIC_KEY] = total_tokens
+        return usage
+
+    # check for both prompt / completion or input / output tokens
+    input_tokens = span.get_metric("%s.response.usage.prompt_tokens" % integration_name) or span.get_metric(
+        "%s.response.usage.input_tokens" % integration_name
+    )
+    output_tokens = span.get_metric("%s.response.usage.completion_tokens" % integration_name) or span.get_metric(
+        "%s.response.usage.output_tokens" % integration_name
+    )
     total_tokens = span.get_metric("%s.response.usage.total_tokens" % integration_name)
 
     if input_tokens is not None:
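One detail worth noting in the hunk above: the non-Bedrock path falls back from `*.prompt_tokens` / `*.completion_tokens` to `*.input_tokens` / `*.output_tokens`, which is what lets Anthropic (whose `record_usage` sets the latter metric names) reuse the shared helper. A small hypothetical check, assuming a ddtrace build with this change and the same key-constant values as in the earlier sketch:

```python
# Hypothetical stub; the real ddtrace Span also exposes get_metric/get_tag.
from ddtrace.llmobs._integrations.utils import get_llmobs_metrics_tags


class _Stub:
    def __init__(self, metrics):
        self._metrics = metrics

    def get_metric(self, key):
        return self._metrics.get(key)

    def get_tag(self, key):  # only consulted on the bedrock branch
        return None


span = _Stub(
    {
        "anthropic.response.usage.input_tokens": 7,
        "anthropic.response.usage.output_tokens": 3,
        "anthropic.response.usage.total_tokens": 10,
    }
)
print(get_llmobs_metrics_tags("anthropic", span))
# expected: {'input_tokens': 7, 'output_tokens': 3, 'total_tokens': 10}
```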

ddtrace/llmobs/_integrations/vertexai.py

Lines changed: 2 additions & 2 deletions
@@ -15,7 +15,7 @@
 from ddtrace.llmobs._constants import SPAN_KIND
 from ddtrace.llmobs._integrations.base import BaseLLMIntegration
 from ddtrace.llmobs._integrations.utils import extract_message_from_part_google
-from ddtrace.llmobs._integrations.utils import get_llmobs_metrics_tags_google
+from ddtrace.llmobs._integrations.utils import get_llmobs_metrics_tags
 from ddtrace.llmobs._integrations.utils import get_system_instructions_from_google_model
 from ddtrace.llmobs._integrations.utils import llmobs_get_metadata_google
 from ddtrace.llmobs._utils import _get_attr

@@ -65,7 +65,7 @@ def _llmobs_set_tags(
                 METADATA: metadata,
                 INPUT_MESSAGES: input_messages,
                 OUTPUT_MESSAGES: output_messages,
-                METRICS: get_llmobs_metrics_tags_google("vertexai", span),
+                METRICS: get_llmobs_metrics_tags("vertexai", span),
             }
         )