fix(llmobs): preserve openai assistant content when tool_calls are present [backport 4.8] (#18092)

dd-octo-sts[bot] · github-actions[bot] · jessicagamio · web-flow · commit 857c67d2441a · 2026-05-14T21:46:52.000Z
Backport #17760 to 4.8

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Jessica Gamio <52049720+jessicagamio@users.noreply.github.com>
diff --git a/ddtrace/llmobs/_integrations/utils.py b/ddtrace/llmobs/_integrations/utils.py
@@ -345,6 +345,8 @@ def openai_set_meta_tags_from_chat(
         raw_content = _get_attr(m, "content", "")
         if isinstance(raw_content, list):
             content = _extract_content_parts(raw_content)
+        elif raw_content is None:
+            content = ""
         else:
             content = str(raw_content)
         role = str(_get_attr(m, "role", ""))
@@ -354,13 +356,18 @@ def openai_set_meta_tags_from_chat(
             core.dispatch(DISPATCH_ON_TOOL_CALL_OUTPUT_USED, (tool_call_id, span))
 
         extracted_tool_calls, extracted_tool_results = _openai_extract_tool_calls_and_results_chat(m)
+        pre_react_call_count = len(extracted_tool_calls)
         if role != "system":
             # ignore system messages as we may unintentionally parse instructions as tool calls
             capture_plain_text_tool_usage(extracted_tool_calls, extracted_tool_results, content, span, is_input=True)
 
+        # True if a ReAct-style "Action:" call was parsed out of the content string above
+        react_appended = len(extracted_tool_calls) > pre_react_call_count
+
         if extracted_tool_calls:
             processed_message["tool_calls"] = extracted_tool_calls
-            processed_message["content"] = ""  # reset content to empty string if tool calls present
+            if react_appended:
+                processed_message["content"] = ""  # only clear content if react tool calls present
         if extracted_tool_results:
             processed_message["tool_results"] = extracted_tool_results
             processed_message["content"] = ""  # reset content to empty string if tool results present
diff --git a/releasenotes/notes/fix-openai-llmobs-preserve-assistant-content-with-tool-calls-07fdde4269fea049.yaml b/releasenotes/notes/fix-openai-llmobs-preserve-assistant-content-with-tool-calls-07fdde4269fea049.yaml
@@ -0,0 +1,4 @@
+---
+fixes:
+  - |
+    LLM Observability: The OpenAI integration now preserves assistant message content when ``tool_calls`` are present on the same message. #17760
diff --git a/tests/contrib/openai/conftest.py b/tests/contrib/openai/conftest.py
@@ -131,6 +131,30 @@ def snapshot_tracer(tracer, openai, patch_openai):
     return tracer
 
 
+@pytest.fixture
+def openai_llmobs(snapshot_tracer, monkeypatch):
+    # Preserve meta_struct["_llmobs"] on spans so tests can assert against
+    # LLMObsSpanData via _get_llmobs_data_metastruct; production scrubs it
+    # after enqueueing when _DD_LLMOBS_EXPORT=llmobs (the default).
+    monkeypatch.setenv("_DD_LLMOBS_EXPORT", "agent")
+    LLMObs.disable()  # reset LLMObs before enabling it with the test configuration below
+    with override_global_config(
+        {
+            "_llmobs_ml_app": "<ml-app-name>",
+            "_dd_api_key": "<not-a-real-key>",
+        }
+    ):
+        LLMObs.enable(
+            _tracer=snapshot_tracer,
+            integrations_enabled=False,
+            instrumented_proxy_urls={"http://localhost:4000"},
+        )
+        LLMObs._instance._llmobs_span_writer.stop()  # stop the real writer before replacing it
+        LLMObs._instance._llmobs_span_writer = mock.MagicMock()  # mock stands in for the real writer
+        yield LLMObs  # the test body runs here with LLMObs enabled
+    LLMObs.disable()  # teardown: disable LLMObs once the test has finished
+
+
 @pytest.fixture
 def test_spans(ddtrace_global_config, test_spans, snapshot_tracer):
     if ddtrace_global_config.get("_llmobs_enabled", False):
diff --git a/tests/contrib/openai/test_openai_llmobs.py b/tests/contrib/openai/test_openai_llmobs.py
@@ -6,6 +6,7 @@
 
 from ddtrace.internal.utils.version import parse_version
 from ddtrace.llmobs._integrations.utils import _est_tokens
+from ddtrace.llmobs._utils import _get_llmobs_data_metastruct
 from ddtrace.llmobs._utils import safe_json
 from tests.contrib.openai.utils import assert_prompt_tracking
 from tests.contrib.openai.utils import chat_completion_custom_functions
@@ -22,6 +23,7 @@
 from tests.llmobs._utils import DEEP_TOOL_SCHEMA
 from tests.llmobs._utils import _expected_llmobs_llm_span_event
 from tests.llmobs._utils import _expected_llmobs_non_llm_span_event
+from tests.llmobs._utils import assert_llmobs_span_data
 
 
 EXPECTED_TOOL_DEFINITIONS = [
@@ -982,6 +984,161 @@ def test_chat_completion_tool_call_with_follow_up(
             ]
         )
 
+    @pytest.mark.skipif(
+        parse_version(openai_module.version.VERSION) < (1, 1), reason="Tool calls available after v1.1.0"
+    )
+    @mock.patch("openai._base_client.SyncAPIClient.post")  # the client's post call is replaced by a mock
+    def test_chat_completion_tool_call_preserves_assistant_content(
+        self, mock_completions_post, openai, openai_llmobs, test_spans
+    ):
+        """MLOS-605: assistant messages carrying both prose content and structured tool_calls
+        on the same message must preserve the prose in the LLMObs span. OpenAI's native
+        function-calling schema allows content and tool_calls to coexist (content is the
+        model's narration, tool_calls is the structured invocation) — earlier logic cleared
+        content unconditionally which dropped legitimate prose from replayed history.
+        """
+        mock_completions_post.return_value = mock_openai_chat_completions_response
+        assistant_prose = "I'll look up the student's profile before answering."
+        tool_call_id = "call_get_user_context_0"
+        messages = [
+            {"role": "user", "content": chat_completion_input_description},
+            {
+                "role": "assistant",
+                "content": assistant_prose,  # prose and tool_calls sit on the same message
+                "tool_calls": [
+                    {
+                        "id": tool_call_id,
+                        "type": "function",
+                        "function": {"name": "extract_student_info", "arguments": "{}"},
+                    }
+                ],
+            },
+            {"role": "tool", "tool_call_id": tool_call_id, "content": '{"verified": true}'},
+            {"role": "user", "content": "Thanks, can you summarize?"},
+        ]
+        client = openai.OpenAI()
+        client.chat.completions.create(model="gpt-3.5-turbo", messages=messages)
+
+        spans = [s for trace in test_spans.pop_traces() for s in trace]  # flatten traces into one span list
+        assert len(spans) == 1
+        assert_llmobs_span_data(
+            _get_llmobs_data_metastruct(spans[0]),
+            span_kind="llm",
+            input_messages=[
+                {"role": "user", "content": chat_completion_input_description},
+                {
+                    "role": "assistant",
+                    "content": assistant_prose,  # the prose must be preserved, not cleared
+                    "tool_calls": [
+                        {"name": "extract_student_info", "arguments": {}, "tool_id": tool_call_id, "type": "function"}
+                    ],
+                },
+                {
+                    "role": "tool",
+                    "content": "",  # content is reset to "" when tool results are present
+                    "tool_results": [
+                        {"name": "", "result": '{"verified": true}', "tool_id": tool_call_id, "type": "tool_result"}
+                    ],
+                },
+                {"role": "user", "content": "Thanks, can you summarize?"},
+            ],
+        )
+
+    @pytest.mark.skipif(
+        parse_version(openai_module.version.VERSION) < (1, 1), reason="Tool calls available after v1.1.0"
+    )
+    @mock.patch("openai._base_client.SyncAPIClient.post")  # the client's post call is replaced by a mock
+    def test_chat_completion_tool_call_with_none_content_does_not_leak_string(
+        self, mock_completions_post, openai, openai_llmobs, test_spans
+    ):
+        """MLOS-605: OpenAI returns `content=None` alongside `tool_calls` when the model issues
+        a pure function call with no narration. Earlier logic ran `str(None)` → "None" and
+        relied on the unconditional content-clear to mask it; once the clear became conditional
+        the literal string "None" leaked into the span. Normalize None → "" before stringifying.
+        """
+        mock_completions_post.return_value = mock_openai_chat_completions_response
+        tool_call_id = "call_get_user_context_0"
+        messages = [
+            {"role": "user", "content": chat_completion_input_description},
+            {
+                "role": "assistant",
+                "content": None,  # pure function call: no narration accompanies the tool call
+                "tool_calls": [
+                    {
+                        "id": tool_call_id,
+                        "type": "function",
+                        "function": {"name": "extract_student_info", "arguments": "{}"},
+                    }
+                ],
+            },
+            {"role": "tool", "tool_call_id": tool_call_id, "content": '{"verified": true}'},
+            {"role": "user", "content": "Thanks, can you summarize?"},
+        ]
+        client = openai.OpenAI()
+        client.chat.completions.create(model="gpt-3.5-turbo", messages=messages)
+
+        spans = [s for trace in test_spans.pop_traces() for s in trace]  # flatten traces into one span list
+        assert len(spans) == 1
+        assert_llmobs_span_data(
+            _get_llmobs_data_metastruct(spans[0]),
+            span_kind="llm",
+            input_messages=[
+                {"role": "user", "content": chat_completion_input_description},
+                {
+                    "role": "assistant",
+                    "content": "",  # None must surface as "" and never as the literal string "None"
+                    "tool_calls": [
+                        {"name": "extract_student_info", "arguments": {}, "tool_id": tool_call_id, "type": "function"}
+                    ],
+                },
+                {
+                    "role": "tool",
+                    "content": "",  # content is reset to "" when tool results are present
+                    "tool_results": [
+                        {"name": "", "result": '{"verified": true}', "tool_id": tool_call_id, "type": "tool_result"}
+                    ],
+                },
+                {"role": "user", "content": "Thanks, can you summarize?"},
+            ],
+        )
+
+    @pytest.mark.skipif(
+        parse_version(openai_module.version.VERSION) < (1, 1), reason="Tool calls available after v1.1.0"
+    )
+    @mock.patch("openai._base_client.SyncAPIClient.post")  # the client's post call is replaced by a mock
+    def test_chat_completion_react_style_content_still_deduplicates(
+        self, mock_completions_post, openai, openai_llmobs, test_spans
+    ):
+        """Regression guard for ReAct-style agents: when content literally contains the
+        `Action:/Action Input:` pattern and structured tool_calls are extracted from it,
+        clear content to avoid rendering the same call twice in the LLMObs UI.
+        """
+        mock_completions_post.return_value = mock_openai_chat_completions_response
+        react_content = "Action: extract_student_info\nAction Input: {}"  # tool call written as plain text
+        messages = [
+            {"role": "user", "content": chat_completion_input_description},
+            {"role": "assistant", "content": react_content},  # note: no structured tool_calls field here
+        ]
+        client = openai.OpenAI()
+        client.chat.completions.create(model="gpt-3.5-turbo", messages=messages)
+
+        spans = [s for trace in test_spans.pop_traces() for s in trace]  # flatten traces into one span list
+        assert len(spans) == 1
+        assert_llmobs_span_data(
+            _get_llmobs_data_metastruct(spans[0]),
+            span_kind="llm",
+            input_messages=[
+                {"role": "user", "content": chat_completion_input_description},
+                {
+                    "role": "assistant",
+                    "content": "",  # cleared because the tool call below was parsed out of the content
+                    "tool_calls": [
+                        {"name": "extract_student_info", "arguments": {}, "tool_id": "", "type": "function"}
+                    ],
+                },
+            ],
+        )
+
     @pytest.mark.skipif(
         parse_version(openai_module.version.VERSION) < (1, 66),
         reason="Responses API with custom tools available after v1.66.0",
diff --git a/tests/llmobs/_utils.py b/tests/llmobs/_utils.py
@@ -14,6 +14,7 @@
 
 import ddtrace
 from ddtrace.internal.utils.formats import format_trace_id
+from ddtrace.llmobs._constants import LLMOBS_STRUCT
 from ddtrace.llmobs._constants import ROOT_PARENT_ID
 from ddtrace.llmobs._utils import _get_nearest_llmobs_ancestor
 from ddtrace.llmobs._utils import _get_span_name
@@ -1037,3 +1038,119 @@ async def anext_stream(stream):
             await stream.__anext__()
         except StopAsyncIteration:
             break
+
+
+def assert_llmobs_span_data(
+    actual,
+    *,
+    span_kind,
+    name=None,
+    parent_id=None,
+    model_name=None,
+    model_provider=None,
+    input_messages=None,
+    input_value=None,
+    output_messages=None,
+    output_value=None,
+    input_documents=None,
+    output_documents=None,
+    error=None,
+    tool_definitions=None,
+    metadata=None,
+    tags=None,
+    metrics=None,
+):
+    """Assert against an LLMObsSpanData payload from ``meta_struct['_llmobs']``.
+
+    Structural fields (``span_kind``, ``name``, ``parent_id``, ``model_name``,
+    ``model_provider``, input/output messages/values/documents,
+    ``tool_definitions``) are strict-equality, checked only when provided.
+
+    ``error`` defaults to asserting no error payload is present. Pass
+    ``error=mock.ANY`` to skip the check.
+
+    ``metadata``, ``tags``, ``metrics`` are top-level subset only: declared
+    top-level keys must equal exactly, extras tolerated.
+    """
+    assert actual, "expected LLMObsSpanData on span, got {!r} (was meta_struct scrubbed?)".format(actual)
+
+    actual_meta = actual.get(LLMOBS_STRUCT.META, {})
+    actual_input = actual_meta.get(LLMOBS_STRUCT.INPUT, {})
+    actual_output = actual_meta.get(LLMOBS_STRUCT.OUTPUT, {})
+
+    failures = []  # every mismatch is collected here and reported together at the end
+
+    def _normalize_messages(msgs):  # applied to both expected and actual message lists
+        if not isinstance(msgs, list):
+            return msgs  # non-list values (e.g. None) pass through unchanged
+        out = []
+        for m in msgs:
+            if isinstance(m, dict) and m.get("role") is None:
+                out.append({**m, "role": ""})  # a missing or None role compares as ""
+            else:
+                out.append(m)
+        return out
+
+    def _check_eq(label, expected_value, actual_value):  # strict equality; records instead of raising
+        if actual_value != expected_value:
+            failures.append(
+                "{} mismatch:\n    expected={!r}\n    actual={!r}".format(label, expected_value, actual_value)
+            )
+
+    def _check_subset(label, expected_subset, actual_dict):  # extra keys in actual_dict are tolerated
+        if not expected_subset.items() <= actual_dict.items():
+            failures.append(
+                "{} subset mismatch:\n    expected={!r}\n    actual={!r}".format(label, expected_subset, actual_dict)
+            )
+
+    _check_eq("span.kind", span_kind, actual_meta.get(LLMOBS_STRUCT.SPAN, {}).get(LLMOBS_STRUCT.KIND))
+    if name is not None:
+        _check_eq("name", name, actual.get(LLMOBS_STRUCT.NAME))
+    if parent_id is not None:
+        _check_eq("parent_id", parent_id, actual.get(LLMOBS_STRUCT.PARENT_ID))
+    if model_name is not None:
+        _check_eq("meta.model_name", model_name, actual_meta.get(LLMOBS_STRUCT.MODEL_NAME))
+    if model_provider is not None:
+        _check_eq("meta.model_provider", model_provider, actual_meta.get(LLMOBS_STRUCT.MODEL_PROVIDER))
+    if input_messages is not None:
+        _check_eq(
+            "meta.input.messages",
+            _normalize_messages(input_messages),
+            _normalize_messages(actual_input.get(LLMOBS_STRUCT.MESSAGES)),
+        )
+    if input_value is not None:
+        _check_eq("meta.input.value", input_value, actual_input.get(LLMOBS_STRUCT.VALUE))
+    if input_documents is not None:
+        _check_eq("meta.input.documents", input_documents, actual_input.get(LLMOBS_STRUCT.DOCUMENTS))
+    if output_messages is not None:
+        _check_eq(
+            "meta.output.messages",
+            _normalize_messages(output_messages),
+            _normalize_messages(actual_output.get(LLMOBS_STRUCT.MESSAGES)),
+        )
+    if output_value is not None:
+        _check_eq("meta.output.value", output_value, actual_output.get(LLMOBS_STRUCT.VALUE))
+    if output_documents is not None:
+        _check_eq("meta.output.documents", output_documents, actual_output.get(LLMOBS_STRUCT.DOCUMENTS))
+    if error is None:  # default: require that no (truthy) error payload is present
+        actual_error = actual_meta.get(LLMOBS_STRUCT.ERROR)
+        if actual_error:
+            failures.append(
+                "meta.error unexpectedly present:\n    expected=<absent>\n    actual={!r}".format(actual_error)
+            )
+    else:  # an explicit expectation was passed: compare it like any other field
+        _check_eq("meta.error", error, actual_meta.get(LLMOBS_STRUCT.ERROR))
+    if tool_definitions is not None:
+        _check_eq("meta.tool_definitions", tool_definitions, actual_meta.get(LLMOBS_STRUCT.TOOL_DEFINITIONS))
+
+    if metadata is not None:
+        _check_subset("meta.metadata", metadata, actual_meta.get(LLMOBS_STRUCT.METADATA, {}))
+    if tags is not None:
+        _check_subset("tags", tags, actual.get(LLMOBS_STRUCT.TAGS, {}))
+    if metrics is not None:
+        _check_subset("metrics", metrics, actual.get(LLMOBS_STRUCT.METRICS, {}))
+
+    if failures:  # raise once, listing every recorded mismatch
+        raise AssertionError(
+            "assert_llmobs_span_data found {} mismatch(es):\n  - {}".format(len(failures), "\n  - ".join(failures))
+        )

-Original file line number
+Diff line change
@@ @@ -0,0 +1,4 @@ @@
 +---
 +fixes:
 +  - |
 +    LLM Observability: The OpenAI integration now preserves assistant message content when ``tool_calls`` are present on the same message. #17760