Skip to content

Commit 857c67d

Browse files
dd-octo-sts[bot]github-actions[bot]jessicagamio
authored
fix(llmobs): preserve openai assistant content when tool_calls are present [backport 4.8] (#18092)
Backport #17760 to 4.8 Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Jessica Gamio <52049720+jessicagamio@users.noreply.github.com>
1 parent cfc4cd7 commit 857c67d

5 files changed

Lines changed: 310 additions & 1 deletion

File tree

ddtrace/llmobs/_integrations/utils.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,8 @@ def openai_set_meta_tags_from_chat(
345345
raw_content = _get_attr(m, "content", "")
346346
if isinstance(raw_content, list):
347347
content = _extract_content_parts(raw_content)
348+
elif raw_content is None:
349+
content = ""
348350
else:
349351
content = str(raw_content)
350352
role = str(_get_attr(m, "role", ""))
@@ -354,13 +356,18 @@ def openai_set_meta_tags_from_chat(
354356
core.dispatch(DISPATCH_ON_TOOL_CALL_OUTPUT_USED, (tool_call_id, span))
355357

356358
extracted_tool_calls, extracted_tool_results = _openai_extract_tool_calls_and_results_chat(m)
359+
pre_react_call_count = len(extracted_tool_calls)
357360
if role != "system":
358361
# ignore system messages as we may unintentionally parse instructions as tool calls
359362
capture_plain_text_tool_usage(extracted_tool_calls, extracted_tool_results, content, span, is_input=True)
360363

364+
# True if a ReAct-style "Action:" call was parsed out of the content string above
365+
react_appended = len(extracted_tool_calls) > pre_react_call_count
366+
361367
if extracted_tool_calls:
362368
processed_message["tool_calls"] = extracted_tool_calls
363-
processed_message["content"] = "" # reset content to empty string if tool calls present
369+
if react_appended:
370+
processed_message["content"] = "" # only clear content if react tool calls present
364371
if extracted_tool_results:
365372
processed_message["tool_results"] = extracted_tool_results
366373
processed_message["content"] = "" # reset content to empty string if tool results present
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
fixes:
3+
- |
4+
LLM Observability: The OpenAI integration now preserves assistant message content when ``tool_calls`` are present on the same message. #17760

tests/contrib/openai/conftest.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,30 @@ def snapshot_tracer(tracer, openai, patch_openai):
131131
return tracer
132132

133133

134+
@pytest.fixture
135+
def openai_llmobs(snapshot_tracer, monkeypatch):
136+
# Preserve meta_struct["_llmobs"] on spans so tests can assert against
137+
# LLMObsSpanData via _get_llmobs_data_metastruct; production scrubs it
138+
# after enqueueing when _DD_LLMOBS_EXPORT=llmobs (the default).
139+
monkeypatch.setenv("_DD_LLMOBS_EXPORT", "agent")
140+
LLMObs.disable()
141+
with override_global_config(
142+
{
143+
"_llmobs_ml_app": "<ml-app-name>",
144+
"_dd_api_key": "<not-a-real-key>",
145+
}
146+
):
147+
LLMObs.enable(
148+
_tracer=snapshot_tracer,
149+
integrations_enabled=False,
150+
instrumented_proxy_urls={"http://localhost:4000"},
151+
)
152+
LLMObs._instance._llmobs_span_writer.stop()
153+
LLMObs._instance._llmobs_span_writer = mock.MagicMock()
154+
yield LLMObs
155+
LLMObs.disable()
156+
157+
134158
@pytest.fixture
135159
def test_spans(ddtrace_global_config, test_spans, snapshot_tracer):
136160
if ddtrace_global_config.get("_llmobs_enabled", False):

tests/contrib/openai/test_openai_llmobs.py

Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
from ddtrace.internal.utils.version import parse_version
88
from ddtrace.llmobs._integrations.utils import _est_tokens
9+
from ddtrace.llmobs._utils import _get_llmobs_data_metastruct
910
from ddtrace.llmobs._utils import safe_json
1011
from tests.contrib.openai.utils import assert_prompt_tracking
1112
from tests.contrib.openai.utils import chat_completion_custom_functions
@@ -22,6 +23,7 @@
2223
from tests.llmobs._utils import DEEP_TOOL_SCHEMA
2324
from tests.llmobs._utils import _expected_llmobs_llm_span_event
2425
from tests.llmobs._utils import _expected_llmobs_non_llm_span_event
26+
from tests.llmobs._utils import assert_llmobs_span_data
2527

2628

2729
EXPECTED_TOOL_DEFINITIONS = [
@@ -982,6 +984,161 @@ def test_chat_completion_tool_call_with_follow_up(
982984
]
983985
)
984986

987+
@pytest.mark.skipif(
988+
parse_version(openai_module.version.VERSION) < (1, 1), reason="Tool calls available after v1.1.0"
989+
)
990+
@mock.patch("openai._base_client.SyncAPIClient.post")
991+
def test_chat_completion_tool_call_preserves_assistant_content(
992+
self, mock_completions_post, openai, openai_llmobs, test_spans
993+
):
994+
"""MLOS-605: assistant messages carrying both prose content and structured tool_calls
995+
on the same message must preserve the prose in the LLMObs span. OpenAI's native
996+
function-calling schema allows content and tool_calls to coexist (content is the
997+
model's narration, tool_calls is the structured invocation) — earlier logic cleared
998+
content unconditionally which dropped legitimate prose from replayed history.
999+
"""
1000+
mock_completions_post.return_value = mock_openai_chat_completions_response
1001+
assistant_prose = "I'll look up the student's profile before answering."
1002+
tool_call_id = "call_get_user_context_0"
1003+
messages = [
1004+
{"role": "user", "content": chat_completion_input_description},
1005+
{
1006+
"role": "assistant",
1007+
"content": assistant_prose,
1008+
"tool_calls": [
1009+
{
1010+
"id": tool_call_id,
1011+
"type": "function",
1012+
"function": {"name": "extract_student_info", "arguments": "{}"},
1013+
}
1014+
],
1015+
},
1016+
{"role": "tool", "tool_call_id": tool_call_id, "content": '{"verified": true}'},
1017+
{"role": "user", "content": "Thanks, can you summarize?"},
1018+
]
1019+
client = openai.OpenAI()
1020+
client.chat.completions.create(model="gpt-3.5-turbo", messages=messages)
1021+
1022+
spans = [s for trace in test_spans.pop_traces() for s in trace]
1023+
assert len(spans) == 1
1024+
assert_llmobs_span_data(
1025+
_get_llmobs_data_metastruct(spans[0]),
1026+
span_kind="llm",
1027+
input_messages=[
1028+
{"role": "user", "content": chat_completion_input_description},
1029+
{
1030+
"role": "assistant",
1031+
"content": assistant_prose,
1032+
"tool_calls": [
1033+
{"name": "extract_student_info", "arguments": {}, "tool_id": tool_call_id, "type": "function"}
1034+
],
1035+
},
1036+
{
1037+
"role": "tool",
1038+
"content": "",
1039+
"tool_results": [
1040+
{"name": "", "result": '{"verified": true}', "tool_id": tool_call_id, "type": "tool_result"}
1041+
],
1042+
},
1043+
{"role": "user", "content": "Thanks, can you summarize?"},
1044+
],
1045+
)
1046+
1047+
@pytest.mark.skipif(
1048+
parse_version(openai_module.version.VERSION) < (1, 1), reason="Tool calls available after v1.1.0"
1049+
)
1050+
@mock.patch("openai._base_client.SyncAPIClient.post")
1051+
def test_chat_completion_tool_call_with_none_content_does_not_leak_string(
1052+
self, mock_completions_post, openai, openai_llmobs, test_spans
1053+
):
1054+
"""MLOS-605: OpenAI returns `content=None` alongside `tool_calls` when the model issues
1055+
a pure function call with no narration. Earlier logic ran `str(None)` → "None" and
1056+
relied on the unconditional content-clear to mask it; once the clear became conditional
1057+
the literal string "None" leaked into the span. Normalize None → "" before stringifying.
1058+
"""
1059+
mock_completions_post.return_value = mock_openai_chat_completions_response
1060+
tool_call_id = "call_get_user_context_0"
1061+
messages = [
1062+
{"role": "user", "content": chat_completion_input_description},
1063+
{
1064+
"role": "assistant",
1065+
"content": None,
1066+
"tool_calls": [
1067+
{
1068+
"id": tool_call_id,
1069+
"type": "function",
1070+
"function": {"name": "extract_student_info", "arguments": "{}"},
1071+
}
1072+
],
1073+
},
1074+
{"role": "tool", "tool_call_id": tool_call_id, "content": '{"verified": true}'},
1075+
{"role": "user", "content": "Thanks, can you summarize?"},
1076+
]
1077+
client = openai.OpenAI()
1078+
client.chat.completions.create(model="gpt-3.5-turbo", messages=messages)
1079+
1080+
spans = [s for trace in test_spans.pop_traces() for s in trace]
1081+
assert len(spans) == 1
1082+
assert_llmobs_span_data(
1083+
_get_llmobs_data_metastruct(spans[0]),
1084+
span_kind="llm",
1085+
input_messages=[
1086+
{"role": "user", "content": chat_completion_input_description},
1087+
{
1088+
"role": "assistant",
1089+
"content": "",
1090+
"tool_calls": [
1091+
{"name": "extract_student_info", "arguments": {}, "tool_id": tool_call_id, "type": "function"}
1092+
],
1093+
},
1094+
{
1095+
"role": "tool",
1096+
"content": "",
1097+
"tool_results": [
1098+
{"name": "", "result": '{"verified": true}', "tool_id": tool_call_id, "type": "tool_result"}
1099+
],
1100+
},
1101+
{"role": "user", "content": "Thanks, can you summarize?"},
1102+
],
1103+
)
1104+
1105+
@pytest.mark.skipif(
1106+
parse_version(openai_module.version.VERSION) < (1, 1), reason="Tool calls available after v1.1.0"
1107+
)
1108+
@mock.patch("openai._base_client.SyncAPIClient.post")
1109+
def test_chat_completion_react_style_content_still_deduplicates(
1110+
self, mock_completions_post, openai, openai_llmobs, test_spans
1111+
):
1112+
"""Regression guard for ReAct-style agents: when content literally contains the
1113+
`Action:/Action Input:` pattern and structured tool_calls are extracted from it,
1114+
clear content to avoid rendering the same call twice in the LLMObs UI.
1115+
"""
1116+
mock_completions_post.return_value = mock_openai_chat_completions_response
1117+
react_content = "Action: extract_student_info\nAction Input: {}"
1118+
messages = [
1119+
{"role": "user", "content": chat_completion_input_description},
1120+
{"role": "assistant", "content": react_content},
1121+
]
1122+
client = openai.OpenAI()
1123+
client.chat.completions.create(model="gpt-3.5-turbo", messages=messages)
1124+
1125+
spans = [s for trace in test_spans.pop_traces() for s in trace]
1126+
assert len(spans) == 1
1127+
assert_llmobs_span_data(
1128+
_get_llmobs_data_metastruct(spans[0]),
1129+
span_kind="llm",
1130+
input_messages=[
1131+
{"role": "user", "content": chat_completion_input_description},
1132+
{
1133+
"role": "assistant",
1134+
"content": "",
1135+
"tool_calls": [
1136+
{"name": "extract_student_info", "arguments": {}, "tool_id": "", "type": "function"}
1137+
],
1138+
},
1139+
],
1140+
)
1141+
9851142
@pytest.mark.skipif(
9861143
parse_version(openai_module.version.VERSION) < (1, 66),
9871144
reason="Responses API with custom tools available after v1.66.0",

tests/llmobs/_utils.py

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
import ddtrace
1616
from ddtrace.internal.utils.formats import format_trace_id
17+
from ddtrace.llmobs._constants import LLMOBS_STRUCT
1718
from ddtrace.llmobs._constants import ROOT_PARENT_ID
1819
from ddtrace.llmobs._utils import _get_nearest_llmobs_ancestor
1920
from ddtrace.llmobs._utils import _get_span_name
@@ -1037,3 +1038,119 @@ async def anext_stream(stream):
10371038
await stream.__anext__()
10381039
except StopAsyncIteration:
10391040
break
1041+
1042+
1043+
def assert_llmobs_span_data(
1044+
actual,
1045+
*,
1046+
span_kind,
1047+
name=None,
1048+
parent_id=None,
1049+
model_name=None,
1050+
model_provider=None,
1051+
input_messages=None,
1052+
input_value=None,
1053+
output_messages=None,
1054+
output_value=None,
1055+
input_documents=None,
1056+
output_documents=None,
1057+
error=None,
1058+
tool_definitions=None,
1059+
metadata=None,
1060+
tags=None,
1061+
metrics=None,
1062+
):
1063+
"""Assert against an LLMObsSpanData payload from ``meta_struct['_llmobs']``.
1064+
1065+
Structural fields (``span_kind``, ``name``, ``parent_id``, ``model_name``,
1066+
``model_provider``, input/output messages/values/documents,
1067+
``tool_definitions``) are strict-equality, checked only when provided.
1068+
1069+
``error`` defaults to asserting no error payload is present. Pass
1070+
``error=mock.ANY`` to skip the check.
1071+
1072+
``metadata``, ``tags``, ``metrics`` are top-level subset only: declared
1073+
top-level keys must equal exactly, extras tolerated.
1074+
"""
1075+
assert actual, "expected LLMObsSpanData on span, got {!r} (was meta_struct scrubbed?)".format(actual)
1076+
1077+
actual_meta = actual.get(LLMOBS_STRUCT.META, {})
1078+
actual_input = actual_meta.get(LLMOBS_STRUCT.INPUT, {})
1079+
actual_output = actual_meta.get(LLMOBS_STRUCT.OUTPUT, {})
1080+
1081+
failures = []
1082+
1083+
def _normalize_messages(msgs):
1084+
if not isinstance(msgs, list):
1085+
return msgs
1086+
out = []
1087+
for m in msgs:
1088+
if isinstance(m, dict) and m.get("role") is None:
1089+
out.append({**m, "role": ""})
1090+
else:
1091+
out.append(m)
1092+
return out
1093+
1094+
def _check_eq(label, expected_value, actual_value):
1095+
if actual_value != expected_value:
1096+
failures.append(
1097+
"{} mismatch:\n expected={!r}\n actual={!r}".format(label, expected_value, actual_value)
1098+
)
1099+
1100+
def _check_subset(label, expected_subset, actual_dict):
1101+
if not expected_subset.items() <= actual_dict.items():
1102+
failures.append(
1103+
"{} subset mismatch:\n expected={!r}\n actual={!r}".format(label, expected_subset, actual_dict)
1104+
)
1105+
1106+
_check_eq("span.kind", span_kind, actual_meta.get(LLMOBS_STRUCT.SPAN, {}).get(LLMOBS_STRUCT.KIND))
1107+
if name is not None:
1108+
_check_eq("name", name, actual.get(LLMOBS_STRUCT.NAME))
1109+
if parent_id is not None:
1110+
_check_eq("parent_id", parent_id, actual.get(LLMOBS_STRUCT.PARENT_ID))
1111+
if model_name is not None:
1112+
_check_eq("meta.model_name", model_name, actual_meta.get(LLMOBS_STRUCT.MODEL_NAME))
1113+
if model_provider is not None:
1114+
_check_eq("meta.model_provider", model_provider, actual_meta.get(LLMOBS_STRUCT.MODEL_PROVIDER))
1115+
if input_messages is not None:
1116+
_check_eq(
1117+
"meta.input.messages",
1118+
_normalize_messages(input_messages),
1119+
_normalize_messages(actual_input.get(LLMOBS_STRUCT.MESSAGES)),
1120+
)
1121+
if input_value is not None:
1122+
_check_eq("meta.input.value", input_value, actual_input.get(LLMOBS_STRUCT.VALUE))
1123+
if input_documents is not None:
1124+
_check_eq("meta.input.documents", input_documents, actual_input.get(LLMOBS_STRUCT.DOCUMENTS))
1125+
if output_messages is not None:
1126+
_check_eq(
1127+
"meta.output.messages",
1128+
_normalize_messages(output_messages),
1129+
_normalize_messages(actual_output.get(LLMOBS_STRUCT.MESSAGES)),
1130+
)
1131+
if output_value is not None:
1132+
_check_eq("meta.output.value", output_value, actual_output.get(LLMOBS_STRUCT.VALUE))
1133+
if output_documents is not None:
1134+
_check_eq("meta.output.documents", output_documents, actual_output.get(LLMOBS_STRUCT.DOCUMENTS))
1135+
if error is None:
1136+
actual_error = actual_meta.get(LLMOBS_STRUCT.ERROR)
1137+
if actual_error:
1138+
failures.append(
1139+
"meta.error unexpectedly present:\n expected=<absent>\n actual={!r}".format(actual_error)
1140+
)
1141+
else:
1142+
_check_eq("meta.error", error, actual_meta.get(LLMOBS_STRUCT.ERROR))
1143+
if tool_definitions is not None:
1144+
_check_eq("meta.tool_definitions", tool_definitions, actual_meta.get(LLMOBS_STRUCT.TOOL_DEFINITIONS))
1145+
1146+
if metadata is not None:
1147+
_check_subset("meta.metadata", metadata, actual_meta.get(LLMOBS_STRUCT.METADATA, {}))
1148+
if tags is not None:
1149+
_check_subset("tags", tags, actual.get(LLMOBS_STRUCT.TAGS, {}))
1150+
if metrics is not None:
1151+
_check_subset("metrics", metrics, actual.get(LLMOBS_STRUCT.METRICS, {}))
1152+
1153+
if failures:
1154+
raise AssertionError(
1155+
"assert_llmobs_span_data found {} mismatch(es):\n - {}".format(len(failures), "\n - ".join(failures))
1156+
)

0 commit comments

Comments
 (0)