diff --git a/ddtrace/llmobs/_evaluators/runner.py b/ddtrace/llmobs/_evaluators/runner.py
index 5e0ab2737f4..780293563d8 100644
--- a/ddtrace/llmobs/_evaluators/runner.py
+++ b/ddtrace/llmobs/_evaluators/runner.py
@@ -32,6 +32,8 @@ class EvaluatorRunner(PeriodicService):
     2. triggers evaluator runs over buffered finished spans on each `periodic` call
     """
 
+    EVALUATORS_ENV_VAR = "DD_LLMOBS_EVALUATORS"
+
     def __init__(self, interval: float, llmobs_service=None, evaluators=None):
         super(EvaluatorRunner, self).__init__(interval=interval)
         self._lock = forksafe.RLock()
@@ -46,7 +48,7 @@ def __init__(self, interval: float, llmobs_service=None, evaluators=None):
         if len(self.evaluators) > 0:
             return
 
-        evaluator_str = os.getenv("_DD_LLMOBS_EVALUATORS")
+        evaluator_str = os.getenv(self.EVALUATORS_ENV_VAR)
         if evaluator_str is None:
             return
 
diff --git a/ddtrace/llmobs/_evaluators/sampler.py b/ddtrace/llmobs/_evaluators/sampler.py
index 3598e90f7f3..524af217f83 100644
--- a/ddtrace/llmobs/_evaluators/sampler.py
+++ b/ddtrace/llmobs/_evaluators/sampler.py
@@ -46,7 +46,7 @@ def __repr__(self):
 
 
 class EvaluatorRunnerSampler:
-    SAMPLING_RULES_ENV_VAR = "_DD_LLMOBS_EVALUATOR_SAMPLING_RULES"
+    SAMPLING_RULES_ENV_VAR = "DD_LLMOBS_EVALUATOR_SAMPLING_RULES"
 
     def __init__(self):
         self.rules = self.parse_rules()
@@ -59,8 +59,9 @@ def sample(self, evaluator_label, span):
 
     def parse_rules(self) -> List[EvaluatorRunnerSamplingRule]:
         rules = []
+
         sampling_rules_str = os.getenv(self.SAMPLING_RULES_ENV_VAR)
-        telemetry_writer.add_configuration("_DD_LLMOBS_EVALUATOR_SAMPLING_RULES", sampling_rules_str, origin="env")
+        telemetry_writer.add_configuration(self.SAMPLING_RULES_ENV_VAR, sampling_rules_str, origin="env")
 
         def parsing_failed_because(msg, maybe_throw_this):
             telemetry_writer.add_log(
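The sampler change is the same rename: parse_rules() now reports the configured value to telemetry under whichever name SAMPLING_RULES_ENV_VAR holds. A hedged sketch of setting the variable programmatically, assuming the JSON rule shape (a sample_rate plus optional evaluator_label and span_name matchers) that EvaluatorRunnerSamplingRule appears to consume:

    import json
    import os

    # Illustrative rule: sample half of the ragas_faithfulness evaluator runs.
    # The key names are assumptions based on EvaluatorRunnerSamplingRule, not this diff.
    rules = [{"sample_rate": 0.5, "evaluator_label": "ragas_faithfulness"}]
    os.environ["DD_LLMOBS_EVALUATOR_SAMPLING_RULES"] = json.dumps(rules)
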
diff --git a/releasenotes/notes/ragas-integration-a81b696757c0e7a5.yaml b/releasenotes/notes/ragas-integration-a81b696757c0e7a5.yaml
new file mode 100644
index 00000000000..7963f891661
--- /dev/null
+++ b/releasenotes/notes/ragas-integration-a81b696757c0e7a5.yaml
@@ -0,0 +1,20 @@
+---
+features:
+  - |
+    LLM Observability: This introduces an integration with the `RAGAS `_ evaluation framework to continuously
+    monitor the performance of context-augmented LLM generations in production.
+
+    The integration supports evaluating LLM inferences with the following RAGAS metrics:
+
+    - `Faithfulness `_: measures whether the LLM response is faithful to the provided context.
+    - `Answer Relevancy `_: measures how relevant the LLM response is to the user input.
+    - `Context Precision `_: measures how effectively the context is used in the generated response.
+
+    To learn more, see the `LLM Observability evaluations guide `_.
+deprecations:
+  - |
+    LLM Observability: The ``_DD_LLMOBS_EVALUATORS`` environment variable is deprecated and will be removed in
+    ddtrace 3.0.0. Use ``DD_LLMOBS_EVALUATORS`` instead.
+  - |
+    LLM Observability: The ``_DD_LLMOBS_EVALUATOR_SAMPLING_RULES`` environment variable is deprecated and will be
+    removed in ddtrace 3.0.0. Use ``DD_LLMOBS_EVALUATOR_SAMPLING_RULES`` instead.
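From a user's perspective, the migration described in the deprecation notes amounts to dropping the leading underscore. A hedged end-to-end sketch (the ml_app name is hypothetical; LLMObs.enable() picking up configured evaluators is exercised by test_service_enable_starts_evaluator_runner_when_evaluators_exist below):

    import os

    # New name introduced by this change; the underscore-prefixed variant is deprecated.
    os.environ["DD_LLMOBS_EVALUATORS"] = "ragas_faithfulness"

    from ddtrace.llmobs import LLMObs

    # Enabling LLM Observability starts the evaluator runner with the configured evaluators.
    LLMObs.enable(ml_app="my-app")
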
diff --git a/tests/llmobs/test_llmobs_evaluator_runner.py b/tests/llmobs/test_llmobs_evaluator_runner.py
index a2c4278297c..1c941c52d83 100644
--- a/tests/llmobs/test_llmobs_evaluator_runner.py
+++ b/tests/llmobs/test_llmobs_evaluator_runner.py
@@ -120,7 +120,7 @@ def test_evaluator_runner_on_exit(mock_writer_logs, run_python_code_in_subproces
 
 
 def test_evaluator_runner_unsupported_evaluator():
-    with override_env({"_DD_LLMOBS_EVALUATORS": "unsupported"}):
+    with override_env({EvaluatorRunner.EVALUATORS_ENV_VAR: "unsupported"}):
         with pytest.raises(ValueError):
             EvaluatorRunner(interval=0.01, llmobs_service=mock.MagicMock())
 
diff --git a/tests/llmobs/test_llmobs_ragas_evaluators.py b/tests/llmobs/test_llmobs_ragas_evaluators.py
index 9766c18c1e5..c46dce740c2 100644
--- a/tests/llmobs/test_llmobs_ragas_evaluators.py
+++ b/tests/llmobs/test_llmobs_ragas_evaluators.py
@@ -6,7 +6,8 @@
 from ddtrace.llmobs._evaluators.ragas.answer_relevancy import RagasAnswerRelevancyEvaluator
 from ddtrace.llmobs._evaluators.ragas.context_precision import RagasContextPrecisionEvaluator
 from ddtrace.llmobs._evaluators.ragas.faithfulness import RagasFaithfulnessEvaluator
-from ddtrace.trace import Span
+from ddtrace.llmobs._evaluators.runner import EvaluatorRunner
+from ddtrace.span import Span
 from tests.llmobs._utils import _expected_llmobs_llm_span_event
 from tests.llmobs._utils import _expected_ragas_answer_relevancy_spans
 from tests.llmobs._utils import _expected_ragas_context_precision_spans
@@ -235,7 +236,7 @@ def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit(mock_writer_log
"PYTHONPATH": ":".join(pypath),
"OPENAI_API_KEY": os.getenv("OPENAI_API_KEY", "dummy-openai-api-key"),
"_DD_LLMOBS_EVALUATOR_INTERVAL": "5",
- "_DD_LLMOBS_EVALUATORS": "ragas_faithfulness",
+ EvaluatorRunner.EVALUATORS_ENV_VAR: "ragas_faithfulness",
"DD_TRACE_ENABLED": "0",
}
)
diff --git a/tests/llmobs/test_llmobs_service.py b/tests/llmobs/test_llmobs_service.py
index ff099ae3f71..2fe3e1fbfab 100644
--- a/tests/llmobs/test_llmobs_service.py
+++ b/tests/llmobs/test_llmobs_service.py
@@ -1384,7 +1384,7 @@ def test_llmobs_fork_recreates_and_restarts_eval_metric_writer():
 def test_llmobs_fork_recreates_and_restarts_evaluator_runner(mock_ragas_evaluator):
     """Test that forking a process correctly recreates and restarts the EvaluatorRunner."""
     pytest.importorskip("ragas")
-    with override_env(dict(_DD_LLMOBS_EVALUATORS="ragas_faithfulness")):
+    with override_env(dict(DD_LLMOBS_EVALUATORS="ragas_faithfulness")):
         with mock.patch("ddtrace.llmobs._evaluators.runner.EvaluatorRunner.periodic"):
             llmobs_service.enable(_tracer=DummyTracer(), ml_app="test_app")
             original_pid = llmobs_service._instance.tracer._pid
@@ -1464,9 +1464,9 @@ def test_llmobs_fork_submit_evaluation(monkeypatch):
 
 def test_llmobs_fork_evaluator_runner_run(monkeypatch):
     """Test that forking a process correctly encodes new spans created in each process."""
-    monkeypatch.setenv("_DD_LLMOBS_EVALUATOR_INTERVAL", 5.0)
+    monkeypatch.setenv("DD_LLMOBS_EVALUATOR_INTERVAL", 5.0)
     pytest.importorskip("ragas")
-    monkeypatch.setenv("_DD_LLMOBS_EVALUATORS", "ragas_faithfulness")
+    monkeypatch.setenv("DD_LLMOBS_EVALUATORS", "ragas_faithfulness")
     with mock.patch("ddtrace.llmobs._evaluators.runner.EvaluatorRunner.periodic"):
         llmobs_service.enable(_tracer=DummyTracer(), ml_app="test_app", api_key="test_api_key")
         pid = os.fork()
@@ -1757,7 +1757,7 @@ async def test_annotation_context_async_nested(llmobs):
 def test_service_enable_starts_evaluator_runner_when_evaluators_exist():
     pytest.importorskip("ragas")
     with override_global_config(dict(_dd_api_key="", _llmobs_ml_app="")):
-        with override_env(dict(_DD_LLMOBS_EVALUATORS="ragas_faithfulness")):
+        with override_env(dict(DD_LLMOBS_EVALUATORS="ragas_faithfulness")):
             dummy_tracer = DummyTracer()
             llmobs_service.enable(_tracer=dummy_tracer)
             llmobs_instance = llmobs_service._instance