diff --git a/ddtrace/llmobs/_constants.py b/ddtrace/llmobs/_constants.py
index 27000b36aa..a7d13ca60c 100644
--- a/ddtrace/llmobs/_constants.py
+++ b/ddtrace/llmobs/_constants.py
@@ -50,9 +50,9 @@
 # Used to differentiate traces of Datadog-run operations vs user-application operations.
 RUNNER_IS_INTEGRATION_SPAN_TAG = "runner.integration"
 
-# The ml app of all ragas traces have this prefix that we use to detect
-# whether a span is generated from the ragas evaluation itself.
-RAGAS_ML_APP_PREFIX = "dd-ragas"
+# All ragas traces have this context item set so we can differentiate
+# spans generated from the ragas integration vs user application spans.
+IS_EVALUATION_SPAN = "_is_evaluation_span"
 
 ANNOTATIONS_CONTEXT_ID = "annotations_context_id"
 INTERNAL_CONTEXT_VARIABLE_KEYS = "_dd_context_variable_keys"
diff --git a/ddtrace/llmobs/_evaluators/ragas/answer_relevancy.py b/ddtrace/llmobs/_evaluators/ragas/answer_relevancy.py
index 9a640e0845..5fd6e6b7c0 100644
--- a/ddtrace/llmobs/_evaluators/ragas/answer_relevancy.py
+++ b/ddtrace/llmobs/_evaluators/ragas/answer_relevancy.py
@@ -5,6 +5,7 @@
 
 from ddtrace.internal.logger import get_logger
 from ddtrace.llmobs._constants import EVALUATION_SPAN_METADATA
+from ddtrace.llmobs._constants import IS_EVALUATION_SPAN
 from ddtrace.llmobs._evaluators.ragas.base import BaseRagasEvaluator
 from ddtrace.llmobs._evaluators.ragas.base import _get_ml_app_for_ragas_trace
 
@@ -84,6 +85,7 @@ def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]
         with self.llmobs_service.workflow(
             "dd-ragas.answer_relevancy", ml_app=_get_ml_app_for_ragas_trace(span_event)
         ) as ragas_ar_workflow:
+            ragas_ar_workflow._set_ctx_item(IS_EVALUATION_SPAN, True)
             try:
                 evaluation_metadata[EVALUATION_SPAN_METADATA] = self.llmobs_service.export_span(span=ragas_ar_workflow)
 
diff --git a/ddtrace/llmobs/_evaluators/ragas/base.py b/ddtrace/llmobs/_evaluators/ragas/base.py
index 798c8e2fcc..2f6522496d 100644
--- a/ddtrace/llmobs/_evaluators/ragas/base.py
+++ b/ddtrace/llmobs/_evaluators/ragas/base.py
@@ -4,6 +4,7 @@
 from typing import Tuple
 from typing import Union
 
+from ddtrace import config
 from ddtrace.internal.logger import get_logger
 from ddtrace.internal.telemetry import telemetry_writer
 from ddtrace.internal.telemetry.constants import TELEMETRY_LOG_LEVEL
@@ -11,7 +12,6 @@
 from ddtrace.internal.utils.version import parse_version
 from ddtrace.llmobs._constants import INTERNAL_CONTEXT_VARIABLE_KEYS
 from ddtrace.llmobs._constants import INTERNAL_QUERY_VARIABLE_KEYS
-from ddtrace.llmobs._constants import RAGAS_ML_APP_PREFIX
 
 
 logger = get_logger(__name__)
@@ -94,9 +94,7 @@ def _get_ml_app_for_ragas_trace(span_event: dict) -> str:
         if isinstance(tag, str) and tag.startswith("ml_app:"):
             ml_app = tag.split(":")[1]
             break
-    if not ml_app:
-        return RAGAS_ML_APP_PREFIX
-    return "{}-{}".format(RAGAS_ML_APP_PREFIX, ml_app)
+    return ml_app or config._llmobs_ml_app or "unknown-ml-app"
 
 
 class BaseRagasEvaluator:
diff --git a/ddtrace/llmobs/_evaluators/ragas/context_precision.py b/ddtrace/llmobs/_evaluators/ragas/context_precision.py
index 990302931c..13ccb1d593 100644
--- a/ddtrace/llmobs/_evaluators/ragas/context_precision.py
+++ b/ddtrace/llmobs/_evaluators/ragas/context_precision.py
@@ -6,6 +6,7 @@
 from ddtrace.internal.logger import get_logger
 from ddtrace.llmobs._constants import EVALUATION_KIND_METADATA
 from ddtrace.llmobs._constants import EVALUATION_SPAN_METADATA
+from ddtrace.llmobs._constants import IS_EVALUATION_SPAN
 from ddtrace.llmobs._evaluators.ragas.base import BaseRagasEvaluator
 from ddtrace.llmobs._evaluators.ragas.base import _get_ml_app_for_ragas_trace
 
@@ -82,6 +83,7 @@ def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]
         with self.llmobs_service.workflow(
             "dd-ragas.context_precision", ml_app=_get_ml_app_for_ragas_trace(span_event)
         ) as ragas_cp_workflow:
+            ragas_cp_workflow._set_ctx_item(IS_EVALUATION_SPAN, True)
             try:
                 evaluation_metadata[EVALUATION_SPAN_METADATA] = self.llmobs_service.export_span(span=ragas_cp_workflow)
 
diff --git a/ddtrace/llmobs/_evaluators/ragas/faithfulness.py b/ddtrace/llmobs/_evaluators/ragas/faithfulness.py
index 98725b1f27..2c413f2cec 100644
--- a/ddtrace/llmobs/_evaluators/ragas/faithfulness.py
+++ b/ddtrace/llmobs/_evaluators/ragas/faithfulness.py
@@ -9,6 +9,7 @@
 from ddtrace.llmobs._constants import EVALUATION_KIND_METADATA
 from ddtrace.llmobs._constants import EVALUATION_SPAN_METADATA
 from ddtrace.llmobs._constants import FAITHFULNESS_DISAGREEMENTS_METADATA
+from ddtrace.llmobs._constants import IS_EVALUATION_SPAN
 from ddtrace.llmobs._evaluators.ragas.base import BaseRagasEvaluator
 from ddtrace.llmobs._evaluators.ragas.base import _get_ml_app_for_ragas_trace
 
@@ -96,6 +97,7 @@ def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]]
         with self.llmobs_service.workflow(
             "dd-ragas.faithfulness", ml_app=_get_ml_app_for_ragas_trace(span_event)
         ) as ragas_faithfulness_workflow:
+            ragas_faithfulness_workflow._set_ctx_item(IS_EVALUATION_SPAN, True)
             try:
                 evaluation_metadata[EVALUATION_SPAN_METADATA] = self.llmobs_service.export_span(
                     span=ragas_faithfulness_workflow
diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py
index b4f1dc1b2f..c228409063 100644
--- a/ddtrace/llmobs/_llmobs.py
+++ b/ddtrace/llmobs/_llmobs.py
@@ -38,6 +38,7 @@
 from ddtrace.llmobs._constants import INPUT_PARAMETERS
 from ddtrace.llmobs._constants import INPUT_PROMPT
 from ddtrace.llmobs._constants import INPUT_VALUE
+from ddtrace.llmobs._constants import IS_EVALUATION_SPAN
 from ddtrace.llmobs._constants import METADATA
 from ddtrace.llmobs._constants import METRICS
 from ddtrace.llmobs._constants import ML_APP
@@ -59,6 +60,7 @@
 from ddtrace.llmobs._utils import _get_session_id
 from ddtrace.llmobs._utils import _get_span_name
 from ddtrace.llmobs._utils import _inject_llmobs_parent_id
+from ddtrace.llmobs._utils import _is_evaluation_span
 from ddtrace.llmobs._utils import safe_json
 from ddtrace.llmobs._utils import validate_prompt
 from ddtrace.llmobs._writer import LLMObsEvalMetricWriter
@@ -123,16 +125,16 @@ def _submit_llmobs_span(self, span: Span) -> None:
         """Generate and submit an LLMObs span event to be sent to LLMObs."""
         span_event = None
         is_llm_span = span._get_ctx_item(SPAN_KIND) == "llm"
-        is_ragas_integration_span = False
+        is_evaluation_span = False
         try:
-            span_event, is_ragas_integration_span = self._llmobs_span_event(span)
+            span_event, is_evaluation_span = self._llmobs_span_event(span)
             self._llmobs_span_writer.enqueue(span_event)
         except (KeyError, TypeError):
             log.error(
                 "Error generating LLMObs span event for span %s, likely due to malformed span", span, exc_info=True
             )
         finally:
-            if not span_event or not is_llm_span or is_ragas_integration_span:
+            if not span_event or not is_llm_span or is_evaluation_span:
                 return
             if self._evaluator_runner:
                 self._evaluator_runner.enqueue(span_event, span)
@@ -185,10 +187,8 @@ def _llmobs_span_event(cls, span: Span) -> Tuple[Dict[str, Any], bool]:
         metrics = span._get_ctx_item(METRICS) or {}
         ml_app = _get_ml_app(span)
 
-        is_ragas_integration_span = False
-
-        if ml_app.startswith(constants.RAGAS_ML_APP_PREFIX):
-            is_ragas_integration_span = True
+        is_evaluation_span = _is_evaluation_span(span)
+        span._set_ctx_item(IS_EVALUATION_SPAN, is_evaluation_span)
 
         span._set_ctx_item(ML_APP, ml_app)
         parent_id = str(_get_llmobs_parent_id(span) or "undefined")
@@ -210,9 +210,9 @@ def _llmobs_span_event(cls, span: Span) -> Tuple[Dict[str, Any], bool]:
             llmobs_span_event["session_id"] = session_id
 
         llmobs_span_event["tags"] = cls._llmobs_tags(
-            span, ml_app, session_id, is_ragas_integration_span=is_ragas_integration_span
+            span, ml_app, session_id, is_ragas_integration_span=is_evaluation_span
         )
-        return llmobs_span_event, is_ragas_integration_span
+        return llmobs_span_event, is_evaluation_span
 
     @staticmethod
     def _llmobs_tags(
@@ -272,10 +272,6 @@ def _start_service(self) -> None:
                 log.debug("Error starting evaluator runner")
 
     def _stop_service(self) -> None:
-        # Remove listener hooks for span events
-        core.reset_listeners("trace.span_start", self._on_span_start)
-        core.reset_listeners("trace.span_finish", self._on_span_finish)
-
         try:
             self._evaluator_runner.stop()
             # flush remaining evaluation spans & evaluations
@@ -290,6 +286,10 @@ def _stop_service(self) -> None:
         except ServiceStatusError:
             log.debug("Error stopping LLMObs writers")
 
+        # Remove listener hooks for span events
+        core.reset_listeners("trace.span_start", self._on_span_start)
+        core.reset_listeners("trace.span_finish", self._on_span_finish)
+
         forksafe.unregister(self._child_after_fork)
 
     @classmethod
diff --git a/ddtrace/llmobs/_utils.py b/ddtrace/llmobs/_utils.py
index dd616db8be..1a45b4921c 100644
--- a/ddtrace/llmobs/_utils.py
+++ b/ddtrace/llmobs/_utils.py
@@ -12,6 +12,7 @@
 from ddtrace.llmobs._constants import GEMINI_APM_SPAN_NAME
 from ddtrace.llmobs._constants import INTERNAL_CONTEXT_VARIABLE_KEYS
 from ddtrace.llmobs._constants import INTERNAL_QUERY_VARIABLE_KEYS
+from ddtrace.llmobs._constants import IS_EVALUATION_SPAN
 from ddtrace.llmobs._constants import LANGCHAIN_APM_SPAN_NAME
 from ddtrace.llmobs._constants import ML_APP
 from ddtrace.llmobs._constants import OPENAI_APM_SPAN_NAME
@@ -127,6 +128,23 @@ def _get_span_name(span: Span) -> str:
     return span.name
 
 
+def _is_evaluation_span(span: Span) -> bool:
+    """
+    Return whether or not a span is an evaluation span by checking the span's
+    nearest LLMObs span ancestor. Default to 'False'
+    """
+    is_evaluation_span = span._get_ctx_item(IS_EVALUATION_SPAN)
+    if is_evaluation_span is not None:
+        return is_evaluation_span
+    llmobs_parent = _get_nearest_llmobs_ancestor(span)
+    while llmobs_parent:
+        is_evaluation_span = llmobs_parent._get_ctx_item(IS_EVALUATION_SPAN)
+        if is_evaluation_span is not None:
+            return is_evaluation_span
+        llmobs_parent = _get_nearest_llmobs_ancestor(llmobs_parent)
+    return is_evaluation_span or False
+
+
 def _get_ml_app(span: Span) -> str:
     """
     Return the ML app name for a given span, by checking the span's nearest LLMObs span ancestor.
diff --git a/tests/llmobs/_utils.py b/tests/llmobs/_utils.py
index 3583516538..8343aee530 100644
--- a/tests/llmobs/_utils.py
+++ b/tests/llmobs/_utils.py
@@ -488,7 +488,7 @@ def expected_ragas_trace_tags():
         "env:",
         "service:tests.llmobs",
         "source:integration",
-        "ml_app:dd-ragas-unnamed-ml-app",
+        "ml_app:unnamed-ml-app",
         "ddtrace.version:{}".format(ddtrace.__version__),
         "language:python",
         "error:0",
diff --git a/tests/llmobs/test_llmobs_service.py b/tests/llmobs/test_llmobs_service.py
index dad6accdcf..de42899914 100644
--- a/tests/llmobs/test_llmobs_service.py
+++ b/tests/llmobs/test_llmobs_service.py
@@ -16,6 +16,7 @@
 from ddtrace.llmobs._constants import INPUT_PARAMETERS
 from ddtrace.llmobs._constants import INPUT_PROMPT
 from ddtrace.llmobs._constants import INPUT_VALUE
+from ddtrace.llmobs._constants import IS_EVALUATION_SPAN
 from ddtrace.llmobs._constants import METADATA
 from ddtrace.llmobs._constants import METRICS
 from ddtrace.llmobs._constants import MODEL_NAME
@@ -24,7 +25,6 @@
 from ddtrace.llmobs._constants import OUTPUT_MESSAGES
 from ddtrace.llmobs._constants import OUTPUT_VALUE
 from ddtrace.llmobs._constants import PROPAGATED_PARENT_ID_KEY
-from ddtrace.llmobs._constants import RAGAS_ML_APP_PREFIX
 from ddtrace.llmobs._constants import SESSION_ID
 from ddtrace.llmobs._constants import SPAN_KIND
 from ddtrace.llmobs._constants import SPAN_START_WHILE_DISABLED_WARNING
@@ -1538,8 +1538,10 @@ def test_llmobs_with_evaluator_runner(llmobs, mock_llmobs_evaluator_runner):
 
 
 def test_llmobs_with_evaluator_runner_does_not_enqueue_evaluation_spans(mock_llmobs_evaluator_runner, llmobs):
-    with llmobs.llm(model_name="test_model", ml_app="{}-dummy".format(RAGAS_ML_APP_PREFIX)):
-        pass
+    with llmobs.agent(name="test") as agent:
+        agent._set_ctx_item(IS_EVALUATION_SPAN, True)
+        with llmobs.llm(model_name="test_model"):
+            pass
     time.sleep(0.1)
     assert llmobs._instance._evaluator_runner.enqueue.call_count == 0