From f90caa43680dd887d5046aad2080d3033b63622d Mon Sep 17 00:00:00 2001 From: lievan Date: Tue, 14 Jan 2025 15:03:05 -0500 Subject: [PATCH 1/8] small changes --- ddtrace/llmobs/_evaluators/runner.py | 11 +++-------- ddtrace/llmobs/_llmobs.py | 1 + 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/ddtrace/llmobs/_evaluators/runner.py b/ddtrace/llmobs/_evaluators/runner.py index 3d26998f1b..1e8dab28f1 100644 --- a/ddtrace/llmobs/_evaluators/runner.py +++ b/ddtrace/llmobs/_evaluators/runner.py @@ -113,14 +113,9 @@ def periodic(self, _wait_sync=False) -> None: try: if not _wait_sync: for evaluator in self.evaluators: - self.executor.map( - lambda span_event: evaluator.run_and_submit_evaluation(span_event), - [ - span_event - for span_event, span in span_events_and_spans - if self.sampler.sample(evaluator.LABEL, span) - ], - ) + for span_event, span in span_events_and_spans: + if self.sampler.sample(evaluator.LABEL, span): + self.executor.submit(evaluator.run_and_submit_evaluation, span_event) else: for evaluator in self.evaluators: for span_event, span in span_events_and_spans: diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index b4f1dc1b2f..1965d119c0 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -189,6 +189,7 @@ def _llmobs_span_event(cls, span: Span) -> Tuple[Dict[str, Any], bool]: if ml_app.startswith(constants.RAGAS_ML_APP_PREFIX): is_ragas_integration_span = True + ml_app = ml_app.replace(constants.RAGAS_ML_APP_PREFIX + "-", "") span._set_ctx_item(ML_APP, ml_app) parent_id = str(_get_llmobs_parent_id(span) or "undefined") From 5bd38404f4ddb608bb553f3a003db645fe78e957 Mon Sep 17 00:00:00 2001 From: lievan Date: Tue, 14 Jan 2025 19:29:07 -0500 Subject: [PATCH 2/8] ragas version parse --- ddtrace/llmobs/_evaluators/ragas/base.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ddtrace/llmobs/_evaluators/ragas/base.py b/ddtrace/llmobs/_evaluators/ragas/base.py index 23aa4cd3ca..f866ea3c69 100644 --- a/ddtrace/llmobs/_evaluators/ragas/base.py +++ b/ddtrace/llmobs/_evaluators/ragas/base.py @@ -26,8 +26,10 @@ class RagasDependencies: def __init__(self): import ragas - self.ragas_version = parse_version(ragas.__version__) - if self.ragas_version >= (0, 2, 0) or self.ragas_version < (0, 1, 10): + self.ragas_version = ragas.__version__ # type: str + + parsed_version = parse_version(ragas.__version__) + if parsed_version >= (0, 2, 0) or parsed_version < (0, 1, 10): raise NotImplementedError( "Ragas version: {} is not supported".format(self.ragas_version), ) From 0d8f9a6c363718a700f173a9b01ef8c1dd251d07 Mon Sep 17 00:00:00 2001 From: lievan Date: Wed, 15 Jan 2025 09:15:57 -0500 Subject: [PATCH 3/8] ragas ml app updates --- ddtrace/llmobs/_constants.py | 2 +- ddtrace/llmobs/_evaluators/ragas/base.py | 2 +- ddtrace/llmobs/_llmobs.py | 5 ++--- tests/llmobs/_utils.py | 2 +- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/ddtrace/llmobs/_constants.py b/ddtrace/llmobs/_constants.py index 27000b36aa..a4148639e1 100644 --- a/ddtrace/llmobs/_constants.py +++ b/ddtrace/llmobs/_constants.py @@ -52,7 +52,7 @@ # The ml app of all ragas traces have this prefix that we use to detect # whether a span is generated from the ragas evaluation itself. -RAGAS_ML_APP_PREFIX = "dd-ragas" +RAGAS_ML_APP_PREFIX = "dd-ragas-" ANNOTATIONS_CONTEXT_ID = "annotations_context_id" INTERNAL_CONTEXT_VARIABLE_KEYS = "_dd_context_variable_keys" diff --git a/ddtrace/llmobs/_evaluators/ragas/base.py b/ddtrace/llmobs/_evaluators/ragas/base.py index f866ea3c69..73360c091c 100644 --- a/ddtrace/llmobs/_evaluators/ragas/base.py +++ b/ddtrace/llmobs/_evaluators/ragas/base.py @@ -80,7 +80,7 @@ def _get_ml_app_for_ragas_trace(span_event: dict) -> str: break if not ml_app: return RAGAS_ML_APP_PREFIX - return "{}-{}".format(RAGAS_ML_APP_PREFIX, ml_app) + return "{}{}".format(RAGAS_ML_APP_PREFIX, ml_app) class BaseRagasEvaluator: diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index 1965d119c0..47f5e24c86 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -184,14 +184,13 @@ def _llmobs_span_event(cls, span: Span) -> Tuple[Dict[str, Any], bool]: meta.pop("output") metrics = span._get_ctx_item(METRICS) or {} ml_app = _get_ml_app(span) + span._set_ctx_item(ML_APP, ml_app) is_ragas_integration_span = False - if ml_app.startswith(constants.RAGAS_ML_APP_PREFIX): is_ragas_integration_span = True - ml_app = ml_app.replace(constants.RAGAS_ML_APP_PREFIX + "-", "") + ml_app = ml_app.replace(constants.RAGAS_ML_APP_PREFIX, "") - span._set_ctx_item(ML_APP, ml_app) parent_id = str(_get_llmobs_parent_id(span) or "undefined") llmobs_span_event = { diff --git a/tests/llmobs/_utils.py b/tests/llmobs/_utils.py index 4e60a8f399..85836e2009 100644 --- a/tests/llmobs/_utils.py +++ b/tests/llmobs/_utils.py @@ -488,7 +488,7 @@ def expected_ragas_trace_tags(): "env:", "service:tests.llmobs", "source:integration", - "ml_app:dd-ragas-unnamed-ml-app", + "ml_app:unnamed-ml-app", "ddtrace.version:{}".format(ddtrace.__version__), "language:python", "error:0", From ea038a9876279f2f84b3a78f48697b9544e50f3d Mon Sep 17 00:00:00 2001 From: lievan Date: Wed, 15 Jan 2025 18:42:08 -0500 Subject: [PATCH 4/8] clarify the ml app is temp --- ddtrace/llmobs/_constants.py | 5 +++-- ddtrace/llmobs/_evaluators/ragas/base.py | 6 +++--- ddtrace/llmobs/_llmobs.py | 4 ++-- tests/llmobs/test_llmobs_service.py | 4 ++-- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/ddtrace/llmobs/_constants.py b/ddtrace/llmobs/_constants.py index a4148639e1..665212360d 100644 --- a/ddtrace/llmobs/_constants.py +++ b/ddtrace/llmobs/_constants.py @@ -51,8 +51,9 @@ RUNNER_IS_INTEGRATION_SPAN_TAG = "runner.integration" # The ml app of all ragas traces have this prefix that we use to detect -# whether a span is generated from the ragas evaluation itself. -RAGAS_ML_APP_PREFIX = "dd-ragas-" +# whether a span is generated from the ragas evaluation itself. We then +# remove this prefix from the ml app before we submit the span. +TEMP_RAGAS_ML_APP_PREFIX = "_dd_ragas_" ANNOTATIONS_CONTEXT_ID = "annotations_context_id" INTERNAL_CONTEXT_VARIABLE_KEYS = "_dd_context_variable_keys" diff --git a/ddtrace/llmobs/_evaluators/ragas/base.py b/ddtrace/llmobs/_evaluators/ragas/base.py index 594e4699e6..f361d8efc0 100644 --- a/ddtrace/llmobs/_evaluators/ragas/base.py +++ b/ddtrace/llmobs/_evaluators/ragas/base.py @@ -11,7 +11,7 @@ from ddtrace.internal.utils.version import parse_version from ddtrace.llmobs._constants import INTERNAL_CONTEXT_VARIABLE_KEYS from ddtrace.llmobs._constants import INTERNAL_QUERY_VARIABLE_KEYS -from ddtrace.llmobs._constants import RAGAS_ML_APP_PREFIX +from ddtrace.llmobs._constants import TEMP_RAGAS_ML_APP_PREFIX logger = get_logger(__name__) @@ -95,8 +95,8 @@ def _get_ml_app_for_ragas_trace(span_event: dict) -> str: ml_app = tag.split(":")[1] break if not ml_app: - return RAGAS_ML_APP_PREFIX - return "{}{}".format(RAGAS_ML_APP_PREFIX, ml_app) + return TEMP_RAGAS_ML_APP_PREFIX + return "{}{}".format(TEMP_RAGAS_ML_APP_PREFIX, ml_app) class BaseRagasEvaluator: diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index 47f5e24c86..690eb28b7c 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -187,9 +187,9 @@ def _llmobs_span_event(cls, span: Span) -> Tuple[Dict[str, Any], bool]: span._set_ctx_item(ML_APP, ml_app) is_ragas_integration_span = False - if ml_app.startswith(constants.RAGAS_ML_APP_PREFIX): + if ml_app.startswith(constants.TEMP_RAGAS_ML_APP_PREFIX): is_ragas_integration_span = True - ml_app = ml_app.replace(constants.RAGAS_ML_APP_PREFIX, "") + ml_app = ml_app.replace(constants.TEMP_RAGAS_ML_APP_PREFIX, "") parent_id = str(_get_llmobs_parent_id(span) or "undefined") diff --git a/tests/llmobs/test_llmobs_service.py b/tests/llmobs/test_llmobs_service.py index dad6accdcf..931c477f36 100644 --- a/tests/llmobs/test_llmobs_service.py +++ b/tests/llmobs/test_llmobs_service.py @@ -24,11 +24,11 @@ from ddtrace.llmobs._constants import OUTPUT_MESSAGES from ddtrace.llmobs._constants import OUTPUT_VALUE from ddtrace.llmobs._constants import PROPAGATED_PARENT_ID_KEY -from ddtrace.llmobs._constants import RAGAS_ML_APP_PREFIX from ddtrace.llmobs._constants import SESSION_ID from ddtrace.llmobs._constants import SPAN_KIND from ddtrace.llmobs._constants import SPAN_START_WHILE_DISABLED_WARNING from ddtrace.llmobs._constants import TAGS +from ddtrace.llmobs._constants import TEMP_RAGAS_ML_APP_PREFIX from ddtrace.llmobs._llmobs import SUPPORTED_LLMOBS_INTEGRATIONS from ddtrace.llmobs._writer import LLMObsAgentlessEventClient from ddtrace.llmobs._writer import LLMObsProxiedEventClient @@ -1538,7 +1538,7 @@ def test_llmobs_with_evaluator_runner(llmobs, mock_llmobs_evaluator_runner): def test_llmobs_with_evaluator_runner_does_not_enqueue_evaluation_spans(mock_llmobs_evaluator_runner, llmobs): - with llmobs.llm(model_name="test_model", ml_app="{}-dummy".format(RAGAS_ML_APP_PREFIX)): + with llmobs.llm(model_name="test_model", ml_app="{}-dummy".format(TEMP_RAGAS_ML_APP_PREFIX)): pass time.sleep(0.1) assert llmobs._instance._evaluator_runner.enqueue.call_count == 0 From d7f46300ef406146b9b78f72e4596d26a960146c Mon Sep 17 00:00:00 2001 From: lievan Date: Thu, 16 Jan 2025 09:34:59 -0500 Subject: [PATCH 5/8] make sure listeners are removed after all spans are flushed --- ddtrace/llmobs/_llmobs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index 690eb28b7c..a533f1e640 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -272,10 +272,6 @@ def _start_service(self) -> None: log.debug("Error starting evaluator runner") def _stop_service(self) -> None: - # Remove listener hooks for span events - core.reset_listeners("trace.span_start", self._on_span_start) - core.reset_listeners("trace.span_finish", self._on_span_finish) - try: self._evaluator_runner.stop() # flush remaining evaluation spans & evaluations @@ -290,6 +286,10 @@ def _stop_service(self) -> None: except ServiceStatusError: log.debug("Error stopping LLMObs writers") + # Remove listener hooks for span events + core.reset_listeners("trace.span_start", self._on_span_start) + core.reset_listeners("trace.span_finish", self._on_span_finish) + forksafe.unregister(self._child_after_fork) @classmethod From e52f5649ef17547e372c7d14d5dc8e480bba76f1 Mon Sep 17 00:00:00 2001 From: lievan Date: Thu, 16 Jan 2025 11:03:52 -0500 Subject: [PATCH 6/8] revert acc change --- ddtrace/llmobs/_evaluators/runner.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/ddtrace/llmobs/_evaluators/runner.py b/ddtrace/llmobs/_evaluators/runner.py index 64a50baf6c..6d23af647e 100644 --- a/ddtrace/llmobs/_evaluators/runner.py +++ b/ddtrace/llmobs/_evaluators/runner.py @@ -117,9 +117,14 @@ def periodic(self, _wait_sync=False) -> None: try: if not _wait_sync: for evaluator in self.evaluators: - for span_event, span in span_events_and_spans: - if self.sampler.sample(evaluator.LABEL, span): - self.executor.submit(evaluator.run_and_submit_evaluation, span_event) + self.executor.map( + lambda span_event: evaluator.run_and_submit_evaluation(span_event), + [ + span_event + for span_event, span in span_events_and_spans + if self.sampler.sample(evaluator.LABEL, span) + ], + ) else: for evaluator in self.evaluators: for span_event, span in span_events_and_spans: From 3d2a446d58eae72efac545fbc28e18b0d2e6b836 Mon Sep 17 00:00:00 2001 From: lievan Date: Fri, 17 Jan 2025 12:30:39 -0500 Subject: [PATCH 7/8] change how we detect eval spans --- ddtrace/llmobs/_constants.py | 7 +++---- .../_evaluators/ragas/answer_relevancy.py | 2 ++ ddtrace/llmobs/_evaluators/ragas/base.py | 6 ++---- .../_evaluators/ragas/context_precision.py | 2 ++ .../llmobs/_evaluators/ragas/faithfulness.py | 2 ++ ddtrace/llmobs/_llmobs.py | 18 +++++++++--------- ddtrace/llmobs/_utils.py | 18 ++++++++++++++++++ tests/llmobs/test_llmobs_service.py | 8 +++++--- 8 files changed, 43 insertions(+), 20 deletions(-) diff --git a/ddtrace/llmobs/_constants.py b/ddtrace/llmobs/_constants.py index 665212360d..a7d13ca60c 100644 --- a/ddtrace/llmobs/_constants.py +++ b/ddtrace/llmobs/_constants.py @@ -50,10 +50,9 @@ # Used to differentiate traces of Datadog-run operations vs user-application operations. RUNNER_IS_INTEGRATION_SPAN_TAG = "runner.integration" -# The ml app of all ragas traces have this prefix that we use to detect -# whether a span is generated from the ragas evaluation itself. We then -# remove this prefix from the ml app before we submit the span. -TEMP_RAGAS_ML_APP_PREFIX = "_dd_ragas_" +# All ragas traces have this context item set so we can differentiate +# spans generated from the ragas integration vs user application spans. +IS_EVALUATION_SPAN = "_is_evaluation_span" ANNOTATIONS_CONTEXT_ID = "annotations_context_id" INTERNAL_CONTEXT_VARIABLE_KEYS = "_dd_context_variable_keys" diff --git a/ddtrace/llmobs/_evaluators/ragas/answer_relevancy.py b/ddtrace/llmobs/_evaluators/ragas/answer_relevancy.py index 9a640e0845..5fd6e6b7c0 100644 --- a/ddtrace/llmobs/_evaluators/ragas/answer_relevancy.py +++ b/ddtrace/llmobs/_evaluators/ragas/answer_relevancy.py @@ -5,6 +5,7 @@ from ddtrace.internal.logger import get_logger from ddtrace.llmobs._constants import EVALUATION_SPAN_METADATA +from ddtrace.llmobs._constants import IS_EVALUATION_SPAN from ddtrace.llmobs._evaluators.ragas.base import BaseRagasEvaluator from ddtrace.llmobs._evaluators.ragas.base import _get_ml_app_for_ragas_trace @@ -84,6 +85,7 @@ def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]] with self.llmobs_service.workflow( "dd-ragas.answer_relevancy", ml_app=_get_ml_app_for_ragas_trace(span_event) ) as ragas_ar_workflow: + ragas_ar_workflow._set_ctx_item(IS_EVALUATION_SPAN, True) try: evaluation_metadata[EVALUATION_SPAN_METADATA] = self.llmobs_service.export_span(span=ragas_ar_workflow) diff --git a/ddtrace/llmobs/_evaluators/ragas/base.py b/ddtrace/llmobs/_evaluators/ragas/base.py index f361d8efc0..2f6522496d 100644 --- a/ddtrace/llmobs/_evaluators/ragas/base.py +++ b/ddtrace/llmobs/_evaluators/ragas/base.py @@ -4,6 +4,7 @@ from typing import Tuple from typing import Union +from ddtrace import config from ddtrace.internal.logger import get_logger from ddtrace.internal.telemetry import telemetry_writer from ddtrace.internal.telemetry.constants import TELEMETRY_LOG_LEVEL @@ -11,7 +12,6 @@ from ddtrace.internal.utils.version import parse_version from ddtrace.llmobs._constants import INTERNAL_CONTEXT_VARIABLE_KEYS from ddtrace.llmobs._constants import INTERNAL_QUERY_VARIABLE_KEYS -from ddtrace.llmobs._constants import TEMP_RAGAS_ML_APP_PREFIX logger = get_logger(__name__) @@ -94,9 +94,7 @@ def _get_ml_app_for_ragas_trace(span_event: dict) -> str: if isinstance(tag, str) and tag.startswith("ml_app:"): ml_app = tag.split(":")[1] break - if not ml_app: - return TEMP_RAGAS_ML_APP_PREFIX - return "{}{}".format(TEMP_RAGAS_ML_APP_PREFIX, ml_app) + return ml_app or config._llmobs_ml_app or "unknown-ml-app" class BaseRagasEvaluator: diff --git a/ddtrace/llmobs/_evaluators/ragas/context_precision.py b/ddtrace/llmobs/_evaluators/ragas/context_precision.py index 990302931c..13ccb1d593 100644 --- a/ddtrace/llmobs/_evaluators/ragas/context_precision.py +++ b/ddtrace/llmobs/_evaluators/ragas/context_precision.py @@ -6,6 +6,7 @@ from ddtrace.internal.logger import get_logger from ddtrace.llmobs._constants import EVALUATION_KIND_METADATA from ddtrace.llmobs._constants import EVALUATION_SPAN_METADATA +from ddtrace.llmobs._constants import IS_EVALUATION_SPAN from ddtrace.llmobs._evaluators.ragas.base import BaseRagasEvaluator from ddtrace.llmobs._evaluators.ragas.base import _get_ml_app_for_ragas_trace @@ -82,6 +83,7 @@ def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]] with self.llmobs_service.workflow( "dd-ragas.context_precision", ml_app=_get_ml_app_for_ragas_trace(span_event) ) as ragas_cp_workflow: + ragas_cp_workflow._set_ctx_item(IS_EVALUATION_SPAN, True) try: evaluation_metadata[EVALUATION_SPAN_METADATA] = self.llmobs_service.export_span(span=ragas_cp_workflow) diff --git a/ddtrace/llmobs/_evaluators/ragas/faithfulness.py b/ddtrace/llmobs/_evaluators/ragas/faithfulness.py index 98725b1f27..2c413f2cec 100644 --- a/ddtrace/llmobs/_evaluators/ragas/faithfulness.py +++ b/ddtrace/llmobs/_evaluators/ragas/faithfulness.py @@ -9,6 +9,7 @@ from ddtrace.llmobs._constants import EVALUATION_KIND_METADATA from ddtrace.llmobs._constants import EVALUATION_SPAN_METADATA from ddtrace.llmobs._constants import FAITHFULNESS_DISAGREEMENTS_METADATA +from ddtrace.llmobs._constants import IS_EVALUATION_SPAN from ddtrace.llmobs._evaluators.ragas.base import BaseRagasEvaluator from ddtrace.llmobs._evaluators.ragas.base import _get_ml_app_for_ragas_trace @@ -96,6 +97,7 @@ def evaluate(self, span_event: dict) -> Tuple[Union[float, str], Optional[dict]] with self.llmobs_service.workflow( "dd-ragas.faithfulness", ml_app=_get_ml_app_for_ragas_trace(span_event) ) as ragas_faithfulness_workflow: + ragas_faithfulness_workflow._set_ctx_item(IS_EVALUATION_SPAN, True) try: evaluation_metadata[EVALUATION_SPAN_METADATA] = self.llmobs_service.export_span( span=ragas_faithfulness_workflow diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index a533f1e640..89d18e71d3 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -38,6 +38,7 @@ from ddtrace.llmobs._constants import INPUT_PARAMETERS from ddtrace.llmobs._constants import INPUT_PROMPT from ddtrace.llmobs._constants import INPUT_VALUE +from ddtrace.llmobs._constants import IS_EVALUATION_SPAN from ddtrace.llmobs._constants import METADATA from ddtrace.llmobs._constants import METRICS from ddtrace.llmobs._constants import ML_APP @@ -59,6 +60,7 @@ from ddtrace.llmobs._utils import _get_session_id from ddtrace.llmobs._utils import _get_span_name from ddtrace.llmobs._utils import _inject_llmobs_parent_id +from ddtrace.llmobs._utils import _is_evaluation_span from ddtrace.llmobs._utils import safe_json from ddtrace.llmobs._utils import validate_prompt from ddtrace.llmobs._writer import LLMObsEvalMetricWriter @@ -123,16 +125,16 @@ def _submit_llmobs_span(self, span: Span) -> None: """Generate and submit an LLMObs span event to be sent to LLMObs.""" span_event = None is_llm_span = span._get_ctx_item(SPAN_KIND) == "llm" - is_ragas_integration_span = False + is_evaluation_span = False try: - span_event, is_ragas_integration_span = self._llmobs_span_event(span) + span_event, is_evaluation_span = self._llmobs_span_event(span) self._llmobs_span_writer.enqueue(span_event) except (KeyError, TypeError): log.error( "Error generating LLMObs span event for span %s, likely due to malformed span", span, exc_info=True ) finally: - if not span_event or not is_llm_span or is_ragas_integration_span: + if not span_event or not is_llm_span or is_evaluation_span: return if self._evaluator_runner: self._evaluator_runner.enqueue(span_event, span) @@ -186,10 +188,8 @@ def _llmobs_span_event(cls, span: Span) -> Tuple[Dict[str, Any], bool]: ml_app = _get_ml_app(span) span._set_ctx_item(ML_APP, ml_app) - is_ragas_integration_span = False - if ml_app.startswith(constants.TEMP_RAGAS_ML_APP_PREFIX): - is_ragas_integration_span = True - ml_app = ml_app.replace(constants.TEMP_RAGAS_ML_APP_PREFIX, "") + is_evaluation_span = _is_evaluation_span(span) + span._set_ctx_item(IS_EVALUATION_SPAN, is_evaluation_span) parent_id = str(_get_llmobs_parent_id(span) or "undefined") @@ -210,9 +210,9 @@ def _llmobs_span_event(cls, span: Span) -> Tuple[Dict[str, Any], bool]: llmobs_span_event["session_id"] = session_id llmobs_span_event["tags"] = cls._llmobs_tags( - span, ml_app, session_id, is_ragas_integration_span=is_ragas_integration_span + span, ml_app, session_id, is_ragas_integration_span=is_evaluation_span ) - return llmobs_span_event, is_ragas_integration_span + return llmobs_span_event, is_evaluation_span @staticmethod def _llmobs_tags( diff --git a/ddtrace/llmobs/_utils.py b/ddtrace/llmobs/_utils.py index dd616db8be..1a45b4921c 100644 --- a/ddtrace/llmobs/_utils.py +++ b/ddtrace/llmobs/_utils.py @@ -12,6 +12,7 @@ from ddtrace.llmobs._constants import GEMINI_APM_SPAN_NAME from ddtrace.llmobs._constants import INTERNAL_CONTEXT_VARIABLE_KEYS from ddtrace.llmobs._constants import INTERNAL_QUERY_VARIABLE_KEYS +from ddtrace.llmobs._constants import IS_EVALUATION_SPAN from ddtrace.llmobs._constants import LANGCHAIN_APM_SPAN_NAME from ddtrace.llmobs._constants import ML_APP from ddtrace.llmobs._constants import OPENAI_APM_SPAN_NAME @@ -127,6 +128,23 @@ def _get_span_name(span: Span) -> str: return span.name +def _is_evaluation_span(span: Span) -> bool: + """ + Return whether or not a span is an evaluation span by checking the span's + nearest LLMObs span ancestor. Default to 'False' + """ + is_evaluation_span = span._get_ctx_item(IS_EVALUATION_SPAN) + if is_evaluation_span is not None: + return is_evaluation_span + llmobs_parent = _get_nearest_llmobs_ancestor(span) + while llmobs_parent: + is_evaluation_span = llmobs_parent._get_ctx_item(IS_EVALUATION_SPAN) + if is_evaluation_span is not None: + return is_evaluation_span + llmobs_parent = _get_nearest_llmobs_ancestor(llmobs_parent) + return is_evaluation_span or False + + def _get_ml_app(span: Span) -> str: """ Return the ML app name for a given span, by checking the span's nearest LLMObs span ancestor. diff --git a/tests/llmobs/test_llmobs_service.py b/tests/llmobs/test_llmobs_service.py index 931c477f36..de42899914 100644 --- a/tests/llmobs/test_llmobs_service.py +++ b/tests/llmobs/test_llmobs_service.py @@ -16,6 +16,7 @@ from ddtrace.llmobs._constants import INPUT_PARAMETERS from ddtrace.llmobs._constants import INPUT_PROMPT from ddtrace.llmobs._constants import INPUT_VALUE +from ddtrace.llmobs._constants import IS_EVALUATION_SPAN from ddtrace.llmobs._constants import METADATA from ddtrace.llmobs._constants import METRICS from ddtrace.llmobs._constants import MODEL_NAME @@ -28,7 +29,6 @@ from ddtrace.llmobs._constants import SPAN_KIND from ddtrace.llmobs._constants import SPAN_START_WHILE_DISABLED_WARNING from ddtrace.llmobs._constants import TAGS -from ddtrace.llmobs._constants import TEMP_RAGAS_ML_APP_PREFIX from ddtrace.llmobs._llmobs import SUPPORTED_LLMOBS_INTEGRATIONS from ddtrace.llmobs._writer import LLMObsAgentlessEventClient from ddtrace.llmobs._writer import LLMObsProxiedEventClient @@ -1538,8 +1538,10 @@ def test_llmobs_with_evaluator_runner(llmobs, mock_llmobs_evaluator_runner): def test_llmobs_with_evaluator_runner_does_not_enqueue_evaluation_spans(mock_llmobs_evaluator_runner, llmobs): - with llmobs.llm(model_name="test_model", ml_app="{}-dummy".format(TEMP_RAGAS_ML_APP_PREFIX)): - pass + with llmobs.agent(name="test") as agent: + agent._set_ctx_item(IS_EVALUATION_SPAN, True) + with llmobs.llm(model_name="test_model"): + pass time.sleep(0.1) assert llmobs._instance._evaluator_runner.enqueue.call_count == 0 From 195cb4a091b59e0688cf8221edbb996ed4afcb96 Mon Sep 17 00:00:00 2001 From: lievan Date: Fri, 17 Jan 2025 13:17:41 -0500 Subject: [PATCH 8/8] remove unneeded change --- ddtrace/llmobs/_llmobs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index 89d18e71d3..c228409063 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -186,11 +186,11 @@ def _llmobs_span_event(cls, span: Span) -> Tuple[Dict[str, Any], bool]: meta.pop("output") metrics = span._get_ctx_item(METRICS) or {} ml_app = _get_ml_app(span) - span._set_ctx_item(ML_APP, ml_app) is_evaluation_span = _is_evaluation_span(span) span._set_ctx_item(IS_EVALUATION_SPAN, is_evaluation_span) + span._set_ctx_item(ML_APP, ml_app) parent_id = str(_get_llmobs_parent_id(span) or "undefined") llmobs_span_event = {