feat(llmobs): ragas evaluation framework integration #11939

Open · wants to merge 3 commits into base: main
2 changes: 1 addition & 1 deletion ddtrace/llmobs/_evaluators/runner.py
@@ -45,7 +45,7 @@ def __init__(self, interval: float, llmobs_service=None, evaluators=None):
         if len(self.evaluators) > 0:
             return

-        evaluator_str = os.getenv("_DD_LLMOBS_EVALUATORS")
+        evaluator_str = os.getenv("DD_LLMOBS_EVALUATORS")
         if evaluator_str is None:
             return
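The rename above drops the internal `_DD_` prefix, making evaluator selection a public setting. As a rough sketch of how an application would opt in before enabling LLM Observability (the variable name comes from this hunk; the `ragas_faithfulness` label and the ValueError behavior come from the tests further down):

```python
import os

# Must be set before the EvaluatorRunner is created, since the hunk above
# reads it once at start-up; an unknown label raises ValueError per
# test_evaluator_runner_unsupported_evaluator below.
os.environ["DD_LLMOBS_EVALUATORS"] = "ragas_faithfulness"
```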
4 changes: 2 additions & 2 deletions ddtrace/llmobs/_evaluators/sampler.py
@@ -46,7 +46,7 @@ def __repr__(self):


 class EvaluatorRunnerSampler:
-    SAMPLING_RULES_ENV_VAR = "_DD_LLMOBS_EVALUATOR_SAMPLING_RULES"
+    SAMPLING_RULES_ENV_VAR = "DD_LLMOBS_EVALUATOR_SAMPLING_RULES"

     def __init__(self):
         self.rules = self.parse_rules()
@@ -60,7 +60,7 @@ def sample(self, evaluator_label, span):
     def parse_rules(self) -> List[EvaluatorRunnerSamplingRule]:
         rules = []
         sampling_rules_str = os.getenv(self.SAMPLING_RULES_ENV_VAR)
-        telemetry_writer.add_configuration("_DD_LLMOBS_EVALUATOR_SAMPLING_RULES", sampling_rules_str, origin="env")
+        telemetry_writer.add_configuration("DD_LLMOBS_EVALUATOR_SAMPLING_RULES", sampling_rules_str, origin="env")

         def parsing_failed_because(msg, maybe_throw_this):
             telemetry_writer.add_log(
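The sampling rules variable gets the same public rename. A hypothetical configuration sketch follows; the JSON shape (a list of rules with `sample_rate`, `evaluator_label`, and `span_name` keys) is inferred from the `EvaluatorRunnerSamplingRule` name and is not shown in this hunk, so treat the keys as assumptions:

```python
import json
import os

# Assumed rule shape: sample half of the spans named "augmented_generation"
# for the ragas_faithfulness evaluator. Key names are assumptions, not taken
# from the diff above.
rules = [
    {
        "sample_rate": 0.5,
        "evaluator_label": "ragas_faithfulness",
        "span_name": "augmented_generation",
    }
]
os.environ["DD_LLMOBS_EVALUATOR_SAMPLING_RULES"] = json.dumps(rules)
```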
13 changes: 13 additions & 0 deletions releasenotes/notes/ragas-integration-a81b696757c0e7a5.yaml
@@ -0,0 +1,13 @@
---
features:
- |
LLM Observability: This introduces an integration with the [RAGAS](https://docs.ragas.io/en/stable/) evaluation framework to continuously monitor
the performance of context-augmented LLM generations in production.

The integration supports evaluating LLM inferences with the following RAGAS metrics:
- [Faithfulness](https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/faithfulness/): measures if the LLM response is faithful to the provided context.
- [Answer Relevancy](https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/answer_relevance/): measures how relevant the LLM response is to the user input.
- [Context Precision](https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/context_precision/): measures how effectively the context is used in the generated response.

# (TODO): UPDATE TO CORRECT LINK LATER!!!
For more information, please see the [RAGAS Integration documentation](https://docs.datadoghq.com/llm_observability/submit_evaluations/ragas_integration).
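As a usage sketch outside the release note itself: based on the environment used by the tests in this PR, enabling the faithfulness evaluator in an application might look like the following. `LLMObs.enable()` as the public entry point and the `ml_app` name are assumptions; the env var values mirror the test setup, and an OpenAI key is needed because the RAGAS metrics call an LLM judge.

```python
import os

from ddtrace.llmobs import LLMObs

# Values mirror the PR's test environment; "my-rag-app" is an illustrative name.
os.environ["DD_LLMOBS_EVALUATORS"] = "ragas_faithfulness"
os.environ.setdefault("OPENAI_API_KEY", "dummy-openai-api-key")

LLMObs.enable(ml_app="my-rag-app")
```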
2 changes: 1 addition & 1 deletion tests/llmobs/test_llmobs_evaluator_runner.py
@@ -115,7 +115,7 @@ def test_evaluator_runner_on_exit(mock_writer_logs, run_python_code_in_subproces


 def test_evaluator_runner_unsupported_evaluator():
-    with override_env({"_DD_LLMOBS_EVALUATORS": "unsupported"}):
+    with override_env({"DD_LLMOBS_EVALUATORS": "unsupported"}):
         with pytest.raises(ValueError):
             EvaluatorRunner(interval=0.01, llmobs_service=mock.MagicMock())

2 changes: 1 addition & 1 deletion tests/llmobs/test_llmobs_ragas_evaluators.py
@@ -235,7 +235,7 @@ def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit(mock_writer_log
"PYTHONPATH": ":".join(pypath),
"OPENAI_API_KEY": os.getenv("OPENAI_API_KEY", "dummy-openai-api-key"),
"_DD_LLMOBS_EVALUATOR_INTERVAL": "5",
"_DD_LLMOBS_EVALUATORS": "ragas_faithfulness",
"DD_LLMOBS_EVALUATORS": "ragas_faithfulness",
"DD_TRACE_ENABLED": "0",
}
)
4 changes: 2 additions & 2 deletions tests/llmobs/test_llmobs_service.py
@@ -1388,7 +1388,7 @@ def test_llmobs_fork_recreates_and_restarts_eval_metric_writer():

 def test_llmobs_fork_recreates_and_restarts_evaluator_runner(mock_ragas_evaluator):
     """Test that forking a process correctly recreates and restarts the EvaluatorRunner."""
-    with override_env(dict(_DD_LLMOBS_EVALUATORS="ragas_faithfulness")):
+    with override_env(dict(DD_LLMOBS_EVALUATORS="ragas_faithfulness")):
         with mock.patch("ddtrace.llmobs._evaluators.runner.EvaluatorRunner.periodic"):
             llmobs_service.enable(_tracer=DummyTracer(), ml_app="test_app")
             original_pid = llmobs_service._instance.tracer._pid
@@ -1757,7 +1757,7 @@ async def test_annotation_context_async_nested(llmobs):
 def test_service_enable_starts_evaluator_runner_when_evaluators_exist():
     pytest.importorskip("ragas")
     with override_global_config(dict(_dd_api_key="<not-a-real-api-key>", _llmobs_ml_app="<ml-app-name>")):
-        with override_env(dict(_DD_LLMOBS_EVALUATORS="ragas_faithfulness")):
+        with override_env(dict(DD_LLMOBS_EVALUATORS="ragas_faithfulness")):
             dummy_tracer = DummyTracer()
             llmobs_service.enable(_tracer=dummy_tracer)
             llmobs_instance = llmobs_service._instance