feat(llmobs): ragas evaluation framework integration #11939

Open · wants to merge 3 commits into base: main
2 changes: 1 addition & 1 deletion ddtrace/llmobs/_evaluators/runner.py
@@ -45,7 +45,7 @@ def __init__(self, interval: float, llmobs_service=None, evaluators=None):
         if len(self.evaluators) > 0:
             return

-        evaluator_str = os.getenv("_DD_LLMOBS_EVALUATORS")
+        evaluator_str = os.getenv("DD_LLMOBS_EVALUATORS")
         if evaluator_str is None:
             return
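The rename above drops the internal `_DD_` prefix, making evaluator selection a public setting. As a rough sketch of how an application would opt in before enabling LLM Observability (the variable name comes from this hunk; the `ragas_faithfulness` label and the ValueError behavior come from the tests further down):

```python
import os

# Must be set before the EvaluatorRunner is created, since the hunk above
# reads it once at start-up; an unknown label raises ValueError per
# test_evaluator_runner_unsupported_evaluator below.
os.environ["DD_LLMOBS_EVALUATORS"] = "ragas_faithfulness"
```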
4 changes: 2 additions & 2 deletions ddtrace/llmobs/_evaluators/sampler.py
@@ -46,7 +46,7 @@ def __repr__(self):


 class EvaluatorRunnerSampler:
-    SAMPLING_RULES_ENV_VAR = "_DD_LLMOBS_EVALUATOR_SAMPLING_RULES"
+    SAMPLING_RULES_ENV_VAR = "DD_LLMOBS_EVALUATOR_SAMPLING_RULES"

     def __init__(self):
         self.rules = self.parse_rules()
@@ -60,7 +60,7 @@ def sample(self, evaluator_label, span):
     def parse_rules(self) -> List[EvaluatorRunnerSamplingRule]:
         rules = []
         sampling_rules_str = os.getenv(self.SAMPLING_RULES_ENV_VAR)
-        telemetry_writer.add_configuration("_DD_LLMOBS_EVALUATOR_SAMPLING_RULES", sampling_rules_str, origin="env")
+        telemetry_writer.add_configuration("DD_LLMOBS_EVALUATOR_SAMPLING_RULES", sampling_rules_str, origin="env")

         def parsing_failed_because(msg, maybe_throw_this):
             telemetry_writer.add_log(
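The sampling rules variable gets the same public rename. A hypothetical configuration sketch follows; the JSON shape (a list of rules with `sample_rate`, `evaluator_label`, and `span_name` keys) is inferred from the `EvaluatorRunnerSamplingRule` name and is not shown in this hunk, so treat the keys as assumptions:

```python
import json
import os

# Assumed rule shape: sample half of the spans named "augmented_generation"
# for the ragas_faithfulness evaluator. Key names are assumptions, not taken
# from the diff above.
rules = [
    {
        "sample_rate": 0.5,
        "evaluator_label": "ragas_faithfulness",
        "span_name": "augmented_generation",
    }
]
os.environ["DD_LLMOBS_EVALUATOR_SAMPLING_RULES"] = json.dumps(rules)
```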
13 changes: 13 additions & 0 deletions releasenotes/notes/ragas-integration-a81b696757c0e7a5.yaml
@@ -0,0 +1,13 @@
---
features:
- |
LLM Observability: This introduces an integration with the [RAGAS](https://docs.ragas.io/en/stable/) evaluation framework to continuously monitor
the performance of context-augmented LLM generations in production.

The integration supports evaluating LLM inferences with the following RAGAS metrics:
- [Faithfulness](https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/faithfulness/): measures if the LLM response is faithful to the provided context.
- [Answer Relevancy](https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/answer_relevance/): measures how relevant the LLM response is to the user input.
- [Context Precision](https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/context_precision/): measures how effectively the context is used in the generated response.

# (TODO): UPDATE TO CORRECT LINK LATER!!!
For more information, please see the [RAGAS Integration documentation](https://docs.datadoghq.com/llm_observability/submit_evaluations/ragas_integration).
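As a usage sketch outside the release note itself: based on the environment used by the tests in this PR, enabling the faithfulness evaluator in an application might look like the following. `LLMObs.enable()` as the public entry point and the `ml_app` name are assumptions; the env var values mirror the test setup, and an OpenAI key is needed because the RAGAS metrics call an LLM judge.

```python
import os

from ddtrace.llmobs import LLMObs

# Values mirror the PR's test environment; "my-rag-app" is an illustrative name.
os.environ["DD_LLMOBS_EVALUATORS"] = "ragas_faithfulness"
os.environ.setdefault("OPENAI_API_KEY", "dummy-openai-api-key")

LLMObs.enable(ml_app="my-rag-app")
```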
2 changes: 1 addition & 1 deletion tests/llmobs/test_llmobs_evaluator_runner.py
@@ -115,7 +115,7 @@ def test_evaluator_runner_on_exit(mock_writer_logs, run_python_code_in_subproces


 def test_evaluator_runner_unsupported_evaluator():
-    with override_env({"_DD_LLMOBS_EVALUATORS": "unsupported"}):
+    with override_env({"DD_LLMOBS_EVALUATORS": "unsupported"}):
         with pytest.raises(ValueError):
             EvaluatorRunner(interval=0.01, llmobs_service=mock.MagicMock())

2 changes: 1 addition & 1 deletion tests/llmobs/test_llmobs_ragas_evaluators.py
@@ -235,7 +235,7 @@ def test_llmobs_with_faithfulness_emits_traces_and_evals_on_exit(mock_writer_log
"PYTHONPATH": ":".join(pypath),
"OPENAI_API_KEY": os.getenv("OPENAI_API_KEY", "dummy-openai-api-key"),
"_DD_LLMOBS_EVALUATOR_INTERVAL": "5",
"_DD_LLMOBS_EVALUATORS": "ragas_faithfulness",
"DD_LLMOBS_EVALUATORS": "ragas_faithfulness",
"DD_TRACE_ENABLED": "0",
}
)
4 changes: 2 additions & 2 deletions tests/llmobs/test_llmobs_service.py
@@ -1388,7 +1388,7 @@ def test_llmobs_fork_recreates_and_restarts_eval_metric_writer():

 def test_llmobs_fork_recreates_and_restarts_evaluator_runner(mock_ragas_evaluator):
     """Test that forking a process correctly recreates and restarts the EvaluatorRunner."""
-    with override_env(dict(_DD_LLMOBS_EVALUATORS="ragas_faithfulness")):
+    with override_env(dict(DD_LLMOBS_EVALUATORS="ragas_faithfulness")):
         with mock.patch("ddtrace.llmobs._evaluators.runner.EvaluatorRunner.periodic"):
             llmobs_service.enable(_tracer=DummyTracer(), ml_app="test_app")
             original_pid = llmobs_service._instance.tracer._pid
@@ -1757,7 +1757,7 @@ async def test_annotation_context_async_nested(llmobs):
 def test_service_enable_starts_evaluator_runner_when_evaluators_exist():
     pytest.importorskip("ragas")
     with override_global_config(dict(_dd_api_key="<not-a-real-api-key>", _llmobs_ml_app="<ml-app-name>")):
-        with override_env(dict(_DD_LLMOBS_EVALUATORS="ragas_faithfulness")):
+        with override_env(dict(DD_LLMOBS_EVALUATORS="ragas_faithfulness")):
             dummy_tracer = DummyTracer()
             llmobs_service.enable(_tracer=dummy_tracer)
             llmobs_instance = llmobs_service._instance