Add explanation to default_correct judgments (return score + comment)

2026-01-14 13:31:07 +08:00
parent a0ad19449a
commit f9ac34498c
1 changed files with 25 additions and 6 deletions
--- a/lang_agent/eval/validator.py
+++ b/lang_agent/eval/validator.py
@@ -40,19 +40,21 @@ class Validator:
            api_key=self.config.api_key
        )
-    def default_correct(self, inputs: dict, outputs: dict, reference_outputs: dict) -> bool:
+    def default_correct(self, inputs: dict, outputs: dict, reference_outputs: dict) -> dict:
        instructions = (
            "Given an actual answer and an expected answer, determine whether"
            " the actual answer contains all of the information in the"
-            " expected answer. Respond with 'CORRECT' if the actual answer"
+            " expected answer. First provide your reasoning, then respond with"
-            " does contain all of the expected information and 'INCORRECT'"
+            " your final judgment.\n\n"
-            " otherwise. Do not include anything else in your response."
+            "Format your response EXACTLY as follows:\n"
            "EXPLANATION: <your reasoning here>\n"
            "JUDGMENT: <CORRECT or INCORRECT>"
        )
        actual_answer = outputs["output"][-1].content
        expected_answer = reference_outputs["answer"]
        if expected_answer is None:
-            return True
+            return {"score": True, "comment": "No expected answer provided, auto-pass."}
        user_msg = (
            f"ACTUAL ANSWER: {actual_answer}"
@@ -66,7 +68,24 @@ class Validator:
            ]
        )
-        return response.content.upper() == "CORRECT"
+        response_text = response.content
        # Parse the explanation and judgment from the response
        explanation = ""
        is_correct = False
        if "EXPLANATION:" in response_text:
            parts = response_text.split("JUDGMENT:")
            explanation = parts[0].replace("EXPLANATION:", "").strip()
            if len(parts) > 1:
                judgment = parts[1].strip().upper()
                is_correct = "CORRECT" in judgment and "INCORRECT" not in judgment
        else:
            # Fallback (no "EXPLANATION:" marker): use the whole response as the explanation and pass only if it contains CORRECT but not INCORRECT
            explanation = response_text
            is_correct = "CORRECT" in response_text.upper() and "INCORRECT" not in response_text.upper()
        return {"score": is_correct, "comment": explanation}
    def val_tool_use(self, inputs:dict, outputs:dict, reference_outputs:dict)->float:
        tool_uses:List[str] = reference_outputs.get("tool_use")