From b6ca6ac677181512a3e8777691dc40c693b82029 Mon Sep 17 00:00:00 2001
From: leehk
Date: Thu, 13 Mar 2025 16:19:39 +0800
Subject: [PATCH] faithfulness done

---
 .../src/rag_adaptive_evaluation/evaluators.py | 72 +++++++++++++++++--
 .../prompts_library.py                        | 44 +++++++++---
 app/llmops/src/rag_adaptive_evaluation/run.py | 14 ++--
 3 files changed, 109 insertions(+), 21 deletions(-)

diff --git a/app/llmops/src/rag_adaptive_evaluation/evaluators.py b/app/llmops/src/rag_adaptive_evaluation/evaluators.py
index 0d16dea..f7f1184 100644
--- a/app/llmops/src/rag_adaptive_evaluation/evaluators.py
+++ b/app/llmops/src/rag_adaptive_evaluation/evaluators.py
@@ -7,7 +7,7 @@ from langchain_community.llms.moonshot import Moonshot
 from pydantic import BaseModel, Field
 
-from prompts_library import CORRECTNESS_PROMPT
+from prompts_library import CORRECTNESS_PROMPT, FAITHFULNESS_PROMPT
 
 os.environ["GOOGLE_API_KEY"] = config("GOOGLE_API_KEY", cast=str)
 os.environ["DEEPSEEK_API_KEY"] = config("DEEPSEEK_API_KEY", cast=str)
@@ -18,11 +18,12 @@ os.environ["MOONSHOT_API_KEY"] = config("MOONSHOT_API_KEY", cast=str)
 
 class CorrectnessGrade(BaseModel):
     score: int = Field(description="Numerical score (1-5) indicating the correctness of the response.")
 
-# Todo:
-# class RelevanceGrade(BaseModel):
+class FaithfulnessGrade(BaseModel):
+    score: int = Field(description="Numerical score (1-5) indicating the faithfulness of the response.")
+
 
 # Evaluators
 def gemini_evaluator_correctness(outputs: dict, reference_outputs: dict) -> CorrectnessGrade:
     llm = ChatGoogleGenerativeAI(
         model="gemini-1.5-flash",
@@ -74,4 +75,67 @@ def moonshot_evaluator_correctness(outputs: dict, reference_outputs: dict) -> Co
 
     response = llm.invoke(messages)
 
-    return CorrectnessGrade(score=int(response)).score
+    try:
+        return CorrectnessGrade(score=int(response)).score
+    except ValueError:
+        score_str = response.split(":")[1].strip()
+        return CorrectnessGrade(score=int(score_str)).score
+
+
+def gemini_evaluator_faithfulness(outputs: dict, reference_outputs: dict) -> FaithfulnessGrade:
+    llm = ChatGoogleGenerativeAI(
+        model="gemini-1.5-pro",
+        temperature=0.5,
+    )
+
+    messages = [
+        {"role": "system", "content": FAITHFULNESS_PROMPT},
+        {"role": "user", "content": f"""Context: {reference_outputs["answer"]};
+            Output: {outputs['response']}
+        """}
+    ]
+
+    response = llm.invoke(messages)
+
+    return FaithfulnessGrade(score=int(response.content)).score
+
+
+def deepseek_evaluator_faithfulness(outputs: dict, reference_outputs: dict) -> FaithfulnessGrade:
+    llm = ChatDeepSeek(
+        model="deepseek-chat",
+        temperature=0.5,
+    )
+
+    messages = [
+        {"role": "system", "content": FAITHFULNESS_PROMPT},
+        {"role": "user", "content": f"""Context: {reference_outputs["answer"]};
+            Output: {outputs['response']}
+        """}
+    ]
+
+    response = llm.invoke(messages)
+
+    return FaithfulnessGrade(score=int(response.content)).score
+
+
+def moonshot_evaluator_faithfulness(outputs: dict, reference_outputs: dict) -> FaithfulnessGrade:
+    llm = Moonshot(
+        model="moonshot-v1-128k",
+        temperature=0.5,
+    )
+
+    messages = [
+        {"role": "system", "content": FAITHFULNESS_PROMPT},
+        {"role": "user", "content": f"""Context: {reference_outputs["answer"]};
+            Output: {outputs['response']}
+        """}
+    ]
+
+    response = llm.invoke(messages)
+
+    try:
+        return FaithfulnessGrade(score=int(response)).score
+    except ValueError:
+        score_str = response.split(":")[1].strip()
+        return FaithfulnessGrade(score=int(score_str)).score
+
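The `int()` / `split(":")` fallback is now duplicated in both Moonshot evaluators. A minimal sketch of a shared parser they could delegate to (the helper name `parse_score` is an assumption, not part of this patch); it accepts replies such as `4` or `Score: 4`:

```python
import re


def parse_score(raw) -> int:
    """Extract a 1-5 score from a judge reply such as '4' or 'Score: 4'."""
    match = re.search(r"[1-5]", str(raw))
    if not match:
        raise ValueError(f"no score found in evaluator response: {raw!r}")
    return int(match.group(0))
```

Each evaluator could then end with `return CorrectnessGrade(score=parse_score(response)).score` (or the `FaithfulnessGrade` equivalent), so all six graders parse replies the same way.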
diff --git a/app/llmops/src/rag_adaptive_evaluation/prompts_library.py b/app/llmops/src/rag_adaptive_evaluation/prompts_library.py
index 3bfed18..33d23a7 100644
--- a/app/llmops/src/rag_adaptive_evaluation/prompts_library.py
+++ b/app/llmops/src/rag_adaptive_evaluation/prompts_library.py
@@ -20,20 +20,18 @@ system_question_rewriter = """You a question re-writer that converts an input qu
 
 # Evaluation
 
-CORRECTNESS_PROMPT = """Evaluate Student Answer against Ground Truth for conceptual similarity and correctness.
-
-You are an impartial judge. Evaluate Student Answer against Ground Truth for conceptual similarity and correctness.
+CORRECTNESS_PROMPT = """You are an impartial judge. Evaluate Student Answer against Ground Truth for conceptual similarity and correctness.
 You may also be given additional information that was used by the model to generate the output.
 
-Your task is to determine a numerical score called faithfulness based on the input and output.
+Your task is to determine a numerical score called correctness based on the Student Answer and Ground Truth.
 A definition of correctness and a grading rubric are provided below.
 You must use the grading rubric to determine your score.
 
 Metric definition:
-Correctness assesses the degree to which a provided output aligns with factual accuracy, completeness, logical
-consistency, and precise terminology. It evaluates the intrinsic validity of the output, independent of any
+Correctness assesses the degree to which a provided Student Answer aligns with factual accuracy, completeness, logical
+consistency, and precise terminology of the Ground Truth. It evaluates the intrinsic validity of the Student Answer, independent of any
 external context. A higher score indicates a higher adherence to factual accuracy, completeness, logical consistency,
-and precise terminology.
+and precise terminology of the Ground Truth.
 
 Grading rubric:
 Correctness: Below are the details for different scores:
@@ -44,11 +42,37 @@ Correctness: Below are the details for different scores:
 - 5: Accurate, complete, logically consistent, and uses precise terminology.
 
 Reminder:
-  - Carefully read the input and output
-  - Check for factual accuracy and completeness
+  - Carefully read the Student Answer and Ground Truth
+  - Check for factual accuracy and completeness of the Student Answer compared to the Ground Truth
   - Focus on correctness of information rather than style or verbosity
-  - The goal is to evaluate factual correctness and completeness of the response.
+  - The goal is to evaluate factual correctness and completeness of the Student Answer.
   - Please provide your answer score only with the numerical number between 1 and 5. No score: or other text is allowed.
 """
 
+FAITHFULNESS_PROMPT = """You are an impartial judge. Evaluate output against context for faithfulness.
+You may also be given additional information that was used by the model to generate the Output.
+
+Your task is to determine a numerical score called faithfulness based on the output and context.
+A definition of faithfulness and a grading rubric are provided below.
+You must use the grading rubric to determine your score.
+
+Metric definition:
+Faithfulness is only evaluated with the provided output and context. Faithfulness assesses how much of the
+provided output is factually consistent with the provided context. A higher score indicates that a higher proportion of
+claims present in the output can be derived from the provided context. Faithfulness does not consider how much extra
+information from the context is not present in the output.
+
+Grading rubric:
+Faithfulness: Below are the details for different scores:
+- Score 1: None of the claims in the output can be inferred from the provided context.
+- Score 2: Some of the claims in the output can be inferred from the provided context, but the majority of the output is missing from, inconsistent with, or contradictory to the provided context.
+- Score 3: Half or more of the claims in the output can be inferred from the provided context.
+- Score 4: Most of the claims in the output can be inferred from the provided context, with very little information that is not directly supported by the provided context.
+- Score 5: All of the claims in the output are directly supported by the provided context, demonstrating high faithfulness to the provided context.
+
+Reminder:
+- Carefully read the output and context
+- Focus on the information instead of the writing style or verbosity.
+- Please provide your answer score only with the numerical number between 1 and 5, according to the grading rubric above. No score: or other text is allowed.
+"""
\ No newline at end of file
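A quick way to sanity-check the new FAITHFULNESS_PROMPT end to end is to call one evaluator directly with toy inputs. A minimal sketch, assuming the relevant API key (here GOOGLE_API_KEY) is configured and that the dict keys match those used in evaluators.py; the example strings are illustrative only:

```python
from evaluators import gemini_evaluator_faithfulness

outputs = {"response": "The Eiffel Tower, located in Paris, is about 330 m tall."}
reference_outputs = {"answer": "The Eiffel Tower is 330 m tall and stands in Paris."}

# Should print an integer between 1 and 5. Note that a reply such as
# "Score: 5" would currently raise ValueError in the Gemini/DeepSeek
# evaluators, since only the Moonshot variants have the parsing fallback.
score = gemini_evaluator_faithfulness(outputs, reference_outputs)
print(score)
```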
diff --git a/app/llmops/src/rag_adaptive_evaluation/run.py b/app/llmops/src/rag_adaptive_evaluation/run.py
index c629de7..bf8de6c 100644
--- a/app/llmops/src/rag_adaptive_evaluation/run.py
+++ b/app/llmops/src/rag_adaptive_evaluation/run.py
@@ -38,12 +38,9 @@ from evaluators import (
     gemini_evaluator_correctness,
     deepseek_evaluator_correctness,
     moonshot_evaluator_correctness,
-    # gemini_evaluator_conciseness,
-    # deepseek_evaluator_conciseness,
-    # moonshot_evaluator_conciseness,
-    # gemini_evaluator_hallucination,
-    # deepseek_evaluator_hallucination,
-    # moonshot_evaluator_hallucination
+    gemini_evaluator_faithfulness,
+    deepseek_evaluator_faithfulness,
+    moonshot_evaluator_faithfulness
 )
 
 logging.basicConfig(level=logging.INFO, format="%(asctime)-15s %(message)s")
@@ -520,7 +517,10 @@ def go_evaluation(args):
         evaluators = [
             moonshot_evaluator_correctness,
             deepseek_evaluator_correctness,
-            gemini_evaluator_correctness
+            gemini_evaluator_correctness,
+            gemini_evaluator_faithfulness,
+            deepseek_evaluator_faithfulness,
+            moonshot_evaluator_faithfulness
             # can add multiple evaluators here
         ],
         experiment_prefix = "first-eval-in-langsmith",
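The run.py hunk above is truncated, so here is a minimal sketch of how the combined evaluator list plugs into LangSmith's evaluate() helper; the dataset name and target function below are placeholders, not values taken from run.py:

```python
from langsmith import evaluate

from evaluators import (
    gemini_evaluator_correctness,
    deepseek_evaluator_correctness,
    moonshot_evaluator_correctness,
    gemini_evaluator_faithfulness,
    deepseek_evaluator_faithfulness,
    moonshot_evaluator_faithfulness,
)


def target(inputs: dict) -> dict:
    # Placeholder: run.py supplies the real RAG pipeline here.
    return {"response": "..."}


results = evaluate(
    target,
    data="rag-adaptive-eval",  # placeholder dataset name
    evaluators=[
        moonshot_evaluator_correctness,
        deepseek_evaluator_correctness,
        gemini_evaluator_correctness,
        gemini_evaluator_faithfulness,
        deepseek_evaluator_faithfulness,
        moonshot_evaluator_faithfulness,
    ],
    experiment_prefix="first-eval-in-langsmith",
)
```

The evaluator order and the experiment_prefix mirror the values already present in run.py; everything else in the sketch is illustrative.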