faithfulness done

leehk 2025-03-13 16:19:39 +08:00
parent fcb2f9e4ea
commit b6ca6ac677
3 changed files with 109 additions and 21 deletions

View File: evaluators.py

@@ -7,7 +7,7 @@ from langchain_community.llms.moonshot import Moonshot
 from pydantic import BaseModel, Field
-from prompts_library import CORRECTNESS_PROMPT
+from prompts_library import CORRECTNESS_PROMPT, FAITHFULNESS_PROMPT
 os.environ["GOOGLE_API_KEY"] = config("GOOGLE_API_KEY", cast=str)
 os.environ["DEEPSEEK_API_KEY"] = config("DEEPSEEK_API_KEY", cast=str)
@@ -18,11 +18,12 @@ os.environ["MOONSHOT_API_KEY"] = config("MOONSHOT_API_KEY", cast=str)
 class CorrectnessGrade(BaseModel):
     score: int = Field(description="Numerical score (1-5) indicating the correctness of the response.")
-# Todo:
-# class RelevanceGrade(BaseModel):
+class FaithfulnessGrade(BaseModel):
+    score: int = Field(description="Numerical score (1-5) indicating the faithfulness of the response.")
 # Evaluators
 def gemini_evaluator_correctness(outputs: dict, reference_outputs: dict) -> CorrectnessGrade:
     llm = ChatGoogleGenerativeAI(
         model="gemini-1.5-flash",
@@ -74,4 +75,67 @@ def moonshot_evaluator_correctness(outputs: dict, reference_outputs: dict) -> CorrectnessGrade:
     response = llm.invoke(messages)
-    return CorrectnessGrade(score=int(response)).score
+    try:
+        return CorrectnessGrade(score=int(response)).score
+    except ValueError:
+        score_str = response.split(":")[1].strip()
+        return CorrectnessGrade(score=int(score_str)).score
+def gemini_evaluator_faithfulness(outputs: dict, reference_outputs: dict) -> FaithfulnessGrade:
+    llm = ChatGoogleGenerativeAI(
+        model="gemini-1.5-pro",
+        temperature=0.5,
+    )
+    messages = [
+        {"role": "system", "content": FAITHFULNESS_PROMPT},
+        {"role": "user", "content": f"""Context: {reference_outputs["answer"]};
+Output: {outputs['response']}
+"""}
+    ]
+    response = llm.invoke(messages)
+    return FaithfulnessGrade(score=int(response.content)).score
+def deepseek_evaluator_faithfulness(outputs: dict, reference_outputs: dict) -> FaithfulnessGrade:
+    llm = ChatDeepSeek(
+        model="deepseek-chat",
+        temperature=0.5,
+    )
+    messages = [
+        {"role": "system", "content": FAITHFULNESS_PROMPT},
+        {"role": "user", "content": f"""Context: {reference_outputs["answer"]};
+Output: {outputs['response']}
+"""}
+    ]
+    response = llm.invoke(messages)
+    return FaithfulnessGrade(score=int(response.content)).score
+def moonshot_evaluator_faithfulness(outputs: dict, reference_outputs: dict) -> FaithfulnessGrade:
+    llm = Moonshot(
+        model="moonshot-v1-128k",
+        temperature=0.5,
+    )
+    messages = [
+        {"role": "system", "content": FAITHFULNESS_PROMPT},
+        {"role": "user", "content": f"""Context: {reference_outputs["answer"]};
+Output: {outputs['response']}
+"""}
+    ]
+    response = llm.invoke(messages)
+    try:
+        return FaithfulnessGrade(score=int(response)).score
+    except ValueError:
+        score_str = response.split(":")[1].strip()
+        return FaithfulnessGrade(score=int(score_str)).score
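Note on the fallback parsing added in this hunk: it assumes a judge reply shaped like "score: 4". When the reply has no colon at all, split(":")[1] raises IndexError rather than ValueError, and a reply like "Score: 4/5" still fails int(). A regex scan is a more forgiving way to honor the prompts' "numerical number between 1 and 5" instruction. A minimal sketch, not part of this commit; extract_score is a hypothetical helper:

import re

def extract_score(text: str) -> int:
    # Hypothetical helper, not in this commit: pull the first digit
    # in 1-5 out of the judge reply, so "4", "score: 4", and
    # "Score: 4/5" all parse, instead of relying on split(":").
    match = re.search(r"[1-5]", text)
    if match is None:
        raise ValueError(f"no score found in judge reply: {text!r}")
    return int(match.group())

Pydantic's Field also accepts ge=1 and le=5 bounds, so CorrectnessGrade and FaithfulnessGrade could reject out-of-range scores at validation time.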

View File: prompts_library.py

@@ -20,20 +20,18 @@ system_question_rewriter = """You a question re-writer that converts an input qu
 # Evaluation
-CORRECTNESS_PROMPT = """Evaluate Student Answer against Ground Truth for conceptual similarity and correctness.
-You are an impartial judge. Evaluate Student Answer against Ground Truth for conceptual similarity and correctness.
+CORRECTNESS_PROMPT = """You are an impartial judge. Evaluate Student Answer against Ground Truth for conceptual similarity and correctness.
 You may also be given additional information that was used by the model to generate the output.
-Your task is to determine a numerical score called faithfulness based on the input and output.
+Your task is to determine a numerical score called correctness based on the Student Answer and Ground Truth.
 A definition of correctness and a grading rubric are provided below.
 You must use the grading rubric to determine your score.
 Metric definition:
-Correctness assesses the degree to which a provided output aligns with factual accuracy, completeness, logical
-consistency, and precise terminology. It evaluates the intrinsic validity of the output, independent of any
+Correctness assesses the degree to which a provided Student Answer aligns with factual accuracy, completeness, logical
+consistency, and precise terminology of the Ground Truth. It evaluates the intrinsic validity of the Student Answer, independent of any
 external context. A higher score indicates a higher adherence to factual accuracy, completeness, logical consistency,
-and precise terminology.
+and precise terminology of the Ground Truth.
 Grading rubric:
 Correctness: Below are the details for different scores:
@@ -44,11 +42,37 @@ Correctness: Below are the details for different scores:
 - 5: Accurate, complete, logically consistent, and uses precise terminology.
 Reminder:
-- Carefully read the input and output
-- Check for factual accuracy and completeness
+- Carefully read the Student Answer and Ground Truth
+- Check for factual accuracy and completeness of Student Answer compared to the Ground Truth
 - Focus on correctness of information rather than style or verbosity
-- The goal is to evaluate factual correctness and completeness of the response.
+- The goal is to evaluate factual correctness and completeness of the Student Answer.
+- Please provide your answer score only with the numerical number between 1 and 5. No score: or other text is allowed.
 """
+FAITHFULNESS_PROMPT = """You are an impartial judge. Evaluate output against context for faithfulness.
+You may also be given additional information that was used by the model to generate the Output.
+Your task is to determine a numerical score called faithfulness based on the output and context.
+A definition of faithfulness and a grading rubric are provided below.
+You must use the grading rubric to determine your score.
+Metric definition:
+Faithfulness is only evaluated with the provided output and context. Faithfulness assesses how much of the
+provided output is factually consistent with the provided context. A higher score indicates that a higher proportion of
+claims present in the output can be derived from the provided context. Faithfulness does not consider how much extra
+information from the context is not present in the output.
+Grading rubric:
+Faithfulness: Below are the details for different scores:
+- Score 1: None of the claims in the output can be inferred from the provided context.
+- Score 2: Some of the claims in the output can be inferred from the provided context, but the majority of the output is missing from, inconsistent with, or contradictory to the provided context.
+- Score 3: Half or more of the claims in the output can be inferred from the provided context.
+- Score 4: Most of the claims in the output can be inferred from the provided context, with very little information that is not directly supported by the provided context.
+- Score 5: All of the claims in the output are directly supported by the provided context, demonstrating high faithfulness to the provided context.
+Reminder:
+- Carefully read the output and context
+- Focus on the information instead of the writing style or verbosity.
+- Please provide your answer score only with the numerical number between 1 and 5, according to the grading rubric above. No score: or other text is allowed.
+"""

View File

@@ -38,12 +38,9 @@ from evaluators import (
     gemini_evaluator_correctness,
     deepseek_evaluator_correctness,
     moonshot_evaluator_correctness,
-    # gemini_evaluator_conciseness,
-    # deepseek_evaluator_conciseness,
-    # moonshot_evaluator_conciseness,
-    # gemini_evaluator_hallucination,
-    # deepseek_evaluator_hallucination,
-    # moonshot_evaluator_hallucination
+    gemini_evaluator_faithfulness,
+    deepseek_evaluator_faithfulness,
+    moonshot_evaluator_faithfulness
 )
 logging.basicConfig(level=logging.INFO, format="%(asctime)-15s %(message)s")
@@ -520,7 +517,10 @@ def go_evaluation(args):
     evaluators = [
         moonshot_evaluator_correctness,
         deepseek_evaluator_correctness,
-        gemini_evaluator_correctness
+        gemini_evaluator_correctness,
+        gemini_evaluator_faithfulness,
+        deepseek_evaluator_faithfulness,
+        moonshot_evaluator_faithfulness
         # can add multiple evaluators here
     ],
     experiment_prefix = "first-eval-in-langsmith",
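The last hunk is truncated by the diff view. For orientation only, an evaluators list with an experiment_prefix like this typically sits inside a LangSmith evaluate(...) call along the lines of the sketch below; the target function and dataset name are placeholders, not taken from the commit:

from langsmith import evaluate  # assumes a recent langsmith SDK

results = evaluate(
    rag_chain,                 # placeholder: the app under test
    data="rag-eval-dataset",   # placeholder dataset name
    evaluators=[
        moonshot_evaluator_correctness,
        deepseek_evaluator_correctness,
        gemini_evaluator_correctness,
        gemini_evaluator_faithfulness,
        deepseek_evaluator_faithfulness,
        moonshot_evaluator_faithfulness,
        # can add multiple evaluators here
    ],
    experiment_prefix="first-eval-in-langsmith",
)

LangSmith invokes each evaluator once per example, passing the run's outputs and the dataset's reference_outputs, which matches the (outputs, reference_outputs) signatures used above.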