"""LLM-as-judge evaluators: score correctness and faithfulness of model answers
using Gemini, DeepSeek, and Moonshot as grading models."""

import os
from decouple import config
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_deepseek import ChatDeepSeek
from langchain_community.llms.moonshot import Moonshot
from pydantic import BaseModel, Field
from prompts_library import CORRECTNESS_PROMPT, FAITHFULNESS_PROMPT

# Read provider credentials via python-decouple (env vars / .env file) and
# re-export them as process environment variables so the langchain clients
# below can pick them up implicitly at construction time.
os.environ["GOOGLE_API_KEY"] = config("GOOGLE_API_KEY", cast=str)
os.environ["DEEPSEEK_API_KEY"] = config("DEEPSEEK_API_KEY", cast=str)
os.environ["MOONSHOT_API_KEY"] = config("MOONSHOT_API_KEY", cast=str)
# Define output schema for the evaluation
class CorrectnessGrade(BaseModel):
    """Validated output schema for a correctness evaluation."""

    # Judge-assigned correctness rating; description says 1 (worst) to 5 (best).
    score: int = Field(description="Numerical score (1-5) indicating the correctness of the response.")
class FaithfulnessGrade(BaseModel):
    """Validated output schema for a faithfulness evaluation."""

    # Judge-assigned faithfulness rating; description says 1 (worst) to 5 (best).
    score: int = Field(description="Numerical score (1-5) indicating the faithfulness of the response.")
# Evaluators
def gemini_evaluator_correctness(outputs: dict, reference_outputs: dict) -> int:
    """Grade answer correctness using Gemini as the LLM judge.

    Args:
        outputs: Must contain key "response" — the student's answer to grade.
        reference_outputs: Must contain key "answer" — the ground-truth answer.

    Returns:
        The integer score (1-5), validated through ``CorrectnessGrade``.

    Raises:
        ValueError: If no numeric score can be parsed from the model reply.
    """
    llm = ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        temperature=0.5,
    )
    messages = [
        {"role": "system", "content": CORRECTNESS_PROMPT},
        {"role": "user", "content": f"""Ground Truth answer: {reference_outputs["answer"]};
Student's Answer: {outputs['response']}
"""},
    ]
    response = llm.invoke(messages)
    text = response.content.strip()
    try:
        score = int(text)
    except ValueError:
        # The judge sometimes replies with prose such as "Score: 4" instead of
        # a bare number; salvage the first digit (scores are single-digit 1-5).
        digit = next((ch for ch in text if ch.isdigit()), None)
        if digit is None:
            raise ValueError(f"Could not parse a score from model output: {text!r}")
        score = int(digit)
    return CorrectnessGrade(score=score).score
def deepseek_evaluator_correctness(outputs: dict, reference_outputs: dict) -> int:
    """Grade answer correctness using DeepSeek as the LLM judge.

    Args:
        outputs: Must contain key "response" — the student's answer to grade.
        reference_outputs: Must contain key "answer" — the ground-truth answer.

    Returns:
        The integer score (1-5), validated through ``CorrectnessGrade``.

    Raises:
        ValueError: If no numeric score can be parsed from the model reply.
    """
    llm = ChatDeepSeek(
        model="deepseek-chat",
        temperature=0.5,
    )
    messages = [
        {"role": "system", "content": CORRECTNESS_PROMPT},
        {"role": "user", "content": f"""Ground Truth answer: {reference_outputs["answer"]};
Student's Answer: {outputs['response']}
"""},
    ]
    response = llm.invoke(messages)
    text = response.content.strip()
    try:
        score = int(text)
    except ValueError:
        # The judge sometimes replies with prose such as "Score: 4" instead of
        # a bare number; salvage the first digit (scores are single-digit 1-5).
        digit = next((ch for ch in text if ch.isdigit()), None)
        if digit is None:
            raise ValueError(f"Could not parse a score from model output: {text!r}")
        score = int(digit)
    return CorrectnessGrade(score=score).score
def moonshot_evaluator_correctness(outputs: dict, reference_outputs: dict) -> int:
    """Grade answer correctness using Moonshot as the LLM judge.

    Note: ``Moonshot`` is a completion-style LLM, so ``invoke`` returns a plain
    string rather than a message object.

    Args:
        outputs: Must contain key "response" — the student's answer to grade.
        reference_outputs: Must contain key "answer" — the ground-truth answer.

    Returns:
        The integer score (1-5), validated through ``CorrectnessGrade``.

    Raises:
        ValueError: If no numeric score can be parsed from the model reply.
    """
    llm = Moonshot(
        model="moonshot-v1-128k",
        temperature=0.5,
    )
    messages = [
        {"role": "system", "content": CORRECTNESS_PROMPT},
        {"role": "user", "content": f"""Ground Truth answer: {reference_outputs["answer"]};
Student's Answer: {outputs['response']}
"""},
    ]
    response = llm.invoke(messages)
    text = response.strip()
    try:
        score = int(text)
    except ValueError:
        # Previous fallback split on ":" and raised IndexError when the reply
        # had no colon; take the first digit instead (scores are 1-5).
        digit = next((ch for ch in text if ch.isdigit()), None)
        if digit is None:
            raise ValueError(f"Could not parse a score from model output: {text!r}")
        score = int(digit)
    return CorrectnessGrade(score=score).score
def gemini_evaluator_faithfulness(outputs: dict, reference_outputs: dict) -> int:
    """Grade answer faithfulness to the reference context using Gemini.

    Args:
        outputs: Must contain key "response" — the output being graded.
        reference_outputs: Must contain key "answer" — the grounding context.

    Returns:
        The integer score (1-5), validated through ``FaithfulnessGrade``.

    Raises:
        ValueError: If no numeric score can be parsed from the model reply.
    """
    llm = ChatGoogleGenerativeAI(
        model="gemini-1.5-pro",
        temperature=0.5,
    )
    messages = [
        {"role": "system", "content": FAITHFULNESS_PROMPT},
        {"role": "user", "content": f"""Context: {reference_outputs["answer"]};
Output: {outputs['response']}
"""},
    ]
    response = llm.invoke(messages)
    text = response.content.strip()
    try:
        score = int(text)
    except ValueError:
        # The judge sometimes replies with prose such as "Score: 4" instead of
        # a bare number; salvage the first digit (scores are single-digit 1-5).
        digit = next((ch for ch in text if ch.isdigit()), None)
        if digit is None:
            raise ValueError(f"Could not parse a score from model output: {text!r}")
        score = int(digit)
    return FaithfulnessGrade(score=score).score
def deepseek_evaluator_faithfulness(outputs: dict, reference_outputs: dict) -> int:
    """Grade answer faithfulness to the reference context using DeepSeek.

    Args:
        outputs: Must contain key "response" — the output being graded.
        reference_outputs: Must contain key "answer" — the grounding context.

    Returns:
        The integer score (1-5), validated through ``FaithfulnessGrade``.

    Raises:
        ValueError: If no numeric score can be parsed from the model reply.
    """
    llm = ChatDeepSeek(
        model="deepseek-chat",
        temperature=0.5,
    )
    messages = [
        {"role": "system", "content": FAITHFULNESS_PROMPT},
        {"role": "user", "content": f"""Context: {reference_outputs["answer"]};
Output: {outputs['response']}
"""},
    ]
    response = llm.invoke(messages)
    text = response.content.strip()
    try:
        score = int(text)
    except ValueError:
        # The judge sometimes replies with prose such as "Score: 4" instead of
        # a bare number; salvage the first digit (scores are single-digit 1-5).
        digit = next((ch for ch in text if ch.isdigit()), None)
        if digit is None:
            raise ValueError(f"Could not parse a score from model output: {text!r}")
        score = int(digit)
    return FaithfulnessGrade(score=score).score
def moonshot_evaluator_faithfulness(outputs: dict, reference_outputs: dict) -> int:
    """Grade answer faithfulness to the reference context using Moonshot.

    Note: ``Moonshot`` is a completion-style LLM, so ``invoke`` returns a plain
    string rather than a message object.

    Args:
        outputs: Must contain key "response" — the output being graded.
        reference_outputs: Must contain key "answer" — the grounding context.

    Returns:
        The integer score (1-5), validated through ``FaithfulnessGrade``.

    Raises:
        ValueError: If no numeric score can be parsed from the model reply.
    """
    llm = Moonshot(
        model="moonshot-v1-128k",
        temperature=0.5,
    )
    messages = [
        {"role": "system", "content": FAITHFULNESS_PROMPT},
        {"role": "user", "content": f"""Context: {reference_outputs["answer"]};
Output: {outputs['response']}
"""},
    ]
    response = llm.invoke(messages)
    text = response.strip()
    try:
        score = int(text)
    except ValueError:
        # Previous fallback split on ":" and raised IndexError when the reply
        # had no colon; take the first digit instead (scores are 1-5).
        digit = next((ch for ch in text if ch.isdigit()), None)
        if digit is None:
            raise ValueError(f"Could not parse a score from model output: {text!r}")
        score = int(digit)
    return FaithfulnessGrade(score=score).score