import os

from decouple import config
from langchain_community.llms.moonshot import Moonshot
from langchain_deepseek import ChatDeepSeek
from langchain_google_genai import ChatGoogleGenerativeAI
from pydantic import BaseModel, Field

from prompts_library import CORRECTNESS_PROMPT, FAITHFULNESS_PROMPT

os.environ["GOOGLE_API_KEY"] = config("GOOGLE_API_KEY", cast=str)
os.environ["DEEPSEEK_API_KEY"] = config("DEEPSEEK_API_KEY", cast=str)
os.environ["MOONSHOT_API_KEY"] = config("MOONSHOT_API_KEY", cast=str)
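
# python-decouple resolves config() from a .env file or the process environment.
# A .env along these lines would satisfy the lookups above (the values are
# placeholders, not taken from this repo):
#
#   GOOGLE_API_KEY=your-google-api-key
#   DEEPSEEK_API_KEY=your-deepseek-api-key
#   MOONSHOT_API_KEY=your-moonshot-api-key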


# Define output schema for the evaluation
class CorrectnessGrade(BaseModel):
    score: int = Field(description="Numerical score (1-5) indicating the correctness of the response.")


class FaithfulnessGrade(BaseModel):
    score: int = Field(description="Numerical score (1-5) indicating the faithfulness of the response.")
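

# Optional sketch, not used by the evaluators below: these Pydantic schemas could
# also be bound to a chat model via LangChain's `with_structured_output`, which
# would return a validated CorrectnessGrade instead of an integer parsed out of
# raw text. The helper name and model choice here are illustrative only.
def _structured_correctness_grade(outputs: dict, reference_outputs: dict) -> CorrectnessGrade:
    grader = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0.5)
    structured_grader = grader.with_structured_output(CorrectnessGrade)
    return structured_grader.invoke([
        {"role": "system", "content": CORRECTNESS_PROMPT},
        {"role": "user", "content": f"""Ground Truth answer: {reference_outputs['answer']};
        Student's Answer: {outputs['response']}
        """},
    ])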


# Evaluators
def gemini_evaluator_correctness(outputs: dict, reference_outputs: dict) -> int:
    """Grade the response against the reference answer; returns the 1-5 correctness score."""
    llm = ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        temperature=0.5,
    )

    messages = [
        {"role": "system", "content": CORRECTNESS_PROMPT},
        {"role": "user", "content": f"""Ground Truth answer: {reference_outputs['answer']};
        Student's Answer: {outputs['response']}
        """},
    ]

    response = llm.invoke(messages)

    # Assumes the grader replies with a bare integer; the schema validates it.
    return CorrectnessGrade(score=int(response.content)).score


def deepseek_evaluator_correctness(outputs: dict, reference_outputs: dict) -> int:
    """Grade the response against the reference answer; returns the 1-5 correctness score."""
    llm = ChatDeepSeek(
        model="deepseek-chat",
        temperature=0.5,
    )

    messages = [
        {"role": "system", "content": CORRECTNESS_PROMPT},
        {"role": "user", "content": f"""Ground Truth answer: {reference_outputs['answer']};
        Student's Answer: {outputs['response']}
        """},
    ]

    response = llm.invoke(messages)

    # Assumes the grader replies with a bare integer; the schema validates it.
    return CorrectnessGrade(score=int(response.content)).score


def moonshot_evaluator_correctness(outputs: dict, reference_outputs: dict) -> int:
    """Grade the response against the reference answer; returns the 1-5 correctness score."""
    llm = Moonshot(
        model="moonshot-v1-128k",
        temperature=0.5,
    )

    messages = [
        {"role": "system", "content": CORRECTNESS_PROMPT},
        {"role": "user", "content": f"""Ground Truth answer: {reference_outputs['answer']};
        Student's Answer: {outputs['response']}
        """},
    ]

    # Moonshot is a completion-style LLM, so invoke() returns a plain string.
    response = llm.invoke(messages)

    try:
        return CorrectnessGrade(score=int(response)).score
    except ValueError:
        # Fall back to replies shaped like "Score: 4".
        score_str = response.split(":")[1].strip()
        return CorrectnessGrade(score=int(score_str)).score


def gemini_evaluator_faithfulness(outputs: dict, reference_outputs: dict) -> int:
    """Grade how faithful the response is to the reference context; returns the 1-5 score."""
    llm = ChatGoogleGenerativeAI(
        model="gemini-1.5-pro",
        temperature=0.5,
    )

    messages = [
        {"role": "system", "content": FAITHFULNESS_PROMPT},
        {"role": "user", "content": f"""Context: {reference_outputs['answer']};
        Output: {outputs['response']}
        """},
    ]

    response = llm.invoke(messages)

    # Assumes the grader replies with a bare integer; the schema validates it.
    return FaithfulnessGrade(score=int(response.content)).score


def deepseek_evaluator_faithfulness(outputs: dict, reference_outputs: dict) -> int:
    """Grade how faithful the response is to the reference context; returns the 1-5 score."""
    llm = ChatDeepSeek(
        model="deepseek-chat",
        temperature=0.5,
    )

    messages = [
        {"role": "system", "content": FAITHFULNESS_PROMPT},
        {"role": "user", "content": f"""Context: {reference_outputs['answer']};
        Output: {outputs['response']}
        """},
    ]

    response = llm.invoke(messages)

    # Assumes the grader replies with a bare integer; the schema validates it.
    return FaithfulnessGrade(score=int(response.content)).score


def moonshot_evaluator_faithfulness(outputs: dict, reference_outputs: dict) -> int:
    """Grade how faithful the response is to the reference context; returns the 1-5 score."""
    llm = Moonshot(
        model="moonshot-v1-128k",
        temperature=0.5,
    )

    messages = [
        {"role": "system", "content": FAITHFULNESS_PROMPT},
        {"role": "user", "content": f"""Context: {reference_outputs['answer']};
        Output: {outputs['response']}
        """},
    ]

    # Moonshot is a completion-style LLM, so invoke() returns a plain string.
    response = llm.invoke(messages)

    try:
        return FaithfulnessGrade(score=int(response)).score
    except ValueError:
        # Fall back to replies shaped like "Score: 4".
        score_str = response.split(":")[1].strip()
        return FaithfulnessGrade(score=int(score_str)).score
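

# Minimal usage sketch: the example dicts below are placeholders that only mirror the
# shapes the evaluators expect (outputs["response"] and reference_outputs["answer"]);
# they are not data from this repo.
if __name__ == "__main__":
    example_outputs = {"response": "Aspirin irreversibly inhibits COX-1 and COX-2."}
    example_reference = {"answer": "Aspirin works by irreversibly inhibiting cyclooxygenase enzymes."}

    correctness = gemini_evaluator_correctness(example_outputs, example_reference)
    faithfulness = gemini_evaluator_faithfulness(example_outputs, example_reference)
    print(f"Correctness: {correctness}/5, Faithfulness: {faithfulness}/5")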