mirror of https://github.com/aimingmed/aimingmed-ai.git
synced 2026-01-26 00:47:13 +08:00

correctness done

This commit is contained in:
parent 486a79a2cc
commit fcb2f9e4ea
@@ -1,4 +1,4 @@
from typing import Literal, List
from typing import Literal
from pydantic import BaseModel, Field
@@ -1,99 +1,77 @@
import os
from decouple import config
from openevals.llm import create_llm_as_judge
from openevals.prompts import (
    CORRECTNESS_PROMPT,
    CONCISENESS_PROMPT,
    HALLUCINATION_PROMPT
)

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_deepseek import ChatDeepSeek
from langchain_community.llms.moonshot import Moonshot

GEMINI_API_KEY = config("GOOGLE_API_KEY", cast=str)
DEEKSEEK_API_KEY = config("DEEKSEEK_API_KEY", cast=str)
MOONSHOT_API_KEY = config("MOONSHOT_API_KEY", cast=str)
from pydantic import BaseModel, Field

# correctness
gemini_evaluator_correctness = create_llm_as_judge(
    prompt=CORRECTNESS_PROMPT,
    judge=ChatGoogleGenerativeAI(
from prompts_library import CORRECTNESS_PROMPT

os.environ["GOOGLE_API_KEY"] = config("GOOGLE_API_KEY", cast=str)
os.environ["DEEPSEEK_API_KEY"] = config("DEEPSEEK_API_KEY", cast=str)
os.environ["MOONSHOT_API_KEY"] = config("MOONSHOT_API_KEY", cast=str)


# Define output schema for the evaluation
class CorrectnessGrade(BaseModel):
    score: int = Field(description="Numerical score (1-5) indicating the correctness of the response.")

# Todo:
# class RelevanceGrade(BaseModel):



def gemini_evaluator_correctness(outputs: dict, reference_outputs: dict) -> CorrectnessGrade:
    llm = ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        google_api_key=GEMINI_API_KEY,
        temperature=0.5,
    ),
    )
)

deepseek_evaluator_correctness = create_llm_as_judge(
    prompt=CORRECTNESS_PROMPT,
    judge=ChatDeepSeek(
    messages = [
        {"role": "system", "content": CORRECTNESS_PROMPT},
        {"role": "user", "content": f"""Ground Truth answer: {reference_outputs["answer"]};
        Student's Answer: {outputs['response']}
        """}
    ]

    response = llm.invoke(messages)

    return CorrectnessGrade(score=int(response.content)).score


def deepseek_evaluator_correctness(outputs: dict, reference_outputs: dict) -> CorrectnessGrade:
    llm = ChatDeepSeek(
        model="deepseek-chat",
        temperature=0.5,
        api_key=DEEKSEEK_API_KEY
    ),
    )
)

moonshot_evaluator_correctness = create_llm_as_judge(
    prompt=CORRECTNESS_PROMPT,
    judge=Moonshot(
        model="moonshot-v1-128k",
    messages = [
        {"role": "system", "content": CORRECTNESS_PROMPT},
        {"role": "user", "content": f"""Ground Truth answer: {reference_outputs["answer"]};
        Student's Answer: {outputs['response']}
        """}
    ]

    response = llm.invoke(messages)

    return CorrectnessGrade(score=int(response.content)).score


def moonshot_evaluator_correctness(outputs: dict, reference_outputs: dict) -> CorrectnessGrade:
    llm = Moonshot(
        model="moonshot-v1-128k",
        temperature=0.5,
        api_key=MOONSHOT_API_KEY
    ),
    )
)

# conciseness
gemini_evaluator_conciseness = create_llm_as_judge(
    prompt=CONCISENESS_PROMPT,
    judge=ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        google_api_key=GEMINI_API_KEY,
        temperature=0.5,
    ),
)
    messages = [
        {"role": "system", "content": CORRECTNESS_PROMPT},
        {"role": "user", "content": f"""Ground Truth answer: {reference_outputs["answer"]};
        Student's Answer: {outputs['response']}
        """}
    ]

deepseek_evaluator_conciseness = create_llm_as_judge(
    prompt=CONCISENESS_PROMPT,
    judge=ChatDeepSeek(
        model="deepseek-chat",
        temperature=0.5,
        api_key=DEEKSEEK_API_KEY
    ),
)

moonshot_evaluator_conciseness = create_llm_as_judge(
    prompt=CONCISENESS_PROMPT,
    judge=Moonshot(
        model="moonshot-v1-128k",
        temperature=0.5,
        api_key=MOONSHOT_API_KEY
    ),
)

# hallucination
gemini_evaluator_hallucination = create_llm_as_judge(
    prompt=HALLUCINATION_PROMPT,
    judge=ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        google_api_key=GEMINI_API_KEY,
        temperature=0.5,
    ),
)

deepseek_evaluator_hallucination = create_llm_as_judge(
    prompt=HALLUCINATION_PROMPT,
    judge=ChatDeepSeek(
        model="deepseek-chat",
        temperature=0.5,
        api_key=DEEKSEEK_API_KEY
    ),
)

moonshot_evaluator_hallucination = create_llm_as_judge(
    prompt=HALLUCINATION_PROMPT,
    judge=Moonshot(
        model="moonshot-v1-128k",
        temperature=0.5,
        api_key=MOONSHOT_API_KEY
    ),
)
    response = llm.invoke(messages)

    return CorrectnessGrade(score=int(response)).score
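This hunk replaces the openevals create_llm_as_judge evaluators with plain Python functions that LangSmith can call directly, and the removed and added lines appear interleaved above. For orientation, here is a minimal sketch of one of the new-style evaluators assembled from the added lines: the judge model, the 0.5 temperature, the message layout and the CorrectnessGrade schema are taken from the diff, while the int return annotation and the inline config() call are illustrative choices, not the committed code verbatim.

# Illustrative sketch of one new-style correctness evaluator.
# Assumes GOOGLE_API_KEY is available via python-decouple / .env and that
# CORRECTNESS_PROMPT asks the judge to reply with a bare integer between 1 and 5.
from decouple import config
from langchain_google_genai import ChatGoogleGenerativeAI
from pydantic import BaseModel, Field

from prompts_library import CORRECTNESS_PROMPT  # rubric defined in prompts_library.py (next hunk)


class CorrectnessGrade(BaseModel):
    score: int = Field(description="Numerical score (1-5) indicating the correctness of the response.")


def gemini_evaluator_correctness(outputs: dict, reference_outputs: dict) -> int:
    # Judge model; temperature 0.5 mirrors the diff, a lower value may grade more consistently.
    llm = ChatGoogleGenerativeAI(
        model="gemini-1.5-flash",
        google_api_key=config("GOOGLE_API_KEY", cast=str),
        temperature=0.5,
    )
    messages = [
        {"role": "system", "content": CORRECTNESS_PROMPT},
        {"role": "user", "content": f"Ground Truth answer: {reference_outputs['answer']};\n"
                                    f"Student's Answer: {outputs['response']}"},
    ]
    response = llm.invoke(messages)
    # Validate the judge's reply through the schema before returning the bare score;
    # int() will raise if the judge adds any extra text around the digit.
    return CorrectnessGrade(score=int(response.content)).score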
@@ -16,4 +16,39 @@ system_answer_grader = """You are a grader assessing whether an answer addresses
Give a binary score 'yes' or 'no'. Yes' means that the answer resolves the question."""

system_question_rewriter = """You a question re-writer that converts an input question to a better version that is optimized \n
for vectorstore retrieval. Look at the input and try to reason about the underlying semantic intent / meaning."""


# Evaluation
CORRECTNESS_PROMPT = """Evaluate Student Answer against Ground Truth for conceptual similarity and correctness.

You are an impartial judge. Evaluate Student Answer against Ground Truth for conceptual similarity and correctness.
You may also be given additional information that was used by the model to generate the output.

Your task is to determine a numerical score called faithfulness based on the input and output.
A definition of correctness and a grading rubric are provided below.
You must use the grading rubric to determine your score.

Metric definition:
Correctness assesses the degree to which a provided output aligns with factual accuracy, completeness, logical
consistency, and precise terminology. It evaluates the intrinsic validity of the output, independent of any
external context. A higher score indicates a higher adherence to factual accuracy, completeness, logical consistency,
and precise terminology.

Grading rubric:
Correctness: Below are the details for different scores:
- 1: Major factual errors, highly incomplete, illogical, and uses incorrect terminology.
- 2: Significant factual errors, incomplete, noticeable logical flaws, and frequent terminology errors.
- 3: Minor factual errors, somewhat incomplete, minor logical inconsistencies, and occasional terminology errors.
- 4: Few to no factual errors, mostly complete, strong logical consistency, and accurate terminology.
- 5: Accurate, complete, logically consistent, and uses precise terminology.

Reminder:
- Carefully read the input and output
- Check for factual accuracy and completeness
- Focus on correctness of information rather than style or verbosity
- The goal is to evaluate factual correctness and completeness of the response.
- Please provide your answer score only with the numerical number between 1 and 5. No score: or other text is allowed.

"""
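The rubric above tells the judge to reply with nothing but a digit between 1 and 5, and the evaluators in this commit then call int() on the raw reply. Judge models occasionally wrap the number in extra text anyway, so a small defensive parser can help. The helper below is a sketch that is not part of the commit; the ge/le bounds on the schema are an added safeguard on top of the schema defined in evaluators.py.

import re

from pydantic import BaseModel, Field


class CorrectnessGrade(BaseModel):
    # Mirrors the schema in evaluators.py; the 1-5 bounds are an extra check.
    score: int = Field(ge=1, le=5, description="Numerical score (1-5) indicating the correctness of the response.")


def parse_judge_score(raw_reply: str) -> int:
    """Extract the first digit 1-5 from the judge's reply, tolerating stray text like 'Score: 4'."""
    match = re.search(r"\b([1-5])\b", raw_reply)
    if match is None:
        raise ValueError(f"Judge reply contained no score between 1 and 5: {raw_reply!r}")
    return CorrectnessGrade(score=int(match.group(1))).score


# Example: parse_judge_score("Score: 4") == 4, parse_judge_score("5") == 5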
@@ -24,7 +24,6 @@ build_dependencies:
  - tavily-python
  - langchain_huggingface
  - pydantic
  - openevals
# Dependencies required to run the project.
dependencies:
  - mlflow==2.8.1
@@ -38,30 +38,26 @@ from evaluators import (
    gemini_evaluator_correctness,
    deepseek_evaluator_correctness,
    moonshot_evaluator_correctness,
    gemini_evaluator_conciseness,
    deepseek_evaluator_conciseness,
    moonshot_evaluator_conciseness,
    gemini_evaluator_hallucination,
    deepseek_evaluator_hallucination,
    moonshot_evaluator_hallucination
    # gemini_evaluator_conciseness,
    # deepseek_evaluator_conciseness,
    # moonshot_evaluator_conciseness,
    # gemini_evaluator_hallucination,
    # deepseek_evaluator_hallucination,
    # moonshot_evaluator_hallucination
)

logging.basicConfig(level=logging.INFO, format="%(asctime)-15s %(message)s")
logger = logging.getLogger()

GEMINI_API_KEY = config("GOOGLE_API_KEY", cast=str)
DEEKSEEK_API_KEY = config("DEEKSEEK_API_KEY", cast=str)
MOONSHOT_API_KEY = config("MOONSHOT_API_KEY", cast=str)
TAVILY_API_KEY = config("TAVILY_API_KEY", cast=str)
LANGSMITH_API_KEY = config("LANGSMITH_API_KEY", cast=str)
LANGSMITH_TRACING = config("LANGSMITH_TRACING", cast=str)
LANGSMITH_PROJECT = config("LANGSMITH_PROJECT", cast=str)
os.environ["TAVILY_API_KEY"] = TAVILY_API_KEY
os.environ["GOOGLE_API_KEY"] = config("GOOGLE_API_KEY", cast=str)
os.environ["DEEPSEEK_API_KEY"] = config("DEEPSEEK_API_KEY", cast=str)
os.environ["MOONSHOT_API_KEY"] = config("MOONSHOT_API_KEY", cast=str)
os.environ["TAVILY_API_KEY"] = config("TAVILY_API_KEY", cast=str)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["LANGSMITH_API_KEY"] = LANGSMITH_API_KEY
os.environ["LANGSMITH_TRACING"] = LANGSMITH_TRACING
os.environ["LANGSMITH_API_KEY"] = config("LANGSMITH_API_KEY", cast=str)
os.environ["LANGSMITH_TRACING"] = config("LANGSMITH_TRACING", cast=str)
os.environ["LANGSMITH_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGSMITH_PROJECT"] = LANGSMITH_PROJECT
os.environ["LANGSMITH_PROJECT"] = config("LANGSMITH_PROJECT", cast=str)

def go(args):
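Both entry-point scripts read their credentials through python-decouple and mirror them into os.environ before any LangChain client is built. For reference, the .env file they expect would carry roughly the following keys; the key names are the ones referenced above, the values are placeholders, and note that the DeepSeek key is read under both spellings (DEEPSEEK_API_KEY by the os.environ lines, DEEKSEEK_API_KEY by the older module-level constants), so both entries need to be present. LANGSMITH_TRACING is read as a string; "true" is the usual value for enabling tracing.

GOOGLE_API_KEY=<gemini api key>
DEEPSEEK_API_KEY=<deepseek api key>
DEEKSEEK_API_KEY=<deepseek api key, older spelling still read by some modules>
MOONSHOT_API_KEY=<moonshot api key>
TAVILY_API_KEY=<tavily api key>
LANGSMITH_API_KEY=<langsmith api key>
LANGSMITH_TRACING=true
LANGSMITH_PROJECT=<langsmith project name>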
@@ -95,12 +91,10 @@ def go(args):
            max_tokens=None,
            timeout=None,
            max_retries=2,
            api_key=DEEKSEEK_API_KEY
        )
    elif args.chat_model_provider == 'gemini':
        llm = ChatGoogleGenerativeAI(
            model="gemini-1.5-flash",
            google_api_key=GEMINI_API_KEY,
            temperature=0,
            max_retries=3,
            streaming=True
@@ -112,7 +106,6 @@ def go(args):
            max_tokens=None,
            timeout=None,
            max_retries=2,
            api_key=MOONSHOT_API_KEY
        )

    # Load data from ChromaDB
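go() selects the chat model from args.chat_model_provider, but the hunks above only show the trailing keyword arguments of each constructor. The sketch below is an inferred reconstruction of that dispatch, not the committed code: the class names, model ids and keyword arguments are the ones visible in the diff, while the helper name build_llm and the if/elif scaffolding are assumptions.

from decouple import config
from langchain_deepseek import ChatDeepSeek
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.llms.moonshot import Moonshot


def build_llm(chat_model_provider: str):
    """Hypothetical helper mirroring the provider dispatch inside go()."""
    if chat_model_provider == "deepseek":
        return ChatDeepSeek(
            model="deepseek-chat",
            max_tokens=None,
            timeout=None,
            max_retries=2,
            api_key=config("DEEKSEEK_API_KEY", cast=str),
        )
    elif chat_model_provider == "gemini":
        return ChatGoogleGenerativeAI(
            model="gemini-1.5-flash",
            google_api_key=config("GOOGLE_API_KEY", cast=str),
            temperature=0,
            max_retries=3,
            streaming=True,
        )
    elif chat_model_provider == "moonshot":
        return Moonshot(
            model="moonshot-v1-128k",
            max_tokens=None,
            timeout=None,
            max_retries=2,
            api_key=config("MOONSHOT_API_KEY", cast=str),
        )
    raise ValueError(f"Unknown chat_model_provider: {chat_model_provider!r}")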
@@ -479,7 +472,61 @@ def go(args):
        pprint("\n---\n")

    # Final generation
    pprint(value["generation"])
    print(value["generation"])

    return {"response": value["generation"]}


def go_evaluation(args):
    if args.query_evaluation_dataset_csv_path:
        # import pandas as pd
        # from tqdm import tqdm

        # df = pd.read_csv(args.query_evaluation_dataset_csv_path)
        client = Client()
        # # Create inputs and reference outputs
        # examples = [
        #     (
        #         "Which country is Mount Kilimanjaro located in?",
        #         "Mount Kilimanjaro is located in Tanzania.",
        #     ),
        #     (
        #         "What is Earth's lowest point?",
        #         "Earth's lowest point is The Dead Sea.",
        #     ),
        # ]

        # inputs = [{"question": input_prompt} for input_prompt, _ in examples]
        # outputs = [{"answer": output_answer} for _, output_answer in examples]

        # # Programmatically create a dataset in LangSmith
        # dataset = client.create_dataset(
        #     dataset_name = "Sample dataset",
        #     description = "A sample dataset in LangSmith."
        # )

        # # Add examples to the dataset
        # client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id)

        def target(inputs: dict) -> dict:
            new_args = argparse.Namespace(**vars(args))
            new_args.query = inputs["question"]
            return go(new_args)


        # After running the evaluation, a link will be provided to view the results in langsmith
        experiment_results = client.evaluate(
            target,
            data = "Sample dataset",
            evaluators = [
                moonshot_evaluator_correctness,
                deepseek_evaluator_correctness,
                gemini_evaluator_correctness
                # can add multiple evaluators here
            ],
            experiment_prefix = "first-eval-in-langsmith",
            max_concurrency = 1,

        )
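The commented-out block above documents how the "Sample dataset" referenced by client.evaluate(..., data="Sample dataset") is created. Assembled and uncommented, that one-time bootstrap would look roughly like this; the content is taken from the commented lines, and the import of Client from langsmith is assumed from the surrounding code.

from langsmith import Client

client = Client()

# Questions paired with their reference answers, as in the commented-out example.
examples = [
    ("Which country is Mount Kilimanjaro located in?", "Mount Kilimanjaro is located in Tanzania."),
    ("What is Earth's lowest point?", "Earth's lowest point is The Dead Sea."),
]
inputs = [{"question": question} for question, _ in examples]
outputs = [{"answer": answer} for _, answer in examples]

# Create the dataset once; later runs refer to it by name via data="Sample dataset".
dataset = client.create_dataset(
    dataset_name="Sample dataset",
    description="A sample dataset in LangSmith.",
)
client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id)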
@@ -523,4 +570,5 @@ if __name__ == "__main__":

    args = parser.parse_args()

    go(args)
    # go(args)
    go_evaluation(args)
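With this change the entry point now calls go_evaluation(args) instead of go(args). The flags it relies on (args.chat_model_provider, args.query and args.query_evaluation_dataset_csv_path) appear throughout the diff; a plausible parser for them is sketched below, with defaults and help strings that are illustrative rather than taken from the commit.

import argparse

parser = argparse.ArgumentParser(description="Run the RAG pipeline or its LangSmith evaluation")
parser.add_argument("--chat_model_provider", type=str, default="gemini",
                    help="One of deepseek, gemini or moonshot (illustrative default)")
parser.add_argument("--query", type=str, default=None,
                    help="Single question to answer when go() is used")
parser.add_argument("--query_evaluation_dataset_csv_path", type=str, default=None,
                    help="CSV of evaluation questions; go_evaluation() checks this flag before building a dataset")

args = parser.parse_args()
# The entry point then hands args to go_evaluation(args), as shown above.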
@@ -14,18 +14,14 @@ from langchain_community.llms.moonshot import Moonshot
logging.basicConfig(level=logging.INFO, format="%(asctime)-15s %(message)s")
logger = logging.getLogger()

os.environ["GOOGLE_API_KEY"] = config("GOOGLE_API_KEY", cast=str)
os.environ["DEEPSEEK_API_KEY"] = config("DEEPSEEK_API_KEY", cast=str)
os.environ["MOONSHOT_API_KEY"] = config("MOONSHOT_API_KEY", cast=str)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
GEMINI_API_KEY = config("GOOGLE_API_KEY", cast=str)
DEEKSEEK_API_KEY = config("DEEKSEEK_API_KEY", cast=str)
MOONSHOT_API_KEY = config("MOONSHOT_API_KEY", cast=str)
LANGSMITH_API_KEY = config("LANGSMITH_API_KEY", cast=str)
LANGSMITH_TRACING = config("LANGSMITH_TRACING", cast=str)
LANGSMITH_PROJECT = config("LANGSMITH_PROJECT", cast=str)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["LANGSMITH_API_KEY"] = LANGSMITH_API_KEY
os.environ["LANGSMITH_TRACING"] = LANGSMITH_TRACING
os.environ["LANGSMITH_API_KEY"] = config("LANGSMITH_API_KEY", cast=str)
os.environ["LANGSMITH_TRACING"] = config("LANGSMITH_TRACING", cast=str)
os.environ["LANGSMITH_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGSMITH_PROJECT"] = LANGSMITH_PROJECT
os.environ["LANGSMITH_PROJECT"] = config("LANGSMITH_PROJECT", cast=str)

def go(args):
@@ -68,14 +64,12 @@ def go(args):
            max_tokens=None,
            timeout=None,
            max_retries=2,
            api_key=DEEKSEEK_API_KEY
        )

    elif args.chat_model_provider == "gemini":
        # Initialize Gemini model
        llm = ChatGoogleGenerativeAI(
            model="gemini-1.5-flash",
            google_api_key=GEMINI_API_KEY,
            temperature=0,
            max_retries=3
        )
@@ -88,7 +82,6 @@ def go(args):
            max_tokens=None,
            timeout=None,
            max_retries=2,
            api_key=MOONSHOT_API_KEY
        )