correctness done

leehk 2025-03-13 15:04:21 +08:00
parent 486a79a2cc
commit fcb2f9e4ea
6 changed files with 174 additions and 121 deletions

View File

@@ -1,4 +1,4 @@
-from typing import Literal, List
+from typing import Literal
 from pydantic import BaseModel, Field

View File

@@ -1,99 +1,77 @@
+import os
 from decouple import config
-from openevals.llm import create_llm_as_judge
-from openevals.prompts import (
-    CORRECTNESS_PROMPT,
-    CONCISENESS_PROMPT,
-    HALLUCINATION_PROMPT
-)
 from langchain_google_genai import ChatGoogleGenerativeAI
 from langchain_deepseek import ChatDeepSeek
 from langchain_community.llms.moonshot import Moonshot
-GEMINI_API_KEY = config("GOOGLE_API_KEY", cast=str)
-DEEKSEEK_API_KEY = config("DEEKSEEK_API_KEY", cast=str)
-MOONSHOT_API_KEY = config("MOONSHOT_API_KEY", cast=str)
+from pydantic import BaseModel, Field
+from prompts_library import CORRECTNESS_PROMPT
+os.environ["GOOGLE_API_KEY"] = config("GOOGLE_API_KEY", cast=str)
+os.environ["DEEPSEEK_API_KEY"] = config("DEEPSEEK_API_KEY", cast=str)
+os.environ["MOONSHOT_API_KEY"] = config("MOONSHOT_API_KEY", cast=str)
-# correctness
-gemini_evaluator_correctness = create_llm_as_judge(
-    prompt=CORRECTNESS_PROMPT,
-    judge=ChatGoogleGenerativeAI(
+# Define output schema for the evaluation
+class CorrectnessGrade(BaseModel):
+    score: int = Field(description="Numerical score (1-5) indicating the correctness of the response.")
+# Todo:
+# class RelevanceGrade(BaseModel):
+def gemini_evaluator_correctness(outputs: dict, reference_outputs: dict) -> CorrectnessGrade:
+    llm = ChatGoogleGenerativeAI(
         model="gemini-1.5-flash",
-        google_api_key=GEMINI_API_KEY,
         temperature=0.5,
-    ),
-)
+    )
+    messages = [
+        {"role": "system", "content": CORRECTNESS_PROMPT},
+        {"role": "user", "content": f"""Ground Truth answer: {reference_outputs["answer"]};
+            Student's Answer: {outputs['response']}
+            """}
+    ]
+    response = llm.invoke(messages)
+    return CorrectnessGrade(score=int(response.content)).score
-deepseek_evaluator_correctness = create_llm_as_judge(
-    prompt=CORRECTNESS_PROMPT,
-    judge=ChatDeepSeek(
+def deepseek_evaluator_correctness(outputs: dict, reference_outputs: dict) -> CorrectnessGrade:
+    llm = ChatDeepSeek(
         model="deepseek-chat",
         temperature=0.5,
-        api_key=DEEKSEEK_API_KEY
-    ),
-)
+    )
+    messages = [
+        {"role": "system", "content": CORRECTNESS_PROMPT},
+        {"role": "user", "content": f"""Ground Truth answer: {reference_outputs["answer"]};
+            Student's Answer: {outputs['response']}
+            """}
+    ]
+    response = llm.invoke(messages)
+    return CorrectnessGrade(score=int(response.content)).score
-moonshot_evaluator_correctness = create_llm_as_judge(
-    prompt=CORRECTNESS_PROMPT,
-    judge=Moonshot(
-        model="moonshot-v1-128k",
+def moonshot_evaluator_correctness(outputs: dict, reference_outputs: dict) -> CorrectnessGrade:
+    llm = Moonshot(
+        model="moonshot-v1-128k",
         temperature=0.5,
-        api_key=MOONSHOT_API_KEY
-    ),
-)
+    )
+    messages = [
+        {"role": "system", "content": CORRECTNESS_PROMPT},
+        {"role": "user", "content": f"""Ground Truth answer: {reference_outputs["answer"]};
+            Student's Answer: {outputs['response']}
+            """}
+    ]
+    response = llm.invoke(messages)
+    return CorrectnessGrade(score=int(response)).score
-# conciseness
-gemini_evaluator_conciseness = create_llm_as_judge(
-    prompt=CONCISENESS_PROMPT,
-    judge=ChatGoogleGenerativeAI(
-        model="gemini-1.5-flash",
-        google_api_key=GEMINI_API_KEY,
-        temperature=0.5,
-    ),
-)
-deepseek_evaluator_conciseness = create_llm_as_judge(
-    prompt=CONCISENESS_PROMPT,
-    judge=ChatDeepSeek(
-        model="deepseek-chat",
-        temperature=0.5,
-        api_key=DEEKSEEK_API_KEY
-    ),
-)
-moonshot_evaluator_conciseness = create_llm_as_judge(
-    prompt=CONCISENESS_PROMPT,
-    judge=Moonshot(
-        model="moonshot-v1-128k",
-        temperature=0.5,
-        api_key=MOONSHOT_API_KEY
-    ),
-)
-# hallucination
-gemini_evaluator_hallucination = create_llm_as_judge(
-    prompt=HALLUCINATION_PROMPT,
-    judge=ChatGoogleGenerativeAI(
-        model="gemini-1.5-flash",
-        google_api_key=GEMINI_API_KEY,
-        temperature=0.5,
-    ),
-)
-deepseek_evaluator_hallucination = create_llm_as_judge(
-    prompt=HALLUCINATION_PROMPT,
-    judge=ChatDeepSeek(
-        model="deepseek-chat",
-        temperature=0.5,
-        api_key=DEEKSEEK_API_KEY
-    ),
-)
-moonshot_evaluator_hallucination = create_llm_as_judge(
-    prompt=HALLUCINATION_PROMPT,
-    judge=Moonshot(
-        model="moonshot-v1-128k",
-        temperature=0.5,
-        api_key=MOONSHOT_API_KEY
-    ),
-)
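
These evaluator functions can also be exercised directly, outside of a LangSmith run. A minimal sketch (illustrative only, not part of this commit; it reuses the sample question/answer pair from the commented-out dataset later in this diff and assumes the relevant API key is configured):

from evaluators import gemini_evaluator_correctness

outputs = {"response": "Mount Kilimanjaro is located in Tanzania."}
reference_outputs = {"answer": "Mount Kilimanjaro is located in Tanzania."}

# The judge replies with a bare 1-5 score, which the function validates through
# CorrectnessGrade before returning it as an int.
score = gemini_evaluator_correctness(outputs, reference_outputs)
print(score)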

View File

@@ -16,4 +16,39 @@ system_answer_grader = """You are a grader assessing whether an answer addresses
 Give a binary score 'yes' or 'no'. 'Yes' means that the answer resolves the question."""
 system_question_rewriter = """You are a question re-writer that converts an input question to a better version that is optimized \n
-for vectorstore retrieval. Look at the input and try to reason about the underlying semantic intent / meaning."""
+for vectorstore retrieval. Look at the input and try to reason about the underlying semantic intent / meaning."""
+
+# Evaluation
+CORRECTNESS_PROMPT = """You are an impartial judge. Evaluate the Student Answer against the Ground Truth for conceptual similarity and correctness.
+You may also be given additional information that was used by the model to generate the output.
+Your task is to determine a numerical score called correctness based on the input and output.
+A definition of correctness and a grading rubric are provided below.
+You must use the grading rubric to determine your score.
+
+Metric definition:
+Correctness assesses the degree to which a provided output aligns with factual accuracy, completeness, logical
+consistency, and precise terminology. It evaluates the intrinsic validity of the output, independent of any
+external context. A higher score indicates a higher adherence to factual accuracy, completeness, logical consistency,
+and precise terminology.
+
+Grading rubric:
+Correctness: Below are the details for different scores:
+- 1: Major factual errors, highly incomplete, illogical, and uses incorrect terminology.
+- 2: Significant factual errors, incomplete, noticeable logical flaws, and frequent terminology errors.
+- 3: Minor factual errors, somewhat incomplete, minor logical inconsistencies, and occasional terminology errors.
+- 4: Few to no factual errors, mostly complete, strong logical consistency, and accurate terminology.
+- 5: Accurate, complete, logically consistent, and uses precise terminology.
+
+Reminder:
+- Carefully read the input and output.
+- Check for factual accuracy and completeness.
+- Focus on correctness of information rather than style or verbosity.
+- The goal is to evaluate the factual correctness and completeness of the response.
+- Respond with only the numerical score, an integer between 1 and 5. Do not include "Score:" or any other text.
+"""

View File

@@ -24,7 +24,6 @@ build_dependencies:
   - tavily-python
   - langchain_huggingface
   - pydantic
-  - openevals
 # Dependencies required to run the project.
 dependencies:
   - mlflow==2.8.1

View File

@@ -38,30 +38,26 @@ from evaluators import (
     gemini_evaluator_correctness,
     deepseek_evaluator_correctness,
     moonshot_evaluator_correctness,
-    gemini_evaluator_conciseness,
-    deepseek_evaluator_conciseness,
-    moonshot_evaluator_conciseness,
-    gemini_evaluator_hallucination,
-    deepseek_evaluator_hallucination,
-    moonshot_evaluator_hallucination
+    # gemini_evaluator_conciseness,
+    # deepseek_evaluator_conciseness,
+    # moonshot_evaluator_conciseness,
+    # gemini_evaluator_hallucination,
+    # deepseek_evaluator_hallucination,
+    # moonshot_evaluator_hallucination
 )
 logging.basicConfig(level=logging.INFO, format="%(asctime)-15s %(message)s")
 logger = logging.getLogger()
-GEMINI_API_KEY = config("GOOGLE_API_KEY", cast=str)
-DEEKSEEK_API_KEY = config("DEEKSEEK_API_KEY", cast=str)
-MOONSHOT_API_KEY = config("MOONSHOT_API_KEY", cast=str)
-TAVILY_API_KEY = config("TAVILY_API_KEY", cast=str)
-LANGSMITH_API_KEY = config("LANGSMITH_API_KEY", cast=str)
-LANGSMITH_TRACING = config("LANGSMITH_TRACING", cast=str)
-LANGSMITH_PROJECT = config("LANGSMITH_PROJECT", cast=str)
-os.environ["TAVILY_API_KEY"] = TAVILY_API_KEY
+os.environ["GOOGLE_API_KEY"] = config("GOOGLE_API_KEY", cast=str)
+os.environ["DEEPSEEK_API_KEY"] = config("DEEPSEEK_API_KEY", cast=str)
+os.environ["MOONSHOT_API_KEY"] = config("MOONSHOT_API_KEY", cast=str)
+os.environ["TAVILY_API_KEY"] = config("TAVILY_API_KEY", cast=str)
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
-os.environ["LANGSMITH_API_KEY"] = LANGSMITH_API_KEY
-os.environ["LANGSMITH_TRACING"] = LANGSMITH_TRACING
+os.environ["LANGSMITH_API_KEY"] = config("LANGSMITH_API_KEY", cast=str)
+os.environ["LANGSMITH_TRACING"] = config("LANGSMITH_TRACING", cast=str)
 os.environ["LANGSMITH_ENDPOINT"] = "https://api.smith.langchain.com"
-os.environ["LANGSMITH_PROJECT"] = LANGSMITH_PROJECT
+os.environ["LANGSMITH_PROJECT"] = config("LANGSMITH_PROJECT", cast=str)
 def go(args):
@@ -95,12 +91,10 @@ def go(args):
             max_tokens=None,
             timeout=None,
             max_retries=2,
-            api_key=DEEKSEEK_API_KEY
         )
     elif args.chat_model_provider == 'gemini':
         llm = ChatGoogleGenerativeAI(
             model="gemini-1.5-flash",
-            google_api_key=GEMINI_API_KEY,
             temperature=0,
             max_retries=3,
             streaming=True
@@ -112,7 +106,6 @@ def go(args):
             max_tokens=None,
             timeout=None,
             max_retries=2,
-            api_key=MOONSHOT_API_KEY
         )
     # Load data from ChromaDB
@@ -479,7 +472,61 @@ def go(args):
         pprint("\n---\n")
     # Final generation
-    pprint(value["generation"])
+    print(value["generation"])
+    return {"response": value["generation"]}
+
+def go_evaluation(args):
+    if args.query_evaluation_dataset_csv_path:
+        # import pandas as pd
+        # from tqdm import tqdm
+        # df = pd.read_csv(args.query_evaluation_dataset_csv_path)
+        client = Client()
+        # # Create inputs and reference outputs
+        # examples = [
+        #     (
+        #         "Which country is Mount Kilimanjaro located in?",
+        #         "Mount Kilimanjaro is located in Tanzania.",
+        #     ),
+        #     (
+        #         "What is Earth's lowest point?",
+        #         "Earth's lowest point is The Dead Sea.",
+        #     ),
+        # ]
+        # inputs = [{"question": input_prompt} for input_prompt, _ in examples]
+        # outputs = [{"answer": output_answer} for _, output_answer in examples]
+        # # Programmatically create a dataset in LangSmith
+        # dataset = client.create_dataset(
+        #     dataset_name = "Sample dataset",
+        #     description = "A sample dataset in LangSmith."
+        # )
+        # # Add examples to the dataset
+        # client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id)
+
+        def target(inputs: dict) -> dict:
+            new_args = argparse.Namespace(**vars(args))
+            new_args.query = inputs["question"]
+            return go(new_args)
+
+        # After running the evaluation, a link will be provided to view the results in langsmith
+        experiment_results = client.evaluate(
+            target,
+            data = "Sample dataset",
+            evaluators = [
+                moonshot_evaluator_correctness,
+                deepseek_evaluator_correctness,
+                gemini_evaluator_correctness
+                # can add multiple evaluators here
+            ],
+            experiment_prefix = "first-eval-in-langsmith",
+            max_concurrency = 1,
+        )
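
evaluate() prints a LangSmith URL where the experiment can be inspected. For a quick local look, recent versions of the langsmith SDK also expose the results as a DataFrame; a sketch that could follow the evaluate() call above, assuming ExperimentResults.to_pandas() is available in the installed SDK version (this call is an assumption, not part of the commit):

        # Optional: inspect the experiment locally (requires pandas; the same results
        # are available through the LangSmith link printed by evaluate()).
        df = experiment_results.to_pandas()
        print(df.head())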
@@ -523,4 +570,5 @@ if __name__ == "__main__":
     args = parser.parse_args()
-    go(args)
+    # go(args)
+    go_evaluation(args)

View File

@@ -14,18 +14,14 @@ from langchain_community.llms.moonshot import Moonshot
 logging.basicConfig(level=logging.INFO, format="%(asctime)-15s %(message)s")
 logger = logging.getLogger()
-GEMINI_API_KEY = config("GOOGLE_API_KEY", cast=str)
-DEEKSEEK_API_KEY = config("DEEKSEEK_API_KEY", cast=str)
-MOONSHOT_API_KEY = config("MOONSHOT_API_KEY", cast=str)
-LANGSMITH_API_KEY = config("LANGSMITH_API_KEY", cast=str)
-LANGSMITH_TRACING = config("LANGSMITH_TRACING", cast=str)
-LANGSMITH_PROJECT = config("LANGSMITH_PROJECT", cast=str)
-os.environ["TOKENIZERS_PARALLELISM"] = "false"
-os.environ["LANGSMITH_API_KEY"] = LANGSMITH_API_KEY
-os.environ["LANGSMITH_TRACING"] = LANGSMITH_TRACING
+os.environ["GOOGLE_API_KEY"] = config("GOOGLE_API_KEY", cast=str)
+os.environ["DEEPSEEK_API_KEY"] = config("DEEPSEEK_API_KEY", cast=str)
+os.environ["MOONSHOT_API_KEY"] = config("MOONSHOT_API_KEY", cast=str)
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+os.environ["LANGSMITH_API_KEY"] = config("LANGSMITH_API_KEY", cast=str)
+os.environ["LANGSMITH_TRACING"] = config("LANGSMITH_TRACING", cast=str)
 os.environ["LANGSMITH_ENDPOINT"] = "https://api.smith.langchain.com"
-os.environ["LANGSMITH_PROJECT"] = LANGSMITH_PROJECT
+os.environ["LANGSMITH_PROJECT"] = config("LANGSMITH_PROJECT", cast=str)
 def go(args):
@@ -68,14 +64,12 @@ def go(args):
             max_tokens=None,
             timeout=None,
             max_retries=2,
-            api_key=DEEKSEEK_API_KEY
         )
     elif args.chat_model_provider == "gemini":
         # Initialize Gemini model
         llm = ChatGoogleGenerativeAI(
             model="gemini-1.5-flash",
-            google_api_key=GEMINI_API_KEY,
             temperature=0,
             max_retries=3
         )
@@ -88,7 +82,6 @@ def go(args):
             max_tokens=None,
             timeout=None,
             max_retries=2,
-            api_key=MOONSHOT_API_KEY
         )