correctness done

leehk 2025-03-13 15:04:21 +08:00
parent 486a79a2cc
commit fcb2f9e4ea
6 changed files with 174 additions and 121 deletions

View File

@@ -1,4 +1,4 @@
-from typing import Literal, List
+from typing import Literal
 from pydantic import BaseModel, Field

View File

@@ -1,99 +1,77 @@
+import os
 from decouple import config
-from openevals.llm import create_llm_as_judge
-from openevals.prompts import (
-    CORRECTNESS_PROMPT,
-    CONCISENESS_PROMPT,
-    HALLUCINATION_PROMPT
-)
 from langchain_google_genai import ChatGoogleGenerativeAI
 from langchain_deepseek import ChatDeepSeek
 from langchain_community.llms.moonshot import Moonshot
-GEMINI_API_KEY = config("GOOGLE_API_KEY", cast=str)
-DEEKSEEK_API_KEY = config("DEEKSEEK_API_KEY", cast=str)
-MOONSHOT_API_KEY = config("MOONSHOT_API_KEY", cast=str)
+from pydantic import BaseModel, Field
+from prompts_library import CORRECTNESS_PROMPT
+os.environ["GOOGLE_API_KEY"] = config("GOOGLE_API_KEY", cast=str)
+os.environ["DEEPSEEK_API_KEY"] = config("DEEPSEEK_API_KEY", cast=str)
+os.environ["MOONSHOT_API_KEY"] = config("MOONSHOT_API_KEY", cast=str)
-# correctness
-gemini_evaluator_correctness = create_llm_as_judge(
-    prompt=CORRECTNESS_PROMPT,
-    judge=ChatGoogleGenerativeAI(
+# Define output schema for the evaluation
+class CorrectnessGrade(BaseModel):
+    score: int = Field(description="Numerical score (1-5) indicating the correctness of the response.")
+# Todo:
+# class RelevanceGrade(BaseModel):
+def gemini_evaluator_correctness(outputs: dict, reference_outputs: dict) -> CorrectnessGrade:
+    llm = ChatGoogleGenerativeAI(
         model="gemini-1.5-flash",
-        google_api_key=GEMINI_API_KEY,
         temperature=0.5,
-    ),
-)
+    )
+    messages = [
+        {"role": "system", "content": CORRECTNESS_PROMPT},
+        {"role": "user", "content": f"""Ground Truth answer: {reference_outputs["answer"]};
+            Student's Answer: {outputs['response']}
+            """}
+    ]
+    response = llm.invoke(messages)
+    return CorrectnessGrade(score=int(response.content)).score
-deepseek_evaluator_correctness = create_llm_as_judge(
-    prompt=CORRECTNESS_PROMPT,
-    judge=ChatDeepSeek(
+def deepseek_evaluator_correctness(outputs: dict, reference_outputs: dict) -> CorrectnessGrade:
+    llm = ChatDeepSeek(
         model="deepseek-chat",
         temperature=0.5,
-        api_key=DEEKSEEK_API_KEY
-    ),
-)
+    )
+    messages = [
+        {"role": "system", "content": CORRECTNESS_PROMPT},
+        {"role": "user", "content": f"""Ground Truth answer: {reference_outputs["answer"]};
+            Student's Answer: {outputs['response']}
+            """}
+    ]
+    response = llm.invoke(messages)
+    return CorrectnessGrade(score=int(response.content)).score
-moonshot_evaluator_correctness = create_llm_as_judge(
-    prompt=CORRECTNESS_PROMPT,
-    judge=Moonshot(
-        model="moonshot-v1-128k",
+def moonshot_evaluator_correctness(outputs: dict, reference_outputs: dict) -> CorrectnessGrade:
+    llm = Moonshot(
+        model="moonshot-v1-128k",
         temperature=0.5,
-        api_key=MOONSHOT_API_KEY
-    ),
-)
+    )
+    messages = [
+        {"role": "system", "content": CORRECTNESS_PROMPT},
+        {"role": "user", "content": f"""Ground Truth answer: {reference_outputs["answer"]};
+            Student's Answer: {outputs['response']}
+            """}
+    ]
+    response = llm.invoke(messages)
+    return CorrectnessGrade(score=int(response)).score
-# conciseness
-gemini_evaluator_conciseness = create_llm_as_judge(
-    prompt=CONCISENESS_PROMPT,
-    judge=ChatGoogleGenerativeAI(
-        model="gemini-1.5-flash",
-        google_api_key=GEMINI_API_KEY,
-        temperature=0.5,
-    ),
-)
-deepseek_evaluator_conciseness = create_llm_as_judge(
-    prompt=CONCISENESS_PROMPT,
-    judge=ChatDeepSeek(
-        model="deepseek-chat",
-        temperature=0.5,
-        api_key=DEEKSEEK_API_KEY
-    ),
-)
-moonshot_evaluator_conciseness = create_llm_as_judge(
-    prompt=CONCISENESS_PROMPT,
-    judge=Moonshot(
-        model="moonshot-v1-128k",
-        temperature=0.5,
-        api_key=MOONSHOT_API_KEY
-    ),
-)
-# hallucination
-gemini_evaluator_hallucination = create_llm_as_judge(
-    prompt=HALLUCINATION_PROMPT,
-    judge=ChatGoogleGenerativeAI(
-        model="gemini-1.5-flash",
-        google_api_key=GEMINI_API_KEY,
-        temperature=0.5,
-    ),
-)
-deepseek_evaluator_hallucination = create_llm_as_judge(
-    prompt=HALLUCINATION_PROMPT,
-    judge=ChatDeepSeek(
-        model="deepseek-chat",
-        temperature=0.5,
-        api_key=DEEKSEEK_API_KEY
-    ),
-)
-moonshot_evaluator_hallucination = create_llm_as_judge(
-    prompt=HALLUCINATION_PROMPT,
-    judge=Moonshot(
-        model="moonshot-v1-128k",
-        temperature=0.5,
-        api_key=MOONSHOT_API_KEY
-    ),
-)
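
These evaluator functions can also be exercised directly, outside of a LangSmith run. A minimal sketch (illustrative only, not part of this commit; it reuses the sample question/answer pair from the commented-out dataset later in this diff and assumes the relevant API key is configured):

from evaluators import gemini_evaluator_correctness

outputs = {"response": "Mount Kilimanjaro is located in Tanzania."}
reference_outputs = {"answer": "Mount Kilimanjaro is located in Tanzania."}

# The judge replies with a bare 1-5 score, which the function validates through
# CorrectnessGrade before returning it as an int.
score = gemini_evaluator_correctness(outputs, reference_outputs)
print(score)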

View File

@@ -16,4 +16,39 @@ system_answer_grader = """You are a grader assessing whether an answer addresses
 Give a binary score 'yes' or 'no'. 'Yes' means that the answer resolves the question."""
 system_question_rewriter = """You are a question re-writer that converts an input question to a better version that is optimized \n
-for vectorstore retrieval. Look at the input and try to reason about the underlying semantic intent / meaning."""
+for vectorstore retrieval. Look at the input and try to reason about the underlying semantic intent / meaning."""
+
+# Evaluation
+CORRECTNESS_PROMPT = """You are an impartial judge. Evaluate the Student Answer against the Ground Truth for conceptual similarity and correctness.
+You may also be given additional information that was used by the model to generate the output.
+Your task is to determine a numerical score called correctness based on the input and output.
+A definition of correctness and a grading rubric are provided below.
+You must use the grading rubric to determine your score.
+
+Metric definition:
+Correctness assesses the degree to which a provided output aligns with factual accuracy, completeness, logical
+consistency, and precise terminology. It evaluates the intrinsic validity of the output, independent of any
+external context. A higher score indicates a higher adherence to factual accuracy, completeness, logical consistency,
+and precise terminology.
+
+Grading rubric:
+Correctness: Below are the details for different scores:
+- 1: Major factual errors, highly incomplete, illogical, and uses incorrect terminology.
+- 2: Significant factual errors, incomplete, noticeable logical flaws, and frequent terminology errors.
+- 3: Minor factual errors, somewhat incomplete, minor logical inconsistencies, and occasional terminology errors.
+- 4: Few to no factual errors, mostly complete, strong logical consistency, and accurate terminology.
+- 5: Accurate, complete, logically consistent, and uses precise terminology.
+
+Reminder:
+- Carefully read the input and output.
+- Check for factual accuracy and completeness.
+- Focus on correctness of information rather than style or verbosity.
+- The goal is to evaluate the factual correctness and completeness of the response.
+- Respond with only the numerical score, an integer between 1 and 5. Do not include "Score:" or any other text.
+"""

View File

@@ -24,7 +24,6 @@ build_dependencies:
   - tavily-python
   - langchain_huggingface
   - pydantic
-  - openevals
 # Dependencies required to run the project.
 dependencies:
   - mlflow==2.8.1

View File

@@ -38,30 +38,26 @@ from evaluators import (
     gemini_evaluator_correctness,
     deepseek_evaluator_correctness,
     moonshot_evaluator_correctness,
-    gemini_evaluator_conciseness,
-    deepseek_evaluator_conciseness,
-    moonshot_evaluator_conciseness,
-    gemini_evaluator_hallucination,
-    deepseek_evaluator_hallucination,
-    moonshot_evaluator_hallucination
+    # gemini_evaluator_conciseness,
+    # deepseek_evaluator_conciseness,
+    # moonshot_evaluator_conciseness,
+    # gemini_evaluator_hallucination,
+    # deepseek_evaluator_hallucination,
+    # moonshot_evaluator_hallucination
 )
 logging.basicConfig(level=logging.INFO, format="%(asctime)-15s %(message)s")
 logger = logging.getLogger()
-GEMINI_API_KEY = config("GOOGLE_API_KEY", cast=str)
-DEEKSEEK_API_KEY = config("DEEKSEEK_API_KEY", cast=str)
-MOONSHOT_API_KEY = config("MOONSHOT_API_KEY", cast=str)
-TAVILY_API_KEY = config("TAVILY_API_KEY", cast=str)
-LANGSMITH_API_KEY = config("LANGSMITH_API_KEY", cast=str)
-LANGSMITH_TRACING = config("LANGSMITH_TRACING", cast=str)
-LANGSMITH_PROJECT = config("LANGSMITH_PROJECT", cast=str)
-os.environ["TAVILY_API_KEY"] = TAVILY_API_KEY
+os.environ["GOOGLE_API_KEY"] = config("GOOGLE_API_KEY", cast=str)
+os.environ["DEEPSEEK_API_KEY"] = config("DEEPSEEK_API_KEY", cast=str)
+os.environ["MOONSHOT_API_KEY"] = config("MOONSHOT_API_KEY", cast=str)
+os.environ["TAVILY_API_KEY"] = config("TAVILY_API_KEY", cast=str)
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
-os.environ["LANGSMITH_API_KEY"] = LANGSMITH_API_KEY
-os.environ["LANGSMITH_TRACING"] = LANGSMITH_TRACING
+os.environ["LANGSMITH_API_KEY"] = config("LANGSMITH_API_KEY", cast=str)
+os.environ["LANGSMITH_TRACING"] = config("LANGSMITH_TRACING", cast=str)
 os.environ["LANGSMITH_ENDPOINT"] = "https://api.smith.langchain.com"
-os.environ["LANGSMITH_PROJECT"] = LANGSMITH_PROJECT
+os.environ["LANGSMITH_PROJECT"] = config("LANGSMITH_PROJECT", cast=str)
 def go(args):
@@ -95,12 +91,10 @@ def go(args):
             max_tokens=None,
             timeout=None,
             max_retries=2,
-            api_key=DEEKSEEK_API_KEY
         )
     elif args.chat_model_provider == 'gemini':
         llm = ChatGoogleGenerativeAI(
             model="gemini-1.5-flash",
-            google_api_key=GEMINI_API_KEY,
             temperature=0,
             max_retries=3,
             streaming=True
@@ -112,7 +106,6 @@ def go(args):
             max_tokens=None,
             timeout=None,
             max_retries=2,
-            api_key=MOONSHOT_API_KEY
         )
     # Load data from ChromaDB
@@ -479,7 +472,61 @@ def go(args):
         pprint("\n---\n")
     # Final generation
-    pprint(value["generation"])
+    print(value["generation"])
+    return {"response": value["generation"]}
+
+def go_evaluation(args):
+    if args.query_evaluation_dataset_csv_path:
+        # import pandas as pd
+        # from tqdm import tqdm
+        # df = pd.read_csv(args.query_evaluation_dataset_csv_path)
+        client = Client()
+        # # Create inputs and reference outputs
+        # examples = [
+        #     (
+        #         "Which country is Mount Kilimanjaro located in?",
+        #         "Mount Kilimanjaro is located in Tanzania.",
+        #     ),
+        #     (
+        #         "What is Earth's lowest point?",
+        #         "Earth's lowest point is The Dead Sea.",
+        #     ),
+        # ]
+        # inputs = [{"question": input_prompt} for input_prompt, _ in examples]
+        # outputs = [{"answer": output_answer} for _, output_answer in examples]
+        # # Programmatically create a dataset in LangSmith
+        # dataset = client.create_dataset(
+        #     dataset_name = "Sample dataset",
+        #     description = "A sample dataset in LangSmith."
+        # )
+        # # Add examples to the dataset
+        # client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id)
+
+        def target(inputs: dict) -> dict:
+            new_args = argparse.Namespace(**vars(args))
+            new_args.query = inputs["question"]
+            return go(new_args)
+
+        # After running the evaluation, a link will be provided to view the results in langsmith
+        experiment_results = client.evaluate(
+            target,
+            data = "Sample dataset",
+            evaluators = [
+                moonshot_evaluator_correctness,
+                deepseek_evaluator_correctness,
+                gemini_evaluator_correctness
+                # can add multiple evaluators here
+            ],
+            experiment_prefix = "first-eval-in-langsmith",
+            max_concurrency = 1,
+        )
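
evaluate() prints a LangSmith URL where the experiment can be inspected. For a quick local look, recent versions of the langsmith SDK also expose the results as a DataFrame; a sketch that could follow the evaluate() call above, assuming ExperimentResults.to_pandas() is available in the installed SDK version (this call is an assumption, not part of the commit):

        # Optional: inspect the experiment locally (requires pandas; the same results
        # are available through the LangSmith link printed by evaluate()).
        df = experiment_results.to_pandas()
        print(df.head())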
@@ -523,4 +570,5 @@ if __name__ == "__main__":
     args = parser.parse_args()
-    go(args)
+    # go(args)
+    go_evaluation(args)

View File

@@ -14,18 +14,14 @@ from langchain_community.llms.moonshot import Moonshot
 logging.basicConfig(level=logging.INFO, format="%(asctime)-15s %(message)s")
 logger = logging.getLogger()
-GEMINI_API_KEY = config("GOOGLE_API_KEY", cast=str)
-DEEKSEEK_API_KEY = config("DEEKSEEK_API_KEY", cast=str)
-MOONSHOT_API_KEY = config("MOONSHOT_API_KEY", cast=str)
-LANGSMITH_API_KEY = config("LANGSMITH_API_KEY", cast=str)
-LANGSMITH_TRACING = config("LANGSMITH_TRACING", cast=str)
-LANGSMITH_PROJECT = config("LANGSMITH_PROJECT", cast=str)
-os.environ["TOKENIZERS_PARALLELISM"] = "false"
-os.environ["LANGSMITH_API_KEY"] = LANGSMITH_API_KEY
-os.environ["LANGSMITH_TRACING"] = LANGSMITH_TRACING
+os.environ["GOOGLE_API_KEY"] = config("GOOGLE_API_KEY", cast=str)
+os.environ["DEEPSEEK_API_KEY"] = config("DEEPSEEK_API_KEY", cast=str)
+os.environ["MOONSHOT_API_KEY"] = config("MOONSHOT_API_KEY", cast=str)
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+os.environ["LANGSMITH_API_KEY"] = config("LANGSMITH_API_KEY", cast=str)
+os.environ["LANGSMITH_TRACING"] = config("LANGSMITH_TRACING", cast=str)
 os.environ["LANGSMITH_ENDPOINT"] = "https://api.smith.langchain.com"
-os.environ["LANGSMITH_PROJECT"] = LANGSMITH_PROJECT
+os.environ["LANGSMITH_PROJECT"] = config("LANGSMITH_PROJECT", cast=str)
 def go(args):
@@ -68,14 +64,12 @@ def go(args):
             max_tokens=None,
             timeout=None,
             max_retries=2,
-            api_key=DEEKSEEK_API_KEY
         )
     elif args.chat_model_provider == "gemini":
         # Initialize Gemini model
         llm = ChatGoogleGenerativeAI(
             model="gemini-1.5-flash",
-            google_api_key=GEMINI_API_KEY,
             temperature=0,
             max_retries=3
         )
@@ -88,7 +82,6 @@ def go(args):
             max_tokens=None,
             timeout=None,
             max_retries=2,
-            api_key=MOONSHOT_API_KEY
         )