diff --git a/app/llmops/src/rag_adaptive_evaluation/data_models.py b/app/llmops/src/rag_adaptive_evaluation/data_models.py
index 680cfbd..aeb193d 100644
--- a/app/llmops/src/rag_adaptive_evaluation/data_models.py
+++ b/app/llmops/src/rag_adaptive_evaluation/data_models.py
@@ -1,4 +1,4 @@
-from typing import Literal, List
+from typing import Literal
 
 from pydantic import BaseModel, Field
 
diff --git a/app/llmops/src/rag_adaptive_evaluation/evaluators.py b/app/llmops/src/rag_adaptive_evaluation/evaluators.py
index 17b1b90..0d16dea 100644
--- a/app/llmops/src/rag_adaptive_evaluation/evaluators.py
+++ b/app/llmops/src/rag_adaptive_evaluation/evaluators.py
@@ -1,99 +1,77 @@
+import os
 from decouple import config
-from openevals.llm import create_llm_as_judge
-from openevals.prompts import (
-    CORRECTNESS_PROMPT,
-    CONCISENESS_PROMPT,
-    HALLUCINATION_PROMPT
-)
+
 from langchain_google_genai import ChatGoogleGenerativeAI
 from langchain_deepseek import ChatDeepSeek
 from langchain_community.llms.moonshot import Moonshot
 
-GEMINI_API_KEY = config("GOOGLE_API_KEY", cast=str)
-DEEKSEEK_API_KEY = config("DEEKSEEK_API_KEY", cast=str)
-MOONSHOT_API_KEY = config("MOONSHOT_API_KEY", cast=str)
+from pydantic import BaseModel, Field
 
-# correctness
-gemini_evaluator_correctness = create_llm_as_judge(
-    prompt=CORRECTNESS_PROMPT,
-    judge=ChatGoogleGenerativeAI(
+from prompts_library import CORRECTNESS_PROMPT
+
+os.environ["GOOGLE_API_KEY"] = config("GOOGLE_API_KEY", cast=str)
+os.environ["DEEPSEEK_API_KEY"] = config("DEEPSEEK_API_KEY", cast=str)
+os.environ["MOONSHOT_API_KEY"] = config("MOONSHOT_API_KEY", cast=str)
+
+
+# Define output schema for the evaluation
+class CorrectnessGrade(BaseModel):
+    score: int = Field(description="Numerical score (1-5) indicating the correctness of the response.")
+
+# TODO:
+# class RelevanceGrade(BaseModel):
+
+
+def gemini_evaluator_correctness(outputs: dict, reference_outputs: dict) -> int:
+    llm = ChatGoogleGenerativeAI(
         model="gemini-1.5-flash",
-        google_api_key=GEMINI_API_KEY,
         temperature=0.5,
-    ),
-    )
+    )
 
-deepseek_evaluator_correctness = create_llm_as_judge(
-    prompt=CORRECTNESS_PROMPT,
-    judge=ChatDeepSeek(
+    messages = [
+        {"role": "system", "content": CORRECTNESS_PROMPT},
+        {"role": "user", "content": f"""Ground Truth answer: {reference_outputs["answer"]};
+        Student's Answer: {outputs['response']}
+        """}
+    ]
+
+    response = llm.invoke(messages)
+
+    return CorrectnessGrade(score=int(response.content)).score
+
+
+def deepseek_evaluator_correctness(outputs: dict, reference_outputs: dict) -> int:
+    llm = ChatDeepSeek(
         model="deepseek-chat",
         temperature=0.5,
-        api_key=DEEKSEEK_API_KEY
-    ),
-    )
+    )
 
-moonshot_evaluator_correctness = create_llm_as_judge(
-    prompt=CORRECTNESS_PROMPT,
-    judge=Moonshot(
-        model="moonshot-v1-128k",
+    messages = [
+        {"role": "system", "content": CORRECTNESS_PROMPT},
+        {"role": "user", "content": f"""Ground Truth answer: {reference_outputs["answer"]};
+        Student's Answer: {outputs['response']}
+        """}
+    ]
+
+    response = llm.invoke(messages)
+
+    return CorrectnessGrade(score=int(response.content)).score
+
+
+def moonshot_evaluator_correctness(outputs: dict, reference_outputs: dict) -> int:
+    llm = Moonshot(
+        model="moonshot-v1-128k",
         temperature=0.5,
-        api_key=MOONSHOT_API_KEY
-    ),
-    )
+    )
 
-# conciseness
-gemini_evaluator_conciseness = create_llm_as_judge(
-    prompt=CONCISENESS_PROMPT,
-    judge=ChatGoogleGenerativeAI(
-        model="gemini-1.5-flash",
-        google_api_key=GEMINI_API_KEY,
-        temperature=0.5,
-    ),
-    )
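+    # Same judge prompt shape as the evaluators above: the grading rubric goes in
+    # the system message; the ground truth and candidate answer go in the user message.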
+    messages = [
+        {"role": "system", "content": CORRECTNESS_PROMPT},
+        {"role": "user", "content": f"""Ground Truth answer: {reference_outputs["answer"]};
+        Student's Answer: {outputs['response']}
+        """}
+    ]
 
-deepseek_evaluator_conciseness = create_llm_as_judge(
-    prompt=CONCISENESS_PROMPT,
-    judge=ChatDeepSeek(
-        model="deepseek-chat",
-        temperature=0.5,
-        api_key=DEEKSEEK_API_KEY
-    ),
-    )
-
-moonshot_evaluator_conciseness = create_llm_as_judge(
-    prompt=CONCISENESS_PROMPT,
-    judge=Moonshot(
-        model="moonshot-v1-128k",
-        temperature=0.5,
-        api_key=MOONSHOT_API_KEY
-    ),
-    )
-
-# hallucination
-gemini_evaluator_hallucination = create_llm_as_judge(
-    prompt=HALLUCINATION_PROMPT,
-    judge=ChatGoogleGenerativeAI(
-        model="gemini-1.5-flash",
-        google_api_key=GEMINI_API_KEY,
-        temperature=0.5,
-    ),
-    )
-
-deepseek_evaluator_hallucination = create_llm_as_judge(
-    prompt=HALLUCINATION_PROMPT,
-    judge=ChatDeepSeek(
-        model="deepseek-chat",
-        temperature=0.5,
-        api_key=DEEKSEEK_API_KEY
-    ),
-    )
-
-moonshot_evaluator_hallucination = create_llm_as_judge(
-    prompt=HALLUCINATION_PROMPT,
-    judge=Moonshot(
-        model="moonshot-v1-128k",
-        temperature=0.5,
-        api_key=MOONSHOT_API_KEY
-    ),
-    )
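+    # Moonshot here is a completion-style LLM wrapper, so invoke() returns a raw
+    # string rather than a chat message with .content; it is parsed directly below.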
+
+    response = llm.invoke(messages)
+
+    return CorrectnessGrade(score=int(response)).score
diff --git a/app/llmops/src/rag_adaptive_evaluation/prompts_library.py b/app/llmops/src/rag_adaptive_evaluation/prompts_library.py
index fcaf564..3bfed18 100644
--- a/app/llmops/src/rag_adaptive_evaluation/prompts_library.py
+++ b/app/llmops/src/rag_adaptive_evaluation/prompts_library.py
@@ -16,4 +16,39 @@ system_answer_grader = """You are a grader assessing whether an answer addresses
     Give a binary score 'yes' or 'no'. Yes' means that the answer resolves the question."""
 
 system_question_rewriter = """You a question re-writer that converts an input question to a better version that is optimized \n
-    for vectorstore retrieval. Look at the input and try to reason about the underlying semantic intent / meaning."""
\ No newline at end of file
+    for vectorstore retrieval. Look at the input and try to reason about the underlying semantic intent / meaning."""
+
+
+# Evaluation
+CORRECTNESS_PROMPT = """You are an impartial judge. Evaluate the Student Answer against the Ground Truth for conceptual similarity and correctness.
+You may also be given additional information that was used by the model to generate the output.
+
+Your task is to determine a numerical score called correctness based on the input and output.
+A definition of correctness and a grading rubric are provided below.
+You must use the grading rubric to determine your score.
+
+Metric definition:
+Correctness assesses the degree to which a provided output aligns with factual accuracy, completeness, logical
+consistency, and precise terminology. It evaluates the intrinsic validity of the output, independent of any
+external context. A higher score indicates higher adherence to factual accuracy, completeness, logical consistency,
+and precise terminology.
+
+Grading rubric:
+Correctness: Below are the details for different scores:
+  - 1: Major factual errors, highly incomplete, illogical, and uses incorrect terminology.
+  - 2: Significant factual errors, incomplete, noticeable logical flaws, and frequent terminology errors.
+  - 3: Minor factual errors, somewhat incomplete, minor logical inconsistencies, and occasional terminology errors.
+  - 4: Few to no factual errors, mostly complete, strong logical consistency, and accurate terminology.
+  - 5: Accurate, complete, logically consistent, and uses precise terminology.
+
+Reminder:
+  - Carefully read the input and output.
+  - Check for factual accuracy and completeness.
+  - Focus on correctness of information rather than style or verbosity.
+  - The goal is to evaluate the factual correctness and completeness of the response.
+  - Reply with only the numerical score, an integer between 1 and 5. Do not include "score:" or any other text.
+
+"""
+
diff --git a/app/llmops/src/rag_adaptive_evaluation/python_env.yml b/app/llmops/src/rag_adaptive_evaluation/python_env.yml
index 451cdb7..2278969 100644
--- a/app/llmops/src/rag_adaptive_evaluation/python_env.yml
+++ b/app/llmops/src/rag_adaptive_evaluation/python_env.yml
@@ -24,7 +24,6 @@ build_dependencies:
   - tavily-python
   - langchain_huggingface
   - pydantic
-  - openevals
 # Dependencies required to run the project.
 dependencies:
   - mlflow==2.8.1
\ No newline at end of file
diff --git a/app/llmops/src/rag_adaptive_evaluation/run.py b/app/llmops/src/rag_adaptive_evaluation/run.py
index 1fe7543..c629de7 100644
--- a/app/llmops/src/rag_adaptive_evaluation/run.py
+++ b/app/llmops/src/rag_adaptive_evaluation/run.py
@@ -38,30 +38,26 @@ from evaluators import (
     gemini_evaluator_correctness,
     deepseek_evaluator_correctness,
     moonshot_evaluator_correctness,
-    gemini_evaluator_conciseness,
-    deepseek_evaluator_conciseness,
-    moonshot_evaluator_conciseness,
-    gemini_evaluator_hallucination,
-    deepseek_evaluator_hallucination,
-    moonshot_evaluator_hallucination
+    # gemini_evaluator_conciseness,
+    # deepseek_evaluator_conciseness,
+    # moonshot_evaluator_conciseness,
+    # gemini_evaluator_hallucination,
+    # deepseek_evaluator_hallucination,
+    # moonshot_evaluator_hallucination
 )
 
 logging.basicConfig(level=logging.INFO, format="%(asctime)-15s %(message)s")
 logger = logging.getLogger()
 
-GEMINI_API_KEY = config("GOOGLE_API_KEY", cast=str)
-DEEKSEEK_API_KEY = config("DEEKSEEK_API_KEY", cast=str)
-MOONSHOT_API_KEY = config("MOONSHOT_API_KEY", cast=str)
-TAVILY_API_KEY = config("TAVILY_API_KEY", cast=str)
-LANGSMITH_API_KEY = config("LANGSMITH_API_KEY", cast=str)
-LANGSMITH_TRACING = config("LANGSMITH_TRACING", cast=str)
-LANGSMITH_PROJECT = config("LANGSMITH_PROJECT", cast=str)
-os.environ["TAVILY_API_KEY"] = TAVILY_API_KEY
+os.environ["GOOGLE_API_KEY"] = config("GOOGLE_API_KEY", cast=str)
+os.environ["DEEPSEEK_API_KEY"] = config("DEEPSEEK_API_KEY", cast=str)
+os.environ["MOONSHOT_API_KEY"] = config("MOONSHOT_API_KEY", cast=str)
+os.environ["TAVILY_API_KEY"] = config("TAVILY_API_KEY", cast=str)
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
-os.environ["LANGSMITH_API_KEY"] = LANGSMITH_API_KEY
-os.environ["LANGSMITH_TRACING"] = LANGSMITH_TRACING
+os.environ["LANGSMITH_API_KEY"] = config("LANGSMITH_API_KEY", cast=str)
+os.environ["LANGSMITH_TRACING"] = config("LANGSMITH_TRACING", cast=str)
 os.environ["LANGSMITH_ENDPOINT"] = "https://api.smith.langchain.com"
-os.environ["LANGSMITH_PROJECT"] = LANGSMITH_PROJECT
+os.environ["LANGSMITH_PROJECT"] = config("LANGSMITH_PROJECT", cast=str)
 
 
 def go(args):
@@ -95,12 +91,10 @@
             max_tokens=None,
             timeout=None,
             max_retries=2,
-            api_key=DEEKSEEK_API_KEY
         )
     elif args.chat_model_provider == 'gemini':
         llm = ChatGoogleGenerativeAI(
             model="gemini-1.5-flash",
-            google_api_key=GEMINI_API_KEY,
             temperature=0,
             max_retries=3,
             streaming=True
@@ -112,7 +106,6 @@
             max_tokens=None,
             timeout=None,
             max_retries=2,
-            api_key=MOONSHOT_API_KEY
         )
 
     # Load data from ChromaDB
@@ -479,7 +472,61 @@
         pprint("\n---\n")
 
     # Final generation
-    pprint(value["generation"])
+    print(value["generation"])
+
+    return {"response": value["generation"]}
+
+
+def go_evaluation(args):
+    if args.query_evaluation_dataset_csv_path:
+        # import pandas as pd
+        # from tqdm import tqdm
+
+        # df = pd.read_csv(args.query_evaluation_dataset_csv_path)
+        client = Client()
+        # # Create inputs and reference outputs
+        # examples = [
+        #     (
+        #         "Which country is Mount Kilimanjaro located in?",
+        #         "Mount Kilimanjaro is located in Tanzania.",
+        #     ),
+        #     (
+        #         "What is Earth's lowest point?",
+        #         "Earth's lowest point is The Dead Sea.",
+        #     ),
+        # ]
+
+        # inputs = [{"question": input_prompt} for input_prompt, _ in examples]
+        # outputs = [{"answer": output_answer} for _, output_answer in examples]
+
+        # # Programmatically create a dataset in LangSmith
+        # dataset = client.create_dataset(
+        #     dataset_name = "Sample dataset",
+        #     description = "A sample dataset in LangSmith."
+        # )
+
+        # # Add examples to the dataset
+        # client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id)
+
+        def target(inputs: dict) -> dict:
+            new_args = argparse.Namespace(**vars(args))
+            new_args.query = inputs["question"]
+            return go(new_args)
+
+        # After running the evaluation, a link will be provided to view the results in LangSmith
+        experiment_results = client.evaluate(
+            target,
+            data = "Sample dataset",
+            evaluators = [
+                moonshot_evaluator_correctness,
+                deepseek_evaluator_correctness,
+                gemini_evaluator_correctness
+                # can add multiple evaluators here
+            ],
+            experiment_prefix = "first-eval-in-langsmith",
+            max_concurrency = 1,
+        )
@@ -523,4 +570,5 @@
     args = parser.parse_args()
 
-    go(args)
\ No newline at end of file
+    # go(args)
+    go_evaluation(args)
\ No newline at end of file
diff --git a/app/llmops/src/rag_cot_evaluation/run.py b/app/llmops/src/rag_cot_evaluation/run.py
index aa773b1..06484df 100644
--- a/app/llmops/src/rag_cot_evaluation/run.py
+++ b/app/llmops/src/rag_cot_evaluation/run.py
@@ -14,18 +14,14 @@ from langchain_community.llms.moonshot import Moonshot
 
 logging.basicConfig(level=logging.INFO, format="%(asctime)-15s %(message)s")
 logger = logging.getLogger()
 
+os.environ["GOOGLE_API_KEY"] = config("GOOGLE_API_KEY", cast=str)
+os.environ["DEEPSEEK_API_KEY"] = config("DEEPSEEK_API_KEY", cast=str)
+os.environ["MOONSHOT_API_KEY"] = config("MOONSHOT_API_KEY", cast=str)
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
-GEMINI_API_KEY = config("GOOGLE_API_KEY", cast=str)
-DEEKSEEK_API_KEY = config("DEEKSEEK_API_KEY", cast=str)
-MOONSHOT_API_KEY = config("MOONSHOT_API_KEY", cast=str)
-LANGSMITH_API_KEY = config("LANGSMITH_API_KEY", cast=str)
-LANGSMITH_TRACING = config("LANGSMITH_TRACING", cast=str)
-LANGSMITH_PROJECT = config("LANGSMITH_PROJECT", cast=str)
-os.environ["TOKENIZERS_PARALLELISM"] = "false"
-os.environ["LANGSMITH_API_KEY"] = LANGSMITH_API_KEY
-os.environ["LANGSMITH_TRACING"] = LANGSMITH_TRACING
+os.environ["LANGSMITH_API_KEY"] = config("LANGSMITH_API_KEY", cast=str)
+os.environ["LANGSMITH_TRACING"] = config("LANGSMITH_TRACING", cast=str)
 os.environ["LANGSMITH_ENDPOINT"] = "https://api.smith.langchain.com"
-os.environ["LANGSMITH_PROJECT"] = LANGSMITH_PROJECT
+os.environ["LANGSMITH_PROJECT"] = config("LANGSMITH_PROJECT", cast=str)
 
 
 def go(args):
@@ -68,14 +64,12 @@
             max_tokens=None,
             timeout=None,
             max_retries=2,
-            api_key=DEEKSEEK_API_KEY
         )
     elif args.chat_model_provider == "gemini":
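+        # No explicit google_api_key kwarg needed: ChatGoogleGenerativeAI falls back
+        # to the GOOGLE_API_KEY environment variable exported at the top of this module.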
"gemini": # Initialize Gemini model llm = ChatGoogleGenerativeAI( model="gemini-1.5-flash", - google_api_key=GEMINI_API_KEY, temperature=0, max_retries=3 ) @@ -88,7 +82,6 @@ def go(args): max_tokens=None, timeout=None, max_retries=2, - api_key=MOONSHOT_API_KEY )