From 86a2c1a055d048eb2697f2575c7255e3a702049a Mon Sep 17 00:00:00 2001 From: leehk Date: Thu, 13 Mar 2025 21:23:36 +0800 Subject: [PATCH] update for now --- app/llmops/config.yaml | 11 ++- app/llmops/main.py | 25 ++--- .../src/rag_adaptive_evaluation/MLproject | 22 ++++- app/llmops/src/rag_adaptive_evaluation/run.py | 94 +++++++++++-------- 4 files changed, 99 insertions(+), 53 deletions(-) diff --git a/app/llmops/config.yaml b/app/llmops/config.yaml index 5452f8c..33383ea 100644 --- a/app/llmops/config.yaml +++ b/app/llmops/config.yaml @@ -9,8 +9,15 @@ etl: path_document_folder: "../../../../data" run_id_documents: None embedding_model: paraphrase-multilingual-mpnet-base-v2 -prompt_engineering: +rag: run_id_chromadb: None chat_model_provider: gemini +testing: query: "如何治疗乳腺癌?" - query_evaluation_dataset_csv_path: "../../../../data/qa_datasets.csv" \ No newline at end of file +evaluation: + evaluation_dataset_csv_path: "../../../../data/qa_datasets.csv" + evaluation_dataset_column_question: question + evaluation_dataset_column_answer: answer + ls_chat_model_provider: + - gemini + - moonshot \ No newline at end of file diff --git a/app/llmops/main.py b/app/llmops/main.py index ac768b4..32d6a8b 100644 --- a/app/llmops/main.py +++ b/app/llmops/main.py @@ -104,7 +104,7 @@ def go(config: DictConfig): ) if "rag_cot_evaluation" in active_steps: - if config["prompt_engineering"]["run_id_chromadb"] == "None": + if config["rag"]["run_id_chromadb"] == "None": # Look for run_id that has artifact logged as documents run_id = None client = mlflow.tracking.MlflowClient() @@ -119,22 +119,22 @@ def go(config: DictConfig): if run_id is None: raise ValueError("No run_id found with artifact logged as documents") else: - run_id = config["prompt_engineering"]["run_id_chromadb"] + run_id = config["rag"]["run_id_chromadb"] _ = mlflow.run( os.path.join(hydra.utils.get_original_cwd(), "src", "rag_cot_evaluation"), "main", parameters={ - "query": config["prompt_engineering"]["query"], + "query": config["testing"]["query"], "input_chromadb_artifact": f'runs:/{run_id}/chromadb/chroma_db.zip', "embedding_model": config["etl"]["embedding_model"], - "chat_model_provider": config["prompt_engineering"]["chat_model_provider"] + "chat_model_provider": config["rag"]["chat_model_provider"] }, ) if "rag_adaptive_evaluation" in active_steps: - if config["prompt_engineering"]["run_id_chromadb"] == "None": + if config["rag"]["run_id_chromadb"] == "None": # Look for run_id that has artifact logged as documents run_id = None client = mlflow.tracking.MlflowClient() @@ -149,17 +149,20 @@ def go(config: DictConfig): if run_id is None: raise ValueError("No run_id found with artifact logged as documents") else: - run_id = config["prompt_engineering"]["run_id_chromadb"] + run_id = config["rag"]["run_id_chromadb"] _ = mlflow.run( os.path.join(hydra.utils.get_original_cwd(), "src", "rag_adaptive_evaluation"), "main", parameters={ - "query": config["prompt_engineering"]["query"], - "query_evaluation_dataset_csv_path": config["prompt_engineering"]["query_evaluation_dataset_csv_path"], + "query": config["testing"]["query"], + "evaluation_dataset_csv_path": config["evaluation"]["evaluation_dataset_csv_path"], + "evaluation_dataset_column_question": config["evaluation"]["evaluation_dataset_column_question"], + "evaluation_dataset_column_answer": config["evaluation"]["evaluation_dataset_column_answer"], "input_chromadb_artifact": f'runs:/{run_id}/chromadb/chroma_db.zip', "embedding_model": config["etl"]["embedding_model"], - 
"chat_model_provider": config["prompt_engineering"]["chat_model_provider"] + "chat_model_provider": config["rag"]["chat_model_provider"], + "ls_chat_model_evaluator": ','.join(config["evaluation"]["ls_chat_model_provider"]) if config["evaluation"]["ls_chat_model_provider"] is not None else 'None', }, ) @@ -169,10 +172,10 @@ def go(config: DictConfig): os.path.join(hydra.utils.get_original_cwd(), "components", "test_rag_cot"), "main", parameters={ - "query": config["prompt_engineering"]["query"], + "query": config["testing"]["query"], "input_chromadb_local": os.path.join(hydra.utils.get_original_cwd(), "src", "rag_cot_evaluation", "chroma_db"), "embedding_model": config["etl"]["embedding_model"], - "chat_model_provider": config["prompt_engineering"]["chat_model_provider"] + "chat_model_provider": config["rag"]["chat_model_provider"] }, ) diff --git a/app/llmops/src/rag_adaptive_evaluation/MLproject b/app/llmops/src/rag_adaptive_evaluation/MLproject index 457116d..77061d4 100644 --- a/app/llmops/src/rag_adaptive_evaluation/MLproject +++ b/app/llmops/src/rag_adaptive_evaluation/MLproject @@ -9,10 +9,18 @@ entry_points: description: Query to run type: string - query_evaluation_dataset_csv_path: + evaluation_dataset_csv_path: description: query evaluation dataset csv path type: string + evaluation_dataset_column_question: + description: query evaluation dataset column question + type: string + + evaluation_dataset_column_answer: + description: query evaluation dataset column groundtruth + type: string + input_chromadb_artifact: description: Fully-qualified name for the input artifact type: string @@ -24,10 +32,18 @@ entry_points: chat_model_provider: description: Fully-qualified name for the chat model provider type: string + + ls_chat_model_evaluator: + description: list of chat model providers for evaluation + type: string + command: >- python run.py --query {query} \ - --query_evaluation_dataset_csv_path {query_evaluation_dataset_csv_path} \ + --evaluation_dataset_csv_path {evaluation_dataset_csv_path} \ + --evaluation_dataset_column_question {evaluation_dataset_column_question} \ + --evaluation_dataset_column_answer {evaluation_dataset_column_answer} \ --input_chromadb_artifact {input_chromadb_artifact} \ --embedding_model {embedding_model} \ - --chat_model_provider {chat_model_provider} \ No newline at end of file + --chat_model_provider {chat_model_provider} \ + --ls_chat_model_evaluator {ls_chat_model_evaluator} \ No newline at end of file diff --git a/app/llmops/src/rag_adaptive_evaluation/run.py b/app/llmops/src/rag_adaptive_evaluation/run.py index bf8de6c..4acc4c7 100644 --- a/app/llmops/src/rag_adaptive_evaluation/run.py +++ b/app/llmops/src/rag_adaptive_evaluation/run.py @@ -474,61 +474,60 @@ def go(args): return {"response": value["generation"]} def go_evaluation(args): - if args.query_evaluation_dataset_csv_path: - # import pandas as pd - # from tqdm import tqdm + if args.evaluation_dataset_csv_path: - # df = pd.read_csv(args.query_evaluation_dataset_csv_path) + import pandas as pd + + df = pd.read_csv(args.evaluation_dataset_csv_path) + dataset_name = os.path.basename(args.evaluation_dataset_csv_path).split('.')[0] + + # df contains columns of question and answer + examples = df[[args.evaluation_dataset_column_question, args.evaluation_dataset_column_answer]].values.tolist() + inputs = [{"question": input_prompt} for input_prompt, _ in examples] + outputs = [{"answer": output_answer} for _, output_answer in examples] + + # Programmatically create a dataset in LangSmith client = 
Client() - # # Create inputs and reference outputs - # examples = [ - # ( - # "Which country is Mount Kilimanjaro located in?", - # "Mount Kilimanjaro is located in Tanzania.", - # ), - # ( - # "What is Earth's lowest point?", - # "Earth's lowest point is The Dead Sea.", - # ), - # ] - # inputs = [{"question": input_prompt} for input_prompt, _ in examples] - # outputs = [{"answer": output_answer} for _, output_answer in examples] + dataset = client.create_dataset( + dataset_name = dataset_name, + description = "A sample dataset in LangSmith." + ) - # # Programmatically create a dataset in LangSmith - # dataset = client.create_dataset( - # dataset_name = "Sample dataset", - # description = "A sample dataset in LangSmith." - # ) + # Add examples to the dataset + client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id) - # # Add examples to the dataset - # client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id) + + args.ls_chat_model_evaluator = None if args.ls_chat_model_evaluator == 'None' else args.ls_chat_model_evaluator.split(',') def target(inputs: dict) -> dict: new_args = argparse.Namespace(**vars(args)) new_args.query = inputs["question"] return go(new_args) + ls_evaluators = [] + if args.ls_chat_model_evaluator: + for evaluator in args.ls_chat_model_evaluator: + if evaluator == 'moonshot': + ls_evaluators.append(moonshot_evaluator_correctness) + ls_evaluators.append(moonshot_evaluator_faithfulness) + elif evaluator == 'deepseek': + ls_evaluators.append(deepseek_evaluator_correctness) + ls_evaluators.append(deepseek_evaluator_faithfulness) + elif evaluator == 'gemini': + ls_evaluators.append(gemini_evaluator_correctness) + ls_evaluators.append(gemini_evaluator_faithfulness) # After running the evaluation, a link will be provided to view the results in langsmith experiment_results = client.evaluate( target, data = "Sample dataset", - evaluators = [ - moonshot_evaluator_correctness, - deepseek_evaluator_correctness, - gemini_evaluator_correctness, - gemini_evaluator_faithfulness, - deepseek_evaluator_faithfulness, - moonshot_evaluator_faithfulness - # can add multiple evaluators here - ], + evaluators = ls_evaluators, experiment_prefix = "first-eval-in-langsmith", max_concurrency = 1, ) - if __name__ == "__main__": parser = argparse.ArgumentParser(description="Adaptive AG") @@ -541,12 +540,26 @@ if __name__ == "__main__": ) parser.add_argument( - "--query_evaluation_dataset_csv_path", + "--evaluation_dataset_csv_path", type=str, help="Path to the query evaluation dataset", default=None, ) + parser.add_argument( + "--evaluation_dataset_column_question", + type=str, + help="Column name for the questions in the evaluation dataset", + default="question", + ) + + parser.add_argument( + "--evaluation_dataset_column_answer", + type=str, + help="Column name for the groundtruth answers in the evaluation dataset", + default="groundtruth", + ) + parser.add_argument( "--input_chromadb_artifact", type=str, @@ -568,7 +581,14 @@ if __name__ == "__main__": help="Chat model provider" ) + parser.add_argument( + "--ls_chat_model_evaluator", + type=str, + help="list of Chat model providers for evaluation", + required=False, + default="None" + ) + args = parser.parse_args() - - # go(args) + go_evaluation(args) \ No newline at end of file
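
For reference, the evaluator list makes a string round trip because MLflow project parameters are plain strings: main.py joins config["evaluation"]["ls_chat_model_provider"] with commas (or passes the literal string "None"), and run.py splits it back and maps each provider to its correctness/faithfulness evaluators. The sketch below shows that round trip in isolation; it is illustrative only and is not code from this patch, the registry and helper names are hypothetical, and the real evaluator functions already live in run.py.

# Illustrative sketch only -- not part of the patch.
EVALUATOR_REGISTRY = {
    # Placeholder names standing in for the evaluator functions defined in run.py.
    "moonshot": ["moonshot_evaluator_correctness", "moonshot_evaluator_faithfulness"],
    "deepseek": ["deepseek_evaluator_correctness", "deepseek_evaluator_faithfulness"],
    "gemini": ["gemini_evaluator_correctness", "gemini_evaluator_faithfulness"],
}

def encode_providers(providers):
    """main.py side: provider list -> comma-separated MLflow parameter string."""
    return ",".join(providers) if providers else "None"

def decode_providers(raw):
    """run.py side: parameter string -> flat list of evaluator names."""
    if raw == "None":
        return []
    evaluators = []
    for provider in raw.split(","):
        evaluators.extend(EVALUATOR_REGISTRY.get(provider.strip(), []))
    return evaluators

# "gemini,moonshot" decodes to the four evaluators wired up in go_evaluation().
assert decode_providers(encode_providers(["gemini", "moonshot"])) == [
    "gemini_evaluator_correctness", "gemini_evaluator_faithfulness",
    "moonshot_evaluator_correctness", "moonshot_evaluator_faithfulness",
]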