From 8b68c6024992f0b42153392788226f024fba1116 Mon Sep 17 00:00:00 2001 From: leehk Date: Wed, 12 Mar 2025 15:31:04 +0800 Subject: [PATCH 1/7] rename src folder from adaptive_rag_evaluation to rag_adaptive_evaluation --- app/llmops/main.py | 6 +++--- .../MLproject | 2 +- .../python_env.yml | 0 .../run.py | 0 app/llmops/src/rag_cot_evaluation/run.py | 8 ++++++++ 5 files changed, 12 insertions(+), 4 deletions(-) rename app/llmops/src/{adaptive_rag_evaluation => rag_adaptive_evaluation}/MLproject (96%) rename app/llmops/src/{adaptive_rag_evaluation => rag_adaptive_evaluation}/python_env.yml (100%) rename app/llmops/src/{adaptive_rag_evaluation => rag_adaptive_evaluation}/run.py (100%) diff --git a/app/llmops/main.py b/app/llmops/main.py index 809b2f1..4c04a63 100644 --- a/app/llmops/main.py +++ b/app/llmops/main.py @@ -9,7 +9,7 @@ _steps = [ "etl_chromadb_pdf", "etl_chromadb_scanned_pdf", # the performance for scanned pdf may not be good "rag_cot_evaluation", - "adaptive_rag_evaluation", + "rag_adaptive_evaluation", "test_rag_cot" ] @@ -132,7 +132,7 @@ def go(config: DictConfig): }, ) - if "adaptive_rag_evaluation" in active_steps: + if "rag_adaptive_evaluation" in active_steps: if config["prompt_engineering"]["run_id_chromadb"] == "None": # Look for run_id that has artifact logged as documents @@ -152,7 +152,7 @@ def go(config: DictConfig): run_id = config["prompt_engineering"]["run_id_chromadb"] _ = mlflow.run( - os.path.join(hydra.utils.get_original_cwd(), "src", "adaptive_rag_evaluation"), + os.path.join(hydra.utils.get_original_cwd(), "src", "rag_adaptive_evaluation"), "main", parameters={ "query": config["prompt_engineering"]["query"], diff --git a/app/llmops/src/adaptive_rag_evaluation/MLproject b/app/llmops/src/rag_adaptive_evaluation/MLproject similarity index 96% rename from app/llmops/src/adaptive_rag_evaluation/MLproject rename to app/llmops/src/rag_adaptive_evaluation/MLproject index 521e783..48c1dad 100644 --- a/app/llmops/src/adaptive_rag_evaluation/MLproject +++ b/app/llmops/src/rag_adaptive_evaluation/MLproject @@ -1,4 +1,4 @@ -name: adaptive_rag_evaluation +name: rag_adaptive_evaluation python_env: python_env.yml entry_points: diff --git a/app/llmops/src/adaptive_rag_evaluation/python_env.yml b/app/llmops/src/rag_adaptive_evaluation/python_env.yml similarity index 100% rename from app/llmops/src/adaptive_rag_evaluation/python_env.yml rename to app/llmops/src/rag_adaptive_evaluation/python_env.yml diff --git a/app/llmops/src/adaptive_rag_evaluation/run.py b/app/llmops/src/rag_adaptive_evaluation/run.py similarity index 100% rename from app/llmops/src/adaptive_rag_evaluation/run.py rename to app/llmops/src/rag_adaptive_evaluation/run.py diff --git a/app/llmops/src/rag_cot_evaluation/run.py b/app/llmops/src/rag_cot_evaluation/run.py index 054d11b..aa773b1 100644 --- a/app/llmops/src/rag_cot_evaluation/run.py +++ b/app/llmops/src/rag_cot_evaluation/run.py @@ -18,6 +18,14 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false" GEMINI_API_KEY = config("GOOGLE_API_KEY", cast=str) DEEKSEEK_API_KEY = config("DEEKSEEK_API_KEY", cast=str) MOONSHOT_API_KEY = config("MOONSHOT_API_KEY", cast=str) +LANGSMITH_API_KEY = config("LANGSMITH_API_KEY", cast=str) +LANGSMITH_TRACING = config("LANGSMITH_TRACING", cast=str) +LANGSMITH_PROJECT = config("LANGSMITH_PROJECT", cast=str) +os.environ["TOKENIZERS_PARALLELISM"] = "false" +os.environ["LANGSMITH_API_KEY"] = LANGSMITH_API_KEY +os.environ["LANGSMITH_TRACING"] = LANGSMITH_TRACING +os.environ["LANGSMITH_ENDPOINT"] = "https://api.smith.langchain.com" 
+os.environ["LANGSMITH_PROJECT"] = LANGSMITH_PROJECT def go(args): From 486a79a2ccca52d20914e5d2dada48018269e4fa Mon Sep 17 00:00:00 2001 From: leehk Date: Wed, 12 Mar 2025 17:59:25 +0800 Subject: [PATCH 2/7] refactored adaptive rag --- app/llmops/config.yaml | 2 +- app/llmops/main.py | 1 + .../src/rag_adaptive_evaluation/MLproject | 6 +- .../rag_adaptive_evaluation/data_models.py | 32 +++++ .../src/rag_adaptive_evaluation/evaluators.py | 99 ++++++++++++++ .../prompts_library.py | 19 +++ .../rag_adaptive_evaluation/python_env.yml | 1 + app/llmops/src/rag_adaptive_evaluation/run.py | 129 +++++++----------- 8 files changed, 207 insertions(+), 82 deletions(-) create mode 100644 app/llmops/src/rag_adaptive_evaluation/data_models.py create mode 100644 app/llmops/src/rag_adaptive_evaluation/evaluators.py create mode 100644 app/llmops/src/rag_adaptive_evaluation/prompts_library.py diff --git a/app/llmops/config.yaml b/app/llmops/config.yaml index 37aeb2b..5452f8c 100644 --- a/app/llmops/config.yaml +++ b/app/llmops/config.yaml @@ -13,4 +13,4 @@ prompt_engineering: run_id_chromadb: None chat_model_provider: gemini query: "如何治疗乳腺癌?" - \ No newline at end of file + query_evaluation_dataset_csv_path: "../../../../data/qa_datasets.csv" \ No newline at end of file diff --git a/app/llmops/main.py b/app/llmops/main.py index 4c04a63..ac768b4 100644 --- a/app/llmops/main.py +++ b/app/llmops/main.py @@ -156,6 +156,7 @@ def go(config: DictConfig): "main", parameters={ "query": config["prompt_engineering"]["query"], + "query_evaluation_dataset_csv_path": config["prompt_engineering"]["query_evaluation_dataset_csv_path"], "input_chromadb_artifact": f'runs:/{run_id}/chromadb/chroma_db.zip', "embedding_model": config["etl"]["embedding_model"], "chat_model_provider": config["prompt_engineering"]["chat_model_provider"] diff --git a/app/llmops/src/rag_adaptive_evaluation/MLproject b/app/llmops/src/rag_adaptive_evaluation/MLproject index 48c1dad..457116d 100644 --- a/app/llmops/src/rag_adaptive_evaluation/MLproject +++ b/app/llmops/src/rag_adaptive_evaluation/MLproject @@ -8,6 +8,10 @@ entry_points: query: description: Query to run type: string + + query_evaluation_dataset_csv_path: + description: query evaluation dataset csv path + type: string input_chromadb_artifact: description: Fully-qualified name for the input artifact @@ -20,10 +24,10 @@ entry_points: chat_model_provider: description: Fully-qualified name for the chat model provider type: string - command: >- python run.py --query {query} \ + --query_evaluation_dataset_csv_path {query_evaluation_dataset_csv_path} \ --input_chromadb_artifact {input_chromadb_artifact} \ --embedding_model {embedding_model} \ --chat_model_provider {chat_model_provider} \ No newline at end of file diff --git a/app/llmops/src/rag_adaptive_evaluation/data_models.py b/app/llmops/src/rag_adaptive_evaluation/data_models.py new file mode 100644 index 0000000..680cfbd --- /dev/null +++ b/app/llmops/src/rag_adaptive_evaluation/data_models.py @@ -0,0 +1,32 @@ +from typing import Literal, List +from pydantic import BaseModel, Field + + +class RouteQuery(BaseModel): + """Route a user query to the most relevant datasource.""" + + datasource: Literal["vectorstore", "web_search"] = Field( + ..., + description="Given a user question choose to route it to web search or a vectorstore.", + ) + +class GradeDocuments(BaseModel): + """Binary score for relevance check on retrieved documents.""" + + binary_score: str = Field( + description="Documents are relevant to the question, 'yes' or 'no'" + ) + 
+class GradeHallucinations(BaseModel): + """Binary score for hallucination present in generation answer.""" + + binary_score: str = Field( + description="Answer is grounded in the facts, 'yes' or 'no'" + ) + +class GradeAnswer(BaseModel): + """Binary score to assess answer addresses question.""" + + binary_score: str = Field( + description="Answer addresses the question, 'yes' or 'no'" + ) \ No newline at end of file diff --git a/app/llmops/src/rag_adaptive_evaluation/evaluators.py b/app/llmops/src/rag_adaptive_evaluation/evaluators.py new file mode 100644 index 0000000..17b1b90 --- /dev/null +++ b/app/llmops/src/rag_adaptive_evaluation/evaluators.py @@ -0,0 +1,99 @@ +from decouple import config +from openevals.llm import create_llm_as_judge +from openevals.prompts import ( + CORRECTNESS_PROMPT, + CONCISENESS_PROMPT, + HALLUCINATION_PROMPT +) +from langchain_google_genai import ChatGoogleGenerativeAI +from langchain_deepseek import ChatDeepSeek +from langchain_community.llms.moonshot import Moonshot + +GEMINI_API_KEY = config("GOOGLE_API_KEY", cast=str) +DEEKSEEK_API_KEY = config("DEEKSEEK_API_KEY", cast=str) +MOONSHOT_API_KEY = config("MOONSHOT_API_KEY", cast=str) + +# correctness +gemini_evaluator_correctness = create_llm_as_judge( + prompt=CORRECTNESS_PROMPT, + judge=ChatGoogleGenerativeAI( + model="gemini-1.5-flash", + google_api_key=GEMINI_API_KEY, + temperature=0.5, + ), + ) + +deepseek_evaluator_correctness = create_llm_as_judge( + prompt=CORRECTNESS_PROMPT, + judge=ChatDeepSeek( + model="deepseek-chat", + temperature=0.5, + api_key=DEEKSEEK_API_KEY + ), + ) + +moonshot_evaluator_correctness = create_llm_as_judge( + prompt=CORRECTNESS_PROMPT, + judge=Moonshot( + model="moonshot-v1-128k", + temperature=0.5, + api_key=MOONSHOT_API_KEY + ), + ) + +# conciseness +gemini_evaluator_conciseness = create_llm_as_judge( + prompt=CONCISENESS_PROMPT, + judge=ChatGoogleGenerativeAI( + model="gemini-1.5-flash", + google_api_key=GEMINI_API_KEY, + temperature=0.5, + ), + ) + +deepseek_evaluator_conciseness = create_llm_as_judge( + prompt=CONCISENESS_PROMPT, + judge=ChatDeepSeek( + model="deepseek-chat", + temperature=0.5, + api_key=DEEKSEEK_API_KEY + ), + ) + +moonshot_evaluator_conciseness = create_llm_as_judge( + prompt=CONCISENESS_PROMPT, + judge=Moonshot( + model="moonshot-v1-128k", + temperature=0.5, + api_key=MOONSHOT_API_KEY + ), + ) + +# hallucination +gemini_evaluator_hallucination = create_llm_as_judge( + prompt=HALLUCINATION_PROMPT, + judge=ChatGoogleGenerativeAI( + model="gemini-1.5-flash", + google_api_key=GEMINI_API_KEY, + temperature=0.5, + ), + ) + +deepseek_evaluator_hallucination = create_llm_as_judge( + prompt=HALLUCINATION_PROMPT, + judge=ChatDeepSeek( + model="deepseek-chat", + temperature=0.5, + api_key=DEEKSEEK_API_KEY + ), + ) + +moonshot_evaluator_hallucination = create_llm_as_judge( + prompt=HALLUCINATION_PROMPT, + judge=Moonshot( + model="moonshot-v1-128k", + temperature=0.5, + api_key=MOONSHOT_API_KEY + ), + ) + diff --git a/app/llmops/src/rag_adaptive_evaluation/prompts_library.py b/app/llmops/src/rag_adaptive_evaluation/prompts_library.py new file mode 100644 index 0000000..fcaf564 --- /dev/null +++ b/app/llmops/src/rag_adaptive_evaluation/prompts_library.py @@ -0,0 +1,19 @@ +system_router = """You are an expert at routing a user question to a vectorstore or web search. +The vectorstore contains documents related to any cancer/tumor disease. The question may be +asked in a variety of languages, and may be phrased in a variety of ways. 
+Use the vectorstore for questions on these topics. Otherwise, use web-search. +""" + +system_retriever_grader = """You are a grader assessing relevance of a retrieved document to a user question. \n + If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. \n + It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n + Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question.""" + +system_hallucination_grader = """You are a grader assessing whether an LLM generation is grounded in / supported by a set of retrieved facts. \n + Give a binary score 'yes' or 'no'. 'Yes' means that the answer is grounded in / supported by the set of facts.""" + +system_answer_grader = """You are a grader assessing whether an answer addresses / resolves a question \n + Give a binary score 'yes' or 'no'. Yes' means that the answer resolves the question.""" + +system_question_rewriter = """You a question re-writer that converts an input question to a better version that is optimized \n + for vectorstore retrieval. Look at the input and try to reason about the underlying semantic intent / meaning.""" \ No newline at end of file diff --git a/app/llmops/src/rag_adaptive_evaluation/python_env.yml b/app/llmops/src/rag_adaptive_evaluation/python_env.yml index 2278969..451cdb7 100644 --- a/app/llmops/src/rag_adaptive_evaluation/python_env.yml +++ b/app/llmops/src/rag_adaptive_evaluation/python_env.yml @@ -24,6 +24,7 @@ build_dependencies: - tavily-python - langchain_huggingface - pydantic + - openevals # Dependencies required to run the project. dependencies: - mlflow==2.8.1 \ No newline at end of file diff --git a/app/llmops/src/rag_adaptive_evaluation/run.py b/app/llmops/src/rag_adaptive_evaluation/run.py index e0496c0..1fe7543 100644 --- a/app/llmops/src/rag_adaptive_evaluation/run.py +++ b/app/llmops/src/rag_adaptive_evaluation/run.py @@ -8,19 +8,43 @@ from langchain_google_genai import ChatGoogleGenerativeAI from langchain_deepseek import ChatDeepSeek from langchain_community.llms.moonshot import Moonshot from langchain_huggingface import HuggingFaceEmbeddings - from langchain_community.vectorstores.chroma import Chroma -from typing import Literal, List +from typing import List from typing_extensions import TypedDict from langchain_core.prompts import ChatPromptTemplate -from pydantic import BaseModel, Field from langchain_community.tools.tavily_search import TavilySearchResults from langchain.schema import Document from pprint import pprint from langgraph.graph import END, StateGraph, START +from langsmith import Client +from data_models import ( + RouteQuery, + GradeDocuments, + GradeHallucinations, + GradeAnswer +) +from prompts_library import ( + system_router, + system_retriever_grader, + system_hallucination_grader, + system_answer_grader, + system_question_rewriter +) + +from evaluators import ( + gemini_evaluator_correctness, + deepseek_evaluator_correctness, + moonshot_evaluator_correctness, + gemini_evaluator_conciseness, + deepseek_evaluator_conciseness, + moonshot_evaluator_conciseness, + gemini_evaluator_hallucination, + deepseek_evaluator_hallucination, + moonshot_evaluator_hallucination +) logging.basicConfig(level=logging.INFO, format="%(asctime)-15s %(message)s") logger = logging.getLogger() @@ -98,61 +122,32 @@ def go(args): vectorstore = Chroma(persist_directory=db_path, collection_name=collection_name, embedding_function=embedding_model) retriever = 
vectorstore.as_retriever() - # Data model - class RouteQuery(BaseModel): - """Route a user query to the most relevant datasource.""" - - datasource: Literal["vectorstore", "web_search"] = Field( - ..., - description="Given a user question choose to route it to web search or a vectorstore.", - ) - + ########################################## + # Routing to vectorstore or web search structured_llm_router = llm.with_structured_output(RouteQuery) - # Prompt - system = """You are an expert at routing a user question to a vectorstore or web search. - The vectorstore contains documents related to any cancer/tumor disease. The question may be - asked in a variety of languages, and may be phrased in a variety of ways. - Use the vectorstore for questions on these topics. Otherwise, use web-search. - """ route_prompt = ChatPromptTemplate.from_messages( [ - ("system", system), + ("system", system_router), ("human", "{question}"), ] ) - question_router = route_prompt | structured_llm_router - + ########################################## ### Retrieval Grader - # Data model - class GradeDocuments(BaseModel): - """Binary score for relevance check on retrieved documents.""" - - binary_score: str = Field( - description="Documents are relevant to the question, 'yes' or 'no'" - ) - structured_llm_grader = llm.with_structured_output(GradeDocuments) - # Prompt - system = """You are a grader assessing relevance of a retrieved document to a user question. \n - If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. \n - It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n - Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question.""" grade_prompt = ChatPromptTemplate.from_messages( [ - ("system", system), + ("system", system_retriever_grader), ("human", "Retrieved document: \n\n {document} \n\n User question: {question}"), ] ) - retrieval_grader = grade_prompt | structured_llm_grader - + ########################################## ### Generate - from langchain import hub from langchain_core.output_parsers import StrOutputParser @@ -167,76 +162,45 @@ def go(args): rag_chain = prompt | llm | StrOutputParser() - + ########################################## ### Hallucination Grader - - # Data model - class GradeHallucinations(BaseModel): - """Binary score for hallucination present in generation answer.""" - - binary_score: str = Field( - description="Answer is grounded in the facts, 'yes' or 'no'" - ) - - - # LLM with function call structured_llm_grader = llm.with_structured_output(GradeHallucinations) # Prompt - system = """You are a grader assessing whether an LLM generation is grounded in / supported by a set of retrieved facts. \n - Give a binary score 'yes' or 'no'. 
'Yes' means that the answer is grounded in / supported by the set of facts.""" hallucination_prompt = ChatPromptTemplate.from_messages( [ - ("system", system), + ("system", system_hallucination_grader), ("human", "Set of facts: \n\n {documents} \n\n LLM generation: {generation}"), ] ) hallucination_grader = hallucination_prompt | structured_llm_grader - + ########################################## ### Answer Grader - # Data model - class GradeAnswer(BaseModel): - """Binary score to assess answer addresses question.""" - - binary_score: str = Field( - description="Answer addresses the question, 'yes' or 'no'" - ) - - - # LLM with function call structured_llm_grader = llm.with_structured_output(GradeAnswer) # Prompt - system = """You are a grader assessing whether an answer addresses / resolves a question \n - Give a binary score 'yes' or 'no'. Yes' means that the answer resolves the question.""" answer_prompt = ChatPromptTemplate.from_messages( [ - ("system", system), + ("system", system_answer_grader), ("human", "User question: \n\n {question} \n\n LLM generation: {generation}"), ] ) - answer_grader = answer_prompt | structured_llm_grader + ########################################## ### Question Re-writer - - # LLM - # Prompt - system = """You a question re-writer that converts an input question to a better version that is optimized \n - for vectorstore retrieval. Look at the input and try to reason about the underlying semantic intent / meaning.""" re_write_prompt = ChatPromptTemplate.from_messages( [ - ("system", system), + ("system", system_question_rewriter), ( "human", "Here is the initial question: \n\n {question} \n Formulate an improved question.", ), ] ) - question_rewriter = re_write_prompt | llm | StrOutputParser() @@ -372,8 +336,6 @@ def go(args): ### Edges ### - - def route_question(state): """ Route question to web search or RAG. 
@@ -504,8 +466,6 @@ def go(args): # Compile app = workflow.compile() - - # Run inputs = { "question": args.query @@ -521,8 +481,10 @@ def go(args): # Final generation pprint(value["generation"]) + + if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Chain of Thought RAG") + parser = argparse.ArgumentParser(description="Adaptive AG") parser.add_argument( "--query", @@ -531,6 +493,13 @@ if __name__ == "__main__": required=True ) + parser.add_argument( + "--query_evaluation_dataset_csv_path", + type=str, + help="Path to the query evaluation dataset", + default=None, + ) + parser.add_argument( "--input_chromadb_artifact", type=str, From fcb2f9e4ea376b4a9e31e034d1f6ec442e687ddd Mon Sep 17 00:00:00 2001 From: leehk Date: Thu, 13 Mar 2025 15:04:21 +0800 Subject: [PATCH 3/7] correctness done --- .../rag_adaptive_evaluation/data_models.py | 2 +- .../src/rag_adaptive_evaluation/evaluators.py | 144 ++++++++---------- .../prompts_library.py | 37 ++++- .../rag_adaptive_evaluation/python_env.yml | 1 - app/llmops/src/rag_adaptive_evaluation/run.py | 92 ++++++++--- app/llmops/src/rag_cot_evaluation/run.py | 19 +-- 6 files changed, 174 insertions(+), 121 deletions(-) diff --git a/app/llmops/src/rag_adaptive_evaluation/data_models.py b/app/llmops/src/rag_adaptive_evaluation/data_models.py index 680cfbd..aeb193d 100644 --- a/app/llmops/src/rag_adaptive_evaluation/data_models.py +++ b/app/llmops/src/rag_adaptive_evaluation/data_models.py @@ -1,4 +1,4 @@ -from typing import Literal, List +from typing import Literal from pydantic import BaseModel, Field diff --git a/app/llmops/src/rag_adaptive_evaluation/evaluators.py b/app/llmops/src/rag_adaptive_evaluation/evaluators.py index 17b1b90..0d16dea 100644 --- a/app/llmops/src/rag_adaptive_evaluation/evaluators.py +++ b/app/llmops/src/rag_adaptive_evaluation/evaluators.py @@ -1,99 +1,77 @@ +import os from decouple import config -from openevals.llm import create_llm_as_judge -from openevals.prompts import ( - CORRECTNESS_PROMPT, - CONCISENESS_PROMPT, - HALLUCINATION_PROMPT -) + from langchain_google_genai import ChatGoogleGenerativeAI from langchain_deepseek import ChatDeepSeek from langchain_community.llms.moonshot import Moonshot -GEMINI_API_KEY = config("GOOGLE_API_KEY", cast=str) -DEEKSEEK_API_KEY = config("DEEKSEEK_API_KEY", cast=str) -MOONSHOT_API_KEY = config("MOONSHOT_API_KEY", cast=str) +from pydantic import BaseModel, Field -# correctness -gemini_evaluator_correctness = create_llm_as_judge( - prompt=CORRECTNESS_PROMPT, - judge=ChatGoogleGenerativeAI( +from prompts_library import CORRECTNESS_PROMPT + +os.environ["GOOGLE_API_KEY"] = config("GOOGLE_API_KEY", cast=str) +os.environ["DEEPSEEK_API_KEY"] = config("DEEPSEEK_API_KEY", cast=str) +os.environ["MOONSHOT_API_KEY"] = config("MOONSHOT_API_KEY", cast=str) + + +# Define output schema for the evaluation +class CorrectnessGrade(BaseModel): + score: int = Field(description="Numerical score (1-5) indicating the correctness of the response.") + +# Todo: +# class RelevanceGrade(BaseModel): + + + +def gemini_evaluator_correctness(outputs: dict, reference_outputs: dict) -> CorrectnessGrade: + llm = ChatGoogleGenerativeAI( model="gemini-1.5-flash", - google_api_key=GEMINI_API_KEY, temperature=0.5, - ), - ) + ) -deepseek_evaluator_correctness = create_llm_as_judge( - prompt=CORRECTNESS_PROMPT, - judge=ChatDeepSeek( + messages = [ + {"role": "system", "content": CORRECTNESS_PROMPT}, + {"role": "user", "content": f"""Ground Truth answer: {reference_outputs["answer"]}; + Student's Answer: 
{outputs['response']} + """} + ] + + response = llm.invoke(messages) + + return CorrectnessGrade(score=int(response.content)).score + + +def deepseek_evaluator_correctness(outputs: dict, reference_outputs: dict) -> CorrectnessGrade: + llm = ChatDeepSeek( model="deepseek-chat", temperature=0.5, - api_key=DEEKSEEK_API_KEY - ), - ) + ) -moonshot_evaluator_correctness = create_llm_as_judge( - prompt=CORRECTNESS_PROMPT, - judge=Moonshot( - model="moonshot-v1-128k", + messages = [ + {"role": "system", "content": CORRECTNESS_PROMPT}, + {"role": "user", "content": f"""Ground Truth answer: {reference_outputs["answer"]}; + Student's Answer: {outputs['response']} + """} + ] + + response = llm.invoke(messages) + + return CorrectnessGrade(score=int(response.content)).score + + +def moonshot_evaluator_correctness(outputs: dict, reference_outputs: dict) -> CorrectnessGrade: + llm = Moonshot( + model="moonshot-v1-128k", temperature=0.5, - api_key=MOONSHOT_API_KEY - ), - ) + ) -# conciseness -gemini_evaluator_conciseness = create_llm_as_judge( - prompt=CONCISENESS_PROMPT, - judge=ChatGoogleGenerativeAI( - model="gemini-1.5-flash", - google_api_key=GEMINI_API_KEY, - temperature=0.5, - ), - ) + messages = [ + {"role": "system", "content": CORRECTNESS_PROMPT}, + {"role": "user", "content": f"""Ground Truth answer: {reference_outputs["answer"]}; + Student's Answer: {outputs['response']} + """} + ] -deepseek_evaluator_conciseness = create_llm_as_judge( - prompt=CONCISENESS_PROMPT, - judge=ChatDeepSeek( - model="deepseek-chat", - temperature=0.5, - api_key=DEEKSEEK_API_KEY - ), - ) - -moonshot_evaluator_conciseness = create_llm_as_judge( - prompt=CONCISENESS_PROMPT, - judge=Moonshot( - model="moonshot-v1-128k", - temperature=0.5, - api_key=MOONSHOT_API_KEY - ), - ) - -# hallucination -gemini_evaluator_hallucination = create_llm_as_judge( - prompt=HALLUCINATION_PROMPT, - judge=ChatGoogleGenerativeAI( - model="gemini-1.5-flash", - google_api_key=GEMINI_API_KEY, - temperature=0.5, - ), - ) - -deepseek_evaluator_hallucination = create_llm_as_judge( - prompt=HALLUCINATION_PROMPT, - judge=ChatDeepSeek( - model="deepseek-chat", - temperature=0.5, - api_key=DEEKSEEK_API_KEY - ), - ) - -moonshot_evaluator_hallucination = create_llm_as_judge( - prompt=HALLUCINATION_PROMPT, - judge=Moonshot( - model="moonshot-v1-128k", - temperature=0.5, - api_key=MOONSHOT_API_KEY - ), - ) + response = llm.invoke(messages) + return CorrectnessGrade(score=int(response)).score diff --git a/app/llmops/src/rag_adaptive_evaluation/prompts_library.py b/app/llmops/src/rag_adaptive_evaluation/prompts_library.py index fcaf564..3bfed18 100644 --- a/app/llmops/src/rag_adaptive_evaluation/prompts_library.py +++ b/app/llmops/src/rag_adaptive_evaluation/prompts_library.py @@ -16,4 +16,39 @@ system_answer_grader = """You are a grader assessing whether an answer addresses Give a binary score 'yes' or 'no'. Yes' means that the answer resolves the question.""" system_question_rewriter = """You a question re-writer that converts an input question to a better version that is optimized \n - for vectorstore retrieval. Look at the input and try to reason about the underlying semantic intent / meaning.""" \ No newline at end of file + for vectorstore retrieval. Look at the input and try to reason about the underlying semantic intent / meaning.""" + + +# Evaluation +CORRECTNESS_PROMPT = """Evaluate Student Answer against Ground Truth for conceptual similarity and correctness. + +You are an impartial judge. 
Evaluate Student Answer against Ground Truth for conceptual similarity and correctness. +You may also be given additional information that was used by the model to generate the output. + +Your task is to determine a numerical score called faithfulness based on the input and output. +A definition of correctness and a grading rubric are provided below. +You must use the grading rubric to determine your score. + +Metric definition: +Correctness assesses the degree to which a provided output aligns with factual accuracy, completeness, logical +consistency, and precise terminology. It evaluates the intrinsic validity of the output, independent of any +external context. A higher score indicates a higher adherence to factual accuracy, completeness, logical consistency, +and precise terminology. + +Grading rubric: +Correctness: Below are the details for different scores: + - 1: Major factual errors, highly incomplete, illogical, and uses incorrect terminology. + - 2: Significant factual errors, incomplete, noticeable logical flaws, and frequent terminology errors. + - 3: Minor factual errors, somewhat incomplete, minor logical inconsistencies, and occasional terminology errors. + - 4: Few to no factual errors, mostly complete, strong logical consistency, and accurate terminology. + - 5: Accurate, complete, logically consistent, and uses precise terminology. + + Reminder: + - Carefully read the input and output + - Check for factual accuracy and completeness + - Focus on correctness of information rather than style or verbosity + - The goal is to evaluate factual correctness and completeness of the response. + - Please provide your answer score only with the numerical number between 1 and 5. No score: or other text is allowed. + +""" + diff --git a/app/llmops/src/rag_adaptive_evaluation/python_env.yml b/app/llmops/src/rag_adaptive_evaluation/python_env.yml index 451cdb7..2278969 100644 --- a/app/llmops/src/rag_adaptive_evaluation/python_env.yml +++ b/app/llmops/src/rag_adaptive_evaluation/python_env.yml @@ -24,7 +24,6 @@ build_dependencies: - tavily-python - langchain_huggingface - pydantic - - openevals # Dependencies required to run the project. 
dependencies: - mlflow==2.8.1 \ No newline at end of file diff --git a/app/llmops/src/rag_adaptive_evaluation/run.py b/app/llmops/src/rag_adaptive_evaluation/run.py index 1fe7543..c629de7 100644 --- a/app/llmops/src/rag_adaptive_evaluation/run.py +++ b/app/llmops/src/rag_adaptive_evaluation/run.py @@ -38,30 +38,26 @@ from evaluators import ( gemini_evaluator_correctness, deepseek_evaluator_correctness, moonshot_evaluator_correctness, - gemini_evaluator_conciseness, - deepseek_evaluator_conciseness, - moonshot_evaluator_conciseness, - gemini_evaluator_hallucination, - deepseek_evaluator_hallucination, - moonshot_evaluator_hallucination + # gemini_evaluator_conciseness, + # deepseek_evaluator_conciseness, + # moonshot_evaluator_conciseness, + # gemini_evaluator_hallucination, + # deepseek_evaluator_hallucination, + # moonshot_evaluator_hallucination ) logging.basicConfig(level=logging.INFO, format="%(asctime)-15s %(message)s") logger = logging.getLogger() -GEMINI_API_KEY = config("GOOGLE_API_KEY", cast=str) -DEEKSEEK_API_KEY = config("DEEKSEEK_API_KEY", cast=str) -MOONSHOT_API_KEY = config("MOONSHOT_API_KEY", cast=str) -TAVILY_API_KEY = config("TAVILY_API_KEY", cast=str) -LANGSMITH_API_KEY = config("LANGSMITH_API_KEY", cast=str) -LANGSMITH_TRACING = config("LANGSMITH_TRACING", cast=str) -LANGSMITH_PROJECT = config("LANGSMITH_PROJECT", cast=str) -os.environ["TAVILY_API_KEY"] = TAVILY_API_KEY +os.environ["GOOGLE_API_KEY"] = config("GOOGLE_API_KEY", cast=str) +os.environ["DEEPSEEK_API_KEY"] = config("DEEPSEEK_API_KEY", cast=str) +os.environ["MOONSHOT_API_KEY"] = config("MOONSHOT_API_KEY", cast=str) +os.environ["TAVILY_API_KEY"] = config("TAVILY_API_KEY", cast=str) os.environ["TOKENIZERS_PARALLELISM"] = "false" -os.environ["LANGSMITH_API_KEY"] = LANGSMITH_API_KEY -os.environ["LANGSMITH_TRACING"] = LANGSMITH_TRACING +os.environ["LANGSMITH_API_KEY"] = config("LANGSMITH_API_KEY", cast=str) +os.environ["LANGSMITH_TRACING"] = config("LANGSMITH_TRACING", cast=str) os.environ["LANGSMITH_ENDPOINT"] = "https://api.smith.langchain.com" -os.environ["LANGSMITH_PROJECT"] = LANGSMITH_PROJECT +os.environ["LANGSMITH_PROJECT"] = config("LANGSMITH_PROJECT", cast=str) def go(args): @@ -95,12 +91,10 @@ def go(args): max_tokens=None, timeout=None, max_retries=2, - api_key=DEEKSEEK_API_KEY ) elif args.chat_model_provider == 'gemini': llm = ChatGoogleGenerativeAI( model="gemini-1.5-flash", - google_api_key=GEMINI_API_KEY, temperature=0, max_retries=3, streaming=True @@ -112,7 +106,6 @@ def go(args): max_tokens=None, timeout=None, max_retries=2, - api_key=MOONSHOT_API_KEY ) # Load data from ChromaDB @@ -479,7 +472,61 @@ def go(args): pprint("\n---\n") # Final generation - pprint(value["generation"]) + print(value["generation"]) + + return {"response": value["generation"]} + +def go_evaluation(args): + if args.query_evaluation_dataset_csv_path: + # import pandas as pd + # from tqdm import tqdm + + # df = pd.read_csv(args.query_evaluation_dataset_csv_path) + client = Client() + # # Create inputs and reference outputs + # examples = [ + # ( + # "Which country is Mount Kilimanjaro located in?", + # "Mount Kilimanjaro is located in Tanzania.", + # ), + # ( + # "What is Earth's lowest point?", + # "Earth's lowest point is The Dead Sea.", + # ), + # ] + + # inputs = [{"question": input_prompt} for input_prompt, _ in examples] + # outputs = [{"answer": output_answer} for _, output_answer in examples] + + # # Programmatically create a dataset in LangSmith + # dataset = client.create_dataset( + # dataset_name = "Sample dataset", 
+ # description = "A sample dataset in LangSmith." + # ) + + # # Add examples to the dataset + # client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id) + + def target(inputs: dict) -> dict: + new_args = argparse.Namespace(**vars(args)) + new_args.query = inputs["question"] + return go(new_args) + + + # After running the evaluation, a link will be provided to view the results in langsmith + experiment_results = client.evaluate( + target, + data = "Sample dataset", + evaluators = [ + moonshot_evaluator_correctness, + deepseek_evaluator_correctness, + gemini_evaluator_correctness + # can add multiple evaluators here + ], + experiment_prefix = "first-eval-in-langsmith", + max_concurrency = 1, + + ) @@ -523,4 +570,5 @@ if __name__ == "__main__": args = parser.parse_args() - go(args) \ No newline at end of file + # go(args) + go_evaluation(args) \ No newline at end of file diff --git a/app/llmops/src/rag_cot_evaluation/run.py b/app/llmops/src/rag_cot_evaluation/run.py index aa773b1..06484df 100644 --- a/app/llmops/src/rag_cot_evaluation/run.py +++ b/app/llmops/src/rag_cot_evaluation/run.py @@ -14,18 +14,14 @@ from langchain_community.llms.moonshot import Moonshot logging.basicConfig(level=logging.INFO, format="%(asctime)-15s %(message)s") logger = logging.getLogger() +os.environ["GOOGLE_API_KEY"] = config("GOOGLE_API_KEY", cast=str) +os.environ["DEEPSEEK_API_KEY"] = config("DEEPSEEK_API_KEY", cast=str) +os.environ["MOONSHOT_API_KEY"] = config("MOONSHOT_API_KEY", cast=str) os.environ["TOKENIZERS_PARALLELISM"] = "false" -GEMINI_API_KEY = config("GOOGLE_API_KEY", cast=str) -DEEKSEEK_API_KEY = config("DEEKSEEK_API_KEY", cast=str) -MOONSHOT_API_KEY = config("MOONSHOT_API_KEY", cast=str) -LANGSMITH_API_KEY = config("LANGSMITH_API_KEY", cast=str) -LANGSMITH_TRACING = config("LANGSMITH_TRACING", cast=str) -LANGSMITH_PROJECT = config("LANGSMITH_PROJECT", cast=str) -os.environ["TOKENIZERS_PARALLELISM"] = "false" -os.environ["LANGSMITH_API_KEY"] = LANGSMITH_API_KEY -os.environ["LANGSMITH_TRACING"] = LANGSMITH_TRACING +os.environ["LANGSMITH_API_KEY"] = config("LANGSMITH_API_KEY", cast=str) +os.environ["LANGSMITH_TRACING"] = config("LANGSMITH_TRACING", cast=str) os.environ["LANGSMITH_ENDPOINT"] = "https://api.smith.langchain.com" -os.environ["LANGSMITH_PROJECT"] = LANGSMITH_PROJECT +os.environ["LANGSMITH_PROJECT"] = config("LANGSMITH_PROJECT", cast=str) def go(args): @@ -68,14 +64,12 @@ def go(args): max_tokens=None, timeout=None, max_retries=2, - api_key=DEEKSEEK_API_KEY ) elif args.chat_model_provider == "gemini": # Initialize Gemini model llm = ChatGoogleGenerativeAI( model="gemini-1.5-flash", - google_api_key=GEMINI_API_KEY, temperature=0, max_retries=3 ) @@ -88,7 +82,6 @@ def go(args): max_tokens=None, timeout=None, max_retries=2, - api_key=MOONSHOT_API_KEY ) From b6ca6ac677181512a3e8777691dc40c693b82029 Mon Sep 17 00:00:00 2001 From: leehk Date: Thu, 13 Mar 2025 16:19:39 +0800 Subject: [PATCH 4/7] faithfulness done --- .../src/rag_adaptive_evaluation/evaluators.py | 72 +++++++++++++++++-- .../prompts_library.py | 44 +++++++++--- app/llmops/src/rag_adaptive_evaluation/run.py | 14 ++-- 3 files changed, 109 insertions(+), 21 deletions(-) diff --git a/app/llmops/src/rag_adaptive_evaluation/evaluators.py b/app/llmops/src/rag_adaptive_evaluation/evaluators.py index 0d16dea..f7f1184 100644 --- a/app/llmops/src/rag_adaptive_evaluation/evaluators.py +++ b/app/llmops/src/rag_adaptive_evaluation/evaluators.py @@ -7,7 +7,7 @@ from langchain_community.llms.moonshot import Moonshot from 
pydantic import BaseModel, Field -from prompts_library import CORRECTNESS_PROMPT +from prompts_library import CORRECTNESS_PROMPT, FAITHFULNESS_PROMPT os.environ["GOOGLE_API_KEY"] = config("GOOGLE_API_KEY", cast=str) os.environ["DEEPSEEK_API_KEY"] = config("DEEPSEEK_API_KEY", cast=str) @@ -18,11 +18,12 @@ os.environ["MOONSHOT_API_KEY"] = config("MOONSHOT_API_KEY", cast=str) class CorrectnessGrade(BaseModel): score: int = Field(description="Numerical score (1-5) indicating the correctness of the response.") -# Todo: -# class RelevanceGrade(BaseModel): +class FaithfulnessGrade(BaseModel): + score: int = Field(description="Numerical score (1-5) indicating the faithfulness of the response.") +# Evaluators def gemini_evaluator_correctness(outputs: dict, reference_outputs: dict) -> CorrectnessGrade: llm = ChatGoogleGenerativeAI( model="gemini-1.5-flash", @@ -74,4 +75,67 @@ def moonshot_evaluator_correctness(outputs: dict, reference_outputs: dict) -> Co response = llm.invoke(messages) - return CorrectnessGrade(score=int(response)).score + try: + return CorrectnessGrade(score=int(response)).score + except ValueError: + score_str = response.split(":")[1].strip() + return CorrectnessGrade(score=int(score_str)).score + + +def gemini_evaluator_faithfulness(outputs: dict, reference_outputs: dict) -> FaithfulnessGrade: + llm = ChatGoogleGenerativeAI( + model="gemini-1.5-pro", + temperature=0.5, + ) + + messages = [ + {"role": "system", "content": FAITHFULNESS_PROMPT}, + {"role": "user", "content": f"""Context: {reference_outputs["answer"]}; + Output: {outputs['response']} + """} + ] + + response = llm.invoke(messages) + + return FaithfulnessGrade(score=int(response.content)).score + + +def deepseek_evaluator_faithfulness(outputs: dict, reference_outputs: dict) -> FaithfulnessGrade: + llm = ChatDeepSeek( + model="deepseek-chat", + temperature=0.5, + ) + + messages = [ + {"role": "system", "content": FAITHFULNESS_PROMPT}, + {"role": "user", "content": f"""Context: {reference_outputs["answer"]}; + Output: {outputs['response']} + """} + ] + + response = llm.invoke(messages) + + return FaithfulnessGrade(score=int(response.content)).score + + +def moonshot_evaluator_faithfulness(outputs: dict, reference_outputs: dict) -> FaithfulnessGrade: + llm = Moonshot( + model="moonshot-v1-128k", + temperature=0.5, + ) + + messages = [ + {"role": "system", "content": FAITHFULNESS_PROMPT}, + {"role": "user", "content": f"""Context: {reference_outputs["answer"]}; + Output: {outputs['response']} + """} + ] + + response = llm.invoke(messages) + + try: + return FaithfulnessGrade(score=int(response)).score + except ValueError: + score_str = response.split(":")[1].strip() + return FaithfulnessGrade(score=int(score_str)).score + diff --git a/app/llmops/src/rag_adaptive_evaluation/prompts_library.py b/app/llmops/src/rag_adaptive_evaluation/prompts_library.py index 3bfed18..33d23a7 100644 --- a/app/llmops/src/rag_adaptive_evaluation/prompts_library.py +++ b/app/llmops/src/rag_adaptive_evaluation/prompts_library.py @@ -20,20 +20,18 @@ system_question_rewriter = """You a question re-writer that converts an input qu # Evaluation -CORRECTNESS_PROMPT = """Evaluate Student Answer against Ground Truth for conceptual similarity and correctness. - -You are an impartial judge. Evaluate Student Answer against Ground Truth for conceptual similarity and correctness. +CORRECTNESS_PROMPT = """You are an impartial judge. Evaluate Student Answer against Ground Truth for conceptual similarity and correctness. 
You may also be given additional information that was used by the model to generate the output. -Your task is to determine a numerical score called faithfulness based on the input and output. +Your task is to determine a numerical score called correctness based on the Student Answer and Ground Truth. A definition of correctness and a grading rubric are provided below. You must use the grading rubric to determine your score. Metric definition: -Correctness assesses the degree to which a provided output aligns with factual accuracy, completeness, logical -consistency, and precise terminology. It evaluates the intrinsic validity of the output, independent of any +Correctness assesses the degree to which a provided Student Answer aligns with factual accuracy, completeness, logical +consistency, and precise terminology of the Ground Truth. It evaluates the intrinsic validity of the Student Answer , independent of any external context. A higher score indicates a higher adherence to factual accuracy, completeness, logical consistency, -and precise terminology. +and precise terminology of the Ground Truth. Grading rubric: Correctness: Below are the details for different scores: @@ -44,11 +42,37 @@ Correctness: Below are the details for different scores: - 5: Accurate, complete, logically consistent, and uses precise terminology. Reminder: - - Carefully read the input and output - - Check for factual accuracy and completeness + - Carefully read the Student Answer and Ground Truth + - Check for factual accuracy and completeness of Student Answer compared to the Ground Truth - Focus on correctness of information rather than style or verbosity - - The goal is to evaluate factual correctness and completeness of the response. + - The goal is to evaluate factual correctness and completeness of the Student Answer. - Please provide your answer score only with the numerical number between 1 and 5. No score: or other text is allowed. """ +FAITHFULNESS_PROMPT = """You are an impartial judge. Evaluate output against context for faithfulness. +You may also be given additional information that was used by the model to generate the Output. + +Your task is to determine a numerical score called faithfulness based on the output and context. +A definition of faithfulness and a grading rubric are provided below. +You must use the grading rubric to determine your score. + +Metric definition: +Faithfulness is only evaluated with the provided output and context. Faithfulness assesses how much of the +provided output is factually consistent with the provided context. A higher score indicates that a higher proportion of +claims present in the output can be derived from the provided context. Faithfulness does not consider how much extra +information from the context is not present in the output. + +Grading rubric: +Faithfulness: Below are the details for different scores: +- Score 1: None of the claims in the output can be inferred from the provided context. +- Score 2: Some of the claims in the output can be inferred from the provided context, but the majority of the output is missing from, inconsistent with, or contradictory to the provided context. +- Score 3: Half or more of the claims in the output can be inferred from the provided context. +- Score 4: Most of the claims in the output can be inferred from the provided context, with very little information that is not directly supported by the provided context. 
+- Score 5: All of the claims in the output are directly supported by the provided context, demonstrating high faithfulness to the provided context. + +Reminder: +- Carefully read the output and context +- Focus on the information instead of the writing style or verbosity. +- Please provide your answer score only with the numerical number between 1 and 5, according to the grading rubric above. No score: or other text is allowed. +""" \ No newline at end of file diff --git a/app/llmops/src/rag_adaptive_evaluation/run.py b/app/llmops/src/rag_adaptive_evaluation/run.py index c629de7..bf8de6c 100644 --- a/app/llmops/src/rag_adaptive_evaluation/run.py +++ b/app/llmops/src/rag_adaptive_evaluation/run.py @@ -38,12 +38,9 @@ from evaluators import ( gemini_evaluator_correctness, deepseek_evaluator_correctness, moonshot_evaluator_correctness, - # gemini_evaluator_conciseness, - # deepseek_evaluator_conciseness, - # moonshot_evaluator_conciseness, - # gemini_evaluator_hallucination, - # deepseek_evaluator_hallucination, - # moonshot_evaluator_hallucination + gemini_evaluator_faithfulness, + deepseek_evaluator_faithfulness, + moonshot_evaluator_faithfulness ) logging.basicConfig(level=logging.INFO, format="%(asctime)-15s %(message)s") @@ -520,7 +517,10 @@ def go_evaluation(args): evaluators = [ moonshot_evaluator_correctness, deepseek_evaluator_correctness, - gemini_evaluator_correctness + gemini_evaluator_correctness, + gemini_evaluator_faithfulness, + deepseek_evaluator_faithfulness, + moonshot_evaluator_faithfulness # can add multiple evaluators here ], experiment_prefix = "first-eval-in-langsmith", From 86a2c1a055d048eb2697f2575c7255e3a702049a Mon Sep 17 00:00:00 2001 From: leehk Date: Thu, 13 Mar 2025 21:23:36 +0800 Subject: [PATCH 5/7] update for now --- app/llmops/config.yaml | 11 ++- app/llmops/main.py | 25 ++--- .../src/rag_adaptive_evaluation/MLproject | 22 ++++- app/llmops/src/rag_adaptive_evaluation/run.py | 94 +++++++++++-------- 4 files changed, 99 insertions(+), 53 deletions(-) diff --git a/app/llmops/config.yaml b/app/llmops/config.yaml index 5452f8c..33383ea 100644 --- a/app/llmops/config.yaml +++ b/app/llmops/config.yaml @@ -9,8 +9,15 @@ etl: path_document_folder: "../../../../data" run_id_documents: None embedding_model: paraphrase-multilingual-mpnet-base-v2 -prompt_engineering: +rag: run_id_chromadb: None chat_model_provider: gemini +testing: query: "如何治疗乳腺癌?" 
- query_evaluation_dataset_csv_path: "../../../../data/qa_datasets.csv" \ No newline at end of file +evaluation: + evaluation_dataset_csv_path: "../../../../data/qa_datasets.csv" + evaluation_dataset_column_question: question + evaluation_dataset_column_answer: answer + ls_chat_model_provider: + - gemini + - moonshot \ No newline at end of file diff --git a/app/llmops/main.py b/app/llmops/main.py index ac768b4..32d6a8b 100644 --- a/app/llmops/main.py +++ b/app/llmops/main.py @@ -104,7 +104,7 @@ def go(config: DictConfig): ) if "rag_cot_evaluation" in active_steps: - if config["prompt_engineering"]["run_id_chromadb"] == "None": + if config["rag"]["run_id_chromadb"] == "None": # Look for run_id that has artifact logged as documents run_id = None client = mlflow.tracking.MlflowClient() @@ -119,22 +119,22 @@ def go(config: DictConfig): if run_id is None: raise ValueError("No run_id found with artifact logged as documents") else: - run_id = config["prompt_engineering"]["run_id_chromadb"] + run_id = config["rag"]["run_id_chromadb"] _ = mlflow.run( os.path.join(hydra.utils.get_original_cwd(), "src", "rag_cot_evaluation"), "main", parameters={ - "query": config["prompt_engineering"]["query"], + "query": config["testing"]["query"], "input_chromadb_artifact": f'runs:/{run_id}/chromadb/chroma_db.zip', "embedding_model": config["etl"]["embedding_model"], - "chat_model_provider": config["prompt_engineering"]["chat_model_provider"] + "chat_model_provider": config["rag"]["chat_model_provider"] }, ) if "rag_adaptive_evaluation" in active_steps: - if config["prompt_engineering"]["run_id_chromadb"] == "None": + if config["rag"]["run_id_chromadb"] == "None": # Look for run_id that has artifact logged as documents run_id = None client = mlflow.tracking.MlflowClient() @@ -149,17 +149,20 @@ def go(config: DictConfig): if run_id is None: raise ValueError("No run_id found with artifact logged as documents") else: - run_id = config["prompt_engineering"]["run_id_chromadb"] + run_id = config["rag"]["run_id_chromadb"] _ = mlflow.run( os.path.join(hydra.utils.get_original_cwd(), "src", "rag_adaptive_evaluation"), "main", parameters={ - "query": config["prompt_engineering"]["query"], - "query_evaluation_dataset_csv_path": config["prompt_engineering"]["query_evaluation_dataset_csv_path"], + "query": config["testing"]["query"], + "evaluation_dataset_csv_path": config["evaluation"]["evaluation_dataset_csv_path"], + "evaluation_dataset_column_question": config["evaluation"]["evaluation_dataset_column_question"], + "evaluation_dataset_column_answer": config["evaluation"]["evaluation_dataset_column_answer"], "input_chromadb_artifact": f'runs:/{run_id}/chromadb/chroma_db.zip', "embedding_model": config["etl"]["embedding_model"], - "chat_model_provider": config["prompt_engineering"]["chat_model_provider"] + "chat_model_provider": config["rag"]["chat_model_provider"], + "ls_chat_model_evaluator": ','.join(config["evaluation"]["ls_chat_model_provider"]) if config["evaluation"]["ls_chat_model_provider"] is not None else 'None', }, ) @@ -169,10 +172,10 @@ def go(config: DictConfig): os.path.join(hydra.utils.get_original_cwd(), "components", "test_rag_cot"), "main", parameters={ - "query": config["prompt_engineering"]["query"], + "query": config["testing"]["query"], "input_chromadb_local": os.path.join(hydra.utils.get_original_cwd(), "src", "rag_cot_evaluation", "chroma_db"), "embedding_model": config["etl"]["embedding_model"], - "chat_model_provider": config["prompt_engineering"]["chat_model_provider"] + "chat_model_provider": 
config["rag"]["chat_model_provider"] }, ) diff --git a/app/llmops/src/rag_adaptive_evaluation/MLproject b/app/llmops/src/rag_adaptive_evaluation/MLproject index 457116d..77061d4 100644 --- a/app/llmops/src/rag_adaptive_evaluation/MLproject +++ b/app/llmops/src/rag_adaptive_evaluation/MLproject @@ -9,10 +9,18 @@ entry_points: description: Query to run type: string - query_evaluation_dataset_csv_path: + evaluation_dataset_csv_path: description: query evaluation dataset csv path type: string + evaluation_dataset_column_question: + description: query evaluation dataset column question + type: string + + evaluation_dataset_column_answer: + description: query evaluation dataset column groundtruth + type: string + input_chromadb_artifact: description: Fully-qualified name for the input artifact type: string @@ -24,10 +32,18 @@ entry_points: chat_model_provider: description: Fully-qualified name for the chat model provider type: string + + ls_chat_model_evaluator: + description: list of chat model providers for evaluation + type: string + command: >- python run.py --query {query} \ - --query_evaluation_dataset_csv_path {query_evaluation_dataset_csv_path} \ + --evaluation_dataset_csv_path {evaluation_dataset_csv_path} \ + --evaluation_dataset_column_question {evaluation_dataset_column_question} \ + --evaluation_dataset_column_answer {evaluation_dataset_column_answer} \ --input_chromadb_artifact {input_chromadb_artifact} \ --embedding_model {embedding_model} \ - --chat_model_provider {chat_model_provider} \ No newline at end of file + --chat_model_provider {chat_model_provider} \ + --ls_chat_model_evaluator {ls_chat_model_evaluator} \ No newline at end of file diff --git a/app/llmops/src/rag_adaptive_evaluation/run.py b/app/llmops/src/rag_adaptive_evaluation/run.py index bf8de6c..4acc4c7 100644 --- a/app/llmops/src/rag_adaptive_evaluation/run.py +++ b/app/llmops/src/rag_adaptive_evaluation/run.py @@ -474,61 +474,60 @@ def go(args): return {"response": value["generation"]} def go_evaluation(args): - if args.query_evaluation_dataset_csv_path: - # import pandas as pd - # from tqdm import tqdm + if args.evaluation_dataset_csv_path: - # df = pd.read_csv(args.query_evaluation_dataset_csv_path) + import pandas as pd + + df = pd.read_csv(args.evaluation_dataset_csv_path) + dataset_name = os.path.basename(args.evaluation_dataset_csv_path).split('.')[0] + + # df contains columns of question and answer + examples = df[[args.evaluation_dataset_column_question, args.evaluation_dataset_column_answer]].values.tolist() + inputs = [{"question": input_prompt} for input_prompt, _ in examples] + outputs = [{"answer": output_answer} for _, output_answer in examples] + + # Programmatically create a dataset in LangSmith client = Client() - # # Create inputs and reference outputs - # examples = [ - # ( - # "Which country is Mount Kilimanjaro located in?", - # "Mount Kilimanjaro is located in Tanzania.", - # ), - # ( - # "What is Earth's lowest point?", - # "Earth's lowest point is The Dead Sea.", - # ), - # ] - # inputs = [{"question": input_prompt} for input_prompt, _ in examples] - # outputs = [{"answer": output_answer} for _, output_answer in examples] + dataset = client.create_dataset( + dataset_name = dataset_name, + description = "A sample dataset in LangSmith." + ) - # # Programmatically create a dataset in LangSmith - # dataset = client.create_dataset( - # dataset_name = "Sample dataset", - # description = "A sample dataset in LangSmith." 
- # ) + # Add examples to the dataset + client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id) - # # Add examples to the dataset - # client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id) + + args.ls_chat_model_evaluator = None if args.ls_chat_model_evaluator == 'None' else args.ls_chat_model_evaluator.split(',') def target(inputs: dict) -> dict: new_args = argparse.Namespace(**vars(args)) new_args.query = inputs["question"] return go(new_args) + ls_evaluators = [] + if args.ls_chat_model_evaluator: + for evaluator in args.ls_chat_model_evaluator: + if evaluator == 'moonshot': + ls_evaluators.append(moonshot_evaluator_correctness) + ls_evaluators.append(moonshot_evaluator_faithfulness) + elif evaluator == 'deepseek': + ls_evaluators.append(deepseek_evaluator_correctness) + ls_evaluators.append(deepseek_evaluator_faithfulness) + elif evaluator == 'gemini': + ls_evaluators.append(gemini_evaluator_correctness) + ls_evaluators.append(gemini_evaluator_faithfulness) # After running the evaluation, a link will be provided to view the results in langsmith experiment_results = client.evaluate( target, data = "Sample dataset", - evaluators = [ - moonshot_evaluator_correctness, - deepseek_evaluator_correctness, - gemini_evaluator_correctness, - gemini_evaluator_faithfulness, - deepseek_evaluator_faithfulness, - moonshot_evaluator_faithfulness - # can add multiple evaluators here - ], + evaluators = ls_evaluators, experiment_prefix = "first-eval-in-langsmith", max_concurrency = 1, ) - if __name__ == "__main__": parser = argparse.ArgumentParser(description="Adaptive AG") @@ -541,12 +540,26 @@ if __name__ == "__main__": ) parser.add_argument( - "--query_evaluation_dataset_csv_path", + "--evaluation_dataset_csv_path", type=str, help="Path to the query evaluation dataset", default=None, ) + parser.add_argument( + "--evaluation_dataset_column_question", + type=str, + help="Column name for the questions in the evaluation dataset", + default="question", + ) + + parser.add_argument( + "--evaluation_dataset_column_answer", + type=str, + help="Column name for the groundtruth answers in the evaluation dataset", + default="groundtruth", + ) + parser.add_argument( "--input_chromadb_artifact", type=str, @@ -568,7 +581,14 @@ if __name__ == "__main__": help="Chat model provider" ) + parser.add_argument( + "--ls_chat_model_evaluator", + type=str, + help="list of Chat model providers for evaluation", + required=False, + default="None" + ) + args = parser.parse_args() - - # go(args) + go_evaluation(args) \ No newline at end of file From afbb34079abb8548f7a5b0ac4f651f4436a9d6d8 Mon Sep 17 00:00:00 2001 From: leehk Date: Thu, 13 Mar 2025 22:19:30 +0800 Subject: [PATCH 6/7] Ready for dataset evaluation --- app/llmops/config.yaml | 5 +++-- app/llmops/src/rag_adaptive_evaluation/run.py | 22 ++++++++++++------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/app/llmops/config.yaml b/app/llmops/config.yaml index 33383ea..4ea1d94 100644 --- a/app/llmops/config.yaml +++ b/app/llmops/config.yaml @@ -11,13 +11,14 @@ etl: embedding_model: paraphrase-multilingual-mpnet-base-v2 rag: run_id_chromadb: None - chat_model_provider: gemini + chat_model_provider: deepseek testing: query: "如何治疗乳腺癌?" 
evaluation: - evaluation_dataset_csv_path: "../../../../data/qa_datasets.csv" + evaluation_dataset_csv_path: "../../../../data/qa_dataset_01.csv" evaluation_dataset_column_question: question evaluation_dataset_column_answer: answer ls_chat_model_provider: - gemini + - deepseek - moonshot \ No newline at end of file diff --git a/app/llmops/src/rag_adaptive_evaluation/run.py b/app/llmops/src/rag_adaptive_evaluation/run.py index 4acc4c7..275b4d3 100644 --- a/app/llmops/src/rag_adaptive_evaluation/run.py +++ b/app/llmops/src/rag_adaptive_evaluation/run.py @@ -3,6 +3,8 @@ import logging import argparse import mlflow import shutil +import langsmith + from decouple import config from langchain_google_genai import ChatGoogleGenerativeAI from langchain_deepseek import ChatDeepSeek @@ -10,6 +12,7 @@ from langchain_community.llms.moonshot import Moonshot from langchain_huggingface import HuggingFaceEmbeddings from langchain_community.vectorstores.chroma import Chroma + from typing import List from typing_extensions import TypedDict @@ -489,13 +492,16 @@ def go_evaluation(args): # Programmatically create a dataset in LangSmith client = Client() - dataset = client.create_dataset( - dataset_name = dataset_name, - description = "A sample dataset in LangSmith." - ) - - # Add examples to the dataset - client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id) + try: + # Create a dataset + dataset = client.create_dataset( + dataset_name = dataset_name, + description = "An evaluation dataset in LangSmith." + ) + # Add examples to the dataset + client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id) + except langsmith.utils.LangSmithConflictError: + pass args.ls_chat_model_evaluator = None if args.ls_chat_model_evaluator == 'None' else args.ls_chat_model_evaluator.split(',') @@ -521,7 +527,7 @@ def go_evaluation(args): # After running the evaluation, a link will be provided to view the results in langsmith experiment_results = client.evaluate( target, - data = "Sample dataset", + data = dataset_name, evaluators = ls_evaluators, experiment_prefix = "first-eval-in-langsmith", max_concurrency = 1, From 6471626497b83943515db5c4402a5714511e10ab Mon Sep 17 00:00:00 2001 From: leehk Date: Mon, 24 Mar 2025 13:35:40 +0800 Subject: [PATCH 7/7] revision for 3 questions testing --- .gitignore | 3 ++- app/llmops/config.yaml | 8 ++++---- app/llmops/src/etl_chromadb_pdf/run.py | 6 +++--- app/streamlit/initialize_sentence_transformer.py | 4 +++- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/.gitignore b/.gitignore index ffc3c5e..8d92113 100644 --- a/.gitignore +++ b/.gitignore @@ -208,4 +208,5 @@ data/* **/*.zip **/llm-examples/* **/*.ipynb_checkpoints -**/*.ipynb \ No newline at end of file +**/*.ipynb +**/transformer_model/* \ No newline at end of file diff --git a/app/llmops/config.yaml b/app/llmops/config.yaml index 4ea1d94..834fce8 100644 --- a/app/llmops/config.yaml +++ b/app/llmops/config.yaml @@ -15,10 +15,10 @@ rag: testing: query: "如何治疗乳腺癌?" 
evaluation: - evaluation_dataset_csv_path: "../../../../data/qa_dataset_01.csv" + evaluation_dataset_csv_path: "../../../../data/qa_dataset_20240321a.csv" evaluation_dataset_column_question: question evaluation_dataset_column_answer: answer ls_chat_model_provider: - - gemini - - deepseek - - moonshot \ No newline at end of file + - gemini + - deepseek + - moonshot diff --git a/app/llmops/src/etl_chromadb_pdf/run.py b/app/llmops/src/etl_chromadb_pdf/run.py index edaaa01..9b2a82b 100644 --- a/app/llmops/src/etl_chromadb_pdf/run.py +++ b/app/llmops/src/etl_chromadb_pdf/run.py @@ -105,7 +105,7 @@ def go(args): documents_folder = os.path.splitext(os.path.basename(artifact_local_path))[0] text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder( - chunk_size=1000, chunk_overlap=500 + chunk_size=15000, chunk_overlap=7500 ) ls_docs = [] @@ -113,7 +113,7 @@ def go(args): for file in files: if file.endswith(".pdf"): read_text = extract_chinese_text_from_pdf(os.path.join(root, file)) - document = Document(metadata={"file": file}, page_content=read_text) + document = Document(metadata={"file": f"{documents_folder}/{file}"}, page_content=read_text) ls_docs.append(document) doc_splits = text_splitter.split_documents(ls_docs) @@ -138,7 +138,7 @@ def go(args): if __name__ == "__main__": - parser = argparse.ArgumentParser(description="A very basic data cleaning") + parser = argparse.ArgumentParser(description="ETL for ChromaDB with readable PDF") parser.add_argument( "--input_artifact", diff --git a/app/streamlit/initialize_sentence_transformer.py b/app/streamlit/initialize_sentence_transformer.py index 3026701..937c70d 100644 --- a/app/streamlit/initialize_sentence_transformer.py +++ b/app/streamlit/initialize_sentence_transformer.py @@ -4,4 +4,6 @@ from sentence_transformers import SentenceTransformer EMBEDDING_MODEL = config("EMBEDDING_MODEL", cast=str, default="paraphrase-multilingual-mpnet-base-v2") # Initialize embedding model -model = SentenceTransformer(EMBEDDING_MODEL) \ No newline at end of file +model = SentenceTransformer(EMBEDDING_MODEL) + +model.save("./transformer_model/paraphrase-multilingual-mpnet-base-v2")