diff --git a/app/llmops/src/rag_adaptive_evaluation/data_models.py b/app/llmops/src/rag_adaptive_evaluation/data_models.py
index 680cfbd..aeb193d 100644
--- a/app/llmops/src/rag_adaptive_evaluation/data_models.py
+++ b/app/llmops/src/rag_adaptive_evaluation/data_models.py
@@ -1,4 +1,4 @@
-from typing import Literal, List
+from typing import Literal
 
 from pydantic import BaseModel, Field
 
diff --git a/app/llmops/src/rag_adaptive_evaluation/evaluators.py b/app/llmops/src/rag_adaptive_evaluation/evaluators.py
index 17b1b90..0d16dea 100644
--- a/app/llmops/src/rag_adaptive_evaluation/evaluators.py
+++ b/app/llmops/src/rag_adaptive_evaluation/evaluators.py
@@ -1,99 +1,77 @@
+import os
 from decouple import config
-from openevals.llm import create_llm_as_judge
-from openevals.prompts import (
-    CORRECTNESS_PROMPT,
-    CONCISENESS_PROMPT,
-    HALLUCINATION_PROMPT
-)
+
 from langchain_google_genai import ChatGoogleGenerativeAI
 from langchain_deepseek import ChatDeepSeek
 from langchain_community.llms.moonshot import Moonshot
 
-GEMINI_API_KEY = config("GOOGLE_API_KEY", cast=str)
-DEEKSEEK_API_KEY = config("DEEKSEEK_API_KEY", cast=str)
-MOONSHOT_API_KEY = config("MOONSHOT_API_KEY", cast=str)
+from pydantic import BaseModel, Field
 
-# correctness
-gemini_evaluator_correctness = create_llm_as_judge(
-    prompt=CORRECTNESS_PROMPT,
-    judge=ChatGoogleGenerativeAI(
+from prompts_library import CORRECTNESS_PROMPT
+
+os.environ["GOOGLE_API_KEY"] = config("GOOGLE_API_KEY", cast=str)
+os.environ["DEEPSEEK_API_KEY"] = config("DEEPSEEK_API_KEY", cast=str)
+os.environ["MOONSHOT_API_KEY"] = config("MOONSHOT_API_KEY", cast=str)
+
+
+# Define output schema for the evaluation
+class CorrectnessGrade(BaseModel):
+    score: int = Field(description="Numerical score (1-5) indicating the correctness of the response.")
+
+# TODO:
+# class RelevanceGrade(BaseModel):
+
+
+def gemini_evaluator_correctness(outputs: dict, reference_outputs: dict) -> int:
+    llm = ChatGoogleGenerativeAI(
         model="gemini-1.5-flash",
-        google_api_key=GEMINI_API_KEY,
         temperature=0.5,
-    ),
-    )
+    )
 
-deepseek_evaluator_correctness = create_llm_as_judge(
-    prompt=CORRECTNESS_PROMPT,
-    judge=ChatDeepSeek(
+    messages = [
+        {"role": "system", "content": CORRECTNESS_PROMPT},
+        {"role": "user", "content": f"""Ground Truth answer: {reference_outputs["answer"]};
+        Student's Answer: {outputs['response']}
+        """}
+    ]
+
+    response = llm.invoke(messages)
+
+    return CorrectnessGrade(score=int(response.content)).score
+
+
+def deepseek_evaluator_correctness(outputs: dict, reference_outputs: dict) -> int:
+    llm = ChatDeepSeek(
         model="deepseek-chat",
         temperature=0.5,
-        api_key=DEEKSEEK_API_KEY
-    ),
-    )
+    )
 
-moonshot_evaluator_correctness = create_llm_as_judge(
-    prompt=CORRECTNESS_PROMPT,
-    judge=Moonshot(
-        model="moonshot-v1-128k",
+    messages = [
+        {"role": "system", "content": CORRECTNESS_PROMPT},
+        {"role": "user", "content": f"""Ground Truth answer: {reference_outputs["answer"]};
+        Student's Answer: {outputs['response']}
+        """}
+    ]
+
+    response = llm.invoke(messages)
+
+    return CorrectnessGrade(score=int(response.content)).score
+
+
+def moonshot_evaluator_correctness(outputs: dict, reference_outputs: dict) -> int:
+    llm = Moonshot(
+        model="moonshot-v1-128k",
         temperature=0.5,
-        api_key=MOONSHOT_API_KEY
-    ),
-    )
+    )
 
-# conciseness
-gemini_evaluator_conciseness = create_llm_as_judge(
-    prompt=CONCISENESS_PROMPT,
-    judge=ChatGoogleGenerativeAI(
-        model="gemini-1.5-flash",
-        google_api_key=GEMINI_API_KEY,
-        temperature=0.5,
-    ),
-    )
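+    # Same judge prompt shape as the evaluators above: the grading rubric goes in
+    # the system message; the ground truth and candidate answer go in the user message.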
+    messages = [
+        {"role": "system", "content": CORRECTNESS_PROMPT},
+        {"role": "user", "content": f"""Ground Truth answer: {reference_outputs["answer"]};
+        Student's Answer: {outputs['response']}
+        """}
+    ]
 
-deepseek_evaluator_conciseness = create_llm_as_judge(
-    prompt=CONCISENESS_PROMPT,
-    judge=ChatDeepSeek(
-        model="deepseek-chat",
-        temperature=0.5,
-        api_key=DEEKSEEK_API_KEY
-    ),
-    )
-
-moonshot_evaluator_conciseness = create_llm_as_judge(
-    prompt=CONCISENESS_PROMPT,
-    judge=Moonshot(
-        model="moonshot-v1-128k",
-        temperature=0.5,
-        api_key=MOONSHOT_API_KEY
-    ),
-    )
-
-# hallucination
-gemini_evaluator_hallucination = create_llm_as_judge(
-    prompt=HALLUCINATION_PROMPT,
-    judge=ChatGoogleGenerativeAI(
-        model="gemini-1.5-flash",
-        google_api_key=GEMINI_API_KEY,
-        temperature=0.5,
-    ),
-    )
-
-deepseek_evaluator_hallucination = create_llm_as_judge(
-    prompt=HALLUCINATION_PROMPT,
-    judge=ChatDeepSeek(
-        model="deepseek-chat",
-        temperature=0.5,
-        api_key=DEEKSEEK_API_KEY
-    ),
-    )
-
-moonshot_evaluator_hallucination = create_llm_as_judge(
-    prompt=HALLUCINATION_PROMPT,
-    judge=Moonshot(
-        model="moonshot-v1-128k",
-        temperature=0.5,
-        api_key=MOONSHOT_API_KEY
-    ),
-    )
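+    # Moonshot here is a completion-style LLM wrapper, so invoke() returns a raw
+    # string rather than a chat message with .content; it is parsed directly below.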
+
+    response = llm.invoke(messages)
+
+    return CorrectnessGrade(score=int(response)).score
diff --git a/app/llmops/src/rag_adaptive_evaluation/prompts_library.py b/app/llmops/src/rag_adaptive_evaluation/prompts_library.py
index fcaf564..3bfed18 100644
--- a/app/llmops/src/rag_adaptive_evaluation/prompts_library.py
+++ b/app/llmops/src/rag_adaptive_evaluation/prompts_library.py
@@ -16,4 +16,39 @@ system_answer_grader = """You are a grader assessing whether an answer addresses
     Give a binary score 'yes' or 'no'. Yes' means that the answer resolves the question."""
 
 system_question_rewriter = """You a question re-writer that converts an input question to a better version that is optimized \n
-    for vectorstore retrieval. Look at the input and try to reason about the underlying semantic intent / meaning."""
\ No newline at end of file
+    for vectorstore retrieval. Look at the input and try to reason about the underlying semantic intent / meaning."""
+
+
+# Evaluation
+CORRECTNESS_PROMPT = """You are an impartial judge. Evaluate the Student Answer against the Ground Truth for conceptual similarity and correctness.
+You may also be given additional information that was used by the model to generate the output.
+
+Your task is to determine a numerical score called correctness based on the input and output.
+A definition of correctness and a grading rubric are provided below.
+You must use the grading rubric to determine your score.
+
+Metric definition:
+Correctness assesses the degree to which a provided output aligns with factual accuracy, completeness, logical
+consistency, and precise terminology. It evaluates the intrinsic validity of the output, independent of any
+external context. A higher score indicates higher adherence to factual accuracy, completeness, logical consistency,
+and precise terminology.
+
+Grading rubric:
+Correctness: Below are the details for different scores:
+  - 1: Major factual errors, highly incomplete, illogical, and uses incorrect terminology.
+  - 2: Significant factual errors, incomplete, noticeable logical flaws, and frequent terminology errors.
+  - 3: Minor factual errors, somewhat incomplete, minor logical inconsistencies, and occasional terminology errors.
+  - 4: Few to no factual errors, mostly complete, strong logical consistency, and accurate terminology.
+  - 5: Accurate, complete, logically consistent, and uses precise terminology.
+
+Reminder:
+  - Carefully read the input and output.
+  - Check for factual accuracy and completeness.
+  - Focus on correctness of information rather than style or verbosity.
+  - The goal is to evaluate the factual correctness and completeness of the response.
+  - Reply with only the numerical score, an integer between 1 and 5. Do not include "score:" or any other text.
+
+"""
+
diff --git a/app/llmops/src/rag_adaptive_evaluation/python_env.yml b/app/llmops/src/rag_adaptive_evaluation/python_env.yml
index 451cdb7..2278969 100644
--- a/app/llmops/src/rag_adaptive_evaluation/python_env.yml
+++ b/app/llmops/src/rag_adaptive_evaluation/python_env.yml
@@ -24,7 +24,6 @@ build_dependencies:
   - tavily-python
   - langchain_huggingface
   - pydantic
-  - openevals
 # Dependencies required to run the project.
 dependencies:
   - mlflow==2.8.1
\ No newline at end of file
diff --git a/app/llmops/src/rag_adaptive_evaluation/run.py b/app/llmops/src/rag_adaptive_evaluation/run.py
index 1fe7543..c629de7 100644
--- a/app/llmops/src/rag_adaptive_evaluation/run.py
+++ b/app/llmops/src/rag_adaptive_evaluation/run.py
@@ -38,30 +38,26 @@ from evaluators import (
     gemini_evaluator_correctness,
     deepseek_evaluator_correctness,
     moonshot_evaluator_correctness,
-    gemini_evaluator_conciseness,
-    deepseek_evaluator_conciseness,
-    moonshot_evaluator_conciseness,
-    gemini_evaluator_hallucination,
-    deepseek_evaluator_hallucination,
-    moonshot_evaluator_hallucination
+    # gemini_evaluator_conciseness,
+    # deepseek_evaluator_conciseness,
+    # moonshot_evaluator_conciseness,
+    # gemini_evaluator_hallucination,
+    # deepseek_evaluator_hallucination,
+    # moonshot_evaluator_hallucination
 )
 
 logging.basicConfig(level=logging.INFO, format="%(asctime)-15s %(message)s")
 logger = logging.getLogger()
 
-GEMINI_API_KEY = config("GOOGLE_API_KEY", cast=str)
-DEEKSEEK_API_KEY = config("DEEKSEEK_API_KEY", cast=str)
-MOONSHOT_API_KEY = config("MOONSHOT_API_KEY", cast=str)
-TAVILY_API_KEY = config("TAVILY_API_KEY", cast=str)
-LANGSMITH_API_KEY = config("LANGSMITH_API_KEY", cast=str)
-LANGSMITH_TRACING = config("LANGSMITH_TRACING", cast=str)
-LANGSMITH_PROJECT = config("LANGSMITH_PROJECT", cast=str)
-os.environ["TAVILY_API_KEY"] = TAVILY_API_KEY
+os.environ["GOOGLE_API_KEY"] = config("GOOGLE_API_KEY", cast=str)
+os.environ["DEEPSEEK_API_KEY"] = config("DEEPSEEK_API_KEY", cast=str)
+os.environ["MOONSHOT_API_KEY"] = config("MOONSHOT_API_KEY", cast=str)
+os.environ["TAVILY_API_KEY"] = config("TAVILY_API_KEY", cast=str)
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
-os.environ["LANGSMITH_API_KEY"] = LANGSMITH_API_KEY
-os.environ["LANGSMITH_TRACING"] = LANGSMITH_TRACING
+os.environ["LANGSMITH_API_KEY"] = config("LANGSMITH_API_KEY", cast=str)
+os.environ["LANGSMITH_TRACING"] = config("LANGSMITH_TRACING", cast=str)
 os.environ["LANGSMITH_ENDPOINT"] = "https://api.smith.langchain.com"
-os.environ["LANGSMITH_PROJECT"] = LANGSMITH_PROJECT
+os.environ["LANGSMITH_PROJECT"] = config("LANGSMITH_PROJECT", cast=str)
 
 
 def go(args):
@@ -95,12 +91,10 @@
             max_tokens=None,
             timeout=None,
             max_retries=2,
-            api_key=DEEKSEEK_API_KEY
         )
     elif args.chat_model_provider == 'gemini':
         llm = ChatGoogleGenerativeAI(
             model="gemini-1.5-flash",
-            google_api_key=GEMINI_API_KEY,
             temperature=0,
             max_retries=3,
             streaming=True
@@ -112,7 +106,6 @@
             max_tokens=None,
             timeout=None,
             max_retries=2,
-            api_key=MOONSHOT_API_KEY
         )
 
     # Load data from ChromaDB
@@ -479,7 +472,61 @@
         pprint("\n---\n")
 
     # Final generation
-    pprint(value["generation"])
+    print(value["generation"])
+
+    return {"response": value["generation"]}
+
+
+def go_evaluation(args):
+    if args.query_evaluation_dataset_csv_path:
+        # import pandas as pd
+        # from tqdm import tqdm
+
+        # df = pd.read_csv(args.query_evaluation_dataset_csv_path)
+        client = Client()
+        # # Create inputs and reference outputs
+        # examples = [
+        #     (
+        #         "Which country is Mount Kilimanjaro located in?",
+        #         "Mount Kilimanjaro is located in Tanzania.",
+        #     ),
+        #     (
+        #         "What is Earth's lowest point?",
+        #         "Earth's lowest point is The Dead Sea.",
+        #     ),
+        # ]
+
+        # inputs = [{"question": input_prompt} for input_prompt, _ in examples]
+        # outputs = [{"answer": output_answer} for _, output_answer in examples]
+
+        # # Programmatically create a dataset in LangSmith
+        # dataset = client.create_dataset(
+        #     dataset_name = "Sample dataset",
+        #     description = "A sample dataset in LangSmith."
+        # )
+
+        # # Add examples to the dataset
+        # client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id)
+
+        def target(inputs: dict) -> dict:
+            new_args = argparse.Namespace(**vars(args))
+            new_args.query = inputs["question"]
+            return go(new_args)
+
+        # After running the evaluation, a link will be provided to view the results in LangSmith
+        experiment_results = client.evaluate(
+            target,
+            data = "Sample dataset",
+            evaluators = [
+                moonshot_evaluator_correctness,
+                deepseek_evaluator_correctness,
+                gemini_evaluator_correctness
+                # can add multiple evaluators here
+            ],
+            experiment_prefix = "first-eval-in-langsmith",
+            max_concurrency = 1,
+        )
@@ -523,4 +570,5 @@
     args = parser.parse_args()
 
-    go(args)
\ No newline at end of file
+    # go(args)
+    go_evaluation(args)
\ No newline at end of file
diff --git a/app/llmops/src/rag_cot_evaluation/run.py b/app/llmops/src/rag_cot_evaluation/run.py
index aa773b1..06484df 100644
--- a/app/llmops/src/rag_cot_evaluation/run.py
+++ b/app/llmops/src/rag_cot_evaluation/run.py
@@ -14,18 +14,14 @@ from langchain_community.llms.moonshot import Moonshot
 
 logging.basicConfig(level=logging.INFO, format="%(asctime)-15s %(message)s")
 logger = logging.getLogger()
 
+os.environ["GOOGLE_API_KEY"] = config("GOOGLE_API_KEY", cast=str)
+os.environ["DEEPSEEK_API_KEY"] = config("DEEPSEEK_API_KEY", cast=str)
+os.environ["MOONSHOT_API_KEY"] = config("MOONSHOT_API_KEY", cast=str)
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
-GEMINI_API_KEY = config("GOOGLE_API_KEY", cast=str)
-DEEKSEEK_API_KEY = config("DEEKSEEK_API_KEY", cast=str)
-MOONSHOT_API_KEY = config("MOONSHOT_API_KEY", cast=str)
-LANGSMITH_API_KEY = config("LANGSMITH_API_KEY", cast=str)
-LANGSMITH_TRACING = config("LANGSMITH_TRACING", cast=str)
-LANGSMITH_PROJECT = config("LANGSMITH_PROJECT", cast=str)
-os.environ["TOKENIZERS_PARALLELISM"] = "false"
-os.environ["LANGSMITH_API_KEY"] = LANGSMITH_API_KEY
-os.environ["LANGSMITH_TRACING"] = LANGSMITH_TRACING
+os.environ["LANGSMITH_API_KEY"] = config("LANGSMITH_API_KEY", cast=str)
+os.environ["LANGSMITH_TRACING"] = config("LANGSMITH_TRACING", cast=str)
 os.environ["LANGSMITH_ENDPOINT"] = "https://api.smith.langchain.com"
-os.environ["LANGSMITH_PROJECT"] = LANGSMITH_PROJECT
+os.environ["LANGSMITH_PROJECT"] = config("LANGSMITH_PROJECT", cast=str)
 
 
 def go(args):
@@ -68,14 +64,12 @@
             max_tokens=None,
             timeout=None,
             max_retries=2,
-            api_key=DEEKSEEK_API_KEY
         )
     elif args.chat_model_provider == "gemini":
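+        # No explicit google_api_key kwarg needed: ChatGoogleGenerativeAI falls back
+        # to the GOOGLE_API_KEY environment variable exported at the top of this module.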
"gemini": # Initialize Gemini model llm = ChatGoogleGenerativeAI( model="gemini-1.5-flash", - google_api_key=GEMINI_API_KEY, temperature=0, max_retries=3 ) @@ -88,7 +82,6 @@ def go(args): max_tokens=None, timeout=None, max_retries=2, - api_key=MOONSHOT_API_KEY )