# main.py
import os
import wandb
from config import GOOGLE_API_KEY, WANDB_API_KEY, LANGSMITH_API_KEY, LANGCHAIN_PROJECT
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.callbacks import LangChainTracer
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
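
# NOTE: config.py is not shown in this snippet; it is assumed to expose the
# constants imported above, for example (adjust to your own setup):
#   GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
#   WANDB_API_KEY = os.getenv("WANDB_API_KEY")
#   LANGSMITH_API_KEY = os.getenv("LANGSMITH_API_KEY")
#   LANGCHAIN_PROJECT = "gemini-langsmith-demo"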

# Set LangSmith environment variables
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_PROJECT"] = LANGCHAIN_PROJECT
os.environ["LANGCHAIN_API_KEY"] = LANGSMITH_API_KEY

# Initialize Weights & Biases
wandb.login(key=WANDB_API_KEY)
wandb_run = wandb.init(project=LANGCHAIN_PROJECT, entity="aimingmed")

# Initialize the Gemini model with a LangSmith tracer attached
tracer = LangChainTracer()
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash-001", google_api_key=GOOGLE_API_KEY, callbacks=[tracer])

# Example usage of the Gemini model
prompt_template = PromptTemplate(template="Write a short poem about the sun.", input_variables=[])
chain = LLMChain(llm=llm, prompt=prompt_template)
response = chain.run({})
# print(response)

import uuid
from datetime import datetime, timezone

from langsmith import Client
from langsmith.evaluation import EvaluationResult
# Initialize LangSmith client
client = Client(api_key=LANGSMITH_API_KEY)  # Key loaded from config above
project_name = "my-gemini-evaluation"  # LangSmith project used for these evaluation runs


def evaluate_gemini_response(prompt, expected_response, gemini_response=None):
    """Evaluates Gemini's response against an expected response and logs to LangSmith.

    If gemini_response is not supplied, the prompt is first sent to Gemini.
    """
    # 1. Create the run in LangSmith with a client-generated id (manual logging)
    run_id = uuid.uuid4()
    start_time = datetime.now(timezone.utc)
    client.create_run(
        id=run_id,
        name="gemini-evaluation",
        run_type="chain",
        project_name=project_name,
        inputs={"prompt": prompt, "expected_response": expected_response},
        start_time=start_time,
    )
    try:  # Use a try-except block for proper error handling
        if gemini_response is None:
            # Call Gemini with the actual prompt, reusing the module-level llm
            prompt_template = PromptTemplate(template="{prompt}", input_variables=["prompt"])
            chain = LLMChain(llm=llm, prompt=prompt_template)
            gemini_response = chain.run({"prompt": prompt})
        end_time = datetime.now(timezone.utc)
        latency = (end_time - start_time).total_seconds()

        # 2. End the run (important!): attach outputs and mark it complete.
        # LangSmith derives latency from start/end times; it is also kept in outputs here.
        client.update_run(
            run_id,
            outputs={"gemini_response": gemini_response, "latency_s": latency},
            end_time=end_time,
        )

        # 3. Evaluate and attach the result to the run as feedback
        evaluation_result = evaluate_response(expected_response, gemini_response)
        client.create_feedback(
            run_id,
            key=evaluation_result.key,
            score=evaluation_result.score,
            comment=evaluation_result.comment,
        )
        return evaluation_result
    except Exception as e:
        # Record the error on the LangSmith run before giving up
        client.update_run(run_id, error=str(e), end_time=datetime.now(timezone.utc))
        print(f"Error during Gemini call or evaluation: {e}")
        return None  # Or handle the error as needed


def evaluate_response(expected_response, gemini_response):
    """Performs the actual evaluation logic. Customize this!"""
    # Example 1: Exact match (simple, but often not realistic)
    if expected_response.strip().lower() == gemini_response.strip().lower():
        score = 1.0  # Perfect match
        feedback = "Exact match!"
    # Example 2: Keyword overlap (more flexible)
    elif any(keyword in gemini_response.lower() for keyword in expected_response.lower().split()):
        score = 0.7  # Partial match (adjust score as needed)
        feedback = "Keyword overlap."
    # Example 3: Semantic similarity (requires an external library/API) - Advanced!
    # See the hedged semantic-similarity sketch below this function.
    else:
        score = 0.0
        feedback = "No match."

    # Create a LangSmith EvaluationResult object (the key field is required)
    evaluation_result = EvaluationResult(
        key="correctness",
        score=score,
        value=gemini_response,  # The actual response being evaluated
        comment=feedback,
        # Other context worth keeping, such as the expected response
        evaluator_info={"expected_response": expected_response},
    )
    return evaluation_result
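

# Hedged sketch (an addition, not part of the original script): one way to score
# "Example 3" above with semantic similarity, using the sentence-transformers
# package (pip install sentence-transformers). Uncomment and wire the score into
# evaluate_response if exact/keyword matching is too strict for your use case.
# def semantic_similarity_score(expected_response, gemini_response):
#     from sentence_transformers import SentenceTransformer, util
#     model = SentenceTransformer("all-MiniLM-L6-v2")
#     embeddings = model.encode([expected_response, gemini_response])
#     return float(util.cos_sim(embeddings[0], embeddings[1]))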


# Example usage:
prompt = "Translate 'Hello, world!' to French."
expected_response = "Bonjour le monde !"
gemini_response = "Bonjour monde !"  # Or pass gemini_response=None to call Gemini for real
evaluation = evaluate_gemini_response(prompt, expected_response, gemini_response)
print(f"Evaluation Score: {evaluation.score}, Feedback: {evaluation.comment}")

# Another example
prompt = "What is the capital of France?"
expected_response = "Paris"
gemini_response = "The capital of France is Paris."
evaluation = evaluate_gemini_response(prompt, expected_response, gemini_response)
print(f"Evaluation Score: {evaluation.score}, Feedback: {evaluation.comment}")
wandb.finish()