# Configuration file for managing the API KEY as an environment variable
from dotenv import load_dotenv

# Load the API KEY information
load_dotenv()
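load_dotenv() returns True when the .env file is found and loaded. As a quick sanity check you can confirm that the expected keys are now in the environment; the key names below are assumptions based on what the rest of this chapter uses.

# Sanity check that the expected keys were loaded (key names are assumptions)
import os

print("OPENAI_API_KEY set:", bool(os.getenv("OPENAI_API_KEY")))
print("LANGCHAIN_API_KEY set:", bool(os.getenv("LANGCHAIN_API_KEY")))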
# Set up LangSmith tracing. https://smith.langchain.com
# !pip install -qU langchain-teddynote
from langchain_teddynote import logging
# Enter a project name.
logging.langsmith("CH16-Evaluations")
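The helper above enables LangSmith tracing for the named project. If you are not using langchain-teddynote, the same effect can be achieved with the standard LangSmith environment variables; the sketch below mirrors the project name used above and assumes LANGCHAIN_API_KEY is already set via the .env file.

# Equivalent setup without langchain-teddynote (sketch)
import os

os.environ["LANGCHAIN_TRACING_V2"] = "true"           # turn on LangSmith tracing
os.environ["LANGCHAIN_PROJECT"] = "CH16-Evaluations"  # project name shown in the LangSmith UI
# LANGCHAIN_API_KEY is read from the environment loaded by load_dotenv()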
Now we can compare these example runs with a pairwise evaluation. The evaluator below asks an LLM judge to decide which of two experiments' answers to the same question is better.
from langchain import hub
from langchain_openai import ChatOpenAI
from langsmith.schemas import Example, Run
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langsmith.evaluation import evaluate
def evaluate_pairwise(runs: list, example) -> dict:
    """
    A simple pairwise evaluator that scores two answers by how detailed and informative they are.
    """
    # Store a score per run
    scores = {}
    for i, run in enumerate(runs):
        scores[run.id] = i

    # The paired runs for this example and the original question
    answer_a = runs[0].outputs["answer"]
    answer_b = runs[1].outputs["answer"]
    question = example.inputs["question"]

    # LLM judge with deterministic output
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

    # Structured grading prompt
    grade_prompt = PromptTemplate.from_template(
        """
        You are an LLM judge. Compare the following two answers to a question and determine which one is better.
        The better answer is the one that is more detailed and informative.
        If the answer is not related to the question, it is not a good answer.

        #Question:
        {question}

        #Answer A:
        {answer_a}

        #Answer B:
        {answer_b}

        Output should be either `A` or `B`. Pick the answer that is better.

        #Preference:
        """
    )
    answer_grader = grade_prompt | llm | StrOutputParser()

    # Obtain the judge's preference
    score = answer_grader.invoke(
        {
            "question": question,
            "answer_a": answer_a,
            "answer_b": answer_b,
        }
    )

    # Map the preference back to the two runs
    if score == "A":  # Assistant A preferred
        scores[runs[0].id] = 1
        scores[runs[1].id] = 0
    elif score == "B":  # Assistant B preferred
        scores[runs[0].id] = 0
        scores[runs[1].id] = 1
    else:  # No clear preference
        scores[runs[0].id] = 0
        scores[runs[1].id] = 0

    return {"key": "ranked_preference", "scores": scores}
from langsmith.evaluation import evaluate_comparative

# Replace with the names or IDs of the two experiments you want to compare
evaluate_comparative(
    ["MODEL_COMPARE_EVAL-23908367", "MODEL_COMPARE_EVAL-a4a9f9ff"],
    # List of pairwise evaluators
    evaluators=[evaluate_pairwise],
)
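For context, the two experiment names passed above come from running evaluate twice against the same dataset, once per model, with a shared experiment prefix. The sketch below shows the shape of those calls; the dataset name and the two target functions are placeholders, not part of this example.

# Sketch of how the two compared experiments could be produced (all names are placeholders)
from langsmith.evaluation import evaluate

def ask_question_model_a(inputs: dict) -> dict:
    # Placeholder: call the first model / RAG chain and return its answer
    return {"answer": "..."}

def ask_question_model_b(inputs: dict) -> dict:
    # Placeholder: call the second model / RAG chain and return its answer
    return {"answer": "..."}

for target in (ask_question_model_a, ask_question_model_b):
    evaluate(
        target,
        data="RAG_EVAL_DATASET",                 # placeholder dataset name
        experiment_prefix="MODEL_COMPARE_EVAL",  # prefix matching the experiments compared above
    )

Once evaluate_comparative finishes, the results appear in LangSmith's pairwise comparison view for the dataset, under the ranked_preference feedback key returned by the evaluator.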