# Configuration file for managing the API KEY as an environment variable
from dotenv import load_dotenv

# Load the API KEY information
load_dotenv()
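load_dotenv() returns True when the .env file is found and loaded. As a quick sanity check you can confirm that the expected keys are now in the environment; the key names below are assumptions based on what the rest of this chapter uses.

# Sanity check that the expected keys were loaded (key names are assumptions)
import os

print("OPENAI_API_KEY set:", bool(os.getenv("OPENAI_API_KEY")))
print("LANGCHAIN_API_KEY set:", bool(os.getenv("LANGCHAIN_API_KEY")))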
# Set up LangSmith tracing. https://smith.langchain.com
# !pip install -qU langchain-teddynote
from langchain_teddynote import logging
# Enter a project name.
logging.langsmith("CH16-Evaluations")
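The helper above enables LangSmith tracing for the named project. If you are not using langchain-teddynote, the same effect can be achieved with the standard LangSmith environment variables; the sketch below mirrors the project name used above and assumes LANGCHAIN_API_KEY is already set via the .env file.

# Equivalent setup without langchain-teddynote (sketch)
import os

os.environ["LANGCHAIN_TRACING_V2"] = "true"           # turn on LangSmith tracing
os.environ["LANGCHAIN_PROJECT"] = "CH16-Evaluations"  # project name shown in the LangSmith UI
# LANGCHAIN_API_KEY is read from the environment loaded by load_dotenv()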
Now we can compare these example runs with a pairwise evaluation. The evaluator below asks an LLM judge to decide which of two experiments' answers to the same question is better.
from langchain import hub
from langchain_openai import ChatOpenAI
from langsmith.schemas import Example, Run
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langsmith.evaluation import evaluate
def evaluate_pairwise(runs: list, example) -> dict:
    """
    A simple pairwise evaluator that scores two answers by how detailed and informative they are.
    """
    # Store a score per run
    scores = {}
    for i, run in enumerate(runs):
        scores[run.id] = i

    # The paired runs for this example and the original question
    answer_a = runs[0].outputs["answer"]
    answer_b = runs[1].outputs["answer"]
    question = example.inputs["question"]

    # LLM judge with deterministic output
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

    # Structured grading prompt
    grade_prompt = PromptTemplate.from_template(
        """
        You are an LLM judge. Compare the following two answers to a question and determine which one is better.
        The better answer is the one that is more detailed and informative.
        If the answer is not related to the question, it is not a good answer.

        #Question:
        {question}

        #Answer A:
        {answer_a}

        #Answer B:
        {answer_b}

        Output should be either `A` or `B`. Pick the answer that is better.

        #Preference:
        """
    )
    answer_grader = grade_prompt | llm | StrOutputParser()

    # Obtain the judge's preference
    score = answer_grader.invoke(
        {
            "question": question,
            "answer_a": answer_a,
            "answer_b": answer_b,
        }
    )

    # Map the preference back to the two runs
    if score == "A":  # Assistant A preferred
        scores[runs[0].id] = 1
        scores[runs[1].id] = 0
    elif score == "B":  # Assistant B preferred
        scores[runs[0].id] = 0
        scores[runs[1].id] = 1
    else:  # No clear preference
        scores[runs[0].id] = 0
        scores[runs[1].id] = 0

    return {"key": "ranked_preference", "scores": scores}
from langsmith.evaluation import evaluate_comparative

# Replace with the names or IDs of the two experiments you want to compare
evaluate_comparative(
    ["MODEL_COMPARE_EVAL-23908367", "MODEL_COMPARE_EVAL-a4a9f9ff"],
    # List of pairwise evaluators
    evaluators=[evaluate_pairwise],
)
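For context, the two experiment names passed above come from running evaluate twice against the same dataset, once per model, with a shared experiment prefix. The sketch below shows the shape of those calls; the dataset name and the two target functions are placeholders, not part of this example.

# Sketch of how the two compared experiments could be produced (all names are placeholders)
from langsmith.evaluation import evaluate

def ask_question_model_a(inputs: dict) -> dict:
    # Placeholder: call the first model / RAG chain and return its answer
    return {"answer": "..."}

def ask_question_model_b(inputs: dict) -> dict:
    # Placeholder: call the second model / RAG chain and return its answer
    return {"answer": "..."}

for target in (ask_question_model_a, ask_question_model_b):
    evaluate(
        target,
        data="RAG_EVAL_DATASET",                 # placeholder dataset name
        experiment_prefix="MODEL_COMPARE_EVAL",  # prefix matching the experiments compared above
    )

Once evaluate_comparative finishes, the results appear in LangSmith's pairwise comparison view for the dataset, under the ranked_preference feedback key returned by the evaluator.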