# Environment configuration: load API keys from a local .env file
# so downstream libraries can read them from os.environ.
from dotenv import load_dotenv

# Returns True when a .env file was found and loaded.
env_loaded = load_dotenv()
# Output: True
# Enable LangSmith run tracking. https://smith.langchain.com
# Requires: pip install -qU langchain-teddynote
from langchain_teddynote import logging

# All runs from this notebook are grouped under this project name.
project_name = "CH16-Evaluations"
logging.langsmith(project_name)
# Define functions for RAG performance testing
# We will create a RAG system to use for testing.
# Create a function named `ask_question`. It receives a dictionary as
# `inputs` and returns a dictionary containing the `answer`.
# Embedding distance based Evaluator
# If multiple embedding models are used for one metric, the results are averaged.
from myrag import PDFRAG
from langchain_openai import ChatOpenAI

# Build the RAG pipeline over the December-2023 SPRI AI Brief PDF,
# answering with gpt-4o-mini at temperature 0 for reproducible output.
pdf_rag = PDFRAG(
    "data/SPRI_AI_Brief_2023년12월호_F.pdf",
    ChatOpenAI(model="gpt-4o-mini", temperature=0),
)

# Retriever over the PDF chunks, then the question-answering chain on top.
doc_retriever = pdf_rag.create_retriever()
chain = pdf_rag.create_chain(doc_retriever)

# Smoke test: ask a question the document can answer.
chain.invoke("What is the name of the generative AI developed by Samsung Electronics?")
"The name of the generated AI developed by the Samsung is'Samsung Gauss'."
def ask_question(inputs: dict) -> dict:
    """Answer one evaluation example through the RAG chain.

    Args:
        inputs: Dictionary holding the user query under the "question" key.

    Returns:
        Dictionary with the chain's response under the "answer" key.
    """
    question = inputs["question"]
    response = chain.invoke(question)
    return {"answer": response}
from langsmith.evaluation import LangChainStringEvaluator
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_upstage import UpstageEmbeddings
from langchain_openai import OpenAIEmbeddings
import os
# Let the HuggingFace tokenizer run its workers in parallel.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# BGE-M3 multilingual embedding model, executed on CPU.
model_name = "BAAI/bge-m3"
hf_embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={"device": "cpu"},  # set "cuda" to use a GPU instead
    # encode_kwargs={"normalize_embeddings": True},
)
# Evaluator scoring answers by cosine embedding distance with the
# HuggingFace model defined above (OpenAIEmbeddings is the default
# if no "embeddings" entry is supplied).
hf_evaluator_config = {
    "embeddings": hf_embeddings,
    "distance_metric": "cosine",  # other options: "euclidean", "chebyshev", "hamming", "manhattan"
}
hf_embedding_evaluator = LangChainStringEvaluator(
    "embedding_distance",
    config=hf_evaluator_config,
)
# Same metric, but scored with Upstage's solar embedding model and
# Euclidean distance (overriding the OpenAIEmbeddings default).
upstage_evaluator_config = {
    "embeddings": UpstageEmbeddings(model="solar-embedding-1-large-query"),
    "distance_metric": "euclidean",  # other options: "cosine", "chebyshev", "hamming", "manhattan"
}
upstage_embedding_evaluator = LangChainStringEvaluator(
    "embedding_distance",
    config=upstage_evaluator_config,
)
# Third evaluator: OpenAI's small text-embedding-3 model with
# Euclidean distance, for comparison against the two above.
openai_evaluator_config = {
    "embeddings": OpenAIEmbeddings(model="text-embedding-3-small"),
    "distance_metric": "euclidean",  # other options: "cosine", "chebyshev", "hamming", "manhattan"
}
openai_embedding_evaluator = LangChainStringEvaluator(
    "embedding_distance",
    config=openai_evaluator_config,
)