10. Summary Evaluators

Some metrics can only be defined at the level of an entire experiment, not for each individual run.

For example, you may want to compute a classifier's aggregate evaluation score across all runs of an experiment started from a dataset.

These are called summary_evaluators.

Instead of a single Run and Example, these evaluators receive a list of each.
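
In code form, a summary evaluator is simply a function that accepts the full lists of runs and examples and returns one aggregate result. The sketch below shows the minimal shape (the function and metric names are illustrative, not from the original):

from typing import List
from langsmith.schemas import Example, Run

def my_summary_evaluator(runs: List[Run], examples: List[Example]) -> dict:
    # Receives every Run and Example of the experiment at once
    # and returns a single aggregate metric for the whole experiment.
    score = sum(1 for run in runs if run.error is None) / len(runs)
    return {"key": "success_rate", "score": score}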


# installation
# !pip install -qU langsmith langchain-teddynote


# Manage the API key as an environment variable via a .env configuration file
from dotenv import load_dotenv

# Load the API key information
load_dotenv()


True


# Set up LangSmith tracing. https://smith.langchain.com
# !pip install -qU langchain-teddynote
from langchain_teddynote import logging

# Enter a project name.
logging.langsmith("CH16-Evaluations")


Define functions for RAG performance testing

We will create a RAG system to use for testing.

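The original notebook cell is not shown on this page; the following is a minimal sketch of one way to build such a RAG system with standard LangChain components (the document path, chunk sizes, and variable names are assumptions, not the original code):

from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

# Load a source document and split it into chunks (path is illustrative)
docs = PyMuPDFLoader("data/sample.pdf").load()
splits = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50).split_documents(docs)

# Index the chunks in FAISS and expose a retriever
retriever = FAISS.from_documents(splits, OpenAIEmbeddings()).as_retriever()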

Create functions that generate answers to a question using both the GPT-4o-mini model and an Ollama model.

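The original cell is omitted here; below is a sketch of how the two answer functions might look, reusing the retriever defined above (the prompt wording, Ollama model name, and helper names are assumptions):

from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain_ollama import ChatOllama

prompt = PromptTemplate.from_template(
    "Answer the question using only the context.\n\n#Context:\n{context}\n\n#Question:\n{question}"
)

def make_ask(llm):
    # Build a function that maps a dataset input to the model's answer,
    # keeping the question, retrieved context, and answer in the output.
    def ask(inputs: dict) -> dict:
        context = retriever.invoke(inputs["question"])
        chain = prompt | llm | StrOutputParser()
        answer = chain.invoke({"context": context, "question": inputs["question"]})
        return {"question": inputs["question"], "context": context, "answer": answer}
    return ask

gpt_chain = make_ask(ChatOpenAI(model="gpt-4o-mini", temperature=0))
ollama_chain = make_ask(ChatOllama(model="llama3"))  # Ollama model name is illustrative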

OpenAIRelevanceGrader is used to evaluate whether the question, the retrieved context, and the answer are relevant to one another; a sketch of its use follows this list.

  • target="retrieval-question": evaluates whether the question and the retrieved context are relevant.

  • target="retrieval-answer": evaluates whether the answer and the retrieved context are relevant.
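
The corresponding notebook cells are omitted on this page; the sketch below shows how the two graders might be created and called (the constructor and invocation details are assumptions about the langchain_teddynote API, not verified code):

from langchain_openai import ChatOpenAI
from langchain_teddynote.evaluator import OpenAIRelevanceGrader

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# Grader for question <-> retrieved-context relevance
rq_grader = OpenAIRelevanceGrader(llm=llm, target="retrieval-question").create()

# Grader for answer <-> retrieved-context relevance
ra_grader = OpenAIRelevanceGrader(llm=llm, target="retrieval-answer").create()

# Each grader is assumed to return a result whose score is "yes" or "no"
rq_grader.invoke({"input": "sample question", "context": "retrieved context"})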


A summary evaluator that aggregates the relevance assessments

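The original cell is omitted; a sketch of a summary evaluator combining both graders follows. It counts a run as relevant only when the question-context and answer-context graders both return "yes" (the grader output shape is an assumption):

from typing import List
from langsmith.schemas import Example, Run

def relevance_score_summary_evaluator(runs: List[Run], examples: List[Example]) -> dict:
    correct = 0
    for run, example in zip(runs, examples):
        question = example.inputs["question"]
        context = run.outputs["context"]
        answer = run.outputs["answer"]

        # A run counts as relevant only if both graders say "yes"
        rq = rq_grader.invoke({"input": question, "context": context})
        ra = ra_grader.invoke({"input": answer, "context": context})
        if rq.score == "yes" and ra.score == "yes":
            correct += 1

    # One aggregate score for the entire experiment
    return {"key": "relevance_score", "score": correct / len(runs)}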

Proceed with the evaluation.

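A sketch of the evaluate call, passing the function above through the summary_evaluators parameter (the dataset name and experiment prefix are illustrative):

from langsmith.evaluation import evaluate

experiment_results = evaluate(
    gpt_chain,
    data="RAG_EVAL_DATASET",  # dataset name is illustrative
    summary_evaluators=[relevance_score_summary_evaluator],
    experiment_prefix="SUMMARY_EVAL",
)

The same call can be repeated with ollama_chain to compare the two models under the same aggregate metric.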

Check the results.

(Note) The summary evaluation score cannot be viewed on individual dataset examples; it is only visible at the experiment (Experiment) level.
