11. Groundedness (Hallucination) Assessment

This evaluator checks whether an answer is grounded in the given context.

It can be used to detect hallucination in a RAG system's answers.

In this tutorial, we will look at how to evaluate groundedness using the Upstage Groundedness Checker and a custom-built Groundedness Checker.


# installation
# !pip install -qU langsmith langchain-teddynote


# Configuration file for managing API keys as environment variables
from dotenv import load_dotenv

# Load API KEY information
load_dotenv()


True


# Set up LangSmith tracking. https://smith.langchain.com
# !pip install -qU langchain-teddynote
from langchain_teddynote import logging

# Enter a project name.
logging.langsmith("CH16-Evaluations")


Define functions for RAG performance testing

We will create a RAG system to use for testing.

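The key requirement for this test function is its output shape: to judge groundedness, the evaluator needs the retrieved context alongside the generated answer. The sketch below illustrates that shape; `fake_retriever` and `fake_llm` are hypothetical stand-ins, not the tutorial's actual retriever and chain.

```python
# Minimal sketch of the RAG target function used for evaluation.
# `fake_retriever` and `fake_llm` are hypothetical stubs standing in
# for a real vector-store retriever and LLM chain; what matters is
# that the function returns "context", "answer", and "question".

def fake_retriever(question: str) -> list[dict]:
    # A real retriever would return documents relevant to the question.
    return [{"page_content": "LangSmith is a platform for tracing and evaluating LLM applications."}]

def fake_llm(question: str, context: str) -> str:
    # A real chain would prompt an LLM with the question and context.
    return "LangSmith is a platform for tracing and evaluating LLM applications."

def context_answer_rag_answer(inputs: dict) -> dict:
    docs = fake_retriever(inputs["question"])
    context = "\n\n".join(d["page_content"] for d in docs)
    return {
        "context": context,
        "answer": fake_llm(inputs["question"], context),
        "question": inputs["question"],
    }
```

A groundedness evaluator can then compare `outputs["answer"]` against `outputs["context"]` for each run.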

UpstageGroundednessCheck

To use Upstage's Groundedness Check feature, you must first obtain an Upstage API key.


Define the UpstageGroundednessCheck evaluator. It will later be passed to the evaluate function.


langchain_teddynote Groundedness Checker

Create a custom Groundedness Checker that uses an OpenAI model to check groundedness.

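The idea behind an LLM-based checker is to prompt the model to grade whether every claim in the answer is supported by the context. The sketch below illustrates that idea only; the prompt text and function names are hypothetical, not the `langchain_teddynote` implementation, and `llm` stands in for any callable (e.g. a ChatOpenAI call) mapping a prompt to a response string.

```python
# Illustrative sketch of a custom LLM-based groundedness checker.
# `llm` is any callable that maps a prompt string to a response string;
# the prompt and names here are hypothetical, not the library's code.

GROUNDEDNESS_PROMPT = (
    "You are a grader. Given a retrieved context and an answer, reply "
    "'yes' if every claim in the answer is supported by the context, "
    "otherwise reply 'no'.\n\nContext:\n{context}\n\nAnswer:\n{answer}"
)

def check_groundedness(llm, context: str, answer: str) -> bool:
    prompt = GROUNDEDNESS_PROMPT.format(context=context, answer=answer)
    # Normalize the response and treat anything starting with "yes" as grounded.
    return llm(prompt).strip().lower().startswith("yes")
```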

Run the groundedness evaluation.

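In the tutorial this step is done with LangSmith's `evaluate()` against a dataset. The loop below is only a simplified local illustration of the flow (all names are hypothetical): run the target function on each example, then apply each evaluator to the run's outputs.

```python
# Simplified local illustration of an evaluation run: call the target
# function on each example, then apply each evaluator to the outputs.
# LangSmith's evaluate() additionally logs runs and scores to the
# platform; the names here are hypothetical.
from types import SimpleNamespace

def run_local_evaluation(examples, target_fn, evaluators):
    all_scores = []
    for inputs in examples:
        outputs = target_fn(inputs)
        run = SimpleNamespace(outputs=outputs)  # mimic a run object
        scores = {}
        for ev in evaluators:
            result = ev(run, inputs)
            scores[result["key"]] = result["score"]
        all_scores.append(scores)
    return all_scores
```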

Comprehensive evaluation of datasets using Summary Evaluators

This is useful for computing a groundedness score over the entire dataset at once. (The previous steps evaluated each example individually.)

