13. Repeat evaluation

Repeat evaluation

You can run an experiment multiple times by specifying a number of repetitions.

Repeating the evaluation multiple times is useful in the following cases:

For larger evaluation sets
For chains that can generate variable responses
For evaluators that can generate variable scores (e.g. LLM-as-judge)

Reference

https://docs.smith.langchain.com/how_to_guides/evaluation/evaluate_llm_application#evaluate-on-a-dataset-with-repetitions

Copy

# installation
# !pip install -qU langsmith langchain-teddynote

Copy

# Use a .env configuration file to manage API keys as environment variables.
from dotenv import load_dotenv

# Load the API key settings from the .env file into the process environment.
# The call returns a boolean indicating whether any variable was set.
load_dotenv()

Copy

 True

Copy

# Set up LangSmith tracking. https://smith.langchain.com
# !pip install -qU langchain-teddynote
# NOTE: `logging` here is the langchain_teddynote helper module; this import
# shadows the standard-library `logging` name for the rest of the script.
from langchain_teddynote import logging

# Enter a project name; tracking starts under this LangSmith project.
logging.langsmith("CH16-Evaluations")

Copy

 Start tracking LangSmith. 
[Project name] 
CH16-Evaluations

Define functions for RAG performance testing

Copy

from myrag import PDFRAG


def ask_question_with_llm(llm):
    """Build a question-answering callable backed by a PDF RAG pipeline.

    Args:
        llm: Chat model handed to ``PDFRAG`` for answer generation.

    Returns:
        A function that accepts ``{"question": ...}`` and returns a dictionary
        holding the ``question``, the retrieved ``context`` (one string) and
        the generated ``answer``.
    """
    # Set up the RAG helper over the source PDF, then its retriever and chain.
    pdf_rag = PDFRAG("data/SPRI_AI_Brief_December 2023 issue_F.pdf", llm)
    doc_retriever = pdf_rag.create_retriever()
    qa_chain = pdf_rag.create_chain(doc_retriever)

    def _ask_question(inputs: dict):
        question = inputs["question"]

        # Look up the documents relevant to the question.
        retrieved_docs = doc_retriever.invoke(question)

        # Flatten the retrieved documents into one newline-separated string.
        joined_context = "\n".join(doc.page_content for doc in retrieved_docs)

        # NOTE(review): the chain was created from the same retriever, so it
        # presumably performs its own retrieval here -- confirm in PDFRAG.
        generated_answer = qa_chain.invoke(question)

        return {
            "question": question,
            "context": joined_context,
            "answer": generated_answer,
        }

    return _ask_question

Copy

from langchain_openai import ChatOpenAI
from langchain_community.chat_models import ChatOllama


# RAG question-answering function backed by the OpenAI gpt-4o-mini model.
# NOTE(review): temperature=1.0 presumably keeps answers variable between
# repetitions -- confirm this is intentional.
gpt_llm = ChatOpenAI(model="gpt-4o-mini", temperature=1.0)
gpt_chain = ask_question_with_llm(gpt_llm)

# RAG question-answering function backed by the Ollama-served EEVE model.
ollama_llm = ChatOllama(model="EEVE-Korean-10.8B:latest", temperature=1.0)
ollama_chain = ask_question_with_llm(ollama_llm)

Repeat evaluation for RAG using GPT model

Copy

from langsmith.evaluation import evaluate, LangChainStringEvaluator

def _prepare_cot_qa_inputs(run, example):
    """Map a run and its dataset example onto the evaluator's input fields.

    The reference is the context stored in the run outputs, so the answer is
    judged against the retrieved context.
    """
    return {
        "prediction": run.outputs["answer"],
        "reference": run.outputs["context"],
        "input": example.inputs["question"],
    }


# Judge model for the "cot_qa" evaluator.
judge_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# Create the QA evaluator.
cot_qa_evalulator = LangChainStringEvaluator(
    "cot_qa",
    config={"llm": judge_llm},
    prepare_data=_prepare_cot_qa_inputs,
)

dataset_name = "RAG_EVAL_DATASET"

# Metadata attached to the experiment.
experiment_metadata = {
    "variant": "Perform repeat evaluation. GPT-4o-mini model (cot_qa)",
}

# Run the evaluation with three repetitions.
evaluate(
    gpt_chain,
    data=dataset_name,
    evaluators=[cot_qa_evalulator],
    experiment_prefix="REPEAT_EVAL",
    metadata=experiment_metadata,
    num_repetitions=3,
)

Repeat evaluation for RAG using Ollama model

Copy

from langsmith.evaluation import evaluate, LangChainStringEvaluator

def _prepare_cot_qa_inputs(run, example):
    """Map a run and its dataset example onto the evaluator's input fields.

    The reference is the context stored in the run outputs, so the answer is
    judged against the retrieved context.
    """
    return {
        "prediction": run.outputs["answer"],
        "reference": run.outputs["context"],
        "input": example.inputs["question"],
    }


# Judge model for the "cot_qa" evaluator.
judge_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# Create the QA evaluator.
cot_qa_evalulator = LangChainStringEvaluator(
    "cot_qa",
    config={"llm": judge_llm},
    prepare_data=_prepare_cot_qa_inputs,
)

dataset_name = "RAG_EVAL_DATASET"

# Metadata attached to the experiment.
experiment_metadata = {
    "variant": "Perform Repeat Evaluation. EEVE-Korean-10.8B model (cot_qa)",
}

# Run the evaluation with three repetitions.
evaluate(
    ollama_chain,
    data=dataset_name,
    evaluators=[cot_qa_evalulator],
    experiment_prefix="REPEAT_EVAL",
    metadata=experiment_metadata,
    num_repetitions=3,
)

Previous: 12. Compare Experiments (Pairwise Evaluation) | Next: 14. Automating evaluation using online evaluation

Last updated 5 months ago