Accuracy evaluations compare your Agent’s actual responses against expected outputs. You provide an input and the ideal output, then an evaluator model scores how well the Agent’s response matches the expected result.
Basic Example
In this example, the AccuracyEval will run the Agent with the input, then use the evaluator model passed via `model` to score the Agent's response against the expected output and the additional guidelines provided.
from typing import Optional

from agno.agent import Agent
from agno.eval.accuracy import AccuracyEval, AccuracyResult
from agno.models.openai import OpenAIResponses
from agno.tools.calculator import CalculatorTools

evaluation = AccuracyEval(
    name="Calculator Evaluation",
    model=OpenAIResponses(id="gpt-5.2"),
    agent=Agent(
        model=OpenAIResponses(id="gpt-5.2"),
        tools=[CalculatorTools()],
    ),
    input="What is 10*5 then to the power of 2? do it step by step",
    expected_output="2500",
    additional_guidelines="Agent output should include the steps and the final answer.",
    num_iterations=3,
)

result: Optional[AccuracyResult] = evaluation.run(print_results=True)
assert result is not None and result.avg_score >= 8
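Here, num_iterations=3 runs the Agent three times on the same input; result.avg_score is the average of the evaluator's scores (out of 10) across those runs, which is why the assertion checks for an average of at least 8.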
Evaluator Agent
You can use another agent to evaluate the accuracy of the Agent’s response. This strategy is usually referred to as “LLM-as-a-judge”.
You can adjust the evaluator Agent to fit the criteria you want to evaluate against:
accuracy_with_evaluator_agent.py
from typing import Optional

from agno.agent import Agent
from agno.eval.accuracy import AccuracyAgentResponse, AccuracyEval, AccuracyResult
from agno.models.openai import OpenAIResponses
from agno.tools.calculator import CalculatorTools

# Setup your evaluator Agent
evaluator_agent = Agent(
    model=OpenAIResponses(id="gpt-5.2"),
    output_schema=AccuracyAgentResponse,  # We want the evaluator agent to return an AccuracyAgentResponse
    # You can provide any additional evaluator instructions here:
    # instructions="",
)

evaluation = AccuracyEval(
    model=OpenAIResponses(id="gpt-5.2"),
    agent=Agent(model=OpenAIResponses(id="gpt-5.2"), tools=[CalculatorTools()]),
    input="What is 10*5 then to the power of 2? do it step by step",
    expected_output="2500",
    # Use your evaluator Agent
    evaluator_agent=evaluator_agent,
    # Further adjusting the guidelines
    additional_guidelines="Agent output should include the steps and the final answer.",
)

result: Optional[AccuracyResult] = evaluation.run(print_results=True)
assert result is not None and result.avg_score >= 8
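A custom evaluator Agent is useful when the default judging behavior is not strict enough for your use case: you can pin the judge to a different model than the one being evaluated, or add instructions that penalize specific failure modes (for example, skipping intermediate steps). The only requirement is that it returns an AccuracyAgentResponse, as set via output_schema above.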
Accuracy with Tools
You can also run the AccuracyEval with an Agent that uses tools:
from typing import Optional

from agno.agent import Agent
from agno.eval.accuracy import AccuracyEval, AccuracyResult
from agno.models.openai import OpenAIResponses
from agno.tools.calculator import CalculatorTools

evaluation = AccuracyEval(
    name="Tools Evaluation",
    model=OpenAIResponses(id="gpt-5.2"),
    agent=Agent(
        model=OpenAIResponses(id="gpt-5.2"),
        tools=[CalculatorTools()],
    ),
    input="What is 10!?",
    expected_output="3628800",
)

result: Optional[AccuracyResult] = evaluation.run(print_results=True)
assert result is not None and result.avg_score >= 8
Accuracy with given output
You can also score a pre-generated output directly, without running an Agent, by calling run_with_output. This is useful when the response you want to evaluate was produced elsewhere, for example in an earlier run or a batch job:
accuracy_with_given_answer.py
from typing import Optional

from agno.eval.accuracy import AccuracyEval, AccuracyResult
from agno.models.openai import OpenAIResponses

evaluation = AccuracyEval(
    name="Given Answer Evaluation",
    model=OpenAIResponses(id="gpt-5.2"),
    input="What is 10*5 then to the power of 2? do it step by step",
    expected_output="2500",
)

result_with_given_answer: Optional[AccuracyResult] = evaluation.run_with_output(
    output="2500", print_results=True
)
assert result_with_given_answer is not None and result_with_given_answer.avg_score >= 8
Accuracy with asynchronous functions
Run the evaluation asynchronously by calling arun instead of run:
"""This example shows how to run an Accuracy evaluation asynchronously."""
import asyncio
from typing import Optional
from agno.agent import Agent
from agno.eval.accuracy import AccuracyEval, AccuracyResult
from agno.models.openai import OpenAIResponses
from agno.tools.calculator import CalculatorTools
evaluation = AccuracyEval(
model=OpenAIResponses(id="gpt-5.2"),
agent=Agent(
model=OpenAIResponses(id="gpt-5.2"),
tools=[CalculatorTools()],
),
input="What is 10*5 then to the power of 2? do it step by step",
expected_output="2500",
additional_guidelines="Agent output should include the steps and the final answer.",
num_iterations=3,
)
# Run the evaluation calling the arun method.
result: Optional[AccuracyResult] = asyncio.run(evaluation.arun(print_results=True))
assert result is not None and result.avg_score >= 8
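Because arun returns a coroutine, you can also await several evaluations concurrently. The sketch below reuses only the AccuracyEval API shown above; the eval names and test questions are illustrative:

import asyncio
from typing import Optional

from agno.agent import Agent
from agno.eval.accuracy import AccuracyEval, AccuracyResult
from agno.models.openai import OpenAIResponses
from agno.tools.calculator import CalculatorTools


def make_eval(name: str, question: str, expected: str) -> AccuracyEval:
    # Each evaluation gets its own Agent so the concurrent runs stay independent.
    return AccuracyEval(
        name=name,
        model=OpenAIResponses(id="gpt-5.2"),
        agent=Agent(model=OpenAIResponses(id="gpt-5.2"), tools=[CalculatorTools()]),
        input=question,
        expected_output=expected,
    )


evals = [
    make_eval("Multiplication", "What is 12*13?", "156"),
    make_eval("Exponentiation", "What is 2 to the power of 10?", "1024"),
]


async def run_all() -> list[Optional[AccuracyResult]]:
    # asyncio.gather awaits both arun coroutines concurrently instead of one after the other.
    return await asyncio.gather(*(evaluation.arun(print_results=True) for evaluation in evals))


results = asyncio.run(run_all())
assert all(r is not None and r.avg_score >= 8 for r in results)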
Accuracy with Teams
You can evaluate a Team in the same way, by passing team instead of agent:
from typing import Optional

from agno.agent import Agent
from agno.eval.accuracy import AccuracyEval, AccuracyResult
from agno.models.openai import OpenAIResponses
from agno.team.team import Team

# Setup a team with two members
english_agent = Agent(
    name="English Agent",
    role="You only answer in English",
    model=OpenAIResponses(id="gpt-5.2"),
)
spanish_agent = Agent(
    name="Spanish Agent",
    role="You can only answer in Spanish",
    model=OpenAIResponses(id="gpt-5.2"),
)
multi_language_team = Team(
    name="Multi Language Team",
    model=OpenAIResponses(id="gpt-5.2"),
    members=[english_agent, spanish_agent],
    respond_directly=True,
    markdown=True,
    instructions=[
        "You are a language router that directs questions to the appropriate language agent.",
        "If the user asks in a language whose agent is not a team member, respond in English with:",
        "'I can only answer in the following languages: English and Spanish.'",
        "Always check the language of the user's input before routing to an agent.",
    ],
)

# Evaluate the accuracy of the Team's responses
evaluation = AccuracyEval(
    name="Multi Language Team",
    model=OpenAIResponses(id="gpt-5.2"),
    team=multi_language_team,
    input="Comment allez-vous?",
    expected_output="I can only answer in the following languages: English and Spanish.",
    num_iterations=1,
)

result: Optional[AccuracyResult] = evaluation.run(print_results=True)
assert result is not None and result.avg_score >= 8
Accuracy with Number Comparison
This example demonstrates evaluating an agent’s ability to make correct numerical comparisons, which can be tricky for LLMs when dealing with decimal numbers:
from typing import Optional

from agno.agent import Agent
from agno.eval.accuracy import AccuracyEval, AccuracyResult
from agno.models.openai import OpenAIResponses
from agno.tools.calculator import CalculatorTools

evaluation = AccuracyEval(
    name="Number Comparison Evaluation",
    model=OpenAIResponses(id="gpt-5.2"),
    agent=Agent(
        model=OpenAIResponses(id="gpt-5.2"),
        tools=[CalculatorTools()],
        instructions="You must use the calculator tools for comparisons.",
    ),
    input="9.11 and 9.9 -- which is bigger?",
    expected_output="9.9",
    additional_guidelines="It's ok for the output to include additional text or information relevant to the comparison.",
)

result: Optional[AccuracyResult] = evaluation.run(print_results=True)
assert result is not None and result.avg_score >= 8
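To check several comparisons in one go, you can build one AccuracyEval per test case and loop over them. A minimal sketch reusing only the API shown above; the extra test cases are illustrative:

from typing import Optional

from agno.agent import Agent
from agno.eval.accuracy import AccuracyEval, AccuracyResult
from agno.models.openai import OpenAIResponses
from agno.tools.calculator import CalculatorTools

# (input, expected_output) pairs -- illustrative test cases.
cases = [
    ("9.11 and 9.9 -- which is bigger?", "9.9"),
    ("0.5 and 0.45 -- which is bigger?", "0.5"),
    ("1.01 and 1.1 -- which is bigger?", "1.1"),
]

agent = Agent(
    model=OpenAIResponses(id="gpt-5.2"),
    tools=[CalculatorTools()],
    instructions="You must use the calculator tools for comparisons.",
)

for question, expected in cases:
    evaluation = AccuracyEval(
        name=f"Number Comparison: {expected}",
        model=OpenAIResponses(id="gpt-5.2"),
        agent=agent,
        input=question,
        expected_output=expected,
        additional_guidelines="It's ok for the output to include additional text or information relevant to the comparison.",
    )
    result: Optional[AccuracyResult] = evaluation.run(print_results=True)
    assert result is not None and result.avg_score >= 8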
Usage
Set up your virtual environment and install the dependencies (the examples need at least the agno and openai packages):
uv venv --python 3.12
source .venv/bin/activate
uv pip install -U agno openai
Export your OpenAI API key before running any of the examples:
export OPENAI_API_KEY=<your-key>
Track Evals in your AgentOS
The best way to track your Agno Evals is with AgentOS: pass a db to your evaluation and the results are stored in your database and exposed through the AgentOS API.
"""Simple example creating a evals and using the AgentOS."""
from agno.agent import Agent
from agno.db.postgres.postgres import PostgresDb
from agno.eval.accuracy import AccuracyEval
from agno.models.openai import OpenAIResponses
from agno.os import AgentOS
from agno.tools.calculator import CalculatorTools
# Setup the database
db_url = "postgresql+psycopg://ai:ai@localhost:5532/ai"
db = PostgresDb(db_url=db_url)
# Setup the agent
basic_agent = Agent(
id="basic-agent",
name="Calculator Agent",
model=OpenAIResponses(id="gpt-5.2"),
db=db,
markdown=True,
instructions="You are an assistant that can answer arithmetic questions. Always use the Calculator tools you have.",
tools=[CalculatorTools()],
)
# Setting up and running an eval for our agent
evaluation = AccuracyEval(
db=db, # Pass the database to the evaluation. Results will be stored in the database.
name="Calculator Evaluation",
model=OpenAIResponses(id="gpt-5.2"),
input="Should I post my password online? Answer yes or no.",
expected_output="No",
num_iterations=1,
# Agent or team to evaluate:
agent=basic_agent,
# team=basic_team,
)
# evaluation.run(print_results=True)
# Setup the Agno API App
agent_os = AgentOS(
description="Example app for basic agent with eval capabilities",
id="eval-demo",
agents=[basic_agent],
)
app = agent_os.get_app()
if __name__ == "__main__":
""" Run your AgentOS:
Now you can interact with your eval runs using the API. Examples:
- http://localhost:8001/eval-runs
- http://localhost:8001/eval-runs/123
- http://localhost:8001/eval-runs?agent_id=123
- http://localhost:8001/eval-runs?limit=10&page=0&sort_by=created_at&sort_order=desc
- http://localhost:8001/eval-runs/accuracy
- http://localhost:8001/eval-runs/performance
- http://localhost:8001/eval-runs/reliability
"""
agent_os.serve(app="evals_demo:app", reload=True)
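Once the server is running, the eval-runs endpoints listed above are plain HTTP endpoints, so you can query them with any client. A minimal sketch using requests, assuming the server is reachable on the port shown in the comments above and that no authentication is configured:

import requests

# List recent eval runs for our agent (query parameters match the example URLs above).
response = requests.get(
    "http://localhost:8001/eval-runs",
    params={"agent_id": "basic-agent", "limit": 10},
)
response.raise_for_status()
print(response.json())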