15_ERROR_HANDLING

Shared from "Study" on Inkdown

Error Handling - Comprehensive Deep Dive

Overview

Error handling in the OpenAI Agents SDK provides a structured way to manage and respond to errors that occur during agent execution. Think of error handling as a "safety net" or as "exception management": it ensures that when things go wrong (network failures, API errors, invalid inputs, etc.), the system can respond gracefully, provide useful information, and recover when possible.

Core Concepts

Error Types

The SDK defines several error types:

UserError - Errors due to user input or configuration
ModelBehaviorError - Errors from unexpected model behavior
AgentsException - Base exception for SDK errors
ToolTimeoutError - Tool execution timeout
MaxTurnsExceeded - Agent exceeded maximum turns
Guardrail Tripwires - Guardrail violations
RunErrorDetails - Detailed error information (a data container describing an error, not an exception class itself)

Why Error Handling Matters

Python

class MaxTurnsExceeded(AgentsException):
    """Raised when an agent goes past its maximum number of turns.

    Attributes:
        run_state: The run state passed to the constructor, kept on the
            exception so a handler can inspect it.
    """

    def __init__(self, run_state: RunState):
        # Keep the state on the exception before handing the fixed message
        # to the base class.
        self.run_state = run_state
        message = "Agent exceeded maximum turns"
        super().__init__(message)

Python

class InputGuardrailTripwireTriggered(AgentsException):
    """Raised when an input guardrail's tripwire fires.

    Attributes:
        guardrail_result: The guardrail result passed to the constructor.
        input: The input value passed to the constructor.
    """

    def __init__(self, guardrail_result, input):
        # Store both constructor arguments, then set the fixed message.
        self.guardrail_result, self.input = guardrail_result, input
        message = "Input guardrail tripwire triggered"
        super().__init__(message)

class OutputGuardrailTripwireTriggered(AgentsException):
    """Raised when an output guardrail's tripwire fires.

    Attributes:
        guardrail_result: The guardrail result passed to the constructor.
        output: The output value passed to the constructor.
    """

    def __init__(self, guardrail_result, output):
        # Store both constructor arguments, then set the fixed message.
        self.guardrail_result, self.output = guardrail_result, output
        message = "Output guardrail tripwire triggered"
        super().__init__(message)

Python

@dataclass
class RunErrorDetails:
    """Detailed information about an error."""
    
    error_type: str
    """Type of error."""
    
    message: str
    """Error message."""
    
    agent_name: str
    """Name of the agent that caused the error."""
    
    turn: int
    """Turn number when error occurred."""
    
    timestamp: datetime
    """When the error occurred."""
    
    traceback: str | None
    """Error traceback if available."""
    
    context: dict[str, Any]
    """Additional context about the error."""

Python

try:
    result = await Runner.run(agent, input)
except AgentsException as e:
    if hasattr(e, 'error_details'):
        details = e.error_details
        print(f"Error type: {details.error_type}")
        print(f"Agent: {details.agent_name}")
        print(f"Turn: {details.turn}")

Python

from agents import RunErrorHandlers

error_handlers = RunErrorHandlers(
    max_turns=lambda ctx, error: "The agent needs more turns to complete this task. Let me try a different approach.",
)

result = await Runner.run(
    agent,
    input,
    error_handlers=error_handlers,
)

Python

def max_turns_handler(context, error):
    """Return the fixed reply used when max_turns is exceeded.

    Args:
        context: Unused.
        error: Unused.

    Returns:
        A constant message string.
    """
    reply = "I need more information to complete this task."
    return reply

# Register the named function (rather than a lambda) as the max_turns handler.
error_handlers = RunErrorHandlers(max_turns=max_turns_handler)

Python

def custom_error_handler(error_type: str, context, error):
    """Turn an error-type label into a message string.

    Args:
        error_type: Label identifying the kind of error.
        context: Unused by this handler.
        error: The error object; its string form is used in the fallback.

    Returns:
        A fixed message for "network_error", otherwise "Error: <error>".
    """
    is_network_failure = error_type == "network_error"
    return (
        "Network error occurred. Please try again."
        if is_network_failure
        else f"Error: {error!s}"
    )

# Note: Custom error types require extending RunErrorHandlers

Python

from agents import ToolErrorFormatter

def custom_error_formatter(args) -> str:
    """Build the failure text for a tool error.

    Args:
        args: Object exposing `tool_name` and `default_message` attributes.

    Returns:
        "Tool <tool_name> failed: <default_message>".
    """
    tool, detail = args.tool_name, args.default_message
    return f"Tool {tool} failed: {detail}"

# Plug the custom formatter into the run configuration.
config = RunConfig(tool_error_formatter=custom_error_formatter)

Python

@function_tool(timeout=30)  # 30 second timeout
async def slow_operation(param: str) -> str:
    """Operation that might be slow."""
    # Per the SDK tools documentation, timeouts are supported for async
    # function-tool handlers, so the wait is awaited rather than blocking
    # the thread with time.sleep().
    import asyncio

    await asyncio.sleep(60)  # Will timeout
    return "Done"

# When timeout occurs (default timeout_behavior="error_as_result"):
# - A timeout message is formatted and sent to the model
# - Model can retry with different parameters
# - With timeout_behavior="raise_exception", ToolTimeoutError is raised instead

Python

from agents import InputGuardrailTripwireTriggered

@input_guardrail
def check_safety(context, agent, input):
    """Trip the guardrail when the input text contains the word "harmful".

    The match is case-insensitive and runs on `str(input)`.
    """
    flagged = "harmful" in str(input).lower()
    return GuardrailFunctionOutput(
        output_info="Harmful content detected" if flagged else "Safe",
        tripwire_triggered=flagged,
    )

try:
    result = await Runner.run(agent, "harmful content")
except InputGuardrailTripwireTriggered as e:
    print(f"Blocked: {e.guardrail_result.output.output_info}")
    # Handle blocked input

Python

from agents import OutputGuardrailTripwireTriggered

@output_guardrail
def check_quality(context, agent, output):
    """Trip the guardrail when the output is longer than 1000 items.

    Length is measured with `len(output)`.
    """
    too_long = len(output) > 1000
    return GuardrailFunctionOutput(
        output_info="Output too long" if too_long else "Valid",
        tripwire_triggered=too_long,
    )

try:
    result = await Runner.run(agent, input)
except OutputGuardrailTripwireTriggered as e:
    print(f"Blocked: {e.guardrail_result.output.output_info}")
    # Handle blocked output

Python

from agents import tool_input_guardrail, ToolInputGuardrailTripwireTriggered

@tool_input_guardrail
def validate_tool_args(context, tool_name, args_json):
    """Trip the guardrail when "required_param" is absent or falsy.

    `args_json` is decoded with `json.loads` before the check.
    """
    parsed = json.loads(args_json)
    missing = not parsed.get("required_param")
    return ToolInputGuardrailFunctionOutput(
        output_info="Missing required parameter" if missing else "Valid",
        tripwire_triggered=missing,
    )

# Tool guardrail tripwires are handled internally
# The tool is skipped and an error message is sent to the model

Python

from agents import retry_policies

# NOTE(review): `retry_policies.exponential_backoff` and the `retry_policy`
# field on ModelSettings are reproduced as written in this note - confirm
# both names against the installed SDK version before relying on them.
# Backoff policy with max_retries=3, initial_delay=1.0, max_delay=10.0
# (delays presumably in seconds - TODO confirm).
policy = retry_policies.exponential_backoff(
    max_retries=3,
    initial_delay=1.0,
    max_delay=10.0,
)

# Attach the policy to the agent through its model settings.
agent = Agent(
    name="retry_agent",
    instructions="...",
    model_settings=ModelSettings(
        retry_policy=policy,
    ),
)

Python

from tenacity import retry, stop_after_attempt, wait_exponential

@retry(
    wait=wait_exponential(multiplier=1, min=1, max=10),
    stop=stop_after_attempt(3),
)
async def run_with_retry(agent, input):
    """Run the agent under the tenacity retry policy declared above.

    The decorator stops after 3 attempts and waits exponentially between
    them, bounded to the range 1-10.
    """
    outcome = await Runner.run(agent, input)
    return outcome

try:
    result = await run_with_retry(agent, input)
except Exception as e:
    print(f"Failed after retries: {e}")

Python

async def run_with_fallback(agent, input, fallback_agent):
    """Run `agent`; if it raises, run `fallback_agent` on the same input.

    Args:
        agent: The agent tried first.
        input: Input passed to whichever agent runs.
        fallback_agent: The agent used when the first run raises.

    Returns:
        The result of the first run, or of the fallback run.
    """
    try:
        primary_result = await Runner.run(agent, input)
    except Exception as exc:
        print(f"Primary agent failed: {exc}, using fallback")
        return await Runner.run(fallback_agent, input)
    else:
        return primary_result

Python

try:
    result = await Runner.run(agent, input, max_turns=10)
except MaxTurnsExceeded as e:
    state = e.run_state
    # Resume with increased max_turns
    result = await Runner.run(agent, state, max_turns=20)

Python

async def run_with_degradation(agent, input):
    """Run the agent, returning a fixed fallback string on any exception.

    Returns:
        The run result on success; otherwise a constant message string,
        after printing the error.
    """
    try:
        outcome = await Runner.run(agent, input)
    except Exception as exc:
        print(f"Error occurred: {exc}")
        # Return a fallback response
        return "I encountered an error. Let me try a simpler approach."
    else:
        return outcome

Python

import logging

logger = logging.getLogger(__name__)

try:
    result = await Runner.run(agent, input)
except AgentsException as e:
    logger.error(f"Agent error: {e}", exc_info=True)
    raise

Python

try:
    result = await Runner.run(agent, input)
except AgentsException as e:
    error_data = {
        "error_type": type(e).__name__,
        "message": str(e),
        "agent": agent.name,
        "timestamp": datetime.now().isoformat(),
    }
    logger.error("Agent error", extra=error_data)

Python

from prometheus_client import Counter

error_counter = Counter(
    "agent_errors",
    "Agent errors",
    ["error_type", "agent_name"]
)

try:
    result = await Runner.run(agent, input)
except AgentsException as e:
    error_counter.labels(
        error_type=type(e).__name__,
        agent_name=agent.name,
    ).inc()
    raise

Python

def get_user_message(error: Exception) -> str:
    """Translate an exception into text suitable for an end user.

    Args:
        error: The exception to describe.

    Returns:
        The message for the first matching exception type, or a generic
        message when none matches.
    """
    # Checked in order; the first isinstance match wins.
    friendly_messages = (
        (
            InputGuardrailTripwireTriggered,
            "I cannot process that request as it contains content that violates our safety guidelines.",
        ),
        (
            MaxTurnsExceeded,
            "I need more information to complete this task. Could you provide more details?",
        ),
        (
            ToolTimeoutError,
            "An operation took too long. Let me try a different approach.",
        ),
    )
    for error_class, text in friendly_messages:
        if isinstance(error, error_class):
            return text
    return "An error occurred. Please try again or contact support."

Python

def get_detailed_error_message(error: Exception, context: dict) -> str:
    """Extend the user-facing message with optional hints from `context`.

    Args:
        error: The exception to describe.
        context: Mapping that may hold "suggestion" and/or "next_steps".

    Returns:
        The base message, followed by a "Suggestion: ..." and/or
        "Next steps: ..." paragraph for each truthy entry, separated by
        blank lines.
    """
    paragraphs = [get_user_message(error)]
    for key, label in (("suggestion", "Suggestion"), ("next_steps", "Next steps")):
        hint = context.get(key)
        if hint:
            paragraphs.append(f"{label}: {hint}")
    return "\n\n".join(paragraphs)

Python

@pytest.mark.asyncio
async def test_max_turns_error():
    """A run capped at 5 turns must raise MaxTurnsExceeded with its state."""
    looping_agent = Agent(
        name="looping_agent",
        instructions="Keep asking questions forever",
    )

    with pytest.raises(MaxTurnsExceeded) as caught:
        await Runner.run(looping_agent, "Hello", max_turns=5)

    # The exception must carry a run state that reached the turn limit.
    raised = caught.value
    assert raised.run_state is not None
    assert raised.run_state._current_turn >= 5

@pytest.mark.asyncio
async def test_guardrail_tripwire():
    """An always-tripping input guardrail must abort the run with an exception."""
    @input_guardrail
    def block_all(context, agent, input):
        # Unconditionally trip the wire so every run is blocked.
        return GuardrailFunctionOutput(
            output_info="Blocked",
            tripwire_triggered=True,
        )

    # `name` is passed explicitly, as in the other Agent(...) constructions
    # in this document; the SDK's Agent requires it.
    agent = Agent(name="guarded_agent", input_guardrails=[block_all])

    with pytest.raises(InputGuardrailTripwireTriggered):
        await Runner.run(agent, "Hello")

Python

@pytest.fixture
def error_agent():
    """Agent that always errors."""
    # No docstring is added to the tool on purpose: it would change the
    # tool description.
    @function_tool
    def error_tool():
        raise ValueError("Tool error")

    # `name` is passed explicitly, as in the other Agent(...) constructions
    # in this document; the SDK's Agent requires it.
    return Agent(name="error_agent", tools=[error_tool])

@pytest.mark.asyncio
async def test_tool_error(error_agent):
    """A failing tool must surface as text in the final output, not a crash."""
    outcome = await Runner.run(error_agent, "Call the tool")
    # Tool error should be formatted and returned
    final_text = outcome.final_output.lower()
    assert "error" in final_text

Python

# Good - specific exceptions
# Each exception type is routed to its own handler; any other exception
# propagates to the caller.
try:
    result = await Runner.run(agent, input)
except MaxTurnsExceeded:
    handle_max_turns()
except InputGuardrailTripwireTriggered:
    handle_guardrail()

# Avoid - catch all exceptions
# A single `except Exception` sends every failure to the same handler.
try:
    result = await Runner.run(agent, input)
except Exception:  # Too broad
    handle_all()

Python

# Good - log then raise
# The error is logged with its traceback (exc_info=True) before being
# re-raised unchanged.
try:
    result = await Runner.run(agent, input)
except AgentsException as e:
    logger.error(f"Agent error: {e}", exc_info=True)
    raise

# Avoid - just raise
# Catching only to re-raise does nothing beyond what no handler would do.
try:
    result = await Runner.run(agent, input)
except AgentsException as e:
    raise  # Lost logging opportunity

Python

class MyApplicationError(AgentsException):
    """Root of the application's own exception hierarchy.

    Adds no behavior of its own.
    """

class InsufficientCreditsError(MyApplicationError):
    """Raised when the user has insufficient credits.

    Adds no behavior of its own.
    """

# Use custom errors
if user.credits < cost:
    # Put both numbers in the message.
    shortfall_message = f"Insufficient credits: {user.credits} < {cost}"
    raise InsufficientCreditsError(shortfall_message)

Python

# Good - graceful fallback
# If the primary agent raises any Exception, the same input is run on the
# fallback agent instead.
try:
    result = await Runner.run(primary_agent, input)
except Exception:
    result = await Runner.run(fallback_agent, input)

# Avoid - crash on error
# With no handler, an exception from the run propagates to the caller.
result = await Runner.run(primary_agent, input)  # Might crash

Python

from tenacity import (
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_exponential,
)

@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=1, max=10),
    # Retry only on connection and timeout errors; anything else propagates
    # immediately.
    retry=retry_if_exception_type((ConnectionError, TimeoutError)),
)
async def run_with_network_retry(agent, input):
    """Run the agent, retrying on network-level failures.

    The decorator stops after 3 attempts, waits exponentially between them
    (bounded to 1-10), and retries only ConnectionError and TimeoutError.

    Args:
        agent: The agent to run.
        input: Input passed to the run.

    Returns:
        The result of `Runner.run`.
    """
    return await Runner.run(agent, input)

Python

async def run_with_rate_limit(agent, input):
    """Run the agent, waiting and retrying once if a rate limit is hit.

    Args:
        agent: The agent to run.
        input: Input passed to the run.

    Returns:
        The result of the first run, or of the single retry.

    Raises:
        Whatever the retry raises; the second attempt is not guarded.
    """
    try:
        return await Runner.run(agent, input)
    except RateLimitError as e:
        # Not every RateLimitError implementation exposes `retry_after`.
        # Read it defensively and fall back to 60 when it is missing, None
        # or zero.
        wait_time = getattr(e, "retry_after", None) or 60
        print(f"Rate limited, waiting {wait_time}s")
        await asyncio.sleep(wait_time)
        return await Runner.run(agent, input)

Python

try:
    result = await Runner.run(agent, input)
except UserError as e:
    # User configuration error
    return f"Configuration error: {str(e)}"
except ModelBehaviorError as e:
    # Model behaved unexpectedly
    return f"Model error: {str(e)}"

Python

async def run_with_timeout(agent, input, timeout=30):
    """Run the agent under an overall time limit.

    Args:
        agent: The agent to run.
        input: Input passed to the run.
        timeout: Limit passed to `asyncio.wait_for`; defaults to 30.

    Returns:
        The run result, or a fixed message string if the limit is reached.
    """
    try:
        pending_run = Runner.run(agent, input)
        outcome = await asyncio.wait_for(pending_run, timeout=timeout)
    except asyncio.TimeoutError:
        return "Operation timed out. Please try again."
    else:
        return outcome

Python

async def run_with_partial_success(agent, input):
    """Run the agent, salvaging the last item if the turn limit is hit.

    Returns:
        The run result on success. On MaxTurnsExceeded, a
        "Partial result: ..." string built from the last entry of the run
        state's `all_items`, or a fixed message when there are no items.
    """
    try:
        outcome = await Runner.run(agent, input)
    except MaxTurnsExceeded as exc:
        # Partial success - got some output
        saved_state = exc.run_state
        if not saved_state.all_items:
            return "Could not complete the task."
        newest_item = saved_state.all_items[-1]
        return f"Partial result: {newest_item}"
    else:
        return outcome

Python

import time

from prometheus_client import Counter, Histogram

# Failures counted by exception class name and agent name.
error_counter = Counter(
    "agent_errors_total",
    "Total agent errors",
    ["error_type", "agent_name"]
)

# Observed with the elapsed time from the start of a run to its failure.
error_duration = Histogram(
    "agent_error_duration_seconds",
    "Time spent in error handling",
)

async def monitored_run(agent, input):
    """Run the agent and record metrics when it fails.

    Args:
        agent: The agent to run; its `name` is used as a metric label.
        input: Input passed to the run.

    Returns:
        The result of `Runner.run`.

    Raises:
        AgentsException: Re-raised unchanged after the metrics are recorded.
    """
    # perf_counter is monotonic, so the measured duration cannot go
    # negative when the system clock is adjusted mid-run.
    start = time.perf_counter()
    try:
        return await Runner.run(agent, input)
    except AgentsException as e:
        error_counter.labels(
            error_type=type(e).__name__,
            agent_name=agent.name,
        ).inc()
        error_duration.observe(time.perf_counter() - start)
        raise

Python

async def run_with_alerting(agent, input):
    """Run the agent, notifying by severity before re-raising any error.

    CriticalError triggers `send_alert`; NonCriticalError triggers
    `log_warning`. In both cases the original exception is re-raised.
    """
    try:
        outcome = await Runner.run(agent, input)
    except CriticalError as exc:
        await send_alert(f"Critical error: {exc}")
        raise
    except NonCriticalError as exc:
        await log_warning(f"Non-critical error: {exc}")
        raise
    else:
        return outcome