diff --git a/cortex/retry.py b/cortex/retry.py new file mode 100644 index 0000000..1ac9bd4 --- /dev/null +++ b/cortex/retry.py @@ -0,0 +1,327 @@ +"""Smart retry logic with exponential backoff for Cortex operations. + +This module provides robust retry mechanisms for network operations, +API calls, and package installations that may fail transiently. + +Implements Issue #43: Smart Retry Logic with Exponential Backoff +""" + +import time +import random +import logging +import functools +from typing import Callable, TypeVar, Optional, Tuple, Type, Union, List +from dataclasses import dataclass, field +from enum import Enum + +logger = logging.getLogger(__name__) + +T = TypeVar('T') + + +class RetryStrategy(Enum): + """Available retry strategies.""" + EXPONENTIAL = "exponential" + LINEAR = "linear" + CONSTANT = "constant" + FIBONACCI = "fibonacci" + + +@dataclass +class RetryConfig: + """Configuration for retry behavior. + + Attributes: + max_attempts: Maximum number of retry attempts (including initial try) + base_delay: Initial delay in seconds before first retry + max_delay: Maximum delay cap in seconds + exponential_base: Base for exponential backoff (default 2) + jitter: Whether to add random jitter to prevent thundering herd + jitter_range: Range for jitter as fraction of delay (0.0 to 1.0) + strategy: Retry strategy to use + retryable_exceptions: Tuple of exception types that trigger retry + """ + max_attempts: int = 3 + base_delay: float = 1.0 + max_delay: float = 60.0 + exponential_base: float = 2.0 + jitter: bool = True + jitter_range: float = 0.25 + strategy: RetryStrategy = RetryStrategy.EXPONENTIAL + retryable_exceptions: Tuple[Type[Exception], ...] = (Exception,) + + def __post_init__(self): + if self.max_attempts < 1: + raise ValueError("max_attempts must be at least 1") + if self.base_delay < 0: + raise ValueError("base_delay must be non-negative") + if self.max_delay < self.base_delay: + raise ValueError("max_delay must be >= base_delay") + if not 0 <= self.jitter_range <= 1: + raise ValueError("jitter_range must be between 0 and 1") + + +@dataclass +class RetryResult: + """Result of a retry operation. + + Attributes: + success: Whether the operation ultimately succeeded + result: The return value if successful, None otherwise + attempts: Number of attempts made + total_time: Total time spent including delays + errors: List of errors encountered during retries + final_error: The last error if operation failed + """ + success: bool + result: Optional[T] = None + attempts: int = 0 + total_time: float = 0.0 + errors: List[Exception] = field(default_factory=list) + final_error: Optional[Exception] = None + + +class RetryManager: + """Manages retry operations with configurable backoff strategies.""" + + # Precomputed Fibonacci sequence for fibonacci backoff + _FIBONACCI = [1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144] + + def __init__(self, config: Optional[RetryConfig] = None): + """Initialize retry manager with configuration. + + Args: + config: RetryConfig instance, uses defaults if None + """ + self.config = config or RetryConfig() + + def _calculate_delay(self, attempt: int) -> float: + """Calculate delay for a given attempt number. 
+ + Args: + attempt: The attempt number (0-indexed) + + Returns: + Delay in seconds + """ + if self.config.strategy == RetryStrategy.CONSTANT: + delay = self.config.base_delay + + elif self.config.strategy == RetryStrategy.LINEAR: + delay = self.config.base_delay * (attempt + 1) + + elif self.config.strategy == RetryStrategy.FIBONACCI: + fib_index = min(attempt, len(self._FIBONACCI) - 1) + delay = self.config.base_delay * self._FIBONACCI[fib_index] + + else: # EXPONENTIAL (default) + delay = self.config.base_delay * (self.config.exponential_base ** attempt) + + # Apply max delay cap + delay = min(delay, self.config.max_delay) + + # Apply jitter if enabled + if self.config.jitter: + jitter_amount = delay * self.config.jitter_range + delay += random.uniform(-jitter_amount, jitter_amount) + delay = max(0, delay) # Ensure non-negative + + return delay + + def execute( + self, + func: Callable[..., T], + *args, + on_retry: Optional[Callable[[int, Exception, float], None]] = None, + **kwargs + ) -> RetryResult: + """Execute a function with retry logic. + + Args: + func: The function to execute + *args: Positional arguments for the function + on_retry: Optional callback called before each retry with + (attempt_number, exception, delay) + **kwargs: Keyword arguments for the function + + Returns: + RetryResult containing success status and result or errors + """ + start_time = time.time() + errors: List[Exception] = [] + + for attempt in range(self.config.max_attempts): + try: + result = func(*args, **kwargs) + return RetryResult( + success=True, + result=result, + attempts=attempt + 1, + total_time=time.time() - start_time, + errors=errors + ) + + except self.config.retryable_exceptions as e: + errors.append(e) + + if attempt < self.config.max_attempts - 1: + delay = self._calculate_delay(attempt) + + logger.warning( + f"Attempt {attempt + 1}/{self.config.max_attempts} failed: {e}. " + f"Retrying in {delay:.2f}s..." + ) + + if on_retry: + on_retry(attempt + 1, e, delay) + + time.sleep(delay) + else: + logger.error( + f"All {self.config.max_attempts} attempts failed. " + f"Final error: {e}" + ) + + return RetryResult( + success=False, + attempts=self.config.max_attempts, + total_time=time.time() - start_time, + errors=errors, + final_error=errors[-1] if errors else None + ) + + +def retry( + max_attempts: int = 3, + base_delay: float = 1.0, + max_delay: float = 60.0, + exponential_base: float = 2.0, + jitter: bool = True, + strategy: RetryStrategy = RetryStrategy.EXPONENTIAL, + retryable_exceptions: Tuple[Type[Exception], ...] = (Exception,), + on_retry: Optional[Callable[[int, Exception, float], None]] = None +): + """Decorator for adding retry logic to functions. 
+ + Args: + max_attempts: Maximum number of attempts + base_delay: Initial delay in seconds + max_delay: Maximum delay cap + exponential_base: Base for exponential backoff + jitter: Whether to add random jitter + strategy: Retry strategy to use + retryable_exceptions: Exception types that trigger retry + on_retry: Callback for retry events + + Returns: + Decorated function with retry logic + + Example: + @retry(max_attempts=3, base_delay=1.0) + def fetch_packages(): + return requests.get("https://api.example.com/packages") + """ + config = RetryConfig( + max_attempts=max_attempts, + base_delay=base_delay, + max_delay=max_delay, + exponential_base=exponential_base, + jitter=jitter, + strategy=strategy, + retryable_exceptions=retryable_exceptions + ) + manager = RetryManager(config) + + def decorator(func: Callable[..., T]) -> Callable[..., T]: + @functools.wraps(func) + def wrapper(*args, **kwargs) -> T: + result = manager.execute(func, *args, on_retry=on_retry, **kwargs) + + if result.success: + return result.result + else: + raise result.final_error + + return wrapper + + return decorator + + +# Preset configurations for common use cases +NETWORK_RETRY_CONFIG = RetryConfig( + max_attempts=5, + base_delay=1.0, + max_delay=30.0, + strategy=RetryStrategy.EXPONENTIAL, + jitter=True +) + +API_RETRY_CONFIG = RetryConfig( + max_attempts=3, + base_delay=0.5, + max_delay=10.0, + strategy=RetryStrategy.EXPONENTIAL, + jitter=True +) + +APT_RETRY_CONFIG = RetryConfig( + max_attempts=3, + base_delay=2.0, + max_delay=60.0, + strategy=RetryStrategy.EXPONENTIAL, + jitter=False # No jitter for apt operations +) + + +def retry_apt_operation(func: Callable[..., T], *args, **kwargs) -> RetryResult: + """Convenience function for retrying apt operations. + + Uses preset configuration optimized for package manager operations + which may fail due to lock files or network issues. + + Args: + func: The apt operation function + *args: Positional arguments + **kwargs: Keyword arguments + + Returns: + RetryResult with operation outcome + """ + manager = RetryManager(APT_RETRY_CONFIG) + return manager.execute(func, *args, **kwargs) + + +def retry_api_call(func: Callable[..., T], *args, **kwargs) -> RetryResult: + """Convenience function for retrying API calls. + + Uses preset configuration optimized for LLM API calls + with rate limiting considerations. + + Args: + func: The API call function + *args: Positional arguments + **kwargs: Keyword arguments + + Returns: + RetryResult with operation outcome + """ + manager = RetryManager(API_RETRY_CONFIG) + return manager.execute(func, *args, **kwargs) + + +def retry_network_operation(func: Callable[..., T], *args, **kwargs) -> RetryResult: + """Convenience function for retrying network operations. + + Uses preset configuration optimized for network requests + that may fail due to connectivity issues. + + Args: + func: The network operation function + *args: Positional arguments + **kwargs: Keyword arguments + + Returns: + RetryResult with operation outcome + """ + manager = RetryManager(NETWORK_RETRY_CONFIG) + return manager.execute(func, *args, **kwargs) diff --git a/tests/test_retry.py b/tests/test_retry.py new file mode 100644 index 0000000..25e9d49 --- /dev/null +++ b/tests/test_retry.py @@ -0,0 +1,434 @@ +"""Tests for the smart retry logic module. 
+ +Tests Issue #43: Smart Retry Logic with Exponential Backoff +""" + +import pytest +import time +from unittest.mock import Mock, patch, call + +from cortex.retry import ( + RetryConfig, + RetryStrategy, + RetryResult, + RetryManager, + retry, + NETWORK_RETRY_CONFIG, + API_RETRY_CONFIG, + APT_RETRY_CONFIG, + retry_apt_operation, + retry_api_call, + retry_network_operation, +) + + +class TestRetryConfig: + """Tests for RetryConfig dataclass.""" + + def test_default_config(self): + """Test default configuration values.""" + config = RetryConfig() + assert config.max_attempts == 3 + assert config.base_delay == 1.0 + assert config.max_delay == 60.0 + assert config.exponential_base == 2.0 + assert config.jitter is True + assert config.jitter_range == 0.25 + assert config.strategy == RetryStrategy.EXPONENTIAL + + def test_custom_config(self): + """Test custom configuration.""" + config = RetryConfig( + max_attempts=5, + base_delay=0.5, + max_delay=30.0, + strategy=RetryStrategy.LINEAR + ) + assert config.max_attempts == 5 + assert config.base_delay == 0.5 + assert config.max_delay == 30.0 + assert config.strategy == RetryStrategy.LINEAR + + def test_invalid_max_attempts(self): + """Test that invalid max_attempts raises error.""" + with pytest.raises(ValueError, match="max_attempts must be at least 1"): + RetryConfig(max_attempts=0) + + def test_invalid_base_delay(self): + """Test that negative base_delay raises error.""" + with pytest.raises(ValueError, match="base_delay must be non-negative"): + RetryConfig(base_delay=-1) + + def test_invalid_max_delay(self): + """Test that max_delay < base_delay raises error.""" + with pytest.raises(ValueError, match="max_delay must be >= base_delay"): + RetryConfig(base_delay=10, max_delay=5) + + def test_invalid_jitter_range(self): + """Test that invalid jitter_range raises error.""" + with pytest.raises(ValueError, match="jitter_range must be between 0 and 1"): + RetryConfig(jitter_range=1.5) + + +class TestRetryManager: + """Tests for RetryManager class.""" + + def test_successful_first_attempt(self): + """Test operation succeeds on first attempt.""" + config = RetryConfig(max_attempts=3, jitter=False) + manager = RetryManager(config) + + mock_func = Mock(return_value="success") + result = manager.execute(mock_func) + + assert result.success is True + assert result.result == "success" + assert result.attempts == 1 + assert len(result.errors) == 0 + mock_func.assert_called_once() + + def test_successful_after_retries(self): + """Test operation succeeds after initial failures.""" + config = RetryConfig(max_attempts=3, base_delay=0.01, jitter=False) + manager = RetryManager(config) + + mock_func = Mock(side_effect=[Exception("fail1"), Exception("fail2"), "success"]) + result = manager.execute(mock_func) + + assert result.success is True + assert result.result == "success" + assert result.attempts == 3 + assert len(result.errors) == 2 + + def test_all_attempts_fail(self): + """Test when all attempts fail.""" + config = RetryConfig(max_attempts=3, base_delay=0.01, jitter=False) + manager = RetryManager(config) + + mock_func = Mock(side_effect=Exception("always fails")) + result = manager.execute(mock_func) + + assert result.success is False + assert result.result is None + assert result.attempts == 3 + assert len(result.errors) == 3 + assert result.final_error is not None + + def test_exponential_backoff_delays(self): + """Test exponential backoff delay calculation.""" + config = RetryConfig( + max_attempts=5, + base_delay=1.0, + max_delay=100.0, + 
exponential_base=2.0, + jitter=False, + strategy=RetryStrategy.EXPONENTIAL + ) + manager = RetryManager(config) + + # Expected delays: 1, 2, 4, 8 + assert manager._calculate_delay(0) == 1.0 + assert manager._calculate_delay(1) == 2.0 + assert manager._calculate_delay(2) == 4.0 + assert manager._calculate_delay(3) == 8.0 + + def test_linear_backoff_delays(self): + """Test linear backoff delay calculation.""" + config = RetryConfig( + base_delay=1.0, + max_delay=100.0, + jitter=False, + strategy=RetryStrategy.LINEAR + ) + manager = RetryManager(config) + + # Expected delays: 1, 2, 3, 4 + assert manager._calculate_delay(0) == 1.0 + assert manager._calculate_delay(1) == 2.0 + assert manager._calculate_delay(2) == 3.0 + assert manager._calculate_delay(3) == 4.0 + + def test_constant_backoff_delays(self): + """Test constant backoff delay calculation.""" + config = RetryConfig( + base_delay=2.0, + jitter=False, + strategy=RetryStrategy.CONSTANT + ) + manager = RetryManager(config) + + # All delays should be constant + assert manager._calculate_delay(0) == 2.0 + assert manager._calculate_delay(1) == 2.0 + assert manager._calculate_delay(5) == 2.0 + + def test_fibonacci_backoff_delays(self): + """Test fibonacci backoff delay calculation.""" + config = RetryConfig( + base_delay=1.0, + max_delay=100.0, + jitter=False, + strategy=RetryStrategy.FIBONACCI + ) + manager = RetryManager(config) + + # Expected: 1, 1, 2, 3, 5, 8 + assert manager._calculate_delay(0) == 1.0 + assert manager._calculate_delay(1) == 1.0 + assert manager._calculate_delay(2) == 2.0 + assert manager._calculate_delay(3) == 3.0 + assert manager._calculate_delay(4) == 5.0 + assert manager._calculate_delay(5) == 8.0 + + def test_max_delay_cap(self): + """Test that delay is capped at max_delay.""" + config = RetryConfig( + base_delay=10.0, + max_delay=15.0, + exponential_base=2.0, + jitter=False, + strategy=RetryStrategy.EXPONENTIAL + ) + manager = RetryManager(config) + + # 10 * 2^2 = 40, but should be capped at 15 + assert manager._calculate_delay(2) == 15.0 + + def test_jitter_adds_randomness(self): + """Test that jitter adds variation to delays.""" + config = RetryConfig( + base_delay=10.0, + jitter=True, + jitter_range=0.5 + ) + manager = RetryManager(config) + + delays = [manager._calculate_delay(0) for _ in range(20)] + + # With jitter, delays should vary + assert len(set(delays)) > 1 + # All delays should be within expected range + for delay in delays: + assert 5.0 <= delay <= 15.0 # 10 +/- 50% + + def test_on_retry_callback(self): + """Test that on_retry callback is called.""" + config = RetryConfig(max_attempts=3, base_delay=0.01, jitter=False) + manager = RetryManager(config) + + callback = Mock() + mock_func = Mock(side_effect=[Exception("fail"), "success"]) + + manager.execute(mock_func, on_retry=callback) + + callback.assert_called_once() + args = callback.call_args[0] + assert args[0] == 1 # attempt number + assert isinstance(args[1], Exception) # exception + assert isinstance(args[2], float) # delay + + def test_retryable_exceptions_filter(self): + """Test that only specified exceptions trigger retry.""" + config = RetryConfig( + max_attempts=3, + base_delay=0.01, + retryable_exceptions=(ValueError,) + ) + manager = RetryManager(config) + + # ValueError should be retried + mock_func = Mock(side_effect=[ValueError("retry this"), "success"]) + result = manager.execute(mock_func) + assert result.success is True + assert result.attempts == 2 + + # TypeError should NOT be retried (not in retryable_exceptions) + mock_func = 
Mock(side_effect=TypeError("don't retry")) + with pytest.raises(TypeError): + manager.execute(mock_func) + + def test_function_arguments_passed(self): + """Test that args and kwargs are passed to function.""" + config = RetryConfig(max_attempts=1) + manager = RetryManager(config) + + mock_func = Mock(return_value="ok") + manager.execute(mock_func, "arg1", "arg2", kwarg1="value1") + + mock_func.assert_called_with("arg1", "arg2", kwarg1="value1") + + def test_total_time_tracked(self): + """Test that total execution time is tracked.""" + config = RetryConfig(max_attempts=2, base_delay=0.05, jitter=False) + manager = RetryManager(config) + + mock_func = Mock(side_effect=[Exception("fail"), "success"]) + result = manager.execute(mock_func) + + # Should include delay time + assert result.total_time >= 0.05 + + +class TestRetryDecorator: + """Tests for the @retry decorator.""" + + def test_decorator_success(self): + """Test decorator with successful function.""" + @retry(max_attempts=3, base_delay=0.01, jitter=False) + def always_works(): + return "worked" + + assert always_works() == "worked" + + def test_decorator_with_retries(self): + """Test decorator retries on failure.""" + call_count = 0 + + @retry(max_attempts=3, base_delay=0.01, jitter=False) + def works_third_time(): + nonlocal call_count + call_count += 1 + if call_count < 3: + raise ValueError("not yet") + return "finally" + + assert works_third_time() == "finally" + assert call_count == 3 + + def test_decorator_exhausts_retries(self): + """Test decorator raises exception when retries exhausted.""" + @retry(max_attempts=2, base_delay=0.01, jitter=False) + def always_fails(): + raise RuntimeError("permanent failure") + + with pytest.raises(RuntimeError, match="permanent failure"): + always_fails() + + def test_decorator_with_arguments(self): + """Test decorator preserves function arguments.""" + @retry(max_attempts=1) + def add(a, b, c=0): + return a + b + c + + assert add(1, 2) == 3 + assert add(1, 2, c=3) == 6 + + def test_decorator_preserves_metadata(self): + """Test decorator preserves function name and docstring.""" + @retry(max_attempts=1) + def documented_function(): + """This is the docstring.""" + pass + + assert documented_function.__name__ == "documented_function" + assert documented_function.__doc__ == "This is the docstring." 
+ + +class TestPresetConfigs: + """Tests for preset configurations.""" + + def test_network_retry_config(self): + """Test NETWORK_RETRY_CONFIG preset.""" + assert NETWORK_RETRY_CONFIG.max_attempts == 5 + assert NETWORK_RETRY_CONFIG.base_delay == 1.0 + assert NETWORK_RETRY_CONFIG.max_delay == 30.0 + assert NETWORK_RETRY_CONFIG.jitter is True + + def test_api_retry_config(self): + """Test API_RETRY_CONFIG preset.""" + assert API_RETRY_CONFIG.max_attempts == 3 + assert API_RETRY_CONFIG.base_delay == 0.5 + assert API_RETRY_CONFIG.max_delay == 10.0 + + def test_apt_retry_config(self): + """Test APT_RETRY_CONFIG preset.""" + assert APT_RETRY_CONFIG.max_attempts == 3 + assert APT_RETRY_CONFIG.base_delay == 2.0 + assert APT_RETRY_CONFIG.jitter is False # No jitter for apt + + +class TestConvenienceFunctions: + """Tests for convenience retry functions.""" + + def test_retry_apt_operation(self): + """Test retry_apt_operation helper.""" + mock_func = Mock(return_value="installed") + result = retry_apt_operation(mock_func) + + assert result.success is True + assert result.result == "installed" + + def test_retry_api_call(self): + """Test retry_api_call helper.""" + mock_func = Mock(return_value={"status": "ok"}) + result = retry_api_call(mock_func) + + assert result.success is True + assert result.result == {"status": "ok"} + + def test_retry_network_operation(self): + """Test retry_network_operation helper.""" + mock_func = Mock(return_value=b"data") + result = retry_network_operation(mock_func) + + assert result.success is True + assert result.result == b"data" + + +class TestIntegrationScenarios: + """Integration tests for realistic scenarios.""" + + def test_transient_network_failure(self): + """Simulate transient network failure recovery.""" + attempt = 0 + + def flaky_network_call(): + nonlocal attempt + attempt += 1 + if attempt < 3: + raise ConnectionError("Network unreachable") + return {"data": "response"} + + result = retry_network_operation(flaky_network_call) + + assert result.success is True + assert result.result == {"data": "response"} + assert result.attempts == 3 + + def test_rate_limit_recovery(self): + """Simulate API rate limit with recovery.""" + config = RetryConfig( + max_attempts=4, + base_delay=0.01, + strategy=RetryStrategy.EXPONENTIAL, + jitter=False + ) + manager = RetryManager(config) + + call_count = 0 + + def rate_limited_api(): + nonlocal call_count + call_count += 1 + if call_count <= 2: + raise Exception("Rate limit exceeded") + return {"result": "success"} + + result = manager.execute(rate_limited_api) + + assert result.success is True + assert result.attempts == 3 + + def test_permanent_failure_gives_up(self): + """Test that permanent failures eventually give up.""" + config = RetryConfig(max_attempts=3, base_delay=0.01, jitter=False) + manager = RetryManager(config) + + def always_fails(): + raise PermissionError("Access denied") + + result = manager.execute(always_fails) + + assert result.success is False + assert result.attempts == 3 + assert isinstance(result.final_error, PermissionError)
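For reference, a minimal usage sketch of the @retry decorator and on_retry callback introduced above. The fetch_index function and log_retry callback are hypothetical placeholders, and the simulated ConnectionError merely stands in for a real transient failure:

    import random

    from cortex.retry import RetryStrategy, retry

    def log_retry(attempt: int, error: Exception, delay: float) -> None:
        # Invoked before each retry with the 1-based attempt number, the exception, and the computed delay.
        print(f"attempt {attempt} failed ({error}); retrying in {delay:.2f}s")

    @retry(
        max_attempts=4,
        base_delay=0.5,
        strategy=RetryStrategy.EXPONENTIAL,
        retryable_exceptions=(ConnectionError, TimeoutError),
        on_retry=log_retry,
    )
    def fetch_index() -> dict:
        # Hypothetical flaky operation: fails transiently about half the time.
        # Only ConnectionError/TimeoutError trigger a retry; any other exception propagates immediately.
        if random.random() < 0.5:
            raise ConnectionError("simulated transient failure")
        return {"packages": []}

    data = fetch_index()  # retried up to 4 times; the final error is re-raised if all attempts fail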
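Similarly, a sketch of driving a call through RetryManager with one of the preset configurations and inspecting the returned RetryResult. Here download_manifest is a hypothetical stand-in for any function that may fail transiently; retry_network_operation(download_manifest) would be the equivalent one-liner:

    from cortex.retry import NETWORK_RETRY_CONFIG, RetryManager

    def download_manifest() -> bytes:
        # Hypothetical network call.
        return b"{}"

    manager = RetryManager(NETWORK_RETRY_CONFIG)  # 5 attempts, exponential backoff capped at 30s, with jitter
    result = manager.execute(download_manifest)

    if result.success:
        manifest = result.result
    else:
        # result.errors collects every exception seen; result.final_error is the last one.
        print(f"gave up after {result.attempts} attempts "
              f"({result.total_time:.1f}s total): {result.final_error}")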