feat: make LLM backoff configurable end-to-end

- extend LLMConfig with backoff delay/attempt/factor fields and thread them through LLMExtractionStrategy, LLMContentFilter, table extraction, and Docker API handlers
- expose the backoff parameter knobs on perform_completion_with_backoff/aperform_completion_with_backoff and document them in the md_v2 guides
2025-11-28 18:50:04 +05:30
parent b36c6daa5c
commit 7a133e22cc
9 changed files with 84 additions and 15 deletions
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -1792,7 +1792,10 @@ class LLMConfig:
        frequency_penalty: Optional[float] = None,
        presence_penalty: Optional[float] = None,
        stop: Optional[List[str]] = None,
-        n: Optional[int] = None,    
+        n: Optional[int] = None,
+        backoff_base_delay: Optional[int] = None,
+        backoff_max_attempts: Optional[int] = None,
+        backoff_exponential_factor: Optional[int] = None,
    ):
        """Configuaration class for LLM provider and API token."""
        self.provider = provider
@@ -1821,6 +1824,9 @@ class LLMConfig:
        self.presence_penalty = presence_penalty
        self.stop = stop
        self.n = n
+        self.backoff_base_delay = backoff_base_delay if backoff_base_delay is not None else 2
+        self.backoff_max_attempts = backoff_max_attempts if backoff_max_attempts is not None else 3
+        self.backoff_exponential_factor = backoff_exponential_factor if backoff_exponential_factor is not None else 2

    @staticmethod
    def from_kwargs(kwargs: dict) -> "LLMConfig":
@@ -1834,7 +1840,10 @@ class LLMConfig:
            frequency_penalty=kwargs.get("frequency_penalty"),
            presence_penalty=kwargs.get("presence_penalty"),
            stop=kwargs.get("stop"),
-            n=kwargs.get("n")
+            n=kwargs.get("n"),
+            backoff_base_delay=kwargs.get("backoff_base_delay"),
+            backoff_max_attempts=kwargs.get("backoff_max_attempts"),
+            backoff_exponential_factor=kwargs.get("backoff_exponential_factor")
        )

    def to_dict(self):
@@ -1848,7 +1857,10 @@ class LLMConfig:
            "frequency_penalty": self.frequency_penalty,
            "presence_penalty": self.presence_penalty,
            "stop": self.stop,
-            "n": self.n
+            "n": self.n,
+            "backoff_base_delay": self.backoff_base_delay,
+            "backoff_max_attempts": self.backoff_max_attempts,
+            "backoff_exponential_factor": self.backoff_exponential_factor
        }

    def clone(self, **kwargs):
--- a/crawl4ai/content_filter_strategy.py
+++ b/crawl4ai/content_filter_strategy.py
@@ -980,6 +980,9 @@ class LLMContentFilter(RelevantContentFilter):
                        prompt,
                        api_token,
                        base_url=base_url,
+                        base_delay=self.llm_config.backoff_base_delay,
+                        max_attempts=self.llm_config.backoff_max_attempts,
+                        exponential_factor=self.llm_config.backoff_exponential_factor,
                        extra_args=extra_args,
                    )

--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -649,6 +649,9 @@ class LLMExtractionStrategy(ExtractionStrategy):
                base_url=self.llm_config.base_url,
                json_response=self.force_json_response,
                extra_args=self.extra_args,
+                base_delay=self.llm_config.backoff_base_delay,
+                max_attempts=self.llm_config.backoff_max_attempts,
+                exponential_factor=self.llm_config.backoff_exponential_factor
            )  # , json_response=self.extract_type == "schema")
            # Track usage
            usage = TokenUsage(
@@ -846,6 +849,9 @@ class LLMExtractionStrategy(ExtractionStrategy):
                base_url=self.llm_config.base_url,
                json_response=self.force_json_response,
                extra_args=self.extra_args,
+                base_delay=self.llm_config.backoff_base_delay,
+                max_attempts=self.llm_config.backoff_max_attempts,
+                exponential_factor=self.llm_config.backoff_exponential_factor
            )
            # Track usage
            usage = TokenUsage(
--- a/crawl4ai/table_extraction.py
+++ b/crawl4ai/table_extraction.py
@@ -795,6 +795,9 @@ Return only a JSON array of extracted tables following the specified format."""
                    api_token=self.llm_config.api_token,
                    base_url=self.llm_config.base_url,
                    json_response=True,
+                    base_delay=self.llm_config.backoff_base_delay,
+                    max_attempts=self.llm_config.backoff_max_attempts,
+                    exponential_factor=self.llm_config.backoff_exponential_factor,
                    extra_args=self.extra_args
                )
                
@@ -1116,6 +1119,9 @@ Return only a JSON array of extracted tables following the specified format."""
                    api_token=self.llm_config.api_token,
                    base_url=self.llm_config.base_url,
                    json_response=True,
+                    base_delay=self.llm_config.backoff_base_delay,
+                    max_attempts=self.llm_config.backoff_max_attempts,
+                    exponential_factor=self.llm_config.backoff_exponential_factor,
                    extra_args=self.extra_args
                )
                
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -1745,6 +1745,9 @@ def perform_completion_with_backoff(
    api_token,
    json_response=False,
    base_url=None,
+    base_delay=2,
+    max_attempts=3,
+    exponential_factor=2,
    **kwargs,
 ):
    """
@@ -1761,6 +1764,9 @@ def perform_completion_with_backoff(
        api_token (str): The API token for authentication.
        json_response (bool): Whether to request a JSON response. Defaults to False.
        base_url (Optional[str]): The base URL for the API. Defaults to None.
+        base_delay (int): The base delay in seconds. Defaults to 2.
+        max_attempts (int): The maximum number of attempts. Defaults to 3.
+        exponential_factor (int): Base of the exponential backoff; each wait is base_delay * exponential_factor**attempt. Defaults to 2.
        **kwargs: Additional arguments for the API request.

    Returns:
@@ -1770,9 +1776,6 @@ def perform_completion_with_backoff(
    from litellm import completion
    from litellm.exceptions import RateLimitError

-    max_attempts = 3
-    base_delay = 2  # Base delay in seconds, you can adjust this based on your needs
-
    extra_args = {"temperature": 0.01, "api_key": api_token, "base_url": base_url}
    if json_response:
        extra_args["response_format"] = {"type": "json_object"}
@@ -1798,7 +1801,7 @@ def perform_completion_with_backoff(
            # Check if we have exhausted our max attempts
            if attempt < max_attempts - 1:
                # Calculate the delay and wait
-                delay = base_delay * (2**attempt)  # Exponential backoff formula
+                delay = base_delay * (exponential_factor**attempt)  # Exponential backoff formula
                print(f"Waiting for {delay} seconds before retrying...")
                time.sleep(delay)
            else:
@@ -1831,6 +1834,9 @@ async def aperform_completion_with_backoff(
    api_token,
    json_response=False,
    base_url=None,
+    base_delay=2,
+    max_attempts=3,
+    exponential_factor=2,
    **kwargs,
 ):
    """
@@ -1847,6 +1853,9 @@ async def aperform_completion_with_backoff(
        api_token (str): The API token for authentication.
        json_response (bool): Whether to request a JSON response. Defaults to False.
        base_url (Optional[str]): The base URL for the API. Defaults to None.
+        base_delay (int): The base delay in seconds. Defaults to 2.
+        max_attempts (int): The maximum number of attempts. Defaults to 3.
+        exponential_factor (int): Base of the exponential backoff; each wait is base_delay * exponential_factor**attempt. Defaults to 2.
        **kwargs: Additional arguments for the API request.

    Returns:
@@ -1857,9 +1866,6 @@ async def aperform_completion_with_backoff(
    from litellm.exceptions import RateLimitError
    import asyncio

-    max_attempts = 3
-    base_delay = 2  # Base delay in seconds, you can adjust this based on your needs
-
    extra_args = {"temperature": 0.01, "api_key": api_token, "base_url": base_url}
    if json_response:
        extra_args["response_format"] = {"type": "json_object"}
@@ -1885,7 +1891,7 @@ async def aperform_completion_with_backoff(
            # Check if we have exhausted our max attempts
            if attempt < max_attempts - 1:
                # Calculate the delay and wait
-                delay = base_delay * (2**attempt)  # Exponential backoff formula
+                delay = base_delay * (exponential_factor**attempt)  # Exponential backoff formula
                print(f"Waiting for {delay} seconds before retrying...")
                await asyncio.sleep(delay)
            else: