feat(crawler): add separate timeout for wait_for condition

Adds a new wait_for_timeout parameter to CrawlerRunConfig that specifies a separate timeout (in milliseconds) for the wait_for condition, independent of page_timeout; when it is None, page_timeout is used as before. This provides more granular control over the crawler's waiting behavior. Also removes the unused colorama dependency and updates the LinkedIn crawler example. BREAKING CHANGE: the LinkedIn crawler example now passes wait_for_images=5000, so it uses different wait_for_images timing.
2025-05-16 17:00:45 +08:00
parent 897e017361
commit 8a5e23d374
4 changed files with 13 additions and 3 deletions
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -764,6 +764,9 @@ class CrawlerRunConfig():
                            Default: 60000 (60 seconds).
        wait_for (str or None): A CSS selector or JS condition to wait for before extracting content.
                                Default: None.
+        wait_for_timeout (int or None): Specific timeout in ms for the wait_for condition.
+                                       If None, uses page_timeout instead.
+                                       Default: None.
        wait_for_images (bool): If True, wait for images to load before extracting content.
                                Default: False.
        delay_before_return_html (float): Delay in seconds before retrieving final HTML.
@@ -904,6 +907,7 @@ class CrawlerRunConfig():
        wait_until: str = "domcontentloaded",
        page_timeout: int = PAGE_TIMEOUT,
        wait_for: str = None,
+        wait_for_timeout: int = None,
        wait_for_images: bool = False,
        delay_before_return_html: float = 0.1,
        mean_delay: float = 0.1,
@@ -1000,6 +1004,7 @@ class CrawlerRunConfig():
        self.wait_until = wait_until
        self.page_timeout = page_timeout
        self.wait_for = wait_for
+        self.wait_for_timeout = wait_for_timeout
        self.wait_for_images = wait_for_images
        self.delay_before_return_html = delay_before_return_html
        self.mean_delay = mean_delay
@@ -1141,6 +1146,7 @@ class CrawlerRunConfig():
            wait_until=kwargs.get("wait_until", "domcontentloaded"),
            page_timeout=kwargs.get("page_timeout", 60000),
            wait_for=kwargs.get("wait_for"),
+            wait_for_timeout=kwargs.get("wait_for_timeout"),
            wait_for_images=kwargs.get("wait_for_images", False),
            delay_before_return_html=kwargs.get("delay_before_return_html", 0.1),
            mean_delay=kwargs.get("mean_delay", 0.1),
@@ -1250,6 +1256,7 @@ class CrawlerRunConfig():
            "wait_until": self.wait_until,
            "page_timeout": self.page_timeout,
            "wait_for": self.wait_for,
+            "wait_for_timeout": self.wait_for_timeout,
            "wait_for_images": self.wait_for_images,
            "delay_before_return_html": self.delay_before_return_html,
            "mean_delay": self.mean_delay,
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -937,8 +937,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):

            if config.wait_for:
                try:
+                    # Use wait_for_timeout if specified, otherwise fall back to page_timeout
+                    timeout = config.wait_for_timeout if config.wait_for_timeout is not None else config.page_timeout
                    await self.smart_wait(
-                        page, config.wait_for, timeout=config.page_timeout
+                        page, config.wait_for, timeout=timeout
                    )
                except Exception as e:
                    raise RuntimeError(f"Wait condition failed: {str(e)}")
--- a/docs/apps/linkdin/c4ai_discover.py
+++ b/docs/apps/linkdin/c4ai_discover.py
@@ -235,6 +235,7 @@ async def crawl_people_page(
        cache_mode=CacheMode.BYPASS,
        magic=True,
        wait_for=".org-people-profile-card__card-spacing",
+        wait_for_images=5000,
        delay_before_return_html=1,
        session_id="people_search",
    )
@@ -420,8 +421,9 @@ def main():
    cli_opts = parser.parse_args()

    # decide on debug defaults
-    if cli_opts.debug:
+    if cli_opts.debug or True:
        opts = detect_debug_defaults(force=True)
+        cli_opts = opts
    else:
        env_defaults = detect_debug_defaults()
        opts = env_defaults if env_defaults else cli_opts
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,7 +26,6 @@ dependencies = [
    "xxhash~=3.4",
    "rank-bm25~=0.2",
    "aiofiles>=24.1.0",
-    "colorama~=0.4",
    "snowballstemmer~=2.2",
    "pydantic>=2.10",
    "pyOpenSSL>=24.3.0",