From 8a5e23d3740b012c4ef52c134ceed4c7166785ad Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 16 May 2025 17:00:45 +0800 Subject: [PATCH] feat(crawler): add separate timeout for wait_for condition Adds a new wait_for_timeout parameter to CrawlerRunConfig that allows specifying a separate timeout for the wait_for condition, independent of the page_timeout. This provides more granular control over waiting behaviors in the crawler. Also removes unused colorama dependency and updates LinkedIn crawler example. The LinkedIn crawler example now passes wait_for_timeout=5000 to demonstrate the new parameter; wait_for_timeout defaults to None, so existing behavior is unchanged for all callers. --- crawl4ai/async_configs.py | 7 +++++++ crawl4ai/async_crawler_strategy.py | 4 +++- docs/apps/linkdin/c4ai_discover.py | 1 + pyproject.toml | 1 - 4 files changed, 11 insertions(+), 2 deletions(-) diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index c93516bd..7a04fd04 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -764,6 +764,9 @@ class CrawlerRunConfig(): Default: 60000 (60 seconds). wait_for (str or None): A CSS selector or JS condition to wait for before extracting content. Default: None. + wait_for_timeout (int or None): Specific timeout in ms for the wait_for condition. + If None, uses page_timeout instead. + Default: None. wait_for_images (bool): If True, wait for images to load before extracting content. Default: False. delay_before_return_html (float): Delay in seconds before retrieving final HTML. 
@@ -904,6 +907,7 @@ class CrawlerRunConfig(): wait_until: str = "domcontentloaded", page_timeout: int = PAGE_TIMEOUT, wait_for: str = None, + wait_for_timeout: int = None, wait_for_images: bool = False, delay_before_return_html: float = 0.1, mean_delay: float = 0.1, @@ -1000,6 +1004,7 @@ class CrawlerRunConfig(): self.wait_until = wait_until self.page_timeout = page_timeout self.wait_for = wait_for + self.wait_for_timeout = wait_for_timeout self.wait_for_images = wait_for_images self.delay_before_return_html = delay_before_return_html self.mean_delay = mean_delay @@ -1141,6 +1146,7 @@ class CrawlerRunConfig(): wait_until=kwargs.get("wait_until", "domcontentloaded"), page_timeout=kwargs.get("page_timeout", 60000), wait_for=kwargs.get("wait_for"), + wait_for_timeout=kwargs.get("wait_for_timeout"), wait_for_images=kwargs.get("wait_for_images", False), delay_before_return_html=kwargs.get("delay_before_return_html", 0.1), mean_delay=kwargs.get("mean_delay", 0.1), @@ -1250,6 +1256,7 @@ class CrawlerRunConfig(): "wait_until": self.wait_until, "page_timeout": self.page_timeout, "wait_for": self.wait_for, + "wait_for_timeout": self.wait_for_timeout, "wait_for_images": self.wait_for_images, "delay_before_return_html": self.delay_before_return_html, "mean_delay": self.mean_delay, diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 10d395ee..55ad550d 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -937,8 +937,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if config.wait_for: try: + # Use wait_for_timeout if specified, otherwise fall back to page_timeout + timeout = config.wait_for_timeout if config.wait_for_timeout is not None else config.page_timeout await self.smart_wait( - page, config.wait_for, timeout=config.page_timeout + page, config.wait_for, timeout=timeout ) except Exception as e: raise RuntimeError(f"Wait condition failed: {str(e)}") diff --git 
a/docs/apps/linkdin/c4ai_discover.py b/docs/apps/linkdin/c4ai_discover.py index ac6d2783..f101ce0c 100644 --- a/docs/apps/linkdin/c4ai_discover.py +++ b/docs/apps/linkdin/c4ai_discover.py @@ -235,6 +235,7 @@ async def crawl_people_page( cache_mode=CacheMode.BYPASS, magic=True, wait_for=".org-people-profile-card__card-spacing", + wait_for_timeout=5000, delay_before_return_html=1, session_id="people_search", ) diff --git a/pyproject.toml b/pyproject.toml index be44397e..5abfb460 100644 @@ -26,7 +26,6 @@ dependencies = [ "xxhash~=3.4", "rank-bm25~=0.2", "aiofiles>=24.1.0", - "colorama~=0.4", "snowballstemmer~=2.2", "pydantic>=2.10", "pyOpenSSL>=24.3.0",