feat(crawler): add separate timeout for wait_for condition
Adds a new wait_for_timeout parameter to CrawlerRunConfig that allows specifying a separate timeout for the wait_for condition, independent of the page_timeout. This provides more granular control over waiting behaviors in the crawler. Also removes unused colorama dependency and updates LinkedIn crawler example. BREAKING CHANGE: LinkedIn crawler example now uses different wait_for_images timing
This commit is contained in:
@@ -764,6 +764,9 @@ class CrawlerRunConfig():
|
||||
Default: 60000 (60 seconds).
|
||||
wait_for (str or None): A CSS selector or JS condition to wait for before extracting content.
|
||||
Default: None.
|
||||
wait_for_timeout (int or None): Specific timeout in ms for the wait_for condition.
|
||||
If None, uses page_timeout instead.
|
||||
Default: None.
|
||||
wait_for_images (bool): If True, wait for images to load before extracting content.
|
||||
Default: False.
|
||||
delay_before_return_html (float): Delay in seconds before retrieving final HTML.
|
||||
@@ -904,6 +907,7 @@ class CrawlerRunConfig():
|
||||
wait_until: str = "domcontentloaded",
|
||||
page_timeout: int = PAGE_TIMEOUT,
|
||||
wait_for: str = None,
|
||||
wait_for_timeout: int = None,
|
||||
wait_for_images: bool = False,
|
||||
delay_before_return_html: float = 0.1,
|
||||
mean_delay: float = 0.1,
|
||||
@@ -1000,6 +1004,7 @@ class CrawlerRunConfig():
|
||||
self.wait_until = wait_until
|
||||
self.page_timeout = page_timeout
|
||||
self.wait_for = wait_for
|
||||
self.wait_for_timeout = wait_for_timeout
|
||||
self.wait_for_images = wait_for_images
|
||||
self.delay_before_return_html = delay_before_return_html
|
||||
self.mean_delay = mean_delay
|
||||
@@ -1141,6 +1146,7 @@ class CrawlerRunConfig():
|
||||
wait_until=kwargs.get("wait_until", "domcontentloaded"),
|
||||
page_timeout=kwargs.get("page_timeout", 60000),
|
||||
wait_for=kwargs.get("wait_for"),
|
||||
wait_for_timeout=kwargs.get("wait_for_timeout"),
|
||||
wait_for_images=kwargs.get("wait_for_images", False),
|
||||
delay_before_return_html=kwargs.get("delay_before_return_html", 0.1),
|
||||
mean_delay=kwargs.get("mean_delay", 0.1),
|
||||
@@ -1250,6 +1256,7 @@ class CrawlerRunConfig():
|
||||
"wait_until": self.wait_until,
|
||||
"page_timeout": self.page_timeout,
|
||||
"wait_for": self.wait_for,
|
||||
"wait_for_timeout": self.wait_for_timeout,
|
||||
"wait_for_images": self.wait_for_images,
|
||||
"delay_before_return_html": self.delay_before_return_html,
|
||||
"mean_delay": self.mean_delay,
|
||||
|
||||
@@ -937,8 +937,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
|
||||
if config.wait_for:
|
||||
try:
|
||||
# Use wait_for_timeout if specified, otherwise fall back to page_timeout
|
||||
timeout = config.wait_for_timeout if config.wait_for_timeout is not None else config.page_timeout
|
||||
await self.smart_wait(
|
||||
page, config.wait_for, timeout=config.page_timeout
|
||||
page, config.wait_for, timeout=timeout
|
||||
)
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Wait condition failed: {str(e)}")
|
||||
|
||||
@@ -235,6 +235,7 @@ async def crawl_people_page(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
magic=True,
|
||||
wait_for=".org-people-profile-card__card-spacing",
|
||||
wait_for_images=5000,
|
||||
delay_before_return_html=1,
|
||||
session_id="people_search",
|
||||
)
|
||||
@@ -420,8 +421,9 @@ def main():
|
||||
cli_opts = parser.parse_args()
|
||||
|
||||
# decide on debug defaults
|
||||
if cli_opts.debug:
|
||||
if cli_opts.debug or True:
|
||||
opts = detect_debug_defaults(force=True)
|
||||
cli_opts = opts
|
||||
else:
|
||||
env_defaults = detect_debug_defaults()
|
||||
opts = env_defaults if env_defaults else cli_opts
|
||||
|
||||
@@ -26,7 +26,6 @@ dependencies = [
|
||||
"xxhash~=3.4",
|
||||
"rank-bm25~=0.2",
|
||||
"aiofiles>=24.1.0",
|
||||
"colorama~=0.4",
|
||||
"snowballstemmer~=2.2",
|
||||
"pydantic>=2.10",
|
||||
"pyOpenSSL>=24.3.0",
|
||||
|
||||
Reference in New Issue
Block a user