diff --git a/.gitignore b/.gitignore index 52e25a2a..02c75b3f 100644 --- a/.gitignore +++ b/.gitignore @@ -214,4 +214,7 @@ git_issues.md todo_executor.md protect-all-except-feature.sh manage-collab.sh -publish.sh \ No newline at end of file +publish.sh + +combine.sh +combined_output.txt \ No newline at end of file diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 5c706239..64e9392c 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -7,6 +7,7 @@ import os, sys, shutil import tempfile, subprocess from playwright.async_api import async_playwright, Page, Browser, Error from playwright.async_api import TimeoutError as PlaywrightTimeoutError +from playwright.async_api import TimeoutError as PlaywrightTimeoutError from io import BytesIO from PIL import Image, ImageDraw, ImageFont from pathlib import Path @@ -930,8 +931,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if not self.text_only and (kwargs.get("wait_for_images", True) or kwargs.get("adjust_viewport_to_content", False)): # Wait for network idle after initial load and images to load await page.wait_for_load_state("networkidle") - await asyncio.sleep(0.1) - await page.wait_for_function("Array.from(document.images).every(img => img.complete)") + await asyncio.sleep(0.1) + try: + await page.wait_for_function("Array.from(document.images).every(img => img.complete)", timeout=1000) + # Check for TimeoutError and ignore it + except PlaywrightTimeoutError: + pass # After initial load, adjust viewport to content size if not self.text_only and kwargs.get("adjust_viewport_to_content", False):