Fix managed browser page retrieval when no pages

Merge branch 'next' of https://github.com/unclecode/crawl4ai into next
feat(favicon): add new favicon images for improved branding
2025-05-20 21:06:12 +08:00 · 2025-05-17 19:04:03 +08:00 · 2025-05-17 19:03:51 +08:00 · 2025-05-17 15:06:53 +08:00 · 2025-05-16 21:59:23 +08:00 · 2025-05-16 21:55:07 +08:00
14 changed files with 180 additions and 12 deletions
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -764,6 +764,9 @@ class CrawlerRunConfig():
                            Default: 60000 (60 seconds).
        wait_for (str or None): A CSS selector or JS condition to wait for before extracting content.
                                Default: None.
+        wait_for_timeout (int or None): Specific timeout in ms for the wait_for condition.
+                                       If None, uses page_timeout instead.
+                                       Default: None.
        wait_for_images (bool): If True, wait for images to load before extracting content.
                                Default: False.
        delay_before_return_html (float): Delay in seconds before retrieving final HTML.
@@ -904,6 +907,7 @@ class CrawlerRunConfig():
        wait_until: str = "domcontentloaded",
        page_timeout: int = PAGE_TIMEOUT,
        wait_for: str = None,
+        wait_for_timeout: int = None,
        wait_for_images: bool = False,
        delay_before_return_html: float = 0.1,
        mean_delay: float = 0.1,
@@ -1000,6 +1004,7 @@ class CrawlerRunConfig():
        self.wait_until = wait_until
        self.page_timeout = page_timeout
        self.wait_for = wait_for
+        self.wait_for_timeout = wait_for_timeout
        self.wait_for_images = wait_for_images
        self.delay_before_return_html = delay_before_return_html
        self.mean_delay = mean_delay
@@ -1141,6 +1146,7 @@ class CrawlerRunConfig():
            wait_until=kwargs.get("wait_until", "domcontentloaded"),
            page_timeout=kwargs.get("page_timeout", 60000),
            wait_for=kwargs.get("wait_for"),
+            wait_for_timeout=kwargs.get("wait_for_timeout"),
            wait_for_images=kwargs.get("wait_for_images", False),
            delay_before_return_html=kwargs.get("delay_before_return_html", 0.1),
            mean_delay=kwargs.get("mean_delay", 0.1),
@@ -1250,6 +1256,7 @@ class CrawlerRunConfig():
            "wait_until": self.wait_until,
            "page_timeout": self.page_timeout,
            "wait_for": self.wait_for,
+            "wait_for_timeout": self.wait_for_timeout,
            "wait_for_images": self.wait_for_images,
            "delay_before_return_html": self.delay_before_return_html,
            "mean_delay": self.mean_delay,
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -937,8 +937,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):

            if config.wait_for:
                try:
+                    # Use wait_for_timeout if specified, otherwise fall back to page_timeout
+                    timeout = config.wait_for_timeout if config.wait_for_timeout is not None else config.page_timeout
                    await self.smart_wait(
-                        page, config.wait_for, timeout=config.page_timeout
+                        page, config.wait_for, timeout=timeout
                    )
                except Exception as e:
                    raise RuntimeError(f"Wait condition failed: {str(e)}")
--- a/crawl4ai/browser_manager.py
+++ b/crawl4ai/browser_manager.py
@@ -964,7 +964,10 @@ class BrowserManager:
            pages = context.pages
            page = next((p for p in pages if p.url == crawlerRunConfig.url), None)
            if not page:
-                page = context.pages[0] # await context.new_page()
+                if pages:
+                    page = context.pages[0]
+                else:
+                    page = await context.new_page()
        else:
            # Otherwise, check if we have an existing context for this config
            config_signature = self._make_config_signature(crawlerRunConfig)
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -135,13 +135,20 @@ def merge_chunks(
    word_token_ratio: float = 1.0,
    splitter: Callable = None
 ) -> List[str]:
-    """Merges documents into chunks of specified token size.
+    """
+    Merges a sequence of documents into chunks based on a target token count, with optional overlap.
+    
+    Each document is split into tokens using the provided splitter function (defaults to str.split). Tokens are distributed into chunks aiming for the specified target size, with optional overlapping tokens between consecutive chunks. Returns a list of non-empty merged chunks as strings.
    
    Args:
-        docs: Input documents
-        target_size: Desired token count per chunk
-        overlap: Number of tokens to overlap between chunks
-        word_token_ratio: Multiplier for word->token conversion
+        docs: Sequence of input document strings to be merged.
+        target_size: Target number of tokens per chunk.
+        overlap: Number of tokens to overlap between consecutive chunks.
+        word_token_ratio: Multiplier to estimate token count from word count.
+        splitter: Callable used to split each document into tokens.
+    
+    Returns:
+        List of merged document chunks as strings, each not exceeding the target token size.
    """
    # Pre-tokenize all docs and store token counts
    splitter = splitter or str.split
@@ -150,7 +157,7 @@ def merge_chunks(
    total_tokens = 0
    
    for doc in docs:
-        tokens = doc.split()
+        tokens = splitter(doc)
        count = int(len(tokens) * word_token_ratio)
        if count:  # Skip empty docs
            token_counts.append(count)
@@ -1109,6 +1116,23 @@ def get_content_of_website_optimized(
    css_selector: str = None,
    **kwargs,
 ) -> Dict[str, Any]:
+    """
+    Extracts and cleans content from website HTML, optimizing for useful media and contextual information.
+    
+    Parses the provided HTML to extract internal and external links, filters and scores images for usefulness, gathers contextual descriptions for media, removes unwanted or low-value elements, and converts the cleaned HTML to Markdown. Also extracts metadata and returns all structured content in a dictionary.
+    
+    Args:
+        url: The URL of the website being processed.
+        html: The raw HTML content to extract from.
+        word_count_threshold: Minimum word count for elements to be retained.
+        css_selector: Optional CSS selector to restrict extraction to specific elements.
+    
+    Returns:
+        A dictionary containing Markdown content, cleaned HTML, extraction success status, media and link lists, and metadata.
+    
+    Raises:
+        InvalidCSSSelectorError: If a provided CSS selector does not match any elements.
+    """
    if not html:
        return None

@@ -1151,6 +1175,20 @@ def get_content_of_website_optimized(

    def process_image(img, url, index, total_images):
        # Check if an image has valid display and inside undesired html elements
+        """
+        Processes an HTML image element to determine its relevance and extract metadata.
+        
+        Evaluates an image's visibility, context, and usefulness based on its attributes and parent elements. If the image passes validation and exceeds a usefulness score threshold, returns a dictionary with its source, alt text, contextual description, score, and type. Otherwise, returns None.
+        
+        Args:
+            img: The BeautifulSoup image tag to process.
+            url: The base URL of the page containing the image.
+            index: The index of the image in the list of images on the page.
+            total_images: The total number of images on the page.
+        
+        Returns:
+            A dictionary with image metadata if the image is considered useful, or None otherwise.
+        """
        def is_valid_image(img, parent, parent_classes):
            style = img.get("style", "")
            src = img.get("src", "")
@@ -1172,6 +1210,20 @@ def get_content_of_website_optimized(
        # Score an image for it's usefulness
        def score_image_for_usefulness(img, base_url, index, images_count):
            # Function to parse image height/width value and units
+            """
+            Scores an HTML image element for usefulness based on size, format, attributes, and position.
+            
+            The function evaluates an image's dimensions, file format, alt text, and its position among all images on the page to assign a usefulness score. Higher scores indicate images that are likely more relevant or informative for content extraction or summarization.
+            
+            Args:
+                img: The HTML image element to score.
+                base_url: The base URL used to resolve relative image sources.
+                index: The position of the image in the list of images on the page (zero-based).
+                images_count: The total number of images on the page.
+            
+            Returns:
+                An integer usefulness score for the image.
+            """
            def parse_dimension(dimension):
                if dimension:
                    match = re.match(r"(\d+)(\D*)", dimension)
@@ -1186,6 +1238,16 @@ def get_content_of_website_optimized(
            # Fetch image file metadata to extract size and extension
            def fetch_image_file_size(img, base_url):
                # If src is relative path construct full URL, if not it may be CDN URL
+                """
+                Fetches the file size of an image by sending a HEAD request to its URL.
+                
+                Args:
+                    img: A BeautifulSoup tag representing the image element.
+                    base_url: The base URL to resolve relative image sources.
+                
+                Returns:
+                    The value of the "Content-Length" header as a string if available, otherwise None.
+                """
                img_url = urljoin(base_url, img.get("src"))
                try:
                    response = requests.head(img_url)
@@ -1196,8 +1258,6 @@ def get_content_of_website_optimized(
                        return None
                except InvalidSchema:
                    return None
-                finally:
-                    return

            image_height = img.get("height")
            height_value, height_unit = parse_dimension(image_height)
--- a/docs/apps/linkdin/c4ai_discover.py
+++ b/docs/apps/linkdin/c4ai_discover.py
@@ -235,6 +235,7 @@ async def crawl_people_page(
        cache_mode=CacheMode.BYPASS,
        magic=True,
        wait_for=".org-people-profile-card__card-spacing",
+        wait_for_images=5000,
        delay_before_return_html=1,
        session_id="people_search",
    )
@@ -420,8 +421,9 @@ def main():
    cli_opts = parser.parse_args()

    # decide on debug defaults
-    if cli_opts.debug:
+    if cli_opts.debug or True:
        opts = detect_debug_defaults(force=True)
+        cli_opts = opts
    else:
        env_defaults = detect_debug_defaults()
        opts = env_defaults if env_defaults else cli_opts
--- a/docs/md_v2/assets/feedback-overrides.css
+++ b/docs/md_v2/assets/feedback-overrides.css
@@ -0,0 +1,37 @@
+/* docs/md_v2/assets/feedback-overrides.css */
+:root {
+  /* brand */
+  --feedback-primary-color: #09b5a5;
+  --feedback-highlight-color: #fed500;          /* stars etc */
+
+  /* modal shell / text */
+  --feedback-modal-content-bg-color: var(--background-color);
+  --feedback-modal-content-text-color: var(--font-color);
+  --feedback-modal-content-border-color: var(--primary-dimmed-color);
+  --feedback-modal-content-border-radius: 4px;
+
+  /* overlay */
+  --feedback-overlay-bg-color: rgba(0,0,0,.75);
+
+  /* rating buttons */
+  --feedback-modal-rating-button-color: var(--secondary-color);
+  --feedback-modal-rating-button-selected-color: var(--primary-color);
+
+  /* inputs */
+  --feedback-modal-input-bg-color: var(--code-bg-color);
+  --feedback-modal-input-text-color: var(--font-color);
+  --feedback-modal-input-border-color: var(--primary-dimmed-color);
+  --feedback-modal-input-border-color-focused: var(--primary-color);
+
+  /* submit / secondary buttons */
+  --feedback-modal-button-submit-bg-color: var(--primary-color);
+  --feedback-modal-button-submit-bg-color-hover: var(--primary-dimmed-color);
+  --feedback-modal-button-submit-text-color: var(--invert-font-color);
+
+  --feedback-modal-button-bg-color: transparent;       /* screenshot btn */
+  --feedback-modal-button-border-color: var(--primary-color);
+  --feedback-modal-button-icon-color: var(--primary-color);
+}
+
+/* optional: keep the “Powered by” link subtle */
+.feedback-logo a{color:var(--secondary-color);}
--- a/docs/md_v2/assets/gtag.js
+++ b/docs/md_v2/assets/gtag.js
@@ -0,0 +1,5 @@
+  window.dataLayer = window.dataLayer || [];
+  function gtag(){dataLayer.push(arguments);}
+  gtag('js', new Date());
+
+  gtag('config', 'G-58W0K2ZQ25');
--- a/docs/md_v2/favicon.ico
+++ b/docs/md_v2/favicon.ico
--- a/docs/md_v2/img/favicon-32x32.png
+++ b/docs/md_v2/img/favicon-32x32.png
--- a/docs/md_v2/img/favicon-x-32x32.png
+++ b/docs/md_v2/img/favicon-x-32x32.png
--- a/docs/md_v2/img/favicon.ico
+++ b/docs/md_v2/img/favicon.ico
--- a/docs/md_v2/overrides/main.html
+++ b/docs/md_v2/overrides/main.html
@@ -0,0 +1,47 @@
+{% set extra_html_attrs = 'data-theme="dark"' %}
+{% extends "base.html" %}
+
+{% block extrahead %}
+{{ super() }}
+<script>
+    document.documentElement.setAttribute("data-theme", "dark");
+</script>
+<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/pushfeedback/dist/pushfeedback/pushfeedback.css">
+
+<style>
+    :root {
+        /* brand */
+        --feedback-primary-color: #09b5a5;
+        --feedback-highlight-color: #fed500;
+
+
+        /* align with the value you really use in :root */
+        --header-height: 65px;
+
+        /* Push modal content down */
+        --feedback-modal-content-position-top: var(--header-height);
+
+        --feedback-modal-modal-wrapper-z-index: 1100;
+        /* keep the modal wrapper above the header, which uses 1000 */
+        --feedback-modal-content-z-index: 1101;
+    }
+
+    feedback-modal::part(overlay) {
+        top: var(--header-height);
+        /* start below header */
+        height: calc(100vh - var(--header-height));
+        /* fill the rest */
+
+
+    }
+</style>
+<script type="module"
+    src="https://cdn.jsdelivr.net/npm/pushfeedback@latest/dist/pushfeedback/pushfeedback.esm.js"></script>
+{% endblock %}
+
+{% block footer %}
+<feedback-button project="w8plzp8vjp" button-style="dark" button-position="center-right" modal-position="sidebar-right">
+    >
+    Feedback
+</feedback-button>
+{% endblock %}
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -1,4 +1,5 @@
 site_name: Crawl4AI Documentation (v0.6.x)
+site_favicon: docs/md_v2/favicon.ico
 site_description:  🚀🤖 Crawl4AI, Open-source LLM-Friendly Web Crawler & Scraper
 site_url: https://docs.crawl4ai.com
 repo_url: https://github.com/unclecode/crawl4ai
@@ -57,6 +58,8 @@ nav:
 theme:
  name: 'terminal'
  palette: 'dark'
+  custom_dir: docs/md_v2/overrides
+  color_mode: 'dark'
  icon:
    repo: fontawesome/brands/github

@@ -82,8 +85,11 @@ extra_css:
  - assets/styles.css
  - assets/highlight.css
  - assets/dmvendor.css
+  - assets/feedback-overrides.css

 extra_javascript:
+  - https://www.googletagmanager.com/gtag/js?id=G-58W0K2ZQ25
+  - assets/gtag.js
  - assets/highlight.min.js
  - assets/highlight_init.js
  - https://buttons.github.io/buttons.js
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,7 +26,6 @@ dependencies = [
    "xxhash~=3.4",
    "rank-bm25~=0.2",
    "aiofiles>=24.1.0",
-    "colorama~=0.4",
    "snowballstemmer~=2.2",
    "pydantic>=2.10",
    "pyOpenSSL>=24.3.0",
Author	SHA1	Message	Date
UncleCode	0e840aea2b	Fix managed browser page retrieval when no pages	2025-05-20 21:06:12 +08:00
UncleCode	85ac6fa523	Merge branch 'next' of https://github.com/unclecode/crawl4ai into next	2025-05-17 19:04:03 +08:00
UncleCode	becc4624bb	feat(favicon): add new favicon images for improved branding	2025-05-17 19:03:51 +08:00
UncleCode	754ba731fa	Fix chunk splitting utilities (#1122 ) * Fix merge_chunks splitter usage and remove incorrect return * 📝 Add docstrings to `codex/find-and-fix-a-bug` (#1123) Docstrings generation was requested by @unclecode. * https://github.com/unclecode/crawl4ai/pull/1122#issuecomment-2887985865 The following files were modified: * `crawl4ai/utils.py` Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --------- Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>	2025-05-17 15:06:53 +08:00
UncleCode	ac9981a1f5	feat(favicon): add favicon image and update mkdocs configuration	2025-05-16 21:59:23 +08:00
UncleCode	83ef15fd47	feat(favicon): add favicon.ico for improved branding	2025-05-16 21:55:07 +08:00
UncleCode	a3cb938675	feat(theme): enable dark color mode in mkdocs configuration	2025-05-16 21:44:56 +08:00
UncleCode	9b60988232	feat(feedback): add feedback modal styles and integrate into mkdocs configuration	2025-05-16 21:25:10 +08:00
UncleCode	98e951f611	fix(mkdocs): remove duplicate gtag.js entry in extra_javascript	2025-05-16 20:52:41 +08:00
UncleCode	baca2df8df	feat(analytics): add Google Tag Manager script and gtag.js for tracking	2025-05-16 20:49:02 +08:00
UncleCode	8a5e23d374	feat(crawler): add separate timeout for wait_for condition Adds a new wait_for_timeout parameter to CrawlerRunConfig that allows specifying a separate timeout for the wait_for condition, independent of the page_timeout. This provides more granular control over waiting behaviors in the crawler. Also removes unused colorama dependency and updates LinkedIn crawler example. BREAKING CHANGE: LinkedIn crawler example now uses different wait_for_images timing	2025-05-16 17:00:45 +08:00