Merge branch '2025-JUN-1' into next-MAY

2025-07-09 09:41:03 +02:00
parent 026e96a2df 9332326457
commit 0ebce590f8
15 changed files with 664 additions and 46 deletions
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -926,6 +926,8 @@ class CrawlerRunConfig():
                               Default: False.
        scroll_delay (float): Delay in seconds between scroll steps if scan_full_page is True.
                              Default: 0.2.
+        max_scroll_steps (Optional[int]): Maximum number of scroll steps to perform during full page scan.
+                                         If None, scrolls until the entire page is loaded. Default: None.
        process_iframes (bool): If True, attempts to process and inline iframe content.
                                Default: False.
        remove_overlay_elements (bool): If True, remove overlays/popups before extracting HTML.
@@ -1066,6 +1068,7 @@ class CrawlerRunConfig():
        ignore_body_visibility: bool = True,
        scan_full_page: bool = False,
        scroll_delay: float = 0.2,
+        max_scroll_steps: Optional[int] = None,
        process_iframes: bool = False,
        remove_overlay_elements: bool = False,
        simulate_user: bool = False,
@@ -1170,6 +1173,7 @@ class CrawlerRunConfig():
        self.ignore_body_visibility = ignore_body_visibility
        self.scan_full_page = scan_full_page
        self.scroll_delay = scroll_delay
+        self.max_scroll_steps = max_scroll_steps
        self.process_iframes = process_iframes
        self.remove_overlay_elements = remove_overlay_elements
        self.simulate_user = simulate_user
@@ -1387,6 +1391,7 @@ class CrawlerRunConfig():
            ignore_body_visibility=kwargs.get("ignore_body_visibility", True),
            scan_full_page=kwargs.get("scan_full_page", False),
            scroll_delay=kwargs.get("scroll_delay", 0.2),
+            max_scroll_steps=kwargs.get("max_scroll_steps"),
            process_iframes=kwargs.get("process_iframes", False),
            remove_overlay_elements=kwargs.get("remove_overlay_elements", False),
            simulate_user=kwargs.get("simulate_user", False),
@@ -1499,6 +1504,7 @@ class CrawlerRunConfig():
            "ignore_body_visibility": self.ignore_body_visibility,
            "scan_full_page": self.scan_full_page,
            "scroll_delay": self.scroll_delay,
+            "max_scroll_steps": self.max_scroll_steps,
            "process_iframes": self.process_iframes,
            "remove_overlay_elements": self.remove_overlay_elements,
            "simulate_user": self.simulate_user,
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -469,9 +469,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                console_messages=captured_console,
            )

-        elif url.startswith("raw:") or url.startswith("raw://"):
+        ##### 
+        # Since both "raw:" and "raw://" start with "raw:", the first condition is always true for both, so "raw://" will be sliced as "//...", which is incorrect.
+        # Fix: Check for "raw://" first, then "raw:"
+        # Also, the prefix "raw://" is actually 6 characters long, not 7, so it should be sliced accordingly: url[6:]
+        #####
+        elif url.startswith("raw://") or url.startswith("raw:"):
            # Process raw HTML content
-            raw_html = url[4:] if url[:4] == "raw:" else url[7:]
+            # raw_html = url[4:] if url[:4] == "raw:" else url[7:]
+            raw_html = url[6:] if url.startswith("raw://") else url[4:]
            html = raw_html
            if config.screenshot:
                screenshot_data = await self._generate_screenshot_from_html(html)
@@ -930,7 +936,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):

            # Handle full page scanning
            if config.scan_full_page:
-                await self._handle_full_page_scan(page, config.scroll_delay)
+                # await self._handle_full_page_scan(page, config.scroll_delay)
+                await self._handle_full_page_scan(page, config.scroll_delay, config.max_scroll_steps)

            # Handle virtual scroll if configured
            if config.virtual_scroll_config:
@@ -1122,7 +1129,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                # Close the page
                await page.close()

-    async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1):
+    # async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1):
+    async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1, max_scroll_steps: Optional[int] = None):
        """
        Helper method to handle full page scanning.

@@ -1137,6 +1145,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
        Args:
            page (Page): The Playwright page object
            scroll_delay (float): The delay between page scrolls
+            max_scroll_steps (Optional[int]): Maximum number of scroll steps to perform. If None, scrolls until end.

        """
        try:
@@ -1161,9 +1170,21 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
            dimensions = await self.get_page_dimensions(page)
            total_height = dimensions["height"]

+            scroll_step_count = 0
            while current_position < total_height:
+                #### 
+                # NEW FEATURE: Check if we've reached the maximum allowed scroll steps
+                # This prevents infinite scrolling on very long pages or infinite scroll scenarios
+                # If max_scroll_steps is None, this check is skipped (unlimited scrolling - original behavior)
+                ####
+                if max_scroll_steps is not None and scroll_step_count >= max_scroll_steps:
+                    break
                current_position = min(current_position + viewport_height, total_height)
                await self.safe_scroll(page, 0, current_position, delay=scroll_delay)
+
+                # Increment the step counter for max_scroll_steps tracking
+                scroll_step_count += 1
+                
                # await page.evaluate(f"window.scrollTo(0, {current_position})")
                # await asyncio.sleep(scroll_delay)

@@ -1804,12 +1825,31 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                    # then wait for the new page to load before continuing
                    result = None
                    try:
+                        # OLD VERSION:
+                        # result = await page.evaluate(
+                        #     f"""
+                        # (async () => {{
+                        #     try {{
+                        #         const script_result = {script};
+                        #         return {{ success: true, result: script_result }};
+                        #     }} catch (err) {{
+                        #         return {{ success: false, error: err.toString(), stack: err.stack }};
+                        #     }}
+                        # }})();
+                        # """
+                        # )
+                        
+                        # """ NEW VERSION:
+                        # When {script} contains statements (e.g., const link = …; link.click();), 
+                        # this forms invalid JavaScript, causing Playwright execution error: SyntaxError: Unexpected token 'const'.
+                        # """
                        result = await page.evaluate(
                            f"""
                        (async () => {{
                            try {{
-                                const script_result = {script};
-                                return {{ success: true, result: script_result }};
+                                return await (async () => {{
+                                    {script}
+                                }})();
                            }} catch (err) {{
                                return {{ success: false, error: err.toString(), stack: err.stack }};
                            }}
--- a/crawl4ai/deep_crawling/filters.py
+++ b/crawl4ai/deep_crawling/filters.py
@@ -227,10 +227,21 @@ class URLPatternFilter(URLFilter):
        # Prefix check (/foo/*)
        if self._simple_prefixes:
            path = url.split("?")[0]
-            if any(path.startswith(p) for p in self._simple_prefixes):
-                result = True
-                self._update_stats(result)
-                return not result if self._reverse else result
+            # if any(path.startswith(p) for p in self._simple_prefixes):
+            #     result = True
+            #     self._update_stats(result)
+            #     return not result if self._reverse else result
+            ####
+            # Modified the prefix matching logic to ensure path boundary checking:
+            # - Check if the matched prefix is followed by a path separator (`/`), query parameter (`?`), fragment (`#`), or is at the end of the path
+            # - This ensures `/api/` only matches complete path segments, not substrings like `/apiv2/`
+            ####
+            for prefix in self._simple_prefixes:
+                if path.startswith(prefix):
+                    if len(path) == len(prefix) or path[len(prefix)] in ['/', '?', '#']:
+                        result = True
+                        self._update_stats(result)
+                        return not result if self._reverse else result

        # Complex patterns
        if self._path_patterns:
@@ -337,6 +348,15 @@ class ContentTypeFilter(URLFilter):
        "sqlite": "application/vnd.sqlite3",
        # Placeholder
        "unknown": "application/octet-stream",  # Fallback for unknown file types
+        # php
+        "php": "application/x-httpd-php",
+        "php3": "application/x-httpd-php",
+        "php4": "application/x-httpd-php",
+        "php5": "application/x-httpd-php",
+        "php7": "application/x-httpd-php",
+        "phtml": "application/x-httpd-php",
+        "phps": "application/x-httpd-php-source",
+
    }

    @staticmethod