Merge branch '2025-MAY-2' into next-MAY

ntohidi committed on 2025-07-08 11:46:13 +02:00
28 changed files with 448 additions and 154 deletions

View File

@@ -445,6 +445,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
return await self._crawl_web(url, config)
elif url.startswith("file://"):
# Initialize an empty list for captured console messages
captured_console = []
# Process local file
local_file_path = url[7:] # Remove 'file://' prefix
if not os.path.exists(local_file_path):
@@ -741,18 +744,49 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
)
redirected_url = page.url
except Error as e:
raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")
# Allow navigation to be aborted when downloading files
# This is expected behavior for downloads in some browser engines
if 'net::ERR_ABORTED' in str(e) and self.browser_config.accept_downloads:
self.logger.info(
message=f"Navigation aborted, likely due to file download: {url}",
tag="GOTO",
params={"url": url},
)
response = None
else:
raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")
await self.execute_hook(
"after_goto", page, context=context, url=url, response=response, config=config
)
# ──────────────────────────────────────────────────────────────
# Walk the redirect chain. Playwright returns only the last
# hop, so we trace the `request.redirected_from` links until the
# first response that differs from the final one and surface its
# status-code.
# ──────────────────────────────────────────────────────────────
if response is None:
status_code = 200
response_headers = {}
else:
status_code = response.status
response_headers = response.headers
first_resp = response
req = response.request
while req and req.redirected_from:
prev_req = req.redirected_from
prev_resp = await prev_req.response()
if prev_resp: # keep earliest
first_resp = prev_resp
req = prev_req
status_code = first_resp.status
response_headers = first_resp.headers
else:
status_code = 200
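
For reference, a minimal standalone sketch of the same redirect-chain walk, assuming Playwright is installed; the helper name and the example URL are illustrative and not part of this change:

import asyncio
from playwright.async_api import async_playwright

async def first_hop_status(url: str) -> int:
    # Walk request.redirected_from back to the first hop, as the hunk above does,
    # and return that response's status instead of the final one.
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        response = await page.goto(url)
        first_resp, req = response, response.request
        while req and req.redirected_from:
            prev_resp = await req.redirected_from.response()
            if prev_resp:
                first_resp = prev_resp
            req = req.redirected_from
        await browser.close()
        # For an http:// page that 301-redirects to https://, this returns 301,
        # whereas response.status alone would report the final 200.
        return first_resp.status

# asyncio.run(first_hop_status("http://example.com"))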
@@ -1616,12 +1650,32 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
num_segments = (page_height // viewport_height) + 1
for i in range(num_segments):
y_offset = i * viewport_height
# Special handling for the last segment
if i == num_segments - 1:
last_part_height = page_height % viewport_height
# If page_height is an exact multiple of viewport_height,
# we don't need an extra segment
if last_part_height == 0:
# Skip last segment if page height is exact multiple of viewport
break
# Adjust viewport to exactly match the remaining content height
await page.set_viewport_size({"width": page_width, "height": last_part_height})
await page.evaluate(f"window.scrollTo(0, {y_offset})")
await asyncio.sleep(0.01) # wait for render
# Capture the current segment as a JPEG to keep the per-segment image size down
seg_shot = await page.screenshot(full_page=False, type="jpeg", quality=85)
img = Image.open(BytesIO(seg_shot)).convert("RGB")
segments.append(img)
# Reset viewport to original size after capturing segments
await page.set_viewport_size({"width": page_width, "height": viewport_height})
total_height = sum(img.height for img in segments)
stitched = Image.new("RGB", (segments[0].width, total_height))
offset = 0
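
As a worked example of the segmenting arithmetic above (the heights are illustrative, not taken from the diff):

page_height, viewport_height = 2500, 1080
num_segments = (page_height // viewport_height) + 1   # 2 + 1 = 3 segments
last_part_height = page_height % viewport_height      # 340 px for the final capture
# If page_height were an exact multiple (e.g. 2160), last_part_height would be 0
# and the loop above breaks instead of capturing an empty segment.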

View File

@@ -39,6 +39,7 @@ class LogColor(str, Enum):
YELLOW = "yellow"
MAGENTA = "magenta"
DIM_MAGENTA = "dim magenta"
RED = "red"
def __str__(self):
"""Automatically convert rich color to string."""

View File

@@ -363,7 +363,7 @@ class AsyncWebCrawler:
pdf_data=pdf_data,
verbose=config.verbose,
is_raw_html=True if url.startswith("raw:") else False,
redirected_url=async_response.redirected_url,
**kwargs,
)
@@ -506,7 +506,7 @@ class AsyncWebCrawler:
tables = media.pop("tables", [])
links = result.links.model_dump()
metadata = result.metadata
fit_html = preprocess_html_for_schema(html_content=html, text_threshold=500, max_size=300_000)
################################
@@ -588,11 +588,13 @@ class AsyncWebCrawler:
# Choose content based on input_format
content_format = config.extraction_strategy.input_format
if content_format == "fit_markdown" and not markdown_result.fit_markdown:
self.logger.warning(
message="Fit markdown requested but not available. Falling back to raw markdown.",
tag="EXTRACT",
params={"url": _url},
)
self.logger.url_status(
url=_url,
success=bool(html),
timing=time.perf_counter() - t1,
tag="EXTRACT",
)
content_format = "markdown"
content = {
@@ -616,11 +618,12 @@ class AsyncWebCrawler:
)
# Log extraction completion
self.logger.info(
message="Completed for {url:.50}... | Time: {timing}s",
tag="EXTRACT",
params={"url": _url, "timing": time.perf_counter() - t1},
)
self.logger.url_status(
url=_url,
success=bool(html),
timing=time.perf_counter() - t1,
tag="EXTRACT",
)
# Apply HTML formatting if requested
if config.prettiify:

View File

@@ -480,7 +480,7 @@ class BrowserProfiler:
self.logger.info("4. Exit", tag="MENU", base_color=LogColor.MAGENTA)
exit_option = "4"
self.logger.info(f"\n[cyan]Enter your choice (1-{exit_option}): [/cyan]", end="")
choice = input()
if choice == "1":
@@ -637,9 +637,18 @@ class BrowserProfiler:
self.logger.info(f"Debugging port: {debugging_port}", tag="CDP")
self.logger.info(f"Headless mode: {headless}", tag="CDP")
# create browser config
browser_config = BrowserConfig(
browser_type=browser_type,
headless=headless,
user_data_dir=profile_path,
debugging_port=debugging_port,
verbose=True
)
# Create managed browser instance
managed_browser = ManagedBrowser(
browser_type=browser_type,
browser_config=browser_config,
user_data_dir=profile_path,
headless=headless,
logger=self.logger,

View File

@@ -1010,7 +1010,7 @@ def cdp_cmd(user_data_dir: Optional[str], port: int, browser_type: str, headless
@click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2")
@click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "md", "markdown-fit", "md-fit"]), default="all")
@click.option("--output-file", "-O", type=click.Path(), help="Output file path (default: stdout)")
@click.option("--bypass-cache", "-b", is_flag=True, default=True, help="Bypass cache when crawling")
@click.option("--bypass-cache", "-bc", is_flag=True, default=True, help="Bypass cache when crawling")
@click.option("--question", "-q", help="Ask a question about the crawled content")
@click.option("--verbose", "-v", is_flag=True)
@click.option("--profile", "-p", help="Use a specific browser profile (by name)")

View File

@@ -720,13 +720,18 @@ class WebScrapingStrategy(ContentScrapingStrategy):
# Check whether external images should be removed
if kwargs.get("exclude_external_images", False):
element.decompose()
return False
# src_url_base = src.split('/')[2]
# url_base = url.split('/')[2]
# if url_base not in src_url_base:
# element.decompose()
# return False
# Handle relative URLs (which are always from the same domain)
if not src.startswith('http') and not src.startswith('//'):
return True # Keep relative URLs
# For absolute URLs, compare the base domains using the existing function
src_base_domain = get_base_domain(src)
url_base_domain = get_base_domain(url)
# If the domains don't match and both are valid, the image is external
if src_base_domain and url_base_domain and src_base_domain != url_base_domain:
element.decompose()
return False
# if kwargs.get('exclude_social_media_links', False):
# if image_src_base_domain in exclude_social_media_domains:
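
To make the domain comparison above concrete, a hypothetical illustration (example URLs; get_base_domain is assumed to return the registrable domain):

# Page URL: https://example.com/blog/post
#   src = "https://cdn.other.org/banner.png" -> base domains differ -> element.decompose(), return False
#   src = "/static/logo.png"                 -> relative, same origin -> kept (return True)
#   src = "https://sub.example.com/pic.jpg"  -> same registrable domain -> kept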

View File

@@ -150,6 +150,14 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl")
break
# Calculate how many more URLs we can process in this batch
remaining = self.max_pages - self._pages_crawled
batch_size = min(BATCH_SIZE, remaining)
if batch_size <= 0:
# No more pages to crawl
self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl")
break
batch: List[Tuple[float, int, str, Optional[str]]] = []
# Retrieve up to BATCH_SIZE items from the priority queue.
for _ in range(BATCH_SIZE):
@@ -184,6 +192,10 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
# Count only successful crawls toward max_pages limit
if result.success:
self._pages_crawled += 1
# Check if we've reached the limit during batch processing
if self._pages_crawled >= self.max_pages:
self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
break # Exit the generator
yield result
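
A quick worked example of the batch-sizing guard above, with an illustrative BATCH_SIZE (the constant's real value is defined elsewhere and not shown in this diff):

BATCH_SIZE = 10                              # illustrative value only
max_pages, pages_crawled = 25, 22
remaining = max_pages - pages_crawled        # 3
batch_size = min(BATCH_SIZE, remaining)      # 3 -> only three more URLs are pulled
# If pages_crawled were already 25, batch_size would be 0 and the crawl stops.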

View File

@@ -157,6 +157,11 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
results: List[CrawlResult] = []
while current_level and not self._cancel_event.is_set():
# Check if we've already reached max_pages before starting a new level
if self._pages_crawled >= self.max_pages:
self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl")
break
next_level: List[Tuple[str, Optional[str]]] = []
urls = [url for url, _ in current_level]
@@ -221,6 +226,10 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
# Count only successful crawls
if result.success:
self._pages_crawled += 1
# Check if we've reached the limit during batch processing
if self._pages_crawled >= self.max_pages:
self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
break # Exit the generator
results_count += 1
yield result

View File

@@ -49,6 +49,10 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
# Count only successful crawls toward max_pages limit
if result.success:
self._pages_crawled += 1
# Check if we've reached the limit during batch processing
if self._pages_crawled >= self.max_pages:
self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
break # Exit the generator
# Only discover links from successful crawls
new_links: List[Tuple[str, Optional[str]]] = []
@@ -94,6 +98,10 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
# and only discover links from successful crawls
if result.success:
self._pages_crawled += 1
# Check if we've reached the limit during batch processing
if self._pages_crawled >= self.max_pages:
self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
break # Exit the generator
new_links: List[Tuple[str, Optional[str]]] = []
await self.link_discovery(result, url, depth, visited, new_links, depths)

View File

@@ -73,6 +73,8 @@ class Crawl4aiDockerClient:
def _prepare_request(self, urls: List[str], browser_config: Optional[BrowserConfig] = None,
crawler_config: Optional[CrawlerRunConfig] = None) -> Dict[str, Any]:
"""Prepare request data from configs."""
if self._token:
self._http_client.headers["Authorization"] = f"Bearer {self._token}"
return {
"urls": urls,
"browser_config": browser_config.dump() if browser_config else {},
@@ -103,8 +105,6 @@ class Crawl4aiDockerClient:
crawler_config: Optional[CrawlerRunConfig] = None
) -> Union[CrawlResult, List[CrawlResult], AsyncGenerator[CrawlResult, None]]:
"""Execute a crawl operation."""
if not self._token:
raise Crawl4aiClientError("Authentication required. Call authenticate() first.")
await self._check_server()
data = self._prepare_request(urls, browser_config, crawler_config)
@@ -140,8 +140,6 @@ class Crawl4aiDockerClient:
async def get_schema(self) -> Dict[str, Any]:
"""Retrieve configuration schemas."""
if not self._token:
raise Crawl4aiClientError("Authentication required. Call authenticate() first.")
response = await self._request("GET", "/schema")
return response.json()
@@ -167,4 +165,4 @@ async def main():
print(schema)
if __name__ == "__main__":
asyncio.run(main())
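
For context, an illustrative usage sketch of the behaviour after this change, assuming the documented constructor, import path, and async context-manager interface of Crawl4aiDockerClient (none of which are shown in this diff):

# from crawl4ai.docker_client import Crawl4aiDockerClient  # import path assumed

async def demo():
    async with Crawl4aiDockerClient(base_url="http://localhost:11235") as client:
        schema = await client.get_schema()   # no longer requires authenticate() first
        # If authenticate() had been called, _prepare_request now attaches
        # "Authorization: Bearer <token>" to subsequent requests instead.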

View File

@@ -656,11 +656,11 @@ class LLMExtractionStrategy(ExtractionStrategy):
self.total_usage.total_tokens += usage.total_tokens
try:
content = response.choices[0].message.content
blocks = None
if self.force_json_response:
blocks = json.loads(content)
if isinstance(blocks, dict):
# If it has a single key whose value is a list, use that list as blocks, e.g. {"news": [...]}
if len(blocks) == 1 and isinstance(list(blocks.values())[0], list):
@@ -673,7 +673,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
blocks = blocks
else:
# blocks = extract_xml_data(["blocks"], response.choices[0].message.content)["blocks"]
blocks = extract_xml_data(["blocks"], response)["blocks"]
blocks = extract_xml_data(["blocks"], content)["blocks"]
blocks = json.loads(blocks)
for block in blocks:
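
As an illustration of the single-key unwrapping the comment above describes (made-up payload):

import json

content = '{"news": [{"title": "a"}, {"title": "b"}]}'
blocks = json.loads(content)
if isinstance(blocks, dict) and len(blocks) == 1 and isinstance(list(blocks.values())[0], list):
    blocks = list(blocks.values())[0]
# blocks -> [{"title": "a"}, {"title": "b"}]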

View File

@@ -50,6 +50,29 @@ from urllib.parse import (
)
# Monkey patch to fix wildcard handling in urllib.robotparser
from urllib.robotparser import RuleLine
import re
original_applies_to = RuleLine.applies_to
def patched_applies_to(self, filename):
# Handle wildcards in paths
if '*' in self.path or '%2A' in self.path or self.path in ("*", "%2A"):
pattern = self.path.replace('%2A', '*')
pattern = re.escape(pattern).replace('\\*', '.*')
pattern = '^' + pattern
if pattern.endswith('\\$'):
pattern = pattern[:-2] + '$'
try:
return bool(re.match(pattern, filename))
except re.error:
return original_applies_to(self, filename)
return original_applies_to(self, filename)
RuleLine.applies_to = patched_applies_to
# Monkey patch ends
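
A quick sanity check of the patched matcher, assuming the patch above has already been applied (the paths are illustrative):

from urllib.robotparser import RuleLine

rule = RuleLine("/private/*/data", False)      # Disallow rule containing a wildcard
print(rule.applies_to("/private/2024/data"))   # True with the patch (regex match)
print(rule.applies_to("/public/index.html"))   # False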
def chunk_documents(
documents: Iterable[str],
chunk_token_threshold: int,
@@ -318,7 +341,7 @@ class RobotsParser:
robots_url = f"{scheme}://{domain}/robots.txt"
async with aiohttp.ClientSession() as session:
async with session.get(robots_url, timeout=2, ssl=False) as response:
if response.status == 200:
rules = await response.text()
self._cache_rules(domain, rules)
@@ -1524,6 +1547,13 @@ def extract_metadata_using_lxml(html, doc=None):
content = tag.get("content", "").strip()
if property_name and content:
metadata[property_name] = content
# Article metadata - using starts-with() for performance
article_tags = head.xpath('.//meta[starts-with(@property, "article:")]')
for tag in article_tags:
property_name = tag.get("property", "").strip()
content = tag.get("content", "").strip()
if property_name and content:
metadata[property_name] = content
return metadata
@@ -1599,7 +1629,12 @@ def extract_metadata(html, soup=None):
content = tag.get("content", "").strip()
if property_name and content:
metadata[property_name] = content
# Collect article:* meta tag values
metadata.update({
tag['property'].strip():tag["content"].strip()
for tag in head.find_all("meta", attrs={"property": re.compile(r"^article:")})
if tag.has_attr('property') and tag.has_attr('content')
})
return metadata
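
For clarity, what both article:* additions above collect from a head such as this (made-up values):

# <head>
#   <meta property="article:author" content="Jane Doe">
#   <meta property="article:published_time" content="2025-05-02T10:00:00Z">
# </head>
# Both the lxml and the BeautifulSoup paths add:
#   metadata["article:author"] = "Jane Doe"
#   metadata["article:published_time"] = "2025-05-02T10:00:00Z"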
@@ -2068,14 +2103,16 @@ def normalize_url(href, base_url):
parsed_base = urlparse(base_url)
if not parsed_base.scheme or not parsed_base.netloc:
raise ValueError(f"Invalid base URL format: {base_url}")
# Ensure base_url ends with a trailing slash if it's a directory path
if not base_url.endswith('/'):
base_url = base_url + '/'
if parsed_base.scheme.lower() not in ["http", "https"]:
# Reject non-http(s) schemes (special protocols are not supported as a base)
raise ValueError(f"Invalid base URL format: {base_url}")
cleaned_href = href.strip()
# Use urljoin to handle all cases
return urljoin(base_url, cleaned_href)
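
A short check of why the trailing slash matters to urljoin (illustrative URLs):

from urllib.parse import urljoin

urljoin("https://example.com/docs", "page.html")    # 'https://example.com/page.html'
urljoin("https://example.com/docs/", "page.html")   # 'https://example.com/docs/page.html'
# Forcing the trailing slash keeps relative hrefs anchored below the last path segment.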
def normalize_url(