Merge branch '2025-APR-1' into 2025-MAY-2
@@ -744,18 +744,49 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                 )
                 redirected_url = page.url
             except Error as e:
-                raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")
+                # Allow navigation to be aborted when downloading files
+                # This is expected behavior for downloads in some browser engines
+                if 'net::ERR_ABORTED' in str(e) and self.browser_config.accept_downloads:
+                    self.logger.info(
+                        message=f"Navigation aborted, likely due to file download: {url}",
+                        tag="GOTO",
+                        params={"url": url},
+                    )
+                    response = None
+                else:
+                    raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")

             await self.execute_hook(
                 "after_goto", page, context=context, url=url, response=response, config=config
             )

+            # ──────────────────────────────────────────────────────────────
+            # Walk the redirect chain. Playwright returns only the last
+            # hop, so we trace the `request.redirected_from` links until the
+            # first response that differs from the final one and surface its
+            # status-code.
+            # ──────────────────────────────────────────────────────────────
             if response is None:
                 status_code = 200
                 response_headers = {}
             else:
-                status_code = response.status
-                response_headers = response.headers
+                first_resp = response
+                req = response.request
+                while req and req.redirected_from:
+                    prev_req = req.redirected_from
+                    prev_resp = await prev_req.response()
+                    if prev_resp:  # keep earliest
+                        first_resp = prev_resp
+                    req = prev_req
+
+                status_code = first_resp.status
+                response_headers = first_resp.headers
+            # if response is None:
+            #     status_code = 200
+            #     response_headers = {}
+            # else:
+            #     status_code = response.status
+            #     response_headers = response.headers
         else:
             status_code = 200
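The redirect-chain walk above relies on Playwright's `request.redirected_from` links. A minimal standalone sketch of the same idea outside the crawler; the `first_hop_status` helper and the example URL are illustrative, not part of the commit:

```python
import asyncio
from playwright.async_api import async_playwright

async def first_hop_status(url: str) -> int:
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        response = await page.goto(url)
        first_resp = response
        req = response.request
        # Each request points back at the request it was redirected from,
        # so following the links backwards reaches the first hop.
        while req and req.redirected_from:
            prev_resp = await req.redirected_from.response()
            if prev_resp:
                first_resp = prev_resp
            req = req.redirected_from
        await browser.close()
        return first_resp.status  # e.g. 301/302 for the first redirect hop

# asyncio.run(first_hop_status("https://example.com/some-redirecting-url"))
```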
@@ -1435,12 +1466,32 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
             num_segments = (page_height // viewport_height) + 1
             for i in range(num_segments):
                 y_offset = i * viewport_height
+                # Special handling for the last segment
+                if i == num_segments - 1:
+                    last_part_height = page_height % viewport_height
+
+                    # If page_height is an exact multiple of viewport_height,
+                    # we don't need an extra segment
+                    if last_part_height == 0:
+                        # Skip last segment if page height is exact multiple of viewport
+                        break
+
+                    # Adjust viewport to exactly match the remaining content height
+                    await page.set_viewport_size({"width": page_width, "height": last_part_height})
+
                 await page.evaluate(f"window.scrollTo(0, {y_offset})")
                 await asyncio.sleep(0.01)  # wait for render
-                seg_shot = await page.screenshot(full_page=False)
+                # Capture the current segment
+                # Note: Using compression options (format, quality) would go here
+                seg_shot = await page.screenshot(full_page=False, type="jpeg", quality=85)
+                # seg_shot = await page.screenshot(full_page=False)
                 img = Image.open(BytesIO(seg_shot)).convert("RGB")
                 segments.append(img)

+            # Reset viewport to original size after capturing segments
+            await page.set_viewport_size({"width": page_width, "height": viewport_height})
+
             total_height = sum(img.height for img in segments)
             stitched = Image.new("RGB", (segments[0].width, total_height))
             offset = 0
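The hunk stops at `offset = 0`; the stitching step that normally follows pastes each captured segment below the previous one. A hedged sketch of that loop with Pillow (the `stitch_segments` helper name is illustrative, not from the diff):

```python
from io import BytesIO
from PIL import Image

def stitch_segments(segment_bytes: list[bytes]) -> Image.Image:
    # Decode each captured viewport screenshot
    segments = [Image.open(BytesIO(b)).convert("RGB") for b in segment_bytes]
    total_height = sum(img.height for img in segments)
    stitched = Image.new("RGB", (segments[0].width, total_height))
    offset = 0
    for img in segments:
        stitched.paste(img, (0, offset))  # place each segment below the previous one
        offset += img.height
    return stitched
```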
@@ -585,11 +585,13 @@ class AsyncWebCrawler:
                 # Choose content based on input_format
                 content_format = config.extraction_strategy.input_format
                 if content_format == "fit_markdown" and not markdown_result.fit_markdown:
-                    self.logger.warning(
-                        message="Fit markdown requested but not available. Falling back to raw markdown.",
-                        tag="EXTRACT",
-                        params={"url": _url},
-                    )
+                    self.logger.url_status(
+                        url=_url,
+                        success=bool(html),
+                        timing=time.perf_counter() - t1,
+                        tag="EXTRACT",
+                    )
                     content_format = "markdown"

                 content = {
@@ -613,11 +615,12 @@ class AsyncWebCrawler:
                 )

                 # Log extraction completion
-                self.logger.info(
-                    message="Completed for {url:.50}... | Time: {timing}s",
-                    tag="EXTRACT",
-                    params={"url": _url, "timing": time.perf_counter() - t1},
-                )
+                self.logger.url_status(
+                    url=_url,
+                    success=bool(html),
+                    timing=time.perf_counter() - t1,
+                    tag="EXTRACT",
+                )

             # Apply HTML formatting if requested
             if config.prettiify:
@@ -615,9 +615,18 @@ class BrowserProfiler:
         self.logger.info(f"Debugging port: {debugging_port}", tag="CDP")
         self.logger.info(f"Headless mode: {headless}", tag="CDP")

+        # create browser config
+        browser_config = BrowserConfig(
+            browser_type=browser_type,
+            headless=headless,
+            user_data_dir=profile_path,
+            debugging_port=debugging_port,
+            verbose=True
+        )
+
         # Create managed browser instance
         managed_browser = ManagedBrowser(
-            browser_type=browser_type,
+            browser_config=browser_config,
             user_data_dir=profile_path,
             headless=headless,
             logger=self.logger,
@@ -718,13 +718,18 @@ class WebScrapingStrategy(ContentScrapingStrategy):

         # Check flag if we should remove external images
         if kwargs.get("exclude_external_images", False):
-            element.decompose()
-            return False
-            # src_url_base = src.split('/')[2]
-            # url_base = url.split('/')[2]
-            # if url_base not in src_url_base:
-            # element.decompose()
-            # return False
+            # Handle relative URLs (which are always from the same domain)
+            if not src.startswith('http') and not src.startswith('//'):
+                return True  # Keep relative URLs
+
+            # For absolute URLs, compare the base domains using the existing function
+            src_base_domain = get_base_domain(src)
+            url_base_domain = get_base_domain(url)
+
+            # If the domains don't match and both are valid, the image is external
+            if src_base_domain and url_base_domain and src_base_domain != url_base_domain:
+                element.decompose()
+                return False

         # if kwargs.get('exclude_social_media_links', False):
         #     if image_src_base_domain in exclude_social_media_domains:
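For context, a rough stand-in for the domain comparison used above; `base_domain` here only approximates crawl4ai's `get_base_domain` helper, and the URLs are placeholders:

```python
from urllib.parse import urlparse

def base_domain(url: str) -> str:
    # Crude approximation: keep the last two labels of the hostname
    host = urlparse(url).netloc.lower().split(":")[0]
    parts = host.split(".")
    return ".".join(parts[-2:]) if len(parts) >= 2 else host

page_url = "https://blog.example.com/post"
img_src = "https://cdn.other-site.com/pic.png"
# Different base domains -> the image would be treated as external and removed
print(base_domain(img_src) != base_domain(page_url))  # True
```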
@@ -150,6 +150,14 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
                 self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl")
                 break

+            # Calculate how many more URLs we can process in this batch
+            remaining = self.max_pages - self._pages_crawled
+            batch_size = min(BATCH_SIZE, remaining)
+            if batch_size <= 0:
+                # No more pages to crawl
+                self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl")
+                break
+
             batch: List[Tuple[float, int, str, Optional[str]]] = []
             # Retrieve up to BATCH_SIZE items from the priority queue.
             for _ in range(BATCH_SIZE):
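The added block clamps each batch to the remaining `max_pages` budget. A tiny illustration of that arithmetic (the function name and values are made up for the example):

```python
BATCH_SIZE = 10

def next_batch_size(max_pages: int, pages_crawled: int) -> int:
    remaining = max_pages - pages_crawled
    return max(0, min(BATCH_SIZE, remaining))

assert next_batch_size(max_pages=25, pages_crawled=20) == 5   # partial batch
assert next_batch_size(max_pages=25, pages_crawled=25) == 0   # stop crawling
```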
@@ -184,6 +192,10 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
                 # Count only successful crawls toward max_pages limit
                 if result.success:
                     self._pages_crawled += 1
+                    # Check if we've reached the limit during batch processing
+                    if self._pages_crawled >= self.max_pages:
+                        self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
+                        break  # Exit the generator

                 yield result
@@ -157,6 +157,11 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
         results: List[CrawlResult] = []

         while current_level and not self._cancel_event.is_set():
+            # Check if we've already reached max_pages before starting a new level
+            if self._pages_crawled >= self.max_pages:
+                self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl")
+                break
+
             next_level: List[Tuple[str, Optional[str]]] = []
             urls = [url for url, _ in current_level]
@@ -221,6 +226,10 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
                 # Count only successful crawls
                 if result.success:
                     self._pages_crawled += 1
+                    # Check if we've reached the limit during batch processing
+                    if self._pages_crawled >= self.max_pages:
+                        self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
+                        break  # Exit the generator

                 results_count += 1
                 yield result
@@ -49,6 +49,10 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
             # Count only successful crawls toward max_pages limit
             if result.success:
                 self._pages_crawled += 1
+                # Check if we've reached the limit during batch processing
+                if self._pages_crawled >= self.max_pages:
+                    self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
+                    break  # Exit the generator

             # Only discover links from successful crawls
             new_links: List[Tuple[str, Optional[str]]] = []
@@ -94,6 +98,10 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
                 # and only discover links from successful crawls
                 if result.success:
                     self._pages_crawled += 1
+                    # Check if we've reached the limit during batch processing
+                    if self._pages_crawled >= self.max_pages:
+                        self.logger.info(f"Max pages limit ({self.max_pages}) reached during batch, stopping crawl")
+                        break  # Exit the generator

                     new_links: List[Tuple[str, Optional[str]]] = []
                     await self.link_discovery(result, url, depth, visited, new_links, depths)
@@ -42,6 +42,29 @@ from itertools import chain
 from collections import deque
 from typing import Generator, Iterable

+# Monkey patch to fix wildcard handling in urllib.robotparser
+from urllib.robotparser import RuleLine
+import re
+
+original_applies_to = RuleLine.applies_to
+
+def patched_applies_to(self, filename):
+    # Handle wildcards in paths
+    if '*' in self.path or '%2A' in self.path or self.path in ("*", "%2A"):
+        pattern = self.path.replace('%2A', '*')
+        pattern = re.escape(pattern).replace('\\*', '.*')
+        pattern = '^' + pattern
+        if pattern.endswith('\\$'):
+            pattern = pattern[:-2] + '$'
+        try:
+            return bool(re.match(pattern, filename))
+        except re.error:
+            return original_applies_to(self, filename)
+    return original_applies_to(self, filename)
+
+RuleLine.applies_to = patched_applies_to
+# Monkey patch ends
+
 def chunk_documents(
     documents: Iterable[str],
     chunk_token_threshold: int,
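With the patch applied, wildcard rules behave as globs instead of literal prefixes. A quick check against the stdlib parser, assuming the patched module has been imported first; the rules and URLs are example data:

```python
from urllib.robotparser import RobotFileParser

parser = RobotFileParser()
parser.parse([
    "User-agent: *",
    "Disallow: /private/*.html",
])

# Stock urllib treats "/private/*.html" as a literal prefix and would allow this URL;
# with the patched RuleLine the wildcard matches and the URL is disallowed.
print(parser.can_fetch("*", "https://example.com/private/page.html"))  # False
print(parser.can_fetch("*", "https://example.com/public/page.html"))   # True
```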
@@ -303,7 +326,7 @@ class RobotsParser:
             robots_url = f"{scheme}://{domain}/robots.txt"

             async with aiohttp.ClientSession() as session:
-                async with session.get(robots_url, timeout=2) as response:
+                async with session.get(robots_url, timeout=2, ssl=False) as response:
                     if response.status == 200:
                         rules = await response.text()
                         self._cache_rules(domain, rules)
@@ -403,7 +403,7 @@ async def main():

     md_generator = DefaultMarkdownGenerator(
         content_filter=filter,
-        options={"ignore_links": True}
+        options={"ignore_links": True})

     # 4) Crawler run config: skip cache, use extraction
     run_conf = CrawlerRunConfig(
@@ -4152,7 +4152,7 @@ prune_filter = PruningContentFilter(
 For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:

 ```python
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator
 from crawl4ai.content_filter_strategy import LLMContentFilter

 async def main():
@@ -4175,8 +4175,13 @@ async def main():
         verbose=True
     )

+    md_generator = DefaultMarkdownGenerator(
+        content_filter=filter,
+        options={"ignore_links": True}
+    )
+
     config = CrawlerRunConfig(
-        content_filter=filter
+        markdown_generator=md_generator
     )

     async with AsyncWebCrawler() as crawler:
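Put together, the corrected docs pattern wires the filter into a markdown generator and passes the generator (not the filter) to `CrawlerRunConfig`. A condensed, hedged sketch of that wiring; the provider, instruction, and URL are placeholders:

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import LLMContentFilter

async def main():
    filter = LLMContentFilter(
        llm_config=LLMConfig(provider="openai/gpt-4o", api_token="env:OPENAI_API_KEY"),
        instruction="Keep only the main article content as clean markdown.",
        chunk_token_threshold=4096,
        verbose=True,
    )
    md_generator = DefaultMarkdownGenerator(
        content_filter=filter,
        options={"ignore_links": True},
    )
    config = CrawlerRunConfig(markdown_generator=md_generator)

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com", config=config)
        # The filtered output typically lands in the fit_markdown field
        print(result.markdown.fit_markdown)

asyncio.run(main())
```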
@@ -273,7 +273,7 @@ In a typical scenario, you define **one** `BrowserConfig` for your crawler sessi

 ```python
 import asyncio
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig, LLMContentFilter, DefaultMarkdownGenerator
 from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

 async def main():
@@ -298,7 +298,7 @@ async def main():
     # 3) Example LLM content filtering

     gemini_config = LLMConfig(
-        provider="gemini/gemini-1.5-pro"
+        provider="gemini/gemini-1.5-pro",
         api_token = "env:GEMINI_API_TOKEN"
     )
@@ -322,8 +322,9 @@ async def main():
     )

     md_generator = DefaultMarkdownGenerator(
         content_filter=filter,
         options={"ignore_links": True}
+    )

     # 4) Crawler run config: skip cache, use extraction
     run_conf = CrawlerRunConfig(
@@ -17,6 +17,9 @@
 - [Configuration Reference](#configuration-reference)
 - [Best Practices & Tips](#best-practices--tips)

+## Installation
+The Crawl4AI CLI will be installed automatically when you install the library.
+
 ## Basic Usage

 The Crawl4AI CLI (`crwl`) provides a simple interface to the Crawl4AI library:
@@ -233,7 +233,7 @@ prune_filter = PruningContentFilter(
 For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:

 ```python
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator
 from crawl4ai.content_filter_strategy import LLMContentFilter

 async def main():
@@ -255,9 +255,12 @@ async def main():
         chunk_token_threshold=4096,  # Adjust based on your needs
         verbose=True
     )
+    md_generator = DefaultMarkdownGenerator(
+        content_filter=filter,
+        options={"ignore_links": True}
+    )
     config = CrawlerRunConfig(
-        content_filter=filter
+        markdown_generator=md_generator,
     )

     async with AsyncWebCrawler() as crawler:
@@ -17,7 +17,7 @@ dependencies = [
     "lxml~=5.3",
     "litellm>=1.53.1",
     "numpy>=1.26.0,<3",
-    "pillow~=10.4",
+    "pillow>=10.4",
     "playwright>=1.49.0",
     "python-dotenv~=1.0",
     "requests~=2.26",
@@ -33,7 +33,6 @@ dependencies = [
     "psutil>=6.1.1",
     "nltk>=3.9.1",
     "playwright",
-    "aiofiles",
     "rich>=13.9.4",
     "cssselect>=1.2.0",
     "httpx>=0.27.2",
@@ -4,7 +4,7 @@ aiosqlite~=0.20
 lxml~=5.3
 litellm>=1.53.1
 numpy>=1.26.0,<3
-pillow~=10.4
+pillow>=10.4
 playwright>=1.49.0
 python-dotenv~=1.0
 requests~=2.26
@@ -23,3 +23,6 @@ rich>=13.9.4
 cssselect>=1.2.0
 chardet>=5.2.0
 brotli>=1.1.0
+fake-useragent>=2.2.0
+pdf2image>=1.17.0
+PyPDF2>=3.0.1