Merge branch '2025-MAY-2' into next-MAY

2025-07-08 11:46:13 +02:00
parent 1a73fb60db 414f16e975
commit 0f210f6e02
28 changed files with 448 additions and 154 deletions
--- a/deploy/docker/api.py
+++ b/deploy/docker/api.py
@@ -459,7 +459,7 @@ async def handle_crawl_request(
            #      await crawler.close()
            #  except Exception as close_e:
            #       logger.error(f"Error closing crawler during exception handling: {close_e}")
-            logger.error(f"Error closing crawler during exception handling: {close_e}")
+            logger.error(f"Error closing crawler during exception handling: {str(e)}")

        # Measure memory even on error if possible
        end_mem_mb_error = _get_memory_mb()
@@ -518,7 +518,7 @@ async def handle_stream_crawl_request(
            #       await crawler.close()
            #  except Exception as close_e:
            #       logger.error(f"Error closing crawler during stream setup exception: {close_e}")
-            logger.error(f"Error closing crawler during stream setup exception: {close_e}")
+            logger.error(f"Error closing crawler during stream setup exception: {str(e)}")
        logger.error(f"Stream crawl error: {str(e)}", exc_info=True)
        # Raising HTTPException here will prevent streaming response
        raise HTTPException(
--- a/deploy/docker/c4ai-doc-context.md
+++ b/deploy/docker/c4ai-doc-context.md
@@ -403,7 +403,7 @@ async def main():

    md_generator = DefaultMarkdownGenerator(
    content_filter=filter,
-    options={"ignore_links": True}
+    options={"ignore_links": True})

    # 4) Crawler run config: skip cache, use extraction
    run_conf = CrawlerRunConfig(
@@ -3760,11 +3760,11 @@ To crawl a live web page, provide the URL starting with `http://` or `https://`,

 ```python
 import asyncio
-from crawl4ai import AsyncWebCrawler
+from crawl4ai import AsyncWebCrawler, CacheMode
 from crawl4ai.async_configs import CrawlerRunConfig

 async def crawl_web():
-    config = CrawlerRunConfig(bypass_cache=True)
+    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://en.wikipedia.org/wiki/apple", 
@@ -3785,13 +3785,13 @@ To crawl a local HTML file, prefix the file path with `file://`.

 ```python
 import asyncio
-from crawl4ai import AsyncWebCrawler
+from crawl4ai import AsyncWebCrawler, CacheMode
 from crawl4ai.async_configs import CrawlerRunConfig

 async def crawl_local_file():
    local_file_path = "/path/to/apple.html"  # Replace with your file path
    file_url = f"file://{local_file_path}"
-    config = CrawlerRunConfig(bypass_cache=True)
+    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
    
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url=file_url, config=config)
@@ -3810,13 +3810,13 @@ To crawl raw HTML content, prefix the HTML string with `raw:`.

 ```python
 import asyncio
-from crawl4ai import AsyncWebCrawler
+from crawl4ai import AsyncWebCrawler, CacheMode
 from crawl4ai.async_configs import CrawlerRunConfig

 async def crawl_raw_html():
    raw_html = "<html><body><h1>Hello, World!</h1></body></html>"
    raw_html_url = f"raw:{raw_html}"
-    config = CrawlerRunConfig(bypass_cache=True)
+    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
    
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url=raw_html_url, config=config)
@@ -3845,7 +3845,7 @@ import os
 import sys
 import asyncio
 from pathlib import Path
-from crawl4ai import AsyncWebCrawler
+from crawl4ai import AsyncWebCrawler, CacheMode
 from crawl4ai.async_configs import CrawlerRunConfig

 async def main():
@@ -3856,7 +3856,7 @@ async def main():
    async with AsyncWebCrawler() as crawler:
        # Step 1: Crawl the Web URL
        print("\n=== Step 1: Crawling the Wikipedia URL ===")
-        web_config = CrawlerRunConfig(bypass_cache=True)
+        web_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
        result = await crawler.arun(url=wikipedia_url, config=web_config)

        if not result.success:
@@ -3871,7 +3871,7 @@ async def main():
        # Step 2: Crawl from the Local HTML File
        print("=== Step 2: Crawling from the Local HTML File ===")
        file_url = f"file://{html_file_path.resolve()}"
-        file_config = CrawlerRunConfig(bypass_cache=True)
+        file_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
        local_result = await crawler.arun(url=file_url, config=file_config)

        if not local_result.success:
@@ -3887,7 +3887,7 @@ async def main():
        with open(html_file_path, 'r', encoding='utf-8') as f:
            raw_html_content = f.read()
        raw_html_url = f"raw:{raw_html_content}"
-        raw_config = CrawlerRunConfig(bypass_cache=True)
+        raw_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
        raw_result = await crawler.arun(url=raw_html_url, config=raw_config)

        if not raw_result.success:
@@ -4152,7 +4152,7 @@ prune_filter = PruningContentFilter(
 For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:

 ```python
-from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator
 from crawl4ai.content_filter_strategy import LLMContentFilter

 async def main():
@@ -4175,8 +4175,13 @@ async def main():
        verbose=True
    )

+    md_generator = DefaultMarkdownGenerator(
+        content_filter=filter,
+        options={"ignore_links": True}
+    )
+
    config = CrawlerRunConfig(
-        content_filter=filter
+        markdown_generator=md_generator
    )

    async with AsyncWebCrawler() as crawler:
@@ -5428,29 +5433,38 @@ Sometimes you need a visual record of a page or a PDF “printout.” Crawl4AI c
 ```python
 import os, asyncio
 from base64 import b64decode
-from crawl4ai import AsyncWebCrawler, CacheMode
+from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig

 async def main():
+    run_config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        screenshot=True,
+        pdf=True
+    )
+
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://en.wikipedia.org/wiki/List_of_common_misconceptions",
-            cache_mode=CacheMode.BYPASS,
-            pdf=True,
-            screenshot=True
+            config=run_config
        )
-        
        if result.success:
-            # Save screenshot
+            print(f"Screenshot data present: {result.screenshot is not None}")
+            print(f"PDF data present: {result.pdf is not None}")
+
            if result.screenshot:
+                print(f"[OK] Screenshot captured, size: {len(result.screenshot)} bytes")
                with open("wikipedia_screenshot.png", "wb") as f:
                    f.write(b64decode(result.screenshot))
-            
-            # Save PDF
+            else:
+                print("[WARN] Screenshot data is None.")
+
            if result.pdf:
+                print(f"[OK] PDF captured, size: {len(result.pdf)} bytes")
                with open("wikipedia_page.pdf", "wb") as f:
                    f.write(result.pdf)
-            
-            print("[OK] PDF & screenshot captured.")
+            else:
+                print("[WARN] PDF data is None.")
+
        else:
            print("[ERROR]", result.error_message)

--- a/deploy/docker/schemas.py
+++ b/deploy/docker/schemas.py
@@ -12,8 +12,7 @@ class CrawlRequest(BaseModel):
 class MarkdownRequest(BaseModel):
    """Request body for the /md endpoint."""
    url: str                    = Field(...,  description="Absolute http/https URL to fetch")
-    f:   FilterType             = Field(FilterType.FIT,
-                                        description="Content‑filter strategy: FIT, RAW, BM25, or LLM")
+    f:   FilterType             = Field(FilterType.FIT, description="Content‑filter strategy: fit, raw, bm25, or llm")
    q:   Optional[str] = Field(None,  description="Query string used by BM25/LLM filters")
    c:   Optional[str] = Field("0",   description="Cache‑bust / revision counter")