Merge branch '2025-MAY-2' into next-MAY

This commit is contained in:
ntohidi
2025-07-08 11:46:13 +02:00
28 changed files with 448 additions and 154 deletions

View File

@@ -459,7 +459,7 @@ async def handle_crawl_request(
# await crawler.close()
# except Exception as close_e:
# logger.error(f"Error closing crawler during exception handling: {close_e}")
logger.error(f"Error closing crawler during exception handling: {close_e}")
logger.error(f"Error closing crawler during exception handling: {str(e)}")
# Measure memory even on error if possible
end_mem_mb_error = _get_memory_mb()
@@ -518,7 +518,7 @@ async def handle_stream_crawl_request(
# await crawler.close()
# except Exception as close_e:
# logger.error(f"Error closing crawler during stream setup exception: {close_e}")
logger.error(f"Error closing crawler during stream setup exception: {close_e}")
logger.error(f"Error closing crawler during stream setup exception: {str(e)}")
logger.error(f"Stream crawl error: {str(e)}", exc_info=True)
# Raising HTTPException here will prevent streaming response
raise HTTPException(

View File

@@ -403,7 +403,7 @@ async def main():
md_generator = DefaultMarkdownGenerator(
content_filter=filter,
options={"ignore_links": True}
options={"ignore_links": True})
# 4) Crawler run config: skip cache, use extraction
run_conf = CrawlerRunConfig(
@@ -3760,11 +3760,11 @@ To crawl a live web page, provide the URL starting with `http://` or `https://`,
```python
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.async_configs import CrawlerRunConfig
async def crawl_web():
config = CrawlerRunConfig(bypass_cache=True)
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://en.wikipedia.org/wiki/apple",
@@ -3785,13 +3785,13 @@ To crawl a local HTML file, prefix the file path with `file://`.
```python
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.async_configs import CrawlerRunConfig
async def crawl_local_file():
local_file_path = "/path/to/apple.html" # Replace with your file path
file_url = f"file://{local_file_path}"
config = CrawlerRunConfig(bypass_cache=True)
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url=file_url, config=config)
@@ -3810,13 +3810,13 @@ To crawl raw HTML content, prefix the HTML string with `raw:`.
```python
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.async_configs import CrawlerRunConfig
async def crawl_raw_html():
raw_html = "<html><body><h1>Hello, World!</h1></body></html>"
raw_html_url = f"raw:{raw_html}"
config = CrawlerRunConfig(bypass_cache=True)
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url=raw_html_url, config=config)
@@ -3845,7 +3845,7 @@ import os
import sys
import asyncio
from pathlib import Path
from crawl4ai import AsyncWebCrawler
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.async_configs import CrawlerRunConfig
async def main():
@@ -3856,7 +3856,7 @@ async def main():
async with AsyncWebCrawler() as crawler:
# Step 1: Crawl the Web URL
print("\n=== Step 1: Crawling the Wikipedia URL ===")
web_config = CrawlerRunConfig(bypass_cache=True)
web_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
result = await crawler.arun(url=wikipedia_url, config=web_config)
if not result.success:
@@ -3871,7 +3871,7 @@ async def main():
# Step 2: Crawl from the Local HTML File
print("=== Step 2: Crawling from the Local HTML File ===")
file_url = f"file://{html_file_path.resolve()}"
file_config = CrawlerRunConfig(bypass_cache=True)
file_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
local_result = await crawler.arun(url=file_url, config=file_config)
if not local_result.success:
@@ -3887,7 +3887,7 @@ async def main():
with open(html_file_path, 'r', encoding='utf-8') as f:
raw_html_content = f.read()
raw_html_url = f"raw:{raw_html_content}"
raw_config = CrawlerRunConfig(bypass_cache=True)
raw_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
raw_result = await crawler.arun(url=raw_html_url, config=raw_config)
if not raw_result.success:
@@ -4152,7 +4152,7 @@ prune_filter = PruningContentFilter(
For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:
```python
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import LLMContentFilter
async def main():
@@ -4175,8 +4175,13 @@ async def main():
verbose=True
)
md_generator = DefaultMarkdownGenerator(
content_filter=filter,
options={"ignore_links": True}
)
config = CrawlerRunConfig(
content_filter=filter
markdown_generator=md_generator
)
async with AsyncWebCrawler() as crawler:
@@ -5428,29 +5433,38 @@ Sometimes you need a visual record of a page or a PDF “printout.” Crawl4AI c
```python
import os, asyncio
from base64 import b64decode
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
async def main():
run_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
screenshot=True,
pdf=True
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://en.wikipedia.org/wiki/List_of_common_misconceptions",
cache_mode=CacheMode.BYPASS,
pdf=True,
screenshot=True
config=run_config
)
if result.success:
# Save screenshot
print(f"Screenshot data present: {result.screenshot is not None}")
print(f"PDF data present: {result.pdf is not None}")
if result.screenshot:
print(f"[OK] Screenshot captured, size: {len(result.screenshot)} bytes")
with open("wikipedia_screenshot.png", "wb") as f:
f.write(b64decode(result.screenshot))
# Save PDF
else:
print("[WARN] Screenshot data is None.")
if result.pdf:
print(f"[OK] PDF captured, size: {len(result.pdf)} bytes")
with open("wikipedia_page.pdf", "wb") as f:
f.write(result.pdf)
print("[OK] PDF & screenshot captured.")
else:
print("[WARN] PDF data is None.")
else:
print("[ERROR]", result.error_message)

View File

@@ -12,8 +12,7 @@ class CrawlRequest(BaseModel):
class MarkdownRequest(BaseModel):
"""Request body for the /md endpoint."""
url: str = Field(..., description="Absolute http/https URL to fetch")
f: FilterType = Field(FilterType.FIT,
description="Contentfilter strategy: FIT, RAW, BM25, or LLM")
f: FilterType = Field(FilterType.FIT, description="Contentfilter strategy: fit, raw, bm25, or llm")
q: Optional[str] = Field(None, description="Query string used by BM25/LLM filters")
c: Optional[str] = Field("0", description="Cachebust / revision counter")