Merge branch 'main' into main

This commit is contained in:
Emmanuel Ferdman
2025-07-29 14:05:35 +03:00
committed by GitHub
330 changed files with 116430 additions and 589 deletions

View File

@@ -58,13 +58,15 @@ Pull and run images directly from Docker Hub without building locally.
#### 1. Pull the Image
Our latest release candidate is `0.6.0-r1`. Images are built with multi-arch manifests, so Docker automatically pulls the correct version for your system.
Our latest release candidate is `0.7.0-r1`. Images are built with multi-arch manifests, so Docker automatically pulls the correct version for your system.
> ⚠️ **Important Note**: The `latest` tag currently points to the stable `0.6.0` version. After testing and validation, `0.7.0` (without -r1) will be released and `latest` will be updated. For now, please use `0.7.0-r1` to test the new features.
```bash
# Pull the release candidate (recommended for latest features)
docker pull unclecode/crawl4ai:0.6.0-rN # Use your favorite revision number
# Pull the release candidate (for testing new features)
docker pull unclecode/crawl4ai:0.7.0-r1
# Or pull the latest stable version
# Or pull the current stable version (0.6.0)
docker pull unclecode/crawl4ai:latest
```
@@ -99,7 +101,7 @@ EOL
-p 11235:11235 \
--name crawl4ai \
--shm-size=1g \
unclecode/crawl4ai:0.6.0-rN # Use your favorite revision number
unclecode/crawl4ai:0.7.0-r1
```
* **With LLM support:**
@@ -110,7 +112,7 @@ EOL
--name crawl4ai \
--env-file .llm.env \
--shm-size=1g \
unclecode/crawl4ai:0.6.0-rN # Use your favorite revision number
unclecode/crawl4ai:0.7.0-r1
```
> The server will be available at `http://localhost:11235`. Visit `/playground` to access the interactive testing interface.
@@ -124,7 +126,7 @@ docker stop crawl4ai && docker rm crawl4ai
#### Docker Hub Versioning Explained
* **Image Name:** `unclecode/crawl4ai`
* **Tag Format:** `LIBRARY_VERSION[-SUFFIX]` (e.g., `0.6.0-r1`)
* **Tag Format:** `LIBRARY_VERSION[-SUFFIX]` (e.g., `0.7.0-r1`)
* `LIBRARY_VERSION`: The semantic version of the core `crawl4ai` Python library
* `SUFFIX`: Optional tag for release candidates (``) and revisions (`r1`)
* **`latest` Tag:** Points to the most recent stable version
@@ -160,7 +162,7 @@ The `docker-compose.yml` file in the project root provides a simplified approach
```bash
# Pulls and runs the release candidate from Docker Hub
# Automatically selects the correct architecture
IMAGE=unclecode/crawl4ai:0.6.0-rN # Use your favorite revision number docker compose up -d
IMAGE=unclecode/crawl4ai:0.7.0-r1 docker compose up -d
```
* **Build and Run Locally:**

View File

@@ -5,6 +5,7 @@ from typing import List, Tuple, Dict
from functools import partial
from uuid import uuid4
from datetime import datetime, timezone
from base64 import b64encode
import logging
from typing import Optional, AsyncGenerator
@@ -371,6 +372,9 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator)
server_memory_mb = _get_memory_mb()
result_dict = result.model_dump()
result_dict['server_memory_mb'] = server_memory_mb
# If PDF exists, encode it to base64
if result_dict.get('pdf') is not None:
result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
logger.info(f"Streaming result for {result_dict.get('url', 'unknown')}")
data = json.dumps(result_dict, default=datetime_handler) + "\n"
yield data.encode('utf-8')
@@ -443,10 +447,19 @@ async def handle_crawl_request(
mem_delta_mb = end_mem_mb - start_mem_mb # <--- Calculate delta
peak_mem_mb = max(peak_mem_mb if peak_mem_mb else 0, end_mem_mb) # <--- Get peak memory
logger.info(f"Memory usage: Start: {start_mem_mb} MB, End: {end_mem_mb} MB, Delta: {mem_delta_mb} MB, Peak: {peak_mem_mb} MB")
# Process results to handle PDF bytes
processed_results = []
for result in results:
result_dict = result.model_dump()
# If PDF exists, encode it to base64
if result_dict.get('pdf') is not None:
result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
processed_results.append(result_dict)
return {
"success": True,
"results": [result.model_dump() for result in results],
"results": processed_results,
"server_processing_time_s": end_time - start_time,
"server_memory_delta_mb": mem_delta_mb,
"server_peak_memory_mb": peak_mem_mb
@@ -459,7 +472,7 @@ async def handle_crawl_request(
# await crawler.close()
# except Exception as close_e:
# logger.error(f"Error closing crawler during exception handling: {close_e}")
logger.error(f"Error closing crawler during exception handling: {close_e}")
logger.error(f"Error closing crawler during exception handling: {str(e)}")
# Measure memory even on error if possible
end_mem_mb_error = _get_memory_mb()
@@ -518,7 +531,7 @@ async def handle_stream_crawl_request(
# await crawler.close()
# except Exception as close_e:
# logger.error(f"Error closing crawler during stream setup exception: {close_e}")
logger.error(f"Error closing crawler during stream setup exception: {close_e}")
logger.error(f"Error closing crawler during stream setup exception: {str(e)}")
logger.error(f"Stream crawl error: {str(e)}", exc_info=True)
# Raising HTTPException here will prevent streaming response
raise HTTPException(

View File

@@ -1263,7 +1263,7 @@ class LLMConfig:
provider: str = DEFAULT_PROVIDER,
api_token: Optional[str] = None,
base_url: Optional[str] = None,
temprature: Optional[float] = None,
temperature: Optional[float] = None,
max_tokens: Optional[int] = None,
top_p: Optional[float] = None,
frequency_penalty: Optional[float] = None,
@@ -1291,7 +1291,7 @@ class LLMConfig:
self.provider = DEFAULT_PROVIDER
self.api_token = os.getenv(DEFAULT_PROVIDER_API_KEY)
self.base_url = base_url
self.temprature = temprature
self.temperature = temperature
self.max_tokens = max_tokens
self.top_p = top_p
self.frequency_penalty = frequency_penalty
@@ -1305,7 +1305,7 @@ class LLMConfig:
provider=kwargs.get("provider", DEFAULT_PROVIDER),
api_token=kwargs.get("api_token"),
base_url=kwargs.get("base_url"),
temprature=kwargs.get("temprature"),
temperature=kwargs.get("temperature"),
max_tokens=kwargs.get("max_tokens"),
top_p=kwargs.get("top_p"),
frequency_penalty=kwargs.get("frequency_penalty"),
@@ -1319,7 +1319,7 @@ class LLMConfig:
"provider": self.provider,
"api_token": self.api_token,
"base_url": self.base_url,
"temprature": self.temprature,
"temperature": self.temperature,
"max_tokens": self.max_tokens,
"top_p": self.top_p,
"frequency_penalty": self.frequency_penalty,
@@ -4075,7 +4075,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
api_token: The API token for the provider.
base_url: The base URL for the API request.
api_base: The base URL for the API request.
extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc.
extra_args: Additional arguments for the API request, such as temperature, max_tokens, etc.
"""
super().__init__( input_format=input_format, **kwargs)
self.llm_config = llm_config
@@ -7901,7 +7901,7 @@ from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.extraction_strategy import (
from crawl4ai import (
JsonCssExtractionStrategy,
LLMExtractionStrategy,
)
@@ -8301,7 +8301,7 @@ async def crawl_dynamic_content_pages_method_2():
async def cosine_similarity_extraction():
from crawl4ai.extraction_strategy import CosineStrategy
from crawl4ai import CosineStrategy
crawl_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
extraction_strategy=CosineStrategy(

View File

@@ -332,7 +332,7 @@ The `clone()` method:
### Key fields to note
1. **`provider`**:
- Which LLM provoder to use.
- Which LLM provider to use.
- Possible values are `"ollama/llama3","groq/llama3-70b-8192","groq/llama3-8b-8192", "openai/gpt-4o-mini" ,"openai/gpt-4o","openai/o1-mini","openai/o1-preview","openai/o3-mini","openai/o3-mini-high","anthropic/claude-3-haiku-20240307","anthropic/claude-3-opus-20240229","anthropic/claude-3-sonnet-20240229","anthropic/claude-3-5-sonnet-20240620","gemini/gemini-pro","gemini/gemini-1.5-pro","gemini/gemini-2.0-flash","gemini/gemini-2.0-flash-exp","gemini/gemini-2.0-flash-lite-preview-02-05","deepseek/deepseek-chat"`<br/>*(default: `"openai/gpt-4o-mini"`)*
2. **`api_token`**:
@@ -354,7 +354,7 @@ In a typical scenario, you define **one** `BrowserConfig` for your crawler sessi
```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
async def main():
# 1) Browser config: headless, bigger viewport, no proxy
@@ -403,7 +403,7 @@ async def main():
md_generator = DefaultMarkdownGenerator(
content_filter=filter,
options={"ignore_links": True}
options={"ignore_links": True})
# 4) Crawler run config: skip cache, use extraction
run_conf = CrawlerRunConfig(
@@ -1042,7 +1042,7 @@ You can combine content selection with a more advanced extraction strategy. For
import asyncio
import json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
async def main():
# Minimal schema for repeated items
@@ -1094,7 +1094,7 @@ import asyncio
import json
from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from crawl4ai import LLMExtractionStrategy
class ArticleData(BaseModel):
headline: str
@@ -1139,7 +1139,7 @@ Below is a short function that unifies **CSS selection**, **exclusion** logic, a
import asyncio
import json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
async def extract_main_articles(url: str):
schema = {
@@ -1488,7 +1488,7 @@ If you run a JSON-based extraction strategy (CSS, XPath, LLM, etc.), the structu
import asyncio
import json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
async def main():
schema = {
@@ -3760,11 +3760,11 @@ To crawl a live web page, provide the URL starting with `http://` or `https://`,
```python
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.async_configs import CrawlerRunConfig
async def crawl_web():
config = CrawlerRunConfig(bypass_cache=True)
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://en.wikipedia.org/wiki/apple",
@@ -3785,13 +3785,13 @@ To crawl a local HTML file, prefix the file path with `file://`.
```python
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.async_configs import CrawlerRunConfig
async def crawl_local_file():
local_file_path = "/path/to/apple.html" # Replace with your file path
file_url = f"file://{local_file_path}"
config = CrawlerRunConfig(bypass_cache=True)
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url=file_url, config=config)
@@ -3810,13 +3810,13 @@ To crawl raw HTML content, prefix the HTML string with `raw:`.
```python
import asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.async_configs import CrawlerRunConfig
async def crawl_raw_html():
raw_html = "<html><body><h1>Hello, World!</h1></body></html>"
raw_html_url = f"raw:{raw_html}"
config = CrawlerRunConfig(bypass_cache=True)
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url=raw_html_url, config=config)
@@ -3845,7 +3845,7 @@ import os
import sys
import asyncio
from pathlib import Path
from crawl4ai import AsyncWebCrawler
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.async_configs import CrawlerRunConfig
async def main():
@@ -3856,7 +3856,7 @@ async def main():
async with AsyncWebCrawler() as crawler:
# Step 1: Crawl the Web URL
print("\n=== Step 1: Crawling the Wikipedia URL ===")
web_config = CrawlerRunConfig(bypass_cache=True)
web_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
result = await crawler.arun(url=wikipedia_url, config=web_config)
if not result.success:
@@ -3871,7 +3871,7 @@ async def main():
# Step 2: Crawl from the Local HTML File
print("=== Step 2: Crawling from the Local HTML File ===")
file_url = f"file://{html_file_path.resolve()}"
file_config = CrawlerRunConfig(bypass_cache=True)
file_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
local_result = await crawler.arun(url=file_url, config=file_config)
if not local_result.success:
@@ -3887,7 +3887,7 @@ async def main():
with open(html_file_path, 'r', encoding='utf-8') as f:
raw_html_content = f.read()
raw_html_url = f"raw:{raw_html_content}"
raw_config = CrawlerRunConfig(bypass_cache=True)
raw_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
raw_result = await crawler.arun(url=raw_html_url, config=raw_config)
if not raw_result.success:
@@ -4152,7 +4152,7 @@ prune_filter = PruningContentFilter(
For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:
```python
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import LLMContentFilter
async def main():
@@ -4175,8 +4175,13 @@ async def main():
verbose=True
)
md_generator = DefaultMarkdownGenerator(
content_filter=filter,
options={"ignore_links": True}
)
config = CrawlerRunConfig(
content_filter=filter
markdown_generator=md_generator
)
async with AsyncWebCrawler() as crawler:
@@ -4722,7 +4727,7 @@ if __name__ == "__main__":
Once dynamic content is loaded, you can attach an **`extraction_strategy`** (like `JsonCssExtractionStrategy` or `LLMExtractionStrategy`). For example:
```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
schema = {
"name": "Commits",
@@ -4902,7 +4907,7 @@ Crawl4AI can also extract structured data (JSON) using CSS or XPath selectors. B
> **New!** Crawl4AI now provides a powerful utility to automatically generate extraction schemas using LLM. This is a one-time cost that gives you a reusable schema for fast, LLM-free extractions:
```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
from crawl4ai import LLMConfig
# Generate a schema (one-time cost)
@@ -4932,7 +4937,7 @@ Here's a basic extraction example:
import asyncio
import json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
async def main():
schema = {
@@ -4987,7 +4992,7 @@ import json
import asyncio
from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from crawl4ai import LLMExtractionStrategy
class OpenAIModelFee(BaseModel):
model_name: str = Field(..., description="Name of the OpenAI model.")
@@ -5103,7 +5108,7 @@ Some sites require multiple “page clicks” or dynamic JavaScript updates. Bel
```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
async def extract_structured_data_using_css_extractor():
print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
@@ -5428,29 +5433,38 @@ Sometimes you need a visual record of a page or a PDF “printout.” Crawl4AI c
```python
import os, asyncio
from base64 import b64decode
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
async def main():
run_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
screenshot=True,
pdf=True
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://en.wikipedia.org/wiki/List_of_common_misconceptions",
cache_mode=CacheMode.BYPASS,
pdf=True,
screenshot=True
config=run_config
)
if result.success:
# Save screenshot
print(f"Screenshot data present: {result.screenshot is not None}")
print(f"PDF data present: {result.pdf is not None}")
if result.screenshot:
print(f"[OK] Screenshot captured, size: {len(result.screenshot)} bytes")
with open("wikipedia_screenshot.png", "wb") as f:
f.write(b64decode(result.screenshot))
# Save PDF
else:
print("[WARN] Screenshot data is None.")
if result.pdf:
print(f"[OK] PDF captured, size: {len(result.pdf)} bytes")
with open("wikipedia_page.pdf", "wb") as f:
f.write(result.pdf)
print("[OK] PDF & screenshot captured.")
else:
print("[WARN] PDF data is None.")
else:
print("[ERROR]", result.error_message)
@@ -6705,7 +6719,7 @@ dispatcher = MemoryAdaptiveDispatcher(
3.**`max_session_permit`** (`int`, default: `10`)
The maximum number of concurrent crawling tasks allowed. This ensures resource limits are respected while maintaining concurrency.
4.**`memory_wait_timeout`** (`float`, default: `300.0`)
4.**`memory_wait_timeout`** (`float`, default: `600.0`)
Optional timeout (in seconds). If memory usage exceeds `memory_threshold_percent` for longer than this duration, a `MemoryError` is raised.
5.**`rate_limiter`** (`RateLimiter`, default: `None`)
@@ -7300,7 +7314,7 @@ Here's an example of crawling GitHub commits across multiple pages while preserv
```python
from crawl4ai.async_configs import CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
from crawl4ai.cache_context import CacheMode
async def crawl_dynamic_content():
@@ -7850,7 +7864,7 @@ The Cosine Strategy:
## Basic Usage
```python
from crawl4ai.extraction_strategy import CosineStrategy
from crawl4ai import CosineStrategy
strategy = CosineStrategy(
semantic_filter="product reviews", # Target content type
@@ -8161,7 +8175,7 @@ import json
from pydantic import BaseModel, Field
from typing import List
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from crawl4ai import LLMExtractionStrategy
class Product(BaseModel):
name: str
@@ -8278,7 +8292,7 @@ import asyncio
from typing import List
from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from crawl4ai import LLMExtractionStrategy
class Entity(BaseModel):
name: str
@@ -8423,7 +8437,7 @@ Lets begin with a **simple** schema-based extraction using the `JsonCssExtrac
import json
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
async def extract_crypto_prices():
# 1. Define a simple extraction schema
@@ -8493,7 +8507,7 @@ Below is a short example demonstrating **XPath** extraction plus the **`raw://`*
import json
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy
from crawl4ai import JsonXPathExtractionStrategy
async def extract_crypto_prices_xpath():
# 1. Minimal dummy HTML with some repeating rows
@@ -8694,7 +8708,7 @@ Key Takeaways:
import json
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
ecommerce_schema = {
# ... the advanced schema from above ...
@@ -8804,7 +8818,7 @@ While manually crafting schemas is powerful and precise, Crawl4AI now offers a c
The schema generator is available as a static method on both `JsonCssExtractionStrategy` and `JsonXPathExtractionStrategy`. You can choose between OpenAI's GPT-4 or the open-source Ollama for schema generation:
```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
from crawl4ai import LLMConfig
# Sample HTML with product information

View File

@@ -14,3 +14,4 @@ anyio==4.9.0
PyJWT==2.10.1
mcp>=1.6.0
websockets>=15.0.1
httpx[http2]>=0.27.2

View File

@@ -12,8 +12,7 @@ class CrawlRequest(BaseModel):
class MarkdownRequest(BaseModel):
"""Request body for the /md endpoint."""
url: str = Field(..., description="Absolute http/https URL to fetch")
f: FilterType = Field(FilterType.FIT,
description="Contentfilter strategy: FIT, RAW, BM25, or LLM")
f: FilterType = Field(FilterType.FIT, description="Contentfilter strategy: fit, raw, bm25, or llm")
q: Optional[str] = Field(None, description="Query string used by BM25/LLM filters")
c: Optional[str] = Field("0", description="Cachebust / revision counter")

View File

@@ -671,6 +671,16 @@
method: 'GET',
headers: { 'Accept': 'application/json' }
});
responseData = await response.json();
const time = Math.round(performance.now() - startTime);
if (!response.ok) {
updateStatus('error', time);
throw new Error(responseData.error || 'Request failed');
}
updateStatus('success', time);
document.querySelector('#response-content code').textContent = JSON.stringify(responseData, null, 2);
document.querySelector('#response-content code').className = 'json hljs';
forceHighlightElement(document.querySelector('#response-content code'));
} else if (endpoint === 'crawl_stream') {
// Stream processing
response = await fetch(api, {