feat(config): enhance serialization and add deep crawling exports

Improve configuration serialization with better handling of frozensets and slots.
Expand deep crawling module exports and documentation.
Add comprehensive API usage examples in Docker README.

- Add support for frozenset serialization
- Improve error handling in config loading
- Export additional deep crawling components
- Enhance Docker API documentation with detailed examples
- Fix ContentTypeFilter initialization
This commit is contained in:
UncleCode
2025-02-13 21:45:19 +08:00
parent 43e09da694
commit 966fb47e64
6 changed files with 460 additions and 30 deletions

View File

@@ -33,10 +33,35 @@ from .async_dispatcher import (
)
from .docker_client import Crawl4aiDockerClient
from .hub import CrawlerHub
from .deep_crawling import DeepCrawlStrategy
from .deep_crawling import (
DeepCrawlStrategy,
BFSDeepCrawlStrategy,
FastFilterChain,
FastContentTypeFilter,
FastDomainFilter,
FastURLFilter,
FastFilterStats,
FastKeywordRelevanceScorer,
FastURLScorer,
BestFirstCrawlingStrategy,
DFSDeepCrawlStrategy,
DeepCrawlDecorator,
)
__all__ = [
"AsyncWebCrawler",
"DeepCrawlStrategy",
"BFSDeepCrawlStrategy",
"BestFirstCrawlingStrategy",
"DFSDeepCrawlStrategy",
"FastFilterChain",
"FastContentTypeFilter",
"FastDomainFilter",
"FastFilterStats",
"FastURLFilter",
"FastKeywordRelevanceScorer",
"FastURLScorer",
"DeepCrawlDecorator",
"CrawlResult",
"CrawlerHub",
"CacheMode",

View File

@@ -1,3 +1,5 @@
import re
from attr import has
from .config import (
MIN_WORD_THRESHOLD,
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
@@ -17,6 +19,7 @@ from typing import Union, List
from .cache_context import CacheMode
from .proxy_strategy import ProxyRotationStrategy
import inspect
from typing import Any, Dict, Optional
from enum import Enum
@@ -48,6 +51,10 @@ def to_serializable_dict(obj: Any) -> Dict:
if isinstance(obj, (list, tuple, set)):
return [to_serializable_dict(item) for item in obj]
# Handle frozensets separately — frozenset is not a subclass of set, so the check above misses it; serialize as a list
if isinstance(obj, frozenset):
return [to_serializable_dict(item) for item in list(obj)]
# Handle dictionaries - preserve them as-is
if isinstance(obj, dict):
return {
@@ -60,6 +67,7 @@ def to_serializable_dict(obj: Any) -> Dict:
# Get constructor signature
sig = inspect.signature(obj.__class__.__init__)
params = sig.parameters
_type = obj.__class__.__name__
# Get current values
current_values = {}
@@ -73,6 +81,24 @@ def to_serializable_dict(obj: Any) -> Dict:
if not (is_empty_value(value) and is_empty_value(param.default)):
if value != param.default:
current_values[name] = to_serializable_dict(value)
elif hasattr(obj.__class__, '__slots__') and f"_{name}" in obj.__slots__:
slot = f"_{name}"
slot_value = getattr(obj, slot, None)
if not is_empty_value(slot_value):
current_values[name] = to_serializable_dict(slot_value)
# # Then handle slots if present
# if hasattr(obj.__class__, '__slots__'):
# for slot in obj.__class__.__slots__:
# # Remove leading underscore if present
# param_name = slot[1:] if slot.startswith('_') else slot
# # Get the slot value if it exists
# if hasattr(obj, slot):
# value = getattr(obj, slot)
# if not is_empty_value(value):
# current_values[param_name] = to_serializable_dict(value)
return {
"type": obj.__class__.__name__,
@@ -100,6 +126,9 @@ def from_serializable_dict(data: Any) -> Any:
# Import from crawl4ai for class instances
import crawl4ai
if not hasattr(crawl4ai, data["type"]):
return None
else:
cls = getattr(crawl4ai, data["type"])
# Handle Enum
@@ -361,7 +390,14 @@ class BrowserConfig():
def load( data: dict) -> "BrowserConfig":
# Deserialize the object from a dictionary
config = from_serializable_dict(data)
# check if the deserialized object is an instance of BrowserConfig
if isinstance(config, BrowserConfig):
return config
elif isinstance(config, dict):
return BrowserConfig.from_kwargs(config)
else:
raise ValueError("Invalid data type for BrowserConfig")
class CrawlerRunConfig():
@@ -807,7 +843,13 @@ class CrawlerRunConfig():
def load(data: dict) -> "CrawlerRunConfig":
# Deserialize the object from a dictionary
config = from_serializable_dict(data)
# If the deserialized object is already an instance of CrawlerRunConfig, return it
if isinstance(config, CrawlerRunConfig):
return config
elif isinstance(config, dict):
return CrawlerRunConfig.from_kwargs(config)
else:
raise ValueError("Invalid data type")
def to_dict(self):
return {

View File

@@ -3,6 +3,17 @@ from .base_strategy import DeepCrawlDecorator, DeepCrawlStrategy
from .bfs_strategy import BFSDeepCrawlStrategy
from .bff_strategy import BestFirstCrawlingStrategy
from .dfs_strategy import DFSDeepCrawlStrategy
from .filters import (
FastFilterChain,
FastContentTypeFilter,
FastDomainFilter,
FastURLFilter,
FastFilterStats,
)
from .scorers import (
FastKeywordRelevanceScorer,
FastURLScorer,
)
__all__ = [
"DeepCrawlDecorator",
@@ -10,4 +21,11 @@ __all__ = [
"BFSDeepCrawlStrategy",
"BestFirstCrawlingStrategy",
"DFSDeepCrawlStrategy",
"FastFilterChain",
"FastContentTypeFilter",
"FastDomainFilter",
"FastURLFilter",
"FastFilterStats",
"FastKeywordRelevanceScorer",
"FastURLScorer",
]

View File

@@ -522,7 +522,7 @@ class FastContentTypeFilter(FastURLFilter):
return path.rpartition(".")[-1].lower()
def __init__(
self, allowed_types: Union[str, List[str]], check_extension: bool = True
self, allowed_types: Union[str, List[str]], check_extension: bool = True, ext_map: Dict[str, str] = _MIME_MAP
):
super().__init__()
# Normalize and store as frozenset for fast lookup

View File

@@ -213,7 +213,37 @@ Configure your build with these parameters:
## Using the API
### Understanding Request Schema
In the following sections, we discuss two ways to communicate with the Docker server. The first is the Python client SDK (a Node.js SDK is coming soon); I highly recommend this approach to avoid mistakes. Alternatively, you can take the more technical route of constructing the JSON request structure yourself and posting it to the API endpoints, which I will explain in detail.
### Python SDK
The SDK makes things easier! Here's how to use it:
```python
from crawl4ai.docker_client import Crawl4aiDockerClient
from crawl4ai import BrowserConfig, CrawlerRunConfig
async with Crawl4aiDockerClient() as client:
# The SDK handles serialization for you!
result = await client.crawl(
urls=["https://example.com"],
browser_config=BrowserConfig(headless=True),
crawler_config=CrawlerRunConfig(stream=False)
)
print(result.markdown)
```
`Crawl4aiDockerClient` is an async context manager that handles the connection for you. You can pass in optional parameters for more control:
- `base_url` (str): Base URL of the Crawl4AI Docker server
- `timeout` (float): Default timeout for requests in seconds
- `verify_ssl` (bool): Whether to verify SSL certificates
- `verbose` (bool): Whether to show logging output
- `log_file` (str, optional): Path to log file if file logging is desired
This client SDK generates a properly structured JSON request for the server's HTTP API.
### Second Approach: Direct API Calls
This is super important! The API expects a specific structure that matches our Python classes. Let me show you how it works.
@@ -247,10 +277,34 @@ This will output something like:
}
```
#### Making API Requests
So when making a request, your JSON should look like this:
#### Structuring Your Requests
1. Basic Request Structure
Every request must include URLs and may include configuration objects:
```json
{
"urls": ["https://example.com"],
"browser_config": {...},
"crawler_config": {...}
}
```
2. Understanding Type-Params Pattern
All complex objects follow this pattern:
```json
{
"type": "ClassName",
"params": {
"param1": value1,
"param2": value2
}
}
```
> 💡 **Note**: Simple types (strings, numbers, booleans) are passed directly without the type-params wrapper.
3. Browser Configuration
```json
{
"urls": ["https://example.com"],
@@ -258,9 +312,37 @@ So when making a request, your JSON should look like this:
"type": "BrowserConfig",
"params": {
"headless": true,
"viewport": {"width": 1200, "height": 800}
"viewport": {
"type": "dict",
"value": {
"width": 1200,
"height": 800
}
},
}
}
}
}
```
4. Simple Crawler Configuration
```json
{
"urls": ["https://example.com"],
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"word_count_threshold": 200,
"stream": true,
"verbose": true
}
}
}
```
5. Advanced Crawler Configuration
```json
{
"urls": ["https://example.com"],
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
@@ -283,7 +365,175 @@ So when making a request, your JSON should look like this:
}
```
> 💡 **Pro tip**: Look at the class names in the library documentation - they map directly to the "type" fields in your requests!
6. Adding Strategies
**Chunking Strategy**:
```json
{
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"chunking_strategy": {
"type": "RegexChunking",
"params": {
"patterns": ["\n\n", "\\.\\s+"]
}
}
}
}
}
```
**Extraction Strategy**:
```json
{
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"extraction_strategy": {
"type": "JsonCssExtractionStrategy",
"params": {
"schema": {
"baseSelector": "article.post",
"fields": [
{"name": "title", "selector": "h1", "type": "text"},
{"name": "content", "selector": ".content", "type": "html"}
]
}
}
}
}
}
}
```
**LLM Extraction Strategy**
```json
{
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"extraction_strategy": {
"type": "LLMExtractionStrategy",
"params": {
"instruction": "Extract article title, author, publication date and main content",
"provider": "openai/gpt-4",
"api_token": "your-api-token",
"schema": {
"type": "dict",
"value": {
"title": "Article Schema",
"type": "object",
"properties": {
"title": {
"type": "string",
"description": "The article's headline"
},
"author": {
"type": "string",
"description": "The author's name"
},
"published_date": {
"type": "string",
"format": "date-time",
"description": "Publication date and time"
},
"content": {
"type": "string",
"description": "The main article content"
}
},
"required": ["title", "content"]
}
}
}
}
}
}
}
```
**Deep Crawler Example**
```json
{
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"deep_crawl_strategy": {
"type": "BFSDeepCrawlStrategy",
"params": {
"max_depth": 3,
"max_pages": 100,
"filter_chain": {
"type": "FastFilterChain",
"params": {
"filters": [
{
"type": "FastContentTypeFilter",
"params": {
"allowed_types": ["text/html", "application/xhtml+xml"]
}
},
{
"type": "FastDomainFilter",
"params": {
"allowed_domains": ["blog.*", "docs.*"],
"blocked_domains": ["ads.*", "analytics.*"]
}
},
{
"type": "FastURLPatternFilter",
"params": {
"allowed_patterns": ["^/blog/", "^/docs/"],
"blocked_patterns": [".*/ads/", ".*/sponsored/"]
}
}
]
}
},
"url_scorer": {
"type": "FastCompositeScorer",
"params": {
"scorers": [
{
"type": "FastKeywordRelevanceScorer",
"params": {
"keywords": ["tutorial", "guide", "documentation"],
"weight": 1.0
}
},
{
"type": "FastPathDepthScorer",
"params": {
"weight": 0.5,
"preferred_depth": 2
}
},
{
"type": "FastFreshnessScorer",
"params": {
"weight": 0.8,
"max_age_days": 365
}
}
]
}
}
}
}
}
}
}
```
**Important Rules**:
- Always use the type-params pattern for class instances
- Use direct values for primitives (numbers, strings, booleans)
- Wrap dictionaries with {"type": "dict", "value": {...}}
- Arrays/lists are passed directly without type-params
- All parameters are optional unless specifically required
### REST API Examples
@@ -329,24 +579,6 @@ for line in response.iter_lines():
print(line.decode())
```
### Python SDK
The SDK makes things even easier! Here's how to use it:
```python
from crawl4ai.docker_client import Crawl4aiDockerClient
from crawl4ai import BrowserConfig, CrawlerRunConfig
async with Crawl4aiDockerClient() as client:
# The SDK handles serialization for you!
result = await client.crawl(
urls=["https://example.com"],
browser_config=BrowserConfig(headless=True),
crawler_config=CrawlerRunConfig(stream=False)
)
print(result.markdown)
```
## Metrics & Monitoring
Keep an eye on your crawler with these endpoints:

View File

@@ -0,0 +1,113 @@
import json
from crawl4ai import (
CrawlerRunConfig,
DefaultMarkdownGenerator,
RegexChunking,
JsonCssExtractionStrategy,
BM25ContentFilter,
CacheMode
)
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
from crawl4ai.deep_crawling.filters import FastFilterChain
from crawl4ai.deep_crawling.filters import FastContentTypeFilter, FastDomainFilter
from crawl4ai.deep_crawling.scorers import FastKeywordRelevanceScorer
def create_test_config() -> CrawlerRunConfig:
    """Build a fully-populated CrawlerRunConfig for round-trip serialization testing.

    The config exercises every major component family at once: markdown
    generation backed by a content filter, CSS-based structured extraction,
    regex chunking, and a BFS deep crawl strategy with URL filtering and
    keyword scoring.

    Returns:
        CrawlerRunConfig: a config instance populated with nested strategy objects.
    """
    # Markdown generation driven by a BM25 relevance filter.
    bm25_filter = BM25ContentFilter(
        user_query="technology articles",
    )
    md_generator = DefaultMarkdownGenerator(
        content_filter=bm25_filter,
        options={"ignore_links": False, "body_width": 0},
    )

    # CSS-driven structured extraction of article title and body.
    article_schema = {
        "name": "ArticleExtractor",
        "baseSelector": "article.content",
        "fields": [
            {"name": "title", "selector": "h1", "type": "text"},
            {"name": "content", "selector": ".article-body", "type": "html"},
        ],
    }
    css_extractor = JsonCssExtractionStrategy(schema=article_schema)

    # Deep crawling: keep HTML pages, block ad domains, and rank URLs
    # by keyword relevance.
    chain = FastFilterChain([
        FastContentTypeFilter(["text/html"]),
        FastDomainFilter(blocked_domains=["ads.*"]),
    ])
    scorer = FastKeywordRelevanceScorer(
        keywords=["article", "blog"],
        weight=1.0,
    )
    bfs_strategy = BFSDeepCrawlStrategy(
        max_depth=3,
        filter_chain=chain,
        url_scorer=scorer,
    )

    # Assemble the full run configuration.
    return CrawlerRunConfig(
        word_count_threshold=200,
        extraction_strategy=css_extractor,
        chunking_strategy=RegexChunking(patterns=[r"\n\n"]),
        markdown_generator=md_generator,
        css_selector="main.content",
        excluded_tags=["nav", "footer"],
        keep_attrs=["href", "src"],
        cache_mode=CacheMode.BYPASS,
        wait_until="networkidle",
        page_timeout=30000,
        scan_full_page=True,
        deep_crawl_strategy=bfs_strategy,
        verbose=True,
        stream=True,
    )
def test_config_serialization_cycle():
    """Round-trip a CrawlerRunConfig through dump()/load() and verify fidelity."""
    original = create_test_config()

    # Serialize to a plain dict and echo it for inspection.
    payload = original.dump()
    print(json.dumps(payload, indent=2))

    # Rebuild the config object from the serialized form.
    restored = CrawlerRunConfig.load(payload)

    # Simple attributes must survive the round trip unchanged.
    for attr in (
        "word_count_threshold",
        "css_selector",
        "excluded_tags",
        "keep_attrs",
        "cache_mode",
        "wait_until",
        "page_timeout",
        "scan_full_page",
        "verbose",
        "stream",
    ):
        assert getattr(restored, attr) == getattr(original, attr)

    # Complex components must deserialize back to their concrete classes.
    assert isinstance(restored.extraction_strategy, JsonCssExtractionStrategy)
    assert isinstance(restored.chunking_strategy, RegexChunking)
    assert isinstance(restored.markdown_generator, DefaultMarkdownGenerator)
    assert isinstance(restored.markdown_generator.content_filter, BM25ContentFilter)
    assert isinstance(restored.deep_crawl_strategy, BFSDeepCrawlStrategy)

    # Deep crawl strategy internals are reconstructed as well.
    assert restored.deep_crawl_strategy.max_depth == 3
    assert isinstance(restored.deep_crawl_strategy.filter_chain, FastFilterChain)
    assert isinstance(restored.deep_crawl_strategy.url_scorer, FastKeywordRelevanceScorer)

    print("Serialization cycle test passed successfully!")
# Run the serialization round-trip test when executed as a script.
if __name__ == "__main__":
    test_config_serialization_cycle()