feat(config): enhance serialization and add deep crawling exports
Improve configuration serialization with better handling of frozensets and slots. Expand deep crawling module exports and documentation. Add comprehensive API usage examples in Docker README. - Add support for frozenset serialization - Improve error handling in config loading - Export additional deep crawling components - Enhance Docker API documentation with detailed examples - Fix ContentTypeFilter initialization
This commit is contained in:
@@ -33,10 +33,35 @@ from .async_dispatcher import (
|
||||
)
|
||||
from .docker_client import Crawl4aiDockerClient
|
||||
from .hub import CrawlerHub
|
||||
from .deep_crawling import DeepCrawlStrategy
|
||||
from .deep_crawling import (
|
||||
DeepCrawlStrategy,
|
||||
BFSDeepCrawlStrategy,
|
||||
FastFilterChain,
|
||||
FastContentTypeFilter,
|
||||
FastDomainFilter,
|
||||
FastURLFilter,
|
||||
FastFilterStats,
|
||||
FastKeywordRelevanceScorer,
|
||||
FastURLScorer,
|
||||
BestFirstCrawlingStrategy,
|
||||
DFSDeepCrawlStrategy,
|
||||
DeepCrawlDecorator,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"AsyncWebCrawler",
|
||||
"DeepCrawlStrategy",
|
||||
"BFSDeepCrawlStrategy",
|
||||
"BestFirstCrawlingStrategy",
|
||||
"DFSDeepCrawlStrategy",
|
||||
"FastFilterChain",
|
||||
"FastContentTypeFilter",
|
||||
"FastDomainFilter",
|
||||
"FastFilterStats",
|
||||
"FastURLFilter",
|
||||
"FastKeywordRelevanceScorer",
|
||||
"FastURLScorer",
|
||||
"DeepCrawlDecorator",
|
||||
"CrawlResult",
|
||||
"CrawlerHub",
|
||||
"CacheMode",
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
import re
|
||||
from attr import has
|
||||
from .config import (
|
||||
MIN_WORD_THRESHOLD,
|
||||
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
|
||||
@@ -17,6 +19,7 @@ from typing import Union, List
|
||||
from .cache_context import CacheMode
|
||||
from .proxy_strategy import ProxyRotationStrategy
|
||||
|
||||
|
||||
import inspect
|
||||
from typing import Any, Dict, Optional
|
||||
from enum import Enum
|
||||
@@ -48,6 +51,10 @@ def to_serializable_dict(obj: Any) -> Dict:
|
||||
if isinstance(obj, (list, tuple, set)):
|
||||
return [to_serializable_dict(item) for item in obj]
|
||||
|
||||
# Handle frozensets, which are not iterable
|
||||
if isinstance(obj, frozenset):
|
||||
return [to_serializable_dict(item) for item in list(obj)]
|
||||
|
||||
# Handle dictionaries - preserve them as-is
|
||||
if isinstance(obj, dict):
|
||||
return {
|
||||
@@ -60,6 +67,7 @@ def to_serializable_dict(obj: Any) -> Dict:
|
||||
# Get constructor signature
|
||||
sig = inspect.signature(obj.__class__.__init__)
|
||||
params = sig.parameters
|
||||
_type = obj.__class__.__name__
|
||||
|
||||
# Get current values
|
||||
current_values = {}
|
||||
@@ -73,6 +81,24 @@ def to_serializable_dict(obj: Any) -> Dict:
|
||||
if not (is_empty_value(value) and is_empty_value(param.default)):
|
||||
if value != param.default:
|
||||
current_values[name] = to_serializable_dict(value)
|
||||
elif hasattr(obj.__class__, '__slots__') and f"_{name}" in obj.__slots__:
|
||||
slot = f"_{name}"
|
||||
slot_value = getattr(obj, slot, None)
|
||||
if not is_empty_value(slot_value):
|
||||
current_values[name] = to_serializable_dict(slot_value)
|
||||
|
||||
|
||||
# # Then handle slots if present
|
||||
# if hasattr(obj.__class__, '__slots__'):
|
||||
# for slot in obj.__class__.__slots__:
|
||||
# # Remove leading underscore if present
|
||||
# param_name = slot[1:] if slot.startswith('_') else slot
|
||||
|
||||
# # Get the slot value if it exists
|
||||
# if hasattr(obj, slot):
|
||||
# value = getattr(obj, slot)
|
||||
# if not is_empty_value(value):
|
||||
# current_values[param_name] = to_serializable_dict(value)
|
||||
|
||||
return {
|
||||
"type": obj.__class__.__name__,
|
||||
@@ -100,6 +126,9 @@ def from_serializable_dict(data: Any) -> Any:
|
||||
|
||||
# Import from crawl4ai for class instances
|
||||
import crawl4ai
|
||||
if not hasattr(crawl4ai, data["type"]):
|
||||
return None
|
||||
else:
|
||||
cls = getattr(crawl4ai, data["type"])
|
||||
|
||||
# Handle Enum
|
||||
@@ -361,7 +390,14 @@ class BrowserConfig():
|
||||
def load( data: dict) -> "BrowserConfig":
|
||||
# Deserialize the object from a dictionary
|
||||
config = from_serializable_dict(data)
|
||||
|
||||
# check if the deserialized object is an instance of BrowserConfig
|
||||
if isinstance(config, BrowserConfig):
|
||||
return config
|
||||
elif isinstance(config, dict):
|
||||
return BrowserConfig.from_kwargs(config)
|
||||
else:
|
||||
raise ValueError("Invalid data type for BrowserConfig")
|
||||
|
||||
|
||||
class CrawlerRunConfig():
|
||||
@@ -807,7 +843,13 @@ class CrawlerRunConfig():
|
||||
def load(data: dict) -> "CrawlerRunConfig":
|
||||
# Deserialize the object from a dictionary
|
||||
config = from_serializable_dict(data)
|
||||
# If config type is alread instant of CrawleRunConfig, return it
|
||||
if isinstance(config, CrawlerRunConfig):
|
||||
return config
|
||||
elif isinstance(config, dict):
|
||||
return CrawlerRunConfig.from_kwargs(config)
|
||||
else:
|
||||
raise ValueError("Invalid data type")
|
||||
|
||||
def to_dict(self):
|
||||
return {
|
||||
|
||||
@@ -3,6 +3,17 @@ from .base_strategy import DeepCrawlDecorator, DeepCrawlStrategy
|
||||
from .bfs_strategy import BFSDeepCrawlStrategy
|
||||
from .bff_strategy import BestFirstCrawlingStrategy
|
||||
from .dfs_strategy import DFSDeepCrawlStrategy
|
||||
from .filters import (
|
||||
FastFilterChain,
|
||||
FastContentTypeFilter,
|
||||
FastDomainFilter,
|
||||
FastURLFilter,
|
||||
FastFilterStats,
|
||||
)
|
||||
from .scorers import (
|
||||
FastKeywordRelevanceScorer,
|
||||
FastURLScorer,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"DeepCrawlDecorator",
|
||||
@@ -10,4 +21,11 @@ __all__ = [
|
||||
"BFSDeepCrawlStrategy",
|
||||
"BestFirstCrawlingStrategy",
|
||||
"DFSDeepCrawlStrategy",
|
||||
"FastFilterChain",
|
||||
"FastContentTypeFilter",
|
||||
"FastDomainFilter",
|
||||
"FastURLFilter",
|
||||
"FastFilterStats",
|
||||
"FastKeywordRelevanceScorer",
|
||||
"FastURLScorer",
|
||||
]
|
||||
@@ -522,7 +522,7 @@ class FastContentTypeFilter(FastURLFilter):
|
||||
return path.rpartition(".")[-1].lower()
|
||||
|
||||
def __init__(
|
||||
self, allowed_types: Union[str, List[str]], check_extension: bool = True
|
||||
self, allowed_types: Union[str, List[str]], check_extension: bool = True, ext_map: Dict[str, str] = _MIME_MAP
|
||||
):
|
||||
super().__init__()
|
||||
# Normalize and store as frozenset for fast lookup
|
||||
|
||||
@@ -213,7 +213,37 @@ Configure your build with these parameters:
|
||||
|
||||
## Using the API
|
||||
|
||||
### Understanding Request Schema
|
||||
In the following sections, we cover two ways to communicate with the Docker server. One option is to use the Python client SDK (a Node.js SDK is planned); this approach is recommended because it avoids serialization mistakes. Alternatively, you can take the more technical route of building the JSON request structure yourself and sending it to the API endpoints, which is explained in detail below.
|
||||
|
||||
### Python SDK
|
||||
|
||||
The SDK makes things easier! Here's how to use it:
|
||||
|
||||
```python
|
||||
from crawl4ai.docker_client import Crawl4aiDockerClient
|
||||
from crawl4ai import BrowserConfig, CrawlerRunConfig
|
||||
|
||||
async with Crawl4aiDockerClient() as client:
|
||||
# The SDK handles serialization for you!
|
||||
result = await client.crawl(
|
||||
urls=["https://example.com"],
|
||||
browser_config=BrowserConfig(headless=True),
|
||||
crawler_config=CrawlerRunConfig(stream=False)
|
||||
)
|
||||
print(result.markdown)
|
||||
```
|
||||
|
||||
`Crawl4aiDockerClient` is an async context manager that handles the connection for you. You can pass in optional parameters for more control:
|
||||
|
||||
- `base_url` (str): Base URL of the Crawl4AI Docker server
|
||||
- `timeout` (float): Default timeout for requests in seconds
|
||||
- `verify_ssl` (bool): Whether to verify SSL certificates
|
||||
- `verbose` (bool): Whether to show logging output
|
||||
- `log_file` (str, optional): Path to log file if file logging is desired
|
||||
|
||||
This client SDK generates a properly structured JSON request for the server's HTTP API.
|
||||
|
||||
### Second Approach: Direct API Calls
|
||||
|
||||
This is super important! The API expects a specific structure that matches our Python classes. Let me show you how it works.
|
||||
|
||||
@@ -247,10 +277,34 @@ This will output something like:
|
||||
}
|
||||
```
|
||||
|
||||
#### Making API Requests
|
||||
|
||||
So when making a request, your JSON should look like this:
|
||||
#### Structuring Your Requests
|
||||
|
||||
1. Basic Request Structure
|
||||
Every request must include URLs and may include configuration objects:
|
||||
|
||||
```json
|
||||
{
|
||||
"urls": ["https://example.com"],
|
||||
"browser_config": {...},
|
||||
"crawler_config": {...}
|
||||
}
|
||||
```
|
||||
|
||||
2. Understanding Type-Params Pattern
|
||||
All complex objects follow this pattern:
|
||||
```json
|
||||
{
|
||||
"type": "ClassName",
|
||||
"params": {
|
||||
"param1": value1,
|
||||
"param2": value2
|
||||
}
|
||||
}
|
||||
```
|
||||
> 💡 **Note**: Simple types (strings, numbers, booleans) are passed directly without the type-params wrapper.
|
||||
|
||||
3. Browser Configuration
|
||||
```json
|
||||
{
|
||||
"urls": ["https://example.com"],
|
||||
@@ -258,9 +312,37 @@ So when making a request, your JSON should look like this:
|
||||
"type": "BrowserConfig",
|
||||
"params": {
|
||||
"headless": true,
|
||||
"viewport": {"width": 1200, "height": 800}
|
||||
"viewport": {
|
||||
"type": "dict",
|
||||
"value": {
|
||||
"width": 1200,
|
||||
"height": 800
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
4. Simple Crawler Configuration
|
||||
```json
|
||||
{
|
||||
"urls": ["https://example.com"],
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"word_count_threshold": 200,
|
||||
"stream": true,
|
||||
"verbose": true
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
5. Advanced Crawler Configuration
|
||||
```json
|
||||
{
|
||||
"urls": ["https://example.com"],
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
@@ -283,7 +365,175 @@ So when making a request, your JSON should look like this:
|
||||
}
|
||||
```
|
||||
|
||||
> 💡 **Pro tip**: Look at the class names in the library documentation - they map directly to the "type" fields in your requests!
|
||||
6. Adding Strategies
|
||||
|
||||
**Chunking Strategy**:
|
||||
```json
|
||||
{
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"chunking_strategy": {
|
||||
"type": "RegexChunking",
|
||||
"params": {
|
||||
"patterns": ["\n\n", "\\.\\s+"]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Extraction Strategy**:
|
||||
```json
|
||||
{
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"extraction_strategy": {
|
||||
"type": "JsonCssExtractionStrategy",
|
||||
"params": {
|
||||
"schema": {
|
||||
"baseSelector": "article.post",
|
||||
"fields": [
|
||||
{"name": "title", "selector": "h1", "type": "text"},
|
||||
{"name": "content", "selector": ".content", "type": "html"}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**LLM Extraction Strategy**
|
||||
```json
|
||||
{
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"extraction_strategy": {
|
||||
"type": "LLMExtractionStrategy",
|
||||
"params": {
|
||||
"instruction": "Extract article title, author, publication date and main content",
|
||||
"provider": "openai/gpt-4",
|
||||
"api_token": "your-api-token",
|
||||
"schema": {
|
||||
"type": "dict",
|
||||
"value": {
|
||||
"title": "Article Schema",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {
|
||||
"type": "string",
|
||||
"description": "The article's headline"
|
||||
},
|
||||
"author": {
|
||||
"type": "string",
|
||||
"description": "The author's name"
|
||||
},
|
||||
"published_date": {
|
||||
"type": "string",
|
||||
"format": "date-time",
|
||||
"description": "Publication date and time"
|
||||
},
|
||||
"content": {
|
||||
"type": "string",
|
||||
"description": "The main article content"
|
||||
}
|
||||
},
|
||||
"required": ["title", "content"]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Deep Crawler Example**
|
||||
```json
|
||||
{
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"deep_crawl_strategy": {
|
||||
"type": "BFSDeepCrawlStrategy",
|
||||
"params": {
|
||||
"max_depth": 3,
|
||||
"max_pages": 100,
|
||||
"filter_chain": {
|
||||
"type": "FastFilterChain",
|
||||
"params": {
|
||||
"filters": [
|
||||
{
|
||||
"type": "FastContentTypeFilter",
|
||||
"params": {
|
||||
"allowed_types": ["text/html", "application/xhtml+xml"]
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "FastDomainFilter",
|
||||
"params": {
|
||||
"allowed_domains": ["blog.*", "docs.*"],
|
||||
"blocked_domains": ["ads.*", "analytics.*"]
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "FastURLPatternFilter",
|
||||
"params": {
|
||||
"allowed_patterns": ["^/blog/", "^/docs/"],
|
||||
"blocked_patterns": [".*/ads/", ".*/sponsored/"]
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"url_scorer": {
|
||||
"type": "FastCompositeScorer",
|
||||
"params": {
|
||||
"scorers": [
|
||||
{
|
||||
"type": "FastKeywordRelevanceScorer",
|
||||
"params": {
|
||||
"keywords": ["tutorial", "guide", "documentation"],
|
||||
"weight": 1.0
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "FastPathDepthScorer",
|
||||
"params": {
|
||||
"weight": 0.5,
|
||||
"preferred_depth": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "FastFreshnessScorer",
|
||||
"params": {
|
||||
"weight": 0.8,
|
||||
"max_age_days": 365
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Important Rules**:
|
||||
|
||||
- Always use the type-params pattern for class instances
|
||||
- Use direct values for primitives (numbers, strings, booleans)
|
||||
- Wrap dictionaries with {"type": "dict", "value": {...}}
|
||||
- Arrays/lists are passed directly without type-params
|
||||
- All parameters are optional unless specifically required
|
||||
|
||||
|
||||
### REST API Examples
|
||||
|
||||
@@ -329,24 +579,6 @@ for line in response.iter_lines():
|
||||
print(line.decode())
|
||||
```
|
||||
|
||||
### Python SDK
|
||||
|
||||
The SDK makes things even easier! Here's how to use it:
|
||||
|
||||
```python
|
||||
from crawl4ai.docker_client import Crawl4aiDockerClient
|
||||
from crawl4ai import BrowserConfig, CrawlerRunConfig
|
||||
|
||||
async with Crawl4aiDockerClient() as client:
|
||||
# The SDK handles serialization for you!
|
||||
result = await client.crawl(
|
||||
urls=["https://example.com"],
|
||||
browser_config=BrowserConfig(headless=True),
|
||||
crawler_config=CrawlerRunConfig(stream=False)
|
||||
)
|
||||
print(result.markdown)
|
||||
```
|
||||
|
||||
## Metrics & Monitoring
|
||||
|
||||
Keep an eye on your crawler with these endpoints:
|
||||
|
||||
113
tests/docker/test_config_object.py
Normal file
113
tests/docker/test_config_object.py
Normal file
@@ -0,0 +1,113 @@
|
||||
import json
|
||||
from crawl4ai import (
|
||||
CrawlerRunConfig,
|
||||
DefaultMarkdownGenerator,
|
||||
RegexChunking,
|
||||
JsonCssExtractionStrategy,
|
||||
BM25ContentFilter,
|
||||
CacheMode
|
||||
)
|
||||
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
|
||||
from crawl4ai.deep_crawling.filters import FastFilterChain
|
||||
from crawl4ai.deep_crawling.filters import FastContentTypeFilter, FastDomainFilter
|
||||
from crawl4ai.deep_crawling.scorers import FastKeywordRelevanceScorer
|
||||
|
||||
def create_test_config() -> CrawlerRunConfig:
    """Build a fully-populated CrawlerRunConfig for round-trip serialization tests."""
    # CSS-based structured extraction: pull title/content from article pages.
    extraction_strategy = JsonCssExtractionStrategy(schema={
        "name": "ArticleExtractor",
        "baseSelector": "article.content",
        "fields": [
            {"name": "title", "selector": "h1", "type": "text"},
            {"name": "content", "selector": ".article-body", "type": "html"},
        ],
    })

    # Markdown generation filtered by BM25 relevance to a sample query.
    markdown_generator = DefaultMarkdownGenerator(
        content_filter=BM25ContentFilter(user_query="technology articles"),
        options={"ignore_links": False, "body_width": 0},
    )

    # Deep crawl: BFS bounded by depth, with content-type/domain filtering
    # and keyword-based URL scoring.
    deep_crawl_strategy = BFSDeepCrawlStrategy(
        max_depth=3,
        filter_chain=FastFilterChain([
            FastContentTypeFilter(["text/html"]),
            FastDomainFilter(blocked_domains=["ads.*"]),
        ]),
        url_scorer=FastKeywordRelevanceScorer(
            keywords=["article", "blog"],
            weight=1.0,
        ),
    )

    # Assemble the config exercising scalar, list, enum and nested-object fields.
    return CrawlerRunConfig(
        word_count_threshold=200,
        extraction_strategy=extraction_strategy,
        chunking_strategy=RegexChunking(patterns=[r"\n\n"]),
        markdown_generator=markdown_generator,
        css_selector="main.content",
        excluded_tags=["nav", "footer"],
        keep_attrs=["href", "src"],
        cache_mode=CacheMode.BYPASS,
        wait_until="networkidle",
        page_timeout=30000,
        scan_full_page=True,
        deep_crawl_strategy=deep_crawl_strategy,
        verbose=True,
        stream=True,
    )
|
||||
|
||||
def test_config_serialization_cycle():
    """Round-trip a config through dump()/load() and verify it survives intact."""
    original_config = create_test_config()

    # Serialize to a plain dictionary (printed for debugging).
    serialized = original_config.dump()
    print(json.dumps(serialized, indent=2))

    # Rebuild the config from the serialized form.
    deserialized_config = CrawlerRunConfig.load(serialized)

    # Scalar/list/enum attributes must round-trip unchanged.
    for attr in (
        "word_count_threshold",
        "css_selector",
        "excluded_tags",
        "keep_attrs",
        "cache_mode",
        "wait_until",
        "page_timeout",
        "scan_full_page",
        "verbose",
        "stream",
    ):
        assert getattr(deserialized_config, attr) == getattr(original_config, attr)

    # Nested strategy objects must come back as the right concrete types.
    assert isinstance(deserialized_config.extraction_strategy, JsonCssExtractionStrategy)
    assert isinstance(deserialized_config.chunking_strategy, RegexChunking)
    assert isinstance(deserialized_config.markdown_generator, DefaultMarkdownGenerator)
    assert isinstance(deserialized_config.markdown_generator.content_filter, BM25ContentFilter)
    assert isinstance(deserialized_config.deep_crawl_strategy, BFSDeepCrawlStrategy)

    # Deep crawl strategy internals survived the cycle.
    assert deserialized_config.deep_crawl_strategy.max_depth == 3
    assert isinstance(deserialized_config.deep_crawl_strategy.filter_chain, FastFilterChain)
    assert isinstance(deserialized_config.deep_crawl_strategy.url_scorer, FastKeywordRelevanceScorer)

    print("Serialization cycle test passed successfully!")


if __name__ == "__main__":
    test_config_serialization_cycle()
|
||||
Reference in New Issue
Block a user