feat(config): enhance serialization and add deep crawling exports
Improve configuration serialization with better handling of frozensets and slots. Expand deep crawling module exports and documentation. Add comprehensive API usage examples in the Docker README.

- Add support for frozenset serialization
- Improve error handling in config loading
- Export additional deep crawling components
- Enhance Docker API documentation with detailed examples
- Fix ContentTypeFilter initialization
This commit is contained in:
113
tests/docker/test_config_object.py
Normal file
113
tests/docker/test_config_object.py
Normal file
@@ -0,0 +1,113 @@
|
||||
import json
|
||||
from crawl4ai import (
|
||||
CrawlerRunConfig,
|
||||
DefaultMarkdownGenerator,
|
||||
RegexChunking,
|
||||
JsonCssExtractionStrategy,
|
||||
BM25ContentFilter,
|
||||
CacheMode
|
||||
)
|
||||
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
|
||||
from crawl4ai.deep_crawling.filters import FastFilterChain
|
||||
from crawl4ai.deep_crawling.filters import FastContentTypeFilter, FastDomainFilter
|
||||
from crawl4ai.deep_crawling.scorers import FastKeywordRelevanceScorer
|
||||
|
||||
def create_test_config() -> CrawlerRunConfig:
    """Build a CrawlerRunConfig that exercises every serializable sub-component.

    Returns:
        A fully-populated CrawlerRunConfig combining markdown generation,
        CSS extraction, chunking, caching, and a BFS deep-crawl strategy,
        suitable for round-trip serialization testing.
    """
    # Markdown generation backed by BM25 relevance filtering.
    markdown_gen = DefaultMarkdownGenerator(
        content_filter=BM25ContentFilter(
            user_query="technology articles",
        ),
        options={"ignore_links": False, "body_width": 0},
    )

    # CSS-based structured extraction of an article's title and body.
    schema = {
        "name": "ArticleExtractor",
        "baseSelector": "article.content",
        "fields": [
            {"name": "title", "selector": "h1", "type": "text"},
            {"name": "content", "selector": ".article-body", "type": "html"},
        ],
    }

    # Deep crawl: breadth-first to depth 3, HTML responses only,
    # ad-serving domains blocked, URLs ranked by keyword relevance.
    crawl_strategy = BFSDeepCrawlStrategy(
        max_depth=3,
        filter_chain=FastFilterChain([
            FastContentTypeFilter(["text/html"]),
            FastDomainFilter(blocked_domains=["ads.*"]),
        ]),
        url_scorer=FastKeywordRelevanceScorer(
            keywords=["article", "blog"],
            weight=1.0,
        ),
    )

    return CrawlerRunConfig(
        word_count_threshold=200,
        extraction_strategy=JsonCssExtractionStrategy(schema=schema),
        chunking_strategy=RegexChunking(patterns=[r"\n\n"]),
        markdown_generator=markdown_gen,
        css_selector="main.content",
        excluded_tags=["nav", "footer"],
        keep_attrs=["href", "src"],
        cache_mode=CacheMode.BYPASS,
        wait_until="networkidle",
        page_timeout=30000,
        scan_full_page=True,
        deep_crawl_strategy=crawl_strategy,
        verbose=True,
        stream=True,
    )
|
||||
|
||||
def test_config_serialization_cycle():
    """Round-trip a fully-populated config through dump()/load() and verify it survives."""
    original = create_test_config()

    # Serialize to a plain, JSON-compatible dictionary (printed for debugging).
    payload = original.dump()
    print(json.dumps(payload, indent=2))

    # Reconstruct a config object from the serialized form.
    restored = CrawlerRunConfig.load(payload)

    # Scalar and simple-container attributes must round-trip exactly.
    simple_attrs = (
        "word_count_threshold",
        "css_selector",
        "excluded_tags",
        "keep_attrs",
        "cache_mode",
        "wait_until",
        "page_timeout",
        "scan_full_page",
        "verbose",
        "stream",
    )
    for attr in simple_attrs:
        assert getattr(restored, attr) == getattr(original, attr)

    # Strategy objects must come back as their concrete types, not dicts.
    assert isinstance(restored.extraction_strategy, JsonCssExtractionStrategy)
    assert isinstance(restored.chunking_strategy, RegexChunking)
    assert isinstance(restored.markdown_generator, DefaultMarkdownGenerator)
    assert isinstance(restored.markdown_generator.content_filter, BM25ContentFilter)
    assert isinstance(restored.deep_crawl_strategy, BFSDeepCrawlStrategy)

    # Nested deep-crawl configuration must also survive the cycle.
    assert restored.deep_crawl_strategy.max_depth == 3
    assert isinstance(restored.deep_crawl_strategy.filter_chain, FastFilterChain)
    assert isinstance(restored.deep_crawl_strategy.url_scorer, FastKeywordRelevanceScorer)

    print("Serialization cycle test passed successfully!")
if __name__ == "__main__":
    # Allow running the round-trip check directly, without a pytest runner.
    test_config_serialization_cycle()
Reference in New Issue
Block a user