diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py
index f4aa60b1..ed752252 100644
--- a/crawl4ai/__init__.py
+++ b/crawl4ai/__init__.py
@@ -33,10 +33,35 @@ from .async_dispatcher import (
 )
 from .docker_client import Crawl4aiDockerClient
 from .hub import CrawlerHub
-from .deep_crawling import DeepCrawlStrategy
+from .deep_crawling import (
+    DeepCrawlStrategy,
+    BFSDeepCrawlStrategy,
+    FastFilterChain,
+    FastContentTypeFilter,
+    FastDomainFilter,
+    FastURLFilter,
+    FastFilterStats,
+    FastKeywordRelevanceScorer,
+    FastURLScorer,
+    BestFirstCrawlingStrategy,
+    DFSDeepCrawlStrategy,
+    DeepCrawlDecorator,
+)
+
 __all__ = [
     "AsyncWebCrawler",
     "DeepCrawlStrategy",
+    "BFSDeepCrawlStrategy",
+    "BestFirstCrawlingStrategy",
+    "DFSDeepCrawlStrategy",
+    "FastFilterChain",
+    "FastContentTypeFilter",
+    "FastDomainFilter",
+    "FastFilterStats",
+    "FastURLFilter",
+    "FastKeywordRelevanceScorer",
+    "FastURLScorer",
+    "DeepCrawlDecorator",
     "CrawlResult",
     "CrawlerHub",
     "CacheMode",
diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py
index 0981c1d2..e43b1394 100644
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -1,3 +1,5 @@
+import re
+from attr import has
 from .config import (
     MIN_WORD_THRESHOLD,
     IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
@@ -17,6 +19,7 @@ from typing import Union, List
 from .cache_context import CacheMode
 from .proxy_strategy import ProxyRotationStrategy
 
+
 import inspect
 from typing import Any, Dict, Optional
 from enum import Enum
@@ -47,7 +50,11 @@ def to_serializable_dict(obj: Any) -> Dict:
     # Handle lists, tuples, and sets
     if isinstance(obj, (list, tuple, set)):
         return [to_serializable_dict(item) for item in obj]
-    
+
+    # Handle frozensets, which are not caught by the set check above
+    if isinstance(obj, frozenset):
+        return [to_serializable_dict(item) for item in list(obj)]
+
     # Handle dictionaries - preserve them as-is
     if isinstance(obj, dict):
         return {
@@ -60,6 +67,7 @@
     # Get constructor signature
     sig = inspect.signature(obj.__class__.__init__)
     params = sig.parameters
+    _type = obj.__class__.__name__
 
     # Get current values
     current_values = {}
@@ -73,6 +81,24 @@
             if not (is_empty_value(value) and is_empty_value(param.default)):
                 if value != param.default:
                     current_values[name] = to_serializable_dict(value)
+            elif hasattr(obj.__class__, '__slots__') and f"_{name}" in obj.__slots__:
+                slot = f"_{name}"
+                slot_value = getattr(obj, slot, None)
+                if not is_empty_value(slot_value):
+                    current_values[name] = to_serializable_dict(slot_value)
+
+
+    # # Then handle slots if present
+    # if hasattr(obj.__class__, '__slots__'):
+    #     for slot in obj.__class__.__slots__:
+    #         # Remove leading underscore if present
+    #         param_name = slot[1:] if slot.startswith('_') else slot
+
+    #         # Get the slot value if it exists
+    #         if hasattr(obj, slot):
+    #             value = getattr(obj, slot)
+    #             if not is_empty_value(value):
+    #                 current_values[param_name] = to_serializable_dict(value)
 
     return {
         "type": obj.__class__.__name__,
@@ -100,7 +126,10 @@ def from_serializable_dict(data: Any) -> Any:
 
         # Import from crawl4ai for class instances
         import crawl4ai
-        cls = getattr(crawl4ai, data["type"])
+        if not hasattr(crawl4ai, data["type"]):
+            return None
+        else:
+            cls = getattr(crawl4ai, data["type"])
 
         # Handle Enum
         if issubclass(cls, Enum):
@@ -361,7 +390,14 @@ class BrowserConfig():
     def load( data: dict) -> "BrowserConfig":
         # Deserialize the object from a dictionary
         config = from_serializable_dict(data)
-        return BrowserConfig.from_kwargs(config)
+
+        # Check if the deserialized object is already an instance of BrowserConfig
+        if isinstance(config, BrowserConfig):
+            return config
+        elif isinstance(config, dict):
+            return BrowserConfig.from_kwargs(config)
+        else:
+            raise ValueError("Invalid data type for BrowserConfig")
 
 
 class CrawlerRunConfig():
@@ -807,7 +843,13 @@ class CrawlerRunConfig():
     def load(data: dict) -> "CrawlerRunConfig":
         # Deserialize the object from a dictionary
         config = from_serializable_dict(data)
-        return CrawlerRunConfig.from_kwargs(config)
+        # If config is already an instance of CrawlerRunConfig, return it
+        if isinstance(config, CrawlerRunConfig):
+            return config
+        elif isinstance(config, dict):
+            return CrawlerRunConfig.from_kwargs(config)
+        else:
+            raise ValueError("Invalid data type")
 
     def to_dict(self):
         return {
diff --git a/crawl4ai/deep_crawling/__init__.py b/crawl4ai/deep_crawling/__init__.py
index 8ebdb58b..f885d2ab 100644
--- a/crawl4ai/deep_crawling/__init__.py
+++ b/crawl4ai/deep_crawling/__init__.py
@@ -3,6 +3,17 @@ from .base_strategy import DeepCrawlDecorator, DeepCrawlStrategy
 from .bfs_strategy import BFSDeepCrawlStrategy
 from .bff_strategy import BestFirstCrawlingStrategy
 from .dfs_strategy import DFSDeepCrawlStrategy
+from .filters import (
+    FastFilterChain,
+    FastContentTypeFilter,
+    FastDomainFilter,
+    FastURLFilter,
+    FastFilterStats,
+)
+from .scorers import (
+    FastKeywordRelevanceScorer,
+    FastURLScorer,
+)
 
 __all__ = [
     "DeepCrawlDecorator",
@@ -10,4 +21,11 @@ __all__ = [
     "BFSDeepCrawlStrategy",
     "BestFirstCrawlingStrategy",
     "DFSDeepCrawlStrategy",
+    "FastFilterChain",
+    "FastContentTypeFilter",
+    "FastDomainFilter",
+    "FastURLFilter",
+    "FastFilterStats",
+    "FastKeywordRelevanceScorer",
+    "FastURLScorer",
 ]
\ No newline at end of file
diff --git a/crawl4ai/deep_crawling/filters.py b/crawl4ai/deep_crawling/filters.py
index 68472e9d..4e754424 100644
--- a/crawl4ai/deep_crawling/filters.py
+++ b/crawl4ai/deep_crawling/filters.py
@@ -522,7 +522,7 @@ class FastContentTypeFilter(FastURLFilter):
         return path.rpartition(".")[-1].lower()
 
     def __init__(
-        self, allowed_types: Union[str, List[str]], check_extension: bool = True
+        self, allowed_types: Union[str, List[str]], check_extension: bool = True, ext_map: Dict[str, str] = _MIME_MAP
     ):
         super().__init__()
         # Normalize and store as frozenset for fast lookup
diff --git a/deploy/docker/README.md b/deploy/docker/README.md
index 30f9fc13..46dd803a 100644
--- a/deploy/docker/README.md
+++ b/deploy/docker/README.md
@@ -213,7 +213,37 @@ Configure your build with these parameters:
 
 ## Using the API
 
-### Understanding Request Schema
+The next sections cover two ways to talk to the Docker server. The first is the Python client SDK I built for the library (a Node.js version is on the way); I highly recommend it, because it shields you from serialization mistakes. The second, more hands-on route is to build the JSON request structure yourself and send it to the API endpoints, which I explain in detail below.
+
+### Python SDK
+
+The SDK makes things easier! Here's how to use it:
+
+```python
+from crawl4ai.docker_client import Crawl4aiDockerClient
+from crawl4ai import BrowserConfig, CrawlerRunConfig
+
+async with Crawl4aiDockerClient() as client:
+    # The SDK handles serialization for you!
+    result = await client.crawl(
+        urls=["https://example.com"],
+        browser_config=BrowserConfig(headless=True),
+        crawler_config=CrawlerRunConfig(stream=False)
+    )
+    print(result.markdown)
+```
+
+`Crawl4aiDockerClient` is an async context manager that handles the connection for you. You can pass in optional parameters for more control:
+
+- `base_url` (str): Base URL of the Crawl4AI Docker server
+- `timeout` (float): Default timeout for requests in seconds
+- `verify_ssl` (bool): Whether to verify SSL certificates
+- `verbose` (bool): Whether to show logging output
+- `log_file` (str, optional): Path to log file if file logging is desired
+
+This client SDK generates a properly structured JSON request for the server's HTTP API.
+
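+For example, pointing the client at a non-default server might look like the sketch below. Every argument is optional, and the address and log path shown are placeholders for your own setup, not SDK defaults.
+
+```python
+from crawl4ai.docker_client import Crawl4aiDockerClient
+
+# All constructor arguments are optional; adjust them to wherever your server runs.
+async with Crawl4aiDockerClient(
+    base_url="http://localhost:11235",   # placeholder address for your server
+    timeout=60.0,                        # per-request timeout in seconds
+    verify_ssl=False,                    # skip certificate checks for local testing
+    verbose=True,                        # show the client's log output
+    log_file="./crawl4ai_client.log",    # optional file logging
+) as client:
+    ...
+```
+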
+### Second Approach: Direct API Calls
 
 This is super important! The API expects a specific structure that matches our Python classes. Let me show you how it works.
 
@@ -247,10 +277,34 @@ This will output something like:
 }
 ```
 
-#### Making API Requests
-So when making a request, your JSON should look like this:
+#### Structuring Your Requests
+1. Basic Request Structure
+Every request must include URLs and may include configuration objects:
+
+```json
+{
+  "urls": ["https://example.com"],
+  "browser_config": {...},
+  "crawler_config": {...}
+}
+```
+
+2. Understanding Type-Params Pattern
+All complex objects follow this pattern:
+```json
+{
+  "type": "ClassName",
+  "params": {
+    "param1": value1,
+    "param2": value2
+  }
+}
+```
+> 💡 **Note**: Simple types (strings, numbers, booleans) are passed directly without the type-params wrapper. If you'd rather not write this structure by hand, see the sketch after item 6 below.
+
+3. Browser Configuration
 ```json
 {
   "urls": ["https://example.com"],
   "browser_config": {
     "type": "BrowserConfig",
     "params": {
       "headless": true,
-      "viewport": {"width": 1200, "height": 800}
+      "viewport": {
+        "type": "dict",
+        "value": {
+          "width": 1200,
+          "height": 800
+        }
+      }
     }
-  },
+  }
+}
+```
+
+4. Simple Crawler Configuration
+```json
+{
+  "urls": ["https://example.com"],
+  "crawler_config": {
+    "type": "CrawlerRunConfig",
+    "params": {
+      "word_count_threshold": 200,
+      "stream": true,
+      "verbose": true
+    }
+  }
+}
+```
+
+5. Advanced Crawler Configuration
+```json
+{
+  "urls": ["https://example.com"],
   "crawler_config": {
     "type": "CrawlerRunConfig",
     "params": {
@@ -283,7 +365,175 @@
     }
   }
 }
 ```
 
-> 💡 **Pro tip**: Look at the class names in the library documentation - they map directly to the "type" fields in your requests!
+6. Adding Strategies
+
+**Chunking Strategy**:
+```json
+{
+  "crawler_config": {
+    "type": "CrawlerRunConfig",
+    "params": {
+      "chunking_strategy": {
+        "type": "RegexChunking",
+        "params": {
+          "patterns": ["\n\n", "\\.\\s+"]
+        }
+      }
+    }
+  }
+}
+```
+
+**Extraction Strategy**:
+```json
+{
+  "crawler_config": {
+    "type": "CrawlerRunConfig",
+    "params": {
+      "extraction_strategy": {
+        "type": "JsonCssExtractionStrategy",
+        "params": {
+          "schema": {
+            "baseSelector": "article.post",
+            "fields": [
+              {"name": "title", "selector": "h1", "type": "text"},
+              {"name": "content", "selector": ".content", "type": "html"}
+            ]
+          }
+        }
+      }
+    }
+  }
+}
+```
+
+**LLM Extraction Strategy**:
+```json
+{
+  "crawler_config": {
+    "type": "CrawlerRunConfig",
+    "params": {
+      "extraction_strategy": {
+        "type": "LLMExtractionStrategy",
+        "params": {
+          "instruction": "Extract article title, author, publication date and main content",
+          "provider": "openai/gpt-4",
+          "api_token": "your-api-token",
+          "schema": {
+            "type": "dict",
+            "value": {
+              "title": "Article Schema",
+              "type": "object",
+              "properties": {
+                "title": {
+                  "type": "string",
+                  "description": "The article's headline"
+                },
+                "author": {
+                  "type": "string",
+                  "description": "The author's name"
+                },
+                "published_date": {
+                  "type": "string",
+                  "format": "date-time",
+                  "description": "Publication date and time"
+                },
+                "content": {
+                  "type": "string",
+                  "description": "The main article content"
+                }
+              },
+              "required": ["title", "content"]
+            }
+          }
+        }
+      }
+    }
+  }
+}
+```
+
+**Deep Crawl Strategy**:
+```json
+{
+  "crawler_config": {
+    "type": "CrawlerRunConfig",
+    "params": {
+      "deep_crawl_strategy": {
+        "type": "BFSDeepCrawlStrategy",
+        "params": {
+          "max_depth": 3,
+          "max_pages": 100,
+          "filter_chain": {
+            "type": "FastFilterChain",
+            "params": {
+              "filters": [
+                {
+                  "type": "FastContentTypeFilter",
+                  "params": {
+                    "allowed_types": ["text/html", "application/xhtml+xml"]
+                  }
+                },
+                {
+                  "type": "FastDomainFilter",
+                  "params": {
+                    "allowed_domains": ["blog.*", "docs.*"],
+                    "blocked_domains": ["ads.*", "analytics.*"]
+                  }
+                },
+                {
+                  "type": "FastURLPatternFilter",
+                  "params": {
+                    "allowed_patterns": ["^/blog/", "^/docs/"],
+                    "blocked_patterns": [".*/ads/", ".*/sponsored/"]
+                  }
+                }
+              ]
+            }
+          },
+          "url_scorer": {
+            "type": "FastCompositeScorer",
+            "params": {
+              "scorers": [
+                {
+                  "type": "FastKeywordRelevanceScorer",
+                  "params": {
+                    "keywords": ["tutorial", "guide", "documentation"],
+                    "weight": 1.0
+                  }
+                },
+                {
+                  "type": "FastPathDepthScorer",
+                  "params": {
+                    "weight": 0.5,
+                    "preferred_depth": 2
+                  }
+                },
+                {
+                  "type": "FastFreshnessScorer",
+                  "params": {
+                    "weight": 0.8,
+                    "max_age_days": 365
+                  }
+                }
+              ]
+            }
+          }
+        }
+      }
+    }
+  }
+}
+```
+
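+If you'd rather not write the type-params structure by hand, the config classes can generate it for you: `dump()` (the same helper exercised by the serialization test added in this PR) emits exactly the shape shown above. A minimal sketch, assuming `BrowserConfig` exposes the same `dump()` helper that the test uses on `CrawlerRunConfig`:
+
+```python
+import json
+from crawl4ai import BrowserConfig, CrawlerRunConfig
+
+# dump() produces the {"type": ..., "params": {...}} structure described above,
+# so the request body can be assembled without hand-writing any JSON.
+payload = {
+    "urls": ["https://example.com"],
+    "browser_config": BrowserConfig(headless=True).dump(),
+    "crawler_config": CrawlerRunConfig(stream=True, word_count_threshold=200).dump(),
+}
+print(json.dumps(payload, indent=2))
+```
+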
+**Important Rules**:
+
+- Always use the type-params pattern for class instances
+- Use direct values for primitives (numbers, strings, booleans)
+- Wrap dictionaries with {"type": "dict", "value": {...}}
+- Arrays/lists are passed directly without type-params
+- All parameters are optional unless specifically required
+
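+A quick way to check a hand-written config fragment against these rules is to round-trip it through the `CrawlerRunConfig.load()` helper added in this PR, much like the new serialization test does. A rough sketch, assuming crawl4ai is installed locally:
+
+```python
+from crawl4ai import CrawlerRunConfig
+
+# A hand-written fragment following the type-params pattern above.
+fragment = {
+    "type": "CrawlerRunConfig",
+    "params": {"word_count_threshold": 200, "stream": True},
+}
+
+# load() should rebuild a real CrawlerRunConfig from it; if it can't,
+# the fragment doesn't follow the expected structure.
+config = CrawlerRunConfig.load(fragment)
+assert isinstance(config, CrawlerRunConfig)
+print(config.word_count_threshold, config.stream)
+```
+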
 
 ### REST API Examples
@@ -329,24 +579,6 @@
 for line in response.iter_lines():
     print(line.decode())
 ```
 
-### Python SDK
-
-The SDK makes things even easier! Here's how to use it:
-
-```python
-from crawl4ai.docker_client import Crawl4aiDockerClient
-from crawl4ai import BrowserConfig, CrawlerRunConfig
-
-async with Crawl4aiDockerClient() as client:
-    # The SDK handles serialization for you!
-    result = await client.crawl(
-        urls=["https://example.com"],
-        browser_config=BrowserConfig(headless=True),
-        crawler_config=CrawlerRunConfig(stream=False)
-    )
-    print(result.markdown)
-```
-
 ## Metrics & Monitoring
 
 Keep an eye on your crawler with these endpoints:
diff --git a/tests/docker/test_config_object.py b/tests/docker/test_config_object.py
new file mode 100644
index 00000000..94a30f05
--- /dev/null
+++ b/tests/docker/test_config_object.py
@@ -0,0 +1,113 @@
+import json
+from crawl4ai import (
+    CrawlerRunConfig,
+    DefaultMarkdownGenerator,
+    RegexChunking,
+    JsonCssExtractionStrategy,
+    BM25ContentFilter,
+    CacheMode
+)
+from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
+from crawl4ai.deep_crawling.filters import FastFilterChain
+from crawl4ai.deep_crawling.filters import FastContentTypeFilter, FastDomainFilter
+from crawl4ai.deep_crawling.scorers import FastKeywordRelevanceScorer
+
+def create_test_config() -> CrawlerRunConfig:
+    # Set up content filtering and markdown generation
+    content_filter = BM25ContentFilter(
+        user_query="technology articles",
+    )
+
+    markdown_generator = DefaultMarkdownGenerator(
+        content_filter=content_filter,
+        options={"ignore_links": False, "body_width": 0}
+    )
+
+    # Set up extraction strategy
+    extraction_schema = {
+        "name": "ArticleExtractor",
+        "baseSelector": "article.content",
+        "fields": [
+            {"name": "title", "selector": "h1", "type": "text"},
+            {"name": "content", "selector": ".article-body", "type": "html"}
+        ]
+    }
+    extraction_strategy = JsonCssExtractionStrategy(schema=extraction_schema)
+
+    # Set up deep crawling
+    filter_chain = FastFilterChain([
+        FastContentTypeFilter(["text/html"]),
+        FastDomainFilter(blocked_domains=["ads.*"])
+    ])
+
+    url_scorer = FastKeywordRelevanceScorer(
+        keywords=["article", "blog"],
+        weight=1.0
+    )
+
+    deep_crawl_strategy = BFSDeepCrawlStrategy(
+        max_depth=3,
+        filter_chain=filter_chain,
+        url_scorer=url_scorer
+    )
+
+    # Create the config
+    config = CrawlerRunConfig(
+        word_count_threshold=200,
+        extraction_strategy=extraction_strategy,
+        chunking_strategy=RegexChunking(patterns=[r"\n\n"]),
+        markdown_generator=markdown_generator,
+        css_selector="main.content",
+        excluded_tags=["nav", "footer"],
+        keep_attrs=["href", "src"],
+        cache_mode=CacheMode.BYPASS,
+        wait_until="networkidle",
+        page_timeout=30000,
+        scan_full_page=True,
+        deep_crawl_strategy=deep_crawl_strategy,
+        verbose=True,
+        stream=True
+    )
+
+    return config
+
+def test_config_serialization_cycle():
+    # Create original config
+    original_config = create_test_config()
+
+    # Dump to serializable dictionary
+    serialized = original_config.dump()
+
+    print(json.dumps(serialized, indent=2))
+
+    # Load back into config object
+    deserialized_config = CrawlerRunConfig.load(serialized)
+
+    # Verify core attributes
+    assert deserialized_config.word_count_threshold == original_config.word_count_threshold
+    assert deserialized_config.css_selector == original_config.css_selector
+    assert deserialized_config.excluded_tags == original_config.excluded_tags
+    assert deserialized_config.keep_attrs == original_config.keep_attrs
+    assert deserialized_config.cache_mode == original_config.cache_mode
+    assert deserialized_config.wait_until == original_config.wait_until
+    assert deserialized_config.page_timeout == original_config.page_timeout
+    assert deserialized_config.scan_full_page == original_config.scan_full_page
+    assert deserialized_config.verbose == original_config.verbose
+    assert deserialized_config.stream == original_config.stream
+
+    # Verify complex objects
+    assert isinstance(deserialized_config.extraction_strategy, JsonCssExtractionStrategy)
+    assert isinstance(deserialized_config.chunking_strategy, RegexChunking)
+    assert isinstance(deserialized_config.markdown_generator, DefaultMarkdownGenerator)
+    assert isinstance(deserialized_config.markdown_generator.content_filter, BM25ContentFilter)
+    assert isinstance(deserialized_config.deep_crawl_strategy, BFSDeepCrawlStrategy)
+
+    # Verify deep crawl strategy configuration
+    assert deserialized_config.deep_crawl_strategy.max_depth == 3
+    assert isinstance(deserialized_config.deep_crawl_strategy.filter_chain, FastFilterChain)
+    assert isinstance(deserialized_config.deep_crawl_strategy.url_scorer, FastKeywordRelevanceScorer)
+
+    print("Serialization cycle test passed successfully!")
+
+if __name__ == "__main__":
+    test_config_serialization_cycle()
\ No newline at end of file