Enhance AsyncWebCrawler and related configurations

- Introduced new configuration classes: BrowserConfig and CrawlerRunConfig.
- Refactored AsyncWebCrawler to leverage the new configuration system for cleaner parameter management.
- Updated AsyncPlaywrightCrawlerStrategy for better flexibility and reduced legacy parameters.
- Improved error handling with detailed context extraction during exceptions.
- Enhanced overall maintainability and usability of the web crawler.
2024-12-12 19:35:09 +08:00
parent 5188b7a6a0
commit 0982c639ae
14 changed files with 6373 additions and 2667 deletions
--- a/crawl4ai/__init__.py
+++ b/crawl4ai/__init__.py
@@ -1,7 +1,11 @@
 # __init__.py

 from .async_webcrawler import AsyncWebCrawler, CacheMode
-
+from .async_configs import BrowserConfig, CrawlerRunConfig
+from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy, CosineStrategy, JsonCssExtractionStrategy
+from .chunking_strategy import ChunkingStrategy, RegexChunking
+from .markdown_generation_strategy import DefaultMarkdownGenerator
+from .content_filter_strategy import PruningContentFilter, BM25ContentFilter
 from .models import CrawlResult
 from .__version__ import __version__

@@ -9,6 +13,17 @@ __all__ = [
    "AsyncWebCrawler",
    "CrawlResult",
    "CacheMode",
+    'BrowserConfig',
+    'CrawlerRunConfig',
+    'ExtractionStrategy',
+    'LLMExtractionStrategy',
+    'CosineStrategy',
+    'JsonCssExtractionStrategy',
+    'ChunkingStrategy',
+    'RegexChunking',
+    'DefaultMarkdownGenerator',
+    'PruningContentFilter',
+    'BM25ContentFilter',
 ]

 def is_sync_version_installed():