refactor(scraping): replace ScrapingMode enum with strategy pattern

Replace the ScrapingMode enum with a proper strategy pattern implementation for content scraping.
This change introduces:
- New ContentScrapingStrategy abstract base class
- Concrete WebScrapingStrategy and LXMLWebScrapingStrategy implementations
- New Pydantic models for structured scraping results
- Updated documentation reflecting the new strategy-based approach

BREAKING CHANGE: ScrapingMode enum has been removed. Users should now use ContentScrapingStrategy implementations instead.
This commit is contained in:
UncleCode
2025-01-13 17:53:12 +08:00
parent f3ae5a657c
commit c3370ec5da
7 changed files with 185 additions and 70 deletions

View File

@@ -1,7 +1,8 @@
# __init__.py
from .async_webcrawler import AsyncWebCrawler, CacheMode
from .async_configs import BrowserConfig, CrawlerRunConfig, ScrapingMode
from .async_configs import BrowserConfig, CrawlerRunConfig
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy, LXMLWebScrapingStrategy
from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy, CosineStrategy, JsonCssExtractionStrategy
from .chunking_strategy import ChunkingStrategy, RegexChunking
from .markdown_generation_strategy import DefaultMarkdownGenerator
@@ -14,7 +15,9 @@ __all__ = [
"AsyncWebCrawler",
"CrawlResult",
"CacheMode",
"ScrapingMode",
"ContentScrapingStrategy",
"WebScrapingStrategy",
"LXMLWebScrapingStrategy",
'BrowserConfig',
'CrawlerRunConfig',
'ExtractionStrategy',