refactor(scraping): replace ScrapingMode enum with strategy pattern

Replace the ScrapingMode enum with a proper strategy pattern implementation for content scraping.
This change introduces:
- New ContentScrapingStrategy abstract base class
- Concrete WebScrapingStrategy and LXMLWebScrapingStrategy implementations
- New Pydantic models for structured scraping results
- Updated documentation reflecting the new strategy-based approach

BREAKING CHANGE: ScrapingMode enum has been removed. Users should now use ContentScrapingStrategy implementations instead.
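A minimal migration sketch for downstream code (hypothetical user snippet; the imports match the updated `__init__.py` diff below):

```python
# Before this commit (ScrapingMode enum, now removed):
# from crawl4ai import CrawlerRunConfig, ScrapingMode
# config = CrawlerRunConfig(scraping_mode=ScrapingMode.LXML)

# After this commit (strategy instances):
from crawl4ai import CrawlerRunConfig, LXMLWebScrapingStrategy

config = CrawlerRunConfig(
    scraping_strategy=LXMLWebScrapingStrategy()  # omit to get the BeautifulSoup default
)
```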
UncleCode
2025-01-13 17:53:12 +08:00
parent f3ae5a657c
commit c3370ec5da
7 changed files with 185 additions and 70 deletions

View File: crawl4ai/__init__.py

@@ -1,7 +1,8 @@
 # __init__.py
 from .async_webcrawler import AsyncWebCrawler, CacheMode
-from .async_configs import BrowserConfig, CrawlerRunConfig, ScrapingMode
+from .async_configs import BrowserConfig, CrawlerRunConfig
+from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy, LXMLWebScrapingStrategy
 from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy, CosineStrategy, JsonCssExtractionStrategy
 from .chunking_strategy import ChunkingStrategy, RegexChunking
 from .markdown_generation_strategy import DefaultMarkdownGenerator
@@ -14,7 +15,9 @@ __all__ = [
     "AsyncWebCrawler",
     "CrawlResult",
     "CacheMode",
-    "ScrapingMode",
+    "ContentScrapingStrategy",
+    "WebScrapingStrategy",
+    "LXMLWebScrapingStrategy",
     'BrowserConfig',
     'CrawlerRunConfig',
     'ExtractionStrategy',

View File: crawl4ai/async_configs.py

@@ -9,10 +9,10 @@ from .config import (
 )
 from .user_agent_generator import UserAgentGenerator
 from .extraction_strategy import ExtractionStrategy
-from .chunking_strategy import ChunkingStrategy
+from .chunking_strategy import ChunkingStrategy, RegexChunking
 from .markdown_generation_strategy import MarkdownGenerationStrategy
+from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
 from typing import Union, List
-from enum import Enum

 class BrowserConfig:
     """
@@ -184,12 +184,6 @@ class BrowserConfig:
 )

-class ScrapingMode(str, Enum):
-    """Enum for different scraping modes."""
-    BEAUTIFULSOUP = "beautifulsoup"
-    LXML = "lxml"

 class CrawlerRunConfig:
     """
     Configuration class for controlling how the crawler runs each crawl operation.
@@ -227,8 +221,8 @@ class CrawlerRunConfig:
         Default: False.
     parser_type (str): Type of parser to use for HTML parsing.
         Default: "lxml".
-    scraping_mode (ScrapingMode): Scraping mode to use.
-        Default: ScrapingMode.BEAUTIFULSOUP.
+    scraping_strategy (ContentScrapingStrategy): Scraping strategy to use.
+        Default: WebScrapingStrategy.

     # Caching Parameters
     cache_mode (CacheMode or None): Defines how caching is handled.
@@ -329,7 +323,7 @@
     # Content Processing Parameters
     word_count_threshold: int = MIN_WORD_THRESHOLD,
     extraction_strategy: ExtractionStrategy = None,
-    chunking_strategy: ChunkingStrategy = None,
+    chunking_strategy: ChunkingStrategy = RegexChunking(),
     markdown_generator: MarkdownGenerationStrategy = None,
     content_filter=None,
     only_text: bool = False,
@@ -340,7 +334,7 @@
     remove_forms: bool = False,
     prettiify: bool = False,
     parser_type: str = "lxml",
-    scraping_mode: ScrapingMode = ScrapingMode.BEAUTIFULSOUP,
+    scraping_strategy: ContentScrapingStrategy = None,

     # SSL Parameters
     fetch_ssl_certificate: bool = False,
@@ -413,7 +407,7 @@
     self.remove_forms = remove_forms
     self.prettiify = prettiify
     self.parser_type = parser_type
-    self.scraping_mode = scraping_mode
+    self.scraping_strategy = scraping_strategy or WebScrapingStrategy()

     # SSL Parameters
     self.fetch_ssl_certificate = fetch_ssl_certificate
@@ -480,7 +474,6 @@
     # Set default chunking strategy if None
     if self.chunking_strategy is None:
-        from .chunking_strategy import RegexChunking
         self.chunking_strategy = RegexChunking()

 @staticmethod
@@ -489,7 +482,7 @@
     # Content Processing Parameters
     word_count_threshold=kwargs.get("word_count_threshold", 200),
     extraction_strategy=kwargs.get("extraction_strategy"),
-    chunking_strategy=kwargs.get("chunking_strategy"),
+    chunking_strategy=kwargs.get("chunking_strategy", RegexChunking()),
     markdown_generator=kwargs.get("markdown_generator"),
     content_filter=kwargs.get("content_filter"),
     only_text=kwargs.get("only_text", False),
@@ -500,7 +493,7 @@
     remove_forms=kwargs.get("remove_forms", False),
     prettiify=kwargs.get("prettiify", False),
     parser_type=kwargs.get("parser_type", "lxml"),
-    scraping_mode=kwargs.get("scraping_mode", ScrapingMode.BEAUTIFULSOUP),
+    scraping_strategy=kwargs.get("scraping_strategy"),

     # SSL Parameters
     fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
@@ -574,7 +567,7 @@
     "remove_forms": self.remove_forms,
     "prettiify": self.prettiify,
     "parser_type": self.parser_type,
-    "scraping_mode": self.scraping_mode,
+    "scraping_strategy": self.scraping_strategy,
     "fetch_ssl_certificate": self.fetch_ssl_certificate,
     "cache_mode": self.cache_mode,
     "session_id": self.session_id,

View File: crawl4ai/async_webcrawler.py

@@ -543,27 +543,20 @@ class AsyncWebCrawler:
     _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
     t1 = time.perf_counter()

-    # Initialize scraping strategy based on mode
-    if config.scraping_mode == ScrapingMode.LXML:
-        scrapping_strategy = LXMLWebScrapingStrategy(logger=self.logger)
-    else:  # Default to BeautifulSoup
-        scrapping_strategy = WebScrapingStrategy(logger=self.logger)
+    # Get scraping strategy and ensure it has a logger
+    scraping_strategy = config.scraping_strategy
+    if not scraping_strategy.logger:
+        scraping_strategy.logger = self.logger

     # Process HTML content
     params = {k: v for k, v in config.to_dict().items() if k not in ["url"]}
     # add keys from kwargs to params that don't exist in params
     params.update({k: v for k, v in kwargs.items() if k not in params.keys()})

-    result = scrapping_strategy.scrap(
+    result = scraping_strategy.scrap(
         url,
         html,
-        **params,
-        # word_count_threshold=config.word_count_threshold,
-        # css_selector=config.css_selector,
-        # only_text=config.only_text,
-        # image_description_min_word_threshold=config.image_description_min_word_threshold,
-        # content_filter=config.content_filter,
-        # **kwargs
+        **params
     )

     if result is None:
@@ -576,13 +569,17 @@
-    # Extract results
-    cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
-    fit_markdown = sanitize_input_encode(result.get("fit_markdown", ""))
-    fit_html = sanitize_input_encode(result.get("fit_html", ""))
-    media = result.get("media", [])
-    links = result.get("links", [])
-    metadata = result.get("metadata", {})
+    # Extract results - handle both dict and ScrapingResult
+    if isinstance(result, dict):
+        cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
+        media = result.get("media", {})
+        links = result.get("links", {})
+        metadata = result.get("metadata", {})
+    else:
+        cleaned_html = sanitize_input_encode(result.cleaned_html)
+        media = result.media.model_dump()
+        links = result.links.model_dump()
+        metadata = result.metadata

     # Markdown Generation
     markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator()
@@ -610,10 +607,7 @@
     )

     # Handle content extraction if needed
-    if (extracted_content is None and
-        config.extraction_strategy and
-        config.chunking_strategy and
-        not isinstance(config.extraction_strategy, NoExtractionStrategy)):
+    if (not bool(extracted_content) and config.extraction_strategy and not isinstance(config.extraction_strategy, NoExtractionStrategy)):

         t1 = time.perf_counter()
@@ -664,8 +658,8 @@
     cleaned_html=cleaned_html,
     markdown_v2=markdown_v2,
     markdown=markdown,
-    fit_markdown=fit_markdown,
-    fit_html=fit_html,
+    fit_markdown=markdown_result.fit_markdown,
+    fit_html=markdown_result.fit_html,
     media=media,
     links=links,
     metadata=metadata,
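The `isinstance` branch above is what keeps old-style strategies working: anything that still returns a plain dict is read with `.get()`, while `ScrapingResult` objects are flattened through Pydantic's `model_dump()`. A standalone sketch of that normalization (hypothetical helper, not part of the commit):

```python
from typing import Any, Dict, Union

from crawl4ai.models import ScrapingResult  # module path per the models.py diff below

def normalize_scrape_result(result: Union[Dict[str, Any], ScrapingResult]) -> Dict[str, Any]:
    """Flatten either result shape into the dict the crawler consumes."""
    if isinstance(result, dict):  # legacy dict-returning strategy
        return {
            "cleaned_html": result.get("cleaned_html", ""),
            "media": result.get("media", {}),
            "links": result.get("links", {}),
            "metadata": result.get("metadata", {}),
        }
    return {
        "cleaned_html": result.cleaned_html,
        "media": result.media.model_dump(),
        "links": result.links.model_dump(),
        "metadata": result.metadata,
    }
```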

View File: crawl4ai/content_scraping_strategy.py

@@ -21,6 +21,7 @@ from .utils import (
 from lxml import etree
 from lxml import html as lhtml
 from typing import Dict, Any, List, Tuple
+from .models import ScrapingResult, MediaItem, Link, Media, Links

 # Pre-compile regular expressions for Open Graph and Twitter metadata
 OG_REGEX = re.compile(r'^og:')
@@ -73,11 +74,11 @@ def fetch_image_file_size(img, base_url):
 class ContentScrapingStrategy(ABC):
     @abstractmethod
-    def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
+    def scrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
         pass

     @abstractmethod
-    async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
+    async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
         pass

 class WebScrapingStrategy(ContentScrapingStrategy):
@@ -101,7 +102,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
         log_method = getattr(self.logger, level)
         log_method(message=message, tag=tag, **kwargs)

-    def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
+    def scrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
         """
         Main entry point for content scraping.
@@ -111,16 +112,40 @@ class WebScrapingStrategy(ContentScrapingStrategy):
             **kwargs: Additional keyword arguments.

         Returns:
-            Dict[str, Any]: A dictionary containing the scraped content. This dictionary contains the following keys:
-                - 'markdown': The generated markdown content, type is str, however soon will become MarkdownGenerationResult via 'markdown.raw_markdown'.
-                - 'fit_markdown': The generated markdown content with relevant content filtered, this will be removed soon and available in 'markdown.fit_markdown'.
-                - 'fit_html': The HTML content with relevant content filtered, this will be removed soon and available in 'markdown.fit_html'.
-                - 'markdown_v2': The generated markdown content with relevant content filtered, this is temporary and will be removed soon and replaced with 'markdown'
+            ScrapingResult: A structured result containing the scraped content.
         """
-        return self._scrap(url, html, is_async=False, **kwargs)
+        raw_result = self._scrap(url, html, is_async=False, **kwargs)
+        if raw_result is None:
+            return ScrapingResult(
+                cleaned_html="",
+                success=False,
+                media=Media(),
+                links=Links(),
+                metadata={}
+            )
+
+        # Convert media items
+        media = Media(
+            images=[MediaItem(**img) for img in raw_result.get("media", {}).get("images", []) if img],
+            videos=[MediaItem(**vid) for vid in raw_result.get("media", {}).get("videos", []) if vid],
+            audios=[MediaItem(**aud) for aud in raw_result.get("media", {}).get("audios", []) if aud]
+        )
+
+        # Convert links
+        links = Links(
+            internal=[Link(**link) for link in raw_result.get("links", {}).get("internal", []) if link],
+            external=[Link(**link) for link in raw_result.get("links", {}).get("external", []) if link]
+        )
+
+        return ScrapingResult(
+            cleaned_html=raw_result.get("cleaned_html", ""),
+            success=raw_result.get("success", False),
+            media=media,
+            links=links,
+            metadata=raw_result.get("metadata", {})
+        )

-    async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
+    async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
         """
         Main entry point for asynchronous content scraping.
@@ -130,12 +155,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
             **kwargs: Additional keyword arguments.

         Returns:
-            Dict[str, Any]: A dictionary containing the scraped content. This dictionary contains the following keys:
-                - 'markdown': The generated markdown content, type is str, however soon will become MarkdownGenerationResult via 'markdown.raw_markdown'.
-                - 'fit_markdown': The generated markdown content with relevant content filtered, this will be removed soon and available in 'markdown.fit_markdown'.
-                - 'fit_html': The HTML content with relevant content filtered, this will be removed soon and available in 'markdown.fit_html'.
-                - 'markdown_v2': The generated markdown content with relevant content filtered, this is temporary and will be removed soon and replaced with 'markdown'
+            ScrapingResult: A structured result containing the scraped content.
         """
         return await asyncio.to_thread(self._scrap, url, html, **kwargs)
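With the new return type, a strategy can be exercised directly, without a crawler. A quick sketch (assuming the exports added to `__init__.py` in this commit):

```python
from crawl4ai import WebScrapingStrategy

strategy = WebScrapingStrategy()
result = strategy.scrap(
    "https://example.com",
    "<html><body><h1>Title</h1><a href='/about'>About</a></body></html>",
)

# scrap() now returns a ScrapingResult rather than a raw dict
print(result.success)
print(result.cleaned_html)
print([link.href for link in result.links.internal])
```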

View File: crawl4ai/models.py

@@ -1,5 +1,5 @@
from pydantic import BaseModel, HttpUrl from pydantic import BaseModel, HttpUrl
from typing import List, Dict, Optional, Callable, Awaitable, Union, Tuple from typing import List, Dict, Optional, Callable, Awaitable, Union, Tuple, Any
from enum import Enum from enum import Enum
from dataclasses import dataclass, field from dataclasses import dataclass, field
from .ssl_certificate import SSLCertificate from .ssl_certificate import SSLCertificate
@@ -129,3 +129,38 @@ class AsyncCrawlResponse(BaseModel):
     class Config:
         arbitrary_types_allowed = True
+
+###############################
+# Scraping Models
+###############################
+class MediaItem(BaseModel):
+    src: str
+    alt: Optional[str] = None
+    desc: Optional[str] = None
+    score: int
+    type: str = "image"
+    group_id: int
+    format: Optional[str] = None
+    width: Optional[int] = None
+
+class Link(BaseModel):
+    href: str
+    text: str
+    title: Optional[str] = None
+    base_domain: str
+
+class Media(BaseModel):
+    images: List[MediaItem] = []
+    videos: List[MediaItem] = []  # Using MediaItem model for now, can be extended with a Video model if needed
+    audios: List[MediaItem] = []  # Using MediaItem model for now, can be extended with an Audio model if needed
+
+class Links(BaseModel):
+    internal: List[Link] = []
+    external: List[Link] = []
+
+class ScrapingResult(BaseModel):
+    cleaned_html: str
+    success: bool
+    media: Media = Media()
+    links: Links = Links()
+    metadata: Dict[str, Any] = {}
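A short sketch of how these models compose and round-trip (importing from `crawl4ai.models`, the module shown in this diff; field names as defined above):

```python
from crawl4ai.models import Link, Links, Media, MediaItem, ScrapingResult

result = ScrapingResult(
    cleaned_html="<p>hello</p>",
    success=True,
    media=Media(images=[MediaItem(src="/logo.png", score=5, group_id=1)]),
    links=Links(internal=[Link(href="/about", text="About", base_domain="example.com")]),
)

# model_dump() is what AsyncWebCrawler uses to flatten these back into plain dicts
data = result.media.model_dump()
print(data["images"][0]["src"])  # /logo.png
```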

View File: documentation page ("Scraping Modes" section)

@@ -320,14 +320,14 @@ if __name__ == "__main__":
 ## 6. Scraping Modes

-Crawl4AI provides two different scraping modes for HTML content processing: BeautifulSoup (default) and LXML. The LXML mode offers significantly better performance, especially for large HTML documents.
+Crawl4AI provides two different scraping strategies for HTML content processing: `WebScrapingStrategy` (BeautifulSoup-based, default) and `LXMLWebScrapingStrategy` (LXML-based). The LXML strategy offers significantly better performance, especially for large HTML documents.

 ```python
-from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, ScrapingMode
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LXMLWebScrapingStrategy

 async def main():
     config = CrawlerRunConfig(
-        scraping_mode=ScrapingMode.LXML  # Faster alternative to default BeautifulSoup
+        scraping_strategy=LXMLWebScrapingStrategy()  # Faster alternative to default BeautifulSoup
     )

     async with AsyncWebCrawler() as crawler:
         result = await crawler.arun(
@@ -336,20 +336,69 @@ async def main():
         )
 ```

+You can also create your own custom scraping strategy by inheriting from `ContentScrapingStrategy`. The strategy must return a `ScrapingResult` object with the following structure:
+
+```python
+import asyncio
+
+from crawl4ai import ContentScrapingStrategy, ScrapingResult, MediaItem, Media, Link, Links
+
+class CustomScrapingStrategy(ContentScrapingStrategy):
+    def scrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
+        # Implement your custom scraping logic here
+        return ScrapingResult(
+            cleaned_html="<html>...</html>",  # Cleaned HTML content
+            success=True,                     # Whether scraping was successful
+            media=Media(
+                images=[                      # List of images found
+                    MediaItem(
+                        src="https://example.com/image.jpg",
+                        alt="Image description",
+                        desc="Surrounding text",
+                        score=1,
+                        type="image",
+                        group_id=1,
+                        format="jpg",
+                        width=800
+                    )
+                ],
+                videos=[],                    # List of videos (same structure as images)
+                audios=[]                     # List of audio files (same structure as images)
+            ),
+            links=Links(
+                internal=[                    # List of internal links
+                    Link(
+                        href="https://example.com/page",
+                        text="Link text",
+                        title="Link title",
+                        base_domain="example.com"
+                    )
+                ],
+                external=[]                   # List of external links (same structure)
+            ),
+            metadata={                        # Additional metadata
+                "title": "Page Title",
+                "description": "Page description"
+            }
+        )
+
+    async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
+        # For simple cases, you can run the sync version in a thread
+        return await asyncio.to_thread(self.scrap, url, html, **kwargs)
+```
 ### Performance Considerations

-The LXML mode can be up to 10-20x faster than BeautifulSoup mode, particularly when processing large HTML documents. However, please note:
+The LXML strategy can be up to 10-20x faster than the BeautifulSoup strategy, particularly when processing large HTML documents. However, please note:

-1. LXML mode is currently experimental
+1. The LXML strategy is currently experimental
 2. In some edge cases, the parsing results might differ slightly from BeautifulSoup
 3. If you encounter any inconsistencies between LXML and BeautifulSoup results, please [raise an issue](https://github.com/codeium/crawl4ai/issues) with a reproducible example

-Choose LXML mode when:
+Choose the LXML strategy when:
 - Processing large HTML documents (recommended for >100KB)
 - Performance is critical
 - Working with well-formed HTML

-Stick to BeautifulSoup mode (default) when:
+Stick to the BeautifulSoup strategy (default) when:
 - Maximum compatibility is needed
 - Working with malformed HTML
 - Exact parsing behavior is critical
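A rough way to check the claimed speed-up on your own pages (a sketch; absolute numbers vary widely by document):

```python
import time
from crawl4ai import WebScrapingStrategy, LXMLWebScrapingStrategy

with open("large_page.html") as f:  # any large, well-formed HTML file
    html = f.read()

for strategy in (WebScrapingStrategy(), LXMLWebScrapingStrategy()):
    t0 = time.perf_counter()
    strategy.scrap("https://example.com", html)
    print(f"{type(strategy).__name__}: {time.perf_counter() - t0:.3f}s")
```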

View File: new example script (LXML scraping strategy demo)

@@ -0,0 +1,21 @@
+import nest_asyncio
+nest_asyncio.apply()
+
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LXMLWebScrapingStrategy, CacheMode
+
+async def main():
+    config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        scraping_strategy=LXMLWebScrapingStrategy()  # Faster alternative to default BeautifulSoup
+    )
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            url="https://example.com",
+            config=config
+        )
+        print(f"Success: {result.success}")
+        print(f"Markdown length: {len(result.markdown_v2.raw_markdown)}")
+
+if __name__ == "__main__":
+    asyncio.run(main())
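The `nest_asyncio` lines in this example matter only in environments that already run an event loop (e.g. Jupyter notebooks); a plain script can drop them and call `asyncio.run(main())` directly.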