refactor(scraping): replace ScrapingMode enum with strategy pattern
Replace the ScrapingMode enum with a proper strategy pattern implementation for content scraping. This change introduces:

- New ContentScrapingStrategy abstract base class
- Concrete WebScrapingStrategy and LXMLWebScrapingStrategy implementations
- New Pydantic models for structured scraping results
- Updated documentation reflecting the new strategy-based approach

BREAKING CHANGE: ScrapingMode enum has been removed. Users should now use ContentScrapingStrategy implementations instead.
This commit is contained in:
@@ -1,7 +1,8 @@
|
||||
# __init__.py
|
||||
|
||||
from .async_webcrawler import AsyncWebCrawler, CacheMode
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig, ScrapingMode
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy, LXMLWebScrapingStrategy
|
||||
from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy, CosineStrategy, JsonCssExtractionStrategy
|
||||
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
||||
from .markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
@@ -14,7 +15,9 @@ __all__ = [
|
||||
"AsyncWebCrawler",
|
||||
"CrawlResult",
|
||||
"CacheMode",
|
||||
"ScrapingMode",
|
||||
"ContentScrapingStrategy",
|
||||
"WebScrapingStrategy",
|
||||
"LXMLWebScrapingStrategy",
|
||||
'BrowserConfig',
|
||||
'CrawlerRunConfig',
|
||||
'ExtractionStrategy',
|
||||
|
||||
@@ -9,10 +9,10 @@ from .config import (
|
||||
)
|
||||
from .user_agent_generator import UserAgentGenerator
|
||||
from .extraction_strategy import ExtractionStrategy
|
||||
from .chunking_strategy import ChunkingStrategy
|
||||
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
||||
from .markdown_generation_strategy import MarkdownGenerationStrategy
|
||||
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
|
||||
from typing import Union, List
|
||||
from enum import Enum
|
||||
|
||||
class BrowserConfig:
|
||||
"""
|
||||
@@ -184,12 +184,6 @@ class BrowserConfig:
|
||||
)
|
||||
|
||||
|
||||
class ScrapingMode(str, Enum):
|
||||
"""Enum for different scraping modes."""
|
||||
BEAUTIFULSOUP = "beautifulsoup"
|
||||
LXML = "lxml"
|
||||
|
||||
|
||||
class CrawlerRunConfig:
|
||||
"""
|
||||
Configuration class for controlling how the crawler runs each crawl operation.
|
||||
@@ -227,8 +221,8 @@ class CrawlerRunConfig:
|
||||
Default: False.
|
||||
parser_type (str): Type of parser to use for HTML parsing.
|
||||
Default: "lxml".
|
||||
scraping_mode (ScrapingMode): Scraping mode to use.
|
||||
Default: ScrapingMode.BEAUTIFULSOUP.
|
||||
scraping_strategy (ContentScrapingStrategy): Scraping strategy to use.
|
||||
Default: WebScrapingStrategy.
|
||||
|
||||
# Caching Parameters
|
||||
cache_mode (CacheMode or None): Defines how caching is handled.
|
||||
@@ -329,7 +323,7 @@ class CrawlerRunConfig:
|
||||
# Content Processing Parameters
|
||||
word_count_threshold: int = MIN_WORD_THRESHOLD,
|
||||
extraction_strategy: ExtractionStrategy = None,
|
||||
chunking_strategy: ChunkingStrategy = None,
|
||||
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
||||
markdown_generator: MarkdownGenerationStrategy = None,
|
||||
content_filter=None,
|
||||
only_text: bool = False,
|
||||
@@ -340,7 +334,7 @@ class CrawlerRunConfig:
|
||||
remove_forms: bool = False,
|
||||
prettiify: bool = False,
|
||||
parser_type: str = "lxml",
|
||||
scraping_mode: ScrapingMode = ScrapingMode.BEAUTIFULSOUP,
|
||||
scraping_strategy: ContentScrapingStrategy = None,
|
||||
|
||||
# SSL Parameters
|
||||
fetch_ssl_certificate: bool = False,
|
||||
@@ -413,7 +407,7 @@ class CrawlerRunConfig:
|
||||
self.remove_forms = remove_forms
|
||||
self.prettiify = prettiify
|
||||
self.parser_type = parser_type
|
||||
self.scraping_mode = scraping_mode
|
||||
self.scraping_strategy = scraping_strategy or WebScrapingStrategy()
|
||||
|
||||
# SSL Parameters
|
||||
self.fetch_ssl_certificate = fetch_ssl_certificate
|
||||
@@ -480,7 +474,6 @@ class CrawlerRunConfig:
|
||||
|
||||
# Set default chunking strategy if None
|
||||
if self.chunking_strategy is None:
|
||||
from .chunking_strategy import RegexChunking
|
||||
self.chunking_strategy = RegexChunking()
|
||||
|
||||
@staticmethod
|
||||
@@ -489,7 +482,7 @@ class CrawlerRunConfig:
|
||||
# Content Processing Parameters
|
||||
word_count_threshold=kwargs.get("word_count_threshold", 200),
|
||||
extraction_strategy=kwargs.get("extraction_strategy"),
|
||||
chunking_strategy=kwargs.get("chunking_strategy"),
|
||||
chunking_strategy=kwargs.get("chunking_strategy", RegexChunking()),
|
||||
markdown_generator=kwargs.get("markdown_generator"),
|
||||
content_filter=kwargs.get("content_filter"),
|
||||
only_text=kwargs.get("only_text", False),
|
||||
@@ -500,7 +493,7 @@ class CrawlerRunConfig:
|
||||
remove_forms=kwargs.get("remove_forms", False),
|
||||
prettiify=kwargs.get("prettiify", False),
|
||||
parser_type=kwargs.get("parser_type", "lxml"),
|
||||
scraping_mode=kwargs.get("scraping_mode", ScrapingMode.BEAUTIFULSOUP),
|
||||
scraping_strategy=kwargs.get("scraping_strategy"),
|
||||
|
||||
# SSL Parameters
|
||||
fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
|
||||
@@ -574,7 +567,7 @@ class CrawlerRunConfig:
|
||||
"remove_forms": self.remove_forms,
|
||||
"prettiify": self.prettiify,
|
||||
"parser_type": self.parser_type,
|
||||
"scraping_mode": self.scraping_mode,
|
||||
"scraping_strategy": self.scraping_strategy,
|
||||
"fetch_ssl_certificate": self.fetch_ssl_certificate,
|
||||
"cache_mode": self.cache_mode,
|
||||
"session_id": self.session_id,
|
||||
|
||||
@@ -543,27 +543,20 @@ class AsyncWebCrawler:
|
||||
_url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
|
||||
t1 = time.perf_counter()
|
||||
|
||||
# Initialize scraping strategy based on mode
|
||||
if config.scraping_mode == ScrapingMode.LXML:
|
||||
scrapping_strategy = LXMLWebScrapingStrategy(logger=self.logger)
|
||||
else: # Default to BeautifulSoup
|
||||
scrapping_strategy = WebScrapingStrategy(logger=self.logger)
|
||||
# Get scraping strategy and ensure it has a logger
|
||||
scraping_strategy = config.scraping_strategy
|
||||
if not scraping_strategy.logger:
|
||||
scraping_strategy.logger = self.logger
|
||||
|
||||
# Process HTML content
|
||||
params = {k:v for k, v in config.to_dict().items() if k not in ["url"]}
|
||||
# add keys from kwargs to params that doesn't exist in params
|
||||
params.update({k:v for k, v in kwargs.items() if k not in params.keys()})
|
||||
|
||||
result = scrapping_strategy.scrap(
|
||||
result = scraping_strategy.scrap(
|
||||
url,
|
||||
html,
|
||||
**params,
|
||||
# word_count_threshold=config.word_count_threshold,
|
||||
# css_selector=config.css_selector,
|
||||
# only_text=config.only_text,
|
||||
# image_description_min_word_threshold=config.image_description_min_word_threshold,
|
||||
# content_filter=config.content_filter,
|
||||
# **kwargs
|
||||
**params
|
||||
)
|
||||
|
||||
if result is None:
|
||||
@@ -576,13 +569,17 @@ class AsyncWebCrawler:
|
||||
|
||||
|
||||
|
||||
# Extract results
|
||||
cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
|
||||
fit_markdown = sanitize_input_encode(result.get("fit_markdown", ""))
|
||||
fit_html = sanitize_input_encode(result.get("fit_html", ""))
|
||||
media = result.get("media", [])
|
||||
links = result.get("links", [])
|
||||
metadata = result.get("metadata", {})
|
||||
# Extract results - handle both dict and ScrapingResult
|
||||
if isinstance(result, dict):
|
||||
cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
|
||||
media = result.get("media", {})
|
||||
links = result.get("links", {})
|
||||
metadata = result.get("metadata", {})
|
||||
else:
|
||||
cleaned_html = sanitize_input_encode(result.cleaned_html)
|
||||
media = result.media.model_dump()
|
||||
links = result.links.model_dump()
|
||||
metadata = result.metadata
|
||||
|
||||
# Markdown Generation
|
||||
markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator()
|
||||
@@ -610,10 +607,7 @@ class AsyncWebCrawler:
|
||||
)
|
||||
|
||||
# Handle content extraction if needed
|
||||
if (extracted_content is None and
|
||||
config.extraction_strategy and
|
||||
config.chunking_strategy and
|
||||
not isinstance(config.extraction_strategy, NoExtractionStrategy)):
|
||||
if (not bool(extracted_content) and config.extraction_strategy and not isinstance(config.extraction_strategy, NoExtractionStrategy)):
|
||||
|
||||
t1 = time.perf_counter()
|
||||
|
||||
@@ -664,8 +658,8 @@ class AsyncWebCrawler:
|
||||
cleaned_html=cleaned_html,
|
||||
markdown_v2=markdown_v2,
|
||||
markdown=markdown,
|
||||
fit_markdown=fit_markdown,
|
||||
fit_html=fit_html,
|
||||
fit_markdown=markdown_result.fit_markdown,
|
||||
fit_html=markdown_result.fit_html,
|
||||
media=media,
|
||||
links=links,
|
||||
metadata=metadata,
|
||||
|
||||
@@ -21,6 +21,7 @@ from .utils import (
|
||||
from lxml import etree
|
||||
from lxml import html as lhtml
|
||||
from typing import Dict, Any, List, Tuple
|
||||
from .models import ScrapingResult, MediaItem, Link, Media, Links
|
||||
|
||||
# Pre-compile regular expressions for Open Graph and Twitter metadata
|
||||
OG_REGEX = re.compile(r'^og:')
|
||||
@@ -73,11 +74,11 @@ def fetch_image_file_size(img, base_url):
|
||||
|
||||
class ContentScrapingStrategy(ABC):
|
||||
@abstractmethod
|
||||
def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
|
||||
def scrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
|
||||
async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
|
||||
pass
|
||||
|
||||
class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
@@ -101,7 +102,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
log_method = getattr(self.logger, level)
|
||||
log_method(message=message, tag=tag, **kwargs)
|
||||
|
||||
def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
|
||||
def scrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
|
||||
"""
|
||||
Main entry point for content scraping.
|
||||
|
||||
@@ -111,16 +112,40 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
**kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
Dict[str, Any]: A dictionary containing the scraped content. This dictionary contains the following keys:
|
||||
|
||||
- 'markdown': The generated markdown content, type is str, however soon will become MarkdownGenerationResult via 'markdown.raw_markdown'.
|
||||
- 'fit_markdown': The generated markdown content with relevant content filtered, this will be removed soon and available in 'markdown.fit_markdown'.
|
||||
- 'fit_html': The HTML content with relevant content filtered, this will be removed soon and available in 'markdown.fit_html'.
|
||||
- 'markdown_v2': The generated markdown content with relevant content filtered, this is temporary and will be removed soon and replaced with 'markdown'
|
||||
ScrapingResult: A structured result containing the scraped content.
|
||||
"""
|
||||
return self._scrap(url, html, is_async=False, **kwargs)
|
||||
raw_result = self._scrap(url, html, is_async=False, **kwargs)
|
||||
if raw_result is None:
|
||||
return ScrapingResult(
|
||||
cleaned_html="",
|
||||
success=False,
|
||||
media=Media(),
|
||||
links=Links(),
|
||||
metadata={}
|
||||
)
|
||||
|
||||
async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
|
||||
# Convert media items
|
||||
media = Media(
|
||||
images=[MediaItem(**img) for img in raw_result.get("media", {}).get("images", []) if img],
|
||||
videos=[MediaItem(**vid) for vid in raw_result.get("media", {}).get("videos", []) if vid],
|
||||
audios=[MediaItem(**aud) for aud in raw_result.get("media", {}).get("audios", []) if aud]
|
||||
)
|
||||
|
||||
# Convert links
|
||||
links = Links(
|
||||
internal=[Link(**link) for link in raw_result.get("links", {}).get("internal", []) if link],
|
||||
external=[Link(**link) for link in raw_result.get("links", {}).get("external", []) if link]
|
||||
)
|
||||
|
||||
return ScrapingResult(
|
||||
cleaned_html=raw_result.get("cleaned_html", ""),
|
||||
success=raw_result.get("success", False),
|
||||
media=media,
|
||||
links=links,
|
||||
metadata=raw_result.get("metadata", {})
|
||||
)
|
||||
|
||||
async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
|
||||
"""
|
||||
Main entry point for asynchronous content scraping.
|
||||
|
||||
@@ -130,12 +155,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
**kwargs: Additional keyword arguments.
|
||||
|
||||
Returns:
|
||||
Dict[str, Any]: A dictionary containing the scraped content. This dictionary contains the following keys:
|
||||
|
||||
- 'markdown': The generated markdown content, type is str, however soon will become MarkdownGenerationResult via 'markdown.raw_markdown'.
|
||||
- 'fit_markdown': The generated markdown content with relevant content filtered, this will be removed soon and available in 'markdown.fit_markdown'.
|
||||
- 'fit_html': The HTML content with relevant content filtered, this will be removed soon and available in 'markdown.fit_html'.
|
||||
- 'markdown_v2': The generated markdown content with relevant content filtered, this is temporary and will be removed soon and replaced with 'markdown'
|
||||
ScrapingResult: A structured result containing the scraped content.
|
||||
"""
|
||||
return await asyncio.to_thread(self._scrap, url, html, **kwargs)
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
from pydantic import BaseModel, HttpUrl
|
||||
from typing import List, Dict, Optional, Callable, Awaitable, Union, Tuple
|
||||
from typing import List, Dict, Optional, Callable, Awaitable, Union, Tuple, Any
|
||||
from enum import Enum
|
||||
from dataclasses import dataclass, field
|
||||
from .ssl_certificate import SSLCertificate
|
||||
@@ -129,3 +129,38 @@ class AsyncCrawlResponse(BaseModel):
|
||||
|
||||
class Config:
|
||||
arbitrary_types_allowed = True
|
||||
|
||||
###############################
# Scraping Models
###############################
class MediaItem(BaseModel):
    """A single media element (image, video, or audio) discovered on a page."""
    src: str                      # source URL of the media element
    alt: Optional[str] = None     # alt text, when present in the markup
    desc: Optional[str] = None    # surrounding/descriptive text captured near the element
    score: int                    # relevance score assigned by the scraping strategy
    type: str = "image"           # media kind; defaults to "image"
    group_id: int                 # identifier grouping related media items together
    format: Optional[str] = None  # file format (e.g. "jpg"), when detectable
    width: Optional[int] = None   # rendered/declared width in pixels, when known

class Link(BaseModel):
    """A hyperlink discovered during scraping."""
    href: str                   # link target URL
    text: str                   # visible anchor text
    title: Optional[str] = None # title attribute, when present
    base_domain: str            # domain the link points to, used for internal/external split

class Media(BaseModel):
    """Collections of media items found on a page, grouped by kind."""
    images: List[MediaItem] = []
    videos: List[MediaItem] = [] # Using MediaItem model for now, can be extended with Video model if needed
    audios: List[MediaItem] = [] # Using MediaItem model for now, can be extended with Audio model if needed

class Links(BaseModel):
    """Links partitioned into same-domain (internal) and off-domain (external)."""
    internal: List[Link] = []
    external: List[Link] = []

class ScrapingResult(BaseModel):
    """Structured result returned by ContentScrapingStrategy implementations.

    Replaces the plain dict previously returned by scraping strategies; the
    crawler accepts either form and calls ``model_dump()`` on these models.
    NOTE: mutable class-attribute defaults are safe here — Pydantic copies
    field defaults per instance.
    """
    cleaned_html: str              # sanitized HTML after scraping/cleaning
    success: bool                  # whether scraping completed successfully
    media: Media = Media()         # images/videos/audios found on the page
    links: Links = Links()         # internal/external links found on the page
    metadata: Dict[str, Any] = {}  # page metadata (title, description, OG/Twitter tags, ...)
|
||||
|
||||
@@ -320,14 +320,14 @@ if __name__ == "__main__":
|
||||
|
||||
## 6. Scraping Modes
|
||||
|
||||
Crawl4AI provides two different scraping modes for HTML content processing: BeautifulSoup (default) and LXML. The LXML mode offers significantly better performance, especially for large HTML documents.
|
||||
Crawl4AI provides two different scraping strategies for HTML content processing: `WebScrapingStrategy` (BeautifulSoup-based, default) and `LXMLWebScrapingStrategy` (LXML-based). The LXML strategy offers significantly better performance, especially for large HTML documents.
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, ScrapingMode
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LXMLWebScrapingStrategy
|
||||
|
||||
async def main():
|
||||
config = CrawlerRunConfig(
|
||||
scraping_mode=ScrapingMode.LXML # Faster alternative to default BeautifulSoup
|
||||
scraping_strategy=LXMLWebScrapingStrategy() # Faster alternative to default BeautifulSoup
|
||||
)
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
@@ -336,20 +336,69 @@ async def main():
|
||||
)
|
||||
```
|
||||
|
||||
You can also create your own custom scraping strategy by inheriting from `ContentScrapingStrategy`. The strategy must return a `ScrapingResult` object with the following structure:
|
||||
|
||||
```python
|
||||
from crawl4ai import ContentScrapingStrategy, ScrapingResult, MediaItem, Media, Link, Links
|
||||
|
||||
class CustomScrapingStrategy(ContentScrapingStrategy):
|
||||
def scrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
|
||||
# Implement your custom scraping logic here
|
||||
return ScrapingResult(
|
||||
cleaned_html="<html>...</html>", # Cleaned HTML content
|
||||
success=True, # Whether scraping was successful
|
||||
media=Media(
|
||||
images=[ # List of images found
|
||||
MediaItem(
|
||||
src="https://example.com/image.jpg",
|
||||
alt="Image description",
|
||||
desc="Surrounding text",
|
||||
score=1,
|
||||
type="image",
|
||||
group_id=1,
|
||||
format="jpg",
|
||||
width=800
|
||||
)
|
||||
],
|
||||
videos=[], # List of videos (same structure as images)
|
||||
audios=[] # List of audio files (same structure as images)
|
||||
),
|
||||
links=Links(
|
||||
internal=[ # List of internal links
|
||||
Link(
|
||||
href="https://example.com/page",
|
||||
text="Link text",
|
||||
title="Link title",
|
||||
base_domain="example.com"
|
||||
)
|
||||
],
|
||||
external=[] # List of external links (same structure)
|
||||
),
|
||||
metadata={ # Additional metadata
|
||||
"title": "Page Title",
|
||||
"description": "Page description"
|
||||
}
|
||||
)
|
||||
|
||||
async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
|
||||
# For simple cases, you can use the sync version
|
||||
return await asyncio.to_thread(self.scrap, url, html, **kwargs)
|
||||
```
|
||||
|
||||
### Performance Considerations
|
||||
|
||||
The LXML mode can be up to 10-20x faster than BeautifulSoup mode, particularly when processing large HTML documents. However, please note:
|
||||
The LXML strategy can be up to 10-20x faster than BeautifulSoup strategy, particularly when processing large HTML documents. However, please note:
|
||||
|
||||
1. LXML mode is currently experimental
|
||||
1. LXML strategy is currently experimental
|
||||
2. In some edge cases, the parsing results might differ slightly from BeautifulSoup
|
||||
3. If you encounter any inconsistencies between LXML and BeautifulSoup results, please [raise an issue](https://github.com/codeium/crawl4ai/issues) with a reproducible example
|
||||
|
||||
Choose LXML mode when:
|
||||
Choose LXML strategy when:
|
||||
- Processing large HTML documents (recommended for >100KB)
|
||||
- Performance is critical
|
||||
- Working with well-formed HTML
|
||||
|
||||
Stick to BeautifulSoup mode (default) when:
|
||||
Stick to BeautifulSoup strategy (default) when:
|
||||
- Maximum compatibility is needed
|
||||
- Working with malformed HTML
|
||||
- Exact parsing behavior is critical
|
||||
|
||||
21
tests/test_scraping_strategy.py
Normal file
21
tests/test_scraping_strategy.py
Normal file
@@ -0,0 +1,21 @@
|
||||
import nest_asyncio

# Allow nested event loops (e.g. when executed inside Jupyter or another
# running loop) before any asyncio usage.
nest_asyncio.apply()

import asyncio

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LXMLWebScrapingStrategy, CacheMode


async def main():
    """Crawl example.com with the LXML scraping strategy and print the outcome."""
    config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,  # always fetch fresh content for this check
        scraping_strategy=LXMLWebScrapingStrategy()  # Faster alternative to default BeautifulSoup
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://example.com",
            config=config
        )
    print(f"Success: {result.success}")
    # Guard the dereference: on a failed crawl, markdown_v2 (or its
    # raw_markdown) may be None, and the unconditional len(...) would raise.
    if result.success and result.markdown_v2 is not None:
        print(f"Markdown length: {len(result.markdown_v2.raw_markdown)}")


if __name__ == "__main__":
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user