refactor(scraping): replace ScrapingMode enum with strategy pattern

Replace the ScrapingMode enum with a proper strategy pattern implementation for content scraping. This change introduces: - New ContentScrapingStrategy abstract base class - Concrete WebScrapingStrategy and LXMLWebScrapingStrategy implementations - New Pydantic models for structured scraping results - Updated documentation reflecting the new strategy-based approach BREAKING CHANGE: ScrapingMode enum has been removed. Users should now use ContentScrapingStrategy implementations instead.
2025-01-13 17:53:12 +08:00
parent f3ae5a657c
commit c3370ec5da
7 changed files with 185 additions and 70 deletions
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -21,6 +21,7 @@ from .utils import (
 from lxml import etree
 from lxml import html as lhtml
 from typing import Dict, Any, List, Tuple
+from .models import ScrapingResult, MediaItem, Link, Media, Links

 # Pre-compile regular expressions for Open Graph and Twitter metadata
 OG_REGEX = re.compile(r'^og:')
@@ -73,11 +74,11 @@ def fetch_image_file_size(img, base_url):

 class ContentScrapingStrategy(ABC):
    @abstractmethod
-    def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
+    def scrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
        pass

    @abstractmethod
-    async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
+    async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
        pass

 class WebScrapingStrategy(ContentScrapingStrategy):
@@ -101,7 +102,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
            log_method = getattr(self.logger, level)
            log_method(message=message, tag=tag, **kwargs)
                
-    def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
+    def scrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
        """
        Main entry point for content scraping.  

@@ -111,16 +112,40 @@ class WebScrapingStrategy(ContentScrapingStrategy):
            **kwargs: Additional keyword arguments.

        Returns:
-            Dict[str, Any]: A dictionary containing the scraped content. This dictionary contains the following keys:
-
-            - 'markdown': The generated markdown content, type is str, however soon will become MarkdownGenerationResult via 'markdown.raw_markdown'.
-            - 'fit_markdown': The generated markdown content with relevant content filtered, this will be removed soon and available in 'markdown.fit_markdown'.
-            - 'fit_html': The HTML content with relevant content filtered, this will be removed soon and available in 'markdown.fit_html'.
-            - 'markdown_v2': The generated markdown content with relevant content filtered, this is temporary and will be removed soon and replaced with 'markdown'
+            ScrapingResult: A structured result containing the scraped content.
        """
-        return self._scrap(url, html, is_async=False, **kwargs)
+        raw_result = self._scrap(url, html, is_async=False, **kwargs)
+        if raw_result is None:
+            return ScrapingResult(
+                cleaned_html="",
+                success=False,
+                media=Media(),
+                links=Links(),
+                metadata={}
+            )

-    async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
+        # Convert media items
+        media = Media(
+            images=[MediaItem(**img) for img in raw_result.get("media", {}).get("images", []) if img],
+            videos=[MediaItem(**vid) for vid in raw_result.get("media", {}).get("videos", []) if vid],
+            audios=[MediaItem(**aud) for aud in raw_result.get("media", {}).get("audios", []) if aud]
+        )
+
+        # Convert links
+        links = Links(
+            internal=[Link(**link) for link in raw_result.get("links", {}).get("internal", []) if link],
+            external=[Link(**link) for link in raw_result.get("links", {}).get("external", []) if link]
+        )
+
+        return ScrapingResult(
+            cleaned_html=raw_result.get("cleaned_html", ""),
+            success=raw_result.get("success", False),
+            media=media,
+            links=links,
+            metadata=raw_result.get("metadata", {})
+        )
+
+    async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
        """
        Main entry point for asynchronous content scraping.

@@ -130,12 +155,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
            **kwargs: Additional keyword arguments.

        Returns:
-            Dict[str, Any]: A dictionary containing the scraped content. This dictionary contains the following keys:
-
-            - 'markdown': The generated markdown content, type is str, however soon will become MarkdownGenerationResult via 'markdown.raw_markdown'.
-            - 'fit_markdown': The generated markdown content with relevant content filtered, this will be removed soon and available in 'markdown.fit_markdown'.
-            - 'fit_html': The HTML content with relevant content filtered, this will be removed soon and available in 'markdown.fit_html'.
-            - 'markdown_v2': The generated markdown content with relevant content filtered, this is temporary and will be removed soon and replaced with 'markdown'
+            ScrapingResult: A structured result containing the scraped content.
        """
        return await asyncio.to_thread(self._scrap, url, html, **kwargs)