Commit Message:

- Added examples for Amazon product data extraction methods - Updated configuration options and enhance documentation - Minor refactoring for improved performance and readability - Cleaned up version control settings.
2024-12-29 20:05:18 +08:00
parent f2d9912697
commit fb33a24891
27 changed files with 4371 additions and 1408 deletions
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -64,6 +64,17 @@ class ContentScrapingStrategy(ABC):
        pass

 class WebScrapingStrategy(ContentScrapingStrategy):
+    """
+    Class for web content scraping. Perhaps the most important class. 
+    
+    How it works:
+    1. Extract content from HTML using BeautifulSoup.
+    2. Clean the extracted content using a content cleaning strategy.
+    3. Filter the cleaned content using a content filtering strategy.
+    4. Generate markdown content from the filtered content.
+    5. Return the markdown content.
+    """
+    
    def __init__(self, logger=None):
        self.logger = logger

@@ -74,17 +85,57 @@ class WebScrapingStrategy(ContentScrapingStrategy):
            log_method(message=message, tag=tag, **kwargs)
                
    def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
+        """
+        Main entry point for content scraping.  
+
+        Args:
+            url (str): The URL of the page to scrape.
+            html (str): The HTML content of the page.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing the scraped content. This dictionary contains the following keys:
+
+            - 'markdown': The generated markdown content, type is str, however soon will become MarkdownGenerationResult via 'markdown.raw_markdown'.
+            - 'fit_markdown': The generated markdown content with relevant content filtered, this will be removed soon and available in 'markdown.fit_markdown'.
+            - 'fit_html': The HTML content with relevant content filtered, this will be removed soon and available in 'markdown.fit_html'.
+            - 'markdown_v2': The generated markdown content with relevant content filtered, this is temporary and will be removed soon and replaced with 'markdown'
+        """
        return self._scrap(url, html, is_async=False, **kwargs)

    async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
+        """
+        Main entry point for asynchronous content scraping.
+
+        Args:
+            url (str): The URL of the page to scrape.
+            html (str): The HTML content of the page.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing the scraped content. This dictionary contains the following keys:
+
+            - 'markdown': The generated markdown content, type is str, however soon will become MarkdownGenerationResult via 'markdown.raw_markdown'.
+            - 'fit_markdown': The generated markdown content with relevant content filtered, this will be removed soon and available in 'markdown.fit_markdown'.
+            - 'fit_html': The HTML content with relevant content filtered, this will be removed soon and available in 'markdown.fit_html'.
+            - 'markdown_v2': The generated markdown content with relevant content filtered, this is temporary and will be removed soon and replaced with 'markdown'
+        """
        return await asyncio.to_thread(self._scrap, url, html, **kwargs)

-    def _generate_markdown_content(self, 
-                                 cleaned_html: str,
-                                 html: str,
-                                 url: str,
-                                 success: bool,
-                                 **kwargs) -> Dict[str, Any]:
+    def _generate_markdown_content(self, cleaned_html: str,html: str,url: str, success: bool, **kwargs) -> Dict[str, Any]:
+        """
+        Generate markdown content from cleaned HTML.
+
+        Args:
+            cleaned_html (str): The cleaned HTML content.
+            html (str): The original HTML content.
+            url (str): The URL of the page.
+            success (bool): Whether the content was successfully cleaned.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing the generated markdown content.
+        """
        markdown_generator: Optional[MarkdownGenerationStrategy] = kwargs.get('markdown_generator', DefaultMarkdownGenerator())
        
        if markdown_generator:
@@ -158,6 +209,15 @@ class WebScrapingStrategy(ContentScrapingStrategy):
        """

    def flatten_nested_elements(self, node):
+        """
+        Flatten nested elements in a HTML tree.
+
+        Args:
+            node (Tag): The root node of the HTML tree.
+
+        Returns:
+            Tag: The flattened HTML tree.
+        """
        if isinstance(node, NavigableString):
            return node
        if len(node.contents) == 1 and isinstance(node.contents[0], Tag) and node.contents[0].name == node.name:
@@ -166,6 +226,16 @@ class WebScrapingStrategy(ContentScrapingStrategy):
        return node

    def find_closest_parent_with_useful_text(self, tag, **kwargs):
+        """
+        Find the closest parent with useful text.
+
+        Args:
+            tag (Tag): The starting tag to search from.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            Tag: The closest parent with useful text, or None if not found.
+        """
        image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)
        current_tag = tag
        while current_tag:
@@ -179,6 +249,17 @@ class WebScrapingStrategy(ContentScrapingStrategy):
        return None

    def remove_unwanted_attributes(self, element, important_attrs, keep_data_attributes=False):
+        """
+        Remove unwanted attributes from an HTML element.
+
+        Args:    
+            element (Tag): The HTML element to remove attributes from.
+            important_attrs (list): List of important attributes to keep.
+            keep_data_attributes (bool): Whether to keep data attributes.
+
+        Returns:
+            None
+        """
        attrs_to_remove = []
        for attr in element.attrs:
            if attr not in important_attrs:
@@ -192,6 +273,26 @@ class WebScrapingStrategy(ContentScrapingStrategy):
            del element[attr]

    def process_image(self, img, url, index, total_images, **kwargs):
+        """
+        Process an image element.
+        
+        How it works:
+        1. Check if the image has valid display and inside undesired html elements.
+        2. Score an image for it's usefulness.
+        3. Extract image file metadata to extract size and extension.
+        4. Generate a dictionary with the processed image information.
+        5. Return the processed image information.
+
+        Args:
+            img (Tag): The image element to process.
+            url (str): The URL of the page containing the image.
+            index (int): The index of the image in the list of images.
+            total_images (int): The total number of images in the list.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            dict: A dictionary containing the processed image information.
+        """
        parse_srcset = lambda s: [{'url': u.strip().split()[0], 'width': u.strip().split()[-1].rstrip('w') 
                        if ' ' in u else None} 
                        for u in [f"http{p}" for p in s.split("http") if p]]
@@ -316,6 +417,23 @@ class WebScrapingStrategy(ContentScrapingStrategy):
        return image_variants if image_variants else None

    def process_element(self, url, element: PageElement, **kwargs) -> Dict[str, Any]:        
+        """
+        Process an HTML element.
+        
+        How it works:
+        1. Check if the element is an image, video, or audio.
+        2. Extract the element's attributes and content.
+        3. Process the element based on its type.
+        4. Return the processed element information.
+
+        Args:
+            url (str): The URL of the page containing the element.
+            element (Tag): The HTML element to process.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            dict: A dictionary containing the processed element information.
+        """
        media = {'images': [], 'videos': [], 'audios': []}
        internal_links_dict = {}
        external_links_dict = {}
@@ -334,6 +452,9 @@ class WebScrapingStrategy(ContentScrapingStrategy):
        }
        
    def _process_element(self, url, element: PageElement,  media: Dict[str, Any], internal_links_dict: Dict[str, Any], external_links_dict: Dict[str, Any], **kwargs) -> bool:
+        """
+        Process an HTML element.        
+        """
        try:
            if isinstance(element, NavigableString):
                if isinstance(element, Comment):
@@ -534,11 +655,25 @@ class WebScrapingStrategy(ContentScrapingStrategy):
            return False

    def _scrap(self, url: str, html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, **kwargs) -> Dict[str, Any]:
+        """
+        Extract content from HTML using BeautifulSoup.
+
+        Args:
+            url (str): The URL of the page to scrape.
+            html (str): The HTML content of the page to scrape.
+            word_count_threshold (int): The minimum word count threshold for content extraction.
+            css_selector (str): The CSS selector to use for content extraction.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            dict: A dictionary containing the extracted content.
+        """
        success = True
        if not html:
            return None

-        soup = BeautifulSoup(html, 'lxml')
+        parser_type = kwargs.get('parser', 'lxml')
+        soup = BeautifulSoup(html, parser_type)
        body = soup.body
        base_domain = get_base_domain(url)