Commit Message:

- Added examples for Amazon product data extraction methods - Updated configuration options and enhanced documentation - Minor refactoring for improved performance and readability - Cleaned up version control settings.
2024-12-29 20:05:18 +08:00
parent f2d9912697
commit fb33a24891
27 changed files with 4371 additions and 1408 deletions
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -11,6 +11,7 @@ from .user_agent_generator import UserAgentGenerator
 from .extraction_strategy import ExtractionStrategy
 from .chunking_strategy import ChunkingStrategy
 from .markdown_generation_strategy import MarkdownGenerationStrategy
+from typing import Union, List


 class BrowserConfig:
@@ -39,8 +40,8 @@ class BrowserConfig:
                             Default: None.
        proxy_config (dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
                                     If None, no additional proxy config. Default: None.
-        viewport_width (int): Default viewport width for pages. Default: 1920.
-        viewport_height (int): Default viewport height for pages. Default: 1080.
+        viewport_width (int): Default viewport width for pages. Default: 1080.
+        viewport_height (int): Default viewport height for pages. Default: 600.
        verbose (bool): Enable verbose logging.
                        Default: True.
        accept_downloads (bool): Whether to allow file downloads. If True, requires a downloads_path.
@@ -79,7 +80,7 @@ class BrowserConfig:
        chrome_channel: str = "chrome",
        proxy: str = None,
        proxy_config: dict = None,
-        viewport_width: int = 800,
+        viewport_width: int = 1080,
        viewport_height: int = 600, 
        accept_downloads: bool = False,
        downloads_path: str = None,
@@ -136,10 +137,15 @@ class BrowserConfig:
        self.debugging_port = debugging_port

        user_agenr_generator = UserAgentGenerator()
-        if self.user_agent_mode != "random":
+        if self.user_agent_mode != "random" and self.user_agent_generator_config:
            self.user_agent = user_agenr_generator.generate(
                **(self.user_agent_generator_config or {})
            )
+        elif self.user_agent_mode == "random":
+            self.user_agent = user_agenr_generator.generate()
+        else:
+            pass
+        
        self.browser_hint = user_agenr_generator.generate_client_hints(self.user_agent)
        self.headers.setdefault("sec-ch-ua", self.browser_hint)

@@ -158,8 +164,8 @@ class BrowserConfig:
            chrome_channel=kwargs.get("chrome_channel", "chrome"),
            proxy=kwargs.get("proxy"),
            proxy_config=kwargs.get("proxy_config"),
-            viewport_width=kwargs.get("viewport_width", 1920),
-            viewport_height=kwargs.get("viewport_height", 1080),
+            viewport_width=kwargs.get("viewport_width", 1080),
+            viewport_height=kwargs.get("viewport_height", 600),
            accept_downloads=kwargs.get("accept_downloads", False),
            downloads_path=kwargs.get("downloads_path"),
            storage_state=kwargs.get("storage_state"),
@@ -215,6 +221,8 @@ class CrawlerRunConfig:
                             Default: False.
        prettiify (bool): If True, apply `fast_format_html` to produce prettified HTML output.
                          Default: False.
+        parser_type (str): Type of parser to use for HTML parsing.
+                           Default: "lxml".

        # Caching Parameters
        cache_mode (CacheMode or None): Defines how caching is handled.
@@ -322,6 +330,7 @@ class CrawlerRunConfig:
        keep_data_attributes: bool = False,
        remove_forms: bool = False,
        prettiify: bool = False,
+        parser_type: str = "lxml",

        # SSL Parameters
        fetch_ssl_certificate: bool = False,
@@ -345,7 +354,7 @@ class CrawlerRunConfig:
        semaphore_count: int = 5,

        # Page Interaction Parameters
-        js_code=None,
+        js_code: Union[str, List[str]] = None,
        js_only: bool = False,
        ignore_body_visibility: bool = True,
        scan_full_page: bool = False,
@@ -393,6 +402,7 @@ class CrawlerRunConfig:
        self.keep_data_attributes = keep_data_attributes
        self.remove_forms = remove_forms
        self.prettiify = prettiify
+        self.parser_type = parser_type

        # SSL Parameters
        self.fetch_ssl_certificate = fetch_ssl_certificate
@@ -478,6 +488,7 @@ class CrawlerRunConfig:
            keep_data_attributes=kwargs.get("keep_data_attributes", False),
            remove_forms=kwargs.get("remove_forms", False),
            prettiify=kwargs.get("prettiify", False),
+            parser_type=kwargs.get("parser_type", "lxml"),

            # SSL Parameters
            fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
@@ -550,6 +561,7 @@ class CrawlerRunConfig:
            "keep_data_attributes": self.keep_data_attributes,
            "remove_forms": self.remove_forms,
            "prettiify": self.prettiify,
+            "parser_type": self.parser_type,
            "fetch_ssl_certificate": self.fetch_ssl_certificate,
            "cache_mode": self.cache_mode,
            "session_id": self.session_id,
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
--- a/crawl4ai/async_database.py
+++ b/crawl4ai/async_database.py
@@ -7,7 +7,7 @@ from contextlib import asynccontextmanager
 import logging
 import json  # Added for serialization/deserialization
 from .utils import ensure_content_dirs, generate_content_hash
-from .models import CrawlResult
+from .models import CrawlResult, MarkdownGenerationResult
 import xxhash
 import aiofiles
 from .config import NEED_MIGRATION
@@ -295,13 +295,18 @@ class AsyncDatabaseManager:
                        row_dict[field] = ""

                # Parse JSON fields
-                json_fields = ['media', 'links', 'metadata', 'response_headers']
+                json_fields = ['media', 'links', 'metadata', 'response_headers', 'markdown']
                for field in json_fields:
                    try:
                        row_dict[field] = json.loads(row_dict[field]) if row_dict[field] else {}
                    except json.JSONDecodeError:
                        row_dict[field] = {}

+                if isinstance(row_dict['markdown'], Dict):
+                    row_dict['markdown_v2'] = row_dict['markdown']
+                    if row_dict['markdown'].get('raw_markdown'):
+                        row_dict['markdown'] = row_dict['markdown']['raw_markdown']
+                
                # Parse downloaded_files
                try:
                    row_dict['downloaded_files'] = json.loads(row_dict['downloaded_files']) if row_dict['downloaded_files'] else []
@@ -331,10 +336,28 @@ class AsyncDatabaseManager:
        content_map = {
            'html': (result.html, 'html'),
            'cleaned_html': (result.cleaned_html or "", 'cleaned'),
-            'markdown': (result.markdown or "", 'markdown'),
+            'markdown': None,
            'extracted_content': (result.extracted_content or "", 'extracted'),
            'screenshot': (result.screenshot or "", 'screenshots')
        }
+
+        try:
+            if isinstance(result.markdown, MarkdownGenerationResult):
+                content_map['markdown'] = (result.markdown.model_dump_json(), 'markdown')
+            elif hasattr(result, 'markdown_v2'):
+                content_map['markdown'] = (result.markdown_v2.model_dump_json(), 'markdown')
+            elif isinstance(result.markdown, str):
+                markdown_result = MarkdownGenerationResult(raw_markdown=result.markdown)
+                content_map['markdown'] = (markdown_result.model_dump_json(), 'markdown')
+            else:
+                content_map['markdown'] = (MarkdownGenerationResult().model_dump_json(), 'markdown')
+        except Exception as e:
+            self.logger.warning(
+                message=f"Error processing markdown content: {str(e)}",
+                tag="WARNING"
+            )
+            # Fallback to empty markdown result
+            content_map['markdown'] = (MarkdownGenerationResult().model_dump_json(), 'markdown')
        
        content_hashes = {}
        for field, (content, content_type) in content_map.items():
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -69,6 +69,24 @@ class AsyncWebCrawler:
    New way (recommended):
        browser_config = BrowserConfig(browser_type="chromium", headless=True)
        crawler = AsyncWebCrawler(config=browser_config)
+    
+    
+    Attributes:
+        browser_config (BrowserConfig): Configuration object for browser settings.
+        crawler_strategy (AsyncCrawlerStrategy): Strategy for crawling web pages.
+        logger (AsyncLogger): Logger instance for recording events and errors.
+        always_bypass_cache (bool): Whether to always bypass cache.
+        crawl4ai_folder (str): Directory for storing cache.
+        base_directory (str): Base directory for storing cache.
+        ready (bool): Whether the crawler is ready for use.
+        
+    Methods:
+        start(): Start the crawler explicitly without using context manager.
+        close(): Close the crawler explicitly without using context manager.
+        arun(): Run the crawler for a single source: URL (web, local file, or raw HTML).
+        awarmup(): Perform warmup sequence.
+        arun_many(): Run the crawler for multiple sources.
+        aprocess_html(): Process HTML content.
    """
    _domain_last_hit = {}

@@ -321,7 +339,7 @@ class AsyncWebCrawler:

                    # Initialize processing variables
                    async_response: AsyncCrawlResponse = None
-                    cached_result = None
+                    cached_result: CrawlResult = None
                    screenshot_data = None
                    pdf_data = None
                    extracted_content = None
@@ -373,52 +391,89 @@ class AsyncWebCrawler:
                            tag="FETCH"
                        )

-                    # Process the HTML content
-                    crawl_result = await self.aprocess_html(
-                        url=url,
-                        html=html,
-                        extracted_content=extracted_content,
-                        config=config,  # Pass the config object instead of individual parameters
-                        screenshot=screenshot_data,
-                        pdf_data=pdf_data,
-                        verbose=config.verbose,
-                        is_raw_html = True if url.startswith("raw:") else False,
-                        **kwargs
-                    )
+                        # Process the HTML content
+                        crawl_result = await self.aprocess_html(
+                            url=url,
+                            html=html,
+                            extracted_content=extracted_content,
+                            config=config,  # Pass the config object instead of individual parameters
+                            screenshot=screenshot_data,
+                            pdf_data=pdf_data,
+                            verbose=config.verbose,
+                            is_raw_html = True if url.startswith("raw:") else False,
+                            **kwargs
+                        )
+
+                    #     crawl_result.status_code = async_response.status_code
+                    #     crawl_result.response_headers = async_response.response_headers
+                    #     crawl_result.downloaded_files = async_response.downloaded_files
+                    #     crawl_result.ssl_certificate = async_response.ssl_certificate  # Add SSL certificate
+                    # else:
+                    #     crawl_result.status_code = 200
+                    #     crawl_result.response_headers = cached_result.response_headers if cached_result else {}
+                    #     crawl_result.ssl_certificate = cached_result.ssl_certificate if cached_result else None  # Add SSL certificate from cache
+
+                        # # Check and set values from async_response to crawl_result
+                        try:
+                            for key in vars(async_response):
+                                if hasattr(crawl_result, key):
+                                    value = getattr(async_response, key, None)
+                                    current_value = getattr(crawl_result, key, None)
+                                    if value is not None and not current_value:
+                                        try:
+                                            setattr(crawl_result, key, value)
+                                        except Exception as e:
+                                            self.logger.warning(
+                                                message=f"Failed to set attribute {key}: {str(e)}",
+                                                tag="WARNING"
+                                            )
+                        except Exception as e:
+                            self.logger.warning(
+                                message=f"Error copying response attributes: {str(e)}",
+                                tag="WARNING"
+                            )
+
+                        crawl_result.success = bool(html)
+                        crawl_result.session_id = getattr(config, 'session_id', None)
+
+                        self.logger.success(
+                            message="{url:.50}... | Status: {status} | Total: {timing}",
+                            tag="COMPLETE",
+                            params={
+                                "url": cache_context.display_url,
+                                "status": crawl_result.success,
+                                "timing": f"{time.perf_counter() - start_time:.2f}s"
+                            },
+                            colors={
+                                "status": Fore.GREEN if crawl_result.success else Fore.RED,
+                                "timing": Fore.YELLOW
+                            }
+                        )
+
+                        # Update cache if appropriate
+                        if cache_context.should_write() and not bool(cached_result):
+                            await async_db_manager.acache_url(crawl_result)
+
+                        return crawl_result

-                    # Set response data
-                    if async_response:
-                        crawl_result.status_code = async_response.status_code
-                        crawl_result.response_headers = async_response.response_headers
-                        crawl_result.downloaded_files = async_response.downloaded_files
-                        crawl_result.ssl_certificate = async_response.ssl_certificate  # Add SSL certificate
                    else:
-                        crawl_result.status_code = 200
-                        crawl_result.response_headers = cached_result.response_headers if cached_result else {}
-                        crawl_result.ssl_certificate = cached_result.ssl_certificate if cached_result else None  # Add SSL certificate from cache
+                        self.logger.success(
+                            message="{url:.50}... | Status: {status} | Total: {timing}",
+                            tag="COMPLETE",
+                            params={
+                                "url": cache_context.display_url,
+                                "status": True,
+                                "timing": f"{time.perf_counter() - start_time:.2f}s"
+                            },
+                            colors={
+                                "status": Fore.GREEN,
+                                "timing": Fore.YELLOW
+                            }
+                        )

-                    crawl_result.success = bool(html)
-                    crawl_result.session_id = getattr(config, 'session_id', None)
-
-                    self.logger.success(
-                        message="{url:.50}... | Status: {status} | Total: {timing}",
-                        tag="COMPLETE",
-                        params={
-                            "url": cache_context.display_url,
-                            "status": crawl_result.success,
-                            "timing": f"{time.perf_counter() - start_time:.2f}s"
-                        },
-                        colors={
-                            "status": Fore.GREEN if crawl_result.success else Fore.RED,
-                            "timing": Fore.YELLOW
-                        }
-                    )
-
-                    # Update cache if appropriate
-                    if cache_context.should_write() and not bool(cached_result):
-                        await async_db_manager.acache_url(crawl_result)
-
-                    return crawl_result
+                        cached_result.success = bool(html)
+                        cached_result.session_id = getattr(config, 'session_id', None)
+                        return cached_result

                except Exception as e:
                    error_context = get_error_context(sys.exc_info())
@@ -465,6 +520,7 @@ class AsyncWebCrawler:
                extracted_content: Previously extracted content (if any)
                config: Configuration object controlling processing behavior
                screenshot: Screenshot data (if any)
+                pdf_data: PDF data (if any)
                verbose: Whether to enable verbose logging
                **kwargs: Additional parameters for backwards compatibility
            
--- a/crawl4ai/cache_context.py
+++ b/crawl4ai/cache_context.py
@@ -25,8 +25,26 @@ class CacheContext:
    
    This class centralizes all cache-related logic and URL type checking,
    making the caching behavior more predictable and maintainable.
+    
+    Attributes:
+        url (str): The URL being processed.
+        cache_mode (CacheMode): The cache mode for the current operation.
+        always_bypass (bool): If True, bypasses caching for this operation.
+        is_cacheable (bool): True if the URL is cacheable, False otherwise.
+        is_web_url (bool): True if the URL is a web URL, False otherwise.
+        is_local_file (bool): True if the URL is a local file, False otherwise.
+        is_raw_html (bool): True if the URL is raw HTML, False otherwise.
+        _url_display (str): The display name for the URL (web, local file, or raw HTML).
    """
    def __init__(self, url: str, cache_mode: CacheMode, always_bypass: bool = False):
+        """
+        Initializes the CacheContext with the provided URL and cache mode.
+        
+        Args:
+            url (str): The URL being processed.
+            cache_mode (CacheMode): The cache mode for the current operation.
+            always_bypass (bool): If True, bypasses caching for this operation.
+        """
        self.url = url
        self.cache_mode = cache_mode
        self.always_bypass = always_bypass
@@ -37,13 +55,31 @@ class CacheContext:
        self._url_display = url if not self.is_raw_html else "Raw HTML"
    
    def should_read(self) -> bool:
-        """Determines if cache should be read based on context."""
+        """
+        Determines if cache should be read based on context.
+        
+        How it works:
+        1. If always_bypass is True or is_cacheable is False, return False.
+        2. If cache_mode is ENABLED or READ_ONLY, return True.
+        
+        Returns:
+            bool: True if cache should be read, False otherwise.
+        """
        if self.always_bypass or not self.is_cacheable:
            return False
        return self.cache_mode in [CacheMode.ENABLED, CacheMode.READ_ONLY]
    
    def should_write(self) -> bool:
-        """Determines if cache should be written based on context."""
+        """
+        Determines if cache should be written based on context.
+        
+        How it works:
+        1. If always_bypass is True or is_cacheable is False, return False.
+        2. If cache_mode is ENABLED or WRITE_ONLY, return True.
+        
+        Returns:
+            bool: True if cache should be written, False otherwise.
+        """
        if self.always_bypass or not self.is_cacheable:
            return False
        return self.cache_mode in [CacheMode.ENABLED, CacheMode.WRITE_ONLY]
--- a/crawl4ai/chunking_strategy.py
+++ b/crawl4ai/chunking_strategy.py
@@ -7,22 +7,43 @@ from .utils import *

 # Define the abstract base class for chunking strategies
 class ChunkingStrategy(ABC):
+    """
+    Abstract base class for chunking strategies.
+    """
    
    @abstractmethod
    def chunk(self, text: str) -> list:
        """
        Abstract method to chunk the given text.
+        
+        Args:
+            text (str): The text to chunk.
+        
+        Returns:
+            list: A list of chunks.
        """
        pass

 # Create an identity chunking strategy f(x) = [x]
 class IdentityChunking(ChunkingStrategy):
+    """
+    Chunking strategy that returns the input text as a single chunk.
+    """
    def chunk(self, text: str) -> list:
        return [text]

 # Regex-based chunking
 class RegexChunking(ChunkingStrategy):
+    """
+    Chunking strategy that splits text based on regular expression patterns.
+    """
    def __init__(self, patterns=None, **kwargs):
+        """
+        Initialize the RegexChunking object.
+        
+        Args:
+            patterns (list): A list of regular expression patterns to split text.
+        """
        if patterns is None:
            patterns = [r'\n\n']  # Default split pattern
        self.patterns = patterns
@@ -38,9 +59,15 @@ class RegexChunking(ChunkingStrategy):
    
 # NLP-based sentence chunking 
 class NlpSentenceChunking(ChunkingStrategy):
+    """
+    Chunking strategy that splits text into sentences using NLTK's sentence tokenizer.
+    """ 
    def __init__(self, **kwargs):
+        """
+        Initialize the NlpSentenceChunking object.
+        """
        load_nltk_punkt()
-        pass
+        

    def chunk(self, text: str) -> list:
        # Improved regex for sentence splitting
@@ -57,8 +84,21 @@ class NlpSentenceChunking(ChunkingStrategy):
    
 # Topic-based segmentation using TextTiling
 class TopicSegmentationChunking(ChunkingStrategy):
+    """
+    Chunking strategy that segments text into topics using NLTK's TextTilingTokenizer.
+    
+    How it works:
+    1. Segment the text into topics using TextTilingTokenizer
+    2. Extract keywords for each topic segment
+    """
    
    def __init__(self, num_keywords=3, **kwargs):
+        """
+        Initialize the TopicSegmentationChunking object.
+        
+        Args:
+            num_keywords (int): The number of keywords to extract for each topic segment.
+        """
        import nltk as nl
        self.tokenizer = nl.tokenize.TextTilingTokenizer()
        self.num_keywords = num_keywords
@@ -88,6 +128,14 @@ class TopicSegmentationChunking(ChunkingStrategy):
    
 # Fixed-length word chunks
 class FixedLengthWordChunking(ChunkingStrategy):
+    """
+    Chunking strategy that splits text into fixed-length word chunks.
+    
+    How it works:
+    1. Split the text into words
+    2. Create chunks of fixed length
+    3. Return the list of chunks
+    """
    def __init__(self, chunk_size=100, **kwargs):
        """
        Initialize the fixed-length word chunking strategy with the given chunk size.
@@ -103,6 +151,14 @@ class FixedLengthWordChunking(ChunkingStrategy):
    
 # Sliding window chunking
 class SlidingWindowChunking(ChunkingStrategy):
+    """
+    Chunking strategy that splits text into overlapping word chunks.
+    
+    How it works:
+    1. Split the text into words
+    2. Slide a window of `window_size` words over the text in increments of `step` words
+    3. Return the list of chunks
+    """
    def __init__(self, window_size=100, step=50, **kwargs):
        """
        Initialize the sliding window chunking strategy with the given window size and
@@ -133,6 +189,15 @@ class SlidingWindowChunking(ChunkingStrategy):
        return chunks
    
 class OverlappingWindowChunking(ChunkingStrategy):
+    """
+    Chunking strategy that splits text into overlapping word chunks.
+    
+    How it works:
+    1. Split the text into words using whitespace
+    2. Create chunks of fixed length equal to the window size
+    3. Slide the window by the overlap size
+    4. Return the list of chunks
+    """
    def __init__(self, window_size=1000, overlap=100, **kwargs):
        """
        Initialize the overlapping window chunking strategy with the given window size and
--- a/crawl4ai/content_filter_strategy.py
+++ b/crawl4ai/content_filter_strategy.py
@@ -9,17 +9,8 @@ from .utils import clean_tokens
 from abc import ABC, abstractmethod
 import math
 from snowballstemmer import stemmer
-
-
-# import regex
-# def tokenize_text(text):
-#     # Regular expression to match words or CJK (Chinese, Japanese, Korean) characters
-#     pattern = r'\p{L}+|\p{N}+|[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}ー]|[\p{P}]'
-#     return regex.findall(pattern, text)
-
-# from nltk.stem import PorterStemmer
-# ps = PorterStemmer()
 class RelevantContentFilter(ABC):
+    """Abstract base class for content filtering strategies"""
    def __init__(self, user_query: str = None):
        self.user_query = user_query
        self.included_tags = {
@@ -171,9 +162,8 @@ class RelevantContentFilter(ABC):
            chunks = [chunk for chunk in chunks if len(chunk[1].split()) >= min_word_threshold]
        
        return chunks    
-    

-    def extract_text_chunks1(self, soup: BeautifulSoup) -> List[Tuple[int, str, Tag]]:
+    def _deprecated_extract_text_chunks(self, soup: BeautifulSoup) -> List[Tuple[int, str, Tag]]:
        """Common method for extracting text chunks"""
        _text_cache = {}
        def fast_text(element: Tag) -> str:
@@ -271,7 +261,38 @@ class RelevantContentFilter(ABC):
            return str(tag)  # Fallback to original if anything fails

 class BM25ContentFilter(RelevantContentFilter):
+    """
+    Content filtering using BM25 algorithm with priority tag handling.
+    
+    How it works:
+    1. Extracts page metadata with fallbacks.
+    2. Extracts text chunks from the body element.
+    3. Tokenizes the corpus and query.
+    4. Applies BM25 algorithm to calculate scores for each chunk.
+    5. Filters out chunks below the threshold.
+    6. Sorts chunks by score in descending order.
+    7. Returns the top N chunks.
+    
+    Attributes:
+        user_query (str): User query for filtering (optional).
+        bm25_threshold (float): BM25 threshold for filtering (default: 1.0).
+        language (str): Language for stemming (default: 'english').
+        
+    Methods:
+        filter_content(self, html: str, min_word_threshold: int = None)
+    """
    def __init__(self, user_query: str = None, bm25_threshold: float = 1.0, language: str = 'english'):
+        """
+        Initializes the BM25ContentFilter. If no user query is provided, it falls back to page metadata.
+        
+        Note:
+        If no query is given and no page metadata is available, then it tries to pick up the first significant paragraph.
+        
+        Args:
+            user_query (str): User query for filtering (optional).
+            bm25_threshold (float): BM25 threshold for filtering (default: 1.0).
+            language (str): Language for stemming (default: 'english').
+        """
        super().__init__(user_query=user_query)
        self.bm25_threshold = bm25_threshold
        self.priority_tags = {
@@ -290,7 +311,20 @@ class BM25ContentFilter(RelevantContentFilter):
        self.stemmer = stemmer(language)

    def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]:
-        """Implements content filtering using BM25 algorithm with priority tag handling"""
+        """
+        Implements content filtering using BM25 algorithm with priority tag handling.
+        
+        Note:
+        This method implements the filtering logic for the BM25ContentFilter class.
+        It takes HTML content as input and returns a list of filtered text chunks.
+        
+        Args:
+            html (str): HTML content to be filtered.
+            min_word_threshold (int): Minimum word threshold for filtering (optional).
+        
+        Returns:
+            List[str]: List of filtered text chunks.
+        """
        if not html or not isinstance(html, str):
            return []

@@ -357,15 +391,42 @@ class BM25ContentFilter(RelevantContentFilter):

        return [self.clean_element(tag) for _, _, tag in selected_candidates]

-
-
-
-
-
 class PruningContentFilter(RelevantContentFilter):
+    """
+    Content filtering using pruning algorithm with dynamic threshold.
+    
+    How it works:
+    1. Extracts page metadata with fallbacks.
+    2. Extracts text chunks from the body element.
+    3. Applies pruning algorithm to calculate scores for each chunk.
+    4. Filters out chunks below the threshold.
+    5. Sorts chunks by score in descending order.
+    6. Returns the top N chunks.
+
+    Attributes:
+        user_query (str): User query for filtering (optional), if not provided, falls back to page metadata.
+        min_word_threshold (int): Minimum word threshold for filtering (optional).
+        threshold_type (str): Threshold type for dynamic threshold (default: 'fixed').
+        threshold (float): Fixed threshold value (default: 0.48).
+        
+    Methods:
+        filter_content(self, html: str, min_word_threshold: int = None)
+    """
    def __init__(self, user_query: str = None, min_word_threshold: int = None, 
                 threshold_type: str = 'fixed', threshold: float = 0.48):
-        super().__init__(user_query)
+        """
+        Initializes the PruningContentFilter class, if not provided, falls back to page metadata.
+        
+        Note:
+        If no query is given and no page metadata is available, then it tries to pick up the first significant paragraph.
+        
+        Args:
+            user_query (str): User query for filtering (optional).
+            min_word_threshold (int): Minimum word threshold for filtering (optional).
+            threshold_type (str): Threshold type for dynamic threshold (default: 'fixed').
+            threshold (float): Fixed threshold value (default: 0.48).
+        """
+        super().__init__(None)
        self.min_word_threshold = min_word_threshold
        self.threshold_type = threshold_type
        self.threshold = threshold
@@ -418,6 +479,20 @@ class PruningContentFilter(RelevantContentFilter):
        }

    def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]:
+        """
+        Implements content filtering using pruning algorithm with dynamic threshold.
+        
+        Note:
+        This method implements the filtering logic for the PruningContentFilter class.
+        It takes HTML content as input and returns a list of filtered text chunks.
+        
+        Args:
+            html (str): HTML content to be filtered.
+            min_word_threshold (int): Minimum word threshold for filtering (optional).
+        
+        Returns:
+            List[str]: List of filtered text chunks.
+        """
        if not html or not isinstance(html, str):
            return []
            
@@ -444,15 +519,23 @@ class PruningContentFilter(RelevantContentFilter):
        return content_blocks

    def _remove_comments(self, soup):
+        """Removes HTML comments"""
        for element in soup(text=lambda text: isinstance(text, Comment)):
            element.extract()

    def _remove_unwanted_tags(self, soup):
+        """Removes unwanted tags"""
        for tag in self.excluded_tags:
            for element in soup.find_all(tag):
                element.decompose()

    def _prune_tree(self, node):
+        """
+        Prunes the tree starting from the given node.
+        
+        Args:
+            node (Tag): The node from which the pruning starts.
+        """
        if not node or not hasattr(node, 'name') or node.name is None:
            return

@@ -495,6 +578,7 @@ class PruningContentFilter(RelevantContentFilter):
                self._prune_tree(child)

    def _compute_composite_score(self, metrics, text_len, tag_len, link_text_len):
+        """Computes the composite score"""
        if self.min_word_threshold:
            # Get raw text from metrics node - avoid extra processing
            text = metrics['node'].get_text(strip=True)
@@ -531,6 +615,7 @@ class PruningContentFilter(RelevantContentFilter):
        return score / total_weight if total_weight > 0 else 0

    def _compute_class_id_weight(self, node):
+        """Computes the class ID weight"""
        class_id_score = 0
        if 'class' in node.attrs:
            classes = ' '.join(node['class'])
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -64,6 +64,17 @@ class ContentScrapingStrategy(ABC):
        pass

 class WebScrapingStrategy(ContentScrapingStrategy):
+    """
+    Class for web content scraping. Perhaps the most important class. 
+    
+    How it works:
+    1. Extract content from HTML using BeautifulSoup.
+    2. Clean the extracted content using a content cleaning strategy.
+    3. Filter the cleaned content using a content filtering strategy.
+    4. Generate markdown content from the filtered content.
+    5. Return the markdown content.
+    """
+    
    def __init__(self, logger=None):
        self.logger = logger

@@ -74,17 +85,57 @@ class WebScrapingStrategy(ContentScrapingStrategy):
            log_method(message=message, tag=tag, **kwargs)
                
    def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
+        """
+        Main entry point for content scraping.  
+
+        Args:
+            url (str): The URL of the page to scrape.
+            html (str): The HTML content of the page.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing the scraped content. This dictionary contains the following keys:
+
+            - 'markdown': The generated markdown content, type is str, however soon will become MarkdownGenerationResult via 'markdown.raw_markdown'.
+            - 'fit_markdown': The generated markdown content with relevant content filtered, this will be removed soon and available in 'markdown.fit_markdown'.
+            - 'fit_html': The HTML content with relevant content filtered, this will be removed soon and available in 'markdown.fit_html'.
+            - 'markdown_v2': The generated markdown content with relevant content filtered, this is temporary and will be removed soon and replaced with 'markdown'
+        """
        return self._scrap(url, html, is_async=False, **kwargs)

    async def ascrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]:
+        """
+        Main entry point for asynchronous content scraping.
+
+        Args:
+            url (str): The URL of the page to scrape.
+            html (str): The HTML content of the page.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing the scraped content. This dictionary contains the following keys:
+
+            - 'markdown': The generated markdown content, type is str, however soon will become MarkdownGenerationResult via 'markdown.raw_markdown'.
+            - 'fit_markdown': The generated markdown content with relevant content filtered, this will be removed soon and available in 'markdown.fit_markdown'.
+            - 'fit_html': The HTML content with relevant content filtered, this will be removed soon and available in 'markdown.fit_html'.
+            - 'markdown_v2': The generated markdown content with relevant content filtered, this is temporary and will be removed soon and replaced with 'markdown'
+        """
        return await asyncio.to_thread(self._scrap, url, html, **kwargs)

-    def _generate_markdown_content(self, 
-                                 cleaned_html: str,
-                                 html: str,
-                                 url: str,
-                                 success: bool,
-                                 **kwargs) -> Dict[str, Any]:
+    def _generate_markdown_content(self, cleaned_html: str,html: str,url: str, success: bool, **kwargs) -> Dict[str, Any]:
+        """
+        Generate markdown content from cleaned HTML.
+
+        Args:
+            cleaned_html (str): The cleaned HTML content.
+            html (str): The original HTML content.
+            url (str): The URL of the page.
+            success (bool): Whether the content was successfully cleaned.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing the generated markdown content.
+        """
        markdown_generator: Optional[MarkdownGenerationStrategy] = kwargs.get('markdown_generator', DefaultMarkdownGenerator())
        
        if markdown_generator:
@@ -158,6 +209,15 @@ class WebScrapingStrategy(ContentScrapingStrategy):
        """

    def flatten_nested_elements(self, node):
+        """
+        Flatten nested elements in a HTML tree.
+
+        Args:
+            node (Tag): The root node of the HTML tree.
+
+        Returns:
+            Tag: The flattened HTML tree.
+        """
        if isinstance(node, NavigableString):
            return node
        if len(node.contents) == 1 and isinstance(node.contents[0], Tag) and node.contents[0].name == node.name:
@@ -166,6 +226,16 @@ class WebScrapingStrategy(ContentScrapingStrategy):
        return node

    def find_closest_parent_with_useful_text(self, tag, **kwargs):
+        """
+        Find the closest parent with useful text.
+
+        Args:
+            tag (Tag): The starting tag to search from.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            Tag: The closest parent with useful text, or None if not found.
+        """
        image_description_min_word_threshold = kwargs.get('image_description_min_word_threshold', IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)
        current_tag = tag
        while current_tag:
@@ -179,6 +249,17 @@ class WebScrapingStrategy(ContentScrapingStrategy):
        return None

    def remove_unwanted_attributes(self, element, important_attrs, keep_data_attributes=False):
+        """
+        Remove unwanted attributes from an HTML element.
+
+        Args:    
+            element (Tag): The HTML element to remove attributes from.
+            important_attrs (list): List of important attributes to keep.
+            keep_data_attributes (bool): Whether to keep data attributes.
+
+        Returns:
+            None
+        """
        attrs_to_remove = []
        for attr in element.attrs:
            if attr not in important_attrs:
@@ -192,6 +273,26 @@ class WebScrapingStrategy(ContentScrapingStrategy):
            del element[attr]

    def process_image(self, img, url, index, total_images, **kwargs):
+        """
+        Process an image element.
+        
+        How it works:
+        1. Check if the image has valid display and inside undesired html elements.
+        2. Score an image for its usefulness.
+        3. Extract image file metadata to extract size and extension.
+        4. Generate a dictionary with the processed image information.
+        5. Return the processed image information.
+
+        Args:
+            img (Tag): The image element to process.
+            url (str): The URL of the page containing the image.
+            index (int): The index of the image in the list of images.
+            total_images (int): The total number of images in the list.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            dict: A dictionary containing the processed image information.
+        """
        parse_srcset = lambda s: [{'url': u.strip().split()[0], 'width': u.strip().split()[-1].rstrip('w') 
                        if ' ' in u else None} 
                        for u in [f"http{p}" for p in s.split("http") if p]]
@@ -316,6 +417,23 @@ class WebScrapingStrategy(ContentScrapingStrategy):
        return image_variants if image_variants else None

    def process_element(self, url, element: PageElement, **kwargs) -> Dict[str, Any]:        
+        """
+        Process an HTML element.
+        
+        How it works:
+        1. Check if the element is an image, video, or audio.
+        2. Extract the element's attributes and content.
+        3. Process the element based on its type.
+        4. Return the processed element information.
+
+        Args:
+            url (str): The URL of the page containing the element.
+            element (PageElement): The HTML element to process.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            dict: A dictionary containing the processed element information.
+        """
        media = {'images': [], 'videos': [], 'audios': []}
        internal_links_dict = {}
        external_links_dict = {}
@@ -334,6 +452,9 @@ class WebScrapingStrategy(ContentScrapingStrategy):
        }
        
    def _process_element(self, url, element: PageElement,  media: Dict[str, Any], internal_links_dict: Dict[str, Any], external_links_dict: Dict[str, Any], **kwargs) -> bool:
+        """
+        Process an HTML element.        
+        """
        try:
            if isinstance(element, NavigableString):
                if isinstance(element, Comment):
@@ -534,11 +655,25 @@ class WebScrapingStrategy(ContentScrapingStrategy):
            return False

    def _scrap(self, url: str, html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, **kwargs) -> Dict[str, Any]:
+        """
+        Extract content from HTML using BeautifulSoup.
+
+        Args:
+            url (str): The URL of the page to scrape.
+            html (str): The HTML content of the page to scrape.
+            word_count_threshold (int): The minimum word count threshold for content extraction.
+            css_selector (str): The CSS selector to use for content extraction.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            dict: A dictionary containing the extracted content, or None if `html` is empty.
+        """
        success = True
        if not html:
            return None

-        soup = BeautifulSoup(html, 'lxml')
+        parser_type = kwargs.get('parser', 'lxml')
+        soup = BeautifulSoup(html, parser_type)
        body = soup.body
        base_domain = get_base_domain(url)
        
--- a/crawl4ai/extraction_strategy.bak.py
+++ b/crawl4ai/extraction_strategy.bak.py
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -62,29 +62,66 @@ class ExtractionStrategy(ABC):
        return extracted_content    
    
 class NoExtractionStrategy(ExtractionStrategy):
+    """
+    A strategy that does not extract any meaningful content from the HTML. It simply returns the entire HTML as a single block.
+    """
    def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
+        """
+        Return the given HTML unchanged as a single block with index 0.
+        """
        return [{"index": 0, "content": html}]
    
    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
        return [{"index": i, "tags": [], "content": section} for i, section in enumerate(sections)]
-   

 #######################################################
 # Strategies using LLM-based extraction for text data #
 #######################################################
-
-
-    
 class LLMExtractionStrategy(ExtractionStrategy):
+    """
+    A strategy that uses an LLM to extract meaningful content from the HTML.
+    
+    Attributes:
+        provider: The provider to use for extraction. It follows the format <provider_name>/<model_name>, e.g., "ollama/llama3.3".
+        api_token: The API token for the provider.
+        instruction: The instruction to use for the LLM model.  
+        schema: Pydantic model schema for structured data.
+        extraction_type: "block" or "schema".
+        chunk_token_threshold: Maximum tokens per chunk.
+        overlap_rate: Overlap between chunks.
+        word_token_rate: Word to token conversion rate.
+        apply_chunking: Whether to apply chunking.
+        base_url: The base URL for the API request.
+        api_base: The base URL for the API request.
+        extra_args: Additional arguments for the API request, such as temperature, max_tokens, etc.
+        verbose: Whether to print verbose output.
+        usages: List of individual token usages.
+        total_usage: Accumulated token usage.
+    """
+
    def __init__(self, 
                 provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None, 
                 instruction:str = None, schema:Dict = None, extraction_type = "block", **kwargs):
        """
        Initialize the strategy with clustering parameters.
+        
+        Args:
+            provider: The provider to use for extraction. It follows the format <provider_name>/<model_name>, e.g., "ollama/llama3.3".
+            api_token: The API token for the provider.
+            instruction: The instruction to use for the LLM model.  
+            schema: Pydantic model schema for structured data.
+            extraction_type: "block" or "schema".
+            chunk_token_threshold: Maximum tokens per chunk.
+            overlap_rate: Overlap between chunks.
+            word_token_rate: Word to token conversion rate.
+            apply_chunking: Whether to apply chunking.
+            base_url: The base URL for the API request.
+            api_base: The base URL for the API request.
+            extra_args: Additional arguments for the API request, such as temperature, max_tokens, etc.
+            verbose: Whether to print verbose output.
+            usages: List of individual token usages.
+            total_usage: Accumulated token usage.   

-        :param provider: The provider to use for extraction.
-        :param api_token: The API token for the provider.
-        :param instruction: The instruction to use for the LLM model.
        """
        super().__init__(**kwargs)
        self.provider = provider
@@ -114,6 +151,22 @@ class LLMExtractionStrategy(ExtractionStrategy):
        
            
    def extract(self, url: str, ix:int, html: str) -> List[Dict[str, Any]]:
+        """
+        Extract meaningful blocks or chunks from the given HTML using an LLM.
+        
+        How it works:
+        1. Construct a prompt with variables.
+        2. Make a request to the LLM using the prompt.
+        3. Parse the response and extract blocks or chunks.
+        
+        Args:
+            url: The URL of the webpage.
+            ix: Index of the block.
+            html: The HTML content of the webpage.
+            
+        Returns:
+            A list of extracted blocks or chunks.
+        """
        if self.verbose:
            # print("[LOG] Extracting blocks from URL:", url)
            print(f"[LOG] Call LLM for {url} - block index: {ix}")
@@ -180,6 +233,9 @@ class LLMExtractionStrategy(ExtractionStrategy):
        return blocks
    
    def _merge(self, documents, chunk_token_threshold, overlap):
+        """
+        Merge documents into sections based on chunk_token_threshold and overlap.
+        """
        chunks = []
        sections = []
        total_tokens = 0
@@ -229,6 +285,13 @@ class LLMExtractionStrategy(ExtractionStrategy):
    def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]:
        """
        Process sections sequentially with a delay for rate limiting issues, specifically for LLMExtractionStrategy.
+        
+        Args:
+            url: The URL of the webpage.
+            sections: List of sections (strings) to process.
+            
+        Returns:
+            A list of extracted blocks or chunks.
        """
        
        merged_sections = self._merge(
@@ -285,12 +348,30 @@ class LLMExtractionStrategy(ExtractionStrategy):
        for i, usage in enumerate(self.usages, 1):
            print(f"{i:<10} {usage.completion_tokens:>12,} {usage.prompt_tokens:>12,} {usage.total_tokens:>12,}")
  
-
 #######################################################
 # Strategies using clustering for text data extraction #
 #######################################################

 class CosineStrategy(ExtractionStrategy):
+    """
+    Extract meaningful blocks or chunks from the given HTML using cosine similarity.
+    
+    How it works:
+    1. Pre-filter documents using embeddings and semantic_filter.
+    2. Perform clustering using cosine similarity.
+    3. Organize texts by their cluster labels, retaining order.
+    4. Filter clusters by word count.
+    5. Extract meaningful blocks or chunks from the filtered clusters.
+    
+    Attributes:
+        semantic_filter (str): A keyword filter for document filtering.
+        word_count_threshold (int): Minimum number of words per cluster.
+        max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters.
+        linkage_method (str): The linkage method for hierarchical clustering.
+        top_k (int): Number of top categories to extract.
+        model_name (str): The name of the sentence-transformers model.
+        sim_threshold (float): The similarity threshold for clustering.
+    """ 
    def __init__(self, semantic_filter = None, word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name = 'sentence-transformers/all-MiniLM-L6-v2', sim_threshold = 0.3, **kwargs):
        """
        Initialize the strategy with clustering parameters.
@@ -368,11 +449,13 @@ class CosineStrategy(ExtractionStrategy):
        """
        Filter and sort documents based on the cosine similarity of their embeddings with the semantic_filter embedding.

-        :param documents: List of text chunks (documents).
-        :param semantic_filter: A string containing the keywords for filtering.
-        :param threshold: Cosine similarity threshold for filtering documents.
-        :param at_least_k: Minimum number of documents to return.
-        :return: List of filtered documents, ensuring at least `at_least_k` documents.
+        Args:
+            documents (List[str]): A list of document texts.
+            semantic_filter (str): A keyword filter for document filtering.
+            at_least_k (int): The minimum number of documents to return.
+
+        Returns:
+            List[str]: A list of filtered and sorted document texts.
        """
        
        if not semantic_filter:
@@ -410,8 +493,11 @@ class CosineStrategy(ExtractionStrategy):
        """
        Get BERT embeddings for a list of sentences.

-        :param sentences: List of text chunks (sentences).
-        :return: NumPy array of embeddings.
+        Args:
+            sentences (List[str]): A list of text chunks (sentences).
+
+        Returns:
+            NumPy array of embeddings.
        """
        # if self.buffer_embeddings.any() and not bypass_buffer:
        #     return self.buffer_embeddings
@@ -455,8 +541,11 @@ class CosineStrategy(ExtractionStrategy):
        """
        Perform hierarchical clustering on sentences and return cluster labels.

-        :param sentences: List of text chunks (sentences).
-        :return: NumPy array of cluster labels.
+        Args:
+            sentences (List[str]): A list of text chunks (sentences).
+
+        Returns:
+            NumPy array of cluster labels.
        """
        # Get embeddings
        from scipy.cluster.hierarchy import linkage, fcluster
@@ -472,12 +561,15 @@ class CosineStrategy(ExtractionStrategy):
        labels = fcluster(linked, self.max_dist, criterion='distance')
        return labels

-    def filter_clusters_by_word_count(self, clusters: Dict[int, List[str]]):
+    def filter_clusters_by_word_count(self, clusters: Dict[int, List[str]]) -> Dict[int, List[str]]:
        """
        Filter clusters to remove those with a word count below the threshold.

-        :param clusters: Dictionary of clusters.
-        :return: Filtered dictionary of clusters.
+        Args:
+            clusters (Dict[int, List[str]]): Dictionary of clusters.
+
+        Returns:
+            Dict[int, List[str]]: Filtered dictionary of clusters.
        """
        filtered_clusters = {}
        for cluster_id, texts in clusters.items():
@@ -496,9 +588,12 @@ class CosineStrategy(ExtractionStrategy):
        """
        Extract clusters from HTML content using hierarchical clustering.

-        :param url: The URL of the webpage.
-        :param html: The HTML content of the webpage.
-        :return: A list of dictionaries representing the clusters.
+        Args:
+            url (str): The URL of the webpage.
+            html (str): The HTML content of the webpage.
+
+        Returns:
+            List[Dict[str, Any]]: A list of processed JSON blocks.
        """
        # Assume `html` is a list of text chunks for this strategy
        t = time.time()
@@ -560,159 +655,85 @@ class CosineStrategy(ExtractionStrategy):
        """
        Process sections using hierarchical clustering.

-        :param url: The URL of the webpage.
-        :param sections: List of sections (strings) to process.
-        :param provider: The provider to be used for extraction (not used here).
-        :param api_token: Optional API token for the provider (not used here).
-        :return: A list of processed JSON blocks.
+        Args:
+            url (str): The URL of the webpage.
+            sections (List[str]): List of sections (strings) to process.
+
+        Returns: List[Dict[str, Any]]: A list of processed JSON blocks.
        """
        # This strategy processes all sections together
        
        return self.extract(url, self.DEL.join(sections), **kwargs)
    
-
-#######################################################
-# Strategies based on the extraction of specific types #
-#######################################################
-    
-class TopicExtractionStrategy(ExtractionStrategy):
-    def __init__(self, num_keywords: int = 3, **kwargs):
-        """
-        Initialize the topic extraction strategy with parameters for topic segmentation.
-
-        :param num_keywords: Number of keywords to represent each topic segment.
-        """
-        import nltk
-        super().__init__(**kwargs)
-        self.num_keywords = num_keywords
-        self.tokenizer = nltk.TextTilingTokenizer()
-
-    def extract_keywords(self, text: str) -> List[str]:
-        """
-        Extract keywords from a given text segment using simple frequency analysis.
-
-        :param text: The text segment from which to extract keywords.
-        :return: A list of keyword strings.
-        """
-        import nltk
-        # Tokenize the text and compute word frequency
-        words = nltk.word_tokenize(text)
-        freq_dist = nltk.FreqDist(words)
-        # Get the most common words as keywords
-        keywords = [word for (word, _) in freq_dist.most_common(self.num_keywords)]
-        return keywords
-
-    def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
-        """
-        Extract topics from HTML content using TextTiling for segmentation and keyword extraction.
-
-        :param url: The URL of the webpage.
-        :param html: The HTML content of the webpage.
-        :param provider: The provider to be used for extraction (not used here).
-        :param api_token: Optional API token for the provider (not used here).
-        :return: A list of dictionaries representing the topics.
-        """
-        # Use TextTiling to segment the text into topics
-        segmented_topics = html.split(self.DEL)  # Split by lines or paragraphs as needed
-
-        # Prepare the output as a list of dictionaries
-        topic_list = []
-        for i, segment in enumerate(segmented_topics):
-            # Extract keywords for each segment
-            keywords = self.extract_keywords(segment)
-            topic_list.append({
-                "index": i,
-                "content": segment,
-                "keywords": keywords
-            })
-
-        return topic_list
-
-    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
-        """
-        Process sections using topic segmentation and keyword extraction.
-
-        :param url: The URL of the webpage.
-        :param sections: List of sections (strings) to process.
-        :param provider: The provider to be used for extraction (not used here).
-        :param api_token: Optional API token for the provider (not used here).
-        :return: A list of processed JSON blocks.
-        """
-        # Concatenate sections into a single text for coherent topic segmentation
-        
-        
-        return self.extract(url, self.DEL.join(sections), **kwargs)
-    
-class ContentSummarizationStrategy(ExtractionStrategy):
-    def __init__(self, model_name: str = "sshleifer/distilbart-cnn-12-6", **kwargs):
-        """
-        Initialize the content summarization strategy with a specific model.
-
-        :param model_name: The model to use for summarization.
-        """
-        super().__init__(**kwargs)
-        from transformers import pipeline
-        self.summarizer = pipeline("summarization", model=model_name)
-
-    def extract(self, url: str, text: str, provider: str = None, api_token: Optional[str] = None) -> List[Dict[str, Any]]:
-        """
-        Summarize a single section of text.
-
-        :param url: The URL of the webpage.
-        :param text: A section of text to summarize.
-        :param provider: The provider to be used for extraction (not used here).
-        :param api_token: Optional API token for the provider (not used here).
-        :return: A dictionary with the summary.
-        """
-        try:
-            summary = self.summarizer(text, max_length=130, min_length=30, do_sample=False)
-            return {"summary": summary[0]['summary_text']}
-        except Exception as e:
-            print(f"Error summarizing text: {e}")
-            return {"summary": text}  # Fallback to original text if summarization fails
-
-    def run(self, url: str, sections: List[str], provider: str = None, api_token: Optional[str] = None) -> List[Dict[str, Any]]:
-        """
-        Process each section in parallel to produce summaries.
-
-        :param url: The URL of the webpage.
-        :param sections: List of sections (strings) to summarize.
-        :param provider: The provider to be used for extraction (not used here).
-        :param api_token: Optional API token for the provider (not used here).
-        :return: A list of dictionaries with summaries for each section.
-        """
-        # Use a ThreadPoolExecutor to summarize in parallel
-        summaries = []
-        with ThreadPoolExecutor() as executor:
-            # Create a future for each section's summarization
-            future_to_section = {executor.submit(self.extract, url, section, provider, api_token): i for i, section in enumerate(sections)}
-            for future in as_completed(future_to_section):
-                section_index = future_to_section[future]
-                try:
-                    summary_result = future.result()
-                    summaries.append((section_index, summary_result))
-                except Exception as e:
-                    print(f"Error processing section {section_index}: {e}")
-                    summaries.append((section_index, {"summary": sections[section_index]}))  # Fallback to original text
-
-        # Sort summaries by the original section index to maintain order
-        summaries.sort(key=lambda x: x[0])
-        return [summary for _, summary in summaries]
- 
-
 #######################################################
 # New extraction strategies for JSON-based extraction #
 ####################################################### 

 class JsonElementExtractionStrategy(ExtractionStrategy):
+    """
+    Abstract base class for extracting structured JSON from HTML content.
+
+    How it works:
+    1. Parses HTML content using the `_parse_html` method.
+    2. Uses a schema to define base selectors, fields, and transformations.
+    3. Extracts data hierarchically, supporting nested fields and lists.
+    4. Handles computed fields with expressions or functions.
+
+    Attributes:
+        DEL (str): Delimiter used to combine HTML sections. Defaults to '\n'.
+        schema (Dict[str, Any]): The schema defining the extraction rules.
+        verbose (bool): Enables verbose logging for debugging purposes.
+
+    Methods:
+        extract(url, html_content, *q, **kwargs): Extracts structured data from HTML content.
+        _extract_item(element, fields): Extracts fields from a single element.
+        _extract_single_field(element, field): Extracts a single field based on its type.
+        _apply_transform(value, transform): Applies a transformation to a value.
+        _compute_field(item, field): Computes a field value using an expression or function.
+        run(url, sections, *q, **kwargs): Combines HTML sections and runs the extraction strategy.
+
+    Abstract Methods:
+        _parse_html(html_content): Parses raw HTML into a structured format (e.g., BeautifulSoup or lxml).
+        _get_base_elements(parsed_html, selector): Retrieves base elements using a selector.
+        _get_elements(element, selector): Retrieves child elements using a selector.
+        _get_element_text(element): Extracts text content from an element.
+        _get_element_html(element): Extracts raw HTML from an element.
+        _get_element_attribute(element, attribute): Extracts an attribute's value from an element.
+    """
+
+    
    DEL = '\n'

    def __init__(self, schema: Dict[str, Any], **kwargs):
+        """
+        Initialize the JSON element extraction strategy with a schema.
+
+        Args:
+            schema (Dict[str, Any]): The schema defining the extraction rules.
+        """
        super().__init__(**kwargs)
        self.schema = schema
        self.verbose = kwargs.get('verbose', False)

    def extract(self, url: str, html_content: str, *q, **kwargs) -> List[Dict[str, Any]]:
+        """
+        Extract structured data from HTML content.
+
+        How it works:
+        1. Parses the HTML content using the `_parse_html` method.
+        2. Identifies base elements using the schema's base selector.
+        3. Extracts fields from each base element using `_extract_item`.
+
+        Args:
+            url (str): The URL of the page being processed.
+            html_content (str): The raw HTML content to parse and extract.
+            *q: Additional positional arguments.
+            **kwargs: Additional keyword arguments for custom extraction.
+
+        Returns:
+            List[Dict[str, Any]]: A list of extracted items, each represented as a dictionary.
+        """
+        
        parsed_html = self._parse_html(html_content)
        base_elements = self._get_base_elements(parsed_html, self.schema['baseSelector'])
        
@@ -772,6 +793,22 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
            return field.get('default')

    def _extract_single_field(self, element, field):
+        """
+        Extract a single field based on its type.
+
+        How it works:
+        1. Selects the target element using the field's selector.
+        2. Extracts the field value based on its type (e.g., text, attribute, regex).
+        3. Applies transformations if defined in the schema.
+
+        Args:
+            element: The base element to extract the field from.
+            field (Dict[str, Any]): The field definition in the schema.
+
+        Returns:
+            Any: The extracted field value.
+        """
+        
        if 'selector' in field:
            selected = self._get_elements(element, field['selector'])
            if not selected:
@@ -806,6 +843,22 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
        return item

    def _extract_item(self, element, fields):
+        """
+        Extracts fields from a given element.
+
+        How it works:
+        1. Iterates through the fields defined in the schema.
+        2. Handles computed, single, and nested field types.
+        3. Updates the item dictionary with extracted field values.
+
+        Args:
+            element: The base element to extract fields from.
+            fields (List[Dict[str, Any]]): The list of fields to extract.
+
+        Returns:
+            Dict[str, Any]: A dictionary representing the extracted item.
+        """
+        
        item = {}
        for field in fields:
            if field['type'] == 'computed':
@@ -817,6 +870,22 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
        return item

    def _apply_transform(self, value, transform):
+        """
+        Apply a transformation to a value.
+
+        How it works:
+        1. Checks the transformation type (e.g., `lowercase`, `strip`).
+        2. Applies the transformation to the value.
+        3. Returns the transformed value.
+
+        Args:
+            value (str): The value to transform.
+            transform (str): The type of transformation to apply.
+
+        Returns:
+            str: The transformed value.
+        """
+        
        if transform == 'lowercase':
            return value.lower()
        elif transform == 'uppercase':
@@ -837,6 +906,23 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
            return field.get('default')

    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
+        """
+        Run the extraction strategy on the combined HTML content.
+
+        How it works:
+        1. Combines multiple HTML sections using the `DEL` delimiter.
+        2. Calls the `extract` method with the combined HTML.
+
+        Args:
+            url (str): The URL of the page being processed.
+            sections (List[str]): A list of HTML sections.
+            *q: Additional positional arguments (accepted but not forwarded to `extract`).
+            **kwargs: Additional keyword arguments for custom extraction.
+
+        Returns:
+            List[Dict[str, Any]]: A list of extracted items.
+        """
+        
        combined_html = self.DEL.join(sections)
        return self.extract(url, combined_html, **kwargs)

@@ -856,6 +942,27 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
        pass

 class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
+    """
+    Concrete implementation of `JsonElementExtractionStrategy` using CSS selectors.
+
+    How it works:
+    1. Parses HTML content with BeautifulSoup.
+    2. Selects elements using CSS selectors defined in the schema.
+    3. Extracts field data and applies transformations as defined.
+
+    Attributes:
+        schema (Dict[str, Any]): The schema defining the extraction rules.
+        verbose (bool): Enables verbose logging for debugging purposes.
+
+    Methods:
+        _parse_html(html_content): Parses HTML content into a BeautifulSoup object.
+        _get_base_elements(parsed_html, selector): Selects base elements using a CSS selector.
+        _get_elements(element, selector): Selects child elements using a CSS selector.
+        _get_element_text(element): Extracts text content from a BeautifulSoup element.
+        _get_element_html(element): Extracts the raw HTML content of a BeautifulSoup element.
+        _get_element_attribute(element, attribute): Retrieves an attribute value from a BeautifulSoup element.
+    """
+    
    def __init__(self, schema: Dict[str, Any], **kwargs):
        kwargs['input_format'] = 'html'  # Force HTML input
        super().__init__(schema, **kwargs)
@@ -880,6 +987,28 @@ class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
        return element.get(attribute)

 class JsonXPathExtractionStrategy(JsonElementExtractionStrategy):
+    """
+    Concrete implementation of `JsonElementExtractionStrategy` using XPath selectors.
+
+    How it works:
+    1. Parses HTML content into an lxml tree.
+    2. Selects elements using XPath expressions.
+    3. Converts CSS selectors to XPath when needed.
+
+    Attributes:
+        schema (Dict[str, Any]): The schema defining the extraction rules.
+        verbose (bool): Enables verbose logging for debugging purposes.
+
+    Methods:
+        _parse_html(html_content): Parses HTML content into an lxml tree.
+        _get_base_elements(parsed_html, selector): Selects base elements using an XPath selector.
+        _css_to_xpath(css_selector): Converts a CSS selector to an XPath expression.
+        _get_elements(element, selector): Selects child elements using an XPath selector.
+        _get_element_text(element): Extracts text content from an lxml element.
+        _get_element_html(element): Extracts the raw HTML content of an lxml element.
+        _get_element_attribute(element, attribute): Retrieves an attribute value from an lxml element.
+    """
+    
    def __init__(self, schema: Dict[str, Any], **kwargs):
        kwargs['input_format'] = 'html'  # Force HTML input
        super().__init__(schema, **kwargs)
@@ -921,259 +1050,3 @@ class JsonXPathExtractionStrategy(JsonElementExtractionStrategy):
    def _get_element_attribute(self, element, attribute: str):
        return element.get(attribute)
 
- 
-class _JsonCssExtractionStrategy(ExtractionStrategy):
-    def __init__(self, schema: Dict[str, Any], **kwargs):
-        kwargs['input_format'] = 'html'  # Force HTML input
-        super().__init__(**kwargs)
-        self.schema = schema
-
-    def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
-        soup = BeautifulSoup(html, 'html.parser')
-        base_elements = soup.select(self.schema['baseSelector'])
-        
-        results = []
-        for element in base_elements:
-            # Extract base element attributes first
-            item = {}
-            if 'baseFields' in self.schema:
-                for field in self.schema['baseFields']:
-                    value = self._extract_single_field(element, field)
-                    if value is not None:
-                        item[field['name']] = value
-            
-            # Then extract child fields
-            field_data = self._extract_item(element, self.schema['fields'])
-            item.update(field_data)
-            
-            results.append(item)
-        
-        return results
-
-    def _extract_field(self, element, field):
-        try:
-            if field['type'] == 'nested':
-                nested_element = element.select_one(field['selector'])
-                return self._extract_item(nested_element, field['fields']) if nested_element else {}
-            
-            if field['type'] == 'list':
-                elements = element.select(field['selector'])
-                return [self._extract_list_item(el, field['fields']) for el in elements]
-            
-            if field['type'] == 'nested_list':
-                elements = element.select(field['selector'])
-                return [self._extract_item(el, field['fields']) for el in elements]
-            
-            return self._extract_single_field(element, field)
-        except Exception as e:
-            if self.verbose:
-                print(f"Error extracting field {field['name']}: {str(e)}")
-            return field.get('default')
-
-    def _extract_list_item(self, element, fields):
-        item = {}
-        for field in fields:
-            value = self._extract_single_field(element, field)
-            if value is not None:
-                item[field['name']] = value
-        return item
-    
-    def _extract_single_field(self, element, field):
-        if 'selector' in field:
-            selected = element.select_one(field['selector'])
-            if not selected:
-                return field.get('default')
-        else:
-            selected = element
-
-        value = None
-        if field['type'] == 'text':
-            value = selected.get_text(strip=True)
-        elif field['type'] == 'attribute':
-            value = selected.get(field['attribute'])
-        elif field['type'] == 'html':
-            value = str(selected)
-        elif field['type'] == 'regex':
-            text = selected.get_text(strip=True)
-            match = re.search(field['pattern'], text)
-            value = match.group(1) if match else None
-
-        if 'transform' in field:
-            value = self._apply_transform(value, field['transform'])
-
-        return value if value is not None else field.get('default')
-
-    def _extract_item(self, element, fields):
-        item = {}
-        for field in fields:
-            if field['type'] == 'computed':
-                value = self._compute_field(item, field)
-            else:
-                value = self._extract_field(element, field)
-            if value is not None:
-                item[field['name']] = value
-        return item
-    
-    def _apply_transform(self, value, transform):
-        if transform == 'lowercase':
-            return value.lower()
-        elif transform == 'uppercase':
-            return value.upper()
-        elif transform == 'strip':
-            return value.strip()
-        return value
-
-    def _compute_field(self, item, field):
-        try:
-            if 'expression' in field:
-                return eval(field['expression'], {}, item)
-            elif 'function' in field:
-                return field['function'](item)
-        except Exception as e:
-            if self.verbose:
-                print(f"Error computing field {field['name']}: {str(e)}")
-            return field.get('default')
-
-    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
-        combined_html = self.DEL.join(sections)
-        return self.extract(url, combined_html, **kwargs)
-class _JsonXPathExtractionStrategy(ExtractionStrategy):
-    def __init__(self, schema: Dict[str, Any], **kwargs):
-        kwargs['input_format'] = 'html'  # Force HTML input
-        super().__init__(**kwargs)
-        self.schema = schema
-
-    def extract(self, url: str, html_content: str, *q, **kwargs) -> List[Dict[str, Any]]:
-        tree = html.fromstring(html_content)
-        base_xpath = self.schema['baseSelector']
-        base_elements = tree.xpath(base_xpath)
-        
-        results = []
-        for element in base_elements:
-            # Extract base element attributes first
-            item = {}
-            if 'baseFields' in self.schema:
-                for field in self.schema['baseFields']:
-                    value = self._extract_single_field(element, field)
-                    if value is not None:
-                        item[field['name']] = value
-            
-            # Then extract child fields
-            field_data = self._extract_item(element, self.schema['fields'])
-            item.update(field_data)
-            
-            results.append(item)
-        
-        return results
-
-    def _css_to_xpath(self, css_selector: str) -> str:
-        """Convert CSS selector to XPath if needed"""
-        if '/' in css_selector:  # Already an XPath
-            return css_selector
-        else:
-            # Fallback to basic conversion for common cases
-            return self._basic_css_to_xpath(css_selector)
-
-    def _basic_css_to_xpath(self, css_selector: str) -> str:
-        """Basic CSS to XPath conversion for common cases"""
-        # Handle basic cases
-        if ' > ' in css_selector:
-            parts = css_selector.split(' > ')
-            return '//' + '/'.join(parts)
-        if ' ' in css_selector:
-            parts = css_selector.split(' ')
-            return '//' + '//'.join(parts)
-        return '//' + css_selector
-
-    def _extract_field(self, element, field):
-        try:
-            if field['type'] == 'nested':
-                xpath = self._css_to_xpath(field['selector'])
-                nested_element = element.xpath(xpath)[0] if element.xpath(xpath) else None
-                return self._extract_item(nested_element, field['fields']) if nested_element is not None else {}
-            
-            if field['type'] == 'list':
-                xpath = self._css_to_xpath(field['selector'])
-                elements = element.xpath(xpath)
-                return [self._extract_list_item(el, field['fields']) for el in elements]
-            
-            if field['type'] == 'nested_list':
-                xpath = self._css_to_xpath(field['selector'])
-                elements = element.xpath(xpath)
-                return [self._extract_item(el, field['fields']) for el in elements]
-            
-            return self._extract_single_field(element, field)
-        except Exception as e:
-            if self.verbose:
-                print(f"Error extracting field {field['name']}: {str(e)}")
-            return field.get('default')
-
-    def _extract_list_item(self, element, fields):
-        item = {}
-        for field in fields:
-            value = self._extract_single_field(element, field)
-            if value is not None:
-                item[field['name']] = value
-        return item
-    
-    def _extract_single_field(self, element, field):
-        if 'selector' in field:
-            xpath = self._css_to_xpath(field['selector'])
-            selected = element.xpath(xpath)
-            if not selected:
-                return field.get('default')
-            selected = selected[0]
-        else:
-            selected = element
-
-        value = None
-        if field['type'] == 'text':
-            value = ''.join(selected.xpath('.//text()')).strip()
-        elif field['type'] == 'attribute':
-            value = selected.get(field['attribute'])
-        elif field['type'] == 'html':
-            value = etree.tostring(selected, encoding='unicode')
-        elif field['type'] == 'regex':
-            text = ''.join(selected.xpath('.//text()')).strip()
-            match = re.search(field['pattern'], text)
-            value = match.group(1) if match else None
-
-        if 'transform' in field:
-            value = self._apply_transform(value, field['transform'])
-
-        return value if value is not None else field.get('default')
-
-    def _extract_item(self, element, fields):
-        item = {}
-        for field in fields:
-            if field['type'] == 'computed':
-                value = self._compute_field(item, field)
-            else:
-                value = self._extract_field(element, field)
-            if value is not None:
-                item[field['name']] = value
-        return item
-    
-    def _apply_transform(self, value, transform):
-        if transform == 'lowercase':
-            return value.lower()
-        elif transform == 'uppercase':
-            return value.upper()
-        elif transform == 'strip':
-            return value.strip()
-        return value
-
-    def _compute_field(self, item, field):
-        try:
-            if 'expression' in field:
-                return eval(field['expression'], {}, item)
-            elif 'function' in field:
-                return field['function'](item)
-        except Exception as e:
-            if self.verbose:
-                print(f"Error computing field {field['name']}: {str(e)}")
-            return field.get('default')
-
-    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
-        combined_html = self.DEL.join(sections)
-        return self.extract(url, combined_html, **kwargs)    
--- a/crawl4ai/markdown_generation_strategy.py
+++ b/crawl4ai/markdown_generation_strategy.py
@@ -38,11 +38,44 @@ class MarkdownGenerationStrategy(ABC):
        pass

 class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
-    """Default implementation of markdown generation strategy."""
+    """
+    Default implementation of markdown generation strategy.
+    
+    How it works:
+    1. Generate raw markdown from cleaned HTML.
+    2. Convert links to citations.
+    3. Generate fit markdown if content filter is provided.
+    4. Return MarkdownGenerationResult.
+    
+    Args:
+        content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown.
+        options (Optional[Dict[str, Any]]): Additional options for markdown generation. Defaults to None.
+        
+    Returns:
+        MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown.
+    """
    def __init__(self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None):
        super().__init__(content_filter, options)
    
    def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]:
+        """
+        Convert links in markdown to citations.
+        
+        How it works:
+        1. Find all links in the markdown.
+        2. Convert links to citations.
+        3. Return converted markdown and references markdown.
+        
+        Note:
+        This function uses a regex pattern to find links in markdown.
+        
+        Args:
+            markdown (str): Markdown text.
+            base_url (str): Base URL for URL joins.
+            
+        Returns:
+            Tuple[str, str]: Converted markdown and references markdown.
+        """
        link_map = {}
        url_cache = {}  # Cache for URL joins
        parts = []
@@ -90,7 +123,26 @@ class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
                         content_filter: Optional[RelevantContentFilter] = None,
                         citations: bool = True,
                         **kwargs) -> MarkdownGenerationResult:
-        """Generate markdown with citations from cleaned HTML."""
+        """
+        Generate markdown with citations from cleaned HTML.
+        
+        How it works:
+        1. Generate raw markdown from cleaned HTML.
+        2. Convert links to citations.
+        3. Generate fit markdown if content filter is provided.
+        4. Return MarkdownGenerationResult.
+        
+        Args:
+            cleaned_html (str): Cleaned HTML content.
+            base_url (str): Base URL for URL joins.
+            html2text_options (Optional[Dict[str, Any]]): HTML2Text options.
+            options (Optional[Dict[str, Any]]): Additional options for markdown generation.
+            content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown.
+            citations (bool): Whether to generate citations.
+            
+        Returns:
+            MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown.
+        """
        # Initialize HTML2Text with options
        h = CustomHTML2Text()
        if html2text_options:
--- a/crawl4ai/ssl_certificate.py
+++ b/crawl4ai/ssl_certificate.py
@@ -13,13 +13,34 @@ from pathlib import Path
 class SSLCertificate:
    """
    A class representing an SSL certificate with methods to export in various formats.
+    
+    Attributes:
+        _cert_info (Dict[str, Any]): The decoded certificate information.
+
+    Methods:
+        from_url(url: str, timeout: int = 10) -> Optional['SSLCertificate']: Create SSLCertificate instance from a URL.
+        from_file(file_path: str) -> Optional['SSLCertificate']: Create SSLCertificate instance from a file.
+        from_binary(binary_data: bytes) -> Optional['SSLCertificate']: Create SSLCertificate instance from binary data.
+        to_pem(filepath: Optional[str] = None) -> Optional[str]: Export the certificate as PEM format.
+        to_der(filepath: Optional[str] = None) -> Optional[bytes]: Export the certificate as DER format.
+        to_json(filepath: Optional[str] = None) -> Optional[str]: Export the certificate as JSON format.
+        export_as_text() -> str: Export the certificate as text format.
    """
    def __init__(self, cert_info: Dict[str, Any]):
        self._cert_info = self._decode_cert_data(cert_info)

    @staticmethod
    def from_url(url: str, timeout: int = 10) -> Optional['SSLCertificate']:
-        """Create SSLCertificate instance from a URL."""
+        """
+        Create SSLCertificate instance from a URL.
+        
+        Args:
+            url (str): URL of the website.
+            timeout (int): Timeout for the connection (default: 10).
+        
+        Returns:
+            Optional[SSLCertificate]: SSLCertificate instance if successful, None otherwise.
+        """
        try:
            hostname = urlparse(url).netloc
            if ':' in hostname:
@@ -73,7 +94,15 @@ class SSLCertificate:
        return data

    def to_json(self, filepath: Optional[str] = None) -> Optional[str]:
-        """Export certificate as JSON."""
+        """
+        Export certificate as JSON.
+        
+        Args:
+            filepath (Optional[str]): Path to save the JSON file (default: None).
+        
+        Returns:
+            Optional[str]: JSON string if successful, None otherwise.
+        """
        json_str = json.dumps(self._cert_info, indent=2, ensure_ascii=False)
        if filepath:
            Path(filepath).write_text(json_str, encoding='utf-8')
@@ -81,7 +110,15 @@ class SSLCertificate:
        return json_str

    def to_pem(self, filepath: Optional[str] = None) -> Optional[str]:
-        """Export certificate as PEM."""
+        """
+        Export certificate as PEM.
+        
+        Args:
+            filepath (Optional[str]): Path to save the PEM file (default: None).
+        
+        Returns:
+            Optional[str]: PEM string if successful, None otherwise.
+        """
        try:
            x509 = OpenSSL.crypto.load_certificate(
                OpenSSL.crypto.FILETYPE_ASN1, 
@@ -100,7 +137,15 @@ class SSLCertificate:
            return None

    def to_der(self, filepath: Optional[str] = None) -> Optional[bytes]:
-        """Export certificate as DER."""
+        """
+        Export certificate as DER.
+        
+        Args:
+            filepath (Optional[str]): Path to save the DER file (default: None).
+        
+        Returns:
+            Optional[bytes]: DER bytes if successful, None otherwise.
+        """
        try:
            der_data = base64.b64decode(self._cert_info['raw_cert'])
            if filepath:
--- a/crawl4ai/user_agent_generator.py
+++ b/crawl4ai/user_agent_generator.py
@@ -4,6 +4,34 @@ import re


 class UserAgentGenerator:
+    """
+    Generate random user agents with specified constraints.
+    
+    Attributes:
+        desktop_platforms (dict): A dictionary of possible desktop platforms and their corresponding user agent strings.
+        mobile_platforms (dict): A dictionary of possible mobile platforms and their corresponding user agent strings.
+        browser_combinations (dict): A dictionary of possible browser combinations and their corresponding user agent strings.
+        rendering_engines (dict): A dictionary of possible rendering engines and their corresponding user agent strings.
+        chrome_versions (list): A list of possible Chrome browser versions.
+        firefox_versions (list): A list of possible Firefox browser versions.
+        edge_versions (list): A list of possible Edge browser versions.
+        safari_versions (list): A list of possible Safari browser versions.
+        ios_versions (list): A list of possible iOS browser versions.
+        android_versions (list): A list of possible Android browser versions.
+        
+    Methods:
+        generate_user_agent(
+            platform: Literal["desktop", "mobile"] = "desktop",
+            browser: str = "chrome",
+            rendering_engine: str = "chrome_webkit",
+            chrome_version: Optional[str] = None,
+            firefox_version: Optional[str] = None,
+            edge_version: Optional[str] = None,
+            safari_version: Optional[str] = None,
+            ios_version: Optional[str] = None,
+            android_version: Optional[str] = None
+        ): Generates a random user agent string based on the specified parameters.
+    """
    def __init__(self):
        # Previous platform definitions remain the same...
        self.desktop_platforms = {
@@ -105,7 +133,21 @@ class UserAgentGenerator:
        ]

    def get_browser_stack(self, num_browsers: int = 1) -> List[str]:
-        """Get a valid combination of browser versions"""
+        """
+        Get a valid combination of browser versions.
+        
+        How it works:
+        1. Check if the number of browsers is supported.
+        2. Randomly choose a combination of browsers.
+        3. Iterate through the combination and add browser versions.
+        4. Return the browser stack.
+        
+        Args:
+            num_browsers: Number of browser specifications (1-3)
+            
+        Returns:
+            List[str]: A list of browser versions.
+        """
        if num_browsers not in self.browser_combinations:
            raise ValueError(f"Unsupported number of browsers: {num_browsers}")
        
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -25,64 +25,91 @@ from functools import wraps
 class InvalidCSSSelectorError(Exception):
    pass

-def create_box_message(
-   message: str, 
-   type: str = "info", 
-   width: int = 120, 
-   add_newlines: bool = True,
-   double_line: bool = False
-) -> str:
-   init()
-   
-   # Define border and text colors for different types
-   styles = {
-       "warning": (Fore.YELLOW, Fore.LIGHTYELLOW_EX, "⚠"),
-       "info": (Fore.BLUE, Fore.LIGHTBLUE_EX, "ℹ"), 
-       "success": (Fore.GREEN, Fore.LIGHTGREEN_EX, "✓"),
-       "error": (Fore.RED, Fore.LIGHTRED_EX, "×"),
-   }
-   
-   border_color, text_color, prefix = styles.get(type.lower(), styles["info"])
-   
-   # Define box characters based on line style
-   box_chars = {
-       "single": ("─", "│", "┌", "┐", "└", "┘"),
-       "double": ("═", "║", "╔", "╗", "╚", "╝")
-   }
-   line_style = "double" if double_line else "single"
-   h_line, v_line, tl, tr, bl, br = box_chars[line_style]
-   
-   # Process lines with lighter text color
-   formatted_lines = []
-   raw_lines = message.split('\n')
-   
-   if raw_lines:
-       first_line = f"{prefix} {raw_lines[0].strip()}"
-       wrapped_first = textwrap.fill(first_line, width=width-4)
-       formatted_lines.extend(wrapped_first.split('\n'))
-       
-       for line in raw_lines[1:]:
-           if line.strip():
-               wrapped = textwrap.fill(f"  {line.strip()}", width=width-4)
-               formatted_lines.extend(wrapped.split('\n'))
-           else:
-               formatted_lines.append("")
-   
-   # Create the box with colored borders and lighter text
-   horizontal_line = h_line * (width - 1)
-   box = [
-       f"{border_color}{tl}{horizontal_line}{tr}",
-       *[f"{border_color}{v_line}{text_color} {line:<{width-2}}{border_color}{v_line}" for line in formatted_lines],
-       f"{border_color}{bl}{horizontal_line}{br}{Style.RESET_ALL}"
-   ]
-   
-   result = "\n".join(box)
-   if add_newlines:
-       result = f"\n{result}\n"
-   
-   return result
+def create_box_message(message: str, type: str = "info", width: int = 120, add_newlines: bool = True, double_line: bool = False) -> str:
+    """
+    Create a styled message box with colored borders and formatted text.
+
+    How it works:
+    1. Determines box style and colors based on the message type (e.g., info, warning).
+    2. Wraps text to fit within the specified width.
+    3. Constructs a box using characters (single or double lines) with appropriate formatting.
+    4. Adds optional newlines before and after the box.
+
+    Args:
+        message (str): The message to display inside the box.
+        type (str): Type of the message (e.g., "info", "warning", "error", "success"). Defaults to "info".
+        width (int): Width of the box. Defaults to 120.
+        add_newlines (bool): Whether to add newlines before and after the box. Defaults to True.
+        double_line (bool): Whether to use double lines for the box border. Defaults to False.
+
+    Returns:
+        str: A formatted string containing the styled message box.
+    """
+
+    init()
+
+    # Define border and text colors for different types
+    styles = {
+        "warning": (Fore.YELLOW, Fore.LIGHTYELLOW_EX, "⚠"),
+        "info": (Fore.BLUE, Fore.LIGHTBLUE_EX, "ℹ"), 
+        "success": (Fore.GREEN, Fore.LIGHTGREEN_EX, "✓"),
+        "error": (Fore.RED, Fore.LIGHTRED_EX, "×"),
+    }
+
+    border_color, text_color, prefix = styles.get(type.lower(), styles["info"])
+
+    # Define box characters based on line style
+    box_chars = {
+        "single": ("─", "│", "┌", "┐", "└", "┘"),
+        "double": ("═", "║", "╔", "╗", "╚", "╝")
+    }
+    line_style = "double" if double_line else "single"
+    h_line, v_line, tl, tr, bl, br = box_chars[line_style]
+
+    # Process lines with lighter text color
+    formatted_lines = []
+    raw_lines = message.split('\n')
+
+    if raw_lines:
+        first_line = f"{prefix} {raw_lines[0].strip()}"
+        wrapped_first = textwrap.fill(first_line, width=width-4)
+        formatted_lines.extend(wrapped_first.split('\n'))
+        
+        for line in raw_lines[1:]:
+            if line.strip():
+                wrapped = textwrap.fill(f"  {line.strip()}", width=width-4)
+                formatted_lines.extend(wrapped.split('\n'))
+            else:
+                formatted_lines.append("")
+
+    # Create the box with colored borders and lighter text
+    horizontal_line = h_line * (width - 1)
+    box = [
+        f"{border_color}{tl}{horizontal_line}{tr}",
+        *[f"{border_color}{v_line}{text_color} {line:<{width-2}}{border_color}{v_line}" for line in formatted_lines],
+        f"{border_color}{bl}{horizontal_line}{br}{Style.RESET_ALL}"
+    ]
+
+    result = "\n".join(box)
+    if add_newlines:
+        result = f"\n{result}\n"
+
+    return result

 def calculate_semaphore_count():
+    """
+    Calculate the optimal semaphore count based on system resources.
+
+    How it works:
+    1. Determines the number of CPU cores and total system memory.
+    2. Sets a base count as half of the available CPU cores.
+    3. Limits the count based on memory, assuming 2GB per semaphore instance.
+    4. Returns the minimum value between CPU and memory-based limits.
+
+    Returns:
+        int: The calculated semaphore count.
+    """
+    
    cpu_count = os.cpu_count()
    memory_gb = get_system_memory() / (1024 ** 3)  # Convert to GB
    base_count = max(1, cpu_count // 2)
@@ -90,6 +117,21 @@ def calculate_semaphore_count():
    return min(base_count, memory_based_cap)

 def get_system_memory():
+    """
+    Get the total system memory in bytes.
+
+    How it works:
+    1. Detects the operating system.
+    2. Reads memory information from system-specific commands or files.
+    3. Converts the memory to bytes for uniformity.
+
+    Returns:
+        int: The total system memory in bytes.
+
+    Raises:
+        OSError: If the operating system is unsupported.
+    """
+
    system = platform.system()
    if system == "Linux":
        with open('/proc/meminfo', 'r') as mem:
@@ -124,6 +166,18 @@ def get_system_memory():
        raise OSError("Unsupported operating system")

 def get_home_folder():
+    """
+    Get or create the home folder for Crawl4AI configuration and cache.
+
+    How it works:
+    1. Uses the `CRAWL4_AI_BASE_DIRECTORY` environment variable, or defaults to the user's home directory.
+    2. Creates `.crawl4ai` and its subdirectories (`cache`, `models`) if they don't exist.
+    3. Returns the path to the home folder.
+
+    Returns:
+        str: The path to the Crawl4AI home folder.
+    """
+
    home_folder = os.path.join(os.getenv("CRAWL4_AI_BASE_DIRECTORY", os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())), ".crawl4ai")
    os.makedirs(home_folder, exist_ok=True)
    os.makedirs(f"{home_folder}/cache", exist_ok=True)
@@ -194,6 +248,20 @@ def split_and_parse_json_objects(json_string):
    return parsed_objects, unparsed_segments

 def sanitize_html(html):
+    """
+    Sanitize an HTML string by escaping quotes.
+
+    How it works:
+    1. Keeps the input characters as-is (the character-stripping regex is currently commented out).
+    2. Escapes double and single quotes for safe usage.
+
+    Args:
+        html (str): The HTML string to sanitize.
+
+    Returns:
+        str: The sanitized HTML string.
+    """
+    
    # Replace all unwanted and special characters with an empty string
    sanitized_html = html
    # sanitized_html = re.sub(r'[^\w\s.,;:!?=\[\]{}()<>\/\\\-"]', '', html)
@@ -248,6 +316,23 @@ def escape_json_string(s):
    return s

 def replace_inline_tags(soup, tags, only_text=False):
+    """
+    Replace inline HTML tags with Markdown-style equivalents.
+
+    How it works:
+    1. Maps specific tags (e.g., <b>, <i>) to Markdown syntax.
+    2. Finds and replaces all occurrences of these tags in the provided BeautifulSoup object.
+    3. Optionally replaces tags with their text content only.
+
+    Args:
+        soup (BeautifulSoup): Parsed HTML content.
+        tags (List[str]): List of tags to replace.
+        only_text (bool): Whether to replace tags with plain text. Defaults to False.
+
+    Returns:
+        BeautifulSoup: Updated BeautifulSoup object with replaced tags.
+    """
+
    tag_replacements = {
        'b': lambda tag: f"**{tag.text}**",
        'i': lambda tag: f"*{tag.text}*",
@@ -292,6 +377,26 @@ def replace_inline_tags(soup, tags, only_text=False):
    # return soup

 def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None, **kwargs):
+    """
+    Extract structured content, media, and links from website HTML.
+
+    How it works:
+    1. Parses the HTML content using BeautifulSoup.
+    2. Extracts internal/external links and media (images, videos, audios).
+    3. Cleans the content by removing unwanted tags and attributes.
+    4. Converts cleaned HTML to Markdown.
+    5. Collects metadata and returns the extracted information.
+
+    Args:
+        url (str): The website URL.
+        html (str): The HTML content of the website.
+        word_count_threshold (int): Minimum word count for content inclusion. Defaults to MIN_WORD_THRESHOLD.
+        css_selector (Optional[str]): CSS selector to extract specific content. Defaults to None.
+
+    Returns:
+        Dict[str, Any]: Extracted content including Markdown, cleaned HTML, media, links, and metadata. Returns None if `html` is empty.
+    """
+
    try:
        if not html:
            return None
@@ -762,6 +867,27 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
    }

 def extract_metadata(html, soup=None):
+    """
+    Extract metadata from HTML content.
+
+    How it works:
+    1. Starts with an empty metadata dictionary.
+    2. Checks whether `html` or an already-parsed `soup` was supplied.
+    3. Collects the metadata found in the document into the dictionary.
+
+    Note:
+        The exact fields collected are defined in the function body; this
+        docstring does not enumerate them.
+
+    Args:
+        html (str): The HTML content of the page.
+        soup (Optional[BeautifulSoup]): An already-parsed document, if one is
+            available. Defaults to None.
+
+    Returns:
+        dict: The extracted metadata.
+    """
+    
    metadata = {}
    
    if not html and not soup:
@@ -809,10 +935,35 @@ def extract_metadata(html, soup=None):
    return metadata

 def extract_xml_tags(string):
+    """
+    Extract the unique XML tag names found in a string.
+
+    Args:
+        string (str): The input string containing XML tags.
+
+    Returns:
+        List[str]: Unique names of attribute-less opening tags (e.g. `<tag>`), in no particular order.
+    """
    tags = re.findall(r'<(\w+)>', string)
    return list(set(tags))

 def extract_xml_data(tags, string):
+    """
+    Extract data for specified XML tags from a string.
+
+    How it works:
+    1. Searches the string for each tag using regex.
+    2. Extracts the content within the tags.
+    3. Returns a dictionary of tag-content pairs.
+
+    Args:
+        tags (List[str]): The list of XML tags to extract.
+        string (str): The input string containing XML data.
+
+    Returns:
+        Dict[str, str]: A dictionary with tag names as keys and extracted content as values.
+    """
+
    data = {}

    for tag in tags:
@@ -833,6 +984,26 @@ def perform_completion_with_backoff(
    base_url=None,
    **kwargs
    ):
+    """
+    Perform an API completion request with exponential backoff.
+
+    How it works:
+    1. Sends a completion request to the API.
+    2. Retries on rate-limit errors with exponential delays.
+    3. Returns the API response or an error after all retries.
+
+    Args:
+        provider (str): The name of the API provider.
+        prompt_with_variables (str): The input prompt for the completion request.
+        api_token (str): The API token for authentication.
+        json_response (bool): Whether to request a JSON response. Defaults to False.
+        base_url (Optional[str]): The base URL for the API. Defaults to None.
+        **kwargs: Additional arguments for the API request.
+
+    Returns:
+        dict: The API response or an error message after all retries.
+    """
+    
    from litellm import completion 
    from litellm.exceptions import RateLimitError
    max_attempts = 3
@@ -878,6 +1049,25 @@ def perform_completion_with_backoff(
                }]
    
 def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None, base_url = None):
+    """
+    Extract content blocks from website HTML using an AI provider.
+
+    How it works:
+    1. Prepares a prompt by sanitizing and escaping HTML.
+    2. Sends the prompt to an AI provider with optional retries.
+    3. Parses the response to extract structured blocks or errors.
+
+    Args:
+        url (str): The website URL.
+        html (str): The HTML content of the website.
+        provider (str): The AI provider for content extraction. Defaults to DEFAULT_PROVIDER.
+        api_token (Optional[str]): The API token for authentication. Defaults to None.
+        base_url (Optional[str]): The base URL for the API. Defaults to None.
+
+    Returns:
+        List[dict]: A list of extracted content blocks.
+    """
+
    # api_token = os.getenv('GROQ_API_KEY', None) if not api_token else api_token
    api_token = PROVIDER_MODELS.get(provider, None) if not api_token else api_token
    
@@ -914,6 +1104,23 @@ def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None, bas
    return blocks

 def extract_blocks_batch(batch_data, provider = "groq/llama3-70b-8192", api_token = None):
+    """
+    Extract content blocks from a batch of website HTMLs.
+
+    How it works:
+    1. Prepares prompts for each URL and HTML pair.
+    2. Sends the prompts to the AI provider in a batch request.
+    3. Parses the responses to extract structured blocks or errors.
+
+    Args:
+        batch_data (List[Tuple[str, str]]): A list of (URL, HTML) pairs.
+        provider (str): The AI provider for content extraction. Defaults to "groq/llama3-70b-8192".
+        api_token (Optional[str]): The API token for authentication. Defaults to None.
+
+    Returns:
+        List[dict]: A list of extracted content blocks from all batch items.
+    """
+
    api_token = os.getenv('GROQ_API_KEY', None) if not api_token else api_token
    from litellm import batch_completion
    messages = []
@@ -986,6 +1193,25 @@ def merge_chunks_based_on_token_threshold(chunks, token_threshold):
    return merged_sections

 def process_sections(url: str, sections: list, provider: str, api_token: str, base_url=None) -> list:
+    """
+    Process sections of HTML content sequentially or in parallel.
+
+    How it works:
+    1. Sequentially processes sections with delays for "groq/" providers.
+    2. Uses ThreadPoolExecutor for parallel processing with other providers.
+    3. Extracts content blocks for each section.
+
+    Args:
+        url (str): The website URL.
+        sections (List[str]): The list of HTML sections to process.
+        provider (str): The AI provider for content extraction.
+        api_token (str): The API token for authentication.
+        base_url (Optional[str]): The base URL for the API. Defaults to None.
+
+    Returns:
+        List[dict]: The list of extracted content blocks from all sections.
+    """
+
    extracted_content = []
    if provider.startswith("groq/"):
        # Sequential processing with a delay
@@ -1002,6 +1228,24 @@ def process_sections(url: str, sections: list, provider: str, api_token: str, ba
    return extracted_content

 def wrap_text(draw, text, font, max_width):
+    """
+    Wrap text to fit within a specified width for rendering.
+
+    How it works:
+    1. Splits the text into words.
+    2. Constructs lines that fit within the maximum width using the provided font.
+    3. Returns the wrapped text as a single string.
+
+    Args:
+        draw (ImageDraw.Draw): The drawing context for measuring text size.
+        text (str): The text to wrap.
+        font (ImageFont.FreeTypeFont): The font to use for measuring text size.
+        max_width (int): The maximum width for each line.
+
+    Returns:
+        str: The wrapped text.
+    """
+
    # Wrap the text to fit within the specified width
    lines = []
    words = text.split()
@@ -1013,6 +1257,21 @@ def wrap_text(draw, text, font, max_width):
    return '\n'.join(lines)

 def format_html(html_string):
+    """
+    Prettify an HTML string using BeautifulSoup.
+
+    How it works:
+    1. Parses the HTML string with BeautifulSoup.
+    2. Formats the HTML with proper indentation.
+    3. Returns the prettified HTML string.
+
+    Args:
+        html_string (str): The HTML string to format.
+
+    Returns:
+        str: The prettified HTML string.
+    """
+
    soup = BeautifulSoup(html_string, 'lxml.parser')
    return soup.prettify()

@@ -1110,7 +1369,20 @@ def normalize_url_tmp(href, base_url):
    return href.strip()

 def get_base_domain(url: str) -> str:
-    """Extract base domain from URL, handling various edge cases."""
+    """
+    Extract the base domain from a given URL, handling common edge cases.
+
+    How it works:
+    1. Parses the URL to extract the domain.
+    2. Removes the port number and 'www' prefix.
+    3. Handles special domains (e.g., 'co.uk') to extract the correct base.
+
+    Args:
+        url (str): The URL to extract the base domain from.
+
+    Returns:
+        str: The extracted base domain or an empty string if parsing fails.
+    """
    try:
        # Get domain from URL
        domain = urlparse(url).netloc.lower()
@@ -1136,7 +1408,20 @@ def get_base_domain(url: str) -> str:
        return ""

 def is_external_url(url: str, base_domain: str) -> bool:
-    """Check if URL is external to base domain."""
+    """
+    Check whether a URL is external to the given base domain.
+
+    How it works:
+    1. Treats special schemes (e.g., 'mailto:', 'tel:', 'javascript:') as external.
+    2. Otherwise compares the URL with `base_domain`; returns False on any error.
+
+    Args:
+        url (str): The URL to check.
+        base_domain (str): The base domain to compare against.
+
+    Returns:
+        bool: True if the URL is considered external, False otherwise.
+    """
    special = {'mailto:', 'tel:', 'ftp:', 'file:', 'data:', 'javascript:'}
    if any(url.lower().startswith(p) for p in special):
        return True
@@ -1155,8 +1440,22 @@ def is_external_url(url: str, base_domain: str) -> bool:
    except Exception:
        return False

-
 def clean_tokens(tokens: list[str]) -> list[str]:
+    """
+    Clean a list of tokens by removing noise, stop words, and short tokens.
+
+    How it works:
+    1. Defines a set of noise words and stop words.
+    2. Filters tokens based on length and exclusion criteria.
+    3. Excludes tokens starting with certain symbols (e.g., "↑", "▲").
+
+    Args:
+        tokens (list[str]): The list of tokens to clean.
+
+    Returns:
+        list[str]: The cleaned list of tokens.
+    """
+
    # Set of tokens to remove
    noise = {'ccp', 'up', '↑', '▲', '⬆️', 'a', 'an', 'at', 'by', 'in', 'of', 'on', 'to', 'the'}

@@ -1212,6 +1511,21 @@ def clean_tokens(tokens: list[str]) -> list[str]:
            and not token.startswith('⬆')]

 def profile_and_time(func):
+    """
+    Decorator to profile a function's execution time and performance.
+
+    How it works:
+    1. Records the start time before executing the function.
+    2. Profiles the function's execution using `cProfile`.
+    3. Prints the elapsed time and profiling statistics.
+
+    Args:
+        func (Callable): The method to decorate; the wrapper expects `self` as its first argument.
+
+    Returns:
+        Callable: The decorated function with profiling and timing enabled.
+    """
+
    @wraps(func)
    def wrapper(self, *args, **kwargs):
        # Start timer