From 2327db6fdc3b49b122dbfd2dc85010adb378ca6c Mon Sep 17 00:00:00 2001
From: UncleCode <unclecode@kidocode.com>
Date: Wed, 5 Mar 2025 22:23:08 +0800
Subject: [PATCH 1/4] refactor(crawler): introduce CrawlResultContainer and
 simplify interfaces

Introduces a new generic CrawlResultContainer class to standardize return types and
improve type safety. Removes legacy parameter handling and simplifies method signatures.
This change makes the API more consistent and easier to maintain.

BREAKING CHANGE: Non-streaming crawler calls now always return CrawlResultContainer
instead of raw CrawlResult or List[CrawlResult]. Legacy parameters have been removed
from method signatures.
---
 crawl4ai/async_webcrawler.py | 161 +++++++++++++++--------------------
 1 file changed, 71 insertions(+), 90 deletions(-)

diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index b5a646c9..dd777a36 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -4,7 +4,7 @@ import sys
 import time
 from colorama import Fore
 from pathlib import Path
-from typing import Optional, List
+from typing import Optional, List, Generic, TypeVar
 import json
 import asyncio
 
@@ -23,7 +23,7 @@ from .async_crawler_strategy import (
     AsyncPlaywrightCrawlerStrategy,
     AsyncCrawlResponse,
 )
-from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode
+from .cache_context import CacheMode, CacheContext
 from .markdown_generation_strategy import (
     DefaultMarkdownGenerator,
     MarkdownGenerationStrategy,
@@ -44,17 +44,46 @@ from .utils import (
     RobotsParser,
 )
 
-from typing import Union, AsyncGenerator, TypeVar
+from typing import Union, AsyncGenerator
 
 CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult)
-RunManyReturn = Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
+# RunManyReturn = Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
 
-DeepCrawlSingleReturn = Union[List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
-DeepCrawlManyReturn = Union[
-    List[List[CrawlResultT]],
-    AsyncGenerator[CrawlResultT, None],
+class CrawlResultContainer(Generic[CrawlResultT]):
+    def __init__(self, results: Union[CrawlResultT, List[CrawlResultT]]):
+        # Normalize to a list
+        if isinstance(results, list):
+            self._results = results
+        else:
+            self._results = [results]
+
+    def __iter__(self):
+        return iter(self._results)
+
+    def __getitem__(self, index):
+        return self._results[index]
+
+    def __len__(self):
+        return len(self._results)
+
+    def __getattr__(self, attr):
+        # Delegate attribute access to the first element.
+        if self._results:
+            return getattr(self._results[0], attr)
+        raise AttributeError(f"{self.__class__.__name__} object has no attribute '{attr}'")
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}({self._results!r})"
+
+# Redefine the union type. Now synchronous calls always return a container,
+# while stream mode is handled with an AsyncGenerator.
+RunManyReturn = Union[
+    CrawlResultContainer[CrawlResultT],
+    AsyncGenerator[CrawlResultT, None]
 ]
 
+
+
 class AsyncWebCrawler:
     """
     Asynchronous web crawler with flexible caching capabilities.
@@ -223,23 +252,6 @@ class AsyncWebCrawler:
         self,
         url: str,
         config: CrawlerRunConfig = None,
-        # Legacy parameters maintained for backwards compatibility
-        # word_count_threshold=MIN_WORD_THRESHOLD,
-        # extraction_strategy: ExtractionStrategy = None,
-        # chunking_strategy: ChunkingStrategy = RegexChunking(),
-        # content_filter: RelevantContentFilter = None,
-        # cache_mode: Optional[CacheMode] = None,
-        # Deprecated cache parameters
-        # bypass_cache: bool = False,
-        # disable_cache: bool = False,
-        # no_cache_read: bool = False,
-        # no_cache_write: bool = False,
-        # Other legacy parameters
-        # css_selector: str = None,
-        # screenshot: bool = False,
-        # pdf: bool = False,
-        # user_agent: str = None,
-        # verbose=True,
         **kwargs,
     ) -> RunManyReturn:
         """
@@ -270,47 +282,13 @@ class AsyncWebCrawler:
         Returns:
             CrawlResult: The result of crawling and processing
         """
-        crawler_config = config or CrawlerRunConfig()
+        config = config or CrawlerRunConfig()
         if not isinstance(url, str) or not url:
             raise ValueError("Invalid URL, make sure the URL is a non-empty string")
 
         async with self._lock or self.nullcontext():
             try:
-                self.logger.verbose = crawler_config.verbose
-                # Handle configuration
-                if crawler_config is not None:
-                    config = crawler_config
-                else:
-                    # Merge all parameters into a single kwargs dict for config creation
-                    # config_kwargs = {
-                    #     "word_count_threshold": word_count_threshold,
-                    #     "extraction_strategy": extraction_strategy,
-                    #     "chunking_strategy": chunking_strategy,
-                    #     "content_filter": content_filter,
-                    #     "cache_mode": cache_mode,
-                    #     "bypass_cache": bypass_cache,
-                    #     "disable_cache": disable_cache,
-                    #     "no_cache_read": no_cache_read,
-                    #     "no_cache_write": no_cache_write,
-                    #     "css_selector": css_selector,
-                    #     "screenshot": screenshot,
-                    #     "pdf": pdf,
-                    #     "verbose": verbose,
-                    #     **kwargs,
-                    # }
-                    # config = CrawlerRunConfig.from_kwargs(config_kwargs)
-                    pass
-
-                # Handle deprecated cache parameters
-                # if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]):
-                #     # Convert legacy parameters if cache_mode not provided
-                #     if config.cache_mode is None:
-                #         config.cache_mode = _legacy_to_cache_mode(
-                #             disable_cache=disable_cache,
-                #             bypass_cache=bypass_cache,
-                #             no_cache_read=no_cache_read,
-                #             no_cache_write=no_cache_write,
-                #         )
+                self.logger.verbose = config.verbose
 
                 # Default to ENABLED if no cache mode specified
                 if config.cache_mode is None:
@@ -457,7 +435,7 @@ class AsyncWebCrawler:
                     if cache_context.should_write() and not bool(cached_result):
                         await async_db_manager.acache_url(crawl_result)
 
-                    return crawl_result
+                    return CrawlResultContainer(crawl_result)
 
                 else:
                     self.logger.success(
@@ -474,7 +452,7 @@ class AsyncWebCrawler:
                     cached_result.success = bool(html)
                     cached_result.session_id = getattr(config, "session_id", None)
                     cached_result.redirected_url = cached_result.redirected_url or url
-                    return cached_result
+                    return CrawlResultContainer(cached_result)
 
             except Exception as e:
                 error_context = get_error_context(sys.exc_info())
@@ -492,8 +470,10 @@ class AsyncWebCrawler:
                     tag="ERROR",
                 )
 
-                return CrawlResult(
-                    url=url, html="", success=False, error_message=error_message
+                return  CrawlResultContainer(
+                    CrawlResult(
+                        url=url, html="", success=False, error_message=error_message
+                    )
                 )
 
     async def aprocess_html(
@@ -669,17 +649,17 @@ class AsyncWebCrawler:
         config: Optional[CrawlerRunConfig] = None, 
         dispatcher: Optional[BaseDispatcher] = None,
         # Legacy parameters maintained for backwards compatibility
-        word_count_threshold=MIN_WORD_THRESHOLD,
-        extraction_strategy: ExtractionStrategy = None,
-        chunking_strategy: ChunkingStrategy = RegexChunking(),
-        content_filter: RelevantContentFilter = None,
-        cache_mode: Optional[CacheMode] = None,
-        bypass_cache: bool = False,
-        css_selector: str = None,
-        screenshot: bool = False,
-        pdf: bool = False,
-        user_agent: str = None,
-        verbose=True,
+        # word_count_threshold=MIN_WORD_THRESHOLD,
+        # extraction_strategy: ExtractionStrategy = None,
+        # chunking_strategy: ChunkingStrategy = RegexChunking(),
+        # content_filter: RelevantContentFilter = None,
+        # cache_mode: Optional[CacheMode] = None,
+        # bypass_cache: bool = False,
+        # css_selector: str = None,
+        # screenshot: bool = False,
+        # pdf: bool = False,
+        # user_agent: str = None,
+        # verbose=True,
         **kwargs
         ) -> RunManyReturn:
         """
@@ -712,20 +692,21 @@ class AsyncWebCrawler:
         ):
             print(f"Processed {result.url}: {len(result.markdown)} chars")
         """
-        if config is None:
-            config = CrawlerRunConfig(
-                word_count_threshold=word_count_threshold,
-                extraction_strategy=extraction_strategy,
-                chunking_strategy=chunking_strategy,
-                content_filter=content_filter,
-                cache_mode=cache_mode,
-                bypass_cache=bypass_cache,
-                css_selector=css_selector,
-                screenshot=screenshot,
-                pdf=pdf,
-                verbose=verbose,
-                **kwargs,
-            )
+        config = config or CrawlerRunConfig()
+        # if config is None:
+        #     config = CrawlerRunConfig(
+        #         word_count_threshold=word_count_threshold,
+        #         extraction_strategy=extraction_strategy,
+        #         chunking_strategy=chunking_strategy,
+        #         content_filter=content_filter,
+        #         cache_mode=cache_mode,
+        #         bypass_cache=bypass_cache,
+        #         css_selector=css_selector,
+        #         screenshot=screenshot,
+        #         pdf=pdf,
+        #         verbose=verbose,
+        #         **kwargs,
+        #     )
 
         if dispatcher is None:
             dispatcher = MemoryAdaptiveDispatcher(

From 29f7915b795418bbc8dec9218fa8e9acae167885 Mon Sep 17 00:00:00 2001
From: UncleCode <unclecode@kidocode.com>
Date: Thu, 6 Mar 2025 20:30:57 +0800
Subject: [PATCH 2/4] fix(models): support float timestamps in CrawlStats

Modify CrawlStats class to handle both datetime and float timestamp formats for start_time and end_time fields. This change improves compatibility with different time formats while maintaining existing functionality.

Other minor changes:
- Add datetime import in async_dispatcher
- Update JsonElementExtractionStrategy kwargs handling

No breaking changes.
---
 crawl4ai/async_dispatcher.py    |  2 +-
 crawl4ai/extraction_strategy.py |  5 +++--
 crawl4ai/models.py              | 38 +++++++++++++++++++++++++++++----
 3 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/crawl4ai/async_dispatcher.py b/crawl4ai/async_dispatcher.py
index 69d276fb..b587d011 100644
--- a/crawl4ai/async_dispatcher.py
+++ b/crawl4ai/async_dispatcher.py
@@ -13,7 +13,7 @@ from rich.live import Live
 from rich.table import Table
 from rich.console import Console
 from rich import box
-from datetime import timedelta
+from datetime import timedelta, datetime
 from collections.abc import AsyncGenerator
 import time
 import psutil
diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py
index e0e49d99..3b708421 100644
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -1064,7 +1064,7 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
             api_token (str): Legacy Parameter. API token for LLM provider
             llm_config (LLMConfig): LLM configuration object
             prompt (str, optional): Custom prompt template to use
-            **kwargs: Additional args passed to perform_completion_with_backoff
+            **kwargs: Additional args passed to LLM processor
             
         Returns:
             dict: Generated schema following the JsonElementExtractionStrategy format
@@ -1130,7 +1130,8 @@ In this scenario, use your best judgment to generate the schema. Try to maximize
                 prompt_with_variables="\n\n".join([system_message["content"], user_message["content"]]),
                 json_response = True,                
                 api_token=llm_config.api_token,
-                **kwargs
+                base_url=llm_config.base_url,
+                extra_args=kwargs
             )
             
             # Extract and return schema
diff --git a/crawl4ai/models.py b/crawl4ai/models.py
index ef9efc06..c1caff94 100644
--- a/crawl4ai/models.py
+++ b/crawl4ai/models.py
@@ -37,13 +37,33 @@ class CrawlStatus(Enum):
     FAILED = "FAILED"
 
 
+# @dataclass
+# class CrawlStats:
+#     task_id: str
+#     url: str
+#     status: CrawlStatus
+#     start_time: Optional[datetime] = None
+#     end_time: Optional[datetime] = None
+#     memory_usage: float = 0.0
+#     peak_memory: float = 0.0
+#     error_message: str = ""
+
+#     @property
+#     def duration(self) -> str:
+#         if not self.start_time:
+#             return "0:00"
+#         end = self.end_time or datetime.now()
+#         duration = end - self.start_time
+#         return str(timedelta(seconds=int(duration.total_seconds())))
+
+
 @dataclass
 class CrawlStats:
     task_id: str
     url: str
     status: CrawlStatus
-    start_time: Optional[datetime] = None
-    end_time: Optional[datetime] = None
+    start_time: Optional[Union[datetime, float]] = None
+    end_time: Optional[Union[datetime, float]] = None
     memory_usage: float = 0.0
     peak_memory: float = 0.0
     error_message: str = ""
@@ -52,11 +72,21 @@ class CrawlStats:
     def duration(self) -> str:
         if not self.start_time:
             return "0:00"
+            
+        # Convert start_time to datetime if it's a float
+        start = self.start_time
+        if isinstance(start, float):
+            start = datetime.fromtimestamp(start)
+            
+        # Get end time or use current time
         end = self.end_time or datetime.now()
-        duration = end - self.start_time
+        # Convert end_time to datetime if it's a float
+        if isinstance(end, float):
+            end = datetime.fromtimestamp(end)
+            
+        duration = end - start
         return str(timedelta(seconds=int(duration.total_seconds())))
 
-
 class DisplayMode(Enum):
     DETAILED = "DETAILED"
     AGGREGATED = "AGGREGATED"

From 1b72880007ade6c4658551e61c337a438f498086 Mon Sep 17 00:00:00 2001
From: UncleCode <unclecode@kidocode.com>
Date: Thu, 6 Mar 2025 20:32:32 +0800
Subject: [PATCH 3/4] chore(version): bump version to 0.5.0.post3

---
 crawl4ai/__version__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py
index e56feb58..1f2ef59b 100644
--- a/crawl4ai/__version__.py
+++ b/crawl4ai/__version__.py
@@ -1,2 +1,2 @@
 # crawl4ai/_version.py
-__version__ = "0.5.0.post2"
+__version__ = "0.5.0.post3"

From f78c46446ba647f92175329b55373987ec843e2a Mon Sep 17 00:00:00 2001
From: UncleCode <unclecode@kidocode.com>
Date: Thu, 6 Mar 2025 22:45:57 +0800
Subject: [PATCH 4/4] feat(deep-crawling): improve URL normalization and domain
 filtering

Enhance URL handling in deep crawling with:
- New URL normalization functions for consistent URL formats
- Improved domain filtering with subdomain support
- Added URLPatternFilter to public API
- Better URL deduplication in BFS strategy

These changes improve crawling accuracy and reduce duplicate visits.
---
 crawl4ai/__init__.py                   |  4 +-
 crawl4ai/__version__.py                |  2 +-
 crawl4ai/deep_crawling/bfs_strategy.py | 10 +++-
 crawl4ai/deep_crawling/filters.py      | 27 ++++++---
 crawl4ai/utils.py                      | 79 +++++++++++++++++++++++++-
 docs/snippets/deep_crawl/intro.py      | 78 +++++++++++++++++++++++++
 6 files changed, 186 insertions(+), 14 deletions(-)
 create mode 100644 docs/snippets/deep_crawl/intro.py

diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py
index 565bf93d..03cce871 100644
--- a/crawl4ai/__init__.py
+++ b/crawl4ai/__init__.py
@@ -48,8 +48,9 @@ from .deep_crawling import (
     DeepCrawlStrategy,
     BFSDeepCrawlStrategy,
     FilterChain,
-    ContentTypeFilter,
+    URLPatternFilter,
     DomainFilter,
+    ContentTypeFilter,
     URLFilter,
     FilterStats,
     SEOFilter,
@@ -75,6 +76,7 @@ __all__ = [
     "BestFirstCrawlingStrategy",
     "DFSDeepCrawlStrategy",
     "FilterChain",
+    "URLPatternFilter",
     "ContentTypeFilter",
     "DomainFilter",
     "FilterStats",
diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py
index 1f2ef59b..9477177b 100644
--- a/crawl4ai/__version__.py
+++ b/crawl4ai/__version__.py
@@ -1,2 +1,2 @@
 # crawl4ai/_version.py
-__version__ = "0.5.0.post3"
+__version__ = "0.5.0.post4"
diff --git a/crawl4ai/deep_crawling/bfs_strategy.py b/crawl4ai/deep_crawling/bfs_strategy.py
index 1ae4c4b9..54b72ea3 100644
--- a/crawl4ai/deep_crawling/bfs_strategy.py
+++ b/crawl4ai/deep_crawling/bfs_strategy.py
@@ -10,6 +10,7 @@ from .filters import FilterChain
 from .scorers import URLScorer
 from . import DeepCrawlStrategy  
 from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult
+from ..utils import normalize_url_for_deep_crawl, efficient_normalize_url_for_deep_crawl
 from math import inf as infinity
 
 class BFSDeepCrawlStrategy(DeepCrawlStrategy):
@@ -99,14 +100,17 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
         # First collect all valid links
         for link in links:
             url = link.get("href")
-            if url in visited:
+            # Normalize the URL (resolve it against source_url, drop the fragment) to avoid duplicate crawling
+            # base_url = url.split('#')[0] if url else url
+            base_url = normalize_url_for_deep_crawl(url, source_url)
+            if base_url in visited:
                 continue
             if not await self.can_process_url(url, next_depth):
                 self.stats.urls_skipped += 1
                 continue
 
             # Score the URL if a scorer is provided
-            score = self.url_scorer.score(url) if self.url_scorer else 0
+            score = self.url_scorer.score(base_url) if self.url_scorer else 0
             
             # Skip URLs with scores below the threshold
             if score < self.score_threshold:
@@ -114,7 +118,7 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
                 self.stats.urls_skipped += 1
                 continue
             
-            valid_links.append((url, score))
+            valid_links.append((base_url, score))
         
         # If we have more valid links than capacity, sort by score and take the top ones
         if len(valid_links) > remaining_capacity:
diff --git a/crawl4ai/deep_crawling/filters.py b/crawl4ai/deep_crawling/filters.py
index c8af3022..9fd8a72a 100644
--- a/crawl4ai/deep_crawling/filters.py
+++ b/crawl4ai/deep_crawling/filters.py
@@ -427,6 +427,11 @@ class DomainFilter(URLFilter):
         if isinstance(domains, str):
             return {domains.lower()}
         return {d.lower() for d in domains}
+    
+    @staticmethod
+    def _is_subdomain(domain: str, parent_domain: str) -> bool:
+        """Check if domain is a subdomain of parent_domain"""
+        return domain == parent_domain or domain.endswith(f".{parent_domain}")
 
     @staticmethod
     @lru_cache(maxsize=10000)
@@ -444,20 +449,26 @@ class DomainFilter(URLFilter):
 
         domain = self._extract_domain(url)
 
-        # Early return for blocked domains
-        if domain in self._blocked_domains:
-            self._update_stats(False)
-            return False
+        # Check for blocked domains, including subdomains
+        for blocked in self._blocked_domains:
+            if self._is_subdomain(domain, blocked):
+                self._update_stats(False)
+                return False
 
         # If no allowed domains specified, accept all non-blocked
         if self._allowed_domains is None:
             self._update_stats(True)
             return True
 
-        # Final allowed domains check
-        result = domain in self._allowed_domains
-        self._update_stats(result)
-        return result
+        # Check if domain matches any allowed domain (including subdomains)
+        for allowed in self._allowed_domains:
+            if self._is_subdomain(domain, allowed):
+                self._update_stats(True)
+                return True
+
+        # No matches found
+        self._update_stats(False)
+        return False
 
 
 class ContentRelevanceFilter(URLFilter):
diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
index b5a50eab..146ce06c 100644
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -1,5 +1,4 @@
 import time
-from urllib.parse import urlparse
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from bs4 import BeautifulSoup, Comment, element, Tag, NavigableString
 import json
@@ -33,6 +32,8 @@ import hashlib
 
 from urllib.robotparser import RobotFileParser
 import aiohttp
+from urllib.parse import urlparse, urlunparse
+from functools import lru_cache
 
 from packaging import version
 from . import __version__
@@ -1962,6 +1963,82 @@ def normalize_url(href, base_url):
     return normalized
 
 
+def normalize_url_for_deep_crawl(href, base_url):
+    """Normalize URLs to ensure consistent format"""
+    from urllib.parse import urljoin, urlparse, urlunparse, parse_qs, urlencode
+
+    # Handle None or empty values
+    if not href:
+        return None
+
+    # Use urljoin to handle relative URLs
+    full_url = urljoin(base_url, href.strip())
+    
+    # Parse the URL for normalization
+    parsed = urlparse(full_url)
+    
+    # Convert hostname to lowercase
+    netloc = parsed.netloc.lower()
+    
+    # Remove fragment entirely
+    fragment = ''
+    
+    # Normalize query parameters if needed
+    query = parsed.query
+    if query:
+        # Parse query parameters
+        params = parse_qs(query)
+        
+        # Remove tracking parameters (example - customize as needed)
+        tracking_params = ['utm_source', 'utm_medium', 'utm_campaign', 'ref', 'fbclid']
+        for param in tracking_params:
+            if param in params:
+                del params[param]
+                
+        # Rebuild query string from the remaining parameters (no sorting is applied)
+        query = urlencode(params, doseq=True) if params else ''
+    
+    # Build normalized URL
+    normalized = urlunparse((
+        parsed.scheme,
+        netloc,
+        parsed.path.rstrip('/') or '/',  # Normalize trailing slash
+        parsed.params,
+        query,
+        fragment
+    ))
+    
+    return normalized
+
+@lru_cache(maxsize=10000)
+def efficient_normalize_url_for_deep_crawl(href, base_url):
+    """Efficient URL normalization with proper parsing"""
+    from urllib.parse import urljoin
+    
+    if not href:
+        return None
+    
+    # Resolve relative URLs
+    full_url = urljoin(base_url, href.strip())
+    
+    # Use proper URL parsing
+    parsed = urlparse(full_url)
+    
+    # Only perform the most critical normalizations
+    # 1. Lowercase hostname
+    # 2. Remove fragment
+    normalized = urlunparse((
+        parsed.scheme,
+        parsed.netloc.lower(),
+        parsed.path,
+        parsed.params,
+        parsed.query,
+        ''  # Remove fragment
+    ))
+    
+    return normalized
+
+
 def normalize_url_tmp(href, base_url):
     """Normalize URLs to ensure consistent format"""
     # Extract protocol and domain from base URL
diff --git a/docs/snippets/deep_crawl/intro.py b/docs/snippets/deep_crawl/intro.py
new file mode 100644
index 00000000..d8fd2f94
--- /dev/null
+++ b/docs/snippets/deep_crawl/intro.py
@@ -0,0 +1,78 @@
+import asyncio
+from typing import List
+
+from crawl4ai import (
+    AsyncWebCrawler,
+    CrawlerRunConfig,
+    BFSDeepCrawlStrategy,
+    CrawlResult,
+    FilterChain,
+    DomainFilter,
+    URLPatternFilter,
+)
+
+# Import necessary classes from crawl4ai library:
+# - AsyncWebCrawler: The main class for web crawling.
+# - CrawlerRunConfig: Configuration class for crawler behavior.
+# - BFSDeepCrawlStrategy: Breadth-First Search deep crawling strategy.
+# - CrawlResult: Data model for individual crawl results.
+# - FilterChain: Used to chain multiple URL filters.
+# - URLPatternFilter: Filter URLs based on patterns.
+# FilterChain and URLPatternFilter can also be imported from crawl4ai.deep_crawling.filters,
+# but for simplicity and consistency this example imports them directly from crawl4ai, which re-exports them in __init__.py.
+
+async def basic_deep_crawl():
+    """
+    Performs a basic deep crawl starting from a seed URL, demonstrating:
+    - Breadth-First Search (BFS) deep crawling strategy.
+    - Filtering URLs based on URL patterns.
+    - Accessing crawl results and metadata.
+    """
+
+    # 1. Define URL Filters:
+    # Create a URLPatternFilter to include only URLs containing "text".
+    # This filter will be used to restrict crawling to URLs that are likely to contain textual content.
+    url_filter = URLPatternFilter(
+        patterns=[
+            "*text*", # Include URLs that contain "text" in their path or URL
+        ]
+    )
+
+    # Create a DomainFilter to allow only URLs from the "groq.com" domain and block URLs from the "example.com" domain.
+    # This filter will be used to restrict crawling to URLs within the "groq.com" domain.
+    domain_filter = DomainFilter(
+        allowed_domains=["groq.com"],
+        blocked_domains=["example.com"],
+    )
+
+    # 2. Configure CrawlerRunConfig for Deep Crawling:
+    # Configure CrawlerRunConfig to use BFSDeepCrawlStrategy for deep crawling.
+    config = CrawlerRunConfig(
+        deep_crawl_strategy=BFSDeepCrawlStrategy(
+            max_depth=2,  # Set the maximum depth of crawling to 2 levels from the start URL
+            max_pages=10, # Limit the total number of pages to crawl to 10, to prevent excessive crawling
+            include_external=False, # Set to False to only crawl URLs within the same domain as the start URL
+            filter_chain=FilterChain(filters=[url_filter, domain_filter]), # Apply the URLPatternFilter and DomainFilter to filter URLs during deep crawl
+        ),
+        verbose=True, # Enable verbose logging to see detailed output during crawling
+    )
+
+    # 3. Initialize and Run AsyncWebCrawler:
+    # Use AsyncWebCrawler as a context manager for automatic start and close.
+    async with AsyncWebCrawler() as crawler:
+        results: List[CrawlResult] = await crawler.arun(
+            # url="https://docs.crawl4ai.com", # Uncomment to use crawl4ai documentation as start URL
+            url="https://console.groq.com/docs", # Set the start URL for deep crawling to Groq documentation
+            config=config, # Pass the configured CrawlerRunConfig to arun method
+        )
+
+        # 4. Process and Print Crawl Results:
+        # Iterate through the list of CrawlResult objects returned by the deep crawl.
+        for result in results:
+            # Print the URL and its crawl depth from the metadata for each crawled URL.
+            print(f"URL: {result.url}, Depth: {result.metadata.get('depth', 0)}")
+
+
+if __name__ == "__main__":
+    import asyncio
+    asyncio.run(basic_deep_crawl())