refactor(deep-crawling): reorganize deep crawling strategies and add new implementations

Split deep crawling code into separate strategy files for better organization and maintainability. Added new Best-First (BFF) and DFS crawling strategies. Introduced a base strategy class and common types.

BREAKING CHANGE: Deep crawling implementation has been split into multiple files. Import paths for deep crawling strategies have changed.
This commit is contained in:
UncleCode
2025-02-05 22:50:39 +08:00
parent c308a794e8
commit a9415aaaf6
10 changed files with 769 additions and 214 deletions

View File

@@ -1,172 +1,11 @@
# deep_crawl_strategy.py
from abc import ABC, abstractmethod
from typing import AsyncGenerator, Optional, Set, List, Dict, TypeVar, Union
from ..models import CrawlResult
from typing import TYPE_CHECKING
from functools import wraps
from contextvars import ContextVar
if TYPE_CHECKING:
from ..async_configs import CrawlerRunConfig
from ..async_webcrawler import AsyncWebCrawler
from .bfs_strategy import BFSDeepCrawlStrategy
CrawlResultT = TypeVar("CrawlResultT", bound=CrawlResult)
# In batch mode we return List[CrawlResult] and in stream mode an AsyncGenerator.
RunManyReturn = Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
class DeepCrawlDecorator:
    """Decorator that adds deep crawling capability to arun method.

    Wraps AsyncWebCrawler.arun so that, when the run config carries a
    deep_crawl_strategy, the call is transparently routed to that strategy.
    A ContextVar guards against re-entry: pages fetched *by* the strategy
    fall through to the original arun.
    """
    # Per-context flag: True while a deep crawl is driving arun calls.
    deep_crawl_active = ContextVar("deep_crawl_active", default=False)

    def __init__(self, crawler: "AsyncWebCrawler"):
        # The crawler whose arun method is being decorated.
        self.crawler = crawler

    def __call__(self, original_arun):
        @wraps(original_arun)
        async def wrapped_arun(url: str, config: Optional["CrawlerRunConfig"] = None, **kwargs):
            # If deep crawling is already active, call the original method to avoid recursion.
            if config and config.deep_crawl_strategy and not self.deep_crawl_active.get():
                token = self.deep_crawl_active.set(True)
                # Await the arun call to get the actual result object.
                result_obj = await config.deep_crawl_strategy.arun(
                    crawler=self.crawler,
                    start_url=url,
                    config=config
                )
                if config.stream:
                    # Streaming mode: wrap the generator so the active flag is
                    # reset only after the caller finishes consuming results.
                    async def result_wrapper():
                        try:
                            async for result in result_obj:
                                yield result
                        finally:
                            self.deep_crawl_active.reset(token)
                    return result_wrapper()
                else:
                    # Batch mode: results are fully materialized; reset immediately.
                    try:
                        return result_obj
                    finally:
                        self.deep_crawl_active.reset(token)
            return await original_arun(url, config=config, **kwargs)
        return wrapped_arun
class DeepCrawlStrategy(ABC):
    """
    Abstract base class for deep crawling strategies.

    Core functions:
    - arun: Main entry point that returns an async generator of CrawlResults.
    - shutdown: Clean up resources.
    - can_process_url: Validate a URL and decide whether to process it.
    - _process_links: Extract and process links from a CrawlResult.
    """

    @abstractmethod
    async def _arun_batch(
        self,
        start_url: str,
        crawler: "AsyncWebCrawler",
        config: "CrawlerRunConfig",
    ) -> List[CrawlResult]:
        """
        Batch (non-streaming) mode:
        Processes one BFS level at a time, then returns all the results.
        """
        pass

    @abstractmethod
    async def _arun_stream(
        self,
        start_url: str,
        crawler: "AsyncWebCrawler",
        config: "CrawlerRunConfig",
    ) -> AsyncGenerator[CrawlResult, None]:
        """
        Streaming mode:
        Processes one BFS level at a time and yields results immediately as they arrive.
        """
        pass

    async def arun(
        self,
        start_url: str,
        crawler: "AsyncWebCrawler",
        config: Optional["CrawlerRunConfig"] = None,
    ) -> RunManyReturn:
        """
        Traverse the given URL using the specified crawler.

        Args:
            start_url (str): The URL from which to start crawling.
            crawler (AsyncWebCrawler): The crawler instance to use.
            config (Optional[CrawlerRunConfig]): Crawler configuration; required.

        Returns:
            List[CrawlResult] in batch mode, or an async generator yielding
            CrawlResults in streaming mode (config.stream is True).

        Raises:
            ValueError: If no configuration is supplied.
        """
        if config is None:
            raise ValueError("CrawlerRunConfig must be provided")
        if config.stream:
            # Streaming: hand back the un-awaited generator for the caller to iterate.
            return self._arun_stream(start_url, crawler, config)
        else:
            return await self._arun_batch(start_url, crawler, config)

    @abstractmethod
    async def shutdown(self) -> None:
        """
        Clean up resources used by the deep crawl strategy.
        """
        pass

    @abstractmethod
    async def can_process_url(self, url: str, depth: int) -> bool:
        """
        Validate the URL format and apply custom filtering logic.

        Args:
            url (str): The URL to validate.
            depth (int): The current depth in the crawl.

        Returns:
            bool: True if the URL should be processed, False otherwise.
        """
        pass

    @abstractmethod
    async def link_discovery(
        self,
        result: CrawlResult,
        source_url: str,
        current_depth: int,
        visited: Set[str],
        next_level: List[tuple],
        depths: Dict[str, int],
    ) -> None:
        """
        Extract and process links from the given crawl result.

        This method should:
        - Validate each extracted URL using can_process_url.
        - Optionally score URLs.
        - Append valid URLs (and their parent references) to the next_level list.
        - Update the depths dictionary with the new depth for each URL.

        Args:
            result (CrawlResult): The result from a crawl operation.
            source_url (str): The URL from which this result was obtained.
            current_depth (int): The depth at which the source URL was processed.
            visited (Set[str]): Set of already visited URLs.
            next_level (List[tuple]): List of tuples (url, parent_url) for the next BFS level.
            depths (Dict[str, int]): Mapping of URLs to their current depth.
        """
        pass
# deep_crawling/__init__.py
from .base_strategy import DeepCrawlDecorator, DeepCrawlStrategy
from .bfs_strategy import BFSDeepCrawlStrategy
from .bff_strategy import BestFirstCrawlingStrategy

# Public names exported by the deep_crawling package.
# BUG FIX: the list previously contained "BFSDeepCrawlStrategy" twice, the
# first occurrence missing its trailing comma; implicit string concatenation
# therefore produced the bogus entry
# "BFSDeepCrawlStrategyBFSDeepCrawlStrategy" in __all__.
__all__ = [
    "DeepCrawlDecorator",
    "DeepCrawlStrategy",
    "BFSDeepCrawlStrategy",
    "BestFirstCrawlingStrategy",
]

View File

@@ -0,0 +1,156 @@
from __future__ import annotations
from abc import ABC, abstractmethod
from typing import AsyncGenerator, Optional, Set, List, Dict
from functools import wraps
from contextvars import ContextVar
from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult, RunManyReturn
class DeepCrawlDecorator:
    """Decorator that adds deep crawling capability to arun method.

    When the supplied config carries a deep_crawl_strategy, the decorated
    arun delegates the whole traversal to that strategy. The ContextVar
    prevents recursion: fetches issued by the strategy itself see the flag
    set and use the original arun.
    """
    # Per-context flag: True while a deep crawl is in progress.
    deep_crawl_active = ContextVar("deep_crawl_active", default=False)

    def __init__(self, crawler: AsyncWebCrawler):
        # The crawler whose arun method is being decorated.
        self.crawler = crawler

    def __call__(self, original_arun):
        @wraps(original_arun)
        async def wrapped_arun(url: str, config: Optional[CrawlerRunConfig] = None, **kwargs):
            # If deep crawling is already active, call the original method to avoid recursion.
            if config and config.deep_crawl_strategy and not self.deep_crawl_active.get():
                token = self.deep_crawl_active.set(True)
                # Await the arun call to get the actual result object.
                result_obj = await config.deep_crawl_strategy.arun(
                    crawler=self.crawler,
                    start_url=url,
                    config=config
                )
                if config.stream:
                    # Streaming: defer resetting the flag until the consumer
                    # has exhausted (or abandoned) the generator.
                    async def result_wrapper():
                        try:
                            async for result in result_obj:
                                yield result
                        finally:
                            self.deep_crawl_active.reset(token)
                    return result_wrapper()
                else:
                    # Batch: the list is already complete; reset right away.
                    try:
                        return result_obj
                    finally:
                        self.deep_crawl_active.reset(token)
            return await original_arun(url, config=config, **kwargs)
        return wrapped_arun
class DeepCrawlStrategy(ABC):
    """
    Contract shared by every deep crawling strategy.

    A strategy traverses outward from a start URL in one of two execution
    modes: batch (collect everything, return a list) or streaming (yield
    results as they arrive). Subclasses supply the traversal order; this
    base class only routes between the two modes and defines the hooks:

    - arun: public entry point, dispatching on config.stream.
    - shutdown: release resources.
    - can_process_url: accept/reject a candidate URL.
    - link_discovery: harvest follow-up links from a CrawlResult.
    """

    @abstractmethod
    async def _arun_batch(
        self,
        start_url: str,
        crawler: AsyncWebCrawler,
        config: CrawlerRunConfig,
    ) -> List[CrawlResult]:
        """Collect every CrawlResult of the traversal and return them as one list."""
        ...

    @abstractmethod
    async def _arun_stream(
        self,
        start_url: str,
        crawler: AsyncWebCrawler,
        config: CrawlerRunConfig,
    ) -> AsyncGenerator[CrawlResult, None]:
        """Yield CrawlResults one by one as the traversal produces them."""
        ...

    async def arun(
        self,
        start_url: str,
        crawler: AsyncWebCrawler,
        config: Optional[CrawlerRunConfig] = None,
    ) -> RunManyReturn:
        """
        Traverse from *start_url* with *crawler* under *config*.

        Args:
            start_url: The URL from which to start crawling.
            crawler: The crawler instance to use.
            config: Crawler configuration; required.

        Returns:
            A list of CrawlResults in batch mode, or an async generator of
            CrawlResults when config.stream is set.

        Raises:
            ValueError: If no configuration is supplied.
        """
        if config is None:
            raise ValueError("CrawlerRunConfig must be provided")
        # Batch mode is awaited here; streaming hands the un-awaited
        # generator back for the caller to iterate.
        if not config.stream:
            return await self._arun_batch(start_url, crawler, config)
        return self._arun_stream(start_url, crawler, config)

    @abstractmethod
    async def shutdown(self) -> None:
        """Release any resources held by the strategy."""
        ...

    @abstractmethod
    async def can_process_url(self, url: str, depth: int) -> bool:
        """
        Decide whether *url* at *depth* should be crawled.

        Args:
            url: The URL to validate.
            depth: The current depth in the crawl.

        Returns:
            True if the URL should be processed, False otherwise.
        """
        ...

    @abstractmethod
    async def link_discovery(
        self,
        result: CrawlResult,
        source_url: str,
        current_depth: int,
        visited: Set[str],
        next_level: List[tuple],
        depths: Dict[str, int],
    ) -> None:
        """
        Harvest links out of *result* and queue the ones worth visiting.

        Implementations validate each candidate with can_process_url, may
        score it, append (url, parent_url) pairs to *next_level*, and record
        each candidate's depth in *depths*.

        Args:
            result: The result from a crawl operation.
            source_url: The URL from which this result was obtained.
            current_depth: The depth at which the source URL was processed.
            visited: Set of already visited URLs.
            next_level: Accumulator of (url, parent_url) tuples for the next level.
            depths: Mapping of URLs to their current depth.
        """
        ...

View File

@@ -0,0 +1,221 @@
# best_first_crawling_strategy.py
import asyncio
import logging
from datetime import datetime
from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple
from urllib.parse import urlparse
from ..models import TraversalStats
from .filters import FastFilterChain
from .scorers import FastURLScorer
from . import DeepCrawlStrategy
from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult, RunManyReturn
# Configurable batch size for processing items from the priority queue
BATCH_SIZE = 10
class BestFirstCrawlingStrategy(DeepCrawlStrategy):
"""
Best-First Crawling Strategy using a priority queue.
This strategy prioritizes URLs based on their score, ensuring that higher-value
pages are crawled first. It reimplements the core traversal loop to use a priority
queue while keeping URL validation and link discovery consistent with our design.
Core methods:
- arun: Returns either a list (batch mode) or an async generator (stream mode).
- _arun_best_first: Core generator that uses a priority queue to yield CrawlResults.
- can_process_url: Validates URLs and applies filtering (inherited behavior).
- link_discovery: Extracts and validates links from a CrawlResult.
"""
def __init__(
self,
max_depth: int,
filter_chain: FastFilterChain = FastFilterChain(),
url_scorer: Optional[FastURLScorer] = None,
include_external: bool = False,
logger: Optional[logging.Logger] = None,
):
self.max_depth = max_depth
self.filter_chain = filter_chain
self.url_scorer = url_scorer
self.include_external = include_external
self.logger = logger or logging.getLogger(__name__)
self.stats = TraversalStats(start_time=datetime.now())
self._cancel_event = asyncio.Event()
async def can_process_url(self, url: str, depth: int) -> bool:
"""
Validate the URL format and apply filtering.
For the starting URL (depth 0), filtering is bypassed.
"""
try:
parsed = urlparse(url)
if not parsed.scheme or not parsed.netloc:
raise ValueError("Missing scheme or netloc")
if parsed.scheme not in ("http", "https"):
raise ValueError("Invalid scheme")
if "." not in parsed.netloc:
raise ValueError("Invalid domain")
except Exception as e:
self.logger.warning(f"Invalid URL: {url}, error: {e}")
return False
if depth != 0 and not self.filter_chain.apply(url):
return False
return True
async def link_discovery(
self,
result: CrawlResult,
source_url: str,
current_depth: int,
visited: Set[str],
next_links: List[Tuple[str, Optional[str]]],
depths: Dict[str, int],
) -> None:
"""
Extract links from the crawl result, validate them, and append new URLs
(with their parent references) to next_links.
Also updates the depths dictionary.
"""
new_depth = current_depth + 1
if new_depth > self.max_depth:
return
# Retrieve internal links; include external links if enabled.
links = result.links.get("internal", [])
if self.include_external:
links += result.links.get("external", [])
for link in links:
url = link.get("href")
if url in visited:
continue
if not await self.can_process_url(url, new_depth):
self.stats.urls_skipped += 1
continue
# Record the new depth.
depths[url] = new_depth
next_links.append((url, source_url))
async def _arun_best_first(
self,
start_url: str,
crawler: AsyncWebCrawler,
config: CrawlerRunConfig,
) -> AsyncGenerator[CrawlResult, None]:
"""
Core best-first crawl method using a priority queue.
The queue items are tuples of (score, depth, url, parent_url). Lower scores
are treated as higher priority. URLs are processed in batches for efficiency.
"""
queue: asyncio.PriorityQueue = asyncio.PriorityQueue()
# Push the initial URL with score 0 and depth 0.
await queue.put((0, 0, start_url, None))
visited: Set[str] = set()
depths: Dict[str, int] = {start_url: 0}
while not queue.empty() and not self._cancel_event.is_set():
batch: List[Tuple[float, int, str, Optional[str]]] = []
# Retrieve up to BATCH_SIZE items from the priority queue.
for _ in range(BATCH_SIZE):
if queue.empty():
break
item = await queue.get()
score, depth, url, parent_url = item
if url in visited:
continue
visited.add(url)
batch.append(item)
if not batch:
continue
# Process the current batch of URLs.
urls = [item[2] for item in batch]
batch_config = config.clone(deep_crawl_strategy=None, stream=True)
stream_gen = await crawler.arun_many(urls=urls, config=batch_config)
async for result in stream_gen:
result_url = result.url
# Find the corresponding tuple from the batch.
corresponding = next((item for item in batch if item[2] == result_url), None)
if not corresponding:
continue
score, depth, url, parent_url = corresponding
result.metadata = result.metadata or {}
result.metadata["depth"] = depth
result.metadata["parent_url"] = parent_url
result.metadata["score"] = score
yield result
# Discover new links from this result.
new_links: List[Tuple[str, Optional[str]]] = []
await self.link_discovery(result, result_url, depth, visited, new_links, depths)
for new_url, new_parent in new_links:
new_depth = depths.get(new_url, depth + 1)
new_score = self.url_scorer.score(new_url) if self.url_scorer else 0
await queue.put((new_score, new_depth, new_url, new_parent))
# End of crawl.
async def _arun_batch(
self,
start_url: str,
crawler: AsyncWebCrawler,
config: CrawlerRunConfig,
) -> List[CrawlResult]:
"""
Best-first crawl in batch mode.
Aggregates all CrawlResults into a list.
"""
results: List[CrawlResult] = []
async for result in self._arun_best_first(start_url, crawler, config):
results.append(result)
return results
async def _arun_stream(
self,
start_url: str,
crawler: AsyncWebCrawler,
config: CrawlerRunConfig,
) -> AsyncGenerator[CrawlResult, None]:
"""
Best-first crawl in streaming mode.
Yields CrawlResults as they become available.
"""
async for result in self._arun_best_first(start_url, crawler, config):
yield result
async def arun(
self,
start_url: str,
crawler: AsyncWebCrawler,
config: Optional[CrawlerRunConfig] = None,
) -> "RunManyReturn":
"""
Main entry point for best-first crawling.
Returns either a list (batch mode) or an async generator (stream mode)
of CrawlResults.
"""
if config is None:
raise ValueError("CrawlerRunConfig must be provided")
if config.stream:
return self._arun_stream(start_url, crawler, config)
else:
return await self._arun_batch(start_url, crawler, config)
async def shutdown(self) -> None:
"""
Signal cancellation and clean up resources.
"""
self._cancel_event.set()
self.stats.end_time = datetime.now()

View File

@@ -5,15 +5,11 @@ from datetime import datetime
from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple
from urllib.parse import urlparse
from ..models import CrawlResult, TraversalStats
from ..models import TraversalStats
from .filters import FastFilterChain
from .scorers import FastURLScorer
from typing import TYPE_CHECKING
from . import DeepCrawlStrategy
if TYPE_CHECKING:
from ..async_configs import CrawlerRunConfig
from ..async_webcrawler import AsyncWebCrawler
from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult
class BFSDeepCrawlStrategy(DeepCrawlStrategy):
"""
@@ -107,8 +103,8 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
async def _arun_batch(
self,
start_url: str,
crawler: "AsyncWebCrawler",
config: "CrawlerRunConfig",
crawler: AsyncWebCrawler,
config: CrawlerRunConfig,
) -> List[CrawlResult]:
"""
Batch (non-streaming) mode:
@@ -148,8 +144,8 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
async def _arun_stream(
self,
start_url: str,
crawler: "AsyncWebCrawler",
config: "CrawlerRunConfig",
crawler: AsyncWebCrawler,
config: CrawlerRunConfig,
) -> AsyncGenerator[CrawlResult, None]:
"""
Streaming mode:

View File

@@ -1,12 +1,12 @@
from __future__ import annotations
# I just got crazy, trying to write K&R C but in Python. Right now I feel like I'm in a quantum state.
# I probably won't use this; I just want to leave it here. A century later, the future human race will be like, "WTF?"
# from typing import TYPE_CHECKING
# ------ Imports That Will Make You Question Reality ------ #
from functools import wraps
from contextvars import ContextVar
import inspect
from httpx import get
from crawl4ai import CacheMode
from crawl4ai.async_configs import CrawlerRunConfig
from crawl4ai.models import CrawlResult, TraversalStats

View File

@@ -0,0 +1,95 @@
# dfs_deep_crawl_strategy.py
from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple
from ..models import CrawlResult
from .bfs_strategy import BFSDeepCrawlStrategy # Inherit common logic: can_process_url, link_discovery, etc.
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from ..async_configs import CrawlerRunConfig
from ..async_webcrawler import AsyncWebCrawler
class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
    """
    Depth-First Search (DFS) deep crawling strategy.

    Reuses BFSDeepCrawlStrategy's URL validation and link discovery, but
    drives the traversal with a LIFO stack so the most recently discovered
    links are crawled before older ones.
    """

    async def _arun_batch(
        self,
        start_url: str,
        crawler: "AsyncWebCrawler",
        config: "CrawlerRunConfig",
    ) -> List[CrawlResult]:
        """Run a DFS traversal and return all CrawlResults as one list."""
        seen: Set[str] = set()
        depth_of: Dict[str, int] = {start_url: 0}
        # Frontier entries are (url, parent_url, depth); pop() gives LIFO order.
        frontier: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
        collected: List[CrawlResult] = []

        while frontier and not self._cancel_event.is_set():
            url, parent, depth = frontier.pop()
            if url in seen or depth > self.max_depth:
                continue
            seen.add(url)

            # Clone config so child fetches don't recurse into deep crawling.
            child_config = config.clone(deep_crawl_strategy=None, stream=False)
            for result in await crawler.arun_many(urls=[url], config=child_config):
                result.metadata = result.metadata or {}
                result.metadata["depth"] = depth
                result.metadata["parent_url"] = parent
                if self.url_scorer:
                    result.metadata["score"] = self.url_scorer.score(url)
                collected.append(result)

                discovered: List[Tuple[str, Optional[str]]] = []
                await self.link_discovery(result, url, depth, seen, discovered, depth_of)
                # Reverse so the first link discovered is popped (crawled) next.
                for child_url, child_parent in reversed(discovered):
                    frontier.append((child_url, child_parent, depth_of.get(child_url, depth + 1)))
        return collected

    async def _arun_stream(
        self,
        start_url: str,
        crawler: "AsyncWebCrawler",
        config: "CrawlerRunConfig",
    ) -> AsyncGenerator[CrawlResult, None]:
        """Run a DFS traversal, yielding each CrawlResult as soon as it arrives."""
        seen: Set[str] = set()
        depth_of: Dict[str, int] = {start_url: 0}
        frontier: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]

        while frontier and not self._cancel_event.is_set():
            url, parent, depth = frontier.pop()
            if url in seen or depth > self.max_depth:
                continue
            seen.add(url)

            child_config = config.clone(deep_crawl_strategy=None, stream=True)
            result_stream = await crawler.arun_many(urls=[url], config=child_config)
            async for result in result_stream:
                result.metadata = result.metadata or {}
                result.metadata["depth"] = depth
                result.metadata["parent_url"] = parent
                if self.url_scorer:
                    result.metadata["score"] = self.url_scorer.score(url)
                yield result

                discovered: List[Tuple[str, Optional[str]]] = []
                await self.link_discovery(result, url, depth, seen, discovered, depth_of)
                for child_url, child_parent in reversed(discovered):
                    frontier.append((child_url, child_parent, depth_of.get(child_url, depth + 1)))

View File

@@ -1,16 +1,18 @@
from abc import ABC, abstractmethod
from typing import List, Pattern, Set, Union, FrozenSet
import re, time
from typing import List, Pattern, Set, Union
from urllib.parse import urlparse
from array import array
import re
import logging
from functools import lru_cache
import fnmatch
from dataclasses import dataclass
from typing import ClassVar
import weakref
import mimetypes
import math
from collections import defaultdict
from typing import Dict
from ..utils import HeadPeekr
@dataclass
class FilterStats:
@@ -624,6 +626,163 @@ class FastDomainFilter(FastURLFilter):
return result
class ContentRelevanceFilter(URLFilter):
    """BM25-based relevance filter using head section content"""
    __slots__ = ('query_terms', 'threshold', 'k1', 'b', 'avgdl')

    def __init__(self, query: str, threshold: float,
                 k1: float = 1.2, b: float = 0.75, avgdl: int = 1000):
        """
        Args:
            query: Free-text query the page head is scored against.
            threshold: Minimum BM25 score required for a URL to pass.
            k1: TF saturation parameter.
            b: Length normalization parameter.
            avgdl: Average document length (empirical value).
        """
        super().__init__(name="BM25RelevanceFilter")
        self.query_terms = self._tokenize(query)
        self.threshold = threshold
        self.k1 = k1
        self.b = b
        self.avgdl = avgdl

    async def apply(self, url: str) -> bool:
        """Fetch only the page head; pass iff its BM25 score reaches threshold."""
        head_content = await HeadPeekr.peek_html(url)
        if not head_content:
            # Unreachable head counts as a rejection.
            self._update_stats(False)
            return False
        # Field extraction with weighting
        title = HeadPeekr.get_title(head_content) or ""
        meta = HeadPeekr.extract_meta_tags(head_content)
        document = self._build_document({'title': title, 'meta': meta})
        verdict = self._bm25(document) >= self.threshold
        self._update_stats(verdict)
        return verdict

    def _build_document(self, fields: Dict) -> str:
        """Weighted document construction (string repetition acts as field weight)."""
        meta = fields['meta']
        weighted_parts = [
            fields['title'] * 3,                 # Title weight
            meta.get('description', '') * 2,
            meta.get('keywords', ''),
            ' '.join(meta.values()),
        ]
        return ' '.join(weighted_parts)

    def _tokenize(self, text: str) -> List[str]:
        """Fast case-insensitive tokenization"""
        return text.lower().split()

    def _bm25(self, document: str) -> float:
        """Optimized BM25 implementation for head sections"""
        doc_terms = self._tokenize(document)
        doc_len = len(doc_terms)
        tf = defaultdict(int)
        for term in doc_terms:
            tf[term] += 1
        score = 0.0
        for term in set(self.query_terms):
            term_freq = tf[term]
            idf = math.log((1 + 1) / (term_freq + 0.5) + 1)  # Simplified IDF
            numerator = term_freq * (self.k1 + 1)
            denominator = term_freq + self.k1 * (1 - self.b +
                                                 self.b * (doc_len / self.avgdl))
            score += idf * (numerator / denominator)
        return score
class SEOFilter(URLFilter):
    """Quantitative SEO quality assessment filter using head section analysis"""
    __slots__ = ('threshold', '_weights', '_kw_patterns')

    # Based on SEMrush/Google ranking factors research
    DEFAULT_WEIGHTS = {
        'title_length': 0.15,
        'title_kw': 0.18,
        'meta_description': 0.12,
        'canonical': 0.10,
        'robot_ok': 0.20,  # Most critical factor
        'schema_org': 0.10,
        'url_quality': 0.15
    }

    def __init__(self, threshold: float = 0.65,
                 keywords: List[str] = None,
                 weights: Dict[str, float] = None):
        """
        Args:
            threshold: Minimum weighted score (0..1) a page must reach to pass.
            keywords: Optional keywords to look for in the <title>.
            weights: Optional per-factor weights overriding DEFAULT_WEIGHTS.
        """
        super().__init__(name="SEOFilter")
        self.threshold = threshold
        self._weights = weights or self.DEFAULT_WEIGHTS
        # One compiled, case-insensitive alternation over all keywords.
        self._kw_patterns = re.compile(
            r'\b({})\b'.format('|'.join(map(re.escape, keywords or []))),
            re.I
        ) if keywords else None

    async def apply(self, url: str) -> bool:
        """Score the page head on each factor; pass iff the weighted sum reaches threshold."""
        head_content = await HeadPeekr.peek_html(url)
        if not head_content:
            # No head could be fetched: treat as failing the filter.
            self._update_stats(False)
            return False
        meta = HeadPeekr.extract_meta_tags(head_content)
        title = HeadPeekr.get_title(head_content) or ''
        parsed_url = urlparse(url)
        scores = {
            'title_length': self._score_title_length(title),
            'title_kw': self._score_keyword_presence(title),
            'meta_description': self._score_meta_description(meta.get('description', '')),
            'canonical': self._score_canonical(meta.get('canonical'), url),
            'robot_ok': 1.0 if 'noindex' not in meta.get('robots', '') else 0.0,
            'schema_org': self._score_schema_org(head_content),
            'url_quality': self._score_url_quality(parsed_url)
        }
        total_score = sum(weight * scores[factor]
                          for factor, weight in self._weights.items())
        decision = total_score >= self.threshold
        self._update_stats(decision)
        return decision

    def _score_title_length(self, title: str) -> float:
        # 50-60 characters scores best; nearby bands get partial credit.
        length = len(title)
        if 50 <= length <= 60: return 1.0
        if 40 <= length < 50 or 60 < length <= 70: return 0.7
        return 0.3  # Poor length

    def _score_keyword_presence(self, text: str) -> float:
        # No keywords configured -> this factor contributes nothing.
        if not self._kw_patterns: return 0.0
        matches = len(self._kw_patterns.findall(text))
        return min(matches * 0.3, 1.0)  # Max 3 matches

    def _score_meta_description(self, desc: str) -> float:
        length = len(desc)
        if 140 <= length <= 160: return 1.0
        return 0.5 if 120 <= length <= 200 else 0.2

    def _score_canonical(self, canonical: str, original: str) -> float:
        # NOTE(review): canonical URLs normally come from <link rel="canonical">,
        # not a meta tag — confirm extract_meta_tags actually surfaces this key.
        if not canonical: return 0.5  # Neutral score
        return 1.0 if canonical == original else 0.2

    def _score_schema_org(self, html: str) -> float:
        # Detect any schema.org markup in head
        return 1.0 if re.search(r'<script[^>]+type=["\']application/ld\+json', html) else 0.0

    def _score_url_quality(self, parsed_url) -> float:
        score = 1.0
        path = parsed_url.path.lower()
        # Penalty factors
        if len(path) > 80: score *= 0.7
        if re.search(r'\d{4}', path): score *= 0.8  # Numbers in path
        if parsed_url.query: score *= 0.6  # URL parameters
        if '_' in path: score *= 0.9  # Underscores vs hyphens
        return score
def create_fast_filter_chain() -> FastFilterChain:
"""Create an optimized filter chain with filters ordered by rejection rate"""
return FastFilterChain(
@@ -647,8 +806,6 @@ def create_fast_filter_chain() -> FastFilterChain:
def run_performance_test():
import time
import random
from itertools import cycle
# Generate test URLs
base_urls = [
@@ -863,6 +1020,20 @@ def test_pattern_filter():
else:
print("\n❌ Some accuracy tests failed!")
async def test_content_relevancy_filter():
    """Ad-hoc smoke test: run the BM25 relevance filter over two live URLs."""
    # Initialize with query and threshold (tune based on your corpus)
    relevance_filter = ContentRelevanceFilter(query="machine learning", threshold=2.5)
    sample_urls = ("https://example.com", "https://example.com/blog/post-123")
    for url in sample_urls:
        is_relevant = await relevance_filter.apply(url)
        if is_relevant:
            print(f"✅ Relevant: {url}")
        else:
            print(f"❌ Not Relevant: {url}")
if __name__ == "__main__":
    # Manual entry point: run the micro-benchmark when executed as a script.
    run_performance_test()
    # test_pattern_filter()

14
crawl4ai/types.py Normal file
View File

@@ -0,0 +1,14 @@
from typing import TYPE_CHECKING, Union

# Annotation-only aliases for the heavy crawl4ai types.
# Wrapping the quoted name in Union keeps it a lazy forward reference at
# runtime, so importing this module never pulls in (or cycles with) the real
# crawler modules.
# NOTE(review): these aliases are for type hints only — presumably nothing
# uses them with isinstance(); confirm before relying on them at runtime.
AsyncWebCrawler = Union['AsyncWebCrawlerType']  # Note the string literal
CrawlerRunConfig = Union['CrawlerRunConfigType']
CrawlResult = Union['CrawlResultType']
RunManyReturn = Union['RunManyReturnType']

if TYPE_CHECKING:
    # Real types are imported only for static analyzers, never at runtime.
    from . import (
        AsyncWebCrawler as AsyncWebCrawlerType,
        CrawlerRunConfig as CrawlerRunConfigType,
        CrawlResult as CrawlResultType,
        RunManyReturn as RunManyReturnType,
    )

View File

@@ -1,22 +1,25 @@
from ast import Call
import time
from urllib.parse import urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed
from bs4 import BeautifulSoup, Comment, element, Tag, NavigableString
import json
import html
import lxml
import re
import os
import platform
from .prompts import PROMPT_EXTRACT_BLOCKS
from array import array
from .config import *
from .html2text import html2text, CustomHTML2Text
# from .config import *
from .config import MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, IMAGE_SCORE_THRESHOLD, DEFAULT_PROVIDER, PROVIDER_MODELS
import httpx
from socket import gaierror
from pathlib import Path
from typing import Dict, Any, List, Tuple, Union, Optional, Callable
from typing import Dict, Any, List, Optional, Callable
from urllib.parse import urljoin
import requests
from requests.exceptions import InvalidSchema
from typing import Dict, Any
import xxhash
from colorama import Fore, Style, init
import textwrap
@@ -24,20 +27,20 @@ import cProfile
import pstats
from functools import wraps
import asyncio
from lxml import html, etree
import sqlite3
import hashlib
from urllib.parse import urljoin, urlparse
from urllib.robotparser import RobotFileParser
import aiohttp
from pathlib import Path
from packaging import version
from . import __version__
from typing import Sequence, List
from array import array
from typing import Sequence
from itertools import chain
from collections import deque
from typing import Callable, Generator, Iterable, List, Optional
from typing import Generator, Iterable
def chunk_documents(
documents: Iterable[str],
@@ -194,7 +197,7 @@ class VersionManager:
return None
try:
return version.parse(self.version_file.read_text().strip())
except:
except Exception as _ex:
return None
def update_version(self):
@@ -286,7 +289,7 @@ class RobotsParser:
domain = parsed.netloc
if not domain:
return True
except:
except Exception as _ex:
return True
# Fast path - check cache first
@@ -306,7 +309,7 @@ class RobotsParser:
self._cache_rules(domain, rules)
else:
return True
except:
except Exception as _ex:
# On any error (timeout, connection failed, etc), allow access
return True
@@ -1374,7 +1377,7 @@ def get_content_of_website_optimized(
src = img.get("src", "")
if base64_pattern.match(src):
img["src"] = base64_pattern.sub("", src)
except:
except Exception as _ex:
pass
cleaned_html = str(body).replace("\n\n", "\n").replace(" ", " ")
@@ -1412,7 +1415,7 @@ def extract_metadata_using_lxml(html, doc=None):
if doc is None:
try:
doc = lhtml.document_fromstring(html)
doc = lxml.html.document_fromstring(html)
except Exception:
return {}
@@ -1728,10 +1731,10 @@ def extract_blocks_batch(batch_data, provider="groq/llama3-70b-8192", api_token=
messages = []
for url, html in batch_data:
for url, _html in batch_data:
variable_values = {
"URL": url,
"HTML": html,
"HTML": _html,
}
prompt_with_variables = PROMPT_EXTRACT_BLOCKS
@@ -1913,7 +1916,7 @@ def fast_format_html(html_string):
indent = 0
indent_str = " " # Two spaces for indentation
formatted = []
in_content = False
# in_content = False
# Split by < and > to separate tags and content
parts = html_string.replace(">", ">\n").replace("<", "\n<").split("\n")
@@ -2466,17 +2469,74 @@ def truncate(value, threshold):
def optimize_html(html_str, threshold=200):
root = html.fromstring(html_str)
for element in root.iter():
for _element in root.iter():
# Process attributes
for attr in list(element.attrib):
element.attrib[attr] = truncate(element.attrib[attr], threshold)
for attr in list(_element.attrib):
_element.attrib[attr] = truncate(_element.attrib[attr], threshold)
# Process text content
if element.text and len(element.text) > threshold:
element.text = truncate(element.text, threshold)
if _element.text and len(_element.text) > threshold:
_element.text = truncate(_element.text, threshold)
# Process tail text
if element.tail and len(element.tail) > threshold:
element.tail = truncate(element.tail, threshold)
if _element.tail and len(_element.tail) > threshold:
_element.tail = truncate(_element.tail, threshold)
return html.tostring(root, encoding='unicode', pretty_print=False)
return html.tostring(root, encoding='unicode', pretty_print=False)
class HeadPeekr:
    """Lightweight helpers to fetch and mine just the <head> of a page."""

    @staticmethod
    async def fetch_head_section(url, timeout=0.3):
        """
        Fetch *url* and return its raw <head> bytes (terminated with b"</head>"),
        or None on any HTTP/DNS error. Downloads stop as soon as </head> is seen.
        """
        headers = {
            "User-Agent": "Mozilla/5.0 (compatible; CrawlBot/1.0)",
            "Accept": "text/html",
            "Connection": "close"  # Force close after response
        }
        try:
            async with httpx.AsyncClient(timeout=timeout) as client:
                response = await client.get(url, headers=headers, follow_redirects=True)
                # Handle redirects explicitly by using the final URL.
                # NOTE(review): follow_redirects=True should already have landed
                # on the final URL, so this second GET looks redundant — confirm
                # whether the re-fetch is intentional.
                if response.url != url:
                    url = str(response.url)
                    response = await client.get(url, headers=headers)
                content = b""
                async for chunk in response.aiter_bytes():
                    content += chunk
                    if b"</head>" in content:
                        break  # Stop after detecting </head>
                return content.split(b"</head>")[0] + b"</head>"
        except (httpx.HTTPError, gaierror):
            return None

    @staticmethod
    async def peek_html(url, timeout=0.3):
        """Return the decoded <head> section of *url*, or None if unavailable."""
        head_section = await HeadPeekr.fetch_head_section(url, timeout=timeout)
        if head_section:
            return head_section.decode("utf-8", errors="ignore")
        return None

    @staticmethod
    def extract_meta_tags(head_content: str):
        """
        Parse <meta> tags out of *head_content* with regex.

        Returns:
            dict mapping each tag's name (or property) attribute to its
            content attribute; tags missing either part are skipped.
        """
        meta_tags = {}
        # Find all meta tags
        meta_pattern = r'<meta[^>]+>'
        for meta_tag in re.finditer(meta_pattern, head_content):
            tag = meta_tag.group(0)
            # Extract name/property and content
            name_match = re.search(r'name=["\'](.*?)["\']', tag)
            property_match = re.search(r'property=["\'](.*?)["\']', tag)
            content_match = re.search(r'content=["\'](.*?)["\']', tag)
            if content_match and (name_match or property_match):
                key = name_match.group(1) if name_match else property_match.group(1)
                meta_tags[key] = content_match.group(1)
        return meta_tags

    # BUG FIX: get_title was defined without @staticmethod while being called
    # as HeadPeekr.get_title(...); instance access would have mis-bound the
    # first argument as self. Now consistent with its sibling helpers.
    @staticmethod
    def get_title(head_content: str):
        """Return the text inside <title>...</title>, or None if absent."""
        title_match = re.search(r'<title>(.*?)</title>', head_content, re.IGNORECASE | re.DOTALL)
        return title_match.group(1) if title_match else None

View File

@@ -2,8 +2,10 @@ import asyncio
import time
from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
from crawl4ai.deep_crawling.bfs_strategy import BFSDeepCrawlStrategy
from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
# from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
async def main():
@@ -15,7 +17,8 @@ async def main():
),
stream=False,
verbose=True,
cache_mode=CacheMode.BYPASS
cache_mode=CacheMode.BYPASS,
scraping_strategy=LXMLWebScrapingStrategy()
)
async with AsyncWebCrawler() as crawler: