Compare commits

...

57 Commits

Author SHA1 Message Date
Aravind Karnam
f7ce2d42c9 feat: Add deep crawl capabilities to arun_many function 2025-01-30 17:49:58 +05:30
Aravind Karnam
f6edb8342e Refactor: remove the old deep_crawl method 2025-01-30 16:22:41 +05:30
Aravind Karnam
ca3f0126d3 Refactor: Moved deep_crawl_strategy inside crawler run config 2025-01-30 16:18:15 +05:30
Aravind Karnam
858c18df39 fix: removed child_urls from CrawlResult 2025-01-29 18:08:34 +05:30
Aravind Karnam
2c8f2ec5a6 Refactor: Renamed scrape to traverse and deep_crawl in a few sections where it applies 2025-01-29 16:24:11 +05:30
Aravind Karnam
9ef43bc5f0 Refactor: Move adeep_crawl to be a method of the crawler itself. Create attributes in CrawlResult to reconstruct the tree once deep crawling is completed 2025-01-29 15:58:21 +05:30
Aravind Karnam
84ffdaab9a Refactor: Move adeep_crawl to be a method of the crawler itself. Create attributes in CrawlResult to reconstruct the tree once deep crawling is completed 2025-01-29 13:06:09 +05:30
Aravind Karnam
78223bc847 feat: create ScraperPageResult model to attach score and depth attributes to yielded/returned crawl results 2025-01-28 16:47:30 +05:30
Aravind Karnam
60ce8bbf55 Merge: with v-0.4.3b 2025-01-28 12:59:53 +05:30
Aravind Karnam
85847ff13f feat:
1. Make active_crawls a dict instead of a set and remove the jobs array, enabling efficient lookup and storage of active crawls and crawl control.
2. Put a lock on active_crawls so that simultaneous push and pop by coroutines don't cause a race condition.
3. Move the depth-check logic outside the child-link for loop, as source_url doesn't change within the loop.
2025-01-28 12:39:45 +05:30
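The lock-guarded dict described in points 1 and 2 above can be sketched in isolation (a hypothetical minimal example, not the actual crawler code; the URLs and field names are placeholders):

```python
import asyncio

# active_crawls as a dict guarded by an asyncio.Lock, so concurrent
# coroutines pushing and popping entries cannot race each other.
async def demo():
    active_crawls = {}
    lock = asyncio.Lock()

    async def start(url, depth):
        async with lock:  # serialize pushes
            active_crawls[url] = {"depth": depth}

    async def finish(url):
        async with lock:  # serialize pops
            return active_crawls.pop(url, None)

    await asyncio.gather(start("https://a.example", 0),
                         start("https://b.example", 1))
    info = await finish("https://a.example")
    return info, sorted(active_crawls)

print(asyncio.run(demo()))  # → ({'depth': 0}, ['https://b.example'])
```

A dict also carries per-crawl metadata (depth, score) that a set cannot, which is what makes the O(1) lookup useful here.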
Aravind Karnam
f34b4878cf fix: code formatting 2025-01-28 10:00:01 +05:30
Aravind Karnam
d9324e3454 fix: Move the creation of crawler outside the main loop 2025-01-27 18:31:13 +05:30
Aravind Karnam
0ff95c83bc feat: change input params to scraper, Add asynchronous context manager to AsyncWebScraper, Optimise filter application 2025-01-27 18:13:33 +05:30
Aravind Karnam
bb6450f458 Remove robots.txt compliance from scraper 2025-01-27 11:58:54 +05:30
Aravind Karnam
513d008de5 feat: Merge reviews from unclecode for scorers and filters & Remove the robots.txt compliance from scraper since that will be now handled by crawler 2025-01-27 11:54:10 +05:30
UncleCode
cf3e1e748d feat(scraper): add optimized URL scoring system
Implements a new high-performance URL scoring system with multiple scoring strategies:
- FastKeywordRelevanceScorer for keyword matching
- FastPathDepthScorer for URL depth analysis
- FastContentTypeScorer for file type scoring
- FastFreshnessScorer for date-based scoring
- FastDomainAuthorityScorer for domain reputation
- FastCompositeScorer for combining multiple scorers

Key improvements:
- Memory optimization using __slots__
- LRU caching for expensive operations
- Optimized string operations
- Pre-computed scoring tables
- Fast path optimizations for common cases
- Reduced object allocation

Includes comprehensive benchmarking and testing utilities.
2025-01-23 20:46:33 +08:00
UncleCode
e6ef8d91ba refactor(scraper): optimize URL validation and filter performance
- Replace validators library with built-in urlparse for URL validation
- Optimize filter statistics update logic for better performance
- Add performance benchmarking suite for filters
- Add execution time tracking to scraper examples
- Update gitignore with windsurfrules

BREAKING CHANGE: Removed dependency on validators library for URL validation
2025-01-22 19:45:56 +08:00
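The swap from the `validators` library to the stdlib can be sketched as follows (a minimal hypothetical version of the check; the real filter applies more rules):

```python
from urllib.parse import urlparse

def is_valid_url(url: str) -> bool:
    """Validate a URL with stdlib urlparse instead of the external
    `validators` package, as described in the commit above (sketch)."""
    try:
        parts = urlparse(url)
    except ValueError:
        return False
    # Require an http(s) scheme and a dotted hostname.
    return parts.scheme in ("http", "https") and "." in parts.netloc

print(is_valid_url("https://example.com/a"))  # → True
print(is_valid_url("not a url"))              # → False
```

Dropping the dependency removes a per-URL third-party call from the hot path, which is why it is flagged as a breaking change.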
Aravind Karnam
6e78c56dda Refactor: Removed all scheduling logic from scraper. From now on, the scraper expects arun_many to handle all scheduling; the scraper will only do traversal, validation, compliance checks, URL filtering, scoring, etc. Reformatted some of the scraper files with the Black code formatter 2025-01-21 18:44:43 +05:30
Aravind Karnam
67fa06c09b Refactor: Removed all scheduling logic from scraper. From now on, the scraper expects arun_many to handle all scheduling; the scraper will only do traversal, validation, compliance checks, URL filtering, scoring, etc. Reformatted some of the scraper files with the Black code formatter 2025-01-21 17:49:51 +05:30
Aravind Karnam
26d78d8512 Merge branch 'next' into feature/scraper 2025-01-21 12:35:45 +05:30
Aravind Karnam
1079965453 refactor: Remove the URL processing logic out of scraper 2025-01-21 12:16:59 +05:30
Aravind
a677c2b61d Merge pull request #496 from aravindkarnam/scraper-uc
Trying to merge the scraper's ongoing development with new developments in parallel processing
2025-01-20 16:55:41 +05:30
Aravind Karnam
7a5f83b76f fix: Added browser config and crawler run config from 0.4.22 2024-12-18 10:33:09 +05:30
aravind
7c0fa269a6 Merge pull request #9 from aravindkarnam/main
Pulling version 0.4.22 from main into scraper
2024-12-17 18:43:36 +05:30
Aravind Karnam
2f5e0598bb updated definition of can_process_url to include depth as an argument, as it's needed to skip filters for start_url 2024-11-26 18:26:57 +05:30
Aravind Karnam
ff731e4ea1 fixed the final scraper_quickstart.py example 2024-11-26 17:08:32 +05:30
Aravind Karnam
9530ded83a fixed the final scraper_quickstart.py example 2024-11-26 17:05:54 +05:30
Aravind Karnam
155c756238 <Future pending> issue fix was incorrect. Reverting 2024-11-26 17:04:04 +05:30
Aravind Karnam
a888c91790 Fix "Future attached to a different loop" error by ensuring tasks are created in the correct event loop
- Explicitly retrieve and use the correct event loop when creating tasks to avoid cross-loop issues.
- Ensures proper task scheduling in environments with multiple event loops.
2024-11-26 14:05:02 +05:30
Aravind Karnam
a98d51a62c Remove the can_process_url check from _process_links since it's already being checked in process_url 2024-11-26 11:11:49 +05:30
Aravind Karnam
ee3001b1f7 fix: moved depth as a param to can_process_url, applying the filter chain only when depth is not zero. This way the filter chain is skipped for the start URL while the other validations remain in place
2024-11-26 10:22:14 +05:30
Aravind Karnam
b13fd71040 chore: 1. Expose process_external_links as a param
2. Removed a few unused imports
3. Removed separate URL normalisation for external links, as it won't be necessary
2024-11-26 10:07:11 +05:30
Aravind Karnam
2226ef53c8 fix: Exempting the start_url from can_process_url 2024-11-23 14:59:14 +05:30
aravind
3d52b551f2 Merge pull request #8 from aravindkarnam/main
Pulling in 0.3.74
2024-11-23 13:57:36 +05:30
Aravind Karnam
f8e85b1499 Fixed a bug in _process_links, handled condition for when url_scorer is passed as None, renamed the scrapper folder to scraper. 2024-11-23 13:52:34 +05:30
Aravind Karnam
c1797037c0 Fixed a few bugs, import errors and changed to asyncio wait_for instead of timeout to support python versions < 3.11 2024-11-23 12:39:25 +05:30
aravind
60670b2af6 Merge pull request #7 from aravindkarnam/main
pulling the main branch into scraper-uc
2024-11-15 20:43:54 +05:30
UncleCode
0d357ab7d2 feat(scraper): Enhance URL filtering and scoring systems
Implement comprehensive URL filtering and scoring capabilities:

Filters:
- Add URLPatternFilter with glob/regex support
- Implement ContentTypeFilter with MIME type checking
- Add DomainFilter for domain control
- Create FilterChain with stats tracking

Scorers:
- Complete KeywordRelevanceScorer implementation
- Add PathDepthScorer for URL structure scoring
- Implement ContentTypeScorer for file type priorities
- Add FreshnessScorer for date-based scoring
- Add DomainAuthorityScorer for domain weighting
- Create CompositeScorer for combined strategies

Features:
- Add statistics tracking for both filters and scorers
- Implement logging support throughout
- Add resource cleanup methods
- Create comprehensive documentation
- Include performance optimizations

Tests and docs included.
Note: Review URL normalization overlap with recent crawler changes.
2024-11-08 19:02:28 +08:00
UncleCode
bae4665949 feat(scraper): Enhance URL filtering and scoring systems
Implement comprehensive URL filtering and scoring capabilities:

Filters:
- Add URLPatternFilter with glob/regex support
- Implement ContentTypeFilter with MIME type checking
- Add DomainFilter for domain control
- Create FilterChain with stats tracking

Scorers:
- Complete KeywordRelevanceScorer implementation
- Add PathDepthScorer for URL structure scoring
- Implement ContentTypeScorer for file type priorities
- Add FreshnessScorer for date-based scoring
- Add DomainAuthorityScorer for domain weighting
- Create CompositeScorer for combined strategies

Features:
- Add statistics tracking for both filters and scorers
- Implement logging support throughout
- Add resource cleanup methods
- Create comprehensive documentation
- Include performance optimizations

Tests and docs included.
Note: Review URL normalization overlap with recent crawler changes.

- Quick Start is created and added
2024-11-08 18:45:12 +08:00
UncleCode
d11c004fbb Enhanced BFS Strategy: Improved monitoring, resource management & configuration
- Added CrawlStats for comprehensive crawl monitoring
- Implemented proper resource cleanup with shutdown mechanism
- Enhanced URL processing with better validation and politeness controls
- Added configuration options (max_concurrent, timeout, external_links)
- Improved error handling with retry logic
- Added domain-specific queues for better performance
- Created comprehensive documentation

Note: URL normalization needs review - potential duplicate processing
with core crawler for internal links. Currently commented out pending
further investigation of edge cases.
2024-11-08 15:57:23 +08:00
UncleCode
3d1c9a8434 Reviewing the BFS strategy. 2024-11-07 18:54:53 +08:00
UncleCode
be472c624c Refactored AsyncWebScraper to include comprehensive error handling and progress tracking capabilities. Introduced a ScrapingProgress data class to monitor processed and failed URLs. Enhanced scraping methods to log errors and track stats throughout the scraping process. 2024-11-06 21:09:47 +08:00
UncleCode
06b21dcc50 Update .gitignore to include new directories for issues and documentation 2024-11-06 18:44:03 +08:00
UncleCode
0f0f60527d Merge pull request #172 from aravindkarnam/scraper
Scraper
2024-11-06 07:00:44 +01:00
Aravind Karnam
8105fd178e Removed stubs for remove_from_future_crawls since the visited set is updated as soon as the URL is queued; removed add_to_retry_queue(url) since retry with exponential backoff, with the help of tenacity, will take care of it. 2024-10-17 15:42:43 +05:30
Aravind Karnam
ce7fce4b16 1. Moved to asyncio.wait instead of gather so that results can be yielded as soon as they are ready, rather than in batches
2. Moved the visited.add(url) to before the task is put in the queue, rather than after the crawl is completed. This ensures duplicate crawls don't happen when the same URL is found at a different depth and gets queued while the first crawl is still incomplete and the visited set not yet updated.
3. Renamed the yield_results attribute to stream, since that term is popularly used in other AI libraries for intermediate results.
2024-10-17 12:25:17 +05:30
Aravind Karnam
de28b59aca removed unused imports 2024-10-16 22:36:48 +05:30
Aravind Karnam
04d8b47b92 Exposed min_crawl_delay for BFSScraperStrategy 2024-10-16 22:34:54 +05:30
Aravind Karnam
2943feeecf 1. Added a flag to yield each crawl result as it becomes ready, along with the final scraper result, as another option
2. Removed ascrape_many method, as I'm currently not focusing on it in the first cut of scraper
3. Added some error handling for cases where robots.txt cannot be fetched or parsed.
2024-10-16 22:05:29 +05:30
Aravind Karnam
8a7d29ce85 updated some comments and removed content type checking functionality from core as it's implemented as a filter 2024-10-16 15:59:37 +05:30
aravind
159bd875bd Merge pull request #5 from aravindkarnam/main
Merging 0.3.6
2024-10-16 10:41:22 +05:30
Aravind Karnam
d743adac68 Fixed some bugs in robots.txt processing 2024-10-03 15:58:57 +05:30
Aravind Karnam
7fe220dbd5 1. Introduced a bool flag on the ascrape method to switch between sequential and concurrent processing
2. Introduced a dictionary for depth tracking across the various tasks
3. Removed redundancy with the crawled_urls variable; instead created a list from the visited set in the returned object.
2024-10-03 11:17:11 +05:30
aravind
65e013d9d1 Merge pull request #3 from aravindkarnam/main
Merging latest changes from main branch
2024-10-03 09:52:12 +05:30
Aravind Karnam
7f3e2e47ed Parallel processing with retry on failure with exponential backoff - Simplified URL validation and normalisation - respecting Robots.txt 2024-09-19 12:34:12 +05:30
aravind
78f26ac263 Merge pull request #2 from aravindkarnam/staging
Staging
2024-09-18 18:16:23 +05:30
Aravind Karnam
44ce12c62c Created scaffolding for Scraper as per the plan. Implemented the ascrape method in bfs_scraper_strategy 2024-09-09 13:13:34 +05:30
17 changed files with 3701 additions and 27 deletions

.gitignore (vendored) · 4 additions
View File

@@ -234,3 +234,7 @@ todo/
# windsurf rules
.windsurfrules
# windsurf rules
.windsurfrules

View File

@@ -10,6 +10,7 @@ from .config import (
from .user_agent_generator import UserAgentGenerator, UAGen, ValidUAGenerator, OnlineUAGenerator
from .extraction_strategy import ExtractionStrategy
from .chunking_strategy import ChunkingStrategy, RegexChunking
from .deep_crawl import DeepCrawlStrategy
from .markdown_generation_strategy import MarkdownGenerationStrategy
from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter, LLMContentFilter, PruningContentFilter
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
@@ -395,6 +396,7 @@ class CrawlerRunConfig:
word_count_threshold: int = MIN_WORD_THRESHOLD,
extraction_strategy: ExtractionStrategy = None,
chunking_strategy: ChunkingStrategy = RegexChunking(),
deep_crawl_strategy: DeepCrawlStrategy = None,
markdown_generator: MarkdownGenerationStrategy = None,
content_filter : RelevantContentFilter = None,
only_text: bool = False,
@@ -468,6 +470,7 @@ class CrawlerRunConfig:
self.word_count_threshold = word_count_threshold
self.extraction_strategy = extraction_strategy
self.chunking_strategy = chunking_strategy
self.deep_crawl_strategy = deep_crawl_strategy
self.markdown_generator = markdown_generator
self.content_filter = content_filter
self.only_text = only_text
@@ -555,6 +558,14 @@ class CrawlerRunConfig:
raise ValueError(
"extraction_strategy must be an instance of ExtractionStrategy"
)
if self.deep_crawl_strategy is not None and not isinstance(
self.deep_crawl_strategy, DeepCrawlStrategy
):
raise ValueError(
"deep_crawl_strategy must be an instance of DeepCrawlStrategy"
)
if self.chunking_strategy is not None and not isinstance(
self.chunking_strategy, ChunkingStrategy
):
@@ -573,6 +584,7 @@ class CrawlerRunConfig:
word_count_threshold=kwargs.get("word_count_threshold", 200),
extraction_strategy=kwargs.get("extraction_strategy"),
chunking_strategy=kwargs.get("chunking_strategy", RegexChunking()),
deep_crawl_strategy=kwargs.get("deep_crawl_strategy"),
markdown_generator=kwargs.get("markdown_generator"),
content_filter=kwargs.get("content_filter"),
only_text=kwargs.get("only_text", False),
@@ -656,6 +668,7 @@ class CrawlerRunConfig:
"word_count_threshold": self.word_count_threshold,
"extraction_strategy": self.extraction_strategy,
"chunking_strategy": self.chunking_strategy,
"deep_crawl_strategy": self.deep_crawl_strategy,
"markdown_generator": self.markdown_generator,
"content_filter": self.content_filter,
"only_text": self.only_text,

View File

@@ -10,13 +10,19 @@ import asyncio
# from contextlib import nullcontext, asynccontextmanager
from contextlib import asynccontextmanager
from .models import CrawlResult, MarkdownGenerationResult, CrawlerTaskResult, DispatchResult
from .models import (
CrawlResult,
MarkdownGenerationResult,
CrawlerTaskResult,
DispatchResult,
)
from .async_database import async_db_manager
from .chunking_strategy import * # noqa: F403
from .chunking_strategy import RegexChunking, ChunkingStrategy, IdentityChunking
from .content_filter_strategy import * # noqa: F403
from .content_filter_strategy import RelevantContentFilter
from .extraction_strategy import * # noqa: F403
from .extraction_strategy import NoExtractionStrategy, ExtractionStrategy
from .async_crawler_strategy import (
AsyncCrawlerStrategy,
@@ -30,8 +36,9 @@ from .markdown_generation_strategy import (
)
from .async_logger import AsyncLogger
from .async_configs import BrowserConfig, CrawlerRunConfig
from .async_dispatcher import * # noqa: F403
from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter
from .deep_crawl import DeepCrawlStrategy
from .config import MIN_WORD_THRESHOLD
from .utils import (
@@ -46,11 +53,18 @@ from .utils import (
from typing import Union, AsyncGenerator, List, TypeVar
from collections.abc import AsyncGenerator
CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult)
RunManyReturn = Union[List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
from .__version__ import __version__ as crawl4ai_version
CrawlResultT = TypeVar("CrawlResultT", bound=CrawlResult)
RunManyReturn = Union[List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
DeepCrawlSingleReturn = Union[List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
DeepCrawlManyReturn = Union[
List[List[CrawlResultT]],
AsyncGenerator[CrawlResultT, None],
]
class AsyncWebCrawler:
"""
@@ -257,7 +271,7 @@ class AsyncWebCrawler:
@asynccontextmanager
async def nullcontext(self):
"""Asynchronous null context manager"""
yield
async def arun(
@@ -282,7 +296,7 @@ class AsyncWebCrawler:
user_agent: str = None,
verbose=True,
**kwargs,
) -> CrawlResult:
) -> Union[CrawlResult, DeepCrawlSingleReturn]:
"""
Runs the crawler for a single source: URL (web, local file, or raw HTML).
@@ -384,6 +398,23 @@ class AsyncWebCrawler:
extracted_content = None
start_time = time.perf_counter()
if crawler_config.deep_crawl_strategy:
if crawler_config.stream:
return crawler_config.deep_crawl_strategy.arun(
start_url=url,
crawler=self,
crawler_run_config=crawler_config,
)
else:
results = []
async for result in crawler_config.deep_crawl_strategy.arun(
start_url=url,
crawler=self,
crawler_run_config=crawler_config,
):
results.append(result)
return results
# Try to get cached result if appropriate
if cache_context.should_read():
cached_result = await async_db_manager.aget_cached_url(url)
@@ -420,14 +451,18 @@ class AsyncWebCrawler:
# Check robots.txt if enabled
if config and config.check_robots_txt:
if not await self.robots_parser.can_fetch(url, self.browser_config.user_agent):
if not await self.robots_parser.can_fetch(
url, self.browser_config.user_agent
):
return CrawlResult(
url=url,
html="",
success=False,
status_code=403,
error_message="Access denied by robots.txt",
response_headers={"X-Robots-Status": "Blocked by robots.txt"}
response_headers={
"X-Robots-Status": "Blocked by robots.txt"
},
)
# Pass config to crawl method
@@ -449,7 +484,7 @@ class AsyncWebCrawler:
)
# Process the HTML content
crawl_result : CrawlResult = await self.aprocess_html(
crawl_result: CrawlResult = await self.aprocess_html(
url=url,
html=html,
extracted_content=extracted_content,
@@ -717,7 +752,7 @@ class AsyncWebCrawler:
async def arun_many(
self,
urls: List[str],
config: Optional[CrawlerRunConfig] = None,
dispatcher: Optional[BaseDispatcher] = None,
# Legacy parameters maintained for backwards compatibility
word_count_threshold=MIN_WORD_THRESHOLD,
@@ -731,8 +766,8 @@ class AsyncWebCrawler:
pdf: bool = False,
user_agent: str = None,
verbose=True,
**kwargs
) -> RunManyReturn:
**kwargs,
) -> Union[RunManyReturn, DeepCrawlManyReturn]:
"""
Runs the crawler for multiple URLs concurrently using a configurable dispatcher strategy.
@@ -763,6 +798,22 @@ class AsyncWebCrawler:
):
print(f"Processed {result.url}: {len(result.markdown)} chars")
"""
async def merge_async_generators(generators):
tasks = {asyncio.create_task(gen.__anext__()): gen for gen in generators}
while tasks:
done, _ = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
for task in done:
gen = tasks.pop(task) # Get the generator associated with this task
try:
result = task.result()
yield result # Yield the result
tasks[asyncio.create_task(gen.__anext__())] = gen # Fetch next item
except StopAsyncIteration:
pass # Generator is exhausted, don't add it back to the tasks
if config is None:
config = CrawlerRunConfig(
word_count_threshold=word_count_threshold,
@@ -786,7 +837,9 @@ class AsyncWebCrawler:
)
transform_result = lambda task_result: (
setattr(task_result.result, 'dispatch_result',
setattr(
task_result.result,
"dispatch_result",
DispatchResult(
task_id=task_result.task_id,
memory_usage=task_result.memory_usage,
@@ -794,20 +847,46 @@ class AsyncWebCrawler:
start_time=task_result.start_time,
end_time=task_result.end_time,
error_message=task_result.error_message,
)
) or task_result.result
),
)
or task_result.result
)
stream = config.stream
if config.deep_crawl_strategy:
if config.stream:
generators = []
for url in urls:
generators.append(
config.deep_crawl_strategy.arun(
start_url=url, crawler=self, crawler_run_config=config
)
)
return merge_async_generators(generators)
else:
results = []
for url in urls:
url_results = []
async for result in config.deep_crawl_strategy.arun(
start_url=url, crawler=self, crawler_run_config=config
):
url_results.append(result)
results.append(url_results)
return results
if stream:
async def result_transformer():
async for task_result in dispatcher.run_urls_stream(crawler=self, urls=urls, config=config):
async for task_result in dispatcher.run_urls_stream(
crawler=self, urls=urls, config=config
):
yield transform_result(task_result)
return result_transformer()
else:
_results = await dispatcher.run_urls(crawler=self, urls=urls, config=config)
return [transform_result(res) for res in _results]
async def aclear_cache(self):
"""Clear the cache database."""

View File

@@ -84,3 +84,4 @@ SHOW_DEPRECATION_WARNINGS = True
SCREENSHOT_HEIGHT_TRESHOLD = 10000
PAGE_TIMEOUT = 60000
DOWNLOAD_PAGE_TIMEOUT = 60000
DEEP_CRAWL_BATCH_SIZE = 5

View File

@@ -0,0 +1,29 @@
from .bfs_deep_crawl_strategy import BFSDeepCrawlStrategy
from .filters import (
URLFilter,
FilterChain,
URLPatternFilter,
ContentTypeFilter,
DomainFilter,
)
from .scorers import (
KeywordRelevanceScorer,
PathDepthScorer,
FreshnessScorer,
CompositeScorer,
)
from .deep_crawl_strategty import DeepCrawlStrategy
__all__ = [
"BFSDeepCrawlStrategy",
"FilterChain",
"URLFilter",
"URLPatternFilter",
"ContentTypeFilter",
"DomainFilter",
"KeywordRelevanceScorer",
"PathDepthScorer",
"FreshnessScorer",
"CompositeScorer",
"DeepCrawlStrategy",
]

View File

@@ -0,0 +1,193 @@
from typing import AsyncGenerator, Optional, Dict, Set, List
from datetime import datetime
import asyncio
import logging
from urllib.parse import urlparse
from ..models import CrawlResult, TraversalStats
from .filters import FilterChain
from .scorers import URLScorer
from .deep_crawl_strategty import DeepCrawlStrategy
from ..config import DEEP_CRAWL_BATCH_SIZE
class BFSDeepCrawlStrategy(DeepCrawlStrategy):
"""Best-First Search traversal strategy with filtering and scoring."""
def __init__(
self,
max_depth: int,
filter_chain: FilterChain,
url_scorer: URLScorer,
process_external_links: bool = False,
logger: Optional[logging.Logger] = None,
):
self.max_depth = max_depth
self.filter_chain = filter_chain
self.url_scorer = url_scorer
self.logger = logger or logging.getLogger(__name__)
# Crawl control
self.stats = TraversalStats(start_time=datetime.now())
self._cancel_event = asyncio.Event()
self.process_external_links = process_external_links
async def can_process_url(self, url: str, depth: int) -> bool:
"""Check if URL can be processed based on filters
This is our gatekeeper method that determines if a URL should be processed. It:
- Validates URL format using a robust built-in method
- Applies custom filters from the filter chain
- Updates statistics for blocked URLs
- Returns False early if any check fails
"""
try:
result = urlparse(url)
if not all([result.scheme, result.netloc]):
raise ValueError("Invalid URL")
if result.scheme not in ("http", "https"):
raise ValueError("URL must be HTTP or HTTPS")
if not result.netloc or "." not in result.netloc:
raise ValueError("Invalid domain")
except Exception as e:
self.logger.warning(f"Invalid URL: {url}. Error: {str(e)}")
return False
# Apply the filter chain if it's not start page
if depth != 0 and not self.filter_chain.apply(url):
return False
return True
async def _process_links(
self,
result: CrawlResult,
source_url: str,
queue: asyncio.PriorityQueue,
visited: Set[str],
depths: Dict[str, int],
) -> None:
"""Process extracted links from crawl result.
This is our link processor that:
Checks depth limits
Handles both internal and external links
Checks if URL is visited already
Checks if URL can be processed - validates URL, applies Filters with can_process_url
Scores URLs for priority
Updates depth tracking dictionary
Adds valid URLs to the queue
Updates maximum depth statistics
"""
next_depth = depths[source_url] + 1
# If depth limit reached, exit without processing links
if next_depth > self.max_depth:
return
links_to_process = list(result.links["internal"])
if self.process_external_links:
links_to_process += result.links["external"]
for link in links_to_process:
url = link["href"]
if url in visited:
continue
if not await self.can_process_url(url, next_depth):
self.stats.urls_skipped += 1
continue
score = self.url_scorer.score(url) if self.url_scorer else 0
await queue.put((score, next_depth, url, source_url))
depths[url] = next_depth
self.stats.total_depth_reached = max(
self.stats.total_depth_reached, next_depth
)
async def arun(
self,
start_url: str,
crawler: "AsyncWebCrawler",
crawler_run_config: Optional["CrawlerRunConfig"] = None,
) -> AsyncGenerator[CrawlResult, None]:
"""Implement BFS traversal strategy"""
# Initialize traversal state
"""
queue: A priority queue where items are tuples of (score, depth, url)
Score: Determines traversal priority (lower = higher priority)
Depth: Current distance from start_url
URL: The actual URL to crawl
visited: Keeps track of URLs we've already seen to avoid cycles
depths: Maps URLs to their depths from the start URL
active_crawls: Tracks currently running crawl tasks
"""
queue = asyncio.PriorityQueue()
await queue.put((0, 0, start_url, None))
visited: Set[str] = set()
depths = {start_url: 0}
active_crawls = {} # Track URLs currently being processed with depth and score
active_crawls_lock = (
asyncio.Lock()
) # Create the lock within the same event loop
try:
while (
not queue.empty() or active_crawls
) and not self._cancel_event.is_set():
"""
This sets up our main control loop which:
- Continues while there are URLs to process (not queue.empty())
- Or while there are active crawls still running (arun_many)
- Can be interrupted via cancellation (not self._cancel_event.is_set())
"""
# Collect batch of URLs into active_crawls to process
async with active_crawls_lock:
while (
len(active_crawls) < DEEP_CRAWL_BATCH_SIZE and not queue.empty()
):
score, depth, url, parent_url = await queue.get()
active_crawls[url] = {
"depth": depth,
"score": score,
"parent_url": parent_url,
}
self.stats.current_depth = depth
if not active_crawls:
# If no active crawls exist, wait a bit and continue
await asyncio.sleep(0.1)
continue
# Process batch
try:
# This is important: clear the strategy on the cloned config so children aren't recursively deep-crawled.
if crawler_run_config:
crawler_run_config = crawler_run_config.clone(
deep_crawl_strategy=None, stream=True
)
async for result in await crawler.arun_many(
urls=list(active_crawls.keys()),
config=crawler_run_config
):
async with active_crawls_lock:
crawl_info = active_crawls.pop(result.url, None)
if crawl_info and result.success:
await self._process_links(
result, result.url, queue, visited, depths
)
result.depth = crawl_info["depth"]
result.score = crawl_info["score"]
result.parent_url = crawl_info["parent_url"]
yield result
else:
self.logger.warning(
f"Failed to crawl {result.url}: {result.error_message}"
)
except Exception as e:
self.logger.error(f"Batch processing error: {e}")
# Continue processing other batches
continue
except Exception as e:
self.logger.error(f"Error in crawl process: {e}")
raise
finally:
self.stats.end_time = datetime.now()
async def shutdown(self):
"""Clean up resources and stop crawling"""
self._cancel_event.set()
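The control loop above, a priority queue of (score, depth, url) tuples plus a visited set and a depth map, can be illustrated with a self-contained sketch over an in-memory link graph (hypothetical; no crawler or network involved):

```python
import asyncio

LINKS = {  # toy link graph standing in for fetched pages
    "root": ["a", "b"],
    "a": ["c"],
    "b": [],
    "c": [],
}

async def bfs_crawl(start, max_depth):
    queue = asyncio.PriorityQueue()  # (score, depth, url); lower score first
    await queue.put((0, 0, start))
    visited, order = {start}, []
    while not queue.empty():
        score, depth, url = await queue.get()
        order.append((url, depth))
        if depth >= max_depth:
            continue  # depth limit: don't expand this page's links
        for link in LINKS.get(url, []):
            if link not in visited:
                visited.add(link)  # mark before queueing: no duplicates
                await queue.put((0, depth + 1, link))
    return order

print(asyncio.run(bfs_crawl("root", 2)))
```

Marking a URL as visited before it is queued, rather than after its crawl completes, is the same duplicate-prevention choice recorded in the earlier commits.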

View File

@@ -0,0 +1,30 @@
from abc import ABC, abstractmethod
from typing import AsyncGenerator, Optional
from ..models import CrawlResult
class DeepCrawlStrategy(ABC):
@abstractmethod
async def arun(
self,
url: str,
crawler: "AsyncWebCrawler",
crawler_run_config: Optional["CrawlerRunConfig"] = None,
) -> AsyncGenerator[CrawlResult, None]:
"""Traverse the given URL using the specified crawler.
Args:
url (str): The starting URL for the traversal.
crawler (AsyncWebCrawler): The crawler instance to use for traversal.
crawler_run_config (CrawlerRunConfig, optional): The configuration for the crawler.
Returns:
AsyncGenerator[CrawlResult, None]: An async generator yielding crawl results.
"""
pass
@abstractmethod
async def shutdown(self):
"""Clean up resources used by the strategy"""
pass

View File

@@ -0,0 +1,868 @@
from abc import ABC, abstractmethod
from typing import List, Pattern, Set, Union, FrozenSet
import re, time
from urllib.parse import urlparse
from array import array
import logging
from functools import lru_cache
import fnmatch
from dataclasses import dataclass
from typing import ClassVar
import weakref
import mimetypes
@dataclass
class FilterStats:
# PERF: Using dataclass creates overhead with __init__ and property access
# PERF: Could use __slots__ to reduce memory footprint
# PERF: Consider using array.array('I') for atomic increments
total_urls: int = 0
rejected_urls: int = 0
passed_urls: int = 0
class URLFilter(ABC):
# PERF: Logger creation is expensive, consider lazy initialization
# PERF: stats object creation adds overhead for each filter instance
def __init__(self, name: str = None):
self.name = name or self.__class__.__name__
self.stats = FilterStats()
self.logger = logging.getLogger(f"urlfilter.{self.name}")
@abstractmethod
def apply(self, url: str) -> bool:
pass
def _update_stats(self, passed: bool):
# PERF: Already optimized but could use bitwise operations
# PERF: Consider removing stats entirely in production/fast mode
self.stats.total_urls += 1
self.stats.passed_urls += passed
self.stats.rejected_urls += not passed
class FilterChain:
# PERF: List traversal for each URL is expensive
# PERF: Could use array.array instead of list for filters
# PERF: Consider adding fast path for single filter case
def __init__(self, filters: List[URLFilter] = None):
self.filters = filters or []
self.stats = FilterStats()
self.logger = logging.getLogger("urlfilter.chain")
def apply(self, url: str) -> bool:
# PERF: Logging on every rejection is expensive
# PERF: Could reorder filters by rejection rate
# PERF: Consider batch processing mode
self.stats.total_urls += 1
for filter_ in self.filters:
if not filter_.apply(url):
self.stats.rejected_urls += 1
self.logger.debug(f"URL {url} rejected by {filter_.name}")
return False
self.stats.passed_urls += 1
return True
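The chain-of-filters pattern above, where any single rejection short-circuits the whole chain, can be sketched in a simplified standalone form (hypothetical; stats tracking and logging omitted):

```python
import fnmatch
import re

class URLPatternFilter:
    """Glob-pattern filter, mirroring the class below (simplified sketch)."""
    def __init__(self, patterns):
        # fnmatch.translate turns a glob like "*/blog/*" into a regex
        self._compiled = [re.compile(fnmatch.translate(p)) for p in patterns]

    def apply(self, url: str) -> bool:
        return any(p.search(url) for p in self._compiled)

class FilterChain:
    """Run every filter; reject on the first failure."""
    def __init__(self, filters):
        self.filters = filters

    def apply(self, url: str) -> bool:
        return all(f.apply(url) for f in self.filters)

chain = FilterChain([URLPatternFilter(["*example.com*"]),
                     URLPatternFilter(["*/blog/*"])])
print(chain.apply("https://example.com/blog/post"))  # passes both patterns
print(chain.apply("https://example.com/shop/item"))  # fails the second
```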
class URLPatternFilter(URLFilter):
# PERF: Converting glob to regex is expensive
# PERF: Multiple regex compilation is slow
# PERF: List of patterns causes multiple regex evaluations
def __init__(
self,
patterns: Union[str, Pattern, List[Union[str, Pattern]]],
use_glob: bool = True,
):
super().__init__()
self.patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns
self.use_glob = use_glob
self._compiled_patterns = []
# PERF: This could be consolidated into a single regex with OR conditions
# PERF: glob_to_regex creates complex patterns, could be simplified
for pattern in self.patterns:
if isinstance(pattern, str) and use_glob:
self._compiled_patterns.append(self._glob_to_regex(pattern))
else:
self._compiled_patterns.append(
re.compile(pattern) if isinstance(pattern, str) else pattern
)
def _glob_to_regex(self, pattern: str) -> Pattern:
# PERF: fnmatch.translate creates overly complex patterns
# PERF: Could cache common translations
return re.compile(fnmatch.translate(pattern))
def apply(self, url: str) -> bool:
# PERF: any() with generator is slower than direct loop with early return
# PERF: searching entire string is slower than anchored match
matches = any(pattern.search(url) for pattern in self._compiled_patterns)
self._update_stats(matches)
return matches
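# Illustrative aside, not part of the original module: what fnmatch.translate
# actually produces for a glob like "*.html", and why search() only accepts
# URLs ending with the suffix (translate anchors the pattern with \Z).
def _demo_glob_translation():
    pat = re.compile(fnmatch.translate("*.html"))
    assert pat.search("https://example.com/page.html") is not None
    assert pat.search("https://example.com/page.htm") is None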
class ContentTypeFilter(URLFilter):
# PERF: mimetypes guessing is extremely slow
# PERF: URL parsing on every check is expensive
# PERF: No caching of results for similar extensions
def __init__(
self, allowed_types: Union[str, List[str]], check_extension: bool = True
):
super().__init__()
self.allowed_types = (
[allowed_types] if isinstance(allowed_types, str) else allowed_types
)
self.check_extension = check_extension
self._normalize_types()
def _normalize_types(self):
"""Normalize content type strings"""
self.allowed_types = [t.lower() for t in self.allowed_types]
def _check_extension(self, url: str) -> bool:
# PERF: urlparse is called on every check
# PERF: multiple string splits are expensive
# PERF: mimetypes.guess_type is very slow
ext = (
urlparse(url).path.split(".")[-1].lower()
if "." in urlparse(url).path
else ""
)
if not ext:
return True
# PERF: guess_type is main bottleneck
guessed_type = mimetypes.guess_type(url)[0]
return any(
allowed in (guessed_type or "").lower() for allowed in self.allowed_types
)
def apply(self, url: str) -> bool:
"""Check if URL's content type is allowed"""
result = True
if self.check_extension:
result = self._check_extension(url)
self._update_stats(result)
return result
class DomainFilter(URLFilter):
# PERF: Set lookups are fast but string normalizations on init are not
# PERF: Creating two sets doubles memory usage
def __init__(
self,
allowed_domains: Union[str, List[str]] = None,
blocked_domains: Union[str, List[str]] = None,
):
super().__init__()
# PERF: Normalizing domains on every init is wasteful
# PERF: Could use frozenset for immutable lists
self.allowed_domains = (
set(self._normalize_domains(allowed_domains)) if allowed_domains else None
)
self.blocked_domains = (
set(self._normalize_domains(blocked_domains)) if blocked_domains else set()
)
def _normalize_domains(self, domains: Union[str, List[str]]) -> List[str]:
# PERF: strip() and lower() create new strings for each domain
# PERF: List comprehension creates intermediate list
if isinstance(domains, str):
domains = [domains]
return [d.lower().strip() for d in domains]
def _extract_domain(self, url: str) -> str:
# PERF: urlparse is called for every URL check
# PERF: lower() creates new string every time
# PERF: Could cache recent results
return urlparse(url).netloc.lower()
def apply(self, url: str) -> bool:
# PERF: Two separate set lookups in worst case
# PERF: Domain extraction happens before knowing if we have any filters
domain = self._extract_domain(url)
if domain in self.blocked_domains:
self._update_stats(False)
return False
if self.allowed_domains is not None and domain not in self.allowed_domains:
self._update_stats(False)
return False
self._update_stats(True)
return True
# Example usage:
def create_common_filter_chain() -> FilterChain:
"""Create a commonly used filter chain"""
return FilterChain(
[
URLPatternFilter(
[
"*.html",
"*.htm", # HTML files
"*/article/*",
"*/blog/*", # Common content paths
]
),
ContentTypeFilter(["text/html", "application/xhtml+xml"]),
DomainFilter(blocked_domains=["ads.*", "analytics.*"]),
]
)
####################################################################################
# Unclecode: Optimized Version
####################################################################################
# Use __slots__ and array for maximum memory/speed efficiency
class FastFilterStats:
__slots__ = ("_counters",)
def __init__(self):
# Use array of unsigned ints for atomic operations
self._counters = array("I", [0, 0, 0]) # total, passed, rejected
@property
def total_urls(self):
return self._counters[0]
@property
def passed_urls(self):
return self._counters[1]
@property
def rejected_urls(self):
return self._counters[2]
class FastURLFilter(ABC):
"""Optimized base filter class"""
__slots__ = ("name", "stats", "_logger_ref")
def __init__(self, name: str = None):
self.name = name or self.__class__.__name__
self.stats = FastFilterStats()
# Lazy logger initialization using weakref
self._logger_ref = None
@property
def logger(self):
if self._logger_ref is None or self._logger_ref() is None:
logger = logging.getLogger(f"urlfilter.{self.name}")
self._logger_ref = weakref.ref(logger)
return self._logger_ref()
@abstractmethod
def apply(self, url: str) -> bool:
pass
def _update_stats(self, passed: bool):
# Use direct array index for speed
self.stats._counters[0] += 1 # total
self.stats._counters[1] += passed # passed
self.stats._counters[2] += not passed # rejected
class FastFilterChain:
"""Optimized filter chain"""
__slots__ = ("filters", "stats", "_logger_ref")
def __init__(self, filters: List[FastURLFilter] = None):
self.filters = tuple(filters or []) # Immutable tuple for speed
self.stats = FastFilterStats()
self._logger_ref = None
@property
def logger(self):
if self._logger_ref is None or self._logger_ref() is None:
logger = logging.getLogger("urlfilter.chain")
self._logger_ref = weakref.ref(logger)
return self._logger_ref()
    def add_filter(self, filter_: FastURLFilter) -> "FastFilterChain":
        """Add a filter to the chain (rebuilds the immutable tuple)"""
        self.filters = self.filters + (filter_,)
        return self  # Enable method chaining
def apply(self, url: str) -> bool:
"""Optimized apply with minimal operations"""
self.stats._counters[0] += 1 # total
# Direct tuple iteration is faster than list
for f in self.filters:
if not f.apply(url):
self.stats._counters[2] += 1 # rejected
return False
self.stats._counters[1] += 1 # passed
return True
class FastURLPatternFilter(FastURLFilter):
"""Pattern filter balancing speed and completeness"""
__slots__ = ('_simple_suffixes', '_simple_prefixes', '_domain_patterns', '_path_patterns')
    PATTERN_TYPES = {
        'SUFFIX': 1,  # *.html
        'PREFIX': 2,  # /foo/*
        'DOMAIN': 3,  # *.example.com
        'PATH': 4,    # Everything else
        'REGEX': 5,
    }
def __init__(self, patterns: Union[str, Pattern, List[Union[str, Pattern]]], use_glob: bool = True):
super().__init__()
patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns
self._simple_suffixes = set()
self._simple_prefixes = set()
self._domain_patterns = []
self._path_patterns = []
for pattern in patterns:
pattern_type = self._categorize_pattern(pattern)
self._add_pattern(pattern, pattern_type)
def _categorize_pattern(self, pattern: str) -> int:
"""Categorize pattern for specialized handling"""
if not isinstance(pattern, str):
return self.PATTERN_TYPES['PATH']
# Check if it's a regex pattern
if pattern.startswith('^') or pattern.endswith('$') or '\\d' in pattern:
return self.PATTERN_TYPES['REGEX']
        if pattern.count('*') == 1:
            if pattern.startswith('*.'):
                # A further dot ('*.example.com') marks a domain glob;
                # plain '*.html' style globs are suffix matches
                if '.' in pattern[2:]:
                    return self.PATTERN_TYPES['DOMAIN']
                return self.PATTERN_TYPES['SUFFIX']
            if pattern.endswith('/*'):
                return self.PATTERN_TYPES['PREFIX']
return self.PATTERN_TYPES['PATH']
def _add_pattern(self, pattern: str, pattern_type: int):
"""Add pattern to appropriate matcher"""
        if pattern_type == self.PATTERN_TYPES['REGEX']:
            # Already categorized as a regex-style string; compile directly
            # without glob translation
            self._path_patterns.append(re.compile(pattern))
            return
elif pattern_type == self.PATTERN_TYPES['SUFFIX']:
self._simple_suffixes.add(pattern[2:])
elif pattern_type == self.PATTERN_TYPES['PREFIX']:
self._simple_prefixes.add(pattern[:-2])
elif pattern_type == self.PATTERN_TYPES['DOMAIN']:
self._domain_patterns.append(
re.compile(pattern.replace('*.', r'[^/]+\.'))
)
else:
if isinstance(pattern, str):
# Handle complex glob patterns
if '**' in pattern:
pattern = pattern.replace('**', '.*')
if '{' in pattern:
# Convert {a,b} to (a|b)
pattern = re.sub(r'\{([^}]+)\}',
lambda m: f'({"|".join(m.group(1).split(","))})',
pattern)
pattern = fnmatch.translate(pattern)
self._path_patterns.append(
pattern if isinstance(pattern, Pattern) else re.compile(pattern)
)
    @lru_cache(maxsize=10000)
    def apply(self, url: str) -> bool:
        """Hierarchical pattern matching.

        Note: lru_cache on a bound method keeps a strong reference to self
        and bypasses _update_stats on cache hits, so stats count unique URLs
        rather than total calls.
        """
# Quick suffix check (*.html)
if self._simple_suffixes:
path = url.split('?')[0]
if path.split('/')[-1].split('.')[-1] in self._simple_suffixes:
self._update_stats(True)
return True
# Domain check
if self._domain_patterns:
for pattern in self._domain_patterns:
                if pattern.search(url):
self._update_stats(True)
return True
# Prefix check (/foo/*)
if self._simple_prefixes:
path = url.split('?')[0]
if any(path.startswith(p) for p in self._simple_prefixes):
self._update_stats(True)
return True
# Complex patterns
if self._path_patterns:
if any(p.search(url) for p in self._path_patterns):
self._update_stats(True)
return True
self._update_stats(False)
return False
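# Standalone sketch of the categorization heuristic above (illustrative only;
# the real logic lives in FastURLPatternFilter._categorize_pattern).
def _demo_pattern_categories():
    def categorize(pattern: str) -> str:
        if pattern.startswith('^') or pattern.endswith('$') or '\\d' in pattern:
            return 'REGEX'
        if pattern.count('*') == 1:
            if pattern.startswith('*.'):
                return 'SUFFIX'
            if pattern.endswith('/*'):
                return 'PREFIX'
        return 'PATH'
    assert categorize('*.html') == 'SUFFIX'
    assert categorize('*/blog/*') == 'PATH'  # two wildcards fall through
    assert categorize(r'^https?://.*$') == 'REGEX'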
class FastContentTypeFilter(FastURLFilter):
"""Optimized content type filter using fast lookups"""
__slots__ = ("allowed_types", "_ext_map", "_check_extension")
# Fast extension to mime type mapping
_MIME_MAP = {
# Text Formats
"txt": "text/plain",
"html": "text/html",
"htm": "text/html",
"xhtml": "application/xhtml+xml",
"css": "text/css",
"csv": "text/csv",
"ics": "text/calendar",
"js": "application/javascript",
# Images
"bmp": "image/bmp",
"gif": "image/gif",
"jpeg": "image/jpeg",
"jpg": "image/jpeg",
"png": "image/png",
"svg": "image/svg+xml",
"tiff": "image/tiff",
"ico": "image/x-icon",
"webp": "image/webp",
# Audio
"mp3": "audio/mpeg",
"wav": "audio/wav",
"ogg": "audio/ogg",
"m4a": "audio/mp4",
"aac": "audio/aac",
# Video
"mp4": "video/mp4",
"mpeg": "video/mpeg",
"webm": "video/webm",
"avi": "video/x-msvideo",
"mov": "video/quicktime",
"flv": "video/x-flv",
"wmv": "video/x-ms-wmv",
"mkv": "video/x-matroska",
# Applications
"json": "application/json",
"xml": "application/xml",
"pdf": "application/pdf",
"zip": "application/zip",
"gz": "application/gzip",
"tar": "application/x-tar",
"rar": "application/vnd.rar",
"7z": "application/x-7z-compressed",
"exe": "application/vnd.microsoft.portable-executable",
"msi": "application/x-msdownload",
# Fonts
"woff": "font/woff",
"woff2": "font/woff2",
"ttf": "font/ttf",
"otf": "font/otf",
# Microsoft Office
"doc": "application/msword",
"dot": "application/msword",
"docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"xls": "application/vnd.ms-excel",
"ppt": "application/vnd.ms-powerpoint",
"pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
# OpenDocument Formats
"odt": "application/vnd.oasis.opendocument.text",
"ods": "application/vnd.oasis.opendocument.spreadsheet",
"odp": "application/vnd.oasis.opendocument.presentation",
# Archives
"tar.gz": "application/gzip",
"tgz": "application/gzip",
"bz2": "application/x-bzip2",
# Others
"rtf": "application/rtf",
"apk": "application/vnd.android.package-archive",
"epub": "application/epub+zip",
"jar": "application/java-archive",
"swf": "application/x-shockwave-flash",
"midi": "audio/midi",
"mid": "audio/midi",
"ps": "application/postscript",
"ai": "application/postscript",
"eps": "application/postscript",
# Custom or less common
"bin": "application/octet-stream",
"dmg": "application/x-apple-diskimage",
"iso": "application/x-iso9660-image",
"deb": "application/x-debian-package",
"rpm": "application/x-rpm",
"sqlite": "application/vnd.sqlite3",
# Placeholder
"unknown": "application/octet-stream", # Fallback for unknown file types
}
@staticmethod
@lru_cache(maxsize=1000)
def _extract_extension(path: str) -> str:
"""Fast extension extraction with caching"""
if "." not in path:
return ""
return path.rpartition(".")[-1].lower()
def __init__(
self, allowed_types: Union[str, List[str]], check_extension: bool = True
):
super().__init__()
# Normalize and store as frozenset for fast lookup
self.allowed_types = frozenset(
t.lower()
for t in (
allowed_types if isinstance(allowed_types, list) else [allowed_types]
)
)
self._check_extension = check_extension
# Pre-compute extension map for allowed types
self._ext_map = frozenset(
ext
for ext, mime in self._MIME_MAP.items()
if any(allowed in mime for allowed in self.allowed_types)
)
@lru_cache(maxsize=1000)
def _check_url_cached(self, url: str) -> bool:
"""Cached URL checking"""
if not self._check_extension:
return True
path = url.split("?")[0] # Fast path split
ext = self._extract_extension(path)
if not ext:
return True
return ext in self._ext_map
def apply(self, url: str) -> bool:
"""Fast extension check with caching"""
result = self._check_url_cached(url)
self._update_stats(result)
return result
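# Standalone sketch (illustrative only) of the _ext_map precomputation above:
# keep just the extensions whose MIME type contains one of the allowed types,
# so apply() is a single frozenset membership test.
def _demo_ext_precompute():
    mime_map = {"html": "text/html", "pdf": "application/pdf", "png": "image/png"}
    allowed = frozenset({"text/html"})
    ext_ok = frozenset(e for e, m in mime_map.items() if any(a in m for a in allowed))
    assert ext_ok == frozenset({"html"})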
class FastDomainFilter(FastURLFilter):
"""Optimized domain filter with fast lookups and caching"""
__slots__ = ("_allowed_domains", "_blocked_domains", "_domain_cache")
# Regex for fast domain extraction
_DOMAIN_REGEX = re.compile(r"://([^/]+)")
def __init__(
self,
allowed_domains: Union[str, List[str]] = None,
blocked_domains: Union[str, List[str]] = None,
):
super().__init__()
# Convert inputs to frozensets for immutable, fast lookups
self._allowed_domains = (
frozenset(self._normalize_domains(allowed_domains))
if allowed_domains
else None
)
self._blocked_domains = (
frozenset(self._normalize_domains(blocked_domains))
if blocked_domains
else frozenset()
)
@staticmethod
def _normalize_domains(domains: Union[str, List[str]]) -> Set[str]:
"""Fast domain normalization"""
if isinstance(domains, str):
return {domains.lower()}
return {d.lower() for d in domains}
@staticmethod
@lru_cache(maxsize=10000)
def _extract_domain(url: str) -> str:
"""Ultra-fast domain extraction with regex and caching"""
match = FastDomainFilter._DOMAIN_REGEX.search(url)
return match.group(1).lower() if match else ""
def apply(self, url: str) -> bool:
"""Optimized domain checking with early returns"""
# Skip processing if no filters
if not self._blocked_domains and self._allowed_domains is None:
self._update_stats(True)
return True
domain = self._extract_domain(url)
# Early return for blocked domains
if domain in self._blocked_domains:
self._update_stats(False)
return False
# If no allowed domains specified, accept all non-blocked
if self._allowed_domains is None:
self._update_stats(True)
return True
# Final allowed domains check
result = domain in self._allowed_domains
self._update_stats(result)
return result
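# Standalone sketch (illustrative only): the '://([^/]+)' regex used by
# _extract_domain grabs the netloc without paying for a full urlparse.
def _demo_domain_regex():
    domain_re = re.compile(r"://([^/]+)")
    match = domain_re.search("https://Blog.Example.com/post/1")
    assert match is not None and match.group(1).lower() == "blog.example.com"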
def create_fast_filter_chain() -> FastFilterChain:
"""Create an optimized filter chain with filters ordered by rejection rate"""
return FastFilterChain(
[
# Domain filter first (fastest rejection)
FastDomainFilter(blocked_domains=["ads.*", "analytics.*"]),
# Content filter second (medium speed)
FastContentTypeFilter(["text/html", "application/xhtml+xml"]),
# Pattern filter last (most expensive)
FastURLPatternFilter(
[
"*.html",
"*.htm",
"*/article/*",
"*/blog/*",
]
),
]
)
def run_performance_test():
import time
import random
from itertools import cycle
# Generate test URLs
base_urls = [
"https://example.com/article/123",
"https://blog.example.com/post/456",
"https://ads.example.com/tracking",
"https://example.com/about.html",
"https://analytics.example.com/script.js",
"https://example.com/products.php",
"https://subdomain.example.com/blog/post-123",
"https://example.com/path/file.pdf",
]
# Create more varied test data
test_urls = []
for base in base_urls:
# Add original
test_urls.append(base)
# Add variations
parts = base.split("/")
for i in range(10):
parts[-1] = f"page_{i}.html"
test_urls.append("/".join(parts))
# Multiply to get enough test data
    test_urls = test_urls * 10000  # 88 base URLs x 10000 = ~880k URLs
def benchmark(name: str, func, *args, warmup=True):
if warmup:
# Warmup run
func(*args)
# Actual timing
start = time.perf_counter_ns()
result = func(*args)
elapsed = (time.perf_counter_ns() - start) / 1_000_000 # Convert to ms
print(
f"{name:<30} {elapsed:>8.3f} ms ({len(test_urls)/elapsed*1000:,.0f} URLs/sec)"
)
return result
print("\nBenchmarking original vs optimized implementations...")
print("-" * 70)
# Original implementation
pattern_filter = URLPatternFilter(["*.html", "*/article/*"])
content_filter = ContentTypeFilter(["text/html"])
domain_filter = DomainFilter(blocked_domains=["ads.*", "analytics.*"])
chain = FilterChain([pattern_filter, content_filter, domain_filter])
# Optimized implementation
fast_pattern_filter = FastURLPatternFilter(["*.html", "*/article/*"])
fast_content_filter = FastContentTypeFilter(["text/html"])
fast_domain_filter = FastDomainFilter(blocked_domains=["ads.*", "analytics.*"])
fast_chain = FastFilterChain(
[fast_domain_filter, fast_content_filter, fast_pattern_filter]
)
# Test individual filters
print("\nSingle filter performance (first 1000 URLs):")
test_subset = test_urls[:1000]
print("\nPattern Filters:")
benchmark(
"Original Pattern Filter",
lambda: [pattern_filter.apply(url) for url in test_subset],
)
benchmark(
"Optimized Pattern Filter",
lambda: [fast_pattern_filter.apply(url) for url in test_subset],
)
print("\nContent Filters:")
benchmark(
"Original Content Filter",
lambda: [content_filter.apply(url) for url in test_subset],
)
benchmark(
"Optimized Content Filter",
lambda: [fast_content_filter.apply(url) for url in test_subset],
)
print("\nDomain Filters:")
benchmark(
"Original Domain Filter",
lambda: [domain_filter.apply(url) for url in test_subset],
)
benchmark(
"Optimized Domain Filter",
lambda: [fast_domain_filter.apply(url) for url in test_subset],
)
print("\nFull Chain Performance (all URLs):")
# Test chain
benchmark("Original Chain", lambda: [chain.apply(url) for url in test_urls])
benchmark("Optimized Chain", lambda: [fast_chain.apply(url) for url in test_urls])
# Memory usage
import sys
print("\nMemory Usage per Filter:")
print(f"Original Pattern Filter: {sys.getsizeof(pattern_filter):,} bytes")
print(f"Optimized Pattern Filter: {sys.getsizeof(fast_pattern_filter):,} bytes")
print(f"Original Content Filter: {sys.getsizeof(content_filter):,} bytes")
print(f"Optimized Content Filter: {sys.getsizeof(fast_content_filter):,} bytes")
print(f"Original Domain Filter: {sys.getsizeof(domain_filter):,} bytes")
print(f"Optimized Domain Filter: {sys.getsizeof(fast_domain_filter):,} bytes")
def test_pattern_filter():
import time
from itertools import chain
# Test cases as list of tuples instead of dict for multiple patterns
test_cases = [
# Simple suffix patterns (*.html)
("*.html", {
"https://example.com/page.html": True,
"https://example.com/path/doc.html": True,
"https://example.com/page.htm": False,
"https://example.com/page.html?param=1": True,
}),
# Path prefix patterns (/foo/*)
("*/article/*", {
"https://example.com/article/123": True,
"https://example.com/blog/article/456": True,
"https://example.com/articles/789": False,
"https://example.com/article": False,
}),
# Complex patterns
("blog-*-[0-9]", {
"https://example.com/blog-post-1": True,
"https://example.com/blog-test-9": True,
"https://example.com/blog-post": False,
"https://example.com/blog-post-x": False,
}),
# Multiple patterns case
(["*.pdf", "*/download/*"], {
"https://example.com/doc.pdf": True,
"https://example.com/download/file.txt": True,
"https://example.com/path/download/doc": True,
"https://example.com/uploads/file.txt": False,
}),
# Edge cases
("*", {
"https://example.com": True,
"": True,
"http://test.com/path": True,
}),
# Complex regex
(r"^https?://.*\.example\.com/\d+", {
"https://sub.example.com/123": True,
"http://test.example.com/456": True,
"https://example.com/789": False,
"https://sub.example.com/abc": False,
})
]
def run_accuracy_test():
print("\nAccuracy Tests:")
print("-" * 50)
all_passed = True
for patterns, test_urls in test_cases:
filter_obj = FastURLPatternFilter(patterns)
for url, expected in test_urls.items():
result = filter_obj.apply(url)
if result != expected:
print(f"❌ Failed: Pattern '{patterns}' with URL '{url}'")
print(f" Expected: {expected}, Got: {result}")
all_passed = False
else:
print(f"✅ Passed: Pattern '{patterns}' with URL '{url}'")
return all_passed
def run_speed_test():
print("\nSpeed Tests:")
print("-" * 50)
# Create a large set of test URLs
all_urls = list(chain.from_iterable(urls.keys() for _, urls in test_cases))
test_urls = all_urls * 10000 # 100K+ URLs
# Test both implementations
original = URLPatternFilter(["*.html", "*/article/*", "blog-*"])
optimized = FastURLPatternFilter(["*.html", "*/article/*", "blog-*"])
def benchmark(name, filter_obj):
start = time.perf_counter()
for url in test_urls:
filter_obj.apply(url)
elapsed = time.perf_counter() - start
urls_per_sec = len(test_urls) / elapsed
print(f"{name:<20} {elapsed:.3f}s ({urls_per_sec:,.0f} URLs/sec)")
benchmark("Original Filter:", original)
benchmark("Optimized Filter:", optimized)
# Run tests
print("Running Pattern Filter Tests...")
accuracy_passed = run_accuracy_test()
if accuracy_passed:
print("\n✨ All accuracy tests passed!")
run_speed_test()
else:
print("\n❌ Some accuracy tests failed!")
if __name__ == "__main__":
run_performance_test()
# test_pattern_filter()

File diff suppressed because it is too large


@@ -1,3 +1,4 @@
from __future__ import annotations
from pydantic import BaseModel, HttpUrl
from typing import List, Dict, Optional, Callable, Awaitable, Union, Any
from enum import Enum
@@ -5,6 +6,7 @@ from dataclasses import dataclass
from .ssl_certificate import SSLCertificate
from datetime import datetime
from datetime import timedelta
from math import inf
###############################
@@ -95,6 +97,18 @@ class DispatchResult(BaseModel):
error_message: str = ""
@dataclass
class TraversalStats:
"""Statistics for the traversal process"""
start_time: datetime
urls_processed: int = 0
urls_failed: int = 0
urls_skipped: int = 0
total_depth_reached: int = 0
current_depth: int = 0
class CrawlResult(BaseModel):
url: str
html: str
@@ -118,11 +132,14 @@ class CrawlResult(BaseModel):
ssl_certificate: Optional[SSLCertificate] = None
dispatch_result: Optional[DispatchResult] = None
redirected_url: Optional[str] = None
# Attributes for position
depth: Optional[int] = None
score: Optional[float] = -inf
parent_url: Optional[str] = None
class Config:
arbitrary_types_allowed = True
class AsyncCrawlResponse(BaseModel):
html: str
response_headers: Dict[str, str]
@@ -161,12 +178,12 @@ class Link(BaseModel):
class Media(BaseModel):
images: List[MediaItem] = []
videos: List[
MediaItem
] = [] # Using MediaItem model for now, can be extended with Video model if needed
audios: List[
MediaItem
] = [] # Using MediaItem model for now, can be extended with Audio model if needed
videos: List[MediaItem] = (
[]
) # Using MediaItem model for now, can be extended with Video model if needed
audios: List[MediaItem] = (
[]
) # Using MediaItem model for now, can be extended with Audio model if needed
class Links(BaseModel):


@@ -0,0 +1,244 @@
# BFS Scraper Strategy: Smart Web Traversal
The BFS (Breadth-First Search) Scraper Strategy provides an intelligent way to traverse websites systematically. It crawls websites level by level, ensuring thorough coverage while respecting web crawling etiquette.
```mermaid
flowchart TB
Start([Start]) --> Init[Initialize BFS Strategy]
Init --> InitStats[Initialize CrawlStats]
InitStats --> InitQueue[Initialize Priority Queue]
InitQueue --> AddStart[Add Start URL to Queue]
    AddStart --> CheckState{URLs in Queue or\nTasks Pending?}
CheckState -->|No| Cleanup[Cleanup & Stats]
Cleanup --> End([End])
CheckState -->|Yes| CheckCancel{Cancel\nRequested?}
CheckCancel -->|Yes| Cleanup
CheckCancel -->|No| CheckConcurrent{Under Max\nConcurrent?}
CheckConcurrent -->|No| WaitComplete[Wait for Task Completion]
WaitComplete --> YieldResult[Yield Result]
YieldResult --> CheckState
CheckConcurrent -->|Yes| GetNextURL[Get Next URL from Queue]
GetNextURL --> ValidateURL{Already\nVisited?}
ValidateURL -->|Yes| CheckState
ValidateURL -->|No| ProcessURL[Process URL]
subgraph URL_Processing [URL Processing]
ProcessURL --> CheckValid{URL Valid?}
CheckValid -->|No| UpdateStats[Update Skip Stats]
CheckValid -->|Yes| CheckRobots{Allowed by\nrobots.txt?}
CheckRobots -->|No| UpdateRobotStats[Update Robot Stats]
CheckRobots -->|Yes| ApplyDelay[Apply Politeness Delay]
ApplyDelay --> FetchContent[Fetch Content with Rate Limit]
FetchContent --> CheckError{Error?}
CheckError -->|Yes| Retry{Retry\nNeeded?}
Retry -->|Yes| FetchContent
Retry -->|No| UpdateFailStats[Update Fail Stats]
CheckError -->|No| ExtractLinks[Extract & Process Links]
ExtractLinks --> ScoreURLs[Score New URLs]
ScoreURLs --> AddToQueue[Add to Priority Queue]
end
ProcessURL --> CreateTask{Parallel\nProcessing?}
CreateTask -->|Yes| AddTask[Add to Pending Tasks]
CreateTask -->|No| DirectProcess[Process Directly]
AddTask --> CheckState
DirectProcess --> YieldResult
UpdateStats --> CheckState
UpdateRobotStats --> CheckState
UpdateFailStats --> CheckState
classDef process fill:#90caf9,stroke:#000,stroke-width:2px;
classDef decision fill:#fff59d,stroke:#000,stroke-width:2px;
classDef error fill:#ef9a9a,stroke:#000,stroke-width:2px;
classDef stats fill:#a5d6a7,stroke:#000,stroke-width:2px;
class Start,End stats;
class CheckState,CheckCancel,CheckConcurrent,ValidateURL,CheckValid,CheckRobots,CheckError,Retry,CreateTask decision;
class UpdateStats,UpdateRobotStats,UpdateFailStats,InitStats,Cleanup stats;
class ProcessURL,FetchContent,ExtractLinks,ScoreURLs process;
```
## How It Works
The BFS strategy crawls a website by:
1. Starting from a root URL
2. Processing all URLs at the current depth
3. Moving to URLs at the next depth level
4. Continuing until maximum depth is reached
This ensures systematic coverage of the website while maintaining control over the crawling process.
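The level-by-level order described above can be sketched with a plain FIFO queue. This is an illustrative stand-alone sketch, not the library's implementation; `get_links` is a hypothetical callback that returns the outgoing links of a page:

```python
from collections import deque

def bfs_order(start_url, get_links, max_depth):
    """Visit URLs breadth-first up to max_depth, skipping repeats."""
    seen = {start_url}
    queue = deque([(start_url, 0)])
    order = []
    while queue:
        url, depth = queue.popleft()
        order.append((url, depth))
        if depth < max_depth:
            for link in get_links(url):
                if link not in seen:
                    seen.add(link)
                    queue.append((link, depth + 1))
    return order
```

Every URL at depth *N* is visited before any URL at depth *N + 1*, which is what gives BFS its systematic, level-by-level coverage.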
## Key Features
### 1. Smart URL Processing
```python
strategy = BFSScraperStrategy(
max_depth=2,
filter_chain=my_filters,
url_scorer=my_scorer,
max_concurrent=5
)
```
- Controls crawl depth
- Filters unwanted URLs
- Scores URLs for priority
- Manages concurrent requests
### 2. Polite Crawling
The strategy automatically implements web crawling best practices:
- Respects robots.txt
- Implements rate limiting
- Adds politeness delays
- Manages concurrent requests
### 3. Link Processing Control
```python
strategy = BFSScraperStrategy(
...,
process_external_links=False # Only process internal links
)
```
- Control whether to follow external links
- Default: internal links only
- Enable external links when needed
## Configuration Options
| Parameter | Description | Default |
|-----------|-------------|---------|
| max_depth | Maximum crawl depth | Required |
| filter_chain | URL filtering rules | Required |
| url_scorer | URL priority scoring | Required |
| max_concurrent | Max parallel requests | 5 |
| min_crawl_delay | Seconds between requests | 1 |
| process_external_links | Follow external links | False |
## Best Practices
1. **Set Appropriate Depth**
- Start with smaller depths (2-3)
- Increase based on needs
- Consider site structure
2. **Configure Filters**
- Use URL patterns
- Filter by content type
- Avoid unwanted sections
3. **Tune Performance**
- Adjust max_concurrent
- Set appropriate delays
- Monitor resource usage
4. **Handle External Links**
   - Keep process_external_links=False for focused crawls
- Enable only when needed
- Consider additional filtering
## Example Usage
```python
from crawl4ai.scraper import BFSScraperStrategy
from crawl4ai.scraper.filters import FilterChain
from crawl4ai.scraper.scorers import BasicURLScorer
# Configure strategy
strategy = BFSScraperStrategy(
max_depth=3,
filter_chain=FilterChain([
URLPatternFilter("*.example.com/*"),
ContentTypeFilter(["text/html"])
]),
url_scorer=BasicURLScorer(),
max_concurrent=5,
min_crawl_delay=1,
process_external_links=False
)
# Use with AsyncWebScraper
scraper = AsyncWebScraper(crawler, strategy)
results = await scraper.ascrape("https://example.com")
```
## Common Use Cases
### 1. Site Mapping
```python
strategy = BFSScraperStrategy(
max_depth=5,
filter_chain=site_filter,
url_scorer=depth_scorer,
process_external_links=False
)
```
Perfect for creating complete site maps or understanding site structure.
### 2. Content Aggregation
```python
strategy = BFSScraperStrategy(
max_depth=2,
filter_chain=content_filter,
url_scorer=relevance_scorer,
max_concurrent=3
)
```
Ideal for collecting specific types of content (articles, products, etc.).
### 3. Link Analysis
```python
strategy = BFSScraperStrategy(
max_depth=1,
filter_chain=link_filter,
url_scorer=link_scorer,
process_external_links=True
)
```
Useful for analyzing both internal and external link structures.
## Advanced Features
### Progress Monitoring
```python
async for result in scraper.ascrape(url):
print(f"Current depth: {strategy.stats.current_depth}")
print(f"Processed URLs: {strategy.stats.urls_processed}")
```
### Custom URL Scoring
```python
class CustomScorer(URLScorer):
def score(self, url: str) -> float:
# Lower scores = higher priority
return score_based_on_criteria(url)
```
## Troubleshooting
1. **Slow Crawling**
- Increase max_concurrent
- Adjust min_crawl_delay
- Check network conditions
2. **Missing Content**
- Verify max_depth
- Check filter settings
- Review URL patterns
3. **High Resource Usage**
- Reduce max_concurrent
- Increase crawl delay
- Add more specific filters


@@ -0,0 +1,260 @@
from crawl4ai.async_configs import CrawlerRunConfig, BrowserConfig
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.deep_crawl import (
BFSDeepCrawlStrategy,
FilterChain,
URLPatternFilter,
ContentTypeFilter,
DomainFilter,
KeywordRelevanceScorer,
PathDepthScorer,
FreshnessScorer,
CompositeScorer,
)
from crawl4ai.async_webcrawler import AsyncWebCrawler
import re
import time
import logging
browser_config = BrowserConfig(headless=True, viewport_width=800, viewport_height=600)
async def basic_example():
"""
Basic example: Deep crawl a blog site for articles
- Crawls only HTML pages
- Stays within the blog section
- Collects all results at once
"""
# Create a simple filter chain
filter_chain = FilterChain(
[
# Only crawl pages within the blog section
URLPatternFilter("*/basic/*"),
# Only process HTML pages
ContentTypeFilter(["text/html"]),
]
)
# Initialize the strategy with basic configuration
bfs_strategy = BFSDeepCrawlStrategy(
max_depth=2, # Only go 2 levels deep
filter_chain=filter_chain,
url_scorer=None, # Use default scoring
process_external_links=True,
)
# Create the crawler
async with AsyncWebCrawler(
config=browser_config,
) as crawler:
# Start scraping
try:
results = await crawler.arun(
"https://crawl4ai.com/mkdocs",
CrawlerRunConfig(deep_crawl_strategy=bfs_strategy),
)
# Process results
print(f"Crawled {len(results)} pages:")
for result in results:
print(f"- {result.url}: {len(result.html)} bytes")
except Exception as e:
print(f"Error during scraping: {e}")
async def advanced_example():
"""
Advanced example: Intelligent news site crawling
- Uses all filter types
- Implements sophisticated scoring
- Streams results
- Includes monitoring and logging
"""
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("advanced_deep_crawler")
# Create sophisticated filter chain
filter_chain = FilterChain(
[
# Domain control
DomainFilter(
allowed_domains=["techcrunch.com"],
blocked_domains=["login.techcrunch.com", "legal.yahoo.com"],
),
# URL patterns
URLPatternFilter(
[
"*/article/*",
"*/news/*",
"*/blog/*",
re.compile(r"\d{4}/\d{2}/.*"), # Date-based URLs
]
),
# Content types
ContentTypeFilter(["text/html", "application/xhtml+xml"]),
]
)
# Create composite scorer
scorer = CompositeScorer(
[
# Prioritize by keywords
KeywordRelevanceScorer(
keywords=["news", "breaking", "update", "latest"], weight=1.0
),
# Prefer optimal URL structure
PathDepthScorer(optimal_depth=3, weight=0.7),
# Prioritize fresh content
FreshnessScorer(weight=0.9),
]
)
# Initialize strategy with advanced configuration
bfs_strategy = BFSDeepCrawlStrategy(
max_depth=2, filter_chain=filter_chain, url_scorer=scorer
)
# Create crawler
async with AsyncWebCrawler(
config=browser_config,
) as crawler:
# Track statistics
stats = {"processed": 0, "errors": 0, "total_size": 0}
try:
# Use streaming mode
results = []
result_generator = await crawler.arun(
"https://techcrunch.com",
config=CrawlerRunConfig(deep_crawl_strategy=bfs_strategy, stream=True),
)
async for result in result_generator:
stats["processed"] += 1
if result.success:
stats["total_size"] += len(result.html)
logger.info(
f"Processed {result.url} (depth={result.depth}, score={result.score:.3f})"
)
results.append(result)
else:
stats["errors"] += 1
logger.error(
f"Failed to process {result.url}: {result.error_message}"
)
# Log progress regularly
if stats["processed"] % 10 == 0:
logger.info(f"Progress: {stats['processed']} URLs processed")
except Exception as e:
logger.error(f"Scraping error: {e}")
finally:
# Print final statistics
logger.info("Scraping completed:")
logger.info(f"- URLs processed: {stats['processed']}")
logger.info(f"- Errors: {stats['errors']}")
logger.info(f"- Total content size: {stats['total_size'] / 1024:.2f} KB")
# Print filter statistics
for filter_ in filter_chain.filters:
logger.info(f"{filter_.name} stats:")
logger.info(f"- Passed: {filter_.stats.passed_urls}")
logger.info(f"- Rejected: {filter_.stats.rejected_urls}")
# Print scorer statistics
logger.info("Scoring statistics:")
logger.info(f"- Average score: {scorer.stats.average_score:.2f}")
logger.info(
f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}"
)
async def basic_example_many_urls():
filter_chain = FilterChain(
[
URLPatternFilter("*/basic/*"),
ContentTypeFilter(["text/html"]),
]
)
# Initialize the strategy with basic configuration
bfs_strategy = BFSDeepCrawlStrategy(
max_depth=2, # Only go 2 levels deep
filter_chain=filter_chain,
url_scorer=None,  # No scorer: URLs are visited in discovery order
process_external_links=False,
)
# Create the crawler
async with AsyncWebCrawler(
config=browser_config,
) as crawler:
# Start scraping
try:
results = await crawler.arun_many(
urls=["https://crawl4ai.com/mkdocs","https://aravindkarnam.com"],
config=CrawlerRunConfig(deep_crawl_strategy=bfs_strategy),
)
# Process results
print(f"Crawled {len(results)} pages:")
for url_result in results:
for result in url_result:
print(f"- {result.url}: {len(result.html)} bytes")
except Exception as e:
print(f"Error during scraping: {e}")
async def basic_example_many_urls_stream():
filter_chain = FilterChain(
[
URLPatternFilter("*/basic/*"),
ContentTypeFilter(["text/html"]),
]
)
# Initialize the strategy with basic configuration
bfs_strategy = BFSDeepCrawlStrategy(
max_depth=2, # Only go 2 levels deep
filter_chain=filter_chain,
url_scorer=None,  # No scorer: URLs are visited in discovery order
process_external_links=False,
)
# Create the crawler
async with AsyncWebCrawler(
config=browser_config,
) as crawler:
# Start scraping
try:
async for result in await crawler.arun_many(
urls=["https://crawl4ai.com/mkdocs","https://aravindkarnam.com"],
config=CrawlerRunConfig(deep_crawl_strategy=bfs_strategy,stream=True),
):
# Process results
print(f"- {result.url}: {len(result.html)} bytes")
except Exception as e:
print(f"Error during scraping: {e}")
if __name__ == "__main__":
import asyncio
# Run basic example
start_time = time.perf_counter()
print("Running basic Deep crawl example...")
asyncio.run(basic_example())
end_time = time.perf_counter()
print(f"Basic deep crawl example completed in {end_time - start_time:.2f} seconds")
# Run advanced example
print("\nRunning advanced deep crawl example...")
asyncio.run(advanced_example())
print("\nRunning advanced deep crawl example with arun_many...")
asyncio.run(basic_example_many_urls())
print("\nRunning advanced deep crawl example with arun_many streaming enabled...")
asyncio.run(basic_example_many_urls_stream())


@@ -0,0 +1,342 @@
# URL Filters and Scorers
The crawl4ai library provides powerful URL filtering and scoring capabilities that help you control and prioritize your web crawling. This guide explains how to use these features effectively.
```mermaid
flowchart TB
Start([URL Input]) --> Chain[Filter Chain]
subgraph Chain Process
Chain --> Pattern{URL Pattern\nFilter}
Pattern -->|Match| Content{Content Type\nFilter}
Pattern -->|No Match| Reject1[Reject URL]
Content -->|Allowed| Domain{Domain\nFilter}
Content -->|Not Allowed| Reject2[Reject URL]
Domain -->|Allowed| Accept[Accept URL]
Domain -->|Blocked| Reject3[Reject URL]
end
subgraph Statistics
Pattern --> UpdatePattern[Update Pattern Stats]
Content --> UpdateContent[Update Content Stats]
Domain --> UpdateDomain[Update Domain Stats]
Accept --> UpdateChain[Update Chain Stats]
Reject1 --> UpdateChain
Reject2 --> UpdateChain
Reject3 --> UpdateChain
end
Accept --> End([End])
Reject1 --> End
Reject2 --> End
Reject3 --> End
classDef process fill:#90caf9,stroke:#000,stroke-width:2px;
classDef decision fill:#fff59d,stroke:#000,stroke-width:2px;
classDef reject fill:#ef9a9a,stroke:#000,stroke-width:2px;
classDef accept fill:#a5d6a7,stroke:#000,stroke-width:2px;
class Start,End accept;
class Pattern,Content,Domain decision;
class Reject1,Reject2,Reject3 reject;
class Chain,UpdatePattern,UpdateContent,UpdateDomain,UpdateChain process;
```
## URL Filters
URL filters help you control which URLs are crawled. Multiple filters can be chained together to create sophisticated filtering rules.
### Available Filters
1. **URL Pattern Filter**
```python
pattern_filter = URLPatternFilter([
"*.example.com/*", # Glob pattern
"*/article/*", # Path pattern
re.compile(r"blog-\d+") # Regex pattern
])
```
- Supports glob patterns and regex
- Multiple patterns per filter
- Pattern pre-compilation for performance
2. **Content Type Filter**
```python
content_filter = ContentTypeFilter([
"text/html",
"application/pdf"
], check_extension=True)
```
- Filter by MIME types
- Extension checking
- Support for multiple content types
3. **Domain Filter**
```python
domain_filter = DomainFilter(
allowed_domains=["example.com", "blog.example.com"],
blocked_domains=["ads.example.com"]
)
```
- Allow/block specific domains
- Subdomain support
- Efficient domain matching
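To make the matching semantics above concrete, here is a small standalone sketch, not the library's implementation: glob patterns checked with `fnmatch`, and a domain check that also accepts subdomains of an allowed domain. The helpers `matches_pattern` and `domain_allowed` are illustrative names, not crawl4ai APIs.

```python
# Illustrative sketch of the filter semantics above -- not crawl4ai's code.
from fnmatch import fnmatch
from urllib.parse import urlparse

def matches_pattern(url: str, patterns: list[str]) -> bool:
    """A URL passes if any glob pattern matches it."""
    return any(fnmatch(url, p) for p in patterns)

def domain_allowed(url: str, allowed: list[str], blocked: list[str]) -> bool:
    """Accept hosts that are (subdomains of) an allowed domain
    and not (subdomains of) a blocked one."""
    host = urlparse(url).netloc
    def hits(domains):
        return any(host == d or host.endswith("." + d) for d in domains)
    return hits(allowed) and not hits(blocked)

print(matches_pattern("https://example.com/article/42", ["*/article/*"]))  # True
print(domain_allowed("https://blog.example.com/x", ["example.com"], ["ads.example.com"]))  # True
```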
### Creating Filter Chains
```python
# Create and configure a filter chain
filter_chain = FilterChain([
URLPatternFilter(["*.example.com/*"]),
ContentTypeFilter(["text/html"]),
DomainFilter(blocked_domains=["ads.*"])
])
# Add more filters
filter_chain.add_filter(
URLPatternFilter(["*/article/*"])
)
```
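Conceptually, a chain accepts a URL only if every filter in it accepts the URL, and evaluation stops at the first rejection, as the flowchart earlier in this guide depicts. A minimal sketch of that semantics, independent of the library:

```python
# Minimal sketch of filter-chain semantics (not the library implementation):
# a URL passes only if every filter accepts it; all() short-circuits on the
# first rejecting filter.
def chain_accepts(url, filters):
    return all(f(url) for f in filters)

html_only = lambda url: url.endswith((".html", "/"))
no_ads = lambda url: "ads." not in url

print(chain_accepts("https://example.com/blog/", [html_only, no_ads]))       # True
print(chain_accepts("https://ads.example.com/a.html", [html_only, no_ads]))  # False
```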
```mermaid
flowchart TB
Start([URL Input]) --> Composite[Composite Scorer]
subgraph Scoring Process
Composite --> Keywords[Keyword Relevance]
Composite --> Path[Path Depth]
Composite --> Content[Content Type]
Composite --> Fresh[Freshness]
Composite --> Domain[Domain Authority]
Keywords --> KeywordScore[Calculate Score]
Path --> PathScore[Calculate Score]
Content --> ContentScore[Calculate Score]
Fresh --> FreshScore[Calculate Score]
Domain --> DomainScore[Calculate Score]
KeywordScore --> Weight1[Apply Weight]
PathScore --> Weight2[Apply Weight]
ContentScore --> Weight3[Apply Weight]
FreshScore --> Weight4[Apply Weight]
DomainScore --> Weight5[Apply Weight]
end
Weight1 --> Combine[Combine Scores]
Weight2 --> Combine
Weight3 --> Combine
Weight4 --> Combine
Weight5 --> Combine
Combine --> Normalize{Normalize?}
Normalize -->|Yes| NormalizeScore[Normalize Combined Score]
Normalize -->|No| FinalScore[Final Score]
NormalizeScore --> FinalScore
FinalScore --> Stats[Update Statistics]
Stats --> End([End])
classDef process fill:#90caf9,stroke:#000,stroke-width:2px;
classDef scorer fill:#fff59d,stroke:#000,stroke-width:2px;
classDef calc fill:#a5d6a7,stroke:#000,stroke-width:2px;
classDef decision fill:#ef9a9a,stroke:#000,stroke-width:2px;
class Start,End calc;
class Keywords,Path,Content,Fresh,Domain scorer;
class KeywordScore,PathScore,ContentScore,FreshScore,DomainScore process;
class Normalize decision;
```
## URL Scorers
URL scorers help prioritize which URLs to crawl first. Higher scores indicate higher priority.
### Available Scorers
1. **Keyword Relevance Scorer**
```python
keyword_scorer = KeywordRelevanceScorer(
keywords=["python", "programming"],
weight=1.0,
case_sensitive=False
)
```
- Score based on keyword matches
- Case sensitivity options
- Weighted scoring
2. **Path Depth Scorer**
```python
path_scorer = PathDepthScorer(
optimal_depth=3, # Preferred URL depth
weight=0.7
)
```
- Score based on URL path depth
- Configurable optimal depth
- Diminishing returns for deeper paths
3. **Content Type Scorer**
```python
content_scorer = ContentTypeScorer({
r'\.html$': 1.0,
r'\.pdf$': 0.8,
r'\.xml$': 0.6
})
```
- Score based on file types
- Configurable type weights
- Pattern matching support
4. **Freshness Scorer**
```python
freshness_scorer = FreshnessScorer(weight=0.9)
```
- Score based on date indicators in URLs
- Multiple date format support
- Recency weighting
5. **Domain Authority Scorer**
```python
authority_scorer = DomainAuthorityScorer({
"python.org": 1.0,
"github.com": 0.9,
"medium.com": 0.7
})
```
- Score based on domain importance
- Configurable domain weights
- Default weight for unknown domains
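The "diminishing returns" behavior of depth scoring can be pictured with a simple distance-based formula. This is an illustrative sketch of one plausible shape, not the library's exact scoring function:

```python
# Illustrative depth scoring: 1.0 at the optimal depth, decaying as the
# path gets shallower or deeper (sketch only, not crawl4ai's formula).
from urllib.parse import urlparse

def path_depth_score(url: str, optimal_depth: int = 3) -> float:
    depth = len([seg for seg in urlparse(url).path.split("/") if seg])
    return 1.0 / (1.0 + abs(depth - optimal_depth))

print(path_depth_score("https://example.com/a/b/c"))  # depth 3 -> 1.0
```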
### Combining Scorers
```python
# Create a composite scorer
composite_scorer = CompositeScorer([
KeywordRelevanceScorer(["python"], weight=1.0),
PathDepthScorer(optimal_depth=2, weight=0.7),
FreshnessScorer(weight=0.8)
], normalize=True)
```
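How the combined score is computed is worth spelling out: a weighted sum of the individual scores, divided by the total weight when `normalize=True` so the result stays on the scorers' 0 to 1 scale. This combination rule is an assumption, sketched in plain Python rather than taken from the library source:

```python
# Assumed combination rule for a composite scorer (plain-Python sketch).
def composite_score(scores_and_weights, normalize=True):
    """Combine (score, weight) pairs into one priority value."""
    weighted = sum(score * weight for score, weight in scores_and_weights)
    if normalize:
        total_weight = sum(weight for _, weight in scores_and_weights)
        return weighted / total_weight if total_weight else 0.0
    return weighted

# e.g. keyword=0.8 (w=1.0), depth=1.0 (w=0.7), freshness=0.5 (w=0.8)
print(composite_score([(0.8, 1.0), (1.0, 0.7), (0.5, 0.8)]))
```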
## Best Practices
### Filter Configuration
1. **Start Restrictive**
```python
# Begin with strict filters
filter_chain = FilterChain([
DomainFilter(allowed_domains=["example.com"]),
ContentTypeFilter(["text/html"])
])
```
2. **Layer Filters**
```python
# Add more specific filters
filter_chain.add_filter(
URLPatternFilter(["*/article/*", "*/blog/*"])
)
```
3. **Monitor Filter Statistics**
```python
# Check filter performance
for filter_ in filter_chain.filters:
    print(f"{filter_.name}: {filter_.stats.rejected_urls} rejected")
```
### Scorer Configuration
1. **Balance Weights**
```python
# Balanced scoring configuration: give each signal comparable weight
scorer = CompositeScorer([
    KeywordRelevanceScorer(["python", "programming"], weight=1.0),
    PathDepthScorer(optimal_depth=3, weight=1.0),
    FreshnessScorer(weight=1.0),
])
```
2. **Customize for Content**
```python
# News site configuration
news_scorer = CompositeScorer([
KeywordRelevanceScorer(["news", "article"], weight=1.0),
FreshnessScorer(weight=1.0),
PathDepthScorer(optimal_depth=2, weight=0.5)
])
```
3. **Monitor Scoring Statistics**
```python
# Check scoring distribution
print(f"Average score: {scorer.stats.average_score}")
print(f"Score range: {scorer.stats.min_score} - {scorer.stats.max_score}")
```
## Common Use Cases
### Blog Crawling
```python
blog_config = {
'filters': FilterChain([
URLPatternFilter(["*/blog/*", "*/post/*"]),
ContentTypeFilter(["text/html"])
]),
'scorer': CompositeScorer([
FreshnessScorer(weight=1.0),
KeywordRelevanceScorer(["blog", "article"], weight=0.8)
])
}
```
### Documentation Sites
```python
docs_config = {
'filters': FilterChain([
URLPatternFilter(["*/docs/*", "*/guide/*"]),
ContentTypeFilter(["text/html", "application/pdf"])
]),
'scorer': CompositeScorer([
PathDepthScorer(optimal_depth=3, weight=1.0),
KeywordRelevanceScorer(["guide", "tutorial"], weight=0.9)
])
}
```
### E-commerce Sites
```python
ecommerce_config = {
'filters': FilterChain([
URLPatternFilter(["*/product/*", "*/category/*"]),
DomainFilter(blocked_domains=["ads.*", "tracker.*"])
]),
'scorer': CompositeScorer([
PathDepthScorer(optimal_depth=2, weight=1.0),
ContentTypeScorer({
r'/product/': 1.0,
r'/category/': 0.8
})
])
}
```
## Advanced Topics
### Custom Filters
```python
class CustomFilter(URLFilter):
def apply(self, url: str) -> bool:
# Your custom filtering logic
return True
```
### Custom Scorers
```python
class CustomScorer(URLScorer):
def _calculate_score(self, url: str) -> float:
# Your custom scoring logic
return 1.0
```
For more examples, check the [GitHub repository](https://github.com/unclecode/crawl4ai).


@@ -0,0 +1,206 @@
# Scraper Examples Guide
This guide provides two complete examples of using the crawl4ai scraper: a basic implementation for simple use cases and an advanced implementation showcasing all features.
## Basic Example
The basic example demonstrates a simple blog scraping scenario:
```python
from crawl4ai.scraper import (
    AsyncWebScraper,
    BFSScraperStrategy,
    FilterChain,
    URLPatternFilter,
    ContentTypeFilter,
)
from crawl4ai.async_webcrawler import AsyncWebCrawler

# Create simple filter chain
filter_chain = FilterChain([
URLPatternFilter("*/blog/*"),
ContentTypeFilter(["text/html"])
])
# Initialize strategy
strategy = BFSScraperStrategy(
max_depth=2,
filter_chain=filter_chain,
url_scorer=None,
max_concurrent=3
)
# Create and run scraper
crawler = AsyncWebCrawler()
scraper = AsyncWebScraper(crawler, strategy)
result = await scraper.ascrape("https://example.com/blog/")
```
### Features Demonstrated
- Basic URL filtering
- Simple content type filtering
- Depth control
- Concurrent request limiting
- Result collection
## Advanced Example
The advanced example shows a sophisticated news site scraping setup with all features enabled:
```python
import re

# Create comprehensive filter chain
filter_chain = FilterChain([
DomainFilter(
allowed_domains=["example.com"],
blocked_domains=["ads.example.com"]
),
URLPatternFilter([
"*/article/*",
re.compile(r"\d{4}/\d{2}/.*")
]),
ContentTypeFilter(["text/html"])
])
# Create intelligent scorer
scorer = CompositeScorer([
KeywordRelevanceScorer(
keywords=["news", "breaking"],
weight=1.0
),
PathDepthScorer(optimal_depth=3, weight=0.7),
FreshnessScorer(weight=0.9)
])
# Initialize advanced strategy
strategy = BFSScraperStrategy(
max_depth=4,
filter_chain=filter_chain,
url_scorer=scorer,
max_concurrent=5
)
```
### Features Demonstrated
1. **Advanced Filtering**
- Domain filtering
- Pattern matching
- Content type control
2. **Intelligent Scoring**
- Keyword relevance
- Path optimization
- Freshness priority
3. **Monitoring**
- Progress tracking
- Error handling
- Statistics collection
4. **Resource Management**
- Concurrent processing
- Rate limiting
- Cleanup handling
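The monitoring bullet above reduces to a counters-plus-logging loop. A standalone sketch, using plain dicts as stand-ins for result objects (not library types):

```python
# Sketch of the monitoring pattern (dicts stand in for crawl results).
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("scraper_stats")

def track(results):
    stats = {"processed": 0, "errors": 0}
    for result in results:
        stats["processed"] += 1
        if not result.get("success"):
            stats["errors"] += 1
        if stats["processed"] % 10 == 0:
            logger.info("Progress: %d URLs processed", stats["processed"])
    return stats

print(track([{"success": True}, {"success": False}, {"success": True}]))
# {'processed': 3, 'errors': 1}
```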
## Running the Examples
```bash
# Basic usage
python basic_scraper_example.py
# Advanced usage with logging
PYTHONPATH=. python advanced_scraper_example.py
```
## Example Output
### Basic Example
```
Crawled 15 pages:
- https://example.com/blog/post1: 24560 bytes
- https://example.com/blog/post2: 18920 bytes
...
```
### Advanced Example
```
INFO: Starting crawl of https://example.com/news/
INFO: Processed: https://example.com/news/breaking/story1
DEBUG: KeywordScorer: 0.85
DEBUG: FreshnessScorer: 0.95
INFO: Progress: 10 URLs processed
...
INFO: Scraping completed:
INFO: - URLs processed: 50
INFO: - Errors: 2
INFO: - Total content size: 1240.50 KB
```
## Customization
### Adding Custom Filters
```python
class CustomFilter(URLFilter):
def apply(self, url: str) -> bool:
# Your custom filtering logic
return True
filter_chain.add_filter(CustomFilter())
```
### Custom Scoring Logic
```python
class CustomScorer(URLScorer):
def _calculate_score(self, url: str) -> float:
# Your custom scoring logic
return 1.0
scorer = CompositeScorer([
CustomScorer(weight=1.0),
...
])
```
## Best Practices
1. **Start Simple**
- Begin with basic filtering
- Add features incrementally
- Test thoroughly at each step
2. **Monitor Performance**
- Watch memory usage
- Track processing times
- Adjust concurrency as needed
3. **Handle Errors**
- Implement proper error handling
- Log important events
- Track error statistics
4. **Optimize Resources**
- Set appropriate delays
- Limit concurrent requests
- Use streaming for large crawls
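The concurrency and delay advice above is what settings like `max_concurrent` automate; the underlying pattern is an `asyncio.Semaphore` plus a polite delay. A standalone sketch with a stubbed `fetch` (nothing here touches the network):

```python
import asyncio

async def fetch(url: str) -> str:
    await asyncio.sleep(0.01)  # stand-in for a real network request
    return f"<html>{url}</html>"

async def crawl_all(urls, max_concurrent=3, delay=0.05):
    sem = asyncio.Semaphore(max_concurrent)

    async def bounded(url):
        async with sem:  # at most max_concurrent requests in flight
            result = await fetch(url)
            await asyncio.sleep(delay)  # polite delay before releasing the slot
            return result

    return await asyncio.gather(*(bounded(u) for u in urls))

results = asyncio.run(crawl_all([f"https://example.com/{i}" for i in range(5)]))
print(len(results))  # 5
```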
## Troubleshooting
Common issues and solutions:
1. **Too Many Requests**
```python
strategy = BFSScraperStrategy(
max_concurrent=3, # Reduce concurrent requests
min_crawl_delay=2 # Increase delay between requests
)
```
2. **Memory Issues**
```python
# Use streaming mode for large crawls
async for result in scraper.ascrape(url, stream=True):
process_result(result)
```
3. **Missing Content**
```python
# Check your filter chain
filter_chain = FilterChain([
URLPatternFilter("*"), # Broaden patterns
ContentTypeFilter(["*"]) # Accept all content
])
```
For more examples and use cases, visit our [GitHub repository](https://github.com/unclecode/crawl4ai).


@@ -134,4 +134,4 @@ This script should successfully crawl the example website and print the first 50
If you encounter any issues during installation or usage, please check the [documentation](https://docs.crawl4ai.com/) or raise an issue on the [GitHub repository](https://github.com/unclecode/crawl4ai/issues).
Happy crawling! 🕷️🤖

models.py Normal file

tests/test_scraper.py Normal file

@@ -0,0 +1,184 @@
# basic_scraper_example.py
from crawl4ai.scraper import (
AsyncWebScraper,
BFSScraperStrategy,
FilterChain,
URLPatternFilter,
ContentTypeFilter
)
from crawl4ai.async_webcrawler import AsyncWebCrawler
async def basic_scraper_example():
"""
Basic example: Scrape a blog site for articles
- Crawls only HTML pages
- Stays within the blog section
- Collects all results at once
"""
# Create a simple filter chain
filter_chain = FilterChain([
# Only crawl pages within the blog section
URLPatternFilter("*/blog/*"),
# Only process HTML pages
ContentTypeFilter(["text/html"])
])
# Initialize the strategy with basic configuration
strategy = BFSScraperStrategy(
max_depth=2, # Only go 2 levels deep
filter_chain=filter_chain,
url_scorer=None, # No scorer: URLs are visited in discovery order
max_concurrent=3 # Limit concurrent requests
)
# Create the crawler and scraper
crawler = AsyncWebCrawler()
scraper = AsyncWebScraper(crawler, strategy)
# Start scraping
try:
result = await scraper.ascrape("https://example.com/blog/")
# Process results
print(f"Crawled {len(result.crawled_urls)} pages:")
for url, data in result.extracted_data.items():
print(f"- {url}: {len(data.html)} bytes")
except Exception as e:
print(f"Error during scraping: {e}")
# advanced_scraper_example.py
import logging
import re
from crawl4ai.scraper import (
AsyncWebScraper,
BFSScraperStrategy,
FilterChain,
URLPatternFilter,
ContentTypeFilter,
DomainFilter,
KeywordRelevanceScorer,
PathDepthScorer,
FreshnessScorer,
CompositeScorer
)
from crawl4ai.async_webcrawler import AsyncWebCrawler
async def advanced_scraper_example():
"""
Advanced example: Intelligent news site scraping
- Uses all filter types
- Implements sophisticated scoring
- Streams results
- Includes monitoring and logging
"""
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("advanced_scraper")
# Create sophisticated filter chain
filter_chain = FilterChain([
# Domain control
DomainFilter(
allowed_domains=["example.com", "blog.example.com"],
blocked_domains=["ads.example.com", "tracker.example.com"]
),
# URL patterns
URLPatternFilter([
"*/article/*",
"*/news/*",
"*/blog/*",
re.compile(r"\d{4}/\d{2}/.*") # Date-based URLs
]),
# Content types
ContentTypeFilter([
"text/html",
"application/xhtml+xml"
])
])
# Create composite scorer
scorer = CompositeScorer([
# Prioritize by keywords
KeywordRelevanceScorer(
keywords=["news", "breaking", "update", "latest"],
weight=1.0
),
# Prefer optimal URL structure
PathDepthScorer(
optimal_depth=3,
weight=0.7
),
# Prioritize fresh content
FreshnessScorer(weight=0.9)
])
# Initialize strategy with advanced configuration
strategy = BFSScraperStrategy(
max_depth=4,
filter_chain=filter_chain,
url_scorer=scorer,
max_concurrent=5,
min_crawl_delay=1
)
# Create crawler and scraper
crawler = AsyncWebCrawler()
scraper = AsyncWebScraper(crawler, strategy)
# Track statistics
stats = {
'processed': 0,
'errors': 0,
'total_size': 0
}
try:
# Use streaming mode
async for result in scraper.ascrape("https://example.com/news/", stream=True):
stats['processed'] += 1
if result.success:
stats['total_size'] += len(result.html)
logger.info(f"Processed: {result.url}")
# Print scoring information
for scorer_name, score in result.scores.items():
logger.debug(f"{scorer_name}: {score:.2f}")
else:
stats['errors'] += 1
logger.error(f"Failed to process {result.url}: {result.error_message}")
# Log progress regularly
if stats['processed'] % 10 == 0:
logger.info(f"Progress: {stats['processed']} URLs processed")
except Exception as e:
logger.error(f"Scraping error: {e}")
finally:
# Print final statistics
logger.info("Scraping completed:")
logger.info(f"- URLs processed: {stats['processed']}")
logger.info(f"- Errors: {stats['errors']}")
logger.info(f"- Total content size: {stats['total_size'] / 1024:.2f} KB")
# Print filter statistics
for filter_ in filter_chain.filters:
logger.info(f"{filter_.name} stats:")
logger.info(f"- Passed: {filter_.stats.passed_urls}")
logger.info(f"- Rejected: {filter_.stats.rejected_urls}")
# Print scorer statistics
logger.info("Scoring statistics:")
logger.info(f"- Average score: {scorer.stats.average_score:.2f}")
logger.info(f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}")
if __name__ == "__main__":
import asyncio
# Run basic example
print("Running basic scraper example...")
asyncio.run(basic_scraper_example())
print("\nRunning advanced scraper example...")
asyncio.run(advanced_scraper_example())