Refactor: Removed all scheduling logic from the scraper. From now on, the scraper expects arun_many to handle all scheduling; the scraper itself only does traversal, validation, compliance checks, URL filtering, scoring, etc. Reformatted some of the scraper files with the Black code formatter.

2025-01-21 17:49:51 +05:30
parent 26d78d8512
commit 67fa06c09b
9 changed files with 316 additions and 243 deletions
--- a/crawl4ai/config.py
+++ b/crawl4ai/config.py
@@ -84,3 +84,4 @@ SHOW_DEPRECATION_WARNINGS = True
 SCREENSHOT_HEIGHT_TRESHOLD = 10000
 PAGE_TIMEOUT = 60000
 DOWNLOAD_PAGE_TIMEOUT = 60000
+SCRAPER_BATCH_SIZE = 5
--- a/crawl4ai/scraper/__init__.py
+++ b/crawl4ai/scraper/__init__.py
@@ -1,5 +1,16 @@
 from .async_web_scraper import AsyncWebScraper
 from .bfs_scraper_strategy import BFSScraperStrategy
-from .filters import URLFilter, FilterChain, URLPatternFilter, ContentTypeFilter, DomainFilter
-from .scorers import KeywordRelevanceScorer, PathDepthScorer, FreshnessScorer, CompositeScorer
+from .filters import (
+    URLFilter,
+    FilterChain,
+    URLPatternFilter,
+    ContentTypeFilter,
+    DomainFilter,
+)
+from .scorers import (
+    KeywordRelevanceScorer,
+    PathDepthScorer,
+    FreshnessScorer,
+    CompositeScorer,
+)
 from .scraper_strategy import ScraperStrategy
--- a/crawl4ai/scraper/async_web_scraper.py
+++ b/crawl4ai/scraper/async_web_scraper.py
@@ -6,13 +6,16 @@ import logging
 from dataclasses import dataclass
 from contextlib import asynccontextmanager

+
@dataclass
 class ScrapingProgress:
    """Tracks the progress of a scraping operation."""
+
    processed_urls: int = 0
    failed_urls: int = 0
    current_url: Optional[str] = None

+
 class AsyncWebScraper:
    """
    A high-level web scraper that combines an async crawler with a scraping strategy.
@@ -27,7 +30,7 @@ class AsyncWebScraper:
        self,
        crawler: AsyncWebCrawler,
        strategy: ScraperStrategy,
-        logger: Optional[logging.Logger] = None
+        logger: Optional[logging.Logger] = None,
    ):
        if not isinstance(crawler, AsyncWebCrawler):
            raise TypeError("crawler must be an instance of AsyncWebCrawler")
@@ -55,9 +58,7 @@ class AsyncWebScraper:
            raise

    async def ascrape(
-        self, 
-        url: str, 
-        stream: bool = False
+        self, url: str, stream: bool = False
    ) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]:
        """
        Scrape a website starting from the given URL.
@@ -110,9 +111,9 @@ class AsyncWebScraper:
                crawled_urls=list(extracted_data.keys()),
                extracted_data=extracted_data,
                stats={
-                    'processed_urls': self._progress.processed_urls,
-                    'failed_urls': self._progress.failed_urls
-                }
+                    "processed_urls": self._progress.processed_urls,
+                    "failed_urls": self._progress.failed_urls,
+                },
            )
        except Exception as e:
            self.logger.error(f"Error in collecting scrape: {str(e)}")
--- a/crawl4ai/scraper/bfs_scraper_strategy.py
+++ b/crawl4ai/scraper/bfs_scraper_strategy.py
@@ -7,16 +7,19 @@ from urllib.parse import urlparse
 from urllib.robotparser import RobotFileParser
 import validators

-from crawl4ai.async_configs import CrawlerRunConfig
+from ..async_configs import CrawlerRunConfig
 from .models import CrawlResult
 from .filters import FilterChain
 from .scorers import URLScorer
 from ..async_webcrawler import AsyncWebCrawler
 from .scraper_strategy import ScraperStrategy
+from ..config import SCRAPER_BATCH_SIZE
+

@dataclass
 class CrawlStats:
    """Statistics for the crawling process"""
+
    start_time: datetime
    urls_processed: int = 0
    urls_failed: int = 0
@@ -25,6 +28,7 @@ class CrawlStats:
    current_depth: int = 0
    robots_blocked: int = 0

+
 class BFSScraperStrategy(ScraperStrategy):
    """Breadth-First Search scraping strategy with politeness controls"""

@@ -34,7 +38,7 @@ class BFSScraperStrategy(ScraperStrategy):
        filter_chain: FilterChain,
        url_scorer: URLScorer,
        process_external_links: bool = False,
-        logger: Optional[logging.Logger] = None
+        logger: Optional[logging.Logger] = None,
    ):
        self.max_depth = max_depth
        self.filter_chain = filter_chain
@@ -74,11 +78,11 @@ class BFSScraperStrategy(ScraperStrategy):

    async def _get_robot_parser(self, url: str) -> Optional[RobotFileParser]:
        """Get or create robots.txt parser for domain.
-            This is our robots.txt manager that:
-                - Uses domain-level caching of robot parsers
-                - Creates and caches new parsers as needed
-                - Handles failed robots.txt fetches gracefully
-                - Returns None if robots.txt can't be fetched, allowing crawling to proceed        
+        This is our robots.txt manager that:
+            - Uses domain-level caching of robot parsers
+            - Creates and caches new parsers as needed
+            - Handles failed robots.txt fetches gracefully
+            - Returns None if robots.txt can't be fetched, allowing crawling to proceed
        """
        domain = urlparse(url).netloc
        if domain not in self.robot_parsers:
@@ -100,7 +104,7 @@ class BFSScraperStrategy(ScraperStrategy):
        depth: int,
        queue: asyncio.PriorityQueue,
        visited: Set[str],
-        depths: Dict[str, int]
+        depths: Dict[str, int],
    ):
        """Process extracted links from crawl result.
        This is our link processor that:
@@ -116,7 +120,7 @@ class BFSScraperStrategy(ScraperStrategy):
        if self.process_external_links:
            links_to_process += result.links["external"]
        for link in links_to_process:
-            url = link['href']
+            url = link["href"]
            if not await self.can_process_url(url, depth):
                self.stats.urls_skipped += 1
                continue
@@ -132,8 +136,7 @@ class BFSScraperStrategy(ScraperStrategy):
                    await queue.put((score, new_depth, url))
                    depths[url] = new_depth
                    self.stats.total_depth_reached = max(
-                        self.stats.total_depth_reached, 
-                        new_depth
+                        self.stats.total_depth_reached, new_depth
                    )

    async def ascrape(
@@ -151,53 +154,72 @@ class BFSScraperStrategy(ScraperStrategy):
            URL: The actual URL to crawl
        visited: Keeps track of URLs we've already seen to avoid cycles
        depths: Maps URLs to their depths from the start URL
-        pending_tasks: Tracks currently running crawl tasks        
+        active_crawls: Tracks currently running crawl tasks        
        """
        queue = asyncio.PriorityQueue()
        await queue.put((0, 0, start_url))
        visited: Set[str] = set()
        depths = {start_url: 0}
-        
+        active_crawls = set()  # Track URLs currently being processed
        try:
-            while not queue.empty() and not self._cancel_event.is_set():
+            while (
+                not queue.empty() or active_crawls
+            ) and not self._cancel_event.is_set():
                """
                This sets up our main control loop which:
                    - Continues while there are URLs to process (not queue.empty())
-                    - Or while there are tasks still running (pending_tasks)
+                    - Or while there are active crawls still running (arun_many)
                    - Can be interrupted via cancellation (not self._cancel_event.is_set())
                """
-                n = 3
+                # Collect batch of jobs to process
                jobs = []
-                for _ in range(n):
-                    if self.queue.empty():
-                        break
-                    jobs.append(await self.queue.get())
+                # Fill batch with available jobs
+                while len(jobs) < SCRAPER_BATCH_SIZE and not queue.empty():
+                    score, depth, url = await queue.get()
+                    if url not in active_crawls:  # Only add if not currently processing
+                        jobs.append((score, depth, url))
+                        active_crawls.add(url)
+                        self.stats.current_depth = depth

-                # Filter jobs directly, ensuring uniqueness and checking against visited
-                filtered_jobs = []
-                for job in jobs:
-                    _, depth, url = job
-                    self.stats.current_depth = depth
-                    if url not in visited:
-                        visited.add(url)
-                        filtered_jobs.append(job)
+                if not jobs:
+                    # If no jobs but active crawls exist, wait a bit and continue
+                    if active_crawls:
+                        await asyncio.sleep(0.1)
+                    continue

-                crawler_config = CrawlerRunConfig(cache_mode="BYPASS")
-                async for result in await crawler.arun_many(urls=[url for _, _, url in filtered_jobs],
-                                                            config=crawler_config.clone(stream=True)):
-                    print(f"Received result for: {result.url} - Success: {result.success}")
-                    source_url, depth = next((url, depth) for _, depth, url in filtered_jobs if url == result.source_url)
-                    await self._process_links(result, source_url, depth, queue, visited, depths)
-                    yield result
+                # Process batch
+                crawler_config = CrawlerRunConfig(cache_mode="BYPASS", stream=True)
+                try:
+                    async for result in await crawler.arun_many(
+                        urls=[url for _, _, url in jobs], config=crawler_config
+                    ):
+                        source_url, depth = next(
+                            (url, depth) for _, depth, url in jobs if url == result.url
+                        )
+                        active_crawls.remove(source_url)  # Remove from active set
+
+                        if result.success:
+                            await self._process_links(
+                                result, source_url, depth, queue, visited, depths
+                            )
+                            yield result
+                        else:
+                            self.logger.warning(
+                                f"Failed to crawl {result.url}: {result.error_message}"
+                            )
+                except Exception as e:
+                    # Remove failed URLs from active set
+                    for _, _, url in jobs:
+                        active_crawls.discard(url)
+                    self.logger.error(f"Batch processing error: {e}")
+                    # Continue processing other batches
+                    continue

        except Exception as e:
            self.logger.error(f"Error in crawl process: {e}")
            raise

        finally:
-            # Clean up any remaining tasks
-            # for task in pending_tasks:
-            #     task.cancel()
            self.stats.end_time = datetime.now()

    async def shutdown(self):
--- a/crawl4ai/scraper/filters.py
+++ b/crawl4ai/scraper/filters.py
@@ -11,13 +11,16 @@ import logging
 from dataclasses import dataclass
 import fnmatch

+
@dataclass
 class FilterStats:
    """Statistics for filter applications"""
+
    total_urls: int = 0
    rejected_urls: int = 0
    passed_urls: int = 0

+
 class URLFilter(ABC):
    """Base class for URL filters"""

@@ -39,6 +42,7 @@ class URLFilter(ABC):
        else:
            self.stats.rejected_urls += 1

+
 class FilterChain:
    """Chain of URL filters."""

@@ -47,7 +51,7 @@ class FilterChain:
        self.stats = FilterStats()
        self.logger = logging.getLogger("urlfilter.chain")

-    def add_filter(self, filter_: URLFilter) -> 'FilterChain':
+    def add_filter(self, filter_: URLFilter) -> "FilterChain":
        """Add a filter to the chain"""
        self.filters.append(filter_)
        return self  # Enable method chaining
@@ -65,6 +69,7 @@ class FilterChain:
        self.stats.passed_urls += 1
        return True

+
 class URLPatternFilter(URLFilter):
    """Filter URLs based on glob patterns or regex.

@@ -79,8 +84,11 @@ class URLPatternFilter(URLFilter):
    - Pattern pre-compilation for performance
    """

-    def __init__(self, patterns: Union[str, Pattern, List[Union[str, Pattern]]], 
-                 use_glob: bool = True):
+    def __init__(
+        self,
+        patterns: Union[str, Pattern, List[Union[str, Pattern]]],
+        use_glob: bool = True,
+    ):
        super().__init__()
        self.patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns
        self.use_glob = use_glob
@@ -90,7 +98,9 @@ class URLPatternFilter(URLFilter):
            if isinstance(pattern, str) and use_glob:
                self._compiled_patterns.append(self._glob_to_regex(pattern))
            else:
-                self._compiled_patterns.append(re.compile(pattern) if isinstance(pattern, str) else pattern)
+                self._compiled_patterns.append(
+                    re.compile(pattern) if isinstance(pattern, str) else pattern
+                )

    def _glob_to_regex(self, pattern: str) -> Pattern:
        """Convert glob pattern to regex"""
@@ -102,6 +112,7 @@ class URLPatternFilter(URLFilter):
        self._update_stats(matches)
        return matches

+
 class ContentTypeFilter(URLFilter):
    """Filter URLs based on expected content type.

@@ -115,10 +126,13 @@ class ContentTypeFilter(URLFilter):
    - Support for multiple content types
    """

-    def __init__(self, allowed_types: Union[str, List[str]], 
-                 check_extension: bool = True):
+    def __init__(
+        self, allowed_types: Union[str, List[str]], check_extension: bool = True
+    ):
        super().__init__()
-        self.allowed_types = [allowed_types] if isinstance(allowed_types, str) else allowed_types
+        self.allowed_types = (
+            [allowed_types] if isinstance(allowed_types, str) else allowed_types
+        )
        self.check_extension = check_extension
        self._normalize_types()

@@ -128,12 +142,18 @@ class ContentTypeFilter(URLFilter):

    def _check_extension(self, url: str) -> bool:
        """Check URL's file extension"""
-        ext = urlparse(url).path.split('.')[-1].lower() if '.' in urlparse(url).path else ''
+        ext = (
+            urlparse(url).path.split(".")[-1].lower()
+            if "." in urlparse(url).path
+            else ""
+        )
        if not ext:
            return True  # No extension, might be dynamic content

        guessed_type = mimetypes.guess_type(url)[0]
-        return any(allowed in (guessed_type or '').lower() for allowed in self.allowed_types)
+        return any(
+            allowed in (guessed_type or "").lower() for allowed in self.allowed_types
+        )

    def apply(self, url: str) -> bool:
        """Check if URL's content type is allowed"""
@@ -143,6 +163,7 @@ class ContentTypeFilter(URLFilter):
        self._update_stats(result)
        return result

+
 class DomainFilter(URLFilter):
    """Filter URLs based on allowed/blocked domains.

@@ -156,11 +177,18 @@ class DomainFilter(URLFilter):
    - Efficient domain matching
    """

-    def __init__(self, allowed_domains: Union[str, List[str]] = None, 
-                 blocked_domains: Union[str, List[str]] = None):
+    def __init__(
+        self,
+        allowed_domains: Union[str, List[str]] = None,
+        blocked_domains: Union[str, List[str]] = None,
+    ):
        super().__init__()
-        self.allowed_domains = set(self._normalize_domains(allowed_domains)) if allowed_domains else None
-        self.blocked_domains = set(self._normalize_domains(blocked_domains)) if blocked_domains else set()
+        self.allowed_domains = (
+            set(self._normalize_domains(allowed_domains)) if allowed_domains else None
+        )
+        self.blocked_domains = (
+            set(self._normalize_domains(blocked_domains)) if blocked_domains else set()
+        )

    def _normalize_domains(self, domains: Union[str, List[str]]) -> List[str]:
        """Normalize domain strings"""
@@ -187,19 +215,21 @@ class DomainFilter(URLFilter):
        self._update_stats(True)
        return True

+
 # Example usage:
 def create_common_filter_chain() -> FilterChain:
    """Create a commonly used filter chain"""
-    return FilterChain([
-        URLPatternFilter([
-            "*.html", "*.htm",  # HTML files
-            "*/article/*", "*/blog/*"  # Common content paths
-        ]),
-        ContentTypeFilter([
-            "text/html",
-            "application/xhtml+xml"
-        ]),
-        DomainFilter(
-            blocked_domains=["ads.*", "analytics.*"]
-        )
-    ])
+    return FilterChain(
+        [
+            URLPatternFilter(
+                [
+                    "*.html",
+                    "*.htm",  # HTML files
+                    "*/article/*",
+                    "*/blog/*",  # Common content paths
+                ]
+            ),
+            ContentTypeFilter(["text/html", "application/xhtml+xml"]),
+            DomainFilter(blocked_domains=["ads.*", "analytics.*"]),
+        ]
+    )
--- a/crawl4ai/scraper/models.py
+++ b/crawl4ai/scraper/models.py
@@ -2,7 +2,8 @@ from pydantic import BaseModel
 from typing import List, Dict
 from ..models import CrawlResult

+
 class ScraperResult(BaseModel):
    url: str
    crawled_urls: List[str]
-    extracted_data: Dict[str,CrawlResult]
+    extracted_data: Dict[str, CrawlResult]
--- a/crawl4ai/scraper/scorers.py
+++ b/crawl4ai/scraper/scorers.py
@@ -10,13 +10,15 @@ from collections import defaultdict
 import math
 import logging

+
@dataclass
 class ScoringStats:
    """Statistics for URL scoring"""
+
    urls_scored: int = 0
    total_score: float = 0.0
-    min_score: float = float('inf')
-    max_score: float = float('-inf')
+    min_score: float = float("inf")
+    max_score: float = float("-inf")

    def update(self, score: float):
        """Update scoring statistics"""
@@ -30,6 +32,7 @@ class ScoringStats:
        """Calculate average score"""
        return self.total_score / self.urls_scored if self.urls_scored > 0 else 0.0

+
 class URLScorer(ABC):
    """Base class for URL scoring strategies"""

@@ -51,6 +54,7 @@ class URLScorer(ABC):
        self.stats.update(weighted_score)
        return weighted_score

+
 class CompositeScorer(URLScorer):
    """Combines multiple scorers with weights"""

@@ -68,6 +72,7 @@ class CompositeScorer(URLScorer):

        return total_score

+
 class KeywordRelevanceScorer(URLScorer):
    """Score URLs based on keyword relevance.

@@ -82,8 +87,9 @@ class KeywordRelevanceScorer(URLScorer):
    - Weighted scoring
    """

-    def __init__(self, keywords: List[str], weight: float = 1.0,
-                 case_sensitive: bool = False):
+    def __init__(
+        self, keywords: List[str], weight: float = 1.0, case_sensitive: bool = False
+    ):
        super().__init__(weight=weight)
        self.keywords = keywords
        self.case_sensitive = case_sensitive
@@ -98,12 +104,12 @@ class KeywordRelevanceScorer(URLScorer):
        """Calculate score based on keyword matches"""
        decoded_url = unquote(url)
        total_matches = sum(
-            1 for pattern in self.patterns
-            if pattern.search(decoded_url)
+            1 for pattern in self.patterns if pattern.search(decoded_url)
        )
        # Normalize score between 0 and 1
        return total_matches / len(self.patterns) if self.patterns else 0.0

+
 class PathDepthScorer(URLScorer):
    """Score URLs based on their path depth.

@@ -124,12 +130,13 @@ class PathDepthScorer(URLScorer):
    def _calculate_score(self, url: str) -> float:
        """Calculate score based on path depth"""
        path = urlparse(url).path
-        depth = len([x for x in path.split('/') if x])
+        depth = len([x for x in path.split("/") if x])

        # Score decreases as we move away from optimal depth
        distance_from_optimal = abs(depth - self.optimal_depth)
        return 1.0 / (1.0 + distance_from_optimal)

+
 class ContentTypeScorer(URLScorer):
    """Score URLs based on content type preferences.

@@ -152,8 +159,7 @@ class ContentTypeScorer(URLScorer):
    def _compile_patterns(self):
        """Prepare content type patterns"""
        self.patterns = {
-            re.compile(pattern): weight
-            for pattern, weight in self.type_weights.items()
+            re.compile(pattern): weight for pattern, weight in self.type_weights.items()
        }

    def _calculate_score(self, url: str) -> float:
@@ -163,6 +169,7 @@ class ContentTypeScorer(URLScorer):
                return weight
        return 0.0

+
 class FreshnessScorer(URLScorer):
    """Score URLs based on freshness indicators.

@@ -175,9 +182,9 @@ class FreshnessScorer(URLScorer):
    def __init__(self, weight: float = 1.0):
        super().__init__(weight=weight)
        self.date_patterns = [
-            r'/(\d{4})/(\d{2})/(\d{2})/',  # yyyy/mm/dd
-            r'(\d{4})[-_](\d{2})[-_](\d{2})',  # yyyy-mm-dd
-            r'/(\d{4})/',  # year only
+            r"/(\d{4})/(\d{2})/(\d{2})/",  # yyyy/mm/dd
+            r"(\d{4})[-_](\d{2})[-_](\d{2})",  # yyyy-mm-dd
+            r"/(\d{4})/",  # year only
        ]
        self._compile_patterns()

@@ -194,6 +201,7 @@ class FreshnessScorer(URLScorer):
                return 1.0 - (2024 - year) * 0.1
        return 0.5  # Default score for URLs without dates

+
 class DomainAuthorityScorer(URLScorer):
    """Score URLs based on domain authority.

@@ -207,8 +215,12 @@ class DomainAuthorityScorer(URLScorer):
    Configurable domain weights
    Default weight for unknown domains"""

-    def __init__(self, domain_weights: Dict[str, float], 
-                 default_weight: float = 0.5, weight: float = 1.0):
+    def __init__(
+        self,
+        domain_weights: Dict[str, float],
+        default_weight: float = 0.5,
+        weight: float = 1.0,
+    ):
        super().__init__(weight=weight)
        self.domain_weights = domain_weights
        self.default_weight = default_weight
@@ -218,29 +230,23 @@ class DomainAuthorityScorer(URLScorer):
        domain = urlparse(url).netloc.lower()
        return self.domain_weights.get(domain, self.default_weight)

+
 def create_balanced_scorer() -> CompositeScorer:
    """Create a balanced composite scorer"""
-    return CompositeScorer([
-        KeywordRelevanceScorer(
-            keywords=["article", "blog", "news", "research"],
-            weight=1.0
-        ),
-        PathDepthScorer(
-            optimal_depth=3,
-            weight=0.7
-        ),
-        ContentTypeScorer(
-            type_weights={
-                r'\.html?$': 1.0,
-                r'\.pdf$': 0.8,
-                r'\.xml$': 0.6
-            },
-            weight=0.8
-        ),
-        FreshnessScorer(
-            weight=0.9
-        )
-    ])
+    return CompositeScorer(
+        [
+            KeywordRelevanceScorer(
+                keywords=["article", "blog", "news", "research"], weight=1.0
+            ),
+            PathDepthScorer(optimal_depth=3, weight=0.7),
+            ContentTypeScorer(
+                type_weights={r"\.html?$": 1.0, r"\.pdf$": 0.8, r"\.xml$": 0.6},
+                weight=0.8,
+            ),
+            FreshnessScorer(weight=0.9),
+        ]
+    )
+

 # Example Usage:
 """
--- a/crawl4ai/scraper/scraper_strategy.py
+++ b/crawl4ai/scraper/scraper_strategy.py
@@ -4,21 +4,20 @@ from ..models import CrawlResult
 from ..async_webcrawler import AsyncWebCrawler
 from typing import Union, AsyncGenerator

+
 class ScraperStrategy(ABC):
    @abstractmethod
    async def ascrape(
        self,
        url: str,
        crawler: AsyncWebCrawler,
-        parallel_processing: bool = True,
-        stream: bool = False
+        stream: bool = False,
    ) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]:
        """Scrape the given URL using the specified crawler.

        Args:
            url (str): The starting URL for the scrape.
            crawler (AsyncWebCrawler): The web crawler instance.
-            parallel_processing (bool): Whether to use parallel processing. Defaults to True.
            stream (bool): If True, yields individual crawl results as they are ready;
                                if False, accumulates results and returns a final ScraperResult.

--- a/docs/scraper/scraper_quickstart.py
+++ b/docs/scraper/scraper_quickstart.py
@@ -4,13 +4,14 @@ from crawl4ai.scraper import (
    BFSScraperStrategy,
    FilterChain,
    URLPatternFilter,
-    ContentTypeFilter
+    ContentTypeFilter,
 )
 from crawl4ai.async_webcrawler import AsyncWebCrawler, BrowserConfig
 import re

 browser_config = BrowserConfig(headless=True, viewport_width=800, viewport_height=600)

+
 async def basic_scraper_example():
    """
    Basic example: Scrape a blog site for articles
@@ -19,24 +20,25 @@ async def basic_scraper_example():
    - Collects all results at once
    """
    # Create a simple filter chain
-    filter_chain = FilterChain([
-        # Only crawl pages within the blog section
-        URLPatternFilter("*/tutorial/*"),
-        # Only process HTML pages
-        ContentTypeFilter(["text/html"])
-    ])
+    filter_chain = FilterChain(
+        [
+            # Only crawl pages within the blog section
+            URLPatternFilter("*/tutorial/*"),
+            # Only process HTML pages
+            ContentTypeFilter(["text/html"]),
+        ]
+    )

    # Initialize the strategy with basic configuration
    strategy = BFSScraperStrategy(
        max_depth=2,  # Only go 2 levels deep
        filter_chain=filter_chain,
        url_scorer=None,  # Use default scoring
-        max_concurrent=3,  # Limit concurrent requests
-        process_external_links=True
+        process_external_links=True,
    )

    # Create the crawler and scraper
-    async with AsyncWebCrawler(config=browser_config,verbose=True) as crawler:
+    async with AsyncWebCrawler(config=browser_config, verbose=True) as crawler:
        scraper = AsyncWebScraper(crawler, strategy)
        # Start scraping
        try:
@@ -50,6 +52,7 @@ async def basic_scraper_example():
        except Exception as e:
            print(f"Error during scraping: {e}")

+
 # advanced_scraper_example.py
 import logging
 from crawl4ai.scraper import (
@@ -62,10 +65,11 @@ from crawl4ai.scraper import (
    KeywordRelevanceScorer,
    PathDepthScorer,
    FreshnessScorer,
-    CompositeScorer
+    CompositeScorer,
 )
 from crawl4ai.async_webcrawler import AsyncWebCrawler

+
 async def advanced_scraper_example():
    """
    Advanced example: Intelligent news site scraping
@@ -79,49 +83,44 @@ async def advanced_scraper_example():
    logger = logging.getLogger("advanced_scraper")

    # Create sophisticated filter chain
-    filter_chain = FilterChain([
-        # Domain control
-        DomainFilter(
-            allowed_domains=["techcrunch.com"],
-            blocked_domains=["login.techcrunch.com","legal.yahoo.com"]
-        ),
-        # URL patterns
-        URLPatternFilter([
-            "*/article/*",
-            "*/news/*",
-            "*/blog/*",
-            re.compile(r"\d{4}/\d{2}/.*")  # Date-based URLs
-        ]),
-        # Content types
-        ContentTypeFilter([
-            "text/html",
-            "application/xhtml+xml"
-        ])
-    ])
+    filter_chain = FilterChain(
+        [
+            # Domain control
+            DomainFilter(
+                allowed_domains=["techcrunch.com"],
+                blocked_domains=["login.techcrunch.com", "legal.yahoo.com"],
+            ),
+            # URL patterns
+            URLPatternFilter(
+                [
+                    "*/article/*",
+                    "*/news/*",
+                    "*/blog/*",
+                    re.compile(r"\d{4}/\d{2}/.*"),  # Date-based URLs
+                ]
+            ),
+            # Content types
+            ContentTypeFilter(["text/html", "application/xhtml+xml"]),
+        ]
+    )

    # Create composite scorer
-    scorer = CompositeScorer([
-        # Prioritize by keywords
-        KeywordRelevanceScorer(
-            keywords=["news", "breaking", "update", "latest"],
-            weight=1.0
-        ),
-        # Prefer optimal URL structure
-        PathDepthScorer(
-            optimal_depth=3,
-            weight=0.7
-        ),
-        # Prioritize fresh content
-        FreshnessScorer(weight=0.9)
-    ])
+    scorer = CompositeScorer(
+        [
+            # Prioritize by keywords
+            KeywordRelevanceScorer(
+                keywords=["news", "breaking", "update", "latest"], weight=1.0
+            ),
+            # Prefer optimal URL structure
+            PathDepthScorer(optimal_depth=3, weight=0.7),
+            # Prioritize fresh content
+            FreshnessScorer(weight=0.9),
+        ]
+    )

    # Initialize strategy with advanced configuration
    strategy = BFSScraperStrategy(
-        max_depth=2,
-        filter_chain=filter_chain,
-        url_scorer=scorer,
-        max_concurrent=2,
-        min_crawl_delay=1
+        max_depth=2, filter_chain=filter_chain, url_scorer=scorer
    )

    # Create crawler and scraper
@@ -129,27 +128,27 @@ async def advanced_scraper_example():
        scraper = AsyncWebScraper(crawler, strategy)

        # Track statistics
-        stats = {
-            'processed': 0,
-            'errors': 0,
-            'total_size': 0
-        }
+        stats = {"processed": 0, "errors": 0, "total_size": 0}

        try:
            # Use streaming mode
-            result_generator = await scraper.ascrape("https://techcrunch.com", parallel_processing=True, stream=True)
+            result_generator = await scraper.ascrape(
+                "https://techcrunch.com", stream=True
+            )
            async for result in result_generator:
-                stats['processed'] += 1
+                stats["processed"] += 1

                if result.success:
-                    stats['total_size'] += len(result.html)
+                    stats["total_size"] += len(result.html)
                    logger.info(f"Processed: {result.url}")
                else:
-                    stats['errors'] += 1
-                    logger.error(f"Failed to process {result.url}: {result.error_message}")
+                    stats["errors"] += 1
+                    logger.error(
+                        f"Failed to process {result.url}: {result.error_message}"
+                    )

                # Log progress regularly
-                if stats['processed'] % 10 == 0:
+                if stats["processed"] % 10 == 0:
                    logger.info(f"Progress: {stats['processed']} URLs processed")

        except Exception as e:
@@ -171,7 +170,10 @@ async def advanced_scraper_example():
            # Print scorer statistics
            logger.info("Scoring statistics:")
            logger.info(f"- Average score: {scorer.stats.average_score:.2f}")
-            logger.info(f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}")
+            logger.info(
+                f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}"
+            )
+

 if __name__ == "__main__":
    import asyncio
@@ -181,5 +183,5 @@ if __name__ == "__main__":
    asyncio.run(basic_scraper_example())

    # Run advanced example
-    print("\nRunning advanced scraper example...")
-    asyncio.run(advanced_scraper_example())
+    # print("\nRunning advanced scraper example...")
+    # asyncio.run(advanced_scraper_example())