Refactor: Removed all scheduling logic from the scraper. From now on, the scraper expects arun_many to handle all scheduling. The scraper will only do traversal, validation, compliance checks, URL filtering, scoring, etc. Reformatted some of the scraper files with the Black code formatter.

2025-01-21 17:49:51 +05:30
parent 26d78d8512
commit 67fa06c09b
9 changed files with 316 additions and 243 deletions
--- a/crawl4ai/config.py
+++ b/crawl4ai/config.py
@@ -84,3 +84,4 @@ SHOW_DEPRECATION_WARNINGS = True
 SCREENSHOT_HEIGHT_TRESHOLD = 10000
 PAGE_TIMEOUT = 60000
 DOWNLOAD_PAGE_TIMEOUT = 60000
 SCRAPER_BATCH_SIZE = 5
--- a/crawl4ai/scraper/init.py
+++ b/crawl4ai/scraper/init.py
@@ -1,5 +1,16 @@
 from .async_web_scraper import AsyncWebScraper
 from .bfs_scraper_strategy import BFSScraperStrategy
-from .filters import URLFilter, FilterChain, URLPatternFilter, ContentTypeFilter, DomainFilter
+from .filters import (
-from .scorers import KeywordRelevanceScorer, PathDepthScorer, FreshnessScorer, CompositeScorer
+    URLFilter,
    FilterChain,
    URLPatternFilter,
    ContentTypeFilter,
    DomainFilter,
 )
 from .scorers import (
    KeywordRelevanceScorer,
    PathDepthScorer,
    FreshnessScorer,
    CompositeScorer,
 )
 from .scraper_strategy import ScraperStrategy
--- a/crawl4ai/scraper/async_web_scraper.py
+++ b/crawl4ai/scraper/async_web_scraper.py
@@ -6,13 +6,16 @@ import logging
 from dataclasses import dataclass
 from contextlib import asynccontextmanager
@dataclass
 class ScrapingProgress:
    """Tracks the progress of a scraping operation."""
    processed_urls: int = 0
    failed_urls: int = 0
    current_url: Optional[str] = None
 class AsyncWebScraper:
    """
    A high-level web scraper that combines an async crawler with a scraping strategy.
@@ -27,7 +30,7 @@ class AsyncWebScraper:
        self,
        crawler: AsyncWebCrawler,
        strategy: ScraperStrategy,
-        logger: Optional[logging.Logger] = None
+        logger: Optional[logging.Logger] = None,
    ):
        if not isinstance(crawler, AsyncWebCrawler):
            raise TypeError("crawler must be an instance of AsyncWebCrawler")
@@ -55,9 +58,7 @@ class AsyncWebScraper:
            raise
    async def ascrape(
-        self, 
+        self, url: str, stream: bool = False
        url: str, 
        stream: bool = False
    ) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]:
        """
        Scrape a website starting from the given URL.
@@ -110,9 +111,9 @@ class AsyncWebScraper:
                crawled_urls=list(extracted_data.keys()),
                extracted_data=extracted_data,
                stats={
-                    'processed_urls': self._progress.processed_urls,
+                    "processed_urls": self._progress.processed_urls,
-                    'failed_urls': self._progress.failed_urls
+                    "failed_urls": self._progress.failed_urls,
-                }
+                },
            )
        except Exception as e:
            self.logger.error(f"Error in collecting scrape: {str(e)}")
--- a/crawl4ai/scraper/bfs_scraper_strategy.py
+++ b/crawl4ai/scraper/bfs_scraper_strategy.py
@@ -7,16 +7,19 @@ from urllib.parse import urlparse
 from urllib.robotparser import RobotFileParser
 import validators
-from crawl4ai.async_configs import CrawlerRunConfig
+from ..async_configs import CrawlerRunConfig
 from .models import CrawlResult
 from .filters import FilterChain
 from .scorers import URLScorer
 from ..async_webcrawler import AsyncWebCrawler
 from .scraper_strategy import ScraperStrategy
 from ..config import SCRAPER_BATCH_SIZE
@dataclass
 class CrawlStats:
    """Statistics for the crawling process"""
    start_time: datetime
    urls_processed: int = 0
    urls_failed: int = 0
@@ -25,6 +28,7 @@ class CrawlStats:
    current_depth: int = 0
    robots_blocked: int = 0
 class BFSScraperStrategy(ScraperStrategy):
    """Breadth-First Search scraping strategy with politeness controls"""
@@ -34,7 +38,7 @@ class BFSScraperStrategy(ScraperStrategy):
        filter_chain: FilterChain,
        url_scorer: URLScorer,
        process_external_links: bool = False,
-        logger: Optional[logging.Logger] = None
+        logger: Optional[logging.Logger] = None,
    ):
        self.max_depth = max_depth
        self.filter_chain = filter_chain
@@ -74,11 +78,11 @@ class BFSScraperStrategy(ScraperStrategy):
    async def _get_robot_parser(self, url: str) -> Optional[RobotFileParser]:
        """Get or create robots.txt parser for domain.
-            This is our robots.txt manager that:
+        This is our robots.txt manager that:
-                - Uses domain-level caching of robot parsers
+            - Uses domain-level caching of robot parsers
-                - Creates and caches new parsers as needed
+            - Creates and caches new parsers as needed
-                - Handles failed robots.txt fetches gracefully
+            - Handles failed robots.txt fetches gracefully
-                - Returns None if robots.txt can't be fetched, allowing crawling to proceed        
+            - Returns None if robots.txt can't be fetched, allowing crawling to proceed
        """
        domain = urlparse(url).netloc
        if domain not in self.robot_parsers:
@@ -100,7 +104,7 @@ class BFSScraperStrategy(ScraperStrategy):
        depth: int,
        queue: asyncio.PriorityQueue,
        visited: Set[str],
-        depths: Dict[str, int]
+        depths: Dict[str, int],
    ):
        """Process extracted links from crawl result.
        This is our link processor that:
@@ -116,7 +120,7 @@ class BFSScraperStrategy(ScraperStrategy):
        if self.process_external_links:
            links_to_process += result.links["external"]
        for link in links_to_process:
-            url = link['href']
+            url = link["href"]
            if not await self.can_process_url(url, depth):
                self.stats.urls_skipped += 1
                continue
@@ -132,8 +136,7 @@ class BFSScraperStrategy(ScraperStrategy):
                    await queue.put((score, new_depth, url))
                    depths[url] = new_depth
                    self.stats.total_depth_reached = max(
-                        self.stats.total_depth_reached, 
+                        self.stats.total_depth_reached, new_depth
                        new_depth
                    )
    async def ascrape(
@@ -151,53 +154,72 @@ class BFSScraperStrategy(ScraperStrategy):
            URL: The actual URL to crawl
        visited: Keeps track of URLs we've already seen to avoid cycles
        depths: Maps URLs to their depths from the start URL
-        pending_tasks: Tracks currently running crawl tasks        
+        active_crawls: Tracks currently running crawl tasks        
        """
        queue = asyncio.PriorityQueue()
        await queue.put((0, 0, start_url))
        visited: Set[str] = set()
        depths = {start_url: 0}
-        
+        active_crawls = set()  # Track URLs currently being processed
        try:
-            while not queue.empty() and not self._cancel_event.is_set():
+            while (
                not queue.empty() or active_crawls
            ) and not self._cancel_event.is_set():
                """
                This sets up our main control loop which:
                    - Continues while there are URLs to process (not queue.empty())
-                    - Or while there are tasks still running (pending_tasks)
+                    - Or while there are active crawls still running (arun_many)
                    - Can be interrupted via cancellation (not self._cancel_event.is_set())
                """
-                n = 3
+                # Collect batch of jobs to process
                jobs = []
-                for _ in range(n):
+                # Fill batch with available jobs
-                    if self.queue.empty():
+                while len(jobs) < SCRAPER_BATCH_SIZE and not queue.empty():
-                        break
+                    score, depth, url = await queue.get()
-                    jobs.append(await self.queue.get())
+                    if url not in active_crawls:  # Only add if not currently processing
                        jobs.append((score, depth, url))
                        active_crawls.add(url)
                        self.stats.current_depth = depth
-                # Filter jobs directly, ensuring uniqueness and checking against visited
+                if not jobs:
-                filtered_jobs = []
+                    # If no jobs but active crawls exist, wait a bit and continue
-                for job in jobs:
+                    if active_crawls:
-                    _, depth, url = job
+                        await asyncio.sleep(0.1)
-                    self.stats.current_depth = depth
+                    continue
                    if url not in visited:
                        visited.add(url)
                        filtered_jobs.append(job)
-                crawler_config = CrawlerRunConfig(cache_mode="BYPASS")
+                # Process batch
-                async for result in await crawler.arun_many(urls=[url for _, _, url in filtered_jobs],
+                crawler_config = CrawlerRunConfig(cache_mode="BYPASS", stream=True)
-                                                            config=crawler_config.clone(stream=True)):
+                try:
-                    print(f"Received result for: {result.url} - Success: {result.success}")
+                    async for result in await crawler.arun_many(
-                    source_url, depth = next((url, depth) for _, depth, url in filtered_jobs if url == result.source_url)
+                        urls=[url for _, _, url in jobs], config=crawler_config
-                    await self._process_links(result, source_url, depth, queue, visited, depths)
+                    ):
-                    yield result
+                        source_url, depth = next(
                            (url, depth) for _, depth, url in jobs if url == result.url
                        )
                        active_crawls.remove(source_url)  # Remove from active set
                        if result.success:
                            await self._process_links(
                                result, source_url, depth, queue, visited, depths
                            )
                            yield result
                        else:
                            self.logger.warning(
                                f"Failed to crawl {result.url}: {result.error_message}"
                            )
                except Exception as e:
                    # Remove failed URLs from active set
                    for _, _, url in jobs:
                        active_crawls.discard(url)
                    self.logger.error(f"Batch processing error: {e}")
                    # Continue processing other batches
                    continue
        except Exception as e:
            self.logger.error(f"Error in crawl process: {e}")
            raise
        finally:
            # Clean up any remaining tasks
            # for task in pending_tasks:
            #     task.cancel()
            self.stats.end_time = datetime.now()
    async def shutdown(self):
--- a/crawl4ai/scraper/filters.py
+++ b/crawl4ai/scraper/filters.py
@@ -11,13 +11,16 @@ import logging
 from dataclasses import dataclass
 import fnmatch
@dataclass
 class FilterStats:
    """Statistics for filter applications"""
    total_urls: int = 0
    rejected_urls: int = 0
    passed_urls: int = 0
 class URLFilter(ABC):
    """Base class for URL filters"""
@@ -39,6 +42,7 @@ class URLFilter(ABC):
        else:
            self.stats.rejected_urls += 1
 class FilterChain:
    """Chain of URL filters."""
@@ -47,7 +51,7 @@ class FilterChain:
        self.stats = FilterStats()
        self.logger = logging.getLogger("urlfilter.chain")
-    def add_filter(self, filter_: URLFilter) -> 'FilterChain':
+    def add_filter(self, filter_: URLFilter) -> "FilterChain":
        """Add a filter to the chain"""
        self.filters.append(filter_)
        return self  # Enable method chaining
@@ -65,6 +69,7 @@ class FilterChain:
        self.stats.passed_urls += 1
        return True
 class URLPatternFilter(URLFilter):
    """Filter URLs based on glob patterns or regex.
@@ -79,8 +84,11 @@ class URLPatternFilter(URLFilter):
    - Pattern pre-compilation for performance
    """
-    def __init__(self, patterns: Union[str, Pattern, List[Union[str, Pattern]]], 
+    def __init__(
-                 use_glob: bool = True):
+        self,
        patterns: Union[str, Pattern, List[Union[str, Pattern]]],
        use_glob: bool = True,
    ):
        super().__init__()
        self.patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns
        self.use_glob = use_glob
@@ -90,7 +98,9 @@ class URLPatternFilter(URLFilter):
            if isinstance(pattern, str) and use_glob:
                self._compiled_patterns.append(self._glob_to_regex(pattern))
            else:
-                self._compiled_patterns.append(re.compile(pattern) if isinstance(pattern, str) else pattern)
+                self._compiled_patterns.append(
                    re.compile(pattern) if isinstance(pattern, str) else pattern
                )
    def _glob_to_regex(self, pattern: str) -> Pattern:
        """Convert glob pattern to regex"""
@@ -102,6 +112,7 @@ class URLPatternFilter(URLFilter):
        self._update_stats(matches)
        return matches
 class ContentTypeFilter(URLFilter):
    """Filter URLs based on expected content type.
@@ -115,10 +126,13 @@ class ContentTypeFilter(URLFilter):
    - Support for multiple content types
    """
-    def __init__(self, allowed_types: Union[str, List[str]], 
+    def __init__(
-                 check_extension: bool = True):
+        self, allowed_types: Union[str, List[str]], check_extension: bool = True
    ):
        super().__init__()
-        self.allowed_types = [allowed_types] if isinstance(allowed_types, str) else allowed_types
+        self.allowed_types = (
            [allowed_types] if isinstance(allowed_types, str) else allowed_types
        )
        self.check_extension = check_extension
        self._normalize_types()
@@ -128,12 +142,18 @@ class ContentTypeFilter(URLFilter):
    def _check_extension(self, url: str) -> bool:
        """Check URL's file extension"""
-        ext = urlparse(url).path.split('.')[-1].lower() if '.' in urlparse(url).path else ''
+        ext = (
            urlparse(url).path.split(".")[-1].lower()
            if "." in urlparse(url).path
            else ""
        )
        if not ext:
            return True  # No extension, might be dynamic content
        guessed_type = mimetypes.guess_type(url)[0]
-        return any(allowed in (guessed_type or '').lower() for allowed in self.allowed_types)
+        return any(
            allowed in (guessed_type or "").lower() for allowed in self.allowed_types
        )
    def apply(self, url: str) -> bool:
        """Check if URL's content type is allowed"""
@@ -143,6 +163,7 @@ class ContentTypeFilter(URLFilter):
        self._update_stats(result)
        return result
 class DomainFilter(URLFilter):
    """Filter URLs based on allowed/blocked domains.
@@ -156,11 +177,18 @@ class DomainFilter(URLFilter):
    - Efficient domain matching
    """
-    def __init__(self, allowed_domains: Union[str, List[str]] = None, 
+    def __init__(
-                 blocked_domains: Union[str, List[str]] = None):
+        self,
        allowed_domains: Union[str, List[str]] = None,
        blocked_domains: Union[str, List[str]] = None,
    ):
        super().__init__()
-        self.allowed_domains = set(self._normalize_domains(allowed_domains)) if allowed_domains else None
+        self.allowed_domains = (
-        self.blocked_domains = set(self._normalize_domains(blocked_domains)) if blocked_domains else set()
+            set(self._normalize_domains(allowed_domains)) if allowed_domains else None
        )
        self.blocked_domains = (
            set(self._normalize_domains(blocked_domains)) if blocked_domains else set()
        )
    def _normalize_domains(self, domains: Union[str, List[str]]) -> List[str]:
        """Normalize domain strings"""
@@ -187,19 +215,21 @@ class DomainFilter(URLFilter):
        self._update_stats(True)
        return True
 # Example usage:
 def create_common_filter_chain() -> FilterChain:
    """Create a commonly used filter chain"""
-    return FilterChain([
+    return FilterChain(
-        URLPatternFilter([
+        [
-            "*.html", "*.htm",  # HTML files
+            URLPatternFilter(
-            "*/article/*", "*/blog/*"  # Common content paths
+                [
-        ]),
+                    "*.html",
-        ContentTypeFilter([
+                    "*.htm",  # HTML files
-            "text/html",
+                    "*/article/*",
-            "application/xhtml+xml"
+                    "*/blog/*",  # Common content paths
-        ]),
+                ]
-        DomainFilter(
+            ),
-            blocked_domains=["ads.*", "analytics.*"]
+            ContentTypeFilter(["text/html", "application/xhtml+xml"]),
-        )
+            DomainFilter(blocked_domains=["ads.*", "analytics.*"]),
-    ])
+        ]
    )
--- a/crawl4ai/scraper/models.py
+++ b/crawl4ai/scraper/models.py
@@ -2,7 +2,8 @@ from pydantic import BaseModel
 from typing import List, Dict
 from ..models import CrawlResult
 class ScraperResult(BaseModel):
    url: str
    crawled_urls: List[str]
-    extracted_data: Dict[str,CrawlResult]
+    extracted_data: Dict[str, CrawlResult]
--- a/crawl4ai/scraper/scorers.py
+++ b/crawl4ai/scraper/scorers.py
@@ -10,13 +10,15 @@ from collections import defaultdict
 import math
 import logging
@dataclass
 class ScoringStats:
    """Statistics for URL scoring"""
    urls_scored: int = 0
    total_score: float = 0.0
-    min_score: float = float('inf')
+    min_score: float = float("inf")
-    max_score: float = float('-inf')
+    max_score: float = float("-inf")
    def update(self, score: float):
        """Update scoring statistics"""
@@ -30,6 +32,7 @@ class ScoringStats:
        """Calculate average score"""
        return self.total_score / self.urls_scored if self.urls_scored > 0 else 0.0
 class URLScorer(ABC):
    """Base class for URL scoring strategies"""
@@ -51,6 +54,7 @@ class URLScorer(ABC):
        self.stats.update(weighted_score)
        return weighted_score
 class CompositeScorer(URLScorer):
    """Combines multiple scorers with weights"""
@@ -68,6 +72,7 @@ class CompositeScorer(URLScorer):
        return total_score
 class KeywordRelevanceScorer(URLScorer):
    """Score URLs based on keyword relevance.
@@ -82,8 +87,9 @@ class KeywordRelevanceScorer(URLScorer):
    - Weighted scoring
    """
-    def __init__(self, keywords: List[str], weight: float = 1.0,
+    def __init__(
-                 case_sensitive: bool = False):
+        self, keywords: List[str], weight: float = 1.0, case_sensitive: bool = False
    ):
        super().__init__(weight=weight)
        self.keywords = keywords
        self.case_sensitive = case_sensitive
@@ -98,12 +104,12 @@ class KeywordRelevanceScorer(URLScorer):
        """Calculate score based on keyword matches"""
        decoded_url = unquote(url)
        total_matches = sum(
-            1 for pattern in self.patterns
+            1 for pattern in self.patterns if pattern.search(decoded_url)
            if pattern.search(decoded_url)
        )
        # Normalize score between 0 and 1
        return total_matches / len(self.patterns) if self.patterns else 0.0
 class PathDepthScorer(URLScorer):
    """Score URLs based on their path depth.
@@ -124,12 +130,13 @@ class PathDepthScorer(URLScorer):
    def _calculate_score(self, url: str) -> float:
        """Calculate score based on path depth"""
        path = urlparse(url).path
-        depth = len([x for x in path.split('/') if x])
+        depth = len([x for x in path.split("/") if x])
        # Score decreases as we move away from optimal depth
        distance_from_optimal = abs(depth - self.optimal_depth)
        return 1.0 / (1.0 + distance_from_optimal)
 class ContentTypeScorer(URLScorer):
    """Score URLs based on content type preferences.
@@ -152,8 +159,7 @@ class ContentTypeScorer(URLScorer):
    def _compile_patterns(self):
        """Prepare content type patterns"""
        self.patterns = {
-            re.compile(pattern): weight
+            re.compile(pattern): weight for pattern, weight in self.type_weights.items()
            for pattern, weight in self.type_weights.items()
        }
    def _calculate_score(self, url: str) -> float:
@@ -163,6 +169,7 @@ class ContentTypeScorer(URLScorer):
                return weight
        return 0.0
 class FreshnessScorer(URLScorer):
    """Score URLs based on freshness indicators.
@@ -175,9 +182,9 @@ class FreshnessScorer(URLScorer):
    def __init__(self, weight: float = 1.0):
        super().__init__(weight=weight)
        self.date_patterns = [
-            r'/(\d{4})/(\d{2})/(\d{2})/',  # yyyy/mm/dd
+            r"/(\d{4})/(\d{2})/(\d{2})/",  # yyyy/mm/dd
-            r'(\d{4})[-_](\d{2})[-_](\d{2})',  # yyyy-mm-dd
+            r"(\d{4})[-_](\d{2})[-_](\d{2})",  # yyyy-mm-dd
-            r'/(\d{4})/',  # year only
+            r"/(\d{4})/",  # year only
        ]
        self._compile_patterns()
@@ -194,6 +201,7 @@ class FreshnessScorer(URLScorer):
                return 1.0 - (2024 - year) * 0.1
        return 0.5  # Default score for URLs without dates
 class DomainAuthorityScorer(URLScorer):
    """Score URLs based on domain authority.
@@ -207,8 +215,12 @@ class DomainAuthorityScorer(URLScorer):
    Configurable domain weights
    Default weight for unknown domains"""
-    def __init__(self, domain_weights: Dict[str, float], 
+    def __init__(
-                 default_weight: float = 0.5, weight: float = 1.0):
+        self,
        domain_weights: Dict[str, float],
        default_weight: float = 0.5,
        weight: float = 1.0,
    ):
        super().__init__(weight=weight)
        self.domain_weights = domain_weights
        self.default_weight = default_weight
@@ -218,29 +230,23 @@ class DomainAuthorityScorer(URLScorer):
        domain = urlparse(url).netloc.lower()
        return self.domain_weights.get(domain, self.default_weight)
 def create_balanced_scorer() -> CompositeScorer:
    """Create a balanced composite scorer"""
-    return CompositeScorer([
+    return CompositeScorer(
-        KeywordRelevanceScorer(
+        [
-            keywords=["article", "blog", "news", "research"],
+            KeywordRelevanceScorer(
-            weight=1.0
+                keywords=["article", "blog", "news", "research"], weight=1.0
-        ),
+            ),
-        PathDepthScorer(
+            PathDepthScorer(optimal_depth=3, weight=0.7),
-            optimal_depth=3,
+            ContentTypeScorer(
-            weight=0.7
+                type_weights={r"\.html?$": 1.0, r"\.pdf$": 0.8, r"\.xml$": 0.6},
-        ),
+                weight=0.8,
-        ContentTypeScorer(
+            ),
-            type_weights={
+            FreshnessScorer(weight=0.9),
-                r'\.html?$': 1.0,
+        ]
-                r'\.pdf$': 0.8,
+    )
-                r'\.xml$': 0.6
+
            },
            weight=0.8
        ),
        FreshnessScorer(
            weight=0.9
        )
    ])
 # Example Usage:
 """
--- a/crawl4ai/scraper/scraper_strategy.py
+++ b/crawl4ai/scraper/scraper_strategy.py
@@ -4,21 +4,20 @@ from ..models import CrawlResult
 from ..async_webcrawler import AsyncWebCrawler
 from typing import Union, AsyncGenerator
 class ScraperStrategy(ABC):
    @abstractmethod
    async def ascrape(
        self,
        url: str,
        crawler: AsyncWebCrawler,
-        parallel_processing: bool = True,
+        stream: bool = False,
        stream: bool = False
    ) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]:
        """Scrape the given URL using the specified crawler.
        Args:
            url (str): The starting URL for the scrape.
            crawler (AsyncWebCrawler): The web crawler instance.
            parallel_processing (bool): Whether to use parallel processing. Defaults to True.
            stream (bool): If True, yields individual crawl results as they are ready;
                                if False, accumulates results and returns a final ScraperResult.
--- a/docs/scraper/scraper_quickstart.py
+++ b/docs/scraper/scraper_quickstart.py
@@ -4,13 +4,14 @@ from crawl4ai.scraper import (
    BFSScraperStrategy,
    FilterChain,
    URLPatternFilter,
-    ContentTypeFilter
+    ContentTypeFilter,
 )
 from crawl4ai.async_webcrawler import AsyncWebCrawler, BrowserConfig
 import re
 browser_config = BrowserConfig(headless=True, viewport_width=800, viewport_height=600)
 async def basic_scraper_example():
    """
    Basic example: Scrape a blog site for articles
@@ -19,24 +20,25 @@ async def basic_scraper_example():
    - Collects all results at once
    """
    # Create a simple filter chain
-    filter_chain = FilterChain([
+    filter_chain = FilterChain(
-        # Only crawl pages within the blog section
+        [
-        URLPatternFilter("*/tutorial/*"),
+            # Only crawl pages within the blog section
-        # Only process HTML pages
+            URLPatternFilter("*/tutorial/*"),
-        ContentTypeFilter(["text/html"])
+            # Only process HTML pages
-    ])
+            ContentTypeFilter(["text/html"]),
        ]
    )
    # Initialize the strategy with basic configuration
    strategy = BFSScraperStrategy(
        max_depth=2,  # Only go 2 levels deep
        filter_chain=filter_chain,
        url_scorer=None,  # Use default scoring
-        max_concurrent=3,  # Limit concurrent requests
+        process_external_links=True,
        process_external_links=True
    )
    # Create the crawler and scraper
-    async with AsyncWebCrawler(config=browser_config,verbose=True) as crawler:
+    async with AsyncWebCrawler(config=browser_config, verbose=True) as crawler:
        scraper = AsyncWebScraper(crawler, strategy)
        # Start scraping
        try:
@@ -50,6 +52,7 @@ async def basic_scraper_example():
        except Exception as e:
            print(f"Error during scraping: {e}")
 # advanced_scraper_example.py
 import logging
 from crawl4ai.scraper import (
@@ -62,10 +65,11 @@ from crawl4ai.scraper import (
    KeywordRelevanceScorer,
    PathDepthScorer,
    FreshnessScorer,
-    CompositeScorer
+    CompositeScorer,
 )
 from crawl4ai.async_webcrawler import AsyncWebCrawler
 async def advanced_scraper_example():
    """
    Advanced example: Intelligent news site scraping
@@ -79,49 +83,44 @@ async def advanced_scraper_example():
    logger = logging.getLogger("advanced_scraper")
    # Create sophisticated filter chain
-    filter_chain = FilterChain([
+    filter_chain = FilterChain(
-        # Domain control
+        [
-        DomainFilter(
+            # Domain control
-            allowed_domains=["techcrunch.com"],
+            DomainFilter(
-            blocked_domains=["login.techcrunch.com","legal.yahoo.com"]
+                allowed_domains=["techcrunch.com"],
-        ),
+                blocked_domains=["login.techcrunch.com", "legal.yahoo.com"],
-        # URL patterns
+            ),
-        URLPatternFilter([
+            # URL patterns
-            "*/article/*",
+            URLPatternFilter(
-            "*/news/*",
+                [
-            "*/blog/*",
+                    "*/article/*",
-            re.compile(r"\d{4}/\d{2}/.*")  # Date-based URLs
+                    "*/news/*",
-        ]),
+                    "*/blog/*",
-        # Content types
+                    re.compile(r"\d{4}/\d{2}/.*"),  # Date-based URLs
-        ContentTypeFilter([
+                ]
-            "text/html",
+            ),
-            "application/xhtml+xml"
+            # Content types
-        ])
+            ContentTypeFilter(["text/html", "application/xhtml+xml"]),
-    ])
+        ]
    )
    # Create composite scorer
-    scorer = CompositeScorer([
+    scorer = CompositeScorer(
-        # Prioritize by keywords
+        [
-        KeywordRelevanceScorer(
+            # Prioritize by keywords
-            keywords=["news", "breaking", "update", "latest"],
+            KeywordRelevanceScorer(
-            weight=1.0
+                keywords=["news", "breaking", "update", "latest"], weight=1.0
-        ),
+            ),
-        # Prefer optimal URL structure
+            # Prefer optimal URL structure
-        PathDepthScorer(
+            PathDepthScorer(optimal_depth=3, weight=0.7),
-            optimal_depth=3,
+            # Prioritize fresh content
-            weight=0.7
+            FreshnessScorer(weight=0.9),
-        ),
+        ]
-        # Prioritize fresh content
+    )
        FreshnessScorer(weight=0.9)
    ])
    # Initialize strategy with advanced configuration
    strategy = BFSScraperStrategy(
-        max_depth=2,
+        max_depth=2, filter_chain=filter_chain, url_scorer=scorer
        filter_chain=filter_chain,
        url_scorer=scorer,
        max_concurrent=2,
        min_crawl_delay=1
    )
    # Create crawler and scraper
@@ -129,27 +128,27 @@ async def advanced_scraper_example():
        scraper = AsyncWebScraper(crawler, strategy)
        # Track statistics
-        stats = {
+        stats = {"processed": 0, "errors": 0, "total_size": 0}
            'processed': 0,
            'errors': 0,
            'total_size': 0
        }
        try:
            # Use streaming mode
-            result_generator = await scraper.ascrape("https://techcrunch.com", parallel_processing=True, stream=True)
+            result_generator = await scraper.ascrape(
                "https://techcrunch.com", stream=True
            )
            async for result in result_generator:
-                stats['processed'] += 1
+                stats["processed"] += 1
                if result.success:
-                    stats['total_size'] += len(result.html)
+                    stats["total_size"] += len(result.html)
                    logger.info(f"Processed: {result.url}")
                else:
-                    stats['errors'] += 1
+                    stats["errors"] += 1
-                    logger.error(f"Failed to process {result.url}: {result.error_message}")
+                    logger.error(
                        f"Failed to process {result.url}: {result.error_message}"
                    )
                # Log progress regularly
-                if stats['processed'] % 10 == 0:
+                if stats["processed"] % 10 == 0:
                    logger.info(f"Progress: {stats['processed']} URLs processed")
        except Exception as e:
@@ -171,7 +170,10 @@ async def advanced_scraper_example():
            # Print scorer statistics
            logger.info("Scoring statistics:")
            logger.info(f"- Average score: {scorer.stats.average_score:.2f}")
-            logger.info(f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}")
+            logger.info(
                f"- Score range: {scorer.stats.min_score:.2f} - {scorer.stats.max_score:.2f}"
            )
 if __name__ == "__main__":
    import asyncio
@@ -181,5 +183,5 @@ if __name__ == "__main__":
    asyncio.run(basic_scraper_example())
    # Run advanced example
-    print("\nRunning advanced scraper example...")
+    # print("\nRunning advanced scraper example...")
-    asyncio.run(advanced_scraper_example())
+    # asyncio.run(advanced_scraper_example())