Revieweing the BFS strategy.

2024-11-07 18:54:53 +08:00
parent be472c624c
commit 3d1c9a8434
2 changed files with 368 additions and 111 deletions
--- a/crawl4ai/scraper/bfs_scraper_strategy
+++ b/crawl4ai/scraper/bfs_scraper_strategy
@@ -0,0 +1,138 @@
 from .scraper_strategy import ScraperStrategy
 from .filters import FilterChain
 from .scorers import URLScorer
 from ..models import CrawlResult
 from ..async_webcrawler import AsyncWebCrawler
 import asyncio
 import validators
 from urllib.parse import urljoin,urlparse,urlunparse
 from urllib.robotparser import RobotFileParser
 import time
 from aiolimiter import AsyncLimiter
 from tenacity import retry, stop_after_attempt, wait_exponential
 from collections import defaultdict
 import logging
 from typing import Dict, AsyncGenerator
 logging.basicConfig(level=logging.DEBUG)
 rate_limiter = AsyncLimiter(1, 1)  # 1 request per second
 class BFSScraperStrategy(ScraperStrategy):
    def __init__(self, max_depth: int, filter_chain: FilterChain, url_scorer: URLScorer, max_concurrent: int = 5, min_crawl_delay: int=1):
        self.max_depth = max_depth
        self.filter_chain = filter_chain
        self.url_scorer = url_scorer
        self.max_concurrent = max_concurrent
        # For Crawl Politeness
        self.last_crawl_time = defaultdict(float)
        self.min_crawl_delay = min_crawl_delay  # 1 second delay between requests to the same domain
        # For Robots.txt Compliance
        self.robot_parsers = {}
    # Robots.txt Parser
    def get_robot_parser(self, url: str) -> RobotFileParser:
        domain = urlparse(url)
        scheme = domain.scheme if domain.scheme else 'http'  # Default to 'http' if no scheme provided
        netloc = domain.netloc
        if netloc not in self.robot_parsers:
            rp = RobotFileParser()
            rp.set_url(f"{scheme}://{netloc}/robots.txt")
            try:
                rp.read()
            except Exception as e:
                # Log the type of error, message, and the URL
                logging.warning(f"Error {type(e).__name__} occurred while fetching robots.txt for {netloc}: {e}")
                return None
            self.robot_parsers[netloc] = rp
        return self.robot_parsers[netloc]
    # Retry with exponential backoff
    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
    async def retry_crawl(self, crawler: AsyncWebCrawler, url: str) -> CrawlResult:
        return await crawler.arun(url)
    async def process_url(self, url: str, depth: int, crawler: AsyncWebCrawler, queue: asyncio.PriorityQueue, visited: set, depths: Dict[str, int]) -> AsyncGenerator[CrawlResult, None]:
        def normalize_url(url: str) -> str:
            parsed = urlparse(url)
            return urlunparse(parsed._replace(fragment=""))
        # URL Validation
        if not validators.url(url):
            logging.warning(f"Invalid URL: {url}")
            return None
        # Robots.txt Compliance
        robot_parser = self.get_robot_parser(url)
        if robot_parser is None:
            logging.info(f"Could not retrieve robots.txt for {url}, hence proceeding with crawl.")
        else:
            # If robots.txt was fetched, check if crawling is allowed
            if not robot_parser.can_fetch(crawler.crawler_strategy.user_agent, url):
                logging.info(f"Skipping {url} as per robots.txt")
                return None
        # Crawl Politeness
        domain = urlparse(url).netloc
        time_since_last_crawl = time.time() - self.last_crawl_time[domain]
        if time_since_last_crawl < self.min_crawl_delay:
            await asyncio.sleep(self.min_crawl_delay - time_since_last_crawl)
        self.last_crawl_time[domain] = time.time()
        # Rate Limiting
        async with rate_limiter:
            # Error Handling
            try:
                crawl_result = await self.retry_crawl(crawler, url)
            except Exception as e:
                logging.error(f"Error crawling {url}: {str(e)}")
                crawl_result = CrawlResult(url=url, html="", success=False, status_code=0, error_message=str(e))
        if not crawl_result.success:
            # Logging and Monitoring
            logging.error(f"Failed to crawl URL: {url}. Error: {crawl_result.error_message}")
            return crawl_result
        # Process links
        for link_type in ["internal", "external"]:
            for link in crawl_result.links[link_type]:
                absolute_link = urljoin(url, link['href'])
                normalized_link = normalize_url(absolute_link)
                if self.filter_chain.apply(normalized_link) and normalized_link not in visited:
                    new_depth = depths[url] + 1
                    if new_depth <= self.max_depth:
                        # URL Scoring
                        score = self.url_scorer.score(normalized_link)
                        await queue.put((score, new_depth, normalized_link))
                        depths[normalized_link] = new_depth
        return crawl_result
    async def ascrape(self, start_url: str, crawler: AsyncWebCrawler, parallel_processing:bool = True) -> AsyncGenerator[CrawlResult,None]:
        queue = asyncio.PriorityQueue()
        queue.put_nowait((0, 0, start_url))
        visited = set()
        depths = {start_url: 0}
        pending_tasks = set()
        while not queue.empty() or pending_tasks:
            while not queue.empty() and len(pending_tasks) < self.max_concurrent:
                _, depth, url = await queue.get()
                if url not in visited:
                    # Adding URL to the visited set here itself, (instead of after result generation)
                    # so that other tasks are not queued for same URL, found at different depth before
                    # crawling and extraction of this task is completed.
                    visited.add(url)
                    if parallel_processing:
                        task = asyncio.create_task(self.process_url(url, depth, crawler, queue, visited, depths))
                        pending_tasks.add(task)
                    else:
                        result = await self.process_url(url, depth, crawler, queue, visited, depths)
                        if result:
                            yield result 
            # Wait for the first task to complete and yield results incrementally as each task is completed
            if pending_tasks:
                done, pending_tasks = await asyncio.wait(pending_tasks, return_when=asyncio.FIRST_COMPLETED)
                for task in done:
                    result = await task
                    if result:
                        yield result
--- a/crawl4ai/scraper/bfs_scraper_strategy.py
+++ b/crawl4ai/scraper/bfs_scraper_strategy.py
@@ -1,139 +1,258 @@
-from .scraper_strategy import ScraperStrategy
+from abc import ABC, abstractmethod
-from .filters import FilterChain
+from typing import Union, AsyncGenerator, Optional, Dict, Set
-from .scorers import URLScorer
+from dataclasses import dataclass
-from ..models import CrawlResult
+from datetime import datetime
 from ..async_webcrawler import AsyncWebCrawler
 import asyncio
-import validators
+import logging
-from urllib.parse import urljoin,urlparse,urlunparse
+from urllib.parse import urljoin, urlparse, urlunparse
 from urllib.robotparser import RobotFileParser
 import validators
 import time
 from aiolimiter import AsyncLimiter
 from tenacity import retry, stop_after_attempt, wait_exponential
 from collections import defaultdict
 import logging
 from typing import Dict, AsyncGenerator
 logging.basicConfig(level=logging.DEBUG)
-rate_limiter = AsyncLimiter(1, 1)  # 1 request per second
+from .models import ScraperResult, CrawlResult
 from .filters import FilterChain
 from .scorers import URLScorer
 from ..async_webcrawler import AsyncWebCrawler
@dataclass
 class CrawlStats:
    """Statistics for the crawling process"""
    start_time: datetime
    urls_processed: int = 0
    urls_failed: int = 0
    urls_skipped: int = 0
    total_depth_reached: int = 0
    current_depth: int = 0
    robots_blocked: int = 0
 class ScraperStrategy(ABC):
    """Base class for scraping strategies"""
    @abstractmethod
    async def ascrape(
        self, 
        url: str, 
        crawler: AsyncWebCrawler, 
        parallel_processing: bool = True,
        stream: bool = False
    ) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]:
        """Abstract method for scraping implementation"""
        pass
    @abstractmethod
    async def can_process_url(self, url: str) -> bool:
        """Check if URL can be processed based on strategy rules"""
        pass
    @abstractmethod
    async def shutdown(self):
        """Clean up resources used by the strategy"""
        pass
 class BFSScraperStrategy(ScraperStrategy):
-    def __init__(self, max_depth: int, filter_chain: FilterChain, url_scorer: URLScorer, max_concurrent: int = 5, min_crawl_delay: int=1):
+    """Breadth-First Search scraping strategy with politeness controls"""
    def __init__(
        self,
        max_depth: int,
        filter_chain: FilterChain,
        url_scorer: URLScorer,
        max_concurrent: int = 5,
        min_crawl_delay: int = 1,
        timeout: int = 30,
        logger: Optional[logging.Logger] = None
    ):
        self.max_depth = max_depth
        self.filter_chain = filter_chain
        self.url_scorer = url_scorer
        self.max_concurrent = max_concurrent
-        # For Crawl Politeness
+        self.min_crawl_delay = min_crawl_delay
        self.timeout = timeout
        self.logger = logger or logging.getLogger(__name__)
        # Crawl control
        self.stats = CrawlStats(start_time=datetime.now())
        self._cancel_event = asyncio.Event()
        # Rate limiting and politeness
        self.rate_limiter = AsyncLimiter(1, 1)
        self.last_crawl_time = defaultdict(float)
-        self.min_crawl_delay = min_crawl_delay  # 1 second delay between requests to the same domain
+        self.robot_parsers: Dict[str, RobotFileParser] = {}
-        # For Robots.txt Compliance
+        self.domain_queues: Dict[str, asyncio.Queue] = defaultdict(asyncio.Queue)
        self.robot_parsers = {}
-    # Robots.txt Parser
+    async def can_process_url(self, url: str) -> bool:
-    def get_robot_parser(self, url: str) -> RobotFileParser:
+        """Check if URL can be processed based on robots.txt and filters"""
        domain = urlparse(url)
        scheme = domain.scheme if domain.scheme else 'http'  # Default to 'http' if no scheme provided
        netloc = domain.netloc
        if netloc not in self.robot_parsers:
            rp = RobotFileParser()
            rp.set_url(f"{scheme}://{netloc}/robots.txt")
            try:
                rp.read()
            except Exception as e:
                # Log the type of error, message, and the URL
                logging.warning(f"Error {type(e).__name__} occurred while fetching robots.txt for {netloc}: {e}")
                return None
            self.robot_parsers[netloc] = rp
        return self.robot_parsers[netloc]
    # Retry with exponential backoff
    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
    async def retry_crawl(self, crawler: AsyncWebCrawler, url: str) -> CrawlResult:
        return await crawler.arun(url)
    async def process_url(self, url: str, depth: int, crawler: AsyncWebCrawler, queue: asyncio.PriorityQueue, visited: set, depths: Dict[str, int]) -> AsyncGenerator[CrawlResult, None]:
        def normalize_url(url: str) -> str:
            parsed = urlparse(url)
            return urlunparse(parsed._replace(fragment=""))
        # URL Validation
        if not validators.url(url):
-            logging.warning(f"Invalid URL: {url}")
+            self.logger.warning(f"Invalid URL: {url}")
            return False
        robot_parser = await self._get_robot_parser(url)
        if robot_parser and not robot_parser.can_fetch("*", url):
            self.stats.robots_blocked += 1
            self.logger.info(f"Blocked by robots.txt: {url}")
            return False
        return self.filter_chain.apply(url)
    async def _get_robot_parser(self, url: str) -> Optional[RobotFileParser]:
        """Get or create robots.txt parser for domain"""
        domain = urlparse(url).netloc
        if domain not in self.robot_parsers:
            parser = RobotFileParser()
            try:
                robots_url = f"{urlparse(url).scheme}://{domain}/robots.txt"
                parser.set_url(robots_url)
                parser.read()
                self.robot_parsers[domain] = parser
            except Exception as e:
                self.logger.warning(f"Error fetching robots.txt for {domain}: {e}")
                return None
        return self.robot_parsers[domain]
    @retry(stop=stop_after_attempt(3), 
           wait=wait_exponential(multiplier=1, min=4, max=10))
    async def _crawl_with_retry(
        self, 
        crawler: AsyncWebCrawler, 
        url: str
    ) -> CrawlResult:
        """Crawl URL with retry logic"""
        try:
            async with asyncio.timeout(self.timeout):
                return await crawler.arun(url)
        except asyncio.TimeoutError:
            self.logger.error(f"Timeout crawling {url}")
            raise
    async def process_url(
        self,
        url: str,
        depth: int,
        crawler: AsyncWebCrawler,
        queue: asyncio.PriorityQueue,
        visited: Set[str],
        depths: Dict[str, int]
    ) -> Optional[CrawlResult]:
        """Process a single URL and extract links"""
        if self._cancel_event.is_set():
            return None
-        # Robots.txt Compliance
+        if not await self.can_process_url(url):
-        robot_parser = self.get_robot_parser(url)
+            self.stats.urls_skipped += 1
-        if robot_parser is None:
+            return None
            logging.info(f"Could not retrieve robots.txt for {url}, hence proceeding with crawl.")
        else:
            # If robots.txt was fetched, check if crawling is allowed
            if not robot_parser.can_fetch(crawler.crawler_strategy.user_agent, url):
                logging.info(f"Skipping {url} as per robots.txt")
                return None
-        # Crawl Politeness
+        # Politeness delay
        domain = urlparse(url).netloc
-        time_since_last_crawl = time.time() - self.last_crawl_time[domain]
+        time_since_last = time.time() - self.last_crawl_time[domain]
-        if time_since_last_crawl < self.min_crawl_delay:
+        if time_since_last < self.min_crawl_delay:
-            await asyncio.sleep(self.min_crawl_delay - time_since_last_crawl)
+            await asyncio.sleep(self.min_crawl_delay - time_since_last)
        self.last_crawl_time[domain] = time.time()
-        # Rate Limiting
+        # Crawl with rate limiting
-        async with rate_limiter:
+        try:
-            # Error Handling
+            async with self.rate_limiter:
-            try:
+                result = await self._crawl_with_retry(crawler, url)
-                crawl_result = await self.retry_crawl(crawler, url)
+                self.stats.urls_processed += 1
-            except Exception as e:
+        except Exception as e:
-                logging.error(f"Error crawling {url}: {str(e)}")
+            self.logger.error(f"Error crawling {url}: {e}")
-                crawl_result = CrawlResult(url=url, html="", success=False, status_code=0, error_message=str(e))
+            self.stats.urls_failed += 1
-        
+            return None
        if not crawl_result.success:
            # Logging and Monitoring
            logging.error(f"Failed to crawl URL: {url}. Error: {crawl_result.error_message}")
            return crawl_result
        # Process links
-        for link_type in ["internal", "external"]:
+        await self._process_links(result, url, depth, queue, visited, depths)
            for link in crawl_result.links[link_type]:
                absolute_link = urljoin(url, link['href'])
                normalized_link = normalize_url(absolute_link)
                if self.filter_chain.apply(normalized_link) and normalized_link not in visited:
                    new_depth = depths[url] + 1
                    if new_depth <= self.max_depth:
                        # URL Scoring
                        score = self.url_scorer.score(normalized_link)
                        await queue.put((score, new_depth, normalized_link))
                        depths[normalized_link] = new_depth
        return crawl_result
-    async def ascrape(self, start_url: str, crawler: AsyncWebCrawler, parallel_processing:bool = True) -> AsyncGenerator[CrawlResult,None]:
+        return result
    async def _process_links(
        self,
        result: CrawlResult,
        source_url: str,
        depth: int,
        queue: asyncio.PriorityQueue,
        visited: Set[str],
        depths: Dict[str, int]
    ):
        """Process extracted links from crawl result"""
        for link_type in ["internal", "external"]:
            for link in result.links[link_type]:
                url = urljoin(source_url, link['href'])
                url = urlunparse(urlparse(url)._replace(fragment=""))
                if url not in visited and await self.can_process_url(url):
                    new_depth = depths[source_url] + 1
                    if new_depth <= self.max_depth:
                        score = self.url_scorer.score(url)
                        await queue.put((score, new_depth, url))
                        depths[url] = new_depth
                        self.stats.total_depth_reached = max(
                            self.stats.total_depth_reached, 
                            new_depth
                        )
    async def ascrape(
        self,
        start_url: str,
        crawler: AsyncWebCrawler,
        parallel_processing: bool = True
    ) -> AsyncGenerator[CrawlResult, None]:
        """Implement BFS crawling strategy"""
        # Initialize crawl state
        queue = asyncio.PriorityQueue()
-        queue.put_nowait((0, 0, start_url))
+        await queue.put((0, 0, start_url))
-        visited = set()
+        visited: Set[str] = set()
        depths = {start_url: 0}
        pending_tasks = set()
-        while not queue.empty() or pending_tasks:
+        try:
-            while not queue.empty() and len(pending_tasks) < self.max_concurrent:
+            while (not queue.empty() or pending_tasks) and not self._cancel_event.is_set():
-                _, depth, url = await queue.get()
+                # Start new tasks up to max_concurrent
-                if url not in visited:
+                while not queue.empty() and len(pending_tasks) < self.max_concurrent:
-                    # Adding URL to the visited set here itself, (instead of after result generation)
+                    _, depth, url = await queue.get()
-                    # so that other tasks are not queued for same URL, found at different depth before
+                    if url not in visited:
-                    # crawling and extraction of this task is completed.
+                        visited.add(url)
-                    visited.add(url)
+                        self.stats.current_depth = depth
-                    if parallel_processing:
+                        
-                        task = asyncio.create_task(self.process_url(url, depth, crawler, queue, visited, depths))
+                        if parallel_processing:
-                        pending_tasks.add(task)
+                            task = asyncio.create_task(
-                    else:
+                                self.process_url(url, depth, crawler, queue, visited, depths)
-                        result = await self.process_url(url, depth, crawler, queue, visited, depths)
+                            )
                            pending_tasks.add(task)
                        else:
                            result = await self.process_url(
                                url, depth, crawler, queue, visited, depths
                            )
                            if result:
                                yield result
                # Process completed tasks
                if pending_tasks:
                    done, pending_tasks = await asyncio.wait(
                        pending_tasks,
                        return_when=asyncio.FIRST_COMPLETED
                    )
                    for task in done:
                        result = await task
                        if result:
                            yield result
-            # Wait for the first task to complete and yield results incrementally as each task is completed
+        except Exception as e:
-            if pending_tasks:
+            self.logger.error(f"Error in crawl process: {e}")
-                done, pending_tasks = await asyncio.wait(pending_tasks, return_when=asyncio.FIRST_COMPLETED)
+            raise
-                for task in done:
+            
-                    result = await task
+        finally:
-                    if result:
+            # Clean up any remaining tasks
-                        yield result
+            for task in pending_tasks:
                task.cancel()
            self.stats.end_time = datetime.now()
    async def shutdown(self):
        """Clean up resources and stop crawling"""
        self._cancel_event.set()
        # Clear caches and close connections
        self.robot_parsers.clear()
        self.domain_queues.clear()