refactor: Remove the URL processing logic from the scraper
@@ -57,7 +57,6 @@ class AsyncWebScraper:
    async def ascrape(
        self,
        url: str,
        parallel_processing: bool = True,
        stream: bool = False
    ) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]:
        """
@@ -65,7 +64,6 @@ class AsyncWebScraper:

        Args:
            url: Starting URL for scraping
            parallel_processing: Whether to process URLs in parallel
            stream: If True, yield results as they come; if False, collect all results

        Returns:
@@ -75,17 +73,16 @@ class AsyncWebScraper:

        async with self._error_handling_context(url):
            if stream:
                return self._ascrape_yielding(url, parallel_processing)
            return await self._ascrape_collecting(url, parallel_processing)
                return self._ascrape_yielding(url)
            return await self._ascrape_collecting(url)

    async def _ascrape_yielding(
        self,
        url: str,
        parallel_processing: bool
    ) -> AsyncGenerator[CrawlResult, None]:
        """Stream scraping results as they become available."""
        try:
            result_generator = self.strategy.ascrape(url, self.crawler, parallel_processing)
            result_generator = self.strategy.ascrape(url, self.crawler)
            async for res in result_generator:
                self._progress.processed_urls += 1
                self._progress.current_url = res.url
@@ -97,13 +94,12 @@ class AsyncWebScraper:
    async def _ascrape_collecting(
        self,
        url: str,
        parallel_processing: bool
    ) -> ScraperResult:
        """Collect all scraping results before returning."""
        extracted_data = {}

        try:
            result_generator = self.strategy.ascrape(url, self.crawler, parallel_processing)
            result_generator = self.strategy.ascrape(url, self.crawler)
            async for res in result_generator:
                self._progress.processed_urls += 1
                self._progress.current_url = res.url
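For orientation, a minimal sketch of how the slimmed-down `ascrape` entry point can be driven after this change. The scraper construction itself (crawler and strategy wiring) is assumed rather than shown in this diff.

```python
# Sketch only: assumes an AsyncWebScraper instance wired up elsewhere (crawler/strategy not shown in this diff).
async def run_streaming(scraper, start_url: str) -> None:
    # stream=True: ascrape hands back an async generator; results arrive as pages finish.
    async for result in await scraper.ascrape(start_url, stream=True):
        print(result.url, result.success)

async def run_collecting(scraper, start_url: str):
    # stream=False: ascrape runs the whole crawl and returns a single ScraperResult.
    return await scraper.ascrape(start_url, stream=False)
```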
@@ -6,15 +6,12 @@ import logging
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser
import validators
import time
from aiolimiter import AsyncLimiter
from tenacity import retry, stop_after_attempt, wait_exponential
from collections import defaultdict

from crawl4ai.async_configs import CrawlerRunConfig
from .models import CrawlResult
from .filters import FilterChain
from .scorers import URLScorer
from ..async_webcrawler import AsyncWebCrawler, CrawlerRunConfig
from ..async_webcrawler import AsyncWebCrawler
from .scraper_strategy import ScraperStrategy

@dataclass
@@ -37,29 +34,18 @@ class BFSScraperStrategy(ScraperStrategy):
        filter_chain: FilterChain,
        url_scorer: URLScorer,
        process_external_links: bool = False,
        max_concurrent: int = 5,
        min_crawl_delay: int = 1,
        timeout: int = 30,
        logger: Optional[logging.Logger] = None
    ):
        self.max_depth = max_depth
        self.filter_chain = filter_chain
        self.url_scorer = url_scorer
        self.max_concurrent = max_concurrent
        self.min_crawl_delay = min_crawl_delay
        self.timeout = timeout
        self.logger = logger or logging.getLogger(__name__)

        # Crawl control
        self.stats = CrawlStats(start_time=datetime.now())
        self._cancel_event = asyncio.Event()
        self.process_external_links = process_external_links

        # Rate limiting and politeness
        self.rate_limiter = AsyncLimiter(1, 1)
        self.last_crawl_time = defaultdict(float)
        self.robot_parsers: Dict[str, RobotFileParser] = {}
        self.domain_queues: Dict[str, asyncio.Queue] = defaultdict(asyncio.Queue)
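A hypothetical wiring of the strategy using the parameters listed in `__init__` above; the `FilterChain([])` call and the scorer instance are placeholders for whatever filters and scorers the project actually provides.

```python
# Hypothetical construction; argument names follow __init__ above, values are illustrative.
strategy = BFSScraperStrategy(
    max_depth=2,                   # follow links at most two hops from the start URL
    filter_chain=FilterChain([]),  # assumed: an empty chain that accepts every URL
    url_scorer=my_scorer,          # assumed: any URLScorer implementation
    process_external_links=False,  # stay on the start domain
    max_concurrent=5,
    min_crawl_delay=1,
    timeout=30,
)
```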
    async def can_process_url(self, url: str, depth: int) -> bool:
        """Check if URL can be processed based on robots.txt and filters
@@ -107,74 +93,6 @@ class BFSScraperStrategy(ScraperStrategy):
            return None
        return self.robot_parsers[domain]
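For reference, a standalone sketch of the per-domain robots.txt caching that `can_process_url` and the `robot_parsers` dict rely on; the helper names below are made up for the example, only the `RobotFileParser` usage matches the file.

```python
# Illustrative sketch: cache one RobotFileParser per domain and consult it per URL.
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

robot_parsers: dict[str, RobotFileParser] = {}

def get_robot_parser(url: str) -> RobotFileParser:
    domain = urlparse(url).netloc
    if domain not in robot_parsers:
        parser = RobotFileParser()
        parser.set_url(f"https://{domain}/robots.txt")
        parser.read()  # fetch and parse robots.txt once per domain
        robot_parsers[domain] = parser
    return robot_parsers[domain]

def is_allowed(url: str, user_agent: str = "*") -> bool:
    return get_robot_parser(url).can_fetch(user_agent, url)
```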
    @retry(stop=stop_after_attempt(3),
           wait=wait_exponential(multiplier=1, min=4, max=10))
    async def _crawl_with_retry(
        self,
        crawler: AsyncWebCrawler,
        url: str
    ) -> CrawlResult:
        """Crawl URL with retry logic"""
        try:
            crawler_config = CrawlerRunConfig(cache_mode="BYPASS")
            return await asyncio.wait_for(crawler.arun(url, config=crawler_config), timeout=self.timeout)
        except asyncio.TimeoutError:
            self.logger.error(f"Timeout crawling {url}")
            raise
        except Exception as e:
            # Re-raise any other exception so the retry decorator can schedule another attempt
            self.logger.error(f"Error crawling {url}: {e}")
            raise
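A minimal standalone sketch of the retry policy used above: up to three attempts, with exponential backoff waiting roughly 4 to 10 seconds between attempts. The failing function body is a stand-in for the real `crawler.arun(...)` call.

```python
import asyncio
from tenacity import retry, stop_after_attempt, wait_exponential

@retry(stop=stop_after_attempt(3),
       wait=wait_exponential(multiplier=1, min=4, max=10))
async def fetch_once(url: str) -> str:
    # Stand-in for crawler.arun(...); raising makes tenacity schedule another attempt.
    raise TimeoutError(f"simulated timeout for {url}")

# asyncio.run(fetch_once("https://example.com"))  # raises RetryError after the third failure
```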
    async def process_url(
        self,
        url: str,
        depth: int,
        crawler: AsyncWebCrawler,
        queue: asyncio.PriorityQueue,
        visited: Set[str],
        depths: Dict[str, int]
    ) -> Optional[CrawlResult]:
        """Process a single URL and extract links.

        This is our main URL processing workhorse that:
        - Checks for cancellation
        - Validates URLs through can_process_url
        - Implements politeness delays per domain
        - Applies rate limiting
        - Handles crawling with retries
        - Updates various statistics
        - Processes extracted links
        - Returns the crawl result or None on failure
        """

        if self._cancel_event.is_set():
            return None

        if not await self.can_process_url(url, depth):
            self.stats.urls_skipped += 1
            return None

        # Politeness delay
        domain = urlparse(url).netloc
        time_since_last = time.time() - self.last_crawl_time[domain]
        if time_since_last < self.min_crawl_delay:
            await asyncio.sleep(self.min_crawl_delay - time_since_last)
        self.last_crawl_time[domain] = time.time()

        # Crawl with rate limiting
        try:
            async with self.rate_limiter:
                result = await self._crawl_with_retry(crawler, url)
                self.stats.urls_processed += 1
                # Process links
                await self._process_links(result, url, depth, queue, visited, depths)
                return result
        except Exception as e:
            self.logger.error(f"Error crawling {url}: {e}")
            self.stats.urls_failed += 1
            return None
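A condensed sketch of the politeness pattern above: one shared `AsyncLimiter` plus a per-domain minimum delay tracked in a last-crawl-time map. `fetch` is a placeholder callable, not a function from the file.

```python
import asyncio
import time
from collections import defaultdict
from urllib.parse import urlparse
from aiolimiter import AsyncLimiter

rate_limiter = AsyncLimiter(1, 1)   # at most one acquisition per second overall
last_crawl_time = defaultdict(float)
MIN_CRAWL_DELAY = 1.0

async def polite_fetch(url: str, fetch):
    domain = urlparse(url).netloc
    wait = MIN_CRAWL_DELAY - (time.time() - last_crawl_time[domain])
    if wait > 0:
        await asyncio.sleep(wait)   # respect the per-domain delay
    last_crawl_time[domain] = time.time()
    async with rate_limiter:        # global rate limit
        return await fetch(url)
```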
    async def _process_links(
        self,
        result: CrawlResult,
@@ -187,7 +105,7 @@ class BFSScraperStrategy(ScraperStrategy):
        """Process extracted links from crawl result.

        This is our link processor that:
        - Handles both internal and external links
        - Normalizes URLs (removes fragments)
        - Checks whether each URL can be processed: validates the URL, applies filters, and tests robots.txt compliance via can_process_url
        - Checks depth limits
        - Scores URLs for priority
        - Updates depth tracking
@@ -199,6 +117,9 @@ class BFSScraperStrategy(ScraperStrategy):
            links_to_process += result.links["external"]
        for link in links_to_process:
            url = link['href']
            if not await self.can_process_url(url, depth):
                self.stats.urls_skipped += 1
                continue
            if url not in visited:
                new_depth = depths[source_url] + 1
                if new_depth <= self.max_depth:
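A sketch of the enqueue step this hunk describes: drop the URL fragment, score the URL, and push a `(score, depth, url)` tuple so the priority queue pops the most promising links first. The `scorer.score(url)` call is an assumption based on the `url_scorer` attribute, not a confirmed interface.

```python
import asyncio
from urllib.parse import urldefrag

async def enqueue_link(href: str, parent_depth: int, scorer, queue: asyncio.PriorityQueue) -> None:
    url, _fragment = urldefrag(href)   # normalize: remove "#section" fragments
    new_depth = parent_depth + 1
    score = scorer.score(url)          # assumed URLScorer interface
    await queue.put((score, new_depth, url))
```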
@@ -219,7 +140,6 @@ class BFSScraperStrategy(ScraperStrategy):
        self,
        start_url: str,
        crawler: AsyncWebCrawler,
        parallel_processing: bool = True
    ) -> AsyncGenerator[CrawlResult, None]:
        """Implement BFS crawling strategy"""

@@ -237,62 +157,38 @@ class BFSScraperStrategy(ScraperStrategy):
        await queue.put((0, 0, start_url))
        visited: Set[str] = set()
        depths = {start_url: 0}
        pending_tasks = set()

        try:
            while (not queue.empty() or pending_tasks) and not self._cancel_event.is_set():
            while not queue.empty() and not self._cancel_event.is_set():
                """
                This sets up our main control loop, which:
                - Continues while there are URLs to process (not queue.empty())
                - Or while there are tasks still running (pending_tasks)
                - Can be interrupted via cancellation (not self._cancel_event.is_set())
                """
                # Start new tasks up to max_concurrent
                while not queue.empty() and len(pending_tasks) < self.max_concurrent:
                    """
                    This section manages task creation:
                    - Checks if we can start more tasks (under the max_concurrent limit)
                    - Gets the next URL from the priority queue
                    - Marks URLs as visited immediately to prevent duplicates
                    - Updates the current depth in stats
                    - Either:
                      - Creates a new async task (parallel mode)
                      - Processes the URL directly (sequential mode)
                    """
                    _, depth, url = await queue.get()
                n = 3
                jobs = []
                for _ in range(n):
                    if queue.empty():
                        break
                    jobs.append(await queue.get())

                # Filter jobs directly, ensuring uniqueness and checking against visited
                filtered_jobs = []
                for job in jobs:
                    _, depth, url = job
                    self.stats.current_depth = depth
                    if url not in visited:
                        visited.add(url)
                        self.stats.current_depth = depth

                        if parallel_processing:
                            task = asyncio.create_task(
                                self.process_url(url, depth, crawler, queue, visited, depths)
                            )
                            pending_tasks.add(task)
                        else:
                            result = await self.process_url(
                                url, depth, crawler, queue, visited, depths
                            )
                            if result:
                                yield result

                # Process completed tasks
                """
                This section manages completed tasks:
                - Waits for any task to complete using asyncio.wait
                - Uses FIRST_COMPLETED to handle results as soon as they're ready
                - Yields successful results to the caller
                - Updates pending_tasks to remove completed ones
                """
                if pending_tasks:
                    done, pending_tasks = await asyncio.wait(
                        pending_tasks,
                        return_when=asyncio.FIRST_COMPLETED
                    )
                    for task in done:
                        result = await task
                        if result:
                            yield result
                        filtered_jobs.append(job)

                crawler_config = CrawlerRunConfig(cache_mode="BYPASS")
                async for result in await crawler.arun_many(urls=[url for _, _, url in filtered_jobs],
                                                            config=crawler_config.clone(stream=True)):
                    print(f"Received result for: {result.url} - Success: {result.success}")
                    source_url, depth = next((url, depth) for _, depth, url in filtered_jobs if url == result.source_url)
                    await self._process_links(result, source_url, depth, queue, visited, depths)
                    yield result

        except Exception as e:
            self.logger.error(f"Error in crawl process: {e}")
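A condensed sketch of the batched flow this hunk introduces: drain up to `n` entries from the priority queue, skip anything already visited, then stream the whole batch through `arun_many`. The `CrawlerRunConfig` calls mirror the ones in the hunk; the helper itself is illustrative, not the file's method.

```python
async def crawl_batch(queue, visited, crawler, n=3):
    jobs = []
    while len(jobs) < n and not queue.empty():
        job = await queue.get()                    # (score, depth, url)
        if job[2] not in visited:
            visited.add(job[2])
            jobs.append(job)

    if jobs:
        config = CrawlerRunConfig(cache_mode="BYPASS").clone(stream=True)
        async for result in await crawler.arun_many(urls=[url for _, _, url in jobs], config=config):
            yield result                           # results arrive as each page finishes
```

A caller would consume it like the strategy's own generator: `async for result in crawl_batch(queue, visited, crawler): ...`.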
@@ -300,13 +196,12 @@ class BFSScraperStrategy(ScraperStrategy):

        finally:
            # Clean up any remaining tasks
            for task in pending_tasks:
                task.cancel()
            # for task in pending_tasks:
            #     task.cancel()
            self.stats.end_time = datetime.now()

    async def shutdown(self):
        """Clean up resources and stop crawling"""
        self._cancel_event.set()
        # Clear caches and close connections
        self.robot_parsers.clear()
        self.domain_queues.clear()
        self.robot_parsers.clear()
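A sketch of the cooperative-shutdown handshake, assuming the behavior shown above: `shutdown()` only sets the event and clears caches, and the crawl loop notices the flag on its next iteration and winds down on its own. The consumer function below is illustrative.

```python
import asyncio

async def consume_until_deadline(strategy, results, limit_seconds: float):
    """Consume crawl results until a deadline, then ask the strategy to stop."""
    loop = asyncio.get_running_loop()
    deadline = loop.time() + limit_seconds
    async for result in results:
        print(result.url)
        if loop.time() > deadline:
            await strategy.shutdown()   # sets _cancel_event; the while-loop condition sees it
            break
```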