fix: Move the creation of crawler outside the main loop

Author: Aravind Karnam
Date:   2025-01-27 18:31:13 +05:30
Parent: 0ff95c83bc
Commit: d9324e3454

[View File]

@@ -135,36 +135,36 @@ class BFSScraperStrategy(ScraperStrategy):
         visited: Set[str] = set()
         depths = {start_url: 0}
         active_crawls = set()  # Track URLs currently being processed
-        try:
-            while (
-                not queue.empty() or active_crawls
-            ) and not self._cancel_event.is_set():
-                """
-                This sets up our main control loop which:
-                - Continues while there are URLs to process (not queue.empty())
-                - Or while there are active crawls still running (arun_many)
-                - Can be interrupted via cancellation (not self._cancel_event.is_set())
-                """
-                # Collect batch of jobs to process
-                jobs = []
-                # Fill batch with available jobs
-                while len(jobs) < SCRAPER_BATCH_SIZE and not queue.empty():
-                    score, depth, url = await queue.get()
-                    if url not in active_crawls:  # Only add if not currently processing
-                        jobs.append((score, depth, url))
-                        active_crawls.add(url)
-                        self.stats.current_depth = depth
+        async with AsyncWebCrawler(
+            config=browser_config,
+            verbose=True,
+        ) as crawler:
+            try:
+                while (
+                    not queue.empty() or active_crawls
+                ) and not self._cancel_event.is_set():
+                    """
+                    This sets up our main control loop which:
+                    - Continues while there are URLs to process (not queue.empty())
+                    - Or while there are active crawls still running (arun_many)
+                    - Can be interrupted via cancellation (not self._cancel_event.is_set())
+                    """
+                    # Collect batch of jobs to process
+                    jobs = []
+                    # Fill batch with available jobs
+                    while len(jobs) < SCRAPER_BATCH_SIZE and not queue.empty():
+                        score, depth, url = await queue.get()
+                        if url not in active_crawls:  # Only add if not currently processing
+                            jobs.append((score, depth, url))
+                            active_crawls.add(url)
+                            self.stats.current_depth = depth

-                if not jobs:
-                    # If no jobs but active crawls exist, wait a bit and continue
-                    if active_crawls:
-                        await asyncio.sleep(0.1)
-                        continue
+                    if not jobs:
+                        # If no jobs but active crawls exist, wait a bit and continue
+                        if active_crawls:
+                            await asyncio.sleep(0.1)
+                            continue

-                # Process batch
-                async with AsyncWebCrawler(
-                    config=browser_config,
-                    verbose=True,
-                ) as crawler:
+                    # Process batch
                     try:
                         async for result in await crawler.arun_many(
                             urls=[url for _, _, url in jobs],
@@ -194,13 +194,13 @@ class BFSScraperStrategy(ScraperStrategy):
                             # Continue processing other batches
                             continue
-        except Exception as e:
-            self.logger.error(f"Error in crawl process: {e}")
-            raise
-        finally:
-            self.stats.end_time = datetime.now()
-            await crawler.close()
+            except Exception as e:
+                self.logger.error(f"Error in crawl process: {e}")
+                raise
+            finally:
+                self.stats.end_time = datetime.now()
+                await crawler.close()

     async def shutdown(self):
         """Clean up resources and stop crawling"""