fix: Move the creation of crawler outside the main loop

Aravind Karnam
2025-01-27 18:31:13 +05:30
parent 0ff95c83bc
commit d9324e3454

@@ -135,36 +135,36 @@ class BFSScraperStrategy(ScraperStrategy):
         visited: Set[str] = set()
         depths = {start_url: 0}
         active_crawls = set()  # Track URLs currently being processed
-        try:
-            while (
-                not queue.empty() or active_crawls
-            ) and not self._cancel_event.is_set():
-                """
-                This sets up our main control loop which:
-                - Continues while there are URLs to process (not queue.empty())
-                - Or while there are active crawls still running (arun_many)
-                - Can be interrupted via cancellation (not self._cancel_event.is_set())
-                """
-                # Collect batch of jobs to process
-                jobs = []
-                # Fill batch with available jobs
-                while len(jobs) < SCRAPER_BATCH_SIZE and not queue.empty():
-                    score, depth, url = await queue.get()
-                    if url not in active_crawls:  # Only add if not currently processing
-                        jobs.append((score, depth, url))
-                        active_crawls.add(url)
-                        self.stats.current_depth = depth
+        async with AsyncWebCrawler(
+            config=browser_config,
+            verbose=True,
+        ) as crawler:
+            try:
+                while (
+                    not queue.empty() or active_crawls
+                ) and not self._cancel_event.is_set():
+                    """
+                    This sets up our main control loop which:
+                    - Continues while there are URLs to process (not queue.empty())
+                    - Or while there are active crawls still running (arun_many)
+                    - Can be interrupted via cancellation (not self._cancel_event.is_set())
+                    """
+                    # Collect batch of jobs to process
+                    jobs = []
+                    # Fill batch with available jobs
+                    while len(jobs) < SCRAPER_BATCH_SIZE and not queue.empty():
+                        score, depth, url = await queue.get()
+                        if url not in active_crawls:  # Only add if not currently processing
+                            jobs.append((score, depth, url))
+                            active_crawls.add(url)
+                            self.stats.current_depth = depth
-                if not jobs:
-                    # If no jobs but active crawls exist, wait a bit and continue
-                    if active_crawls:
-                        await asyncio.sleep(0.1)
-                        continue
-                # Process batch
-                async with AsyncWebCrawler(
-                    config=browser_config,
-                    verbose=True,
-                ) as crawler:
+                    if not jobs:
+                        # If no jobs but active crawls exist, wait a bit and continue
+                        if active_crawls:
+                            await asyncio.sleep(0.1)
+                            continue
+                    # Process batch
                     try:
                         async for result in await crawler.arun_many(
                             urls=[url for _, _, url in jobs],
@@ -194,13 +194,13 @@ class BFSScraperStrategy(ScraperStrategy):
                             # Continue processing other batches
                             continue
-        except Exception as e:
-            self.logger.error(f"Error in crawl process: {e}")
-            raise
+            except Exception as e:
+                self.logger.error(f"Error in crawl process: {e}")
+                raise
-        finally:
-            self.stats.end_time = datetime.now()
-            await crawler.close()
+            finally:
+                self.stats.end_time = datetime.now()
+                await crawler.close()
     async def shutdown(self):
         """Clean up resources and stop crawling"""