fix: Move the creation of crawler outside the main loop

Author: Aravind Karnam
Date:   2025-01-27 18:31:13 +05:30
Parent: 0ff95c83bc
Commit: d9324e3454

[View File]

@@ -135,36 +135,36 @@ class BFSScraperStrategy(ScraperStrategy):
         visited: Set[str] = set()
         depths = {start_url: 0}
         active_crawls = set()  # Track URLs currently being processed
-        try:
-            while (
-                not queue.empty() or active_crawls
-            ) and not self._cancel_event.is_set():
-                """
-                This sets up our main control loop which:
-                - Continues while there are URLs to process (not queue.empty())
-                - Or while there are active crawls still running (arun_many)
-                - Can be interrupted via cancellation (not self._cancel_event.is_set())
-                """
-                # Collect batch of jobs to process
-                jobs = []
-                # Fill batch with available jobs
-                while len(jobs) < SCRAPER_BATCH_SIZE and not queue.empty():
-                    score, depth, url = await queue.get()
-                    if url not in active_crawls:  # Only add if not currently processing
-                        jobs.append((score, depth, url))
-                        active_crawls.add(url)
-                        self.stats.current_depth = depth
+        async with AsyncWebCrawler(
+            config=browser_config,
+            verbose=True,
+        ) as crawler:
+            try:
+                while (
+                    not queue.empty() or active_crawls
+                ) and not self._cancel_event.is_set():
+                    """
+                    This sets up our main control loop which:
+                    - Continues while there are URLs to process (not queue.empty())
+                    - Or while there are active crawls still running (arun_many)
+                    - Can be interrupted via cancellation (not self._cancel_event.is_set())
+                    """
+                    # Collect batch of jobs to process
+                    jobs = []
+                    # Fill batch with available jobs
+                    while len(jobs) < SCRAPER_BATCH_SIZE and not queue.empty():
+                        score, depth, url = await queue.get()
+                        if url not in active_crawls:  # Only add if not currently processing
+                            jobs.append((score, depth, url))
+                            active_crawls.add(url)
+                            self.stats.current_depth = depth

-                if not jobs:
-                    # If no jobs but active crawls exist, wait a bit and continue
-                    if active_crawls:
-                        await asyncio.sleep(0.1)
-                        continue
+                    if not jobs:
+                        # If no jobs but active crawls exist, wait a bit and continue
+                        if active_crawls:
+                            await asyncio.sleep(0.1)
+                            continue

-                # Process batch
-                async with AsyncWebCrawler(
-                    config=browser_config,
-                    verbose=True,
-                ) as crawler:
+                    # Process batch
                     try:
                         async for result in await crawler.arun_many(
                             urls=[url for _, _, url in jobs],
@@ -194,13 +194,13 @@ class BFSScraperStrategy(ScraperStrategy):
                             # Continue processing other batches
                             continue
-        except Exception as e:
-            self.logger.error(f"Error in crawl process: {e}")
-            raise
-        finally:
-            self.stats.end_time = datetime.now()
-            await crawler.close()
+            except Exception as e:
+                self.logger.error(f"Error in crawl process: {e}")
+                raise
+            finally:
+                self.stats.end_time = datetime.now()
+                await crawler.close()

     async def shutdown(self):
         """Clean up resources and stop crawling"""