fix: Move the creation of crawler outside the main loop
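The change hoists the async with AsyncWebCrawler(...) context out of the batch loop, so the (browser-backed) crawler is set up and torn down once per crawl instead of once per batch. Reduced to a runnable sketch of the pattern (FakeCrawler and its timings below are stand-ins for illustration, not the project's API):

    import asyncio


    class FakeCrawler:
        """Stand-in for AsyncWebCrawler: expensive to enter (launches a browser)."""

        async def __aenter__(self):
            await asyncio.sleep(0.1)  # simulate browser startup cost
            return self

        async def __aexit__(self, *exc):
            await asyncio.sleep(0.05)  # simulate browser teardown

        async def crawl(self, url: str) -> str:
            await asyncio.sleep(0.01)  # simulate fetching one page
            return f"crawled {url}"


    async def run_batches(batches: list[list[str]]) -> None:
        # The point of the commit: enter the context manager ONCE, outside the
        # batch loop, so startup/teardown is not paid on every iteration.
        async with FakeCrawler() as crawler:
            for batch in batches:
                results = await asyncio.gather(*(crawler.crawl(u) for u in batch))
                print(results)


    asyncio.run(run_batches([["https://a.example", "https://b.example"], ["https://c.example"]]))

Before the fix, each pass through the loop re-entered the context and paid the startup and teardown cost; hoisting it amortizes that cost across all batches.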
@@ -135,36 +135,36 @@ class BFSScraperStrategy(ScraperStrategy):
         visited: Set[str] = set()
         depths = {start_url: 0}
         active_crawls = set()  # Track URLs currently being processed
-        try:
-            while (
-                not queue.empty() or active_crawls
-            ) and not self._cancel_event.is_set():
-                """
-                This sets up our main control loop which:
-                - Continues while there are URLs to process (not queue.empty())
-                - Or while there are active crawls still running (arun_many)
-                - Can be interrupted via cancellation (not self._cancel_event.is_set())
-                """
-                # Collect batch of jobs to process
-                jobs = []
-                # Fill batch with available jobs
-                while len(jobs) < SCRAPER_BATCH_SIZE and not queue.empty():
-                    score, depth, url = await queue.get()
-                    if url not in active_crawls:  # Only add if not currently processing
-                        jobs.append((score, depth, url))
-                        active_crawls.add(url)
-                        self.stats.current_depth = depth
+        async with AsyncWebCrawler(
+            config=browser_config,
+            verbose=True,
+        ) as crawler:
+            try:
+                while (
+                    not queue.empty() or active_crawls
+                ) and not self._cancel_event.is_set():
+                    """
+                    This sets up our main control loop which:
+                    - Continues while there are URLs to process (not queue.empty())
+                    - Or while there are active crawls still running (arun_many)
+                    - Can be interrupted via cancellation (not self._cancel_event.is_set())
+                    """
+                    # Collect batch of jobs to process
+                    jobs = []
+                    # Fill batch with available jobs
+                    while len(jobs) < SCRAPER_BATCH_SIZE and not queue.empty():
+                        score, depth, url = await queue.get()
+                        if url not in active_crawls:  # Only add if not currently processing
+                            jobs.append((score, depth, url))
+                            active_crawls.add(url)
+                            self.stats.current_depth = depth

                     if not jobs:
                         # If no jobs but active crawls exist, wait a bit and continue
                         if active_crawls:
                             await asyncio.sleep(0.1)
                             continue
                     # Process batch
-                async with AsyncWebCrawler(
-                    config=browser_config,
-                    verbose=True,
-                ) as crawler:
                     try:
                         async for result in await crawler.arun_many(
                             urls=[url for _, _, url in jobs],
@@ -194,13 +194,13 @@ class BFSScraperStrategy(ScraperStrategy):
                             # Continue processing other batches
                             continue

             except Exception as e:
                 self.logger.error(f"Error in crawl process: {e}")
                 raise

             finally:
                 self.stats.end_time = datetime.now()
                 await crawler.close()

     async def shutdown(self):
         """Clean up resources and stop crawling"""
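For reference, the control loop that the async with block now wraps reduces to the following runnable sketch. It assumes queue is an asyncio.PriorityQueue of (score, depth, url) tuples served lowest-score-first; the SCRAPER_BATCH_SIZE value here is made up, standing in for the project's constant:

    import asyncio

    SCRAPER_BATCH_SIZE = 3  # assumed value; the real constant comes from project config


    async def control_loop(
        queue: asyncio.PriorityQueue,
        cancel_event: asyncio.Event,
    ) -> None:
        active_crawls: set[str] = set()
        # Same loop shape as the diff: run while work is queued or in flight,
        # and stop as soon as cancellation is requested.
        while (not queue.empty() or active_crawls) and not cancel_event.is_set():
            jobs: list[tuple[float, int, str]] = []
            # Fill a batch, skipping URLs that are already being crawled
            while len(jobs) < SCRAPER_BATCH_SIZE and not queue.empty():
                score, depth, url = await queue.get()
                if url not in active_crawls:
                    jobs.append((score, depth, url))
                    active_crawls.add(url)
            if not jobs:
                if active_crawls:
                    await asyncio.sleep(0.1)  # in-flight work remains; poll again shortly
                    continue
            for _, _, url in jobs:  # stand-in for crawler.arun_many(...)
                print(f"would crawl {url}")
                active_crawls.discard(url)  # stand-in for completion handling


    async def demo() -> None:
        queue: asyncio.PriorityQueue = asyncio.PriorityQueue()
        for job in [(0.2, 1, "https://b.example"), (0.1, 0, "https://a.example")]:
            await queue.put(job)
        await control_loop(queue, asyncio.Event())


    asyncio.run(demo())

Because active_crawls is consulted before a URL joins a batch, a URL that is re-queued while still in flight is dequeued and dropped rather than crawled twice.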