feat:
1. Make active_crawls a dict instead of a set and remove the jobs array, for efficient lookup and storage of active crawls and crawl control. 2. Put a lock on active_crawls so simultaneous push and pop by coroutines don't cause a race condition. 3. Move the depth-check logic outside the child-link for loop, since source_url doesn't change in the loop.
This commit is contained in:
@@ -46,6 +46,7 @@ class BFSScraperStrategy(ScraperStrategy):
|
||||
self.stats = CrawlStats(start_time=datetime.now())
|
||||
self._cancel_event = asyncio.Event()
|
||||
self.process_external_links = process_external_links
|
||||
self._active_crawls_lock = asyncio.Lock()
|
||||
|
||||
async def can_process_url(self, url: str, depth: int) -> bool:
|
||||
"""Check if URL can be processed based on filters
|
||||
@@ -77,21 +78,25 @@ class BFSScraperStrategy(ScraperStrategy):
|
||||
self,
|
||||
result: CrawlResult,
|
||||
source_url: str,
|
||||
depth: int,
|
||||
queue: asyncio.PriorityQueue,
|
||||
visited: Set[str],
|
||||
depths: Dict[str, int],
|
||||
):
|
||||
"""Process extracted links from crawl result.
|
||||
This is our link processor that:
|
||||
Handles both internal and external links
|
||||
Checks if URL can be processed - validates URL, applies Filters with can_process_url
|
||||
Checks depth limits
|
||||
Handles both internal and external links
|
||||
Checks if URL is visited already
|
||||
Checks if URL can be processed - validates URL, applies Filters with can_process_url
|
||||
Scores URLs for priority
|
||||
Updates depth tracking
|
||||
Updates depth tracking dictionary
|
||||
Adds valid URLs to the queue
|
||||
Updates maximum depth statistics
|
||||
"""
|
||||
next_depth = depths[source_url] + 1
|
||||
# If depth limit reached, exit without processing links
|
||||
if next_depth > self.max_depth:
|
||||
return
|
||||
links_to_process = result.links["internal"]
|
||||
if self.process_external_links:
|
||||
links_to_process += result.links["external"]
|
||||
@@ -99,17 +104,14 @@ class BFSScraperStrategy(ScraperStrategy):
|
||||
url = link["href"]
|
||||
if url in visited:
|
||||
continue
|
||||
new_depth = depths[source_url] + 1
|
||||
if new_depth > self.max_depth:
|
||||
continue
|
||||
if not await self.can_process_url(url, new_depth):
|
||||
if not await self.can_process_url(url, next_depth):
|
||||
self.stats.urls_skipped += 1
|
||||
continue
|
||||
score = self.url_scorer.score(url) if self.url_scorer else 0
|
||||
await queue.put((score, new_depth, url))
|
||||
depths[url] = new_depth
|
||||
await queue.put((score, next_depth, url))
|
||||
depths[url] = next_depth
|
||||
self.stats.total_depth_reached = max(
|
||||
self.stats.total_depth_reached, new_depth
|
||||
self.stats.total_depth_reached, next_depth
|
||||
)
|
||||
|
||||
async def ascrape(
|
||||
@@ -134,7 +136,7 @@ class BFSScraperStrategy(ScraperStrategy):
|
||||
await queue.put((0, 0, start_url))
|
||||
visited: Set[str] = set()
|
||||
depths = {start_url: 0}
|
||||
active_crawls = set() # Track URLs currently being processed
|
||||
active_crawls = {} # Track URLs currently being processed with depth and score
|
||||
async with AsyncWebCrawler(
|
||||
config=browser_config,
|
||||
verbose=True,
|
||||
@@ -149,39 +151,31 @@ class BFSScraperStrategy(ScraperStrategy):
|
||||
- Or while there are active crawls still running (arun_many)
|
||||
- Can be interrupted via cancellation (not self._cancel_event.is_set())
|
||||
"""
|
||||
# Collect batch of jobs to process
|
||||
jobs = []
|
||||
# Fill batch with available jobs
|
||||
while len(jobs) < SCRAPER_BATCH_SIZE and not queue.empty():
|
||||
score, depth, url = await queue.get()
|
||||
if (
|
||||
url not in active_crawls
|
||||
): # Only add if not currently processing
|
||||
jobs.append((score, depth, url))
|
||||
active_crawls.add(url)
|
||||
# Collect batch of URLs into active_crawls to process
|
||||
async with self._active_crawls_lock:
|
||||
while len(active_crawls) < SCRAPER_BATCH_SIZE and not queue.empty():
|
||||
score, depth, url = await queue.get()
|
||||
active_crawls[url] = {"depth": depth, "score": score}
|
||||
self.stats.current_depth = depth
|
||||
|
||||
if not jobs:
|
||||
# If no jobs but active crawls exist, wait a bit and continue
|
||||
if active_crawls:
|
||||
await asyncio.sleep(0.1)
|
||||
if not active_crawls:
|
||||
# If no active crawls exist, wait a bit and continue
|
||||
await asyncio.sleep(0.1)
|
||||
continue
|
||||
# Process batch
|
||||
try:
|
||||
async for result in await crawler.arun_many(
|
||||
urls=[url for _, _, url in jobs],
|
||||
urls=list(active_crawls.keys()),
|
||||
config=crawler_config.clone(stream=True),
|
||||
):
|
||||
source_url, depth = next(
|
||||
(url, depth)
|
||||
for _, depth, url in jobs
|
||||
if url == result.url
|
||||
)
|
||||
active_crawls.remove(source_url) # Remove from active set
|
||||
source_url = result.url
|
||||
depth = active_crawls[source_url]["depth"]
|
||||
async with self._active_crawls_lock:
|
||||
active_crawls.pop(source_url, None)
|
||||
|
||||
if result.success:
|
||||
await self._process_links(
|
||||
result, source_url, depth, queue, visited, depths
|
||||
result, source_url, queue, visited, depths
|
||||
)
|
||||
yield result
|
||||
else:
|
||||
@@ -189,9 +183,6 @@ class BFSScraperStrategy(ScraperStrategy):
|
||||
f"Failed to crawl {result.url}: {result.error_message}"
|
||||
)
|
||||
except Exception as e:
|
||||
# Remove failed URLs from active set
|
||||
for _, _, url in jobs:
|
||||
active_crawls.discard(url)
|
||||
self.logger.error(f"Batch processing error: {e}")
|
||||
# Continue processing other batches
|
||||
continue
|
||||
|
||||
@@ -188,11 +188,11 @@ if __name__ == "__main__":
|
||||
import time
|
||||
|
||||
# Run basic example
|
||||
# start_time = time.perf_counter()
|
||||
# print("Running basic scraper example...")
|
||||
# asyncio.run(basic_scraper_example())
|
||||
# end_time = time.perf_counter()
|
||||
# print(f"Basic scraper example completed in {end_time - start_time:.2f} seconds")
|
||||
start_time = time.perf_counter()
|
||||
print("Running basic scraper example...")
|
||||
asyncio.run(basic_scraper_example())
|
||||
end_time = time.perf_counter()
|
||||
print(f"Basic scraper example completed in {end_time - start_time:.2f} seconds")
|
||||
|
||||
# # Run advanced example
|
||||
print("\nRunning advanced scraper example...")
|
||||
|
||||
Reference in New Issue
Block a user