refactor(deep-crawl): add max_pages limit and improve crawl control

Add max_pages parameter to all deep crawling strategies to limit total pages crawled.
Add score_threshold parameter to BFS/DFS strategies for quality control.
Remove legacy parameter handling in AsyncWebCrawler.
Improve error handling and logging in crawl strategies.

BREAKING CHANGE: Removed support for legacy parameters in AsyncWebCrawler.run_many()
This commit is contained in:
UncleCode
2025-03-03 21:51:11 +08:00
parent c612f9a852
commit d024749633
7 changed files with 372 additions and 91 deletions

View File

@@ -37,6 +37,7 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
# Clone config to disable recursive deep crawling.
batch_config = config.clone(deep_crawl_strategy=None, stream=False)
url_results = await crawler.arun_many(urls=[url], config=batch_config)
for result in url_results:
result.metadata = result.metadata or {}
result.metadata["depth"] = depth
@@ -44,13 +45,19 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
if self.url_scorer:
result.metadata["score"] = self.url_scorer.score(url)
results.append(result)
new_links: List[Tuple[str, Optional[str]]] = []
await self.link_discovery(result, url, depth, visited, new_links, depths)
# Push new links in reverse order so the first discovered is processed next.
for new_url, new_parent in reversed(new_links):
new_depth = depths.get(new_url, depth + 1)
stack.append((new_url, new_parent, new_depth))
# Count only successful crawls toward max_pages limit
if result.success:
self._pages_crawled += 1
# Only discover links from successful crawls
new_links: List[Tuple[str, Optional[str]]] = []
await self.link_discovery(result, url, depth, visited, new_links, depths)
# Push new links in reverse order so the first discovered is processed next.
for new_url, new_parent in reversed(new_links):
new_depth = depths.get(new_url, depth + 1)
stack.append((new_url, new_parent, new_depth))
return results
async def _arun_stream(
@@ -83,8 +90,13 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
result.metadata["score"] = self.url_scorer.score(url)
yield result
new_links: List[Tuple[str, Optional[str]]] = []
await self.link_discovery(result, url, depth, visited, new_links, depths)
for new_url, new_parent in reversed(new_links):
new_depth = depths.get(new_url, depth + 1)
stack.append((new_url, new_parent, new_depth))
# Only count successful crawls toward max_pages limit
# and only discover links from successful crawls
if result.success:
self._pages_crawled += 1
new_links: List[Tuple[str, Optional[str]]] = []
await self.link_discovery(result, url, depth, visited, new_links, depths)
for new_url, new_parent in reversed(new_links):
new_depth = depths.get(new_url, depth + 1)
stack.append((new_url, new_parent, new_depth))