Add crash recovery for deep crawl strategies

Add optional resume_state and on_state_change parameters to all deep
crawl strategies (BFS, DFS, Best-First) for cloud deployment crash
recovery.

Features:
- resume_state: Pass saved state to resume from checkpoint
- on_state_change: Async callback fired after each URL for real-time
  state persistence to external storage (Redis, DB, etc.)
- export_state(): Get last captured state manually
- Zero overhead when features are disabled (None defaults)

State includes visited URLs, pending queue/stack, depths, and
pages_crawled count. All state is JSON-serializable.
This commit is contained in:
unclecode
2025-12-22 14:51:10 +00:00
parent 67e03d64b8
commit 31ebf37252
6 changed files with 1156 additions and 34 deletions

View File

@@ -2,7 +2,7 @@
import asyncio
import logging
from datetime import datetime
from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple
from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple, Any, Callable, Awaitable
from urllib.parse import urlparse
from ..models import TraversalStats
@@ -41,6 +41,9 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
include_external: bool = False,
max_pages: int = infinity,
logger: Optional[logging.Logger] = None,
# Optional resume/callback parameters for crash recovery
resume_state: Optional[Dict[str, Any]] = None,
on_state_change: Optional[Callable[[Dict[str, Any]], Awaitable[None]]] = None,
):
self.max_depth = max_depth
self.filter_chain = filter_chain
@@ -57,6 +60,12 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
self.stats = TraversalStats(start_time=datetime.now())
self._cancel_event = asyncio.Event()
self._pages_crawled = 0
# Store for use in arun methods
self._resume_state = resume_state
self._on_state_change = on_state_change
self._last_state: Optional[Dict[str, Any]] = None
# Shadow list for queue items (only used when on_state_change is set)
self._queue_shadow: Optional[List[Tuple[float, int, str, Optional[str]]]] = None
async def can_process_url(self, url: str, depth: int) -> bool:
"""
@@ -135,16 +144,36 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
) -> AsyncGenerator[CrawlResult, None]:
"""
Core best-first crawl method using a priority queue.
The queue items are tuples of (score, depth, url, parent_url). Lower scores
are treated as higher priority. URLs are processed in batches for efficiency.
"""
queue: asyncio.PriorityQueue = asyncio.PriorityQueue()
# Push the initial URL with score 0 and depth 0.
initial_score = self.url_scorer.score(start_url) if self.url_scorer else 0
await queue.put((-initial_score, 0, start_url, None))
visited: Set[str] = set()
depths: Dict[str, int] = {start_url: 0}
# Conditional state initialization for resume support
if self._resume_state:
visited = set(self._resume_state.get("visited", []))
depths = dict(self._resume_state.get("depths", {}))
self._pages_crawled = self._resume_state.get("pages_crawled", 0)
# Restore queue from saved items
queue_items = self._resume_state.get("queue_items", [])
for item in queue_items:
await queue.put((item["score"], item["depth"], item["url"], item["parent_url"]))
# Initialize shadow list if callback is set
if self._on_state_change:
self._queue_shadow = [
(item["score"], item["depth"], item["url"], item["parent_url"])
for item in queue_items
]
else:
# Original initialization
initial_score = self.url_scorer.score(start_url) if self.url_scorer else 0
await queue.put((-initial_score, 0, start_url, None))
visited: Set[str] = set()
depths: Dict[str, int] = {start_url: 0}
# Initialize shadow list if callback is set
if self._on_state_change:
self._queue_shadow = [(-initial_score, 0, start_url, None)]
while not queue.empty() and not self._cancel_event.is_set():
# Stop if we've reached the max pages limit
@@ -166,6 +195,12 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
if queue.empty():
break
item = await queue.get()
# Remove from shadow list if tracking
if self._on_state_change and self._queue_shadow is not None:
try:
self._queue_shadow.remove(item)
except ValueError:
pass # Item may have been removed already
score, depth, url, parent_url = item
if url in visited:
continue
@@ -210,7 +245,26 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
for new_url, new_parent in new_links:
new_depth = depths.get(new_url, depth + 1)
new_score = self.url_scorer.score(new_url) if self.url_scorer else 0
await queue.put((-new_score, new_depth, new_url, new_parent))
queue_item = (-new_score, new_depth, new_url, new_parent)
await queue.put(queue_item)
# Add to shadow list if tracking
if self._on_state_change and self._queue_shadow is not None:
self._queue_shadow.append(queue_item)
# Capture state after EACH URL processed (if callback set)
if self._on_state_change and self._queue_shadow is not None:
state = {
"strategy_type": "best_first",
"visited": list(visited),
"queue_items": [
{"score": s, "depth": d, "url": u, "parent_url": p}
for s, d, u, p in self._queue_shadow
],
"depths": depths,
"pages_crawled": self._pages_crawled,
}
self._last_state = state
await self._on_state_change(state)
# End of crawl.
@@ -269,3 +323,15 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
"""
self._cancel_event.set()
self.stats.end_time = datetime.now()
def export_state(self) -> Optional[Dict[str, Any]]:
    """
    Export current crawl state for external persistence.

    Note: This returns the last captured state. For real-time state,
    use the on_state_change callback.

    Returns:
        Dict with strategy state, or None if no state captured yet
        (i.e. no on_state_change callback was set, or no URL has been
        processed since the crawl started).
    """
    # _last_state is refreshed after every processed URL when the
    # on_state_change callback is configured; otherwise it stays None.
    return self._last_state

View File

@@ -2,7 +2,7 @@
import asyncio
import logging
from datetime import datetime
from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple
from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple, Any, Callable, Awaitable
from urllib.parse import urlparse
from ..models import TraversalStats
@@ -26,11 +26,14 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
self,
max_depth: int,
filter_chain: FilterChain = FilterChain(),
url_scorer: Optional[URLScorer] = None,
url_scorer: Optional[URLScorer] = None,
include_external: bool = False,
score_threshold: float = -infinity,
max_pages: int = infinity,
logger: Optional[logging.Logger] = None,
# Optional resume/callback parameters for crash recovery
resume_state: Optional[Dict[str, Any]] = None,
on_state_change: Optional[Callable[[Dict[str, Any]], Awaitable[None]]] = None,
):
self.max_depth = max_depth
self.filter_chain = filter_chain
@@ -48,6 +51,10 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
self.stats = TraversalStats(start_time=datetime.now())
self._cancel_event = asyncio.Event()
self._pages_crawled = 0
# Store for use in arun methods
self._resume_state = resume_state
self._on_state_change = on_state_change
self._last_state: Optional[Dict[str, Any]] = None
async def can_process_url(self, url: str, depth: int) -> bool:
"""
@@ -155,10 +162,21 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
Batch (non-streaming) mode:
Processes one BFS level at a time, then yields all the results.
"""
visited: Set[str] = set()
# current_level holds tuples: (url, parent_url)
current_level: List[Tuple[str, Optional[str]]] = [(start_url, None)]
depths: Dict[str, int] = {start_url: 0}
# Conditional state initialization for resume support
if self._resume_state:
visited = set(self._resume_state.get("visited", []))
current_level = [
(item["url"], item["parent_url"])
for item in self._resume_state.get("pending", [])
]
depths = dict(self._resume_state.get("depths", {}))
self._pages_crawled = self._resume_state.get("pages_crawled", 0)
else:
# Original initialization
visited: Set[str] = set()
# current_level holds tuples: (url, parent_url)
current_level: List[Tuple[str, Optional[str]]] = [(start_url, None)]
depths: Dict[str, int] = {start_url: 0}
results: List[CrawlResult] = []
@@ -174,11 +192,7 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
# Clone the config to disable deep crawling recursion and enforce batch mode.
batch_config = config.clone(deep_crawl_strategy=None, stream=False)
batch_results = await crawler.arun_many(urls=urls, config=batch_config)
# Update pages crawled counter - count only successful crawls
successful_results = [r for r in batch_results if r.success]
self._pages_crawled += len(successful_results)
for result in batch_results:
url = result.url
depth = depths.get(url, 0)
@@ -187,12 +201,27 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
parent_url = next((parent for (u, parent) in current_level if u == url), None)
result.metadata["parent_url"] = parent_url
results.append(result)
# Only discover links from successful crawls
if result.success:
# Increment pages crawled per URL for accurate state tracking
self._pages_crawled += 1
# Link discovery will handle the max pages limit internally
await self.link_discovery(result, url, depth, visited, next_level, depths)
# Capture state after EACH URL processed (if callback set)
if self._on_state_change:
state = {
"strategy_type": "bfs",
"visited": list(visited),
"pending": [{"url": u, "parent_url": p} for u, p in next_level],
"depths": depths,
"pages_crawled": self._pages_crawled,
}
self._last_state = state
await self._on_state_change(state)
current_level = next_level
return results
@@ -207,9 +236,20 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
Streaming mode:
Processes one BFS level at a time and yields results immediately as they arrive.
"""
visited: Set[str] = set()
current_level: List[Tuple[str, Optional[str]]] = [(start_url, None)]
depths: Dict[str, int] = {start_url: 0}
# Conditional state initialization for resume support
if self._resume_state:
visited = set(self._resume_state.get("visited", []))
current_level = [
(item["url"], item["parent_url"])
for item in self._resume_state.get("pending", [])
]
depths = dict(self._resume_state.get("depths", {}))
self._pages_crawled = self._resume_state.get("pages_crawled", 0)
else:
# Original initialization
visited: Set[str] = set()
current_level: List[Tuple[str, Optional[str]]] = [(start_url, None)]
depths: Dict[str, int] = {start_url: 0}
while current_level and not self._cancel_event.is_set():
next_level: List[Tuple[str, Optional[str]]] = []
@@ -244,7 +284,19 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
if result.success:
# Link discovery will handle the max pages limit internally
await self.link_discovery(result, url, depth, visited, next_level, depths)
# Capture state after EACH URL processed (if callback set)
if self._on_state_change:
state = {
"strategy_type": "bfs",
"visited": list(visited),
"pending": [{"url": u, "parent_url": p} for u, p in next_level],
"depths": depths,
"pages_crawled": self._pages_crawled,
}
self._last_state = state
await self._on_state_change(state)
# If we didn't get results back (e.g. due to errors), avoid getting stuck in an infinite loop
# by considering these URLs as visited but not counting them toward the max_pages limit
if results_count == 0 and urls:
@@ -258,3 +310,15 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
"""
self._cancel_event.set()
self.stats.end_time = datetime.now()
def export_state(self) -> Optional[Dict[str, Any]]:
    """
    Export current crawl state for external persistence.

    Note: This returns the last captured state. For real-time state,
    use the on_state_change callback.

    Returns:
        Dict with strategy state, or None if no state captured yet
        (no callback configured or no URL processed so far).
    """
    # Updated once per processed URL by the arun methods when the
    # on_state_change callback is set; remains None otherwise.
    return self._last_state

View File

@@ -38,12 +38,25 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
in control of traversal. Every successful page bumps ``_pages_crawled`` and
seeds new stack items discovered via :meth:`link_discovery`.
"""
visited: Set[str] = set()
# Stack items: (url, parent_url, depth)
stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
depths: Dict[str, int] = {start_url: 0}
results: List[CrawlResult] = []
self._reset_seen(start_url)
# Conditional state initialization for resume support
if self._resume_state:
visited = set(self._resume_state.get("visited", []))
stack = [
(item["url"], item["parent_url"], item["depth"])
for item in self._resume_state.get("stack", [])
]
depths = dict(self._resume_state.get("depths", {}))
self._pages_crawled = self._resume_state.get("pages_crawled", 0)
self._dfs_seen = set(self._resume_state.get("dfs_seen", []))
results: List[CrawlResult] = []
else:
# Original initialization
visited: Set[str] = set()
# Stack items: (url, parent_url, depth)
stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
depths: Dict[str, int] = {start_url: 0}
results: List[CrawlResult] = []
self._reset_seen(start_url)
while stack and not self._cancel_event.is_set():
url, parent, depth = stack.pop()
@@ -79,6 +92,22 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
for new_url, new_parent in reversed(new_links):
new_depth = depths.get(new_url, depth + 1)
stack.append((new_url, new_parent, new_depth))
# Capture state after each URL processed (if callback set)
if self._on_state_change:
state = {
"strategy_type": "dfs",
"visited": list(visited),
"stack": [
{"url": u, "parent_url": p, "depth": d}
for u, p, d in stack
],
"depths": depths,
"pages_crawled": self._pages_crawled,
"dfs_seen": list(self._dfs_seen),
}
self._last_state = state
await self._on_state_change(state)
return results
async def _arun_stream(
@@ -94,10 +123,22 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
yielded before we even look at the next stack entry. Successful crawls
still feed :meth:`link_discovery`, keeping DFS order intact.
"""
visited: Set[str] = set()
stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
depths: Dict[str, int] = {start_url: 0}
self._reset_seen(start_url)
# Conditional state initialization for resume support
if self._resume_state:
visited = set(self._resume_state.get("visited", []))
stack = [
(item["url"], item["parent_url"], item["depth"])
for item in self._resume_state.get("stack", [])
]
depths = dict(self._resume_state.get("depths", {}))
self._pages_crawled = self._resume_state.get("pages_crawled", 0)
self._dfs_seen = set(self._resume_state.get("dfs_seen", []))
else:
# Original initialization
visited: Set[str] = set()
stack: List[Tuple[str, Optional[str], int]] = [(start_url, None, 0)]
depths: Dict[str, int] = {start_url: 0}
self._reset_seen(start_url)
while stack and not self._cancel_event.is_set():
url, parent, depth = stack.pop()
@@ -130,6 +171,22 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
new_depth = depths.get(new_url, depth + 1)
stack.append((new_url, new_parent, new_depth))
# Capture state after each URL processed (if callback set)
if self._on_state_change:
state = {
"strategy_type": "dfs",
"visited": list(visited),
"stack": [
{"url": u, "parent_url": p, "depth": d}
for u, p, d in stack
],
"depths": depths,
"pages_crawled": self._pages_crawled,
"dfs_seen": list(self._dfs_seen),
}
self._last_state = state
await self._on_state_change(state)
async def link_discovery(
self,
result: CrawlResult,

View File

View File

@@ -0,0 +1,773 @@
"""
Test Suite: Deep Crawl Resume/Crash Recovery Tests
Tests that verify:
1. State export produces valid JSON-serializable data
2. Resume from checkpoint continues without duplicates
3. Simulated crash at various points recovers correctly
4. State callback fires at expected intervals
5. No damage to existing system behavior (regression tests)
"""
import pytest
import asyncio
import json
from typing import Dict, Any, List
from unittest.mock import AsyncMock, MagicMock
from crawl4ai.deep_crawling import (
BFSDeepCrawlStrategy,
DFSDeepCrawlStrategy,
BestFirstCrawlingStrategy,
FilterChain,
URLPatternFilter,
DomainFilter,
)
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
# ============================================================================
# Helper Functions for Mock Crawler
# ============================================================================
def create_mock_config(stream=False):
    """Build a stand-in CrawlerRunConfig double.

    ``clone()`` returns the very same object, so strategies that clone
    the config before dispatching still observe the ``stream`` flag set
    here.
    """
    cfg = MagicMock()
    cfg.stream = stream
    cfg.clone = MagicMock(return_value=cfg)
    return cfg
def create_mock_crawler_with_links(num_links: int = 3, include_keyword: bool = False):
    """Mock crawler whose every result exposes ``num_links`` child links.

    A closure counter makes each generated child URL unique across calls;
    when ``include_keyword`` is set the child URLs contain "important" so
    keyword-based scorers rank them highly. Honors ``config.stream`` by
    returning an async generator instead of a list.
    """
    counter = {"calls": 0}

    def _make_result(url):
        # One counter tick per crawled URL keeps link names unique.
        counter["calls"] += 1
        prefix = "important-child" if include_keyword else "child"
        res = MagicMock()
        res.url = url
        res.success = True
        res.metadata = {}
        res.links = {
            "internal": [
                {"href": f"{url}/{prefix}{counter['calls']}_{i}"}
                for i in range(num_links)
            ],
            "external": [],
        }
        return res

    async def mock_arun_many(urls, config):
        batch = [_make_result(u) for u in urls]
        if config.stream:
            # Streaming mode: yield results one by one.
            async def _stream():
                for item in batch:
                    yield item
            return _stream()
        return batch

    crawler = MagicMock()
    crawler.arun_many = mock_arun_many
    return crawler
def create_mock_crawler_tracking(crawl_order: List[str], return_no_links: bool = False):
    """Mock crawler that records every crawled URL into ``crawl_order``.

    Each result is successful; it carries a single ``<url>/child`` link
    unless ``return_no_links`` is set. Honors ``config.stream``.
    """
    async def mock_arun_many(urls, config):
        batch = []
        for u in urls:
            # Record visit order for later assertions.
            crawl_order.append(u)
            res = MagicMock()
            res.url = u
            res.success = True
            res.metadata = {}
            children = [] if return_no_links else [{"href": f"{u}/child"}]
            res.links = {"internal": children, "external": []}
            batch.append(res)
        if config.stream:
            async def _stream():
                for item in batch:
                    yield item
            return _stream()
        return batch

    crawler = MagicMock()
    crawler.arun_many = mock_arun_many
    return crawler
def create_simple_mock_crawler():
    """Basic mock crawler: every URL yields one successful result with
    exactly two child links (``<url>/child1`` and ``<url>/child2``).

    Honors ``config.stream`` by returning an async generator in streaming
    mode and a plain list otherwise.
    """
    # Fix: the original kept a `call_count` closure counter that was
    # incremented via `nonlocal` but never read anywhere — dead state,
    # removed.
    async def mock_arun_many(urls, config):
        results = []
        for url in urls:
            result = MagicMock()
            result.url = url
            result.success = True
            result.metadata = {}
            result.links = {
                "internal": [
                    {"href": f"{url}/child1"},
                    {"href": f"{url}/child2"},
                ],
                "external": []
            }
            results.append(result)
        if config.stream:
            async def gen():
                for r in results:
                    yield r
            return gen()
        return results

    crawler = MagicMock()
    crawler.arun_many = mock_arun_many
    return crawler
def create_mock_crawler_unlimited_links():
    """Mock crawler returning ten child links per URL.

    Used to exercise max_pages / max_depth limits: the link frontier
    never dries up on its own. Honors ``config.stream``.
    """
    async def mock_arun_many(urls, config):
        out = []
        for u in urls:
            res = MagicMock()
            res.url = u
            res.success = True
            res.metadata = {}
            res.links = {
                "internal": [{"href": f"{u}/link{i}"} for i in range(10)],
                "external": [],
            }
            out.append(res)
        if config.stream:
            async def _stream():
                for item in out:
                    yield item
            return _stream()
        return out

    crawler = MagicMock()
    crawler.arun_many = mock_arun_many
    return crawler
# ============================================================================
# TEST SUITE 1: Crash Recovery Tests
# ============================================================================
class TestBFSResume:
    """BFS strategy resume tests.

    Covers: JSON-serializability of exported state, resuming from a saved
    checkpoint, crash-and-resume round trips, per-URL callback cadence,
    and export_state() semantics.
    """

    @pytest.mark.asyncio
    async def test_state_export_json_serializable(self):
        """Verify exported state can be JSON serialized."""
        captured_states: List[Dict] = []

        async def capture_state(state: Dict[str, Any]):
            # Verify JSON serializable by round-tripping through json;
            # a non-serializable value raises inside the callback.
            json_str = json.dumps(state)
            parsed = json.loads(json_str)
            captured_states.append(parsed)

        strategy = BFSDeepCrawlStrategy(
            max_depth=2,
            max_pages=10,
            on_state_change=capture_state,
        )
        # Create mock crawler that returns predictable results
        mock_crawler = create_mock_crawler_with_links(num_links=3)
        mock_config = create_mock_config()
        # Return value unused here: the assertions run on captured states.
        results = await strategy._arun_batch("https://example.com", mock_crawler, mock_config)
        # Verify states were captured
        assert len(captured_states) > 0
        # Verify state structure
        for state in captured_states:
            assert state["strategy_type"] == "bfs"
            assert "visited" in state
            assert "pending" in state
            assert "depths" in state
            assert "pages_crawled" in state
            assert isinstance(state["visited"], list)
            assert isinstance(state["pending"], list)
            assert isinstance(state["depths"], dict)
            assert isinstance(state["pages_crawled"], int)

    @pytest.mark.asyncio
    async def test_resume_continues_from_checkpoint(self):
        """Verify resume starts from saved state, not beginning."""
        # Simulate state from previous crawl (visited 5 URLs, 3 pending)
        saved_state = {
            "strategy_type": "bfs",
            "visited": [
                "https://example.com",
                "https://example.com/page1",
                "https://example.com/page2",
                "https://example.com/page3",
                "https://example.com/page4",
            ],
            "pending": [
                {"url": "https://example.com/page5", "parent_url": "https://example.com/page2"},
                {"url": "https://example.com/page6", "parent_url": "https://example.com/page3"},
                {"url": "https://example.com/page7", "parent_url": "https://example.com/page3"},
            ],
            "depths": {
                "https://example.com": 0,
                "https://example.com/page1": 1,
                "https://example.com/page2": 1,
                "https://example.com/page3": 1,
                "https://example.com/page4": 1,
                "https://example.com/page5": 2,
                "https://example.com/page6": 2,
                "https://example.com/page7": 2,
            },
            "pages_crawled": 5,
        }
        crawled_urls: List[str] = []
        strategy = BFSDeepCrawlStrategy(
            max_depth=2,
            max_pages=20,
            resume_state=saved_state,
        )
        # Verify internal state was restored
        assert strategy._resume_state == saved_state
        mock_crawler = create_mock_crawler_tracking(crawled_urls, return_no_links=True)
        mock_config = create_mock_config()
        await strategy._arun_batch("https://example.com", mock_crawler, mock_config)
        # Should NOT re-crawl already visited URLs
        for visited_url in saved_state["visited"]:
            assert visited_url not in crawled_urls, f"Re-crawled already visited: {visited_url}"
        # Should crawl pending URLs
        for pending in saved_state["pending"]:
            assert pending["url"] in crawled_urls, f"Did not crawl pending: {pending['url']}"

    @pytest.mark.asyncio
    async def test_simulated_crash_mid_crawl(self):
        """Simulate crash at URL N, verify resume continues from pending URLs."""
        crash_after = 3
        states_before_crash: List[Dict] = []

        async def capture_until_crash(state: Dict[str, Any]):
            # Record every state, then "crash" once enough pages crawled;
            # the last recorded state plays the role of the checkpoint.
            states_before_crash.append(state)
            if state["pages_crawled"] >= crash_after:
                raise Exception("Simulated crash!")

        strategy1 = BFSDeepCrawlStrategy(
            max_depth=2,
            max_pages=10,
            on_state_change=capture_until_crash,
        )
        mock_crawler = create_mock_crawler_with_links(num_links=5)
        mock_config = create_mock_config()
        # First crawl - crashes
        with pytest.raises(Exception, match="Simulated crash"):
            await strategy1._arun_batch("https://example.com", mock_crawler, mock_config)
        # Get last state before crash
        last_state = states_before_crash[-1]
        assert last_state["pages_crawled"] >= crash_after
        # Calculate which URLs were already crawled vs pending
        pending_urls = {item["url"] for item in last_state["pending"]}
        visited_urls = set(last_state["visited"])
        already_crawled_urls = visited_urls - pending_urls
        # Resume from checkpoint
        crawled_in_resume: List[str] = []
        strategy2 = BFSDeepCrawlStrategy(
            max_depth=2,
            max_pages=10,
            resume_state=last_state,
        )
        mock_crawler2 = create_mock_crawler_tracking(crawled_in_resume, return_no_links=True)
        await strategy2._arun_batch("https://example.com", mock_crawler2, mock_config)
        # Verify already-crawled URLs are not re-crawled
        for crawled_url in already_crawled_urls:
            assert crawled_url not in crawled_in_resume, f"Re-crawled already visited: {crawled_url}"
        # Verify pending URLs are crawled
        for pending_url in pending_urls:
            assert pending_url in crawled_in_resume, f"Did not crawl pending: {pending_url}"

    @pytest.mark.asyncio
    async def test_callback_fires_per_url(self):
        """Verify callback fires after each URL for maximum granularity."""
        callback_count = 0
        pages_crawled_sequence: List[int] = []

        async def count_callbacks(state: Dict[str, Any]):
            nonlocal callback_count
            callback_count += 1
            pages_crawled_sequence.append(state["pages_crawled"])

        strategy = BFSDeepCrawlStrategy(
            max_depth=1,
            max_pages=5,
            on_state_change=count_callbacks,
        )
        mock_crawler = create_mock_crawler_with_links(num_links=2)
        mock_config = create_mock_config()
        await strategy._arun_batch("https://example.com", mock_crawler, mock_config)
        # Callback should fire once per successful URL
        assert callback_count == strategy._pages_crawled, \
            f"Callback fired {callback_count} times, expected {strategy._pages_crawled} (per URL)"
        # pages_crawled should increment by 1 each callback
        for i, count in enumerate(pages_crawled_sequence):
            assert count == i + 1, f"Expected pages_crawled={i+1} at callback {i}, got {count}"

    @pytest.mark.asyncio
    async def test_export_state_returns_last_captured(self):
        """Verify export_state() returns last captured state."""
        last_state = None

        async def capture(state):
            nonlocal last_state
            last_state = state

        strategy = BFSDeepCrawlStrategy(max_depth=2, max_pages=5, on_state_change=capture)
        mock_crawler = create_mock_crawler_with_links(num_links=2)
        mock_config = create_mock_config()
        await strategy._arun_batch("https://example.com", mock_crawler, mock_config)
        exported = strategy.export_state()
        # export_state() must hand back exactly what the callback last saw.
        assert exported == last_state
class TestDFSResume:
    """DFS strategy resume tests."""

    @pytest.mark.asyncio
    async def test_state_export_includes_stack_and_dfs_seen(self):
        """Verify DFS state includes stack structure and _dfs_seen."""
        snapshots: List[Dict] = []

        async def record(state: Dict[str, Any]):
            snapshots.append(state)

        strategy = DFSDeepCrawlStrategy(
            max_depth=3,
            max_pages=10,
            on_state_change=record,
        )
        crawler = create_mock_crawler_with_links(num_links=2)
        config = create_mock_config()
        await strategy._arun_batch("https://example.com", crawler, config)

        assert len(snapshots) > 0
        for snap in snapshots:
            assert snap["strategy_type"] == "dfs"
            assert "stack" in snap
            assert "dfs_seen" in snap
            # Every stack entry must carry its own depth so it can be
            # restored faithfully on resume.
            for entry in snap["stack"]:
                assert "url" in entry
                assert "parent_url" in entry
                assert "depth" in entry

    @pytest.mark.asyncio
    async def test_resume_restores_stack_order(self):
        """Verify DFS stack order is preserved on resume."""
        saved_state = {
            "strategy_type": "dfs",
            "visited": ["https://example.com"],
            "stack": [
                {"url": "https://example.com/deep3", "parent_url": "https://example.com/deep2", "depth": 3},
                {"url": "https://example.com/deep2", "parent_url": "https://example.com/deep1", "depth": 2},
                {"url": "https://example.com/page1", "parent_url": "https://example.com", "depth": 1},
            ],
            "depths": {"https://example.com": 0},
            "pages_crawled": 1,
            "dfs_seen": ["https://example.com", "https://example.com/deep3", "https://example.com/deep2", "https://example.com/page1"],
        }
        visit_log: List[str] = []
        strategy = DFSDeepCrawlStrategy(
            max_depth=3,
            max_pages=10,
            resume_state=saved_state,
        )
        crawler = create_mock_crawler_tracking(visit_log, return_no_links=True)
        config = create_mock_config()
        await strategy._arun_batch("https://example.com", crawler, config)
        # DFS pops from the end of the stack: page1 first, then deep2, deep3.
        assert visit_log[:3] == [
            "https://example.com/page1",
            "https://example.com/deep2",
            "https://example.com/deep3",
        ]
class TestBestFirstResume:
    """Best-First strategy resume tests."""

    @pytest.mark.asyncio
    async def test_state_export_includes_scored_queue(self):
        """Verify Best-First state includes queue with scores."""
        snapshots: List[Dict] = []

        async def record(state: Dict[str, Any]):
            snapshots.append(state)

        scorer = KeywordRelevanceScorer(keywords=["important"], weight=1.0)
        strategy = BestFirstCrawlingStrategy(
            max_depth=2,
            max_pages=10,
            url_scorer=scorer,
            on_state_change=record,
        )
        crawler = create_mock_crawler_with_links(num_links=3, include_keyword=True)
        config = create_mock_config(stream=True)
        # Drain the stream; only the captured states matter here.
        _ = [r async for r in strategy._arun_stream("https://example.com", crawler, config)]

        assert len(snapshots) > 0
        for snap in snapshots:
            assert snap["strategy_type"] == "best_first"
            assert "queue_items" in snap
            # Queue entries must be fully self-describing for resume.
            for entry in snap["queue_items"]:
                for key in ("score", "depth", "url", "parent_url"):
                    assert key in entry

    @pytest.mark.asyncio
    async def test_resume_maintains_priority_order(self):
        """Verify priority queue order is maintained on resume."""
        saved_state = {
            "strategy_type": "best_first",
            "visited": ["https://example.com"],
            "queue_items": [
                {"score": -0.9, "depth": 1, "url": "https://example.com/high-priority", "parent_url": "https://example.com"},
                {"score": -0.5, "depth": 1, "url": "https://example.com/medium-priority", "parent_url": "https://example.com"},
                {"score": -0.1, "depth": 1, "url": "https://example.com/low-priority", "parent_url": "https://example.com"},
            ],
            "depths": {"https://example.com": 0},
            "pages_crawled": 1,
        }
        visit_log: List[str] = []
        strategy = BestFirstCrawlingStrategy(
            max_depth=2,
            max_pages=10,
            resume_state=saved_state,
        )
        crawler = create_mock_crawler_tracking(visit_log, return_no_links=True)
        config = create_mock_config(stream=True)
        async for _ in strategy._arun_stream("https://example.com", crawler, config):
            pass
        # Scores are stored negated in a min-heap, so -0.9 pops first.
        assert visit_log[0] == "https://example.com/high-priority"
class TestCrossStrategyResume:
    """Tests that apply to all strategies."""

    @pytest.mark.asyncio
    @pytest.mark.parametrize("strategy_class,strategy_type", [
        (BFSDeepCrawlStrategy, "bfs"),
        (DFSDeepCrawlStrategy, "dfs"),
        (BestFirstCrawlingStrategy, "best_first"),
    ])
    async def test_no_callback_means_no_overhead(self, strategy_class, strategy_type):
        """Verify no state tracking when callback is None."""
        strategy = strategy_class(max_depth=2, max_pages=5)
        if strategy_class is BestFirstCrawlingStrategy:
            # Best-First allocates its shadow queue only when a callback
            # is configured.
            assert strategy._queue_shadow is None
        # No callback means no state has ever been captured.
        assert strategy._last_state is None

    @pytest.mark.asyncio
    @pytest.mark.parametrize("strategy_class", [
        BFSDeepCrawlStrategy,
        DFSDeepCrawlStrategy,
        BestFirstCrawlingStrategy,
    ])
    async def test_export_state_returns_last_captured(self, strategy_class):
        """Verify export_state() returns last captured state."""
        seen: Dict[str, Any] = {}

        async def remember(state):
            seen["last"] = state

        strategy = strategy_class(max_depth=2, max_pages=5, on_state_change=remember)
        crawler = create_mock_crawler_with_links(num_links=2)
        if strategy_class is BestFirstCrawlingStrategy:
            # Best-First state capture is exercised via streaming mode.
            config = create_mock_config(stream=True)
            async for _ in strategy._arun_stream("https://example.com", crawler, config):
                pass
        else:
            config = create_mock_config()
            await strategy._arun_batch("https://example.com", crawler, config)
        assert strategy.export_state() == seen.get("last")
# ============================================================================
# TEST SUITE 2: Regression Tests (No Damage to Current System)
# ============================================================================
class TestBFSRegressions:
"""Ensure BFS works identically when new params not used."""
@pytest.mark.asyncio
async def test_default_params_unchanged(self):
    """Constructor with only original params works.

    The new resume/callback attributes must default to None so the
    pre-existing constructor surface is unchanged.
    """
    strategy = BFSDeepCrawlStrategy(
        max_depth=2,
        include_external=False,
        max_pages=10,
    )
    assert strategy.max_depth == 2
    # Fix: `== False` → `is False` (PEP 8 E712) — identity check is the
    # idiomatic boolean assertion and rejects falsy non-bool values.
    assert strategy.include_external is False
    assert strategy.max_pages == 10
    assert strategy._resume_state is None
    assert strategy._on_state_change is None
@pytest.mark.asyncio
async def test_filter_chain_still_works(self):
    """FilterChain integration unchanged."""
    filter_chain = FilterChain([
        URLPatternFilter(patterns=["*/blog/*"]),
        DomainFilter(allowed_domains=["example.com"]),
    ])
    strategy = BFSDeepCrawlStrategy(
        max_depth=2,
        filter_chain=filter_chain,
    )
    # Fix: replaced `== True` / `== False` comparisons (PEP 8 E712) with
    # plain truthiness asserts — the behavior under test is "allowed" vs
    # "rejected", not equality with a specific bool object.
    assert await strategy.can_process_url("https://example.com/blog/post1", 1)
    assert not await strategy.can_process_url("https://other.com/blog/post1", 1)
@pytest.mark.asyncio
async def test_url_scorer_still_works(self):
    """URL scoring integration unchanged."""
    relevance = KeywordRelevanceScorer(keywords=["python", "tutorial"], weight=1.0)
    strategy = BFSDeepCrawlStrategy(
        max_depth=2,
        url_scorer=relevance,
        score_threshold=0.5,
    )
    assert strategy.url_scorer is not None
    assert strategy.score_threshold == 0.5
    # A URL containing both keywords must score positively.
    assert relevance.score("https://example.com/python-tutorial") > 0
@pytest.mark.asyncio
async def test_batch_mode_returns_list(self):
"""Batch mode still returns List[CrawlResult]."""
strategy = BFSDeepCrawlStrategy(max_depth=1, max_pages=5)
mock_crawler = create_simple_mock_crawler()
mock_config = create_mock_config(stream=False)
results = await strategy._arun_batch("https://example.com", mock_crawler, mock_config)
assert isinstance(results, list)
assert len(results) > 0
@pytest.mark.asyncio
async def test_max_pages_limit_respected(self):
"""max_pages limit still enforced."""
strategy = BFSDeepCrawlStrategy(max_depth=10, max_pages=3)
mock_crawler = create_mock_crawler_unlimited_links()
mock_config = create_mock_config()
results = await strategy._arun_batch("https://example.com", mock_crawler, mock_config)
# Should stop at max_pages
assert strategy._pages_crawled <= 3
@pytest.mark.asyncio
async def test_max_depth_limit_respected(self):
"""max_depth limit still enforced."""
strategy = BFSDeepCrawlStrategy(max_depth=2, max_pages=100)
mock_crawler = create_mock_crawler_unlimited_links()
mock_config = create_mock_config()
results = await strategy._arun_batch("https://example.com", mock_crawler, mock_config)
# All results should have depth <= max_depth
for result in results:
assert result.metadata.get("depth", 0) <= 2
@pytest.mark.asyncio
async def test_metadata_depth_still_set(self):
"""Result metadata still includes depth."""
strategy = BFSDeepCrawlStrategy(max_depth=2, max_pages=5)
mock_crawler = create_simple_mock_crawler()
mock_config = create_mock_config()
results = await strategy._arun_batch("https://example.com", mock_crawler, mock_config)
for result in results:
assert "depth" in result.metadata
assert isinstance(result.metadata["depth"], int)
@pytest.mark.asyncio
async def test_metadata_parent_url_still_set(self):
"""Result metadata still includes parent_url."""
strategy = BFSDeepCrawlStrategy(max_depth=2, max_pages=5)
mock_crawler = create_simple_mock_crawler()
mock_config = create_mock_config()
results = await strategy._arun_batch("https://example.com", mock_crawler, mock_config)
# First result (start URL) should have parent_url = None
assert results[0].metadata.get("parent_url") is None
# Child results should have parent_url set
for result in results[1:]:
assert "parent_url" in result.metadata
class TestDFSRegressions:
    """Ensure DFS works identically when the new crash-recovery params are not used."""

    @pytest.mark.asyncio
    async def test_inherits_bfs_params(self):
        """DFS still inherits all BFS parameters."""
        strategy = DFSDeepCrawlStrategy(
            max_depth=3,
            include_external=True,
            max_pages=20,
            score_threshold=0.5,
        )
        assert strategy.max_depth == 3
        assert strategy.include_external is True
        assert strategy.max_pages == 20
        assert strategy.score_threshold == 0.5

    @pytest.mark.asyncio
    async def test_dfs_seen_initialized(self):
        """DFS _dfs_seen set still initialized."""
        strategy = DFSDeepCrawlStrategy(max_depth=2)
        # The DFS-specific dedup set must still exist and be a set instance.
        assert hasattr(strategy, '_dfs_seen')
        assert isinstance(strategy._dfs_seen, set)
class TestBestFirstRegressions:
    """Ensure Best-First works identically when the new crash-recovery params are not used."""

    @pytest.mark.asyncio
    async def test_default_params_unchanged(self):
        """Constructor with only original params works."""
        strategy = BestFirstCrawlingStrategy(
            max_depth=2,
            include_external=False,
            max_pages=10,
        )
        assert strategy.max_depth == 2
        assert strategy.include_external is False
        assert strategy.max_pages == 10
        # New optional attributes default to None when the feature is unused.
        assert strategy._resume_state is None
        assert strategy._on_state_change is None
        assert strategy._queue_shadow is None  # Not initialized without callback

    @pytest.mark.asyncio
    async def test_scorer_integration(self):
        """URL scorer still affects crawl priority."""
        scorer = KeywordRelevanceScorer(keywords=["important"], weight=1.0)
        strategy = BestFirstCrawlingStrategy(
            max_depth=2,
            max_pages=10,
            url_scorer=scorer,
        )
        # Identity check: the exact scorer instance must be stored.
        assert strategy.url_scorer is scorer
class TestAPICompatibility:
    """Ensure API/serialization compatibility."""

    def test_strategy_signature_backward_compatible(self):
        """Old code calling with positional/keyword args still works."""
        # Legacy positional style
        positional = BFSDeepCrawlStrategy(2)
        assert positional.max_depth == 2
        # Legacy keyword style
        keyword = BFSDeepCrawlStrategy(max_depth=3, max_pages=10)
        assert keyword.max_depth == 3
        # Legacy mixed style: every original parameter supplied positionally
        mixed = BFSDeepCrawlStrategy(2, FilterChain(), None, False, float('-inf'), 100)
        assert mixed.max_depth == 2
        assert mixed.max_pages == 100

    def test_no_required_new_params(self):
        """New params are optional, not required."""
        # Constructing each strategy without the new params must not raise.
        for strategy_cls in (
            BFSDeepCrawlStrategy,
            DFSDeepCrawlStrategy,
            BestFirstCrawlingStrategy,
        ):
            strategy_cls(max_depth=2)

View File

@@ -0,0 +1,162 @@
"""
Integration Test: Deep Crawl Resume with Real URLs
Tests the crash recovery feature using books.toscrape.com - a site
designed for scraping practice with a clear hierarchy:
- Home page → Category pages → Book detail pages
"""
import pytest
import asyncio
import json
from typing import Dict, Any, List
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
class TestBFSResumeIntegration:
"""Integration tests for BFS resume with real crawling."""
@pytest.mark.asyncio
async def test_real_crawl_state_capture_and_resume(self):
"""
Test crash recovery with real URLs from books.toscrape.com.
Flow:
1. Start crawl with state callback
2. Stop after N pages (simulated crash)
3. Resume from saved state
4. Verify no duplicate crawls
"""
# Phase 1: Initial crawl that "crashes" after 3 pages
crash_after = 3
captured_states: List[Dict[str, Any]] = []
crawled_urls_phase1: List[str] = []
async def capture_state_until_crash(state: Dict[str, Any]):
captured_states.append(state)
crawled_urls_phase1.clear()
crawled_urls_phase1.extend(state["visited"])
if state["pages_crawled"] >= crash_after:
raise Exception("Simulated crash!")
strategy1 = BFSDeepCrawlStrategy(
max_depth=2,
max_pages=10,
on_state_change=capture_state_until_crash,
)
config = CrawlerRunConfig(
deep_crawl_strategy=strategy1,
stream=False,
verbose=False,
)
async with AsyncWebCrawler(verbose=False) as crawler:
# First crawl - will crash after 3 pages
with pytest.raises(Exception, match="Simulated crash"):
await crawler.arun("https://books.toscrape.com", config=config)
# Verify we captured state before crash
assert len(captured_states) > 0, "No states captured before crash"
last_state = captured_states[-1]
print(f"\n=== Phase 1: Crashed after {last_state['pages_crawled']} pages ===")
print(f"Visited URLs: {len(last_state['visited'])}")
print(f"Pending URLs: {len(last_state['pending'])}")
# Verify state structure
assert last_state["strategy_type"] == "bfs"
assert last_state["pages_crawled"] >= crash_after
assert len(last_state["visited"]) > 0
assert "pending" in last_state
assert "depths" in last_state
# Verify state is JSON serializable (important for Redis/DB storage)
json_str = json.dumps(last_state)
restored_state = json.loads(json_str)
assert restored_state == last_state, "State not JSON round-trip safe"
# Phase 2: Resume from checkpoint
crawled_urls_phase2: List[str] = []
async def track_resumed_crawl(state: Dict[str, Any]):
# Track what's being crawled in phase 2
new_visited = set(state["visited"]) - set(last_state["visited"])
for url in new_visited:
if url not in crawled_urls_phase2:
crawled_urls_phase2.append(url)
strategy2 = BFSDeepCrawlStrategy(
max_depth=2,
max_pages=10,
resume_state=restored_state,
on_state_change=track_resumed_crawl,
)
config2 = CrawlerRunConfig(
deep_crawl_strategy=strategy2,
stream=False,
verbose=False,
)
async with AsyncWebCrawler(verbose=False) as crawler:
results = await crawler.arun("https://books.toscrape.com", config=config2)
print(f"\n=== Phase 2: Resumed crawl ===")
print(f"New URLs crawled: {len(crawled_urls_phase2)}")
print(f"Final pages_crawled: {strategy2._pages_crawled}")
# Verify no duplicates - URLs from phase 1 should not be re-crawled
already_crawled = set(last_state["visited"]) - {item["url"] for item in last_state["pending"]}
duplicates = set(crawled_urls_phase2) & already_crawled
assert len(duplicates) == 0, f"Duplicate crawls detected: {duplicates}"
# Verify we made progress (crawled some of the pending URLs)
pending_urls = {item["url"] for item in last_state["pending"]}
crawled_pending = set(crawled_urls_phase2) & pending_urls
print(f"Pending URLs crawled in phase 2: {len(crawled_pending)}")
# Final state should show more pages crawled than before crash
final_state = strategy2.export_state()
if final_state:
assert final_state["pages_crawled"] >= last_state["pages_crawled"], \
"Resume did not make progress"
print("\n=== Integration test PASSED ===")
    @pytest.mark.asyncio
    async def test_state_export_method(self):
        """Test that export_state() returns valid state during crawl."""
        states_from_callback: List[Dict] = []

        async def capture(state):
            # Record every snapshot the strategy emits via on_state_change.
            states_from_callback.append(state)

        strategy = BFSDeepCrawlStrategy(
            max_depth=1,
            max_pages=3,
            on_state_change=capture,
        )
        config = CrawlerRunConfig(
            deep_crawl_strategy=strategy,
            stream=False,
            verbose=False,
        )
        # NOTE: integration test — performs real network requests to books.toscrape.com.
        async with AsyncWebCrawler(verbose=False) as crawler:
            await crawler.arun("https://books.toscrape.com", config=config)

        # export_state should return the last captured state
        exported = strategy.export_state()
        assert exported is not None, "export_state() returned None"
        assert exported == states_from_callback[-1], "export_state() doesn't match last callback"
        print(f"\n=== export_state() test PASSED ===")
        print(f"Final state: {exported['pages_crawled']} pages, {len(exported['visited'])} visited")