Refactor adaptive crawling state management

- Renamed `CrawlState` to `AdaptiveCrawlResult` to better reflect its purpose.
- Updated all references to `CrawlState` in the codebase, including method signatures and documentation.
- Modified the `AdaptiveCrawler` class to initialize and manage the new `AdaptiveCrawlResult` state.
- Adjusted example strategies and documentation to align with the new state class.
- Ensured all tests are updated to use `AdaptiveCrawlResult` instead of `CrawlState`.
This commit is contained in:
UncleCode
2025-07-24 20:11:43 +08:00
parent d1de82a332
commit 843457a9cb
12 changed files with 51 additions and 1898 deletions

View File

@@ -9,7 +9,7 @@ import asyncio
import re
from typing import List, Dict, Set
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
from crawl4ai.adaptive_crawler import CrawlState, Link
from crawl4ai.adaptive_crawler import AdaptiveCrawlResult, Link
import math
@@ -45,7 +45,7 @@ class APIDocumentationStrategy:
r'/legal/'
]
def score_link(self, link: Link, query: str, state: CrawlState) -> float:
def score_link(self, link: Link, query: str, state: AdaptiveCrawlResult) -> float:
"""Custom link scoring for API documentation"""
score = 1.0
url = link.href.lower()
@@ -77,7 +77,7 @@ class APIDocumentationStrategy:
return score
def calculate_api_coverage(self, state: CrawlState, query: str) -> Dict[str, float]:
def calculate_api_coverage(self, state: AdaptiveCrawlResult, query: str) -> Dict[str, float]:
"""Calculate specialized coverage metrics for API documentation"""
metrics = {
'endpoint_coverage': 0.0,

View File

@@ -130,7 +130,7 @@ Factors:
```python
class CustomLinkScorer:
def score(self, link: Link, query: str, state: CrawlState) -> float:
def score(self, link: Link, query: str, state: AdaptiveCrawlResult) -> float:
# Prioritize specific URL patterns
if "/api/reference/" in link.href:
return 2.0 # Double the score
@@ -325,17 +325,17 @@ with open("crawl_analysis.json", "w") as f:
from crawl4ai.adaptive_crawler import BaseStrategy
class DomainSpecificStrategy(BaseStrategy):
def calculate_coverage(self, state: CrawlState) -> float:
def calculate_coverage(self, state: AdaptiveCrawlResult) -> float:
# Custom coverage calculation
# e.g., weight certain terms more heavily
pass
def calculate_consistency(self, state: CrawlState) -> float:
def calculate_consistency(self, state: AdaptiveCrawlResult) -> float:
# Custom consistency logic
# e.g., domain-specific validation
pass
def rank_links(self, links: List[Link], state: CrawlState) -> List[Link]:
def rank_links(self, links: List[Link], state: AdaptiveCrawlResult) -> List[Link]:
# Custom link ranking
# e.g., prioritize specific URL patterns
pass
@@ -359,7 +359,7 @@ class HybridStrategy(BaseStrategy):
URLPatternStrategy()
]
def calculate_confidence(self, state: CrawlState) -> float:
def calculate_confidence(self, state: AdaptiveCrawlResult) -> float:
# Weighted combination of strategies
scores = [s.calculate_confidence(state) for s in self.strategies]
weights = [0.5, 0.3, 0.2]

View File

@@ -27,7 +27,7 @@ async def digest(
start_url: str,
query: str,
resume_from: Optional[Union[str, Path]] = None
) -> CrawlState
) -> AdaptiveCrawlResult
```
#### Parameters
@@ -38,7 +38,7 @@ async def digest(
#### Returns
- **CrawlState**: The final crawl state containing all crawled URLs, knowledge base, and metrics
- **AdaptiveCrawlResult**: The final crawl state containing all crawled URLs, knowledge base, and metrics
#### Example
@@ -92,7 +92,7 @@ Access to the current crawl state.
```python
@property
def state(self) -> CrawlState
def state(self) -> AdaptiveCrawlResult
```
## Methods

View File

@@ -9,7 +9,7 @@ async def digest(
start_url: str,
query: str,
resume_from: Optional[Union[str, Path]] = None
) -> CrawlState
) -> AdaptiveCrawlResult
```
## Parameters
@@ -31,7 +31,7 @@ async def digest(
## Return Value
Returns a `CrawlState` object containing:
Returns an `AdaptiveCrawlResult` object containing:
- **crawled_urls** (`Set[str]`): All URLs that have been crawled
- **knowledge_base** (`List[CrawlResult]`): Collection of crawled pages with content