Refactor adaptive crawling state management
- Renamed `CrawlState` to `AdaptiveCrawlResult` to better reflect its purpose. - Updated all references to `CrawlState` in the codebase, including method signatures and documentation. - Modified the `AdaptiveCrawler` class to initialize and manage the new `AdaptiveCrawlResult` state. - Adjusted example strategies and documentation to align with the new state class. - Ensured all tests are updated to use `AdaptiveCrawlResult` instead of `CrawlState`.
This commit is contained in:
@@ -9,7 +9,7 @@ import asyncio
|
||||
import re
|
||||
from typing import List, Dict, Set
|
||||
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
|
||||
from crawl4ai.adaptive_crawler import CrawlState, Link
|
||||
from crawl4ai.adaptive_crawler import AdaptiveCrawlResult, Link
|
||||
import math
|
||||
|
||||
|
||||
@@ -45,7 +45,7 @@ class APIDocumentationStrategy:
|
||||
r'/legal/'
|
||||
]
|
||||
|
||||
def score_link(self, link: Link, query: str, state: CrawlState) -> float:
|
||||
def score_link(self, link: Link, query: str, state: AdaptiveCrawlResult) -> float:
|
||||
"""Custom link scoring for API documentation"""
|
||||
score = 1.0
|
||||
url = link.href.lower()
|
||||
@@ -77,7 +77,7 @@ class APIDocumentationStrategy:
|
||||
|
||||
return score
|
||||
|
||||
def calculate_api_coverage(self, state: CrawlState, query: str) -> Dict[str, float]:
|
||||
def calculate_api_coverage(self, state: AdaptiveCrawlResult, query: str) -> Dict[str, float]:
|
||||
"""Calculate specialized coverage metrics for API documentation"""
|
||||
metrics = {
|
||||
'endpoint_coverage': 0.0,
|
||||
|
||||
@@ -130,7 +130,7 @@ Factors:
|
||||
|
||||
```python
|
||||
class CustomLinkScorer:
|
||||
def score(self, link: Link, query: str, state: CrawlState) -> float:
|
||||
def score(self, link: Link, query: str, state: AdaptiveCrawlResult) -> float:
|
||||
# Prioritize specific URL patterns
|
||||
if "/api/reference/" in link.href:
|
||||
return 2.0 # Double the score
|
||||
@@ -325,17 +325,17 @@ with open("crawl_analysis.json", "w") as f:
|
||||
from crawl4ai.adaptive_crawler import BaseStrategy
|
||||
|
||||
class DomainSpecificStrategy(BaseStrategy):
|
||||
def calculate_coverage(self, state: CrawlState) -> float:
|
||||
def calculate_coverage(self, state: AdaptiveCrawlResult) -> float:
|
||||
# Custom coverage calculation
|
||||
# e.g., weight certain terms more heavily
|
||||
pass
|
||||
|
||||
def calculate_consistency(self, state: CrawlState) -> float:
|
||||
def calculate_consistency(self, state: AdaptiveCrawlResult) -> float:
|
||||
# Custom consistency logic
|
||||
# e.g., domain-specific validation
|
||||
pass
|
||||
|
||||
def rank_links(self, links: List[Link], state: CrawlState) -> List[Link]:
|
||||
def rank_links(self, links: List[Link], state: AdaptiveCrawlResult) -> List[Link]:
|
||||
# Custom link ranking
|
||||
# e.g., prioritize specific URL patterns
|
||||
pass
|
||||
@@ -359,7 +359,7 @@ class HybridStrategy(BaseStrategy):
|
||||
URLPatternStrategy()
|
||||
]
|
||||
|
||||
def calculate_confidence(self, state: CrawlState) -> float:
|
||||
def calculate_confidence(self, state: AdaptiveCrawlResult) -> float:
|
||||
# Weighted combination of strategies
|
||||
scores = [s.calculate_confidence(state) for s in self.strategies]
|
||||
weights = [0.5, 0.3, 0.2]
|
||||
|
||||
@@ -27,7 +27,7 @@ async def digest(
|
||||
start_url: str,
|
||||
query: str,
|
||||
resume_from: Optional[Union[str, Path]] = None
|
||||
) -> CrawlState
|
||||
) -> AdaptiveCrawlResult
|
||||
```
|
||||
|
||||
#### Parameters
|
||||
@@ -38,7 +38,7 @@ async def digest(
|
||||
|
||||
#### Returns
|
||||
|
||||
- **CrawlState**: The final crawl state containing all crawled URLs, knowledge base, and metrics
|
||||
- **AdaptiveCrawlResult**: The final crawl state containing all crawled URLs, knowledge base, and metrics
|
||||
|
||||
#### Example
|
||||
|
||||
@@ -92,7 +92,7 @@ Access to the current crawl state.
|
||||
|
||||
```python
|
||||
@property
|
||||
def state(self) -> CrawlState
|
||||
def state(self) -> AdaptiveCrawlResult
|
||||
```
|
||||
|
||||
## Methods
|
||||
|
||||
@@ -9,7 +9,7 @@ async def digest(
|
||||
start_url: str,
|
||||
query: str,
|
||||
resume_from: Optional[Union[str, Path]] = None
|
||||
) -> CrawlState
|
||||
) -> AdaptiveCrawlResult
|
||||
```
|
||||
|
||||
## Parameters
|
||||
@@ -31,7 +31,7 @@ async def digest(
|
||||
|
||||
## Return Value
|
||||
|
||||
Returns a `CrawlState` object containing:
|
||||
Returns a `AdaptiveCrawlResult` object containing:
|
||||
|
||||
- **crawled_urls** (`Set[str]`): All URLs that have been crawled
|
||||
- **knowledge_base** (`List[CrawlResult]`): Collection of crawled pages with content
|
||||
|
||||
Reference in New Issue
Block a user