refactor(deep-crawling): reorganize deep crawling strategies and add new implementations

Split the deep crawling code into separate strategy files for better organization and maintainability. Added new BFF (Best-First) and DFS crawling strategies. Introduced a base strategy class and common types.

BREAKING CHANGE: Deep crawling implementation has been split into multiple files. Import paths for deep crawling strategies have changed.
This commit is contained in:
UncleCode
2025-02-05 22:50:39 +08:00
parent c308a794e8
commit a9415aaaf6
10 changed files with 769 additions and 214 deletions

View File

@@ -5,15 +5,11 @@ from datetime import datetime
from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple
from urllib.parse import urlparse
from ..models import CrawlResult, TraversalStats
from ..models import TraversalStats
from .filters import FastFilterChain
from .scorers import FastURLScorer
from typing import TYPE_CHECKING
from . import DeepCrawlStrategy
if TYPE_CHECKING:
from ..async_configs import CrawlerRunConfig
from ..async_webcrawler import AsyncWebCrawler
from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult
class BFSDeepCrawlStrategy(DeepCrawlStrategy):
"""
@@ -107,8 +103,8 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
async def _arun_batch(
self,
start_url: str,
crawler: "AsyncWebCrawler",
config: "CrawlerRunConfig",
crawler: AsyncWebCrawler,
config: CrawlerRunConfig,
) -> List[CrawlResult]:
"""
Batch (non-streaming) mode:
@@ -148,8 +144,8 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
async def _arun_stream(
self,
start_url: str,
crawler: "AsyncWebCrawler",
config: "CrawlerRunConfig",
crawler: AsyncWebCrawler,
config: CrawlerRunConfig,
) -> AsyncGenerator[CrawlResult, None]:
"""
Streaming mode: