feat(crawler): add deep crawling capabilities with BFS strategy

Implements deep crawling functionality with a new BreadthFirstSearch strategy:
- Add DeepCrawlStrategy base class and BFS implementation
- Integrate deep crawling with AsyncWebCrawler via decorator pattern
- Update CrawlerRunConfig to support deep crawling parameters
- Add pagination support for Google Search crawler

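BREAKING CHANGE: AsyncWebCrawler.arun and arun_many are now both annotated as returning RunManyReturn, so either may return a single CrawlResult, a list of results, or an async generator of results when deep crawling is used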
This commit is contained in:
UncleCode
2025-02-04 01:24:49 +08:00
parent 04bc643cec
commit bc7559586f
5 changed files with 216 additions and 11 deletions

View File

@@ -29,6 +29,7 @@ from .markdown_generation_strategy import (
DefaultMarkdownGenerator,
MarkdownGenerationStrategy,
)
from .async_deep_crawl import DeepCrawlHandler
from .async_logger import AsyncLogger
from .async_configs import BrowserConfig, CrawlerRunConfig
from .async_dispatcher import * # noqa: F403
@@ -47,7 +48,7 @@ from .utils import (
from typing import Union, AsyncGenerator, TypeVar
CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult)
RunManyReturn = Union[List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
RunManyReturn = Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
DeepCrawlSingleReturn = Union[List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
DeepCrawlManyReturn = Union[
@@ -215,6 +216,10 @@ class AsyncWebCrawler:
self.ready = False
# Decorate arun method with deep crawling capabilities
self._deep_handler = DeepCrawlHandler(self)
self.arun = self._deep_handler(self.arun)
async def start(self):
"""
Start the crawler explicitly without using context manager.
@@ -288,7 +293,7 @@ class AsyncWebCrawler:
user_agent: str = None,
verbose=True,
**kwargs,
) -> Union[CrawlResult, DeepCrawlSingleReturn]:
) -> RunManyReturn:
"""
Runs the crawler for a single source: URL (web, local file, or raw HTML).
@@ -715,7 +720,7 @@ class AsyncWebCrawler:
user_agent: str = None,
verbose=True,
**kwargs
) -> Union[RunManyReturn, DeepCrawlManyReturn]:
) -> RunManyReturn:
"""
Runs the crawler for multiple URLs concurrently using a configurable dispatcher strategy.