Files
crawl4ai/tests/20241401/test_deep_crawl.py
UncleCode a9415aaaf6 refactor(deep-crawling): reorganize deep crawling strategies and add new implementations
Split deep crawling code into separate strategy files for better organization and maintainability. Added new BFF (Best First) and DFS crawling strategies. Introduced base strategy class and common types.

BREAKING CHANGE: Deep crawling implementation has been split into multiple files. Import paths for deep crawling strategies have changed.
2025-02-05 22:50:39 +08:00

46 lines
1.5 KiB
Python

import asyncio
import time
from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
# from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
async def main():
"""Example deep crawl of documentation site."""
config = CrawlerRunConfig(
deep_crawl_strategy = BFSDeepCrawlStrategy(
max_depth=2,
include_external=False
),
stream=False,
verbose=True,
cache_mode=CacheMode.BYPASS,
scraping_strategy=LXMLWebScrapingStrategy()
)
async with AsyncWebCrawler() as crawler:
start_time = time.perf_counter()
print("\nStarting deep crawl in batch mode:")
results = await crawler.arun(
url="https://docs.crawl4ai.com",
config=config
)
print(f"Crawled {len(results)} pages")
print(f"Example page: {results[0].url}")
print(f"Duration: {time.perf_counter() - start_time:.2f} seconds\n")
print("Starting deep crawl in streaming mode:")
config.stream = True
start_time = time.perf_counter()
async for result in await crawler.arun(
url="https://docs.crawl4ai.com",
config=config
):
print(f"{result.url} (Depth: {result.metadata.get('depth', 0)})")
print(f"Duration: {time.perf_counter() - start_time:.2f} seconds")
if __name__ == "__main__":
asyncio.run(main())