Split deep crawling code into separate strategy files for better organization and maintainability. Added new BFF (Best First) and DFS crawling strategies. Introduced base strategy class and common types. BREAKING CHANGE: Deep crawling implementation has been split into multiple files. Import paths for deep crawling strategies have changed.
46 lines
1.5 KiB
Python
import asyncio
|
|
import time
|
|
|
|
|
|
from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
|
|
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
|
|
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
|
|
# from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
|
|
|
|
|
|
async def main():
    """Run an example deep crawl of the crawl4ai documentation site.

    Demonstrates the same BFS deep-crawl twice: first in batch mode
    (one awaited list of results), then in streaming mode (results
    consumed incrementally via ``async for``). Prints page URLs,
    depths, and elapsed time for each mode.
    """
    config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(
            max_depth=2,              # follow links up to 2 hops from the start URL
            include_external=False,   # stay within the docs.crawl4ai.com domain
        ),
        stream=False,                 # batch mode first; flipped to True below
        verbose=True,
        cache_mode=CacheMode.BYPASS,  # always fetch fresh pages for the demo
        scraping_strategy=LXMLWebScrapingStrategy(),
    )

    async with AsyncWebCrawler() as crawler:
        # --- Batch mode: arun() returns the full result list at once ---
        start_time = time.perf_counter()
        print("\nStarting deep crawl in batch mode:")
        results = await crawler.arun(
            url="https://docs.crawl4ai.com",
            config=config,
        )
        print(f"Crawled {len(results)} pages")
        # Guard against an empty crawl (e.g. site unreachable) before indexing.
        if results:
            print(f"Example page: {results[0].url}")
        print(f"Duration: {time.perf_counter() - start_time:.2f} seconds\n")

        # --- Streaming mode: with stream=True, arun() yields results as they arrive ---
        print("Starting deep crawl in streaming mode:")
        config.stream = True
        start_time = time.perf_counter()
        async for result in await crawler.arun(
            url="https://docs.crawl4ai.com",
            config=config,
        ):
            # Depth metadata is set by the deep-crawl strategy; default to 0 if absent.
            print(f"→ {result.url} (Depth: {result.metadata.get('depth', 0)})")
        print(f"Duration: {time.perf_counter() - start_time:.2f} seconds")
# Script entry point: start the asyncio event loop and run the example.
if __name__ == "__main__":
    asyncio.run(main())