This commit introduces adaptive crawling to the crawl4ai project. Adaptive crawling intelligently determines when sufficient information has been gathered during a crawl, improving efficiency and reducing unnecessary resource usage. Significant modifications: - Added adaptive_crawler.py and related strategy, configuration, and utility scripts - Modified __init__.py and utils.py - Updated documentation with detailed explanations and examples of the adaptive crawling feature - Added tests for the new feature BREAKING CHANGE: This significant feature addition may affect the overall behavior of the crawl4ai project; users are advised to review the updated documentation to understand how to use the new feature. Refs: #123, #456
76 lines
2.7 KiB
Python
"""
Basic Adaptive Crawling Example

This example demonstrates the simplest use case of adaptive crawling:
finding information about a specific topic and knowing when to stop.
"""

import asyncio

from crawl4ai import AsyncWebCrawler, AdaptiveCrawler
async def main():
    """Basic adaptive crawling example.

    Adaptively crawls the Python asyncio documentation for the given query,
    stopping when the crawler decides enough information has been gathered,
    then prints crawl statistics, the most relevant pages, and a final
    confidence summary.
    """
    # Initialize the crawler
    async with AsyncWebCrawler(verbose=True) as crawler:
        # Create an adaptive crawler with default settings (statistical strategy)
        adaptive = AdaptiveCrawler(crawler)

        # Note: You can also use embedding strategy for semantic understanding:
        # from crawl4ai import AdaptiveConfig
        # config = AdaptiveConfig(strategy="embedding")
        # adaptive = AdaptiveCrawler(crawler, config)

        # Start adaptive crawling
        print("Starting adaptive crawl for Python async programming information...")
        result = await adaptive.digest(
            start_url="https://docs.python.org/3/library/asyncio.html",
            query="async await context managers coroutines"
        )

        # Display crawl statistics
        print("\n" + "="*50)
        print("CRAWL STATISTICS")
        print("="*50)
        adaptive.print_stats(detailed=False)

        # Get the most relevant content found
        print("\n" + "="*50)
        print("MOST RELEVANT PAGES")
        print("="*50)

        relevant_pages = adaptive.get_relevant_content(top_k=5)
        for i, page in enumerate(relevant_pages, 1):
            print(f"\n{i}. {page['url']}")
            print(f" Relevance Score: {page['score']:.2%}")

            # Show a snippet of the content; guard against None content
            content = page['content'] or ""
            if content:
                snippet = content[:200].replace('\n', ' ')
                if len(content) > 200:
                    snippet += "..."
                print(f" Preview: {snippet}")

        # Show final confidence
        print(f"\n{'='*50}")
        print(f"Final Confidence: {adaptive.confidence:.2%}")
        print(f"Total Pages Crawled: {len(result.crawled_urls)}")
        print(f"Knowledge Base Size: {len(adaptive.state.knowledge_base)} documents")

        # Example: Check if we can answer specific questions.
        # Thresholds (0.8 / 0.6) are illustrative cut-offs for this example.
        print(f"\n{'='*50}")
        print("INFORMATION SUFFICIENCY CHECK")
        print(f"{'='*50}")

        if adaptive.confidence >= 0.8:
            print("✓ High confidence - can answer detailed questions about async Python")
        elif adaptive.confidence >= 0.6:
            print("~ Moderate confidence - can answer basic questions")
        else:
            print("✗ Low confidence - need more information")
if __name__ == "__main__":
    # Run the async example via the asyncio event loop.
    asyncio.run(main())