crawl4ai/docs/examples/adaptive_crawling/export_import_kb.py
UncleCode 1a73fb60db feat(crawl4ai): Implement adaptive crawling feature
This commit introduces adaptive crawling to the crawl4ai project: the crawler intelligently determines when sufficient information has been gathered during a crawl, improving efficiency and reducing unnecessary resource usage.

The changes add new files for the adaptive crawler (the main adaptive crawler script, utility functions, and various configuration and strategy scripts), modify existing files (the project's initialization file and utility functions), and update the documentation with detailed explanations and examples of the feature.

Adaptive crawling significantly extends crawl4ai's capabilities, giving users a more efficient and intelligent web crawling tool.

Significant modifications:
- Added adaptive_crawler.py and related scripts
- Modified __init__.py and utils.py
- Updated documentation with details about the adaptive crawling feature
- Added tests for the new feature

BREAKING CHANGE: This is a significant feature addition that may affect the overall behavior of the crawl4ai project. Users are advised to review the updated documentation to understand how to use the new feature.

Refs: #123, #456
2025-07-04 15:16:53 +08:00


"""
Knowledge Base Export and Import
This example demonstrates how to export crawled knowledge bases and
import them for reuse, sharing, or analysis.
"""
import asyncio
import json
from pathlib import Path
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
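
# The knowledge base round-trips through JSONL: one JSON object per line,
# each describing a single crawled document. The analysis phase below reads
# url, content, metadata, links, and query fields from each document.
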
async def build_knowledge_base():
    """Build a knowledge base about web technologies"""
    print("="*60)
    print("PHASE 1: Building Knowledge Base")
    print("="*60)

    async with AsyncWebCrawler(verbose=False) as crawler:
        adaptive = AdaptiveCrawler(crawler)

        # Crawl information about HTTP
        print("\n1. Gathering HTTP protocol information...")
        await adaptive.digest(
            start_url="https://httpbin.org",
            query="http methods headers status codes"
        )
        print(f" - Pages crawled: {len(adaptive.state.crawled_urls)}")
        print(f" - Confidence: {adaptive.confidence:.2%}")

        # Add more information about APIs
        print("\n2. Adding API documentation knowledge...")
        await adaptive.digest(
            start_url="https://httpbin.org/anything",
            query="rest api json response request"
        )
        print(f" - Total pages: {len(adaptive.state.crawled_urls)}")
        print(f" - Confidence: {adaptive.confidence:.2%}")

        # Export the knowledge base
        export_path = "web_tech_knowledge.jsonl"
        print(f"\n3. Exporting knowledge base to {export_path}")
        adaptive.export_knowledge_base(export_path)

        # Show export statistics
        export_size = Path(export_path).stat().st_size / 1024
        with open(export_path, 'r') as f:
            line_count = sum(1 for _ in f)
        print(f" - Exported {line_count} documents")
        print(f" - File size: {export_size:.1f} KB")

    return export_path
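
# For reference, a single exported JSONL line might look roughly like the
# sketch below. This is illustrative only; the exact schema is whatever
# export_knowledge_base() writes in your crawl4ai version.
#
#   {"url": "https://httpbin.org/get", "content": "...",
#    "metadata": {...}, "links": [...],
#    "query": "http methods headers status codes"}
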
async def analyze_knowledge_base(kb_path):
    """Analyze the exported knowledge base"""
    print("\n" + "="*60)
    print("PHASE 2: Analyzing Exported Knowledge Base")
    print("="*60)

    # Read and analyze the JSONL export (one JSON document per line)
    documents = []
    with open(kb_path, 'r') as f:
        for line in f:
            documents.append(json.loads(line))

    if not documents:
        print("\nKnowledge base is empty; nothing to analyze.")
        return

    print(f"\nKnowledge base contains {len(documents)} documents:")

    # Analyze document properties
    total_content_length = 0
    urls_by_domain = {}

    for i, doc in enumerate(documents):
        # Content analysis
        content_length = len(doc.get('content', ''))
        total_content_length += content_length

        # URL analysis
        url = doc.get('url', '')
        domain = url.split('/')[2] if url.startswith('http') else 'unknown'
        urls_by_domain[domain] = urls_by_domain.get(domain, 0) + 1

        # Show the first document as a sample
        if i == 0:
            print("\nSample document structure:")
            print(f" - URL: {url}")
            print(f" - Content length: {content_length} chars")
            print(f" - Has metadata: {'metadata' in doc}")
            print(f" - Has links: {len(doc.get('links', []))} links")
            print(f" - Query: {doc.get('query', 'N/A')}")

    print("\nContent statistics:")
    print(f" - Total content: {total_content_length:,} characters")
    print(f" - Average per document: {total_content_length/len(documents):,.0f} chars")

    print("\nDomain distribution:")
    for domain, count in urls_by_domain.items():
        print(f" - {domain}: {count} pages")
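
# Tip: because the export is JSONL, it also loads directly into other
# tooling for heavier analysis, e.g. pandas' pd.read_json(kb_path,
# lines=True) (pandas is an optional extra, not required by this example).
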
async def import_and_continue():
    """Import a knowledge base and continue crawling"""
    print("\n" + "="*60)
    print("PHASE 3: Importing and Extending Knowledge Base")
    print("="*60)

    kb_path = "web_tech_knowledge.jsonl"

    async with AsyncWebCrawler(verbose=False) as crawler:
        # Create new adaptive crawler
        adaptive = AdaptiveCrawler(crawler)

        # Import existing knowledge base
        print(f"\n1. Importing knowledge base from {kb_path}")
        adaptive.import_knowledge_base(kb_path)
        print(f" - Imported {len(adaptive.state.knowledge_base)} documents")
        print(f" - Existing URLs: {len(adaptive.state.crawled_urls)}")

        # Check current state
        print("\n2. Checking imported knowledge state:")
        adaptive.print_stats(detailed=False)

        # Continue crawling with new query
        print("\n3. Extending knowledge with new query...")
        await adaptive.digest(
            start_url="https://httpbin.org/status/200",
            query="error handling retry timeout"
        )

        print("\n4. Final knowledge base state:")
        adaptive.print_stats(detailed=False)

        # Export extended knowledge base
        extended_path = "web_tech_knowledge_extended.jsonl"
        adaptive.export_knowledge_base(extended_path)
        print(f"\n5. Extended knowledge base exported to {extended_path}")
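
# Note: digest() resumes from the imported state here, so the adaptive
# stopping logic should count the imported pages toward its confidence and
# fetch fewer new pages than a cold start would (an expectation based on
# the crawler's "sufficient information" stopping behavior, not a guarantee).
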
async def share_knowledge_bases():
    """Demonstrate sharing knowledge bases between projects"""
    print("\n" + "="*60)
    print("PHASE 4: Sharing Knowledge Between Projects")
    print("="*60)

    # Simulate two different projects
    project_a_kb = "project_a_knowledge.jsonl"
    project_b_kb = "project_b_knowledge.jsonl"

    async with AsyncWebCrawler(verbose=False) as crawler:
        # Project A: Security documentation
        print("\n1. Project A: Building security knowledge...")
        crawler_a = AdaptiveCrawler(crawler)
        await crawler_a.digest(
            start_url="https://httpbin.org/basic-auth/user/pass",
            query="authentication security headers"
        )
        crawler_a.export_knowledge_base(project_a_kb)
        print(f" - Exported {len(crawler_a.state.knowledge_base)} documents")

        # Project B: API testing
        print("\n2. Project B: Building testing knowledge...")
        crawler_b = AdaptiveCrawler(crawler)
        await crawler_b.digest(
            start_url="https://httpbin.org/anything",
            query="testing endpoints mocking"
        )
        crawler_b.export_knowledge_base(project_b_kb)
        print(f" - Exported {len(crawler_b.state.knowledge_base)} documents")

        # Merge knowledge bases
        print("\n3. Merging knowledge bases...")
        merged_crawler = AdaptiveCrawler(crawler)

        # Import both knowledge bases
        merged_crawler.import_knowledge_base(project_a_kb)
        initial_size = len(merged_crawler.state.knowledge_base)
        merged_crawler.import_knowledge_base(project_b_kb)
        final_size = len(merged_crawler.state.knowledge_base)

        print(f" - Project A documents: {initial_size}")
        print(f" - Additional from Project B: {final_size - initial_size}")
        print(f" - Total merged documents: {final_size}")

        # Export merged knowledge
        merged_kb = "merged_knowledge.jsonl"
        merged_crawler.export_knowledge_base(merged_kb)
        print(f"\n4. Merged knowledge base exported to {merged_kb}")

        # Show combined coverage
        print("\n5. Combined knowledge coverage:")
        merged_crawler.print_stats(detailed=False)
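
# Note: the merge above assumes import_knowledge_base() appends to the
# current state rather than replacing it. If your crawl4ai version
# deduplicates by URL on import, "Additional from Project B" may come out
# smaller than Project B's own export.
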
async def main():
    """Run all examples"""
    try:
        # Build initial knowledge base
        kb_path = await build_knowledge_base()

        # Analyze the export
        await analyze_knowledge_base(kb_path)

        # Import and extend
        await import_and_continue()

        # Demonstrate sharing
        await share_knowledge_bases()

        print("\n" + "="*60)
        print("All examples completed successfully!")
        print("="*60)
    finally:
        # Clean up generated files
        print("\nCleaning up generated files...")
        for file in [
            "web_tech_knowledge.jsonl",
            "web_tech_knowledge_extended.jsonl",
            "project_a_knowledge.jsonl",
            "project_b_knowledge.jsonl",
            "merged_knowledge.jsonl"
        ]:
            Path(file).unlink(missing_ok=True)
        print("Cleanup complete.")


if __name__ == "__main__":
    asyncio.run(main())