docs: Update adaptive crawling parameters and examples in README and release notes

This commit is contained in:
ntohidi
2025-07-15 10:15:05 +02:00
parent dd5ee752cf
commit 58024755c5
3 changed files with 52 additions and 69 deletions

View File

@@ -523,15 +523,18 @@ async def test_news_crawl():
- **🧠 Adaptive Crawling**: Your crawler now learns and adapts to website patterns automatically:
```python
config = AdaptiveConfig(
    confidence_threshold=0.7,  # Min confidence to stop crawling
    max_depth=5,               # Maximum crawl depth
    max_pages=20,              # Maximum number of pages to crawl
    strategy="statistical"
)

async with AsyncWebCrawler() as crawler:
    adaptive_crawler = AdaptiveCrawler(crawler, config)
    state = await adaptive_crawler.digest(
        start_url="https://news.example.com",
        query="latest news content"
    )
# Crawler learns patterns and improves extraction over time
```

View File

@@ -30,44 +30,34 @@ The Adaptive Crawler maintains a persistent state for each domain, tracking:
- Extraction confidence scores

```python
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig

# Initialize with custom adaptive parameters
config = AdaptiveConfig(
    confidence_threshold=0.7,  # Min confidence to stop crawling
    max_depth=5,               # Maximum crawl depth
    max_pages=20,              # Maximum number of pages to crawl
    top_k_links=3,             # Number of top links to follow per page
    strategy="statistical",    # 'statistical' or 'embedding'
    coverage_weight=0.4,       # Weight for coverage in confidence calculation
    consistency_weight=0.3,    # Weight for consistency in confidence calculation
    saturation_weight=0.3      # Weight for saturation in confidence calculation
)

# Initialize adaptive crawler with web crawler
async with AsyncWebCrawler() as crawler:
    adaptive_crawler = AdaptiveCrawler(crawler, config)

    # Crawl and learn patterns
    state = await adaptive_crawler.digest(
        start_url="https://news.example.com/article/12345",
        query="latest news articles and content"
    )

    # Access results and confidence
    print(f"Confidence Level: {adaptive_crawler.confidence:.0%}")
    print(f"Pages Crawled: {len(state.crawled_urls)}")
    print(f"Knowledge Base: {len(adaptive_crawler.state.knowledge_base)} documents")
```

**Expected Real-World Impact:**

View File

@@ -30,44 +30,34 @@ The Adaptive Crawler maintains a persistent state for each domain, tracking:
- Extraction confidence scores

```python
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig

# Initialize with custom adaptive parameters
config = AdaptiveConfig(
    confidence_threshold=0.7,  # Min confidence to stop crawling
    max_depth=5,               # Maximum crawl depth
    max_pages=20,              # Maximum number of pages to crawl
    top_k_links=3,             # Number of top links to follow per page
    strategy="statistical",    # 'statistical' or 'embedding'
    coverage_weight=0.4,       # Weight for coverage in confidence calculation
    consistency_weight=0.3,    # Weight for consistency in confidence calculation
    saturation_weight=0.3      # Weight for saturation in confidence calculation
)

# Initialize adaptive crawler with web crawler
async with AsyncWebCrawler() as crawler:
    adaptive_crawler = AdaptiveCrawler(crawler, config)

    # Crawl and learn patterns
    state = await adaptive_crawler.digest(
        start_url="https://news.example.com/article/12345",
        query="latest news articles and content"
    )

    # Access results and confidence
    print(f"Confidence Level: {adaptive_crawler.confidence:.0%}")
    print(f"Pages Crawled: {len(state.crawled_urls)}")
    print(f"Knowledge Base: {len(adaptive_crawler.state.knowledge_base)} documents")
```

**Expected Real-World Impact:**