docs: Update adaptive crawling parameters and examples in README and release notes
README.md: 17 changed lines
@@ -523,15 +523,18 @@ async def test_news_crawl():
 - **🧠 Adaptive Crawling**: Your crawler now learns and adapts to website patterns automatically:
 ```python
 config = AdaptiveConfig(
-    confidence_threshold=0.7,
-    max_history=100,
-    learning_rate=0.2
+    confidence_threshold=0.7,    # Min confidence to stop crawling
+    max_depth=5,                 # Maximum crawl depth
+    max_pages=20,                # Maximum number of pages to crawl
+    strategy="statistical"
 )
 
-result = await crawler.arun(
-    "https://news.example.com",
-    config=CrawlerRunConfig(adaptive_config=config)
-)
+async with AsyncWebCrawler() as crawler:
+    adaptive_crawler = AdaptiveCrawler(crawler, config)
+    state = await adaptive_crawler.digest(
+        start_url="https://news.example.com",
+        query="latest news content"
+    )
 # Crawler learns patterns and improves extraction over time
 ```
 
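For quick reference, here is the updated README snippet consolidated into a self-contained script. This is a minimal sketch assuming the crawl4ai API exactly as shown in this commit (`AsyncWebCrawler`, `AdaptiveCrawler`, `AdaptiveConfig`, `digest`); the `asyncio` wrapper and the final print lines (which borrow the `confidence` and `crawled_urls` accessors from the second hunk below) are added here so the example runs standalone.

```python
import asyncio

from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig

# Same configuration as the updated README example above
config = AdaptiveConfig(
    confidence_threshold=0.7,  # stop crawling once confidence reaches 70%
    max_depth=5,               # never follow links more than 5 hops deep
    max_pages=20,              # hard cap on pages fetched per run
    strategy="statistical"     # statistical strategy (alternative: "embedding")
)

async def main() -> None:
    async with AsyncWebCrawler() as crawler:
        adaptive_crawler = AdaptiveCrawler(crawler, config)
        state = await adaptive_crawler.digest(
            start_url="https://news.example.com",
            query="latest news content"
        )
        # The crawler exposes its aggregate confidence; the returned state
        # records which pages were crawled.
        print(f"Confidence: {adaptive_crawler.confidence:.0%}")
        print(f"Pages crawled: {len(state.crawled_urls)}")

if __name__ == "__main__":
    asyncio.run(main())
```
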
@@ -30,44 +30,34 @@ The Adaptive Crawler maintains a persistent state for each domain, tracking:
 - Extraction confidence scores
 
 ```python
-from crawl4ai import AdaptiveCrawler, AdaptiveConfig, CrawlState
+from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
 
-# Initialize with custom learning parameters
+# Initialize with custom adaptive parameters
 config = AdaptiveConfig(
-    confidence_threshold=0.7,    # Min confidence to use learned patterns
-    max_history=100,             # Remember last 100 crawls per domain
-    learning_rate=0.2,           # How quickly to adapt to changes
-    patterns_per_page=3,         # Patterns to learn per page type
-    extraction_strategy='css'    # 'css' or 'xpath'
+    confidence_threshold=0.7,    # Min confidence to stop crawling
+    max_depth=5,                 # Maximum crawl depth
+    max_pages=20,                # Maximum number of pages to crawl
+    top_k_links=3,               # Number of top links to follow per page
+    strategy="statistical",      # 'statistical' or 'embedding'
+    coverage_weight=0.4,         # Weight for coverage in confidence calculation
+    consistency_weight=0.3,      # Weight for consistency in confidence calculation
+    saturation_weight=0.3        # Weight for saturation in confidence calculation
 )
 
-adaptive_crawler = AdaptiveCrawler(config)
-
-# First crawl - crawler learns the structure
+# Initialize adaptive crawler with web crawler
 async with AsyncWebCrawler() as crawler:
-    result = await crawler.arun(
-        "https://news.example.com/article/12345",
-        config=CrawlerRunConfig(
-            adaptive_config=config,
-            extraction_hints={           # Optional hints to speed up learning
-                "title": "article h1",
-                "content": "article .body-content"
-            }
-        )
+    adaptive_crawler = AdaptiveCrawler(crawler, config)
+
+    # Crawl and learn patterns
+    state = await adaptive_crawler.digest(
+        start_url="https://news.example.com/article/12345",
+        query="latest news articles and content"
     )
 
-    # Crawler identifies and stores patterns
-    if result.success:
-        state = adaptive_crawler.get_state("news.example.com")
-        print(f"Learned {len(state.patterns)} patterns")
-        print(f"Confidence: {state.avg_confidence:.2%}")
-
-    # Subsequent crawls - uses learned patterns
-    result2 = await crawler.arun(
-        "https://news.example.com/article/67890",
-        config=CrawlerRunConfig(adaptive_config=config)
-    )
-    # Automatically extracts using learned patterns!
+    # Access results and confidence
+    print(f"Confidence Level: {adaptive_crawler.confidence:.0%}")
+    print(f"Pages Crawled: {len(state.crawled_urls)}")
+    print(f"Knowledge Base: {len(adaptive_crawler.state.knowledge_base)} documents")
 ```
 
 **Expected Real-World Impact:**
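The three new `*_weight` parameters indicate that the stop-crawling confidence is computed from coverage, consistency, and saturation signals. As an illustration only, here is a hypothetical sketch of such a blend; the linear form and the signal definitions in the comments are assumptions inferred from the parameter names in the diff, not crawl4ai's documented internals.

```python
def combined_confidence(
    coverage: float,      # assumed: fraction of the query's information covered
    consistency: float,   # assumed: agreement of information across pages
    saturation: float,    # assumed: how little new information recent pages add
    coverage_weight: float = 0.4,
    consistency_weight: float = 0.3,
    saturation_weight: float = 0.3,
) -> float:
    # Weighted blend; the default weights mirror the AdaptiveConfig values above.
    return (
        coverage_weight * coverage
        + consistency_weight * consistency
        + saturation_weight * saturation
    )

# With 0.9 coverage, 0.7 consistency, and 0.5 saturation:
# 0.4*0.9 + 0.3*0.7 + 0.3*0.5 = 0.36 + 0.21 + 0.15 = 0.72,
# which clears confidence_threshold=0.7, so the crawl would stop.
print(round(combined_confidence(0.9, 0.7, 0.5), 2))  # 0.72
```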