docs: Update adaptive crawling parameters and examples in README and release notes
README.md: 17 changed lines
@@ -523,15 +523,18 @@ async def test_news_crawl():
 - **🧠 Adaptive Crawling**: Your crawler now learns and adapts to website patterns automatically:
 ```python
 config = AdaptiveConfig(
-    confidence_threshold=0.7,
-    max_history=100,
-    learning_rate=0.2
+    confidence_threshold=0.7,    # Min confidence to stop crawling
+    max_depth=5,                 # Maximum crawl depth
+    max_pages=20,                # Maximum number of pages to crawl
+    strategy="statistical"
 )
 
-result = await crawler.arun(
-    "https://news.example.com",
-    config=CrawlerRunConfig(adaptive_config=config)
-)
+async with AsyncWebCrawler() as crawler:
+    adaptive_crawler = AdaptiveCrawler(crawler, config)
+    state = await adaptive_crawler.digest(
+        start_url="https://news.example.com",
+        query="latest news content"
+    )
 # Crawler learns patterns and improves extraction over time
 ```
 
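For quick reference, here is the updated README snippet consolidated into a self-contained script. This is a minimal sketch assuming the crawl4ai API exactly as shown in this commit (`AsyncWebCrawler`, `AdaptiveCrawler`, `AdaptiveConfig`, `digest`); the `asyncio` wrapper and the final print lines (which borrow the `confidence` and `crawled_urls` accessors from the second hunk below) are added here so the example runs standalone.

```python
import asyncio

from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig

# Same configuration as the updated README example above
config = AdaptiveConfig(
    confidence_threshold=0.7,  # stop crawling once confidence reaches 70%
    max_depth=5,               # never follow links more than 5 hops deep
    max_pages=20,              # hard cap on pages fetched per run
    strategy="statistical"     # statistical strategy (alternative: "embedding")
)

async def main() -> None:
    async with AsyncWebCrawler() as crawler:
        adaptive_crawler = AdaptiveCrawler(crawler, config)
        state = await adaptive_crawler.digest(
            start_url="https://news.example.com",
            query="latest news content"
        )
        # The crawler exposes its aggregate confidence; the returned state
        # records which pages were crawled.
        print(f"Confidence: {adaptive_crawler.confidence:.0%}")
        print(f"Pages crawled: {len(state.crawled_urls)}")

if __name__ == "__main__":
    asyncio.run(main())
```
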
@@ -30,44 +30,34 @@ The Adaptive Crawler maintains a persistent state for each domain, tracking:
 - Extraction confidence scores
 
 ```python
-from crawl4ai import AdaptiveCrawler, AdaptiveConfig, CrawlState
+from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
 
-# Initialize with custom learning parameters
+# Initialize with custom adaptive parameters
 config = AdaptiveConfig(
-    confidence_threshold=0.7,    # Min confidence to use learned patterns
-    max_history=100,             # Remember last 100 crawls per domain
-    learning_rate=0.2,           # How quickly to adapt to changes
-    patterns_per_page=3,         # Patterns to learn per page type
-    extraction_strategy='css'    # 'css' or 'xpath'
+    confidence_threshold=0.7,    # Min confidence to stop crawling
+    max_depth=5,                 # Maximum crawl depth
+    max_pages=20,                # Maximum number of pages to crawl
+    top_k_links=3,               # Number of top links to follow per page
+    strategy="statistical",      # 'statistical' or 'embedding'
+    coverage_weight=0.4,         # Weight for coverage in confidence calculation
+    consistency_weight=0.3,      # Weight for consistency in confidence calculation
+    saturation_weight=0.3        # Weight for saturation in confidence calculation
 )
 
-adaptive_crawler = AdaptiveCrawler(config)
-
-# First crawl - crawler learns the structure
+# Initialize adaptive crawler with web crawler
 async with AsyncWebCrawler() as crawler:
-    result = await crawler.arun(
-        "https://news.example.com/article/12345",
-        config=CrawlerRunConfig(
-            adaptive_config=config,
-            extraction_hints={           # Optional hints to speed up learning
-                "title": "article h1",
-                "content": "article .body-content"
-            }
-        )
+    adaptive_crawler = AdaptiveCrawler(crawler, config)
+
+    # Crawl and learn patterns
+    state = await adaptive_crawler.digest(
+        start_url="https://news.example.com/article/12345",
+        query="latest news articles and content"
     )
 
-    # Crawler identifies and stores patterns
-    if result.success:
-        state = adaptive_crawler.get_state("news.example.com")
-        print(f"Learned {len(state.patterns)} patterns")
-        print(f"Confidence: {state.avg_confidence:.2%}")
-
-    # Subsequent crawls - uses learned patterns
-    result2 = await crawler.arun(
-        "https://news.example.com/article/67890",
-        config=CrawlerRunConfig(adaptive_config=config)
-    )
-    # Automatically extracts using learned patterns!
+    # Access results and confidence
+    print(f"Confidence Level: {adaptive_crawler.confidence:.0%}")
+    print(f"Pages Crawled: {len(state.crawled_urls)}")
+    print(f"Knowledge Base: {len(adaptive_crawler.state.knowledge_base)} documents")
 ```
 
 **Expected Real-World Impact:**
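The three new `*_weight` parameters indicate that the stop-crawling confidence is computed from coverage, consistency, and saturation signals. As an illustration only, here is a hypothetical sketch of such a blend; the linear form and the signal definitions in the comments are assumptions inferred from the parameter names in the diff, not crawl4ai's documented internals.

```python
def combined_confidence(
    coverage: float,      # assumed: fraction of the query's information covered
    consistency: float,   # assumed: agreement of information across pages
    saturation: float,    # assumed: how little new information recent pages add
    coverage_weight: float = 0.4,
    consistency_weight: float = 0.3,
    saturation_weight: float = 0.3,
) -> float:
    # Weighted blend; the default weights mirror the AdaptiveConfig values above.
    return (
        coverage_weight * coverage
        + consistency_weight * consistency
        + saturation_weight * saturation
    )

# With 0.9 coverage, 0.7 consistency, and 0.5 saturation:
# 0.4*0.9 + 0.3*0.7 + 0.3*0.5 = 0.36 + 0.21 + 0.15 = 0.72,
# which clears confidence_threshold=0.7, so the crawl would stop.
print(round(combined_confidence(0.9, 0.7, 0.5), 2))  # 0.72
```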