feat: cleanup unused code and enhance documentation for v0.7.1

- Remove unused StealthConfig from browser_manager.py
- Update LinkPreviewConfig import path in __init__.py and examples
- Fix infinity handling in content_scraping_strategy.py (use 0 instead of float('inf'))
- Remove sanitize_json_data functions from API endpoints
- Add comprehensive C4A Script documentation to release notes
- Update v0.7.0 release notes with improved code examples
- Create v0.7.1 release notes focusing on cleanup and documentation improvements
- Update demo files with corrected import paths and examples
- Fix virtual scroll and adaptive crawling examples across documentation

🤖 Generated with Claude Code

Co-Authored-By: Claude <noreply@anthropic.com>
2025-07-17 11:35:16 +02:00
parent ccbe3c105c
commit cf8badfe27
13 changed files with 241 additions and 343 deletions
--- a/docs/blog/release-v0.7.0.md
+++ b/docs/blog/release-v0.7.0.md
@@ -30,33 +30,40 @@ The Adaptive Crawler maintains a persistent state for each domain, tracking:

 ```python
 from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
+import asyncio

-# Initialize with custom adaptive parameters
-config = AdaptiveConfig(
-    confidence_threshold=0.7,    # Min confidence to stop crawling
-    max_depth=5,                # Maximum crawl depth
-    max_pages=20,               # Maximum number of pages to crawl
-    top_k_links=3,              # Number of top links to follow per page
-    strategy="statistical",     # 'statistical' or 'embedding'
-    coverage_weight=0.4,        # Weight for coverage in confidence calculation
-    consistency_weight=0.3,     # Weight for consistency in confidence calculation
-    saturation_weight=0.3       # Weight for saturation in confidence calculation
-)
-
-# Initialize adaptive crawler with web crawler
-async with AsyncWebCrawler() as crawler:
-    adaptive_crawler = AdaptiveCrawler(crawler, config)
+async def main():
    
-    # Crawl and learn patterns
-    state = await adaptive_crawler.digest(
-        start_url="https://news.example.com/article/12345",
-        query="latest news articles and content"
+    # Configure adaptive crawler
+    config = AdaptiveConfig(
+        strategy="statistical",  # or "embedding" for semantic understanding
+        max_pages=10,
+        confidence_threshold=0.7,  # Stop at 70% confidence
+        top_k_links=3,  # Follow top 3 links per page
+        min_gain_threshold=0.05  # Need 5% information gain to continue
    )
    
-    # Access results and confidence
-    print(f"Confidence Level: {adaptive_crawler.confidence:.0%}")
-    print(f"Pages Crawled: {len(state.crawled_urls)}")
-    print(f"Knowledge Base: {len(adaptive_crawler.state.knowledge_base)} documents")
+    async with AsyncWebCrawler(verbose=False) as crawler:
+        adaptive = AdaptiveCrawler(crawler, config)
+        
+        print("Starting adaptive crawl about Python decorators...")
+        result = await adaptive.digest(
+            start_url="https://docs.python.org/3/glossary.html",
+            query="python decorators functions wrapping"
+        )
+        
+        print(f"\n✅ Crawling Complete!")
+        print(f"• Confidence Level: {adaptive.confidence:.0%}")
+        print(f"• Pages Crawled: {len(result.crawled_urls)}")
+        print(f"• Knowledge Base: {len(adaptive.state.knowledge_base)} documents")
+        
+        # Get most relevant content
+        relevant = adaptive.get_relevant_content(top_k=3)
+        print(f"\nMost Relevant Pages:")
+        for i, page in enumerate(relevant, 1):
+            print(f"{i}. {page['url']} (relevance: {page['score']:.2%})")
+
+asyncio.run(main())
 ```

 **Expected Real-World Impact:**
@@ -141,53 +148,47 @@ async with AsyncWebCrawler() as crawler:

 **My Solution:** I implemented a three-layer scoring system that analyzes links like a human would—considering their position, context, and relevance to your goals.

-### The Three-Layer Scoring System
+### Intelligent Link Analysis and Scoring

 ```python
-from crawl4ai import LinkPreviewConfig, CrawlerRunConfig, CacheMode
+import asyncio
+from crawl4ai import CrawlerRunConfig, CacheMode, AsyncWebCrawler
+from crawl4ai.adaptive_crawler import LinkPreviewConfig

-# Configure intelligent link analysis
-link_config = LinkPreviewConfig(
-    include_internal=True,
-    include_external=False,
-    max_links=10,
-    concurrency=5,
-    query="python tutorial",  # For contextual scoring
-    score_threshold=0.3,
-    verbose=True
-)
-
-# Use in your crawl
-result = await crawler.arun(
-    "https://tech-blog.example.com",
-    config=CrawlerRunConfig(
-        link_preview_config=link_config,
-        score_links=True,  # Enable intrinsic scoring
-        cache_mode=CacheMode.BYPASS
+async def main():
+    # Configure intelligent link analysis
+    link_config = LinkPreviewConfig(
+        include_internal=True,
+        include_external=False,
+        max_links=10,
+        concurrency=5,
+        query="python tutorial",  # For contextual scoring
+        score_threshold=0.3,
+        verbose=True
    )
-)
+    # Use in your crawl
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            "https://www.geeksforgeeks.org/",
+            config=CrawlerRunConfig(
+                link_preview_config=link_config,
+                score_links=True,  # Enable intrinsic scoring
+                cache_mode=CacheMode.BYPASS
+            )
+        )

-# Access scored and sorted links
-if result.success and result.links:
-# Get scored links
-internal_links = result.links.get("internal", [])
-scored_links = [l for l in internal_links if l.get("total_score")]
-scored_links.sort(key=lambda x: x.get("total_score", 0), reverse=True)
+        # Access scored and sorted links
+        if result.success and result.links:
+            for link in result.links.get("internal", []):
+                text = link.get('text', 'No text')[:40]
+                print(
+                    text,
+                    f"{link.get('intrinsic_score', 0):.1f}/10" if link.get('intrinsic_score') is not None else "0.0/10",
+                    f"{link.get('contextual_score', 0):.2f}/1" if link.get('contextual_score') is not None else "0.00/1",
+                    f"{link.get('total_score', 0):.3f}" if link.get('total_score') is not None else "0.000"
+                )

-# Print scoring results
-print("Link Scoring Results:")
-print("=" * 50)
-for link in scored_links[:5]:
-    text = link.get('text', 'No text')[:40]
-    intrinsic = link.get('intrinsic_score', 0)
-    contextual = link.get('contextual_score', 0)
-    total = link.get('total_score', 0)
-    
-    print(f"Link: {text}")
-    print(f"  Intrinsic Score: {intrinsic:.1f}/10")
-    print(f"  Contextual Score: {contextual:.2f}/1") 
-    print(f"  Total Score: {total:.3f}")
-    print("-" * 30)
+asyncio.run(main())
 ```

 **Scoring Components:**
@@ -220,58 +221,34 @@ for link in scored_links[:5]:
 ### Technical Architecture

 ```python
+import asyncio
 from crawl4ai import AsyncUrlSeeder, SeedingConfig

-# Basic discovery - find all product pages
-seeder_config = SeedingConfig(
-    # Discovery sources
-    source="cc+sitemap",        # Sitemap + Common Crawl
-    
-    # Filtering
-    pattern="*/product/*",      # URL pattern matching
-    
-    # Validation
-    live_check=True,           # Verify URLs are alive
-    max_urls=50,             # Stop at 50 URLs
-    
-    # Performance  
-    concurrency=100,           # Maximum concurrent requests for live checks/head extraction
-    hits_per_sec=10           # Rate limit in requests per second to avoid overwhelming servers
-)
+async def main():
+    async with AsyncUrlSeeder() as seeder:
+        # Discover Python tutorial URLs
+        config = SeedingConfig(
+            source="sitemap",  # Use sitemap
+            pattern="*python*",  # URL pattern filter
+            extract_head=True,  # Get metadata
+            query="python tutorial",  # For relevance scoring
+            scoring_method="bm25",
+            score_threshold=0.2,
+            max_urls=10
+        )
+        
+        print("Discovering Python async tutorial URLs...")
+        urls = await seeder.urls("https://www.geeksforgeeks.org/", config)
+        
+        print(f"\n✅ Found {len(urls)} relevant URLs:")
+        for i, url_info in enumerate(urls[:5], 1):
+            print(f"\n{i}. {url_info['url']}")
+            if url_info.get('relevance_score'):
+                print(f"   Relevance: {url_info['relevance_score']:.3f}")
+            if url_info.get('head_data', {}).get('title'):
+                print(f"   Title: {url_info['head_data']['title'][:60]}...")

-async with AsyncUrlSeeder() as seeder:
-    console.print("Discovering URLs from Python docs...")
-    urls = await seeder.urls("docs.python.org", seeding_config)
-    console.print(f"\n✓ Discovered {len(urls)} URLs")
-
-# Advanced: Relevance-based discovery
-research_config = SeedingConfig(
-    source="sitemap+cc",       # Sitemap + Common Crawl
-    pattern="*/blog/*",        # Blog posts only
-    
-    # Content relevance
-    extract_head=True,         # Get meta tags
-    query="quantum computing tutorials",
-    scoring_method="bm25",     # BM25 scoring method
-    score_threshold=0.4,       # High relevance only
-    
-    # Smart filtering
-    filter_nonsense_urls=True,  # Remove .xml, .txt, etc.
-    
-    force=True                 # Bypass cache
-)
-
-# Discover with progress tracking
-discovered = []
-async with AsyncUrlSeeder() as seeder:
-    discovered = await seeder.urls("https://physics-blog.com", research_config)
-    console.print(f"\n✓ Discovered {len(discovered)} URLs")
-
-# Results include scores and metadata
-for url_data in discovered[:5]:
-    print(f"URL: {url_data['url']}")
-    print(f"Score: {url_data['relevance_score']:.3f}")
-    print(f"Title: {url_data['head_data']['title']}")
+asyncio.run(main())
 ```

 **Discovery Methods:**