Merge branch 'release/v0.7.1'

chore: update version to 0.7.1
feat: cleanup unused code and enhance documentation for v0.7.1
2025-07-17 17:42:04 +08:00 · 2025-07-17 11:37:41 +02:00 · 2025-07-17 11:35:16 +02:00 · 2025-07-17 09:13:20 +02:00 · 2025-07-16 13:34:25 +02:00 · 2025-07-16 13:33:53 +02:00
13 changed files with 583 additions and 297 deletions
--- a/crawl4ai/__init__.py
+++ b/crawl4ai/__init__.py
@@ -3,7 +3,7 @@ import warnings
 from .async_webcrawler import AsyncWebCrawler, CacheMode
 # MODIFIED: Add SeedingConfig and VirtualScrollConfig here
-from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig
+from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig, LinkPreviewConfig
 from .content_scraping_strategy import (
    ContentScrapingStrategy,
@@ -173,6 +173,7 @@ __all__ = [
    "CompilationResult",
    "ValidationResult",
    "ErrorDetail",
    "LinkPreviewConfig"
 ]
--- a/crawl4ai/__version__.py
+++ b/crawl4ai/__version__.py
@@ -1,7 +1,7 @@
 # crawl4ai/__version__.py
 # This is the version that will be used for stable releases
-__version__ = "0.7.0"
+__version__ = "0.7.1"
 # For nightly builds, this gets set during build process
 __nightly_version__ = None
--- a/crawl4ai/browser_manager.py
+++ b/crawl4ai/browser_manager.py
@@ -14,23 +14,8 @@ import hashlib
 from .js_snippet import load_js_script
 from .config import DOWNLOAD_PAGE_TIMEOUT
 from .async_configs import BrowserConfig, CrawlerRunConfig
 from playwright_stealth import StealthConfig
 from .utils import get_chromium_path
 stealth_config = StealthConfig(
    webdriver=True,
    chrome_app=True,
    chrome_csi=True,
    chrome_load_times=True,
    chrome_runtime=True,
    navigator_languages=True,
    navigator_plugins=True,
    navigator_permissions=True,
    webgl_vendor=True,
    outerdimensions=True,
    navigator_hardware_concurrency=True,
    media_codecs=True,
 )
 BROWSER_DISABLE_OPTIONS = [
    "--disable-background-networking",
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -1145,10 +1145,10 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
                        link_data["intrinsic_score"] = intrinsic_score
                    except Exception:
                        # Fail gracefully - assign default score
-                        link_data["intrinsic_score"] = float('inf')
+                        link_data["intrinsic_score"] = 0
                else:
                    # No scoring enabled - assign 0 (all links equal priority)
-                    link_data["intrinsic_score"] = float('inf')
+                    link_data["intrinsic_score"] = 0
                is_external = is_external_url(normalized_href, base_domain)
                if is_external:
--- a/docs/blog/release-v0.7.0.md
+++ b/docs/blog/release-v0.7.0.md
@@ -30,33 +30,40 @@ The Adaptive Crawler maintains a persistent state for each domain, tracking:
 ```python
 from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
 import asyncio
-# Initialize with custom adaptive parameters
+async def main():
 config = AdaptiveConfig(
    confidence_threshold=0.7,    # Min confidence to stop crawling
    max_depth=5,                # Maximum crawl depth
    max_pages=20,               # Maximum number of pages to crawl
    top_k_links=3,              # Number of top links to follow per page
    strategy="statistical",     # 'statistical' or 'embedding'
    coverage_weight=0.4,        # Weight for coverage in confidence calculation
    consistency_weight=0.3,     # Weight for consistency in confidence calculation
    saturation_weight=0.3       # Weight for saturation in confidence calculation
 )
 # Initialize adaptive crawler with web crawler
 async with AsyncWebCrawler() as crawler:
    adaptive_crawler = AdaptiveCrawler(crawler, config)
-    # Crawl and learn patterns
+    # Configure adaptive crawler
-    state = await adaptive_crawler.digest(
+    config = AdaptiveConfig(
-        start_url="https://news.example.com/article/12345",
+        strategy="statistical",  # or "embedding" for semantic understanding
-        query="latest news articles and content"
+        max_pages=10,
        confidence_threshold=0.7,  # Stop at 70% confidence
        top_k_links=3,  # Follow top 3 links per page
        min_gain_threshold=0.05  # Need 5% information gain to continue
    )
-    # Access results and confidence
+    async with AsyncWebCrawler(verbose=False) as crawler:
-    print(f"Confidence Level: {adaptive_crawler.confidence:.0%}")
+        adaptive = AdaptiveCrawler(crawler, config)
-    print(f"Pages Crawled: {len(state.crawled_urls)}")
+        
-    print(f"Knowledge Base: {len(adaptive_crawler.state.knowledge_base)} documents")
+        print("Starting adaptive crawl about Python decorators...")
        result = await adaptive.digest(
            start_url="https://docs.python.org/3/glossary.html",
            query="python decorators functions wrapping"
        )
        print(f"\n✅ Crawling Complete!")
        print(f"• Confidence Level: {adaptive.confidence:.0%}")
        print(f"• Pages Crawled: {len(result.crawled_urls)}")
        print(f"• Knowledge Base: {len(adaptive.state.knowledge_base)} documents")
        # Get most relevant content
        relevant = adaptive.get_relevant_content(top_k=3)
        print(f"\nMost Relevant Pages:")
        for i, page in enumerate(relevant, 1):
            print(f"{i}. {page['url']} (relevance: {page['score']:.2%})")
 asyncio.run(main())
 ```
 **Expected Real-World Impact:**
@@ -141,56 +148,47 @@ async with AsyncWebCrawler() as crawler:
 **My Solution:** I implemented a three-layer scoring system that analyzes links like a human would—considering their position, context, and relevance to your goals.
-### The Three-Layer Scoring System
+### Intelligent Link Analysis and Scoring
 ```python
-from crawl4ai import LinkPreviewConfig, CrawlerRunConfig, CacheMode
+import asyncio
 from crawl4ai import CrawlerRunConfig, CacheMode, AsyncWebCrawler
 from crawl4ai import LinkPreviewConfig
-# Configure intelligent link analysis
+async def main():
-link_config = LinkPreviewConfig(
+    # Configure intelligent link analysis
-    include_internal=True,
+    link_config = LinkPreviewConfig(
-    include_external=False,
+        include_internal=True,
-    max_links=10,
+        include_external=False,
-    concurrency=5,
+        max_links=10,
-    query="python tutorial",  # For contextual scoring
+        concurrency=5,
-    score_threshold=0.3,
+        query="python tutorial",  # For contextual scoring
-    verbose=True
+        score_threshold=0.3,
-)
+        verbose=True
 # Use in your crawl
 result = await crawler.arun(
    "https://tech-blog.example.com",
    config=CrawlerRunConfig(
        link_preview_config=link_config,
        score_links=True,  # Enable intrinsic scoring
        cache_mode=CacheMode.BYPASS
    )
-)
+    # Use in your crawl
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            "https://www.geeksforgeeks.org/",
            config=CrawlerRunConfig(
                link_preview_config=link_config,
                score_links=True,  # Enable intrinsic scoring
                cache_mode=CacheMode.BYPASS
            )
        )
-# Access scored and sorted links
+        # Access scored and sorted links
-if result.success and result.links:
+        if result.success and result.links:
-# Get scored links
+            for link in result.links.get("internal", []):
-internal_links = result.links.get("internal", [])
+                text = link.get('text', 'No text')[:40]
-scored_links = [l for l in internal_links if l.get("total_score")]
+                print(
-scored_links.sort(key=lambda x: x.get("total_score", 0), reverse=True)
+                    text,
                    f"{link.get('intrinsic_score', 0):.1f}/10" if link.get('intrinsic_score') is not None else "0.0/10",
                    f"{link.get('contextual_score', 0):.2f}/1" if link.get('contextual_score') is not None else "0.00/1",
                    f"{link.get('total_score', 0):.3f}" if link.get('total_score') is not None else "0.000"
                )
-# Create a scoring table
+asyncio.run(main())
 table = Table(title="Link Scoring Results", box=box.ROUNDED)
 table.add_column("Link Text", style="cyan", width=40)
 table.add_column("Intrinsic Score", justify="center")
 table.add_column("Contextual Score", justify="center")
 table.add_column("Total Score", justify="center", style="bold green")
 for link in scored_links[:5]:
    text = link.get('text', 'No text')[:40]
    table.add_row(
        text,
        f"{link.get('intrinsic_score', 0):.1f}/10",
        f"{link.get('contextual_score', 0):.2f}/1",
        f"{link.get('total_score', 0):.3f}"
    )
 console.print(table)
 ```
 **Scoring Components:**
@@ -223,58 +221,34 @@ console.print(table)
 ### Technical Architecture
 ```python
 import asyncio
 from crawl4ai import AsyncUrlSeeder, SeedingConfig
-# Basic discovery - find all product pages
+async def main():
-seeder_config = SeedingConfig(
+    async with AsyncUrlSeeder() as seeder:
-    # Discovery sources
+        # Discover Python tutorial URLs
-    source="cc+sitemap",        # Sitemap + Common Crawl
+        config = SeedingConfig(
-    
+            source="sitemap",  # Use sitemap
-    # Filtering
+            pattern="*python*",  # URL pattern filter
-    pattern="*/product/*",      # URL pattern matching
+            extract_head=True,  # Get metadata
-    
+            query="python tutorial",  # For relevance scoring
-    # Validation
+            scoring_method="bm25",
-    live_check=True,           # Verify URLs are alive
+            score_threshold=0.2,
-    max_urls=50,             # Stop at 50 URLs
+            max_urls=10
-    
+        )
-    # Performance  
+        
-    concurrency=100,           # Maximum concurrent requests for live checks/head extraction
+        print("Discovering Python async tutorial URLs...")
-    hits_per_sec=10           # Rate limit in requests per second to avoid overwhelming servers
+        urls = await seeder.urls("https://www.geeksforgeeks.org/", config)
-)
+        
        print(f"\n✅ Found {len(urls)} relevant URLs:")
        for i, url_info in enumerate(urls[:5], 1):
            print(f"\n{i}. {url_info['url']}")
            if url_info.get('relevance_score'):
                print(f"   Relevance: {url_info['relevance_score']:.3f}")
            if url_info.get('head_data', {}).get('title'):
                print(f"   Title: {url_info['head_data']['title'][:60]}...")
-async with AsyncUrlSeeder() as seeder:
+asyncio.run(main())
    console.print("Discovering URLs from Python docs...")
    urls = await seeder.urls("docs.python.org", seeding_config)
    console.print(f"\n✓ Discovered {len(urls)} URLs")
 # Advanced: Relevance-based discovery
 research_config = SeedingConfig(
    source="sitemap+cc",       # Sitemap + Common Crawl
    pattern="*/blog/*",        # Blog posts only
    # Content relevance
    extract_head=True,         # Get meta tags
    query="quantum computing tutorials",
    scoring_method="bm25",     # BM25 scoring method
    score_threshold=0.4,       # High relevance only
    # Smart filtering
    filter_nonsense_urls=True,  # Remove .xml, .txt, etc.
    force=True                 # Bypass cache
 )
 # Discover with progress tracking
 discovered = []
 async with AsyncUrlSeeder() as seeder:
    discovered = await seeder.urls("https://physics-blog.com", research_config)
    console.print(f"\n✓ Discovered {len(discovered)} URLs")
 # Results include scores and metadata
 for url_data in discovered[:5]:
    print(f"URL: {url_data['url']}")
    print(f"Score: {url_data['relevance_score']:.3f}")
    print(f"Title: {url_data['head_data']['title']}")
 ```
 **Discovery Methods:**
--- a/docs/blog/release-v0.7.1.md
+++ b/docs/blog/release-v0.7.1.md
@@ -0,0 +1,43 @@
 # 🛠️ Crawl4AI v0.7.1: Minor Cleanup Update
 *July 17, 2025 • 2 min read*
 ---
 A small maintenance release that removes unused code and improves documentation.
 ## 🎯 What's Changed
 - **Removed unused StealthConfig** from `crawl4ai/browser_manager.py`
 - **Updated documentation** with better examples and parameter explanations
 - **Fixed virtual scroll configuration** examples in docs
 ## 🧹 Code Cleanup
 Removed unused `StealthConfig` import and configuration that wasn't being used anywhere in the codebase. The project uses its own custom stealth implementation through JavaScript injection instead.
 ```python
 # Removed unused code:
 from playwright_stealth import StealthConfig
 stealth_config = StealthConfig(...)  # This was never used
 ```
 ## 📖 Documentation Updates
 - Fixed adaptive crawling parameter examples
 - Updated session management documentation
 - Corrected virtual scroll configuration examples
 ## 🚀 Installation
 ```bash
 pip install crawl4ai==0.7.1
 ```
 No breaking changes - upgrade directly from v0.7.0.
 ---
 Questions? Issues? 
 - GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
 - Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)
--- a/docs/examples/link_head_extraction_example.py
+++ b/docs/examples/link_head_extraction_example.py
@@ -18,7 +18,7 @@ Usage:
 import asyncio
 from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
-from crawl4ai.async_configs import LinkPreviewConfig
+from crawl4ai import LinkPreviewConfig
 async def basic_link_head_extraction():
--- a/docs/md_v2/blog/releases/0.7.0.md
+++ b/docs/md_v2/blog/releases/0.7.0.md
@@ -30,33 +30,40 @@ The Adaptive Crawler maintains a persistent state for each domain, tracking:
 ```python
 from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
 import asyncio
-# Initialize with custom adaptive parameters
+async def main():
 config = AdaptiveConfig(
    confidence_threshold=0.7,    # Min confidence to stop crawling
    max_depth=5,                # Maximum crawl depth
    max_pages=20,               # Maximum number of pages to crawl
    top_k_links=3,              # Number of top links to follow per page
    strategy="statistical",     # 'statistical' or 'embedding'
    coverage_weight=0.4,        # Weight for coverage in confidence calculation
    consistency_weight=0.3,     # Weight for consistency in confidence calculation
    saturation_weight=0.3       # Weight for saturation in confidence calculation
 )
 # Initialize adaptive crawler with web crawler
 async with AsyncWebCrawler() as crawler:
    adaptive_crawler = AdaptiveCrawler(crawler, config)
-    # Crawl and learn patterns
+    # Configure adaptive crawler
-    state = await adaptive_crawler.digest(
+    config = AdaptiveConfig(
-        start_url="https://news.example.com/article/12345",
+        strategy="statistical",  # or "embedding" for semantic understanding
-        query="latest news articles and content"
+        max_pages=10,
        confidence_threshold=0.7,  # Stop at 70% confidence
        top_k_links=3,  # Follow top 3 links per page
        min_gain_threshold=0.05  # Need 5% information gain to continue
    )
-    # Access results and confidence
+    async with AsyncWebCrawler(verbose=False) as crawler:
-    print(f"Confidence Level: {adaptive_crawler.confidence:.0%}")
+        adaptive = AdaptiveCrawler(crawler, config)
-    print(f"Pages Crawled: {len(state.crawled_urls)}")
+        
-    print(f"Knowledge Base: {len(adaptive_crawler.state.knowledge_base)} documents")
+        print("Starting adaptive crawl about Python decorators...")
        result = await adaptive.digest(
            start_url="https://docs.python.org/3/glossary.html",
            query="python decorators functions wrapping"
        )
        print(f"\n✅ Crawling Complete!")
        print(f"• Confidence Level: {adaptive.confidence:.0%}")
        print(f"• Pages Crawled: {len(result.crawled_urls)}")
        print(f"• Knowledge Base: {len(adaptive.state.knowledge_base)} documents")
        # Get most relevant content
        relevant = adaptive.get_relevant_content(top_k=3)
        print(f"\nMost Relevant Pages:")
        for i, page in enumerate(relevant, 1):
            print(f"{i}. {page['url']} (relevance: {page['score']:.2%})")
 asyncio.run(main())
 ```
 **Expected Real-World Impact:**
@@ -141,56 +148,47 @@ async with AsyncWebCrawler() as crawler:
 **My Solution:** I implemented a three-layer scoring system that analyzes links like a human would—considering their position, context, and relevance to your goals.
-### The Three-Layer Scoring System
+### Intelligent Link Analysis and Scoring
 ```python
-from crawl4ai import LinkPreviewConfig, CrawlerRunConfig, CacheMode
+import asyncio
 from crawl4ai import CrawlerRunConfig, CacheMode, AsyncWebCrawler
 from crawl4ai import LinkPreviewConfig
-# Configure intelligent link analysis
+async def main():
-link_config = LinkPreviewConfig(
+    # Configure intelligent link analysis
-    include_internal=True,
+    link_config = LinkPreviewConfig(
-    include_external=False,
+        include_internal=True,
-    max_links=10,
+        include_external=False,
-    concurrency=5,
+        max_links=10,
-    query="python tutorial",  # For contextual scoring
+        concurrency=5,
-    score_threshold=0.3,
+        query="python tutorial",  # For contextual scoring
-    verbose=True
+        score_threshold=0.3,
-)
+        verbose=True
 # Use in your crawl
 result = await crawler.arun(
    "https://tech-blog.example.com",
    config=CrawlerRunConfig(
        link_preview_config=link_config,
        score_links=True,  # Enable intrinsic scoring
        cache_mode=CacheMode.BYPASS
    )
-)
+    # Use in your crawl
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            "https://www.geeksforgeeks.org/",
            config=CrawlerRunConfig(
                link_preview_config=link_config,
                score_links=True,  # Enable intrinsic scoring
                cache_mode=CacheMode.BYPASS
            )
        )
-# Access scored and sorted links
+        # Access scored and sorted links
-if result.success and result.links:
+        if result.success and result.links:
-# Get scored links
+            for link in result.links.get("internal", []):
-internal_links = result.links.get("internal", [])
+                text = link.get('text', 'No text')[:40]
-scored_links = [l for l in internal_links if l.get("total_score")]
+                print(
-scored_links.sort(key=lambda x: x.get("total_score", 0), reverse=True)
+                    text,
                    f"{link.get('intrinsic_score', 0):.1f}/10" if link.get('intrinsic_score') is not None else "0.0/10",
                    f"{link.get('contextual_score', 0):.2f}/1" if link.get('contextual_score') is not None else "0.00/1",
                    f"{link.get('total_score', 0):.3f}" if link.get('total_score') is not None else "0.000"
                )
-# Create a scoring table
+asyncio.run(main())
 table = Table(title="Link Scoring Results", box=box.ROUNDED)
 table.add_column("Link Text", style="cyan", width=40)
 table.add_column("Intrinsic Score", justify="center")
 table.add_column("Contextual Score", justify="center")
 table.add_column("Total Score", justify="center", style="bold green")
 for link in scored_links[:5]:
    text = link.get('text', 'No text')[:40]
    table.add_row(
        text,
        f"{link.get('intrinsic_score', 0):.1f}/10",
        f"{link.get('contextual_score', 0):.2f}/1",
        f"{link.get('total_score', 0):.3f}"
    )
 console.print(table)
 ```
 **Scoring Components:**
@@ -223,58 +221,34 @@ console.print(table)
 ### Technical Architecture
 ```python
 import asyncio
 from crawl4ai import AsyncUrlSeeder, SeedingConfig
-# Basic discovery - find all product pages
+async def main():
-seeder_config = SeedingConfig(
+    async with AsyncUrlSeeder() as seeder:
-    # Discovery sources
+        # Discover Python tutorial URLs
-    source="cc+sitemap",        # Sitemap + Common Crawl
+        config = SeedingConfig(
-    
+            source="sitemap",  # Use sitemap
-    # Filtering
+            pattern="*python*",  # URL pattern filter
-    pattern="*/product/*",      # URL pattern matching
+            extract_head=True,  # Get metadata
-    
+            query="python tutorial",  # For relevance scoring
-    # Validation
+            scoring_method="bm25",
-    live_check=True,           # Verify URLs are alive
+            score_threshold=0.2,
-    max_urls=50,             # Stop at 50 URLs
+            max_urls=10
-    
+        )
-    # Performance  
+        
-    concurrency=100,           # Maximum concurrent requests for live checks/head extraction
+        print("Discovering Python async tutorial URLs...")
-    hits_per_sec=10           # Rate limit in requests per second to avoid overwhelming servers
+        urls = await seeder.urls("https://www.geeksforgeeks.org/", config)
-)
+        
        print(f"\n✅ Found {len(urls)} relevant URLs:")
        for i, url_info in enumerate(urls[:5], 1):
            print(f"\n{i}. {url_info['url']}")
            if url_info.get('relevance_score'):
                print(f"   Relevance: {url_info['relevance_score']:.3f}")
            if url_info.get('head_data', {}).get('title'):
                print(f"   Title: {url_info['head_data']['title'][:60]}...")
-async with AsyncUrlSeeder() as seeder:
+asyncio.run(main())
    console.print("Discovering URLs from Python docs...")
    urls = await seeder.urls("docs.python.org", seeding_config)
    console.print(f"\n✓ Discovered {len(urls)} URLs")
 # Advanced: Relevance-based discovery
 research_config = SeedingConfig(
    source="sitemap+cc",       # Sitemap + Common Crawl
    pattern="*/blog/*",        # Blog posts only
    # Content relevance
    extract_head=True,         # Get meta tags
    query="quantum computing tutorials",
    scoring_method="bm25",     # BM25 scoring method
    score_threshold=0.4,       # High relevance only
    # Smart filtering
    filter_nonsense_urls=True,  # Remove .xml, .txt, etc.
    force=True                 # Bypass cache
 )
 # Discover with progress tracking
 discovered = []
 async with AsyncUrlSeeder() as seeder:
    discovered = await seeder.urls("https://physics-blog.com", research_config)
    console.print(f"\n✓ Discovered {len(discovered)} URLs")
 # Results include scores and metadata
 for url_data in discovered[:5]:
    print(f"URL: {url_data['url']}")
    print(f"Score: {url_data['relevance_score']:.3f}")
    print(f"Title: {url_data['head_data']['title']}")
 ```
 **Discovery Methods:**
--- a/docs/md_v2/core/link-media.md
+++ b/docs/md_v2/core/link-media.md
@@ -125,7 +125,7 @@ Here's a full example you can copy, paste, and run immediately:
 ```python
 import asyncio
 from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
-from crawl4ai.async_configs import LinkPreviewConfig
+from crawl4ai import LinkPreviewConfig
 async def extract_link_heads_example():
    """
@@ -237,7 +237,7 @@ if __name__ == "__main__":
 The `LinkPreviewConfig` class supports these options:
 ```python
-from crawl4ai.async_configs import LinkPreviewConfig
+from crawl4ai import LinkPreviewConfig
 link_preview_config = LinkPreviewConfig(
    # BASIC SETTINGS
--- a/docs/releases_review/crawl4ai_v0_7_0_showcase.py
+++ b/docs/releases_review/crawl4ai_v0_7_0_showcase.py
@@ -28,7 +28,7 @@ from rich import box
 from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, AdaptiveCrawler, AdaptiveConfig, BrowserConfig, CacheMode
 from crawl4ai import AsyncUrlSeeder, SeedingConfig
-from crawl4ai.async_configs import LinkPreviewConfig, VirtualScrollConfig
+from crawl4ai import LinkPreviewConfig, VirtualScrollConfig
 from crawl4ai import c4a_compile, CompilationResult
 # Initialize Rich console for beautiful output
--- a/docs/releases_review/v0_7_0_features_demo.py
+++ b/docs/releases_review/v0_7_0_features_demo.py
@@ -13,14 +13,13 @@ from crawl4ai import (
    BrowserConfig,
    CacheMode,
    # New imports for v0.7.0
    LinkPreviewConfig,
    VirtualScrollConfig,
    LinkPreviewConfig,
    AdaptiveCrawler,
    AdaptiveConfig,
    AsyncUrlSeeder,
    SeedingConfig,
    c4a_compile,
    CompilationResult
 )
@@ -170,16 +169,16 @@ async def demo_url_seeder():
        # Discover Python tutorial URLs
        config = SeedingConfig(
            source="sitemap",  # Use sitemap
-            pattern="*tutorial*",  # URL pattern filter
+            pattern="*python*",  # URL pattern filter
            extract_head=True,  # Get metadata
-            query="python async programming",  # For relevance scoring
+            query="python tutorial",  # For relevance scoring
            scoring_method="bm25",
            score_threshold=0.2,
            max_urls=10
        )
        print("Discovering Python async tutorial URLs...")
-        urls = await seeder.urls("docs.python.org", config)
+        urls = await seeder.urls("https://www.geeksforgeeks.org/", config)
        print(f"\n✅ Found {len(urls)} relevant URLs:")
        for i, url_info in enumerate(urls[:5], 1):
@@ -245,39 +244,6 @@ IF (EXISTS `.price-filter`) THEN CLICK `input[data-max-price="100"]`
        print(f"❌ Compilation error: {result.first_error.message}")
 async def demo_pdf_support():
    """
    Demo 6: PDF Parsing Support
    Shows how to extract content from PDF files.
    Note: Requires 'pip install crawl4ai[pdf]'
    """
    print("\n" + "="*60)
    print("📄 DEMO 6: PDF Parsing Support")
    print("="*60)
    try:
        # Check if PDF support is installed
        import PyPDF2
        # Example: Process a PDF URL
        config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            pdf=True,  # Enable PDF generation
            extract_text_from_pdf=True  # Extract text content
        )
        print("PDF parsing is available!")
        print("You can now crawl PDF URLs and extract their content.")
        print("\nExample usage:")
        print('  result = await crawler.arun("https://example.com/document.pdf")')
        print('  pdf_text = result.extracted_content  # Contains extracted text')
    except ImportError:
        print("⚠️  PDF support not installed.")
        print("Install with: pip install crawl4ai[pdf]")
 async def main():
    """Run all demos"""
    print("\n🚀 Crawl4AI v0.7.0 Feature Demonstrations")
@@ -289,7 +255,6 @@ async def main():
        ("Virtual Scroll", demo_virtual_scroll),
        ("URL Seeder", demo_url_seeder),
        ("C4A Script", demo_c4a_script),
        ("PDF Support", demo_pdf_support)
    ]
    for name, demo_func in demos:
@@ -309,7 +274,6 @@ async def main():
    print("• Virtual Scroll: Capture all content from modern web pages")
    print("• URL Seeder: Pre-discover and filter URLs efficiently")
    print("• C4A Script: Simple language for complex automations")
    print("• PDF Support: Extract content from PDF documents")
 if __name__ == "__main__":
--- a/tests/docker/simple_api_test.py
+++ b/tests/docker/simple_api_test.py
@@ -0,0 +1,345 @@
 #!/usr/bin/env python3
 """
 Simple API Test for Crawl4AI Docker Server v0.7.0
 Uses only built-in Python modules to test all endpoints.
 """
 import json
 import sys
 import time
 import urllib.error
 import urllib.parse
 import urllib.request
 from typing import Dict, List, Optional
 # Configuration
 BASE_URL = "http://localhost:11234"  # Change to your server URL
 TEST_TIMEOUT = 30  # Passed as the urlopen() timeout (seconds) for every request


 class SimpleApiTester:
     """Smoke tester for the Crawl4AI Docker server HTTP API (stdlib only).

     Every request produces a result dict with the keys ``endpoint``,
     ``method``, ``status`` ("PASS"/"FAIL"), ``status_code`` and
     ``response_time``, plus ``data`` (parsed response body) and/or ``error``
     (exception text). Results are collected in ``self.results`` when they are
     passed to :meth:`print_result`.
     """

     # Icon printed in front of each result line, keyed by result status.
     _STATUS_ICONS = {
         "PASS": "✅",
         "FAIL": "❌",
         "SKIP": "⏭️"
     }

     def __init__(self, base_url: str = BASE_URL):
         self.base_url = base_url
         # Bearer token added to every request once set; None sends no header.
         self.token = None
         # Result dicts, in the order they were handed to print_result().
         self.results = []

     def log(self, message: str):
         """Print an informational message."""
         print(f"[INFO] {message}")

     @staticmethod
     def _parse_body(content: str):
         """Parse *content* as JSON; fall back to a 200-character raw preview."""
         try:
             return json.loads(content)
         except ValueError:
             # json.JSONDecodeError is a ValueError: the body is not JSON.
             return {"raw_response": content[:200]}

     @staticmethod
     def _failure_result(method: str, endpoint: str, response_time: float,
                         exc: BaseException) -> Dict:
         """Build the FAIL result dict for an exception raised by a request.

         urllib raises ``HTTPError`` for 4xx/5xx responses, so for those the
         real status code (and the response body, when readable) is kept
         instead of reporting ``status_code=None``.
         """
         result = {
             "endpoint": endpoint,
             "method": method,
             "status": "FAIL",
             "status_code": None,
             "response_time": response_time,
             "error": str(exc)
         }
         if isinstance(exc, urllib.error.HTTPError):
             result["status_code"] = exc.code
             try:
                 body = exc.read().decode('utf-8', errors='replace')
             except (OSError, ValueError):
                 # Best effort only: the error body may be closed or unreadable.
                 body = ""
             if body:
                 result["data"] = SimpleApiTester._parse_body(body)
         return result

     def _request(self, method: str, endpoint: str, payload=None,
                  send_json: bool = False) -> Dict:
         """Send one request and return its result dict.

         Args:
             method: HTTP method name, also recorded in the result.
             endpoint: Path (and query) appended to ``self.base_url``.
             payload: Object serialized as the JSON body when *send_json*.
             send_json: Whether to attach *payload* as an
                 ``application/json`` request body.
         """
         url = f"{self.base_url}{endpoint}"
         start_time = time.time()
         try:
             body = json.dumps(payload).encode('utf-8') if send_json else None
             req = urllib.request.Request(url, data=body, method=method)
             if send_json:
                 req.add_header('Content-Type', 'application/json')
             if self.token:
                 req.add_header('Authorization', f'Bearer {self.token}')
             with urllib.request.urlopen(req, timeout=TEST_TIMEOUT) as response:
                 # Timed once the response is opened, before the body is read.
                 response_time = time.time() - start_time
                 status_code = response.getcode()
                 content = response.read().decode('utf-8')
                 return {
                     "endpoint": endpoint,
                     "method": method,
                     "status": "PASS" if status_code < 400 else "FAIL",
                     "status_code": status_code,
                     "response_time": response_time,
                     "data": self._parse_body(content)
                 }
         except Exception as e:
             # Harness boundary: any failure becomes a FAIL result, never a crash.
             return self._failure_result(method, endpoint,
                                         time.time() - start_time, e)

     def test_get_endpoint(self, endpoint: str) -> Dict:
         """Test a GET endpoint and return its result dict."""
         return self._request("GET", endpoint)

     def test_post_endpoint(self, endpoint: str, payload: Dict) -> Dict:
         """Test a POST endpoint with a JSON *payload*; return its result dict."""
         return self._request("POST", endpoint, payload, send_json=True)

     def print_result(self, result: Dict):
         """Print a formatted test result and record it in ``self.results``."""
         print(f"{self._STATUS_ICONS[result['status']]} {result['method']} {result['endpoint']} "
               f"| {result['response_time']:.3f}s | Status: {result['status_code'] or 'N/A'}")
         if result['status'] == 'FAIL' and 'error' in result:
             print(f"    Error: {result['error']}")
         self.results.append(result)

     def run_all_tests(self):
         """Run all API tests, printing each result and then the summary."""
         print("🚀 Starting Crawl4AI v0.7.0 API Test Suite")
         print(f"📡 Testing server at: {self.base_url}")
         print("=" * 60)

         # NOTE: the basic-endpoint and authentication checks below are
         # currently disabled; they are kept for reference.
         # # Test basic endpoints
         # print("\n=== BASIC ENDPOINTS ===")
         # # Health check
         # result = self.test_get_endpoint("/health")
         # self.print_result(result)
         # # Schema endpoint
         # result = self.test_get_endpoint("/schema")
         # self.print_result(result)
         # # Metrics endpoint
         # result = self.test_get_endpoint("/metrics")
         # self.print_result(result)
         # # Root redirect
         # result = self.test_get_endpoint("/")
         # self.print_result(result)
         # # Test authentication
         # print("\n=== AUTHENTICATION ===")
         # # Get token
         # token_payload = {"email": "test@example.com"}
         # result = self.test_post_endpoint("/token", token_payload)
         # self.print_result(result)
         # # Extract token if successful
         # if result['status'] == 'PASS' and 'data' in result:
         #     token = result['data'].get('access_token')
         #     if token:
         #         self.token = token
         #         self.log(f"Successfully obtained auth token: {token[:20]}...")

         # Test core APIs
         print("\n=== CORE APIs ===")
         test_url = "https://example.com"

         # Test markdown endpoint
         md_payload = {
             "url": test_url,
             "f": "fit",
             "q": "test query",
             "c": "0"
         }
         result = self.test_post_endpoint("/md", md_payload)
         # print(result['data'].get('markdown', ''))
         self.print_result(result)

         # Test HTML endpoint
         html_payload = {"url": test_url}
         result = self.test_post_endpoint("/html", html_payload)
         self.print_result(result)

         # Test screenshot endpoint
         screenshot_payload = {
             "url": test_url,
             "screenshot_wait_for": 2
         }
         result = self.test_post_endpoint("/screenshot", screenshot_payload)
         self.print_result(result)

         # Test PDF endpoint
         pdf_payload = {"url": test_url}
         result = self.test_post_endpoint("/pdf", pdf_payload)
         self.print_result(result)

         # Test JavaScript execution
         js_payload = {
             "url": test_url,
             "scripts": ["(() => document.title)()"]
         }
         result = self.test_post_endpoint("/execute_js", js_payload)
         self.print_result(result)

         # Test crawl endpoint
         crawl_payload = {
             "urls": [test_url],
             "browser_config": {},
             "crawler_config": {}
         }
         result = self.test_post_endpoint("/crawl", crawl_payload)
         self.print_result(result)

         # Test config dump
         config_payload = {"code": "CrawlerRunConfig()"}
         result = self.test_post_endpoint("/config/dump", config_payload)
         self.print_result(result)

         # Test LLM endpoint
         llm_endpoint = f"/llm/{test_url}?q=Extract%20main%20content"
         result = self.test_get_endpoint(llm_endpoint)
         self.print_result(result)

         # Test ask endpoint (the raw result dict is printed as well)
         ask_endpoint = "/ask?context_type=all&query=crawl4ai&max_results=5"
         result = self.test_get_endpoint(ask_endpoint)
         print(result)
         self.print_result(result)

         # Test job APIs
         print("\n=== JOB APIs ===")

         # Test LLM job
         llm_job_payload = {
             "url": test_url,
             "q": "Extract main content",
             "cache": False
         }
         result = self.test_post_endpoint("/llm/job", llm_job_payload)
         self.print_result(result)

         # Test crawl job
         crawl_job_payload = {
             "urls": [test_url],
             "browser_config": {},
             "crawler_config": {}
         }
         result = self.test_post_endpoint("/crawl/job", crawl_job_payload)
         self.print_result(result)

         # Test MCP
         print("\n=== MCP APIs ===")

         # Test MCP schema
         result = self.test_get_endpoint("/mcp/schema")
         self.print_result(result)

         # Test error handling
         print("\n=== ERROR HANDLING ===")

         # Test invalid URL
         invalid_payload = {"url": "invalid-url", "f": "fit"}
         result = self.test_post_endpoint("/md", invalid_payload)
         self.print_result(result)

         # Test invalid endpoint
         result = self.test_get_endpoint("/nonexistent")
         self.print_result(result)

         # Print summary
         self.print_summary()

     def print_summary(self):
         """Print pass/fail totals and timings, then write a JSON report.

         The report is written to ``crawl4ai_test_report_<unix-time>.json`` in
         the current working directory.
         """
         print("\n" + "=" * 60)
         print("📊 TEST RESULTS SUMMARY")
         print("=" * 60)

         total = len(self.results)
         passed = sum(1 for r in self.results if r['status'] == 'PASS')
         failed = sum(1 for r in self.results if r['status'] == 'FAIL')
         # Guard against division by zero when no result was recorded.
         success_rate = (passed / total) * 100 if total else 0.0

         print(f"Total Tests: {total}")
         print(f"✅ Passed: {passed}")
         print(f"❌ Failed: {failed}")
         print(f"📈 Success Rate: {success_rate:.1f}%")

         if failed > 0:
             print("\n❌ FAILED TESTS:")
             for result in self.results:
                 if result['status'] == 'FAIL':
                     print(f"  • {result['method']} {result['endpoint']}")
                     if 'error' in result:
                         print(f"    Error: {result['error']}")

         # Performance statistics
         response_times = [r['response_time'] for r in self.results if r['response_time'] > 0]
         if response_times:
             avg_time = sum(response_times) / len(response_times)
             max_time = max(response_times)
             print(f"\n⏱️  Average Response Time: {avg_time:.3f}s")
             print(f"⏱️  Max Response Time: {max_time:.3f}s")

         # Save detailed report
         report_file = f"crawl4ai_test_report_{int(time.time())}.json"
         with open(report_file, 'w', encoding='utf-8') as f:
             json.dump({
                 "timestamp": time.time(),
                 "server_url": self.base_url,
                 "version": "0.7.0",
                 "summary": {
                     "total": total,
                     "passed": passed,
                     "failed": failed
                 },
                 "results": self.results
             }, f, indent=2)
         print(f"\n📄 Detailed report saved to: {report_file}")
 def main():
     """Parse the command line and run the full API test suite.

     Accepts ``--url`` (defaults to ``BASE_URL``). A keyboard interrupt is
     reported and the function returns normally; any other exception is
     reported and the process exits with status 1.
     """
     import argparse

     cli = argparse.ArgumentParser(description='Crawl4AI v0.7.0 API Test Suite')
     cli.add_argument('--url', default=BASE_URL, help='Base URL of the server')
     options = cli.parse_args()

     # Built outside the try block so constructor errors are not swallowed.
     suite = SimpleApiTester(options.url)
     try:
         suite.run_all_tests()
     except KeyboardInterrupt:
         print("\n🛑 Test suite interrupted by user")
     except Exception as exc:
         print(f"\n💥 Test suite failed with error: {exc}")
         sys.exit(1)
 # Run the test suite only when this file is executed as a script.
 if __name__ == "__main__":
    main()
--- a/tests/test_link_extractor.py
+++ b/tests/test_link_extractor.py
@@ -5,7 +5,7 @@ Test script for Link Extractor functionality
 from crawl4ai.models import Link
 from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
-from crawl4ai.async_configs import LinkPreviewConfig
+from crawl4ai import LinkPreviewConfig
 import asyncio
 import sys
 import os
@@ -237,7 +237,7 @@ def test_config_examples():
            print(f"     {key}: {value}")
        print("   Usage:")
-        print("     from crawl4ai.async_configs import LinkPreviewConfig")
+        print("     from crawl4ai import LinkPreviewConfig")
        print("     config = CrawlerRunConfig(")
        print("         link_preview_config=LinkPreviewConfig(")
        for key, value in config_dict.items():
Author	SHA1	Message	Date
unclecode	0163bd797c	Merge branch 'release/v0.7.1'	2025-07-17 17:42:04 +08:00
ntohidi	26bad799e4	chore: update version to 0.7.1	2025-07-17 11:37:41 +02:00
ntohidi	cf8badfe27	feat: cleanup unused code and enhance documentation for v0.7.1 - Remove unused StealthConfig from browser_manager.py - Update LinkPreviewConfig import path in __init__.py and examples - Fix infinity handling in content_scraping_strategy.py (use 0 instead of float('inf')) - Remove sanitize_json_data functions from API endpoints - Add comprehensive C4A Script documentation to release notes - Update v0.7.0 release notes with improved code examples - Create v0.7.1 release notes focusing on cleanup and documentation improvements - Update demo files with corrected import paths and examples - Fix virtual scroll and adaptive crawling examples across documentation 🤖 Generated with Claude Code Co-Authored-By: Claude <noreply@anthropic.com>	2025-07-17 11:35:16 +02:00
ntohidi	ccbe3c105c	refactor: improve link scoring output format in release notes	2025-07-17 09:13:20 +02:00
Nasrin	761c19d54b	Merge pull request #1307 from unclecode/fix/json-infinity-serialization fix: Handle infinity values in JSON serialization for API responses	2025-07-16 13:34:25 +02:00
Nasrin	14b0ecb137	Merge pull request #1305 from unclecode/fix/release-notes-demo-code Fix: Update release notes and demo code	2025-07-16 13:33:53 +02:00
ntohidi	0eaa9f9895	fix: handle infinity values in JSON serialization for API responses - Add sanitize_json_data() function to convert infinity/NaN to JSON-compliant strings - Fix /execute_js endpoint returning ValueError: Out of range float values are not JSON compliant: inf - Fix /crawl endpoint batch responses with infinity values - Fix /crawl/stream endpoint streaming responses with infinity values - Fix /crawl/job endpoint background job responses with infinity values The sanitize_json_data() function recursively processes response data: - float('inf') → \"Infinity\" - float('-inf') → \"-Infinity\" - float('nan') → \"NaN\" This prevents JSON serialization errors when JavaScript execution or crawling operations produce infinity values, ensuring all API endpoints return valid JSON. Fixes: API endpoints crashing with infinity JSON serialization errors Affects: /execute_js, /crawl, /crawl/stream, /crawl/job endpoints	2025-07-15 13:49:07 +02:00
UncleCode	bde1bba6a2	docs: Add missing documentation pages to mkdocs.yml - Added Adaptive Crawling to Core section - Added URL Seeding to Core section - Added Adaptive Strategies to Advanced section	2025-07-12 19:56:33 +08:00