feat(crawl4ai): Update to version 0.7.1 with improvements and new tests

This commit includes several updates to the crawl4ai package, including changes to the browser manager and content scraping strategy. The version number has been updated to 0.7.1. Significant modifications have been made to the documentation, including updates to the release notes for version 0.7.0 and the addition of release notes for version 0.7.1. Examples and core documentation have also been updated to reflect the changes in this version. Additionally, a new simple API test has been added to the Docker tests. These changes were made to improve the functionality of the crawl4ai package and to provide clearer, more up-to-date documentation for users. The new test will help ensure the API is working as expected. BREAKING CHANGE: The updates to the browser manager and content scraping strategy may affect how these components interact with the rest of the package. Users should review the updated documentation for details on these changes.
2025-07-18 16:27:19 +08:00
parent 7b80eb6b99
commit 8a04351406
18 changed files with 709 additions and 485 deletions
--- a/docs/md_v2/core/adaptive-crawling.md
+++ b/docs/md_v2/core/adaptive-crawling.md
@@ -35,7 +35,7 @@ from crawl4ai import AsyncWebCrawler, AdaptiveCrawler

 async def main():
    async with AsyncWebCrawler() as crawler:
-        # Create an adaptive crawler
+        # Create an adaptive crawler (config is optional)
        adaptive = AdaptiveCrawler(crawler)
        
        # Start crawling with a query
@@ -59,13 +59,13 @@ async def main():
 from crawl4ai import AdaptiveConfig

 config = AdaptiveConfig(
-    confidence_threshold=0.7,    # Stop when 70% confident (default: 0.8)
-    max_pages=20,               # Maximum pages to crawl (default: 50)
-    top_k_links=3,              # Links to follow per page (default: 5)
+    confidence_threshold=0.8,    # Stop when 80% confident (default: 0.7)
+    max_pages=30,               # Maximum pages to crawl (default: 20)
+    top_k_links=5,              # Links to follow per page (default: 3)
    min_gain_threshold=0.05     # Minimum expected gain to continue (default: 0.1)
 )

-adaptive = AdaptiveCrawler(crawler, config=config)
+adaptive = AdaptiveCrawler(crawler, config)
 ```

 ## Crawling Strategies
@@ -198,8 +198,8 @@ if result.metrics.get('is_irrelevant', False):
 The confidence score (0-1) indicates how sufficient the gathered information is:
 - **0.0-0.3**: Insufficient information, needs more crawling
 - **0.3-0.6**: Partial information, may answer basic queries
- **0.6-0.8**: Good coverage, can answer most queries
- **0.8-1.0**: Excellent coverage, comprehensive information
+- **0.6-0.7**: Good coverage, can answer most queries
+- **0.7-1.0**: Excellent coverage, comprehensive information

 ### Statistics Display

@@ -257,9 +257,9 @@ new_adaptive.import_knowledge_base("knowledge_base.jsonl")
 - Avoid overly broad queries

 ### 2. Threshold Tuning
- Start with default (0.8) for general use
- Lower to 0.6-0.7 for exploratory crawling
- Raise to 0.9+ for exhaustive coverage
+- Start with default (0.7) for general use
+- Lower to 0.5-0.6 for exploratory crawling
+- Raise to 0.8+ for exhaustive coverage

 ### 3. Performance Optimization
 - Use appropriate `max_pages` limits
--- a/docs/md_v2/core/link-media.md
+++ b/docs/md_v2/core/link-media.md
@@ -125,7 +125,7 @@ Here's a full example you can copy, paste, and run immediately:
 ```python
 import asyncio
 from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
-from crawl4ai.async_configs import LinkPreviewConfig
+from crawl4ai import LinkPreviewConfig

 async def extract_link_heads_example():
    """
@@ -237,7 +237,7 @@ if __name__ == "__main__":
 The `LinkPreviewConfig` class supports these options:

 ```python
-from crawl4ai.async_configs import LinkPreviewConfig
+from crawl4ai import LinkPreviewConfig

 link_preview_config = LinkPreviewConfig(
    # BASIC SETTINGS
--- a/docs/md_v2/core/url-seeding.md
+++ b/docs/md_v2/core/url-seeding.md
@@ -137,7 +137,7 @@ async def smart_blog_crawler():
            word_count_threshold=300  # Only substantial articles
        )
        
-        # Extract URLs and stream results as they come
+        # Extract URLs and crawl them
        tutorial_urls = [t["url"] for t in tutorials[:10]]
        results = await crawler.arun_many(tutorial_urls, config=config)
        
@@ -231,7 +231,7 @@ Common Crawl is a massive public dataset that regularly crawls the entire web. I

 ```python
 # Use both sources
-config = SeedingConfig(source="cc+sitemap")
+config = SeedingConfig(source="sitemap+cc")
 urls = await seeder.urls("example.com", config)
 ```

@@ -241,13 +241,13 @@ The `SeedingConfig` object is your control panel. Here's everything you can conf

 | Parameter | Type | Default | Description |
 |-----------|------|---------|-------------|
-| `source` | str | "cc" | URL source: "cc" (Common Crawl), "sitemap", or "cc+sitemap" |
+| `source` | str | "sitemap+cc" | URL source: "cc" (Common Crawl), "sitemap", or "sitemap+cc" |
 | `pattern` | str | "*" | URL pattern filter (e.g., "*/blog/*", "*.html") |
 | `extract_head` | bool | False | Extract metadata from page `<head>` |
 | `live_check` | bool | False | Verify URLs are accessible |
 | `max_urls` | int | -1 | Maximum URLs to return (-1 = unlimited) |
 | `concurrency` | int | 10 | Parallel workers for fetching |
-| `hits_per_sec` | int | None | Rate limit for requests |
+| `hits_per_sec` | int | 5 | Rate limit for requests |
 | `force` | bool | False | Bypass cache, fetch fresh data |
 | `verbose` | bool | False | Show detailed progress |
 | `query` | str | None | Search query for BM25 scoring |
@@ -522,7 +522,7 @@ urls = await seeder.urls("docs.example.com", config)
 ```python
 # Find specific products
 config = SeedingConfig(
-    source="cc+sitemap",  # Use both sources
+    source="sitemap+cc",  # Use both sources
    extract_head=True,
    query="wireless headphones noise canceling",
    scoring_method="bm25",
@@ -782,7 +782,7 @@ class ResearchAssistant:
        
        # Step 1: Discover relevant URLs
        config = SeedingConfig(
-            source="cc+sitemap",     # Maximum coverage
+            source="sitemap+cc",     # Maximum coverage
            extract_head=True,       # Get metadata
            query=topic,             # Research topic
            scoring_method="bm25",   # Smart scoring
@@ -832,7 +832,8 @@ class ResearchAssistant:
            # Extract URLs and crawl all articles
            article_urls = [article['url'] for article in top_articles]
            results = []
-            async for result in await crawler.arun_many(article_urls, config=config):
+            crawl_results = await crawler.arun_many(article_urls, config=config)
+            async for result in crawl_results:
                if result.success:
                    results.append({
                        'url': result.url,
@@ -933,10 +934,10 @@ config = SeedingConfig(concurrency=10, hits_per_sec=5)
 # When crawling many URLs
 async with AsyncWebCrawler() as crawler:
    # Assuming urls is a list of URL strings
-    results = await crawler.arun_many(urls, config=config)
+    crawl_results = await crawler.arun_many(urls, config=config)
    
    # Process as they arrive
-    async for result in results:
+    async for result in crawl_results:
        process_immediately(result)  # Don't wait for all
 ```

@@ -1020,7 +1021,7 @@ config = SeedingConfig(

 # E-commerce product discovery
 config = SeedingConfig(
-    source="cc+sitemap",
+    source="sitemap+cc",
    pattern="*/product/*",
    extract_head=True,
    live_check=True