Release v0.7.0-r1: The Adaptive Intelligence Update
- Bump version to 0.7.0
- Add release notes and demo files
- Update README with v0.7.0 features
- Update Docker configurations for v0.7.0-r1
- Move v0.7.0 demo files to releases_review
- Fix BM25 scoring bug in URLSeeder

Major features:
- Adaptive Crawling with pattern learning
- Virtual Scroll support for infinite pages
- Link Preview with 3-layer scoring
- Async URL Seeder for massive discovery
- Performance optimizations
@@ -1,7 +1,7 @@
 # crawl4ai/__version__.py
 
 # This is the version that will be used for stable releases
-__version__ = "0.6.3"
+__version__ = "0.7.0"
 
 # For nightly builds, this gets set during build process
 __nightly_version__ = None
@@ -1659,22 +1659,57 @@ class SeedingConfig:
     """
     def __init__(
         self,
-        source: str = "sitemap+cc",  # Options: "sitemap", "cc", "sitemap+cc"
-        pattern: Optional[str] = "*",  # URL pattern to filter discovered URLs (e.g., "*example.com/blog/*")
-        live_check: bool = False,  # Whether to perform HEAD requests to verify URL liveness
-        extract_head: bool = False,  # Whether to fetch and parse <head> section for metadata
-        max_urls: int = -1,  # Maximum number of URLs to discover (default: -1 for no limit)
-        concurrency: int = 1000,  # Maximum concurrent requests for live checks/head extraction
-        hits_per_sec: int = 5,  # Rate limit in requests per second
-        force: bool = False,  # If True, bypasses the AsyncUrlSeeder's internal .jsonl cache
-        base_directory: Optional[str] = None,  # Base directory for UrlSeeder's cache files (.jsonl)
-        llm_config: Optional[LLMConfig] = None,  # Forward LLM config for future use (e.g., relevance scoring)
-        verbose: Optional[bool] = None,  # Override crawler's general verbose setting
-        query: Optional[str] = None,  # Search query for relevance scoring
-        score_threshold: Optional[float] = None,  # Minimum relevance score to include URL (0.0-1.0)
-        scoring_method: str = "bm25",  # Scoring method: "bm25" (default), future: "semantic"
-        filter_nonsense_urls: bool = True,  # Filter out utility URLs like robots.txt, sitemap.xml, etc.
+        source: str = "sitemap+cc",
+        pattern: Optional[str] = "*",
+        live_check: bool = False,
+        extract_head: bool = False,
+        max_urls: int = -1,
+        concurrency: int = 1000,
+        hits_per_sec: int = 5,
+        force: bool = False,
+        base_directory: Optional[str] = None,
+        llm_config: Optional[LLMConfig] = None,
+        verbose: Optional[bool] = None,
+        query: Optional[str] = None,
+        score_threshold: Optional[float] = None,
+        scoring_method: str = "bm25",
+        filter_nonsense_urls: bool = True,
     ):
+        """
+        Initialize URL seeding configuration.
+
+        Args:
+            source: Discovery source(s) to use. Options: "sitemap", "cc" (Common Crawl),
+                or "sitemap+cc" (both). Default: "sitemap+cc"
+            pattern: URL pattern to filter discovered URLs (e.g., "*example.com/blog/*").
+                Supports glob-style wildcards. Default: "*" (all URLs)
+            live_check: Whether to perform HEAD requests to verify URL liveness.
+                Default: False
+            extract_head: Whether to fetch and parse <head> section for metadata extraction.
+                Required for BM25 relevance scoring. Default: False
+            max_urls: Maximum number of URLs to discover. Use -1 for no limit.
+                Default: -1
+            concurrency: Maximum concurrent requests for live checks/head extraction.
+                Default: 1000
+            hits_per_sec: Rate limit in requests per second to avoid overwhelming servers.
+                Default: 5
+            force: If True, bypasses the AsyncUrlSeeder's internal .jsonl cache and
+                re-fetches URLs. Default: False
+            base_directory: Base directory for UrlSeeder's cache files (.jsonl).
+                If None, uses default ~/.crawl4ai/. Default: None
+            llm_config: LLM configuration for future features (e.g., semantic scoring).
+                Currently unused. Default: None
+            verbose: Override crawler's general verbose setting for seeding operations.
+                Default: None (inherits from crawler)
+            query: Search query for BM25 relevance scoring (e.g., "python tutorials").
+                Requires extract_head=True. Default: None
+            score_threshold: Minimum relevance score (0.0-1.0) to include URL.
+                Only applies when query is provided. Default: None
+            scoring_method: Scoring algorithm to use. Currently only "bm25" is supported.
+                Future: "semantic". Default: "bm25"
+            filter_nonsense_urls: Filter out utility URLs like robots.txt, sitemap.xml,
+                ads.txt, favicon.ico, etc. Default: True
+        """
         self.source = source
         self.pattern = pattern
         self.live_check = live_check
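For context, a minimal usage sketch showing how the documented parameters combine. The AsyncUrlSeeder.urls() entry point follows the v0.7.0 docs; treat the exact import paths, domain, and result schema here as illustrative:

    import asyncio
    from crawl4ai import AsyncUrlSeeder, SeedingConfig

    async def main():
        config = SeedingConfig(
            source="sitemap+cc",              # discover via sitemaps and Common Crawl
            pattern="*example.com/blog/*",    # glob filter on discovered URLs
            extract_head=True,                # required for BM25 relevance scoring
            query="python tutorials",         # BM25 query over <head> metadata
            score_threshold=0.4,              # drop URLs scoring below 0.4
            max_urls=100,
        )
        async with AsyncUrlSeeder() as seeder:
            results = await seeder.urls("example.com", config)
            for entry in results[:5]:
                print(entry["url"], entry.get("relevance_score"))

    asyncio.run(main())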
@@ -424,10 +424,21 @@ class AsyncUrlSeeder:
         self._log("info", "Finished URL seeding for {domain}. Total URLs: {count}",
                   params={"domain": domain, "count": len(results)}, tag="URL_SEED")
 
-        # Sort by relevance score if query was provided
+        # Apply BM25 scoring if query was provided
         if query and extract_head and scoring_method == "bm25":
-            results.sort(key=lambda x: x.get(
-                "relevance_score", 0.0), reverse=True)
+            # Apply collective BM25 scoring across all documents
+            results = await self._apply_bm25_scoring(results, config)
+
+            # Filter by score threshold if specified
+            if score_threshold is not None:
+                original_count = len(results)
+                results = [r for r in results if r.get("relevance_score", 0) >= score_threshold]
+                if original_count > len(results):
+                    self._log("info", "Filtered {filtered} URLs below score threshold {threshold}",
+                              params={"filtered": original_count - len(results), "threshold": score_threshold}, tag="URL_SEED")
+
+            # Sort by relevance score
+            results.sort(key=lambda x: x.get("relevance_score", 0.0), reverse=True)
+            self._log("info", "Sorted {count} URLs by relevance score for query: '{query}'",
+                      params={"count": len(results), "query": query}, tag="URL_SEED")
         elif query and not extract_head:
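The body of _apply_bm25_scoring is not part of this hunk. A minimal sketch of what "collective" scoring means, assuming the rank_bm25 package (consistent with the bm25.get_scores() call further down); the function name and head-data flattening below are hypothetical, not project code: build one BM25 index over every document's head text so IDF statistics come from the whole batch, then score all documents against the query in one pass.

    from rank_bm25 import BM25Okapi  # matches the bm25.get_scores() usage below

    def collective_bm25(results: list[dict], query: str) -> list[dict]:
        # Hypothetical standalone analogue of _apply_bm25_scoring:
        # one corpus, shared IDF statistics, one scoring pass.
        if not results:
            return results
        texts = [
            " ".join(str(v) for v in r.get("head_data", {}).values())
            for r in results
        ]
        corpus = [t.lower().split() for t in texts]
        bm25 = BM25Okapi(corpus)                         # IDF computed over ALL documents
        scores = bm25.get_scores(query.lower().split())
        # Min-max normalize to 0-1, mirroring _calculate_bm25_score below.
        lo, hi = min(scores), max(scores)
        for r, s in zip(results, scores):
            r["relevance_score"] = 0.5 if hi == lo else float((s - lo) / (hi - lo))
        return results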
@@ -982,28 +993,6 @@ class AsyncUrlSeeder:
                         "head_data": head_data,
                     }
 
-                    # Apply BM25 scoring if query is provided and head data exists
-                    if query and ok and scoring_method == "bm25" and head_data:
-                        text_context = self._extract_text_context(head_data)
-                        if text_context:
-                            # Calculate BM25 score for this single document
-                            # scores = self._calculate_bm25_score(query, [text_context])
-                            scores = await asyncio.to_thread(self._calculate_bm25_score, query, [text_context])
-                            relevance_score = scores[0] if scores else 0.0
-                            entry["relevance_score"] = float(relevance_score)
-                        else:
-                            # No text context, use URL-based scoring as fallback
-                            relevance_score = self._calculate_url_relevance_score(
-                                query, entry["url"])
-                            entry["relevance_score"] = float(relevance_score)
-                    elif query:
-                        # Query provided but no head data - we reject this entry
-                        self._log("debug", "No head data for {url}, using URL-based scoring",
-                                  params={"url": url}, tag="URL_SEED")
-                        return
-                        # relevance_score = self._calculate_url_relevance_score(query, entry["url"])
-                        # entry["relevance_score"] = float(relevance_score)
 
                 elif live:
                     self._log("debug", "Performing live check for {url}", params={
                         "url": url}, tag="URL_SEED")
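This removal is the heart of the "Fix BM25 scoring bug in URLSeeder" item above: each URL's head text was scored against a one-document corpus, so BM25's IDF statistics were degenerate and scores were not comparable across URLs. A standalone demonstration using rank_bm25 (not project code):

    from rank_bm25 import BM25Okapi

    query = "python tutorials".split()

    # Per-URL scoring built a ONE-document corpus for each head text.
    for text in ["python tutorials for beginners", "cat pictures gallery"]:
        scores = BM25Okapi([text.split()]).get_scores(query)
        print(text, "->", scores[0])
    # With a single-document corpus, every in-corpus term gets the same
    # epsilon-adjusted IDF (negative here), so the matching page can even
    # score below zero while the non-matching page scores exactly 0.0.
    # Each number comes from different corpus statistics, so ranking
    # URLs by these values is meaningless; scoring the whole batch in
    # one corpus (the new collective pass) restores a usable ranking.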
@@ -1013,35 +1002,13 @@ class AsyncUrlSeeder:
                               params={"status": status.upper(), "url": url}, tag="URL_SEED")
                     entry = {"url": url, "status": status, "head_data": {}}
 
-                    # Apply URL-based scoring if query is provided
-                    if query:
-                        relevance_score = self._calculate_url_relevance_score(
-                            query, url)
-                        entry["relevance_score"] = float(relevance_score)
-
                 else:
                     entry = {"url": url, "status": "unknown", "head_data": {}}
 
-                    # Apply URL-based scoring if query is provided
-                    if query:
-                        relevance_score = self._calculate_url_relevance_score(
-                            query, url)
-                        entry["relevance_score"] = float(relevance_score)
-
-                # Now decide whether to add the entry based on score threshold
-                if query and "relevance_score" in entry:
-                    if score_threshold is None or entry["relevance_score"] >= score_threshold:
-                        if live or extract:
-                            await self._cache_set(cache_kind, url, entry)
-                        res_list.append(entry)
-                    else:
-                        self._log("debug", "URL {url} filtered out with score {score} < {threshold}",
-                                  params={"url": url, "score": entry["relevance_score"], "threshold": score_threshold}, tag="URL_SEED")
-                else:
-                    # No query or no scoring - add as usual
-                    if live or extract:
-                        await self._cache_set(cache_kind, url, entry)
-                    res_list.append(entry)
+                # Add entry to results (scoring will be done later)
+                if live or extract:
+                    await self._cache_set(cache_kind, url, entry)
+                res_list.append(entry)
 
     async def _head_ok(self, url: str, timeout: int) -> bool:
         try:
@@ -1436,8 +1403,19 @@ class AsyncUrlSeeder:
             scores = bm25.get_scores(query_tokens)
 
-            # Normalize scores to 0-1 range
-            max_score = max(scores) if max(scores) > 0 else 1.0
-            normalized_scores = [score / max_score for score in scores]
+            # BM25 can return negative scores, so we need to handle the full range
+            if len(scores) == 0:
+                return []
+
+            min_score = min(scores)
+            max_score = max(scores)
+
+            # If all scores are the same, return 0.5 for all
+            if max_score == min_score:
+                return [0.5] * len(scores)
+
+            # Normalize to 0-1 range using min-max normalization
+            normalized_scores = [(score - min_score) / (max_score - min_score) for score in scores]
 
             return normalized_scores
         except Exception as e:
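The replacement handles the negative scores BM25 can produce (see the single-corpus example above). A quick arithmetic check of the two normalizations on made-up scores:

    scores = [-1.2, 0.0, 2.4]

    # Old: divide by max; negative inputs stay negative, outside the 0-1 range.
    max_score = max(scores) if max(scores) > 0 else 1.0
    print([round(s / max_score, 2) for s in scores])         # [-0.5, 0.0, 1.0]

    # New: min-max normalization always lands in 0-1.
    lo, hi = min(scores), max(scores)
    print([round((s - lo) / (hi - lo), 2) for s in scores])  # [0.0, 0.33, 1.0]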