feat(deep-crawling): improve URL normalization and domain filtering

Enhance URL handling in deep crawling with:

- New URL normalization functions for consistent URL formats
- Improved domain filtering with subdomain support
- Added URLPatternFilter to public API
- Better URL deduplication in BFS strategy

These changes improve crawling accuracy and reduce duplicate visits.
2025-03-06 22:45:57 +08:00
parent 1b72880007
commit f78c46446b
6 changed files with 186 additions and 14 deletions
--- a/crawl4ai/init.py
+++ b/crawl4ai/init.py
@@ -48,8 +48,9 @@ from .deep_crawling import (
    DeepCrawlStrategy,
    BFSDeepCrawlStrategy,
    FilterChain,
-    ContentTypeFilter,
+    URLPatternFilter,
    DomainFilter,
+    ContentTypeFilter,
    URLFilter,
    FilterStats,
    SEOFilter,
@@ -75,6 +76,7 @@ __all__ = [
    "BestFirstCrawlingStrategy",
    "DFSDeepCrawlStrategy",
    "FilterChain",
+    "URLPatternFilter",
    "ContentTypeFilter",
    "DomainFilter",
    "FilterStats",
--- a/crawl4ai/version.py
+++ b/crawl4ai/version.py
@@ -1,2 +1,2 @@
 # crawl4ai/_version.py
-__version__ = "0.5.0.post3"
+__version__ = "0.5.0.post4"
--- a/crawl4ai/deep_crawling/bfs_strategy.py
+++ b/crawl4ai/deep_crawling/bfs_strategy.py
@@ -10,6 +10,7 @@ from .filters import FilterChain
 from .scorers import URLScorer
 from . import DeepCrawlStrategy  
 from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult
+from ..utils import normalize_url_for_deep_crawl, efficient_normalize_url_for_deep_crawl
 from math import inf as infinity

 class BFSDeepCrawlStrategy(DeepCrawlStrategy):
@@ -99,14 +100,17 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
        # First collect all valid links
        for link in links:
            url = link.get("href")
-            if url in visited:
+            # Strip URL fragments to avoid duplicate crawling
+            # base_url = url.split('#')[0] if url else url
+            base_url = normalize_url_for_deep_crawl(url, source_url)
+            if base_url in visited:
                continue
            if not await self.can_process_url(url, next_depth):
                self.stats.urls_skipped += 1
                continue

            # Score the URL if a scorer is provided
-            score = self.url_scorer.score(url) if self.url_scorer else 0
+            score = self.url_scorer.score(base_url) if self.url_scorer else 0
            
            # Skip URLs with scores below the threshold
            if score < self.score_threshold:
@@ -114,7 +118,7 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
                self.stats.urls_skipped += 1
                continue
            
-            valid_links.append((url, score))
+            valid_links.append((base_url, score))
        
        # If we have more valid links than capacity, sort by score and take the top ones
        if len(valid_links) > remaining_capacity:
--- a/crawl4ai/deep_crawling/filters.py
+++ b/crawl4ai/deep_crawling/filters.py
@@ -427,6 +427,11 @@ class DomainFilter(URLFilter):
        if isinstance(domains, str):
            return {domains.lower()}
        return {d.lower() for d in domains}
+    
+    @staticmethod
+    def _is_subdomain(domain: str, parent_domain: str) -> bool:
+        """Return True when domain equals parent_domain or is nested under it.
+
+        The suffix test includes the separating dot, so "notexample.com" is
+        not treated as a subdomain of "example.com". Both arguments are
+        compared as given (plain, case-sensitive string comparison).
+        """
+        if domain == parent_domain:
+            return True
+        return domain.endswith("." + parent_domain)

    @staticmethod
    @lru_cache(maxsize=10000)
@@ -444,20 +449,26 @@ class DomainFilter(URLFilter):

        domain = self._extract_domain(url)

-        # Early return for blocked domains
-        if domain in self._blocked_domains:
-            self._update_stats(False)
-            return False
+        # Check for blocked domains, including subdomains
+        for blocked in self._blocked_domains:
+            if self._is_subdomain(domain, blocked):
+                self._update_stats(False)
+                return False

        # If no allowed domains specified, accept all non-blocked
        if self._allowed_domains is None:
            self._update_stats(True)
            return True

-        # Final allowed domains check
-        result = domain in self._allowed_domains
-        self._update_stats(result)
-        return result
+        # Check if domain matches any allowed domain (including subdomains)
+        for allowed in self._allowed_domains:
+            if self._is_subdomain(domain, allowed):
+                self._update_stats(True)
+                return True
+
+        # No matches found
+        self._update_stats(False)
+        return False


 class ContentRelevanceFilter(URLFilter):
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -1,5 +1,4 @@
 import time
-from urllib.parse import urlparse
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from bs4 import BeautifulSoup, Comment, element, Tag, NavigableString
 import json
@@ -33,6 +32,8 @@ import hashlib

 from urllib.robotparser import RobotFileParser
 import aiohttp
+from urllib.parse import urlparse, urlunparse
+from functools import lru_cache

 from packaging import version
 from . import __version__
@@ -1962,6 +1963,82 @@ def normalize_url(href, base_url):
    return normalized


+def normalize_url_for_deep_crawl(href, base_url):
+    """Normalize a link into a canonical absolute URL for deduplication.
+
+    The link is resolved against base_url, the netloc is lowercased, the
+    fragment is dropped, trailing slashes are stripped from the path (an
+    empty path becomes "/"), known tracking parameters are removed and the
+    remaining query parameters are sorted by name, so URLs that differ only
+    in parameter order normalize to the same string.
+
+    Args:
+        href: Link target, absolute or relative. Surrounding whitespace is
+            stripped before resolution.
+        base_url: URL that a relative href is resolved against.
+
+    Returns:
+        The normalized URL string, or None when href is None or empty.
+    """
+    from urllib.parse import urljoin, urlparse, urlunparse, parse_qsl, urlencode
+
+    # Handle None or empty values
+    if not href:
+        return None
+
+    # Use urljoin to handle relative URLs, then parse for normalization
+    parsed = urlparse(urljoin(base_url, href.strip()))
+
+    # Tracking parameters to drop (example - customize as needed)
+    tracking_params = {'utm_source', 'utm_medium', 'utm_campaign', 'ref', 'fbclid'}
+
+    # keep_blank_values=True keeps parameters such as "flag=" that the
+    # default parsing would silently discard.
+    kept = [
+        (name, value)
+        for name, value in parse_qsl(parsed.query, keep_blank_values=True)
+        if name not in tracking_params
+    ]
+    # Sort by name only: list.sort is stable, so repeated names keep the
+    # original relative order of their values.
+    kept.sort(key=lambda pair: pair[0])
+    query = urlencode(kept)
+
+    # Build normalized URL
+    return urlunparse((
+        parsed.scheme,
+        parsed.netloc.lower(),  # lowercases the whole netloc component
+        parsed.path.rstrip('/') or '/',  # Normalize trailing slash
+        parsed.params,
+        query,
+        '',  # Remove fragment entirely
+    ))
+
+@lru_cache(maxsize=10000)
+def efficient_normalize_url_for_deep_crawl(href, base_url):
+    """Resolve href against base_url and apply only minimal normalization.
+
+    The netloc is lowercased and the fragment is removed; scheme, path,
+    params and query are kept exactly as parsed. Calls are memoized by
+    lru_cache (up to 10000 entries), so both arguments must be hashable.
+
+    Args:
+        href: Link target, absolute or relative. Surrounding whitespace is
+            stripped before resolution.
+        base_url: URL that a relative href is resolved against.
+
+    Returns:
+        The normalized URL string, or None when href is falsy.
+    """
+    from urllib.parse import urljoin
+
+    if not href:
+        return None
+
+    # Resolve relative URLs, then split into components
+    absolute = urljoin(base_url, href.strip())
+    parts = urlparse(absolute)
+
+    # Swap in the lowercased netloc and an empty fragment; every other
+    # component is carried over untouched.
+    cleaned = parts._replace(netloc=parts.netloc.lower(), fragment='')
+    return urlunparse(cleaned)
+
+
 def normalize_url_tmp(href, base_url):
    """Normalize URLs to ensure consistent format"""
    # Extract protocol and domain from base URL
--- a/docs/snippets/deep_crawl/intro.py
+++ b/docs/snippets/deep_crawl/intro.py
@@ -0,0 +1,78 @@
+import asyncio
+from typing import List
+
+from crawl4ai import (
+    AsyncWebCrawler,
+    CrawlerRunConfig,
+    BFSDeepCrawlStrategy,
+    CrawlResult,
+    FilterChain,
+    DomainFilter,
+    URLPatternFilter,
+)
+
+# Classes imported from the crawl4ai package:
+# - AsyncWebCrawler: The main class for web crawling.
+# - CrawlerRunConfig: Configuration class for crawler behavior.
+# - BFSDeepCrawlStrategy: Breadth-First Search deep crawling strategy.
+# - CrawlResult: Data model for individual crawl results.
+# - FilterChain: Used to chain multiple URL filters.
+# - DomainFilter: Filter URLs by allowed and blocked domains.
+# - URLPatternFilter: Filter URLs based on patterns.
+# FilterChain and URLPatternFilter can also be imported from crawl4ai.deep_crawling.filters;
+# for simplicity and consistency, this example imports them directly from crawl4ai, which re-exports them in __init__.py.
+
+async def basic_deep_crawl():
+    """
+    Run a small deep crawl from a seed URL and print every result.
+
+    Demonstrates:
+    - Configuring the Breadth-First Search (BFS) deep crawling strategy.
+    - Restricting the crawl with a URL pattern filter and a domain filter.
+    - Reading each result's URL and its 'depth' metadata entry.
+    """
+
+    # 1. URL filters.
+    # Pattern filter for URLs that contain "text".
+    pattern_filter = URLPatternFilter(patterns=["*text*"])
+
+    # Domain filter: "groq.com" is allowed, "example.com" is blocked.
+    site_filter = DomainFilter(
+        allowed_domains=["groq.com"],
+        blocked_domains=["example.com"],
+    )
+
+    # The chain receives the pattern filter first, then the domain filter.
+    chain = FilterChain(filters=[pattern_filter, site_filter])
+
+    # 2. Deep-crawl configuration.
+    strategy = BFSDeepCrawlStrategy(
+        max_depth=2,             # at most 2 levels away from the start URL
+        max_pages=10,            # cap the total number of crawled pages
+        include_external=False,  # stay on the start URL's domain
+        filter_chain=chain,
+    )
+    run_config = CrawlerRunConfig(
+        deep_crawl_strategy=strategy,
+        verbose=True,  # detailed logging while crawling
+    )
+
+    # 3. Run the crawler; the context manager handles start and close.
+    # Alternative start URL: https://docs.crawl4ai.com
+    start_url = "https://console.groq.com/docs"
+    async with AsyncWebCrawler() as crawler:
+        crawl_results: List[CrawlResult] = await crawler.arun(
+            url=start_url,
+            config=run_config,
+        )
+
+        # 4. Print the URL and crawl depth recorded for each result.
+        for result in crawl_results:
+            print(f"URL: {result.url}, Depth: {result.metadata.get('depth', 0)}")
+
+
+if __name__ == "__main__":
+    # asyncio is already imported at the top of this module, so no second
+    # import is needed here.
+    asyncio.run(basic_deep_crawl())