diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 565bf93d..03cce871 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -48,8 +48,9 @@ from .deep_crawling import ( DeepCrawlStrategy, BFSDeepCrawlStrategy, FilterChain, - ContentTypeFilter, + URLPatternFilter, DomainFilter, + ContentTypeFilter, URLFilter, FilterStats, SEOFilter, @@ -75,6 +76,7 @@ __all__ = [ "BestFirstCrawlingStrategy", "DFSDeepCrawlStrategy", "FilterChain", + "URLPatternFilter", "ContentTypeFilter", "DomainFilter", "FilterStats", diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 1f2ef59b..9477177b 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.5.0.post3" +__version__ = "0.5.0.post4" diff --git a/crawl4ai/deep_crawling/bfs_strategy.py b/crawl4ai/deep_crawling/bfs_strategy.py index 1ae4c4b9..54b72ea3 100644 --- a/crawl4ai/deep_crawling/bfs_strategy.py +++ b/crawl4ai/deep_crawling/bfs_strategy.py @@ -10,6 +10,7 @@ from .filters import FilterChain from .scorers import URLScorer from . 
import DeepCrawlStrategy from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult +from ..utils import normalize_url_for_deep_crawl, efficient_normalize_url_for_deep_crawl from math import inf as infinity class BFSDeepCrawlStrategy(DeepCrawlStrategy): @@ -99,14 +100,17 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy): # First collect all valid links for link in links: url = link.get("href") - if url in visited: + # Strip URL fragments to avoid duplicate crawling + # base_url = url.split('#')[0] if url else url + base_url = normalize_url_for_deep_crawl(url, source_url) + if base_url in visited: continue if not await self.can_process_url(url, next_depth): self.stats.urls_skipped += 1 continue # Score the URL if a scorer is provided - score = self.url_scorer.score(url) if self.url_scorer else 0 + score = self.url_scorer.score(base_url) if self.url_scorer else 0 # Skip URLs with scores below the threshold if score < self.score_threshold: @@ -114,7 +118,7 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy): self.stats.urls_skipped += 1 continue - valid_links.append((url, score)) + valid_links.append((base_url, score)) # If we have more valid links than capacity, sort by score and take the top ones if len(valid_links) > remaining_capacity: diff --git a/crawl4ai/deep_crawling/filters.py b/crawl4ai/deep_crawling/filters.py index c8af3022..9fd8a72a 100644 --- a/crawl4ai/deep_crawling/filters.py +++ b/crawl4ai/deep_crawling/filters.py @@ -427,6 +427,11 @@ class DomainFilter(URLFilter): if isinstance(domains, str): return {domains.lower()} return {d.lower() for d in domains} + + @staticmethod + def _is_subdomain(domain: str, parent_domain: str) -> bool: + """Check if domain is a subdomain of parent_domain""" + return domain == parent_domain or domain.endswith(f".{parent_domain}") @staticmethod @lru_cache(maxsize=10000) @@ -444,20 +449,26 @@ class DomainFilter(URLFilter): domain = self._extract_domain(url) - # Early return for blocked domains - if domain in 
self._blocked_domains: - self._update_stats(False) - return False + # Check for blocked domains, including subdomains + for blocked in self._blocked_domains: + if self._is_subdomain(domain, blocked): + self._update_stats(False) + return False # If no allowed domains specified, accept all non-blocked if self._allowed_domains is None: self._update_stats(True) return True - # Final allowed domains check - result = domain in self._allowed_domains - self._update_stats(result) - return result + # Check if domain matches any allowed domain (including subdomains) + for allowed in self._allowed_domains: + if self._is_subdomain(domain, allowed): + self._update_stats(True) + return True + + # No matches found + self._update_stats(False) + return False class ContentRelevanceFilter(URLFilter): diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index b5a50eab..146ce06c 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -1,5 +1,4 @@ import time -from urllib.parse import urlparse from concurrent.futures import ThreadPoolExecutor, as_completed from bs4 import BeautifulSoup, Comment, element, Tag, NavigableString import json @@ -33,6 +32,8 @@ import hashlib from urllib.robotparser import RobotFileParser import aiohttp +from urllib.parse import urlparse, urlunparse +from functools import lru_cache from packaging import version from . 
import __version__ @@ -1962,6 +1963,82 @@ def normalize_url(href, base_url): return normalized +def normalize_url_for_deep_crawl(href, base_url): + """Normalize URLs to ensure consistent format""" + from urllib.parse import urljoin, urlparse, urlunparse, parse_qs, urlencode + + # Handle None or empty values + if not href: + return None + + # Use urljoin to handle relative URLs + full_url = urljoin(base_url, href.strip()) + + # Parse the URL for normalization + parsed = urlparse(full_url) + + # Convert hostname to lowercase + netloc = parsed.netloc.lower() + + # Remove fragment entirely + fragment = '' + + # Normalize query parameters if needed + query = parsed.query + if query: + # Parse query parameters + params = parse_qs(query) + + # Remove tracking parameters (example - customize as needed) + tracking_params = ['utm_source', 'utm_medium', 'utm_campaign', 'ref', 'fbclid'] + for param in tracking_params: + if param in params: + del params[param] + + # Rebuild query string, sorted for consistency + query = urlencode(params, doseq=True) if params else '' + + # Build normalized URL + normalized = urlunparse(( + parsed.scheme, + netloc, + parsed.path.rstrip('/') or '/', # Normalize trailing slash + parsed.params, + query, + fragment + )) + + return normalized + +@lru_cache(maxsize=10000) +def efficient_normalize_url_for_deep_crawl(href, base_url): + """Efficient URL normalization with proper parsing""" + from urllib.parse import urljoin + + if not href: + return None + + # Resolve relative URLs + full_url = urljoin(base_url, href.strip()) + + # Use proper URL parsing + parsed = urlparse(full_url) + + # Only perform the most critical normalizations + # 1. Lowercase hostname + # 2. 
Remove fragment + normalized = urlunparse(( + parsed.scheme, + parsed.netloc.lower(), + parsed.path, + parsed.params, + parsed.query, + '' # Remove fragment + )) + + return normalized + + def normalize_url_tmp(href, base_url): """Normalize URLs to ensure consistent format""" # Extract protocol and domain from base URL diff --git a/docs/snippets/deep_crawl/intro.py b/docs/snippets/deep_crawl/intro.py new file mode 100644 index 00000000..d8fd2f94 --- /dev/null +++ b/docs/snippets/deep_crawl/intro.py @@ -0,0 +1,78 @@ +import asyncio +from typing import List + +from crawl4ai import ( + AsyncWebCrawler, + CrawlerRunConfig, + BFSDeepCrawlStrategy, + CrawlResult, + FilterChain, + DomainFilter, + URLPatternFilter, +) + +# Import necessary classes from crawl4ai library: +# - AsyncWebCrawler: The main class for web crawling. +# - CrawlerRunConfig: Configuration class for crawler behavior. +# - BFSDeepCrawlStrategy: Breadth-First Search deep crawling strategy. +# - CrawlResult: Data model for individual crawl results. +# - FilterChain: Used to chain multiple URL filters. +# - URLPatternFilter: Filter URLs based on patterns. +# - DomainFilter: Filter URLs based on allowed/blocked domains. +# Note: importing FilterChain and URLPatternFilter from crawl4ai.deep_crawling.filters also works, +# but for simplicity and consistency this example uses the names re-exported from crawl4ai in __init__.py + +async def basic_deep_crawl(): + """ + Performs a basic deep crawl starting from a seed URL, demonstrating: + - Breadth-First Search (BFS) deep crawling strategy. + - Filtering URLs based on URL patterns. + - Accessing crawl results and metadata. + """ + + # 1. Define URL Filters: + # Create a URLPatternFilter to include only URLs containing "text". + # This filter will be used to restrict crawling to URLs that are likely to contain textual content.
+ url_filter = URLPatternFilter( + patterns=[ + "*text*", # Include URLs that contain "text" in their path or URL + ] + ) + + # Create a DomainFilter to allow only URLs from the "groq.com" domain and block URLs from the "example.com" domain. + # This filter will be used to restrict crawling to URLs within the "groq.com" domain. + domain_filter = DomainFilter( + allowed_domains=["groq.com"], + blocked_domains=["example.com"], + ) + + # 2. Configure CrawlerRunConfig for Deep Crawling: + # Configure CrawlerRunConfig to use BFSDeepCrawlStrategy for deep crawling. + config = CrawlerRunConfig( + deep_crawl_strategy=BFSDeepCrawlStrategy( + max_depth=2, # Set the maximum depth of crawling to 2 levels from the start URL + max_pages=10, # Limit the total number of pages to crawl to 10, to prevent excessive crawling + include_external=False, # Set to False to only crawl URLs within the same domain as the start URL + filter_chain=FilterChain(filters=[url_filter, domain_filter]), # Apply the URLPatternFilter and DomainFilter to filter URLs during deep crawl + ), + verbose=True, # Enable verbose logging to see detailed output during crawling + ) + + # 3. Initialize and Run AsyncWebCrawler: + # Use AsyncWebCrawler as a context manager for automatic start and close. + async with AsyncWebCrawler() as crawler: + results: List[CrawlResult] = await crawler.arun( + # url="https://docs.crawl4ai.com", # Uncomment to use crawl4ai documentation as start URL + url="https://console.groq.com/docs", # Set the start URL for deep crawling to Groq documentation + config=config, # Pass the configured CrawlerRunConfig to arun method + ) + + # 4. Process and Print Crawl Results: + # Iterate through the list of CrawlResult objects returned by the deep crawl. + for result in results: + # Print the URL and its crawl depth from the metadata for each crawled URL. 
+ print(f"URL: {result.url}, Depth: {result.metadata.get('depth', 0)}") + + +if __name__ == "__main__": + import asyncio + asyncio.run(basic_deep_crawl())