diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 565bf93d..03cce871 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -48,8 +48,9 @@ from .deep_crawling import ( DeepCrawlStrategy, BFSDeepCrawlStrategy, FilterChain, - ContentTypeFilter, + URLPatternFilter, DomainFilter, + ContentTypeFilter, URLFilter, FilterStats, SEOFilter, @@ -75,6 +76,7 @@ __all__ = [ "BestFirstCrawlingStrategy", "DFSDeepCrawlStrategy", "FilterChain", + "URLPatternFilter", "ContentTypeFilter", "DomainFilter", "FilterStats", diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index e56feb58..9477177b 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.5.0.post2" +__version__ = "0.5.0.post4" diff --git a/crawl4ai/async_dispatcher.py b/crawl4ai/async_dispatcher.py index 69d276fb..b587d011 100644 --- a/crawl4ai/async_dispatcher.py +++ b/crawl4ai/async_dispatcher.py @@ -13,7 +13,7 @@ from rich.live import Live from rich.table import Table from rich.console import Console from rich import box -from datetime import timedelta +from datetime import timedelta, datetime from collections.abc import AsyncGenerator import time import psutil diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index b5a646c9..dd777a36 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -4,7 +4,7 @@ import sys import time from colorama import Fore from pathlib import Path -from typing import Optional, List +from typing import Optional, List, Generic, TypeVar import json import asyncio @@ -23,7 +23,7 @@ from .async_crawler_strategy import ( AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse, ) -from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode +from .cache_context import CacheMode, CacheContext from .markdown_generation_strategy import ( DefaultMarkdownGenerator, MarkdownGenerationStrategy, @@ -44,17 +44,46 @@ from .utils import ( RobotsParser, ) 
from typing import Union, AsyncGenerator

# The bound is a forward reference, so defining the TypeVar does not require
# the CrawlResult model to be resolvable at this exact point in the module.
CrawlResultT = TypeVar("CrawlResultT", bound="CrawlResult")


class CrawlResultContainer(Generic[CrawlResultT]):
    """List-like wrapper around one or more crawl results.

    A single result is wrapped in a one-element list, so callers can always
    iterate, index and take ``len()``.  Attribute access that the container
    itself cannot satisfy is forwarded to the *first* result, which lets a
    container holding a single result be used like that result
    (``container.url``, ``container.success`` ...).

    Args:
        results: One result object or a list of result objects.  A list is
            stored as-is (not copied).
    """

    def __init__(self, results: Union[CrawlResultT, List[CrawlResultT]]):
        # Normalize to a list so every other method has a single code path.
        self._results = results if isinstance(results, list) else [results]

    def __iter__(self):
        return iter(self._results)

    def __getitem__(self, index):
        return self._results[index]

    def __len__(self):
        return len(self._results)

    def __getattr__(self, attr):
        """Forward unknown, non-dunder attributes to the first result.

        Raises:
            AttributeError: for dunder names, when the container is empty,
                or when the container has not been initialised yet.
        """
        # Protocol probes (``__deepcopy__``, ``__setstate__``, ``__getstate__``
        # ...) must describe the container, never the wrapped result;
        # forwarding them makes copy/pickle operate on the wrong object.
        if attr.startswith("__") and attr.endswith("__"):
            raise AttributeError(
                f"{self.__class__.__name__} object has no attribute '{attr}'"
            )

        # Read the backing list straight from the instance dict.  Using
        # ``self._results`` here would re-enter __getattr__ whenever the
        # attribute is missing (e.g. on an instance created without __init__
        # by copy/pickle) and recurse until RecursionError.
        results = self.__dict__.get("_results")
        if results:
            return getattr(results[0], attr)
        raise AttributeError(
            f"{self.__class__.__name__} object has no attribute '{attr}'"
        )

    def __repr__(self):
        return f"{self.__class__.__name__}({self._results!r})"


# Non-streaming calls return a container; streaming calls yield results
# through an async generator.
RunManyReturn = Union[
    CrawlResultContainer[CrawlResultT],
    AsyncGenerator[CrawlResultT, None],
]
@@ -223,23 +252,6 @@ class AsyncWebCrawler: self, url: str, config: CrawlerRunConfig = None, - # Legacy parameters maintained for backwards compatibility - # word_count_threshold=MIN_WORD_THRESHOLD, - # extraction_strategy: ExtractionStrategy = None, - # chunking_strategy: ChunkingStrategy = RegexChunking(), - # content_filter: RelevantContentFilter = None, - # cache_mode: Optional[CacheMode] = None, - # Deprecated cache parameters - # bypass_cache: bool = False, - # disable_cache: bool = False, - # no_cache_read: bool = False, - # no_cache_write: bool = False, - # Other legacy parameters - # css_selector: str = None, - # screenshot: bool = False, - # pdf: bool = False, - # user_agent: str = None, - # verbose=True, **kwargs, ) -> RunManyReturn: """ @@ -270,47 +282,13 @@ class AsyncWebCrawler: Returns: CrawlResult: The result of crawling and processing """ - crawler_config = config or CrawlerRunConfig() + config = config or CrawlerRunConfig() if not isinstance(url, str) or not url: raise ValueError("Invalid URL, make sure the URL is a non-empty string") async with self._lock or self.nullcontext(): try: - self.logger.verbose = crawler_config.verbose - # Handle configuration - if crawler_config is not None: - config = crawler_config - else: - # Merge all parameters into a single kwargs dict for config creation - # config_kwargs = { - # "word_count_threshold": word_count_threshold, - # "extraction_strategy": extraction_strategy, - # "chunking_strategy": chunking_strategy, - # "content_filter": content_filter, - # "cache_mode": cache_mode, - # "bypass_cache": bypass_cache, - # "disable_cache": disable_cache, - # "no_cache_read": no_cache_read, - # "no_cache_write": no_cache_write, - # "css_selector": css_selector, - # "screenshot": screenshot, - # "pdf": pdf, - # "verbose": verbose, - # **kwargs, - # } - # config = CrawlerRunConfig.from_kwargs(config_kwargs) - pass - - # Handle deprecated cache parameters - # if any([bypass_cache, disable_cache, no_cache_read, 
no_cache_write]): - # # Convert legacy parameters if cache_mode not provided - # if config.cache_mode is None: - # config.cache_mode = _legacy_to_cache_mode( - # disable_cache=disable_cache, - # bypass_cache=bypass_cache, - # no_cache_read=no_cache_read, - # no_cache_write=no_cache_write, - # ) + self.logger.verbose = config.verbose # Default to ENABLED if no cache mode specified if config.cache_mode is None: @@ -457,7 +435,7 @@ class AsyncWebCrawler: if cache_context.should_write() and not bool(cached_result): await async_db_manager.acache_url(crawl_result) - return crawl_result + return CrawlResultContainer(crawl_result) else: self.logger.success( @@ -474,7 +452,7 @@ class AsyncWebCrawler: cached_result.success = bool(html) cached_result.session_id = getattr(config, "session_id", None) cached_result.redirected_url = cached_result.redirected_url or url - return cached_result + return CrawlResultContainer(cached_result) except Exception as e: error_context = get_error_context(sys.exc_info()) @@ -492,8 +470,10 @@ class AsyncWebCrawler: tag="ERROR", ) - return CrawlResult( - url=url, html="", success=False, error_message=error_message + return CrawlResultContainer( + CrawlResult( + url=url, html="", success=False, error_message=error_message + ) ) async def aprocess_html( @@ -669,17 +649,17 @@ class AsyncWebCrawler: config: Optional[CrawlerRunConfig] = None, dispatcher: Optional[BaseDispatcher] = None, # Legacy parameters maintained for backwards compatibility - word_count_threshold=MIN_WORD_THRESHOLD, - extraction_strategy: ExtractionStrategy = None, - chunking_strategy: ChunkingStrategy = RegexChunking(), - content_filter: RelevantContentFilter = None, - cache_mode: Optional[CacheMode] = None, - bypass_cache: bool = False, - css_selector: str = None, - screenshot: bool = False, - pdf: bool = False, - user_agent: str = None, - verbose=True, + # word_count_threshold=MIN_WORD_THRESHOLD, + # extraction_strategy: ExtractionStrategy = None, + # chunking_strategy: 
ChunkingStrategy = RegexChunking(), + # content_filter: RelevantContentFilter = None, + # cache_mode: Optional[CacheMode] = None, + # bypass_cache: bool = False, + # css_selector: str = None, + # screenshot: bool = False, + # pdf: bool = False, + # user_agent: str = None, + # verbose=True, **kwargs ) -> RunManyReturn: """ @@ -712,20 +692,21 @@ class AsyncWebCrawler: ): print(f"Processed {result.url}: {len(result.markdown)} chars") """ - if config is None: - config = CrawlerRunConfig( - word_count_threshold=word_count_threshold, - extraction_strategy=extraction_strategy, - chunking_strategy=chunking_strategy, - content_filter=content_filter, - cache_mode=cache_mode, - bypass_cache=bypass_cache, - css_selector=css_selector, - screenshot=screenshot, - pdf=pdf, - verbose=verbose, - **kwargs, - ) + config = config or CrawlerRunConfig() + # if config is None: + # config = CrawlerRunConfig( + # word_count_threshold=word_count_threshold, + # extraction_strategy=extraction_strategy, + # chunking_strategy=chunking_strategy, + # content_filter=content_filter, + # cache_mode=cache_mode, + # bypass_cache=bypass_cache, + # css_selector=css_selector, + # screenshot=screenshot, + # pdf=pdf, + # verbose=verbose, + # **kwargs, + # ) if dispatcher is None: dispatcher = MemoryAdaptiveDispatcher( diff --git a/crawl4ai/deep_crawling/bfs_strategy.py b/crawl4ai/deep_crawling/bfs_strategy.py index 1ae4c4b9..54b72ea3 100644 --- a/crawl4ai/deep_crawling/bfs_strategy.py +++ b/crawl4ai/deep_crawling/bfs_strategy.py @@ -10,6 +10,7 @@ from .filters import FilterChain from .scorers import URLScorer from . 
import DeepCrawlStrategy from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult +from ..utils import normalize_url_for_deep_crawl, efficient_normalize_url_for_deep_crawl from math import inf as infinity class BFSDeepCrawlStrategy(DeepCrawlStrategy): @@ -99,14 +100,17 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy): # First collect all valid links for link in links: url = link.get("href") - if url in visited: + # Strip URL fragments to avoid duplicate crawling + # base_url = url.split('#')[0] if url else url + base_url = normalize_url_for_deep_crawl(url, source_url) + if base_url in visited: continue if not await self.can_process_url(url, next_depth): self.stats.urls_skipped += 1 continue # Score the URL if a scorer is provided - score = self.url_scorer.score(url) if self.url_scorer else 0 + score = self.url_scorer.score(base_url) if self.url_scorer else 0 # Skip URLs with scores below the threshold if score < self.score_threshold: @@ -114,7 +118,7 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy): self.stats.urls_skipped += 1 continue - valid_links.append((url, score)) + valid_links.append((base_url, score)) # If we have more valid links than capacity, sort by score and take the top ones if len(valid_links) > remaining_capacity: diff --git a/crawl4ai/deep_crawling/filters.py b/crawl4ai/deep_crawling/filters.py index c8af3022..9fd8a72a 100644 --- a/crawl4ai/deep_crawling/filters.py +++ b/crawl4ai/deep_crawling/filters.py @@ -427,6 +427,11 @@ class DomainFilter(URLFilter): if isinstance(domains, str): return {domains.lower()} return {d.lower() for d in domains} + + @staticmethod + def _is_subdomain(domain: str, parent_domain: str) -> bool: + """Check if domain is a subdomain of parent_domain""" + return domain == parent_domain or domain.endswith(f".{parent_domain}") @staticmethod @lru_cache(maxsize=10000) @@ -444,20 +449,26 @@ class DomainFilter(URLFilter): domain = self._extract_domain(url) - # Early return for blocked domains - if domain in 
self._blocked_domains: - self._update_stats(False) - return False + # Check for blocked domains, including subdomains + for blocked in self._blocked_domains: + if self._is_subdomain(domain, blocked): + self._update_stats(False) + return False # If no allowed domains specified, accept all non-blocked if self._allowed_domains is None: self._update_stats(True) return True - # Final allowed domains check - result = domain in self._allowed_domains - self._update_stats(result) - return result + # Check if domain matches any allowed domain (including subdomains) + for allowed in self._allowed_domains: + if self._is_subdomain(domain, allowed): + self._update_stats(True) + return True + + # No matches found + self._update_stats(False) + return False class ContentRelevanceFilter(URLFilter): diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index e0e49d99..3b708421 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -1064,7 +1064,7 @@ class JsonElementExtractionStrategy(ExtractionStrategy): api_token (str): Legacy Parameter. API token for LLM provider llm_config (LLMConfig): LLM configuration object prompt (str, optional): Custom prompt template to use - **kwargs: Additional args passed to perform_completion_with_backoff + **kwargs: Additional args passed to LLM processor Returns: dict: Generated schema following the JsonElementExtractionStrategy format @@ -1130,7 +1130,8 @@ In this scenario, use your best judgment to generate the schema. 
@dataclass
class CrawlStats:
    """Bookkeeping record for a single crawl task.

    ``start_time`` and ``end_time`` accept either ``datetime`` objects or
    numeric epoch timestamps (seconds, as produced by ``time.time()``).
    """

    task_id: str
    url: str
    # Quoted so the annotation is not evaluated when the class is created.
    status: "CrawlStatus"
    start_time: Optional[Union[datetime, float]] = None
    end_time: Optional[Union[datetime, float]] = None
    memory_usage: float = 0.0
    peak_memory: float = 0.0
    error_message: str = ""

    @staticmethod
    def _as_datetime(value):
        """Return *value* as a ``datetime``; numeric values are epoch seconds."""
        # ``int`` is accepted alongside ``float``: an integer timestamp would
        # otherwise reach the subtraction below and raise TypeError.
        if isinstance(value, (int, float)):
            return datetime.fromtimestamp(value)
        return value

    @property
    def duration(self) -> str:
        """Elapsed time as ``H:MM:SS``, truncated to whole seconds.

        Returns ``"0:00"`` when no start time is recorded.  While the task is
        still running (no ``end_time``), the current time is used as the end.
        """
        if not self.start_time:
            return "0:00"

        start = self._as_datetime(self.start_time)
        end = self._as_datetime(self.end_time or datetime.now())

        elapsed = end - start
        return str(timedelta(seconds=int(elapsed.total_seconds())))
def normalize_url_for_deep_crawl(href, base_url):
    """Return a canonical absolute URL for deep-crawl de-duplication.

    The link is resolved against ``base_url`` and then normalized:

    * host name lower-cased,
    * fragment removed,
    * trailing slashes stripped from the path (an empty path becomes ``/``),
    * common tracking parameters removed from the query string,
    * remaining query parameters sorted by name, so two URLs that differ
      only in parameter order normalize to the same string.

    Args:
        href: Link target as found in the page; may be relative.
        base_url: URL of the page the link was found on.

    Returns:
        The normalized URL, or ``None`` when ``href`` is empty or ``None``.
    """
    from urllib.parse import urljoin, urlparse, urlunparse, parse_qsl, urlencode

    # Handle None or empty values
    if not href:
        return None

    # Use urljoin to handle relative URLs
    parsed = urlparse(urljoin(base_url, href.strip()))

    # Tracking parameters that do not change the page content
    # (example list - customize as needed).
    tracking_params = {'utm_source', 'utm_medium', 'utm_campaign', 'ref', 'fbclid'}

    # keep_blank_values=True: "?flag=" and "?flag" are real parameters and
    # must survive normalization instead of being silently dropped.
    pairs = [
        (name, value)
        for name, value in parse_qsl(parsed.query, keep_blank_values=True)
        if name not in tracking_params
    ]
    # Sort by name only; the sort is stable, so repeated parameters keep
    # their relative order.
    pairs.sort(key=lambda pair: pair[0])
    query = urlencode(pairs)

    return urlunparse((
        parsed.scheme,
        parsed.netloc.lower(),            # host names are case-insensitive
        parsed.path.rstrip('/') or '/',   # normalize trailing slash
        parsed.params,
        query,
        '',                               # drop the fragment entirely
    ))


@lru_cache(maxsize=10000)
def efficient_normalize_url_for_deep_crawl(href, base_url):
    """Cheaper, cached variant of the deep-crawl URL normalization.

    Only the two most important normalizations are applied: the host name is
    lower-cased and the fragment is removed.  Path and query string are left
    untouched.  Results are memoized per ``(href, base_url)`` pair.

    Returns:
        The normalized absolute URL, or ``None`` when ``href`` is empty or
        ``None``.
    """
    from urllib.parse import urljoin

    if not href:
        return None

    # Resolve relative URLs, then split into components.
    parsed = urlparse(urljoin(base_url, href.strip()))

    return urlunparse((
        parsed.scheme,
        parsed.netloc.lower(),
        parsed.path,
        parsed.params,
        parsed.query,
        '',  # Remove fragment
    ))
import asyncio
from typing import List

from crawl4ai import (
    AsyncWebCrawler,
    CrawlerRunConfig,
    BFSDeepCrawlStrategy,
    CrawlResult,
    FilterChain,
    DomainFilter,
    URLPatternFilter,
)

# Classes used in this example, all imported from the top-level crawl4ai
# package:
# - AsyncWebCrawler: The main class for web crawling.
# - CrawlerRunConfig: Configuration class for crawler behavior.
# - BFSDeepCrawlStrategy: Breadth-First Search deep crawling strategy.
# - CrawlResult: Data model for individual crawl results.
# - FilterChain: Used to chain multiple URL filters.
# - DomainFilter: Filter URLs by allowed / blocked domains.
# - URLPatternFilter: Filter URLs based on patterns.
# The filter classes can also be imported from crawl4ai.deep_crawling.filters;
# the top-level import is used here for simplicity and consistency.


async def basic_deep_crawl():
    """
    Performs a basic deep crawl starting from a seed URL, demonstrating:
    - Breadth-First Search (BFS) deep crawling strategy.
    - Filtering URLs based on URL patterns and domains.
    - Accessing crawl results and metadata.
    """

    # 1. Define URL filters.
    # Keep only URLs that contain "text" anywhere in the URL.
    url_filter = URLPatternFilter(
        patterns=[
            "*text*",
        ]
    )

    # Allow only the "groq.com" domain and block "example.com".
    domain_filter = DomainFilter(
        allowed_domains=["groq.com"],
        blocked_domains=["example.com"],
    )

    # 2. Configure the run for deep crawling with the BFS strategy.
    config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(
            max_depth=2,  # Crawl at most 2 levels away from the start URL
            max_pages=10,  # Stop after 10 pages to prevent excessive crawling
            include_external=False,  # Stay on the same domain as the start URL
            filter_chain=FilterChain(filters=[url_filter, domain_filter]),
        ),
        verbose=True,  # Detailed logging while crawling
    )

    # 3. Run the crawler; the context manager handles start and close.
    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun(
            # url="https://docs.crawl4ai.com",  # Alternative start URL
            url="https://console.groq.com/docs",
            config=config,
        )

        # 4. Print each crawled URL with the depth recorded in its metadata.
        for result in results:
            print(f"URL: {result.url}, Depth: {result.metadata.get('depth', 0)}")


if __name__ == "__main__":
    # asyncio is already imported at module level; no second import needed.
    asyncio.run(basic_deep_crawl())