Merge branch 'vr0.5.0.post4'

This commit is contained in:
UncleCode
2025-03-06 22:46:44 +08:00
10 changed files with 295 additions and 111 deletions

View File

@@ -48,8 +48,9 @@ from .deep_crawling import (
DeepCrawlStrategy, DeepCrawlStrategy,
BFSDeepCrawlStrategy, BFSDeepCrawlStrategy,
FilterChain, FilterChain,
ContentTypeFilter, URLPatternFilter,
DomainFilter, DomainFilter,
ContentTypeFilter,
URLFilter, URLFilter,
FilterStats, FilterStats,
SEOFilter, SEOFilter,
@@ -75,6 +76,7 @@ __all__ = [
"BestFirstCrawlingStrategy", "BestFirstCrawlingStrategy",
"DFSDeepCrawlStrategy", "DFSDeepCrawlStrategy",
"FilterChain", "FilterChain",
"URLPatternFilter",
"ContentTypeFilter", "ContentTypeFilter",
"DomainFilter", "DomainFilter",
"FilterStats", "FilterStats",

View File

@@ -1,2 +1,2 @@
# crawl4ai/_version.py # crawl4ai/_version.py
__version__ = "0.5.0.post2" __version__ = "0.5.0.post4"

View File

@@ -13,7 +13,7 @@ from rich.live import Live
from rich.table import Table from rich.table import Table
from rich.console import Console from rich.console import Console
from rich import box from rich import box
from datetime import timedelta from datetime import timedelta, datetime
from collections.abc import AsyncGenerator from collections.abc import AsyncGenerator
import time import time
import psutil import psutil

View File

@@ -4,7 +4,7 @@ import sys
import time import time
from colorama import Fore from colorama import Fore
from pathlib import Path from pathlib import Path
from typing import Optional, List from typing import Optional, List, Generic, TypeVar
import json import json
import asyncio import asyncio
@@ -23,7 +23,7 @@ from .async_crawler_strategy import (
AsyncPlaywrightCrawlerStrategy, AsyncPlaywrightCrawlerStrategy,
AsyncCrawlResponse, AsyncCrawlResponse,
) )
from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode from .cache_context import CacheMode, CacheContext
from .markdown_generation_strategy import ( from .markdown_generation_strategy import (
DefaultMarkdownGenerator, DefaultMarkdownGenerator,
MarkdownGenerationStrategy, MarkdownGenerationStrategy,
@@ -44,17 +44,46 @@ from .utils import (
RobotsParser, RobotsParser,
) )
from typing import Union, AsyncGenerator, TypeVar from typing import Union, AsyncGenerator
CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult) CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult)
RunManyReturn = Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]] # RunManyReturn = Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
class CrawlResultContainer(Generic[CrawlResultT]):
    """List-like wrapper around one or more crawl results.

    A single result is normalized to a one-element list, so callers can
    always iterate, index, and call ``len()`` on the container. Attribute
    lookups that the container itself cannot satisfy are forwarded to the
    first stored result, which keeps code written against a single result
    working unchanged.

    Args:
        results: Either one result or a list of results.
    """

    def __init__(self, results: Union[CrawlResultT, List[CrawlResultT]]):
        # Normalize to a list
        if isinstance(results, list):
            self._results = results
        else:
            self._results = [results]

    def __iter__(self):
        return iter(self._results)

    def __getitem__(self, index):
        return self._results[index]

    def __len__(self):
        return len(self._results)

    def __getattr__(self, attr):
        """Delegate unknown public attributes to the first stored result.

        Raises:
            AttributeError: If the container is empty, if ``attr`` is the
                container's own ``_results`` storage, or if ``attr`` is a
                dunder name.
        """
        # __getattr__ only runs after normal lookup has failed. If the name
        # being looked up is `_results` itself, the instance has not been
        # initialized (e.g. copy/pickle create it without calling __init__),
        # and touching self._results here would recurse without bound.
        # Dunder names are protocol probes (__deepcopy__, __setstate__, ...)
        # aimed at the container, so they must not be answered by the
        # wrapped result either.
        is_dunder = attr.startswith("__") and attr.endswith("__")
        if attr == "_results" or is_dunder:
            raise AttributeError(
                f"{self.__class__.__name__} object has no attribute '{attr}'"
            )
        # Delegate attribute access to the first element.
        if self._results:
            return getattr(self._results[0], attr)
        raise AttributeError(f"{self.__class__.__name__} object has no attribute '{attr}'")

    def __repr__(self):
        return f"{self.__class__.__name__}({self._results!r})"
# Redefine the union type. Now synchronous calls always return a container,
# while stream mode is handled with an AsyncGenerator.
RunManyReturn = Union[
CrawlResultContainer[CrawlResultT],
AsyncGenerator[CrawlResultT, None]
] ]
class AsyncWebCrawler: class AsyncWebCrawler:
""" """
Asynchronous web crawler with flexible caching capabilities. Asynchronous web crawler with flexible caching capabilities.
@@ -223,23 +252,6 @@ class AsyncWebCrawler:
self, self,
url: str, url: str,
config: CrawlerRunConfig = None, config: CrawlerRunConfig = None,
# Legacy parameters maintained for backwards compatibility
# word_count_threshold=MIN_WORD_THRESHOLD,
# extraction_strategy: ExtractionStrategy = None,
# chunking_strategy: ChunkingStrategy = RegexChunking(),
# content_filter: RelevantContentFilter = None,
# cache_mode: Optional[CacheMode] = None,
# Deprecated cache parameters
# bypass_cache: bool = False,
# disable_cache: bool = False,
# no_cache_read: bool = False,
# no_cache_write: bool = False,
# Other legacy parameters
# css_selector: str = None,
# screenshot: bool = False,
# pdf: bool = False,
# user_agent: str = None,
# verbose=True,
**kwargs, **kwargs,
) -> RunManyReturn: ) -> RunManyReturn:
""" """
@@ -270,47 +282,13 @@ class AsyncWebCrawler:
Returns: Returns:
CrawlResult: The result of crawling and processing CrawlResult: The result of crawling and processing
""" """
crawler_config = config or CrawlerRunConfig() config = config or CrawlerRunConfig()
if not isinstance(url, str) or not url: if not isinstance(url, str) or not url:
raise ValueError("Invalid URL, make sure the URL is a non-empty string") raise ValueError("Invalid URL, make sure the URL is a non-empty string")
async with self._lock or self.nullcontext(): async with self._lock or self.nullcontext():
try: try:
self.logger.verbose = crawler_config.verbose self.logger.verbose = config.verbose
# Handle configuration
if crawler_config is not None:
config = crawler_config
else:
# Merge all parameters into a single kwargs dict for config creation
# config_kwargs = {
# "word_count_threshold": word_count_threshold,
# "extraction_strategy": extraction_strategy,
# "chunking_strategy": chunking_strategy,
# "content_filter": content_filter,
# "cache_mode": cache_mode,
# "bypass_cache": bypass_cache,
# "disable_cache": disable_cache,
# "no_cache_read": no_cache_read,
# "no_cache_write": no_cache_write,
# "css_selector": css_selector,
# "screenshot": screenshot,
# "pdf": pdf,
# "verbose": verbose,
# **kwargs,
# }
# config = CrawlerRunConfig.from_kwargs(config_kwargs)
pass
# Handle deprecated cache parameters
# if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]):
# # Convert legacy parameters if cache_mode not provided
# if config.cache_mode is None:
# config.cache_mode = _legacy_to_cache_mode(
# disable_cache=disable_cache,
# bypass_cache=bypass_cache,
# no_cache_read=no_cache_read,
# no_cache_write=no_cache_write,
# )
# Default to ENABLED if no cache mode specified # Default to ENABLED if no cache mode specified
if config.cache_mode is None: if config.cache_mode is None:
@@ -457,7 +435,7 @@ class AsyncWebCrawler:
if cache_context.should_write() and not bool(cached_result): if cache_context.should_write() and not bool(cached_result):
await async_db_manager.acache_url(crawl_result) await async_db_manager.acache_url(crawl_result)
return crawl_result return CrawlResultContainer(crawl_result)
else: else:
self.logger.success( self.logger.success(
@@ -474,7 +452,7 @@ class AsyncWebCrawler:
cached_result.success = bool(html) cached_result.success = bool(html)
cached_result.session_id = getattr(config, "session_id", None) cached_result.session_id = getattr(config, "session_id", None)
cached_result.redirected_url = cached_result.redirected_url or url cached_result.redirected_url = cached_result.redirected_url or url
return cached_result return CrawlResultContainer(cached_result)
except Exception as e: except Exception as e:
error_context = get_error_context(sys.exc_info()) error_context = get_error_context(sys.exc_info())
@@ -492,8 +470,10 @@ class AsyncWebCrawler:
tag="ERROR", tag="ERROR",
) )
return CrawlResult( return CrawlResultContainer(
url=url, html="", success=False, error_message=error_message CrawlResult(
url=url, html="", success=False, error_message=error_message
)
) )
async def aprocess_html( async def aprocess_html(
@@ -669,17 +649,17 @@ class AsyncWebCrawler:
config: Optional[CrawlerRunConfig] = None, config: Optional[CrawlerRunConfig] = None,
dispatcher: Optional[BaseDispatcher] = None, dispatcher: Optional[BaseDispatcher] = None,
# Legacy parameters maintained for backwards compatibility # Legacy parameters maintained for backwards compatibility
word_count_threshold=MIN_WORD_THRESHOLD, # word_count_threshold=MIN_WORD_THRESHOLD,
extraction_strategy: ExtractionStrategy = None, # extraction_strategy: ExtractionStrategy = None,
chunking_strategy: ChunkingStrategy = RegexChunking(), # chunking_strategy: ChunkingStrategy = RegexChunking(),
content_filter: RelevantContentFilter = None, # content_filter: RelevantContentFilter = None,
cache_mode: Optional[CacheMode] = None, # cache_mode: Optional[CacheMode] = None,
bypass_cache: bool = False, # bypass_cache: bool = False,
css_selector: str = None, # css_selector: str = None,
screenshot: bool = False, # screenshot: bool = False,
pdf: bool = False, # pdf: bool = False,
user_agent: str = None, # user_agent: str = None,
verbose=True, # verbose=True,
**kwargs **kwargs
) -> RunManyReturn: ) -> RunManyReturn:
""" """
@@ -712,20 +692,21 @@ class AsyncWebCrawler:
): ):
print(f"Processed {result.url}: {len(result.markdown)} chars") print(f"Processed {result.url}: {len(result.markdown)} chars")
""" """
if config is None: config = config or CrawlerRunConfig()
config = CrawlerRunConfig( # if config is None:
word_count_threshold=word_count_threshold, # config = CrawlerRunConfig(
extraction_strategy=extraction_strategy, # word_count_threshold=word_count_threshold,
chunking_strategy=chunking_strategy, # extraction_strategy=extraction_strategy,
content_filter=content_filter, # chunking_strategy=chunking_strategy,
cache_mode=cache_mode, # content_filter=content_filter,
bypass_cache=bypass_cache, # cache_mode=cache_mode,
css_selector=css_selector, # bypass_cache=bypass_cache,
screenshot=screenshot, # css_selector=css_selector,
pdf=pdf, # screenshot=screenshot,
verbose=verbose, # pdf=pdf,
**kwargs, # verbose=verbose,
) # **kwargs,
# )
if dispatcher is None: if dispatcher is None:
dispatcher = MemoryAdaptiveDispatcher( dispatcher = MemoryAdaptiveDispatcher(

View File

@@ -10,6 +10,7 @@ from .filters import FilterChain
from .scorers import URLScorer from .scorers import URLScorer
from . import DeepCrawlStrategy from . import DeepCrawlStrategy
from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult
from ..utils import normalize_url_for_deep_crawl, efficient_normalize_url_for_deep_crawl
from math import inf as infinity from math import inf as infinity
class BFSDeepCrawlStrategy(DeepCrawlStrategy): class BFSDeepCrawlStrategy(DeepCrawlStrategy):
@@ -99,14 +100,17 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
# First collect all valid links # First collect all valid links
for link in links: for link in links:
url = link.get("href") url = link.get("href")
if url in visited: # Strip URL fragments to avoid duplicate crawling
# base_url = url.split('#')[0] if url else url
base_url = normalize_url_for_deep_crawl(url, source_url)
if base_url in visited:
continue continue
if not await self.can_process_url(url, next_depth): if not await self.can_process_url(url, next_depth):
self.stats.urls_skipped += 1 self.stats.urls_skipped += 1
continue continue
# Score the URL if a scorer is provided # Score the URL if a scorer is provided
score = self.url_scorer.score(url) if self.url_scorer else 0 score = self.url_scorer.score(base_url) if self.url_scorer else 0
# Skip URLs with scores below the threshold # Skip URLs with scores below the threshold
if score < self.score_threshold: if score < self.score_threshold:
@@ -114,7 +118,7 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
self.stats.urls_skipped += 1 self.stats.urls_skipped += 1
continue continue
valid_links.append((url, score)) valid_links.append((base_url, score))
# If we have more valid links than capacity, sort by score and take the top ones # If we have more valid links than capacity, sort by score and take the top ones
if len(valid_links) > remaining_capacity: if len(valid_links) > remaining_capacity:

View File

@@ -428,6 +428,11 @@ class DomainFilter(URLFilter):
return {domains.lower()} return {domains.lower()}
return {d.lower() for d in domains} return {d.lower() for d in domains}
@staticmethod
def _is_subdomain(domain: str, parent_domain: str) -> bool:
"""Check if domain is a subdomain of parent_domain"""
return domain == parent_domain or domain.endswith(f".{parent_domain}")
@staticmethod @staticmethod
@lru_cache(maxsize=10000) @lru_cache(maxsize=10000)
def _extract_domain(url: str) -> str: def _extract_domain(url: str) -> str:
@@ -444,20 +449,26 @@ class DomainFilter(URLFilter):
domain = self._extract_domain(url) domain = self._extract_domain(url)
# Early return for blocked domains # Check for blocked domains, including subdomains
if domain in self._blocked_domains: for blocked in self._blocked_domains:
self._update_stats(False) if self._is_subdomain(domain, blocked):
return False self._update_stats(False)
return False
# If no allowed domains specified, accept all non-blocked # If no allowed domains specified, accept all non-blocked
if self._allowed_domains is None: if self._allowed_domains is None:
self._update_stats(True) self._update_stats(True)
return True return True
# Final allowed domains check # Check if domain matches any allowed domain (including subdomains)
result = domain in self._allowed_domains for allowed in self._allowed_domains:
self._update_stats(result) if self._is_subdomain(domain, allowed):
return result self._update_stats(True)
return True
# No matches found
self._update_stats(False)
return False
class ContentRelevanceFilter(URLFilter): class ContentRelevanceFilter(URLFilter):

View File

@@ -1064,7 +1064,7 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
api_token (str): Legacy Parameter. API token for LLM provider api_token (str): Legacy Parameter. API token for LLM provider
llm_config (LLMConfig): LLM configuration object llm_config (LLMConfig): LLM configuration object
prompt (str, optional): Custom prompt template to use prompt (str, optional): Custom prompt template to use
**kwargs: Additional args passed to perform_completion_with_backoff **kwargs: Additional args passed to LLM processor
Returns: Returns:
dict: Generated schema following the JsonElementExtractionStrategy format dict: Generated schema following the JsonElementExtractionStrategy format
@@ -1130,7 +1130,8 @@ In this scenario, use your best judgment to generate the schema. Try to maximize
prompt_with_variables="\n\n".join([system_message["content"], user_message["content"]]), prompt_with_variables="\n\n".join([system_message["content"], user_message["content"]]),
json_response = True, json_response = True,
api_token=llm_config.api_token, api_token=llm_config.api_token,
**kwargs base_url=llm_config.base_url,
extra_args=kwargs
) )
# Extract and return schema # Extract and return schema

View File

@@ -37,13 +37,33 @@ class CrawlStatus(Enum):
FAILED = "FAILED" FAILED = "FAILED"
# @dataclass
# class CrawlStats:
# task_id: str
# url: str
# status: CrawlStatus
# start_time: Optional[datetime] = None
# end_time: Optional[datetime] = None
# memory_usage: float = 0.0
# peak_memory: float = 0.0
# error_message: str = ""
# @property
# def duration(self) -> str:
# if not self.start_time:
# return "0:00"
# end = self.end_time or datetime.now()
# duration = end - self.start_time
# return str(timedelta(seconds=int(duration.total_seconds())))
@dataclass @dataclass
class CrawlStats: class CrawlStats:
task_id: str task_id: str
url: str url: str
status: CrawlStatus status: CrawlStatus
start_time: Optional[datetime] = None start_time: Optional[Union[datetime, float]] = None
end_time: Optional[datetime] = None end_time: Optional[Union[datetime, float]] = None
memory_usage: float = 0.0 memory_usage: float = 0.0
peak_memory: float = 0.0 peak_memory: float = 0.0
error_message: str = "" error_message: str = ""
@@ -52,10 +72,20 @@ class CrawlStats:
def duration(self) -> str: def duration(self) -> str:
if not self.start_time: if not self.start_time:
return "0:00" return "0:00"
end = self.end_time or datetime.now()
duration = end - self.start_time
return str(timedelta(seconds=int(duration.total_seconds())))
# Convert start_time to datetime if it's a float
start = self.start_time
if isinstance(start, float):
start = datetime.fromtimestamp(start)
# Get end time or use current time
end = self.end_time or datetime.now()
# Convert end_time to datetime if it's a float
if isinstance(end, float):
end = datetime.fromtimestamp(end)
duration = end - start
return str(timedelta(seconds=int(duration.total_seconds())))
class DisplayMode(Enum): class DisplayMode(Enum):
DETAILED = "DETAILED" DETAILED = "DETAILED"

View File

@@ -1,5 +1,4 @@
import time import time
from urllib.parse import urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from bs4 import BeautifulSoup, Comment, element, Tag, NavigableString from bs4 import BeautifulSoup, Comment, element, Tag, NavigableString
import json import json
@@ -33,6 +32,8 @@ import hashlib
from urllib.robotparser import RobotFileParser from urllib.robotparser import RobotFileParser
import aiohttp import aiohttp
from urllib.parse import urlparse, urlunparse
from functools import lru_cache
from packaging import version from packaging import version
from . import __version__ from . import __version__
@@ -1962,6 +1963,82 @@ def normalize_url(href, base_url):
return normalized return normalized
def normalize_url_for_deep_crawl(href, base_url):
    """Resolve ``href`` against ``base_url`` and return a canonical URL string.

    Normalizations applied:
      * relative references are resolved with ``urljoin``;
      * the host (and port) part of the netloc is lowercased, while any
        ``user:password@`` prefix is left untouched because credentials are
        case-sensitive;
      * the fragment is removed;
      * common tracking parameters are dropped from the query string;
      * remaining query parameters are sorted by name (repeated names keep
        their relative order) and blank values are preserved;
      * a trailing slash is stripped from the path, except for the root path.

    Args:
        href: The link to normalize; may be relative. ``None`` or an empty
            string yields ``None``.
        base_url: URL used to resolve relative ``href`` values.

    Returns:
        The normalized absolute URL, or ``None`` when ``href`` is falsy.
    """
    from urllib.parse import parse_qsl, urlencode, urljoin, urlparse, urlunparse

    # Handle None or empty values
    if not href:
        return None

    # Use urljoin to handle relative URLs, then parse for normalization
    parsed = urlparse(urljoin(base_url, href.strip()))

    # Lowercase only the host[:port] part; userinfo must keep its case.
    userinfo, at_sign, host_port = parsed.netloc.rpartition("@")
    netloc = f"{userinfo}{at_sign}{host_port.lower()}"

    # Tracking parameters to remove (example - customize as needed)
    tracking_params = {"utm_source", "utm_medium", "utm_campaign", "ref", "fbclid"}

    query = parsed.query
    if query:
        # keep_blank_values=True so that "?flag=&x=1" does not lose "flag".
        kept = [
            (name, value)
            for name, value in parse_qsl(query, keep_blank_values=True)
            if name not in tracking_params
        ]
        # Stable sort by name: equivalent URLs get one spelling, and repeated
        # parameters keep the order in which they appeared.
        kept.sort(key=lambda pair: pair[0])
        query = urlencode(kept)

    return urlunparse((
        parsed.scheme,
        netloc,
        parsed.path.rstrip("/") or "/",  # Normalize trailing slash
        parsed.params,
        query,
        "",  # Remove fragment entirely
    ))
@lru_cache(maxsize=10000)
def efficient_normalize_url_for_deep_crawl(href, base_url):
    """Cheap, memoized URL normalization: resolve, lowercase netloc, drop fragment.

    Args:
        href: Link to normalize; may be relative. Falsy values yield ``None``.
        base_url: URL used to resolve relative ``href`` values.

    Returns:
        The absolute URL with a lowercased netloc and no fragment, or
        ``None`` when ``href`` is falsy. Path, params and query are kept
        exactly as parsed.
    """
    from urllib.parse import urljoin

    if not href:
        return None

    # Resolve relative references, then split into components.
    parts = urlparse(urljoin(base_url, href.strip()))

    # Only the two critical normalizations: lowercase netloc, empty fragment.
    cleaned = parts._replace(netloc=parts.netloc.lower(), fragment="")
    return urlunparse(cleaned)
def normalize_url_tmp(href, base_url): def normalize_url_tmp(href, base_url):
"""Normalize URLs to ensure consistent format""" """Normalize URLs to ensure consistent format"""
# Extract protocol and domain from base URL # Extract protocol and domain from base URL

View File

@@ -0,0 +1,78 @@
import asyncio
from typing import List
from crawl4ai import (
AsyncWebCrawler,
CrawlerRunConfig,
BFSDeepCrawlStrategy,
CrawlResult,
FilterChain,
DomainFilter,
URLPatternFilter,
)
# Classes imported from the crawl4ai package:
# - AsyncWebCrawler: The main class for web crawling.
# - CrawlerRunConfig: Configuration class for crawler behavior.
# - BFSDeepCrawlStrategy: Breadth-First Search deep crawling strategy.
# - CrawlResult: Data model for individual crawl results.
# - FilterChain: Used to chain multiple URL filters.
# - DomainFilter: Filter URLs based on allowed and blocked domains.
# - URLPatternFilter: Filter URLs based on patterns.
# FilterChain and URLPatternFilter can also be imported from crawl4ai.deep_crawling.filters;
# for simplicity and consistency, this example imports them directly from crawl4ai, which re-exports them in __init__.py.
async def basic_deep_crawl():
    """
    Run a small breadth-first deep crawl from a seed URL and print each hit.

    Demonstrates:
    - Breadth-First Search (BFS) deep crawling strategy.
    - Filtering URLs by pattern and by domain.
    - Accessing crawl results and their metadata.
    """
    # 1. Build the filter chain: a URL pattern filter for "*text*" plus a
    #    domain filter that allows "groq.com" and blocks "example.com".
    link_filters = FilterChain(
        filters=[
            URLPatternFilter(patterns=["*text*"]),
            DomainFilter(
                allowed_domains=["groq.com"],
                blocked_domains=["example.com"],
            ),
        ]
    )

    # 2. Configure the BFS strategy: at most 2 levels deep, at most 10 pages,
    #    include_external disabled, links screened by the chain above.
    bfs_strategy = BFSDeepCrawlStrategy(
        max_depth=2,
        max_pages=10,
        include_external=False,
        filter_chain=link_filters,
    )
    run_config = CrawlerRunConfig(
        deep_crawl_strategy=bfs_strategy,
        verbose=True,  # Detailed logging while crawling
    )

    # 3. Use AsyncWebCrawler as a context manager for automatic start and close.
    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun(
            # url="https://docs.crawl4ai.com",  # Alternative start URL
            url="https://console.groq.com/docs",
            config=run_config,
        )

        # 4. Print the URL and crawl depth (from metadata) of every result.
        for result in results:
            print(f"URL: {result.url}, Depth: {result.metadata.get('depth', 0)}")
if __name__ == "__main__":
    # asyncio is already imported at the top of this script, so no local
    # re-import is needed before starting the event loop.
    asyncio.run(basic_deep_crawl())