Merge branch 'vr0.5.0.post4'
This commit is contained in:
@@ -48,8 +48,9 @@ from .deep_crawling import (
|
|||||||
DeepCrawlStrategy,
|
DeepCrawlStrategy,
|
||||||
BFSDeepCrawlStrategy,
|
BFSDeepCrawlStrategy,
|
||||||
FilterChain,
|
FilterChain,
|
||||||
ContentTypeFilter,
|
URLPatternFilter,
|
||||||
DomainFilter,
|
DomainFilter,
|
||||||
|
ContentTypeFilter,
|
||||||
URLFilter,
|
URLFilter,
|
||||||
FilterStats,
|
FilterStats,
|
||||||
SEOFilter,
|
SEOFilter,
|
||||||
@@ -75,6 +76,7 @@ __all__ = [
|
|||||||
"BestFirstCrawlingStrategy",
|
"BestFirstCrawlingStrategy",
|
||||||
"DFSDeepCrawlStrategy",
|
"DFSDeepCrawlStrategy",
|
||||||
"FilterChain",
|
"FilterChain",
|
||||||
|
"URLPatternFilter",
|
||||||
"ContentTypeFilter",
|
"ContentTypeFilter",
|
||||||
"DomainFilter",
|
"DomainFilter",
|
||||||
"FilterStats",
|
"FilterStats",
|
||||||
|
|||||||
@@ -1,2 +1,2 @@
|
|||||||
# crawl4ai/_version.py
|
# crawl4ai/_version.py
|
||||||
__version__ = "0.5.0.post2"
|
__version__ = "0.5.0.post4"
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ from rich.live import Live
|
|||||||
from rich.table import Table
|
from rich.table import Table
|
||||||
from rich.console import Console
|
from rich.console import Console
|
||||||
from rich import box
|
from rich import box
|
||||||
from datetime import timedelta
|
from datetime import timedelta, datetime
|
||||||
from collections.abc import AsyncGenerator
|
from collections.abc import AsyncGenerator
|
||||||
import time
|
import time
|
||||||
import psutil
|
import psutil
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ import sys
|
|||||||
import time
|
import time
|
||||||
from colorama import Fore
|
from colorama import Fore
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional, List
|
from typing import Optional, List, Generic, TypeVar
|
||||||
import json
|
import json
|
||||||
import asyncio
|
import asyncio
|
||||||
|
|
||||||
@@ -23,7 +23,7 @@ from .async_crawler_strategy import (
|
|||||||
AsyncPlaywrightCrawlerStrategy,
|
AsyncPlaywrightCrawlerStrategy,
|
||||||
AsyncCrawlResponse,
|
AsyncCrawlResponse,
|
||||||
)
|
)
|
||||||
from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode
|
from .cache_context import CacheMode, CacheContext
|
||||||
from .markdown_generation_strategy import (
|
from .markdown_generation_strategy import (
|
||||||
DefaultMarkdownGenerator,
|
DefaultMarkdownGenerator,
|
||||||
MarkdownGenerationStrategy,
|
MarkdownGenerationStrategy,
|
||||||
@@ -44,17 +44,46 @@ from .utils import (
|
|||||||
RobotsParser,
|
RobotsParser,
|
||||||
)
|
)
|
||||||
|
|
||||||
from typing import Union, AsyncGenerator, TypeVar
|
from typing import Union, AsyncGenerator
|
||||||
|
|
||||||
CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult)
|
CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult)
|
||||||
RunManyReturn = Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
|
# RunManyReturn = Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
|
||||||
|
|
||||||
DeepCrawlSingleReturn = Union[List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
|
class CrawlResultContainer(Generic[CrawlResultT]):
    """List-like wrapper around one or more crawl results.

    A single result is stored as a one-element list, so callers can always
    iterate, index and call ``len()`` on the container. Attribute reads that
    the container does not define itself are forwarded to the first stored
    result, which lets a container holding one result be used like that
    result.
    """

    def __init__(self, results: Union[CrawlResultT, List[CrawlResultT]]):
        # Normalize to a list
        self._results = results if isinstance(results, list) else [results]

    def __iter__(self):
        return iter(self._results)

    def __getitem__(self, index):
        return self._results[index]

    def __len__(self):
        return len(self._results)

    def __getattr__(self, attr):
        """Delegate unknown public attributes to the first element.

        ``__getattr__`` only runs after normal lookup has failed. Two kinds
        of names are never forwarded:

        * ``_results`` itself -- if it is missing (for example on the blank
          instance that ``copy``/``pickle`` create before restoring state),
          reading ``self._results`` here would re-enter ``__getattr__``
          forever and end in ``RecursionError``.
        * dunder names -- protocol probes such as ``__setstate__`` or
          ``__getstate__`` must describe the container, not the wrapped
          result.

        Raises:
            AttributeError: if the name is not forwarded or the container is
                empty.
        """
        is_dunder = attr.startswith("__") and attr.endswith("__")
        if attr != "_results" and not is_dunder:
            # Read through __dict__ so a missing _results cannot recurse.
            results = self.__dict__.get("_results")
            if results:
                return getattr(results[0], attr)
        raise AttributeError(f"{self.__class__.__name__} object has no attribute '{attr}'")

    def __repr__(self):
        return f"{self.__class__.__name__}({self._results!r})"
|
||||||
|
|
||||||
|
# Redefine the union type. Now synchronous calls always return a container,
|
||||||
|
# while stream mode is handled with an AsyncGenerator.
|
||||||
|
RunManyReturn = Union[
|
||||||
|
CrawlResultContainer[CrawlResultT],
|
||||||
|
AsyncGenerator[CrawlResultT, None]
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class AsyncWebCrawler:
|
class AsyncWebCrawler:
|
||||||
"""
|
"""
|
||||||
Asynchronous web crawler with flexible caching capabilities.
|
Asynchronous web crawler with flexible caching capabilities.
|
||||||
@@ -223,23 +252,6 @@ class AsyncWebCrawler:
|
|||||||
self,
|
self,
|
||||||
url: str,
|
url: str,
|
||||||
config: CrawlerRunConfig = None,
|
config: CrawlerRunConfig = None,
|
||||||
# Legacy parameters maintained for backwards compatibility
|
|
||||||
# word_count_threshold=MIN_WORD_THRESHOLD,
|
|
||||||
# extraction_strategy: ExtractionStrategy = None,
|
|
||||||
# chunking_strategy: ChunkingStrategy = RegexChunking(),
|
|
||||||
# content_filter: RelevantContentFilter = None,
|
|
||||||
# cache_mode: Optional[CacheMode] = None,
|
|
||||||
# Deprecated cache parameters
|
|
||||||
# bypass_cache: bool = False,
|
|
||||||
# disable_cache: bool = False,
|
|
||||||
# no_cache_read: bool = False,
|
|
||||||
# no_cache_write: bool = False,
|
|
||||||
# Other legacy parameters
|
|
||||||
# css_selector: str = None,
|
|
||||||
# screenshot: bool = False,
|
|
||||||
# pdf: bool = False,
|
|
||||||
# user_agent: str = None,
|
|
||||||
# verbose=True,
|
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> RunManyReturn:
|
) -> RunManyReturn:
|
||||||
"""
|
"""
|
||||||
@@ -270,47 +282,13 @@ class AsyncWebCrawler:
|
|||||||
Returns:
|
Returns:
|
||||||
CrawlResult: The result of crawling and processing
|
CrawlResult: The result of crawling and processing
|
||||||
"""
|
"""
|
||||||
crawler_config = config or CrawlerRunConfig()
|
config = config or CrawlerRunConfig()
|
||||||
if not isinstance(url, str) or not url:
|
if not isinstance(url, str) or not url:
|
||||||
raise ValueError("Invalid URL, make sure the URL is a non-empty string")
|
raise ValueError("Invalid URL, make sure the URL is a non-empty string")
|
||||||
|
|
||||||
async with self._lock or self.nullcontext():
|
async with self._lock or self.nullcontext():
|
||||||
try:
|
try:
|
||||||
self.logger.verbose = crawler_config.verbose
|
self.logger.verbose = config.verbose
|
||||||
# Handle configuration
|
|
||||||
if crawler_config is not None:
|
|
||||||
config = crawler_config
|
|
||||||
else:
|
|
||||||
# Merge all parameters into a single kwargs dict for config creation
|
|
||||||
# config_kwargs = {
|
|
||||||
# "word_count_threshold": word_count_threshold,
|
|
||||||
# "extraction_strategy": extraction_strategy,
|
|
||||||
# "chunking_strategy": chunking_strategy,
|
|
||||||
# "content_filter": content_filter,
|
|
||||||
# "cache_mode": cache_mode,
|
|
||||||
# "bypass_cache": bypass_cache,
|
|
||||||
# "disable_cache": disable_cache,
|
|
||||||
# "no_cache_read": no_cache_read,
|
|
||||||
# "no_cache_write": no_cache_write,
|
|
||||||
# "css_selector": css_selector,
|
|
||||||
# "screenshot": screenshot,
|
|
||||||
# "pdf": pdf,
|
|
||||||
# "verbose": verbose,
|
|
||||||
# **kwargs,
|
|
||||||
# }
|
|
||||||
# config = CrawlerRunConfig.from_kwargs(config_kwargs)
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Handle deprecated cache parameters
|
|
||||||
# if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]):
|
|
||||||
# # Convert legacy parameters if cache_mode not provided
|
|
||||||
# if config.cache_mode is None:
|
|
||||||
# config.cache_mode = _legacy_to_cache_mode(
|
|
||||||
# disable_cache=disable_cache,
|
|
||||||
# bypass_cache=bypass_cache,
|
|
||||||
# no_cache_read=no_cache_read,
|
|
||||||
# no_cache_write=no_cache_write,
|
|
||||||
# )
|
|
||||||
|
|
||||||
# Default to ENABLED if no cache mode specified
|
# Default to ENABLED if no cache mode specified
|
||||||
if config.cache_mode is None:
|
if config.cache_mode is None:
|
||||||
@@ -457,7 +435,7 @@ class AsyncWebCrawler:
|
|||||||
if cache_context.should_write() and not bool(cached_result):
|
if cache_context.should_write() and not bool(cached_result):
|
||||||
await async_db_manager.acache_url(crawl_result)
|
await async_db_manager.acache_url(crawl_result)
|
||||||
|
|
||||||
return crawl_result
|
return CrawlResultContainer(crawl_result)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
self.logger.success(
|
self.logger.success(
|
||||||
@@ -474,7 +452,7 @@ class AsyncWebCrawler:
|
|||||||
cached_result.success = bool(html)
|
cached_result.success = bool(html)
|
||||||
cached_result.session_id = getattr(config, "session_id", None)
|
cached_result.session_id = getattr(config, "session_id", None)
|
||||||
cached_result.redirected_url = cached_result.redirected_url or url
|
cached_result.redirected_url = cached_result.redirected_url or url
|
||||||
return cached_result
|
return CrawlResultContainer(cached_result)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_context = get_error_context(sys.exc_info())
|
error_context = get_error_context(sys.exc_info())
|
||||||
@@ -492,8 +470,10 @@ class AsyncWebCrawler:
|
|||||||
tag="ERROR",
|
tag="ERROR",
|
||||||
)
|
)
|
||||||
|
|
||||||
return CrawlResult(
|
return CrawlResultContainer(
|
||||||
url=url, html="", success=False, error_message=error_message
|
CrawlResult(
|
||||||
|
url=url, html="", success=False, error_message=error_message
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
async def aprocess_html(
|
async def aprocess_html(
|
||||||
@@ -669,17 +649,17 @@ class AsyncWebCrawler:
|
|||||||
config: Optional[CrawlerRunConfig] = None,
|
config: Optional[CrawlerRunConfig] = None,
|
||||||
dispatcher: Optional[BaseDispatcher] = None,
|
dispatcher: Optional[BaseDispatcher] = None,
|
||||||
# Legacy parameters maintained for backwards compatibility
|
# Legacy parameters maintained for backwards compatibility
|
||||||
word_count_threshold=MIN_WORD_THRESHOLD,
|
# word_count_threshold=MIN_WORD_THRESHOLD,
|
||||||
extraction_strategy: ExtractionStrategy = None,
|
# extraction_strategy: ExtractionStrategy = None,
|
||||||
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
# chunking_strategy: ChunkingStrategy = RegexChunking(),
|
||||||
content_filter: RelevantContentFilter = None,
|
# content_filter: RelevantContentFilter = None,
|
||||||
cache_mode: Optional[CacheMode] = None,
|
# cache_mode: Optional[CacheMode] = None,
|
||||||
bypass_cache: bool = False,
|
# bypass_cache: bool = False,
|
||||||
css_selector: str = None,
|
# css_selector: str = None,
|
||||||
screenshot: bool = False,
|
# screenshot: bool = False,
|
||||||
pdf: bool = False,
|
# pdf: bool = False,
|
||||||
user_agent: str = None,
|
# user_agent: str = None,
|
||||||
verbose=True,
|
# verbose=True,
|
||||||
**kwargs
|
**kwargs
|
||||||
) -> RunManyReturn:
|
) -> RunManyReturn:
|
||||||
"""
|
"""
|
||||||
@@ -712,20 +692,21 @@ class AsyncWebCrawler:
|
|||||||
):
|
):
|
||||||
print(f"Processed {result.url}: {len(result.markdown)} chars")
|
print(f"Processed {result.url}: {len(result.markdown)} chars")
|
||||||
"""
|
"""
|
||||||
if config is None:
|
config = config or CrawlerRunConfig()
|
||||||
config = CrawlerRunConfig(
|
# if config is None:
|
||||||
word_count_threshold=word_count_threshold,
|
# config = CrawlerRunConfig(
|
||||||
extraction_strategy=extraction_strategy,
|
# word_count_threshold=word_count_threshold,
|
||||||
chunking_strategy=chunking_strategy,
|
# extraction_strategy=extraction_strategy,
|
||||||
content_filter=content_filter,
|
# chunking_strategy=chunking_strategy,
|
||||||
cache_mode=cache_mode,
|
# content_filter=content_filter,
|
||||||
bypass_cache=bypass_cache,
|
# cache_mode=cache_mode,
|
||||||
css_selector=css_selector,
|
# bypass_cache=bypass_cache,
|
||||||
screenshot=screenshot,
|
# css_selector=css_selector,
|
||||||
pdf=pdf,
|
# screenshot=screenshot,
|
||||||
verbose=verbose,
|
# pdf=pdf,
|
||||||
**kwargs,
|
# verbose=verbose,
|
||||||
)
|
# **kwargs,
|
||||||
|
# )
|
||||||
|
|
||||||
if dispatcher is None:
|
if dispatcher is None:
|
||||||
dispatcher = MemoryAdaptiveDispatcher(
|
dispatcher = MemoryAdaptiveDispatcher(
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ from .filters import FilterChain
|
|||||||
from .scorers import URLScorer
|
from .scorers import URLScorer
|
||||||
from . import DeepCrawlStrategy
|
from . import DeepCrawlStrategy
|
||||||
from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult
|
from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult
|
||||||
|
from ..utils import normalize_url_for_deep_crawl, efficient_normalize_url_for_deep_crawl
|
||||||
from math import inf as infinity
|
from math import inf as infinity
|
||||||
|
|
||||||
class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
||||||
@@ -99,14 +100,17 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
|||||||
# First collect all valid links
|
# First collect all valid links
|
||||||
for link in links:
|
for link in links:
|
||||||
url = link.get("href")
|
url = link.get("href")
|
||||||
if url in visited:
|
# Strip URL fragments to avoid duplicate crawling
|
||||||
|
# base_url = url.split('#')[0] if url else url
|
||||||
|
base_url = normalize_url_for_deep_crawl(url, source_url)
|
||||||
|
if base_url in visited:
|
||||||
continue
|
continue
|
||||||
if not await self.can_process_url(url, next_depth):
|
if not await self.can_process_url(url, next_depth):
|
||||||
self.stats.urls_skipped += 1
|
self.stats.urls_skipped += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Score the URL if a scorer is provided
|
# Score the URL if a scorer is provided
|
||||||
score = self.url_scorer.score(url) if self.url_scorer else 0
|
score = self.url_scorer.score(base_url) if self.url_scorer else 0
|
||||||
|
|
||||||
# Skip URLs with scores below the threshold
|
# Skip URLs with scores below the threshold
|
||||||
if score < self.score_threshold:
|
if score < self.score_threshold:
|
||||||
@@ -114,7 +118,7 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
|||||||
self.stats.urls_skipped += 1
|
self.stats.urls_skipped += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
valid_links.append((url, score))
|
valid_links.append((base_url, score))
|
||||||
|
|
||||||
# If we have more valid links than capacity, sort by score and take the top ones
|
# If we have more valid links than capacity, sort by score and take the top ones
|
||||||
if len(valid_links) > remaining_capacity:
|
if len(valid_links) > remaining_capacity:
|
||||||
|
|||||||
@@ -428,6 +428,11 @@ class DomainFilter(URLFilter):
|
|||||||
return {domains.lower()}
|
return {domains.lower()}
|
||||||
return {d.lower() for d in domains}
|
return {d.lower() for d in domains}
|
||||||
|
|
||||||
|
@staticmethod
def _is_subdomain(domain: str, parent_domain: str) -> bool:
    """Check if domain equals parent_domain or is one of its subdomains.

    Host names are compared case-insensitively and a trailing root dot
    (``example.com.``) is ignored. A match requires either equality or a
    suffix that starts at a label boundary, so ``badexample.com`` is not
    treated as a subdomain of ``example.com``.

    Args:
        domain: Host name to test.
        parent_domain: Host name that ``domain`` should equal or fall under.

    Returns:
        True when ``domain`` is ``parent_domain`` or a subdomain of it.
    """
    host = domain.lower().rstrip(".")
    parent = parent_domain.lower().rstrip(".")
    if host == parent:
        return True
    # An empty parent must not match every host via a bare "." suffix.
    return bool(parent) and host.endswith(f".{parent}")
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@lru_cache(maxsize=10000)
|
@lru_cache(maxsize=10000)
|
||||||
def _extract_domain(url: str) -> str:
|
def _extract_domain(url: str) -> str:
|
||||||
@@ -444,20 +449,26 @@ class DomainFilter(URLFilter):
|
|||||||
|
|
||||||
domain = self._extract_domain(url)
|
domain = self._extract_domain(url)
|
||||||
|
|
||||||
# Early return for blocked domains
|
# Check for blocked domains, including subdomains
|
||||||
if domain in self._blocked_domains:
|
for blocked in self._blocked_domains:
|
||||||
self._update_stats(False)
|
if self._is_subdomain(domain, blocked):
|
||||||
return False
|
self._update_stats(False)
|
||||||
|
return False
|
||||||
|
|
||||||
# If no allowed domains specified, accept all non-blocked
|
# If no allowed domains specified, accept all non-blocked
|
||||||
if self._allowed_domains is None:
|
if self._allowed_domains is None:
|
||||||
self._update_stats(True)
|
self._update_stats(True)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
# Final allowed domains check
|
# Check if domain matches any allowed domain (including subdomains)
|
||||||
result = domain in self._allowed_domains
|
for allowed in self._allowed_domains:
|
||||||
self._update_stats(result)
|
if self._is_subdomain(domain, allowed):
|
||||||
return result
|
self._update_stats(True)
|
||||||
|
return True
|
||||||
|
|
||||||
|
# No matches found
|
||||||
|
self._update_stats(False)
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
class ContentRelevanceFilter(URLFilter):
|
class ContentRelevanceFilter(URLFilter):
|
||||||
|
|||||||
@@ -1064,7 +1064,7 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
|
|||||||
api_token (str): Legacy Parameter. API token for LLM provider
|
api_token (str): Legacy Parameter. API token for LLM provider
|
||||||
llm_config (LLMConfig): LLM configuration object
|
llm_config (LLMConfig): LLM configuration object
|
||||||
prompt (str, optional): Custom prompt template to use
|
prompt (str, optional): Custom prompt template to use
|
||||||
**kwargs: Additional args passed to perform_completion_with_backoff
|
**kwargs: Additional args passed to LLM processor
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
dict: Generated schema following the JsonElementExtractionStrategy format
|
dict: Generated schema following the JsonElementExtractionStrategy format
|
||||||
@@ -1130,7 +1130,8 @@ In this scenario, use your best judgment to generate the schema. Try to maximize
|
|||||||
prompt_with_variables="\n\n".join([system_message["content"], user_message["content"]]),
|
prompt_with_variables="\n\n".join([system_message["content"], user_message["content"]]),
|
||||||
json_response = True,
|
json_response = True,
|
||||||
api_token=llm_config.api_token,
|
api_token=llm_config.api_token,
|
||||||
**kwargs
|
base_url=llm_config.base_url,
|
||||||
|
extra_args=kwargs
|
||||||
)
|
)
|
||||||
|
|
||||||
# Extract and return schema
|
# Extract and return schema
|
||||||
|
|||||||
@@ -37,13 +37,33 @@ class CrawlStatus(Enum):
|
|||||||
FAILED = "FAILED"
|
FAILED = "FAILED"
|
||||||
|
|
||||||
|
|
||||||
|
# @dataclass
|
||||||
|
# class CrawlStats:
|
||||||
|
# task_id: str
|
||||||
|
# url: str
|
||||||
|
# status: CrawlStatus
|
||||||
|
# start_time: Optional[datetime] = None
|
||||||
|
# end_time: Optional[datetime] = None
|
||||||
|
# memory_usage: float = 0.0
|
||||||
|
# peak_memory: float = 0.0
|
||||||
|
# error_message: str = ""
|
||||||
|
|
||||||
|
# @property
|
||||||
|
# def duration(self) -> str:
|
||||||
|
# if not self.start_time:
|
||||||
|
# return "0:00"
|
||||||
|
# end = self.end_time or datetime.now()
|
||||||
|
# duration = end - self.start_time
|
||||||
|
# return str(timedelta(seconds=int(duration.total_seconds())))
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class CrawlStats:
|
class CrawlStats:
|
||||||
task_id: str
|
task_id: str
|
||||||
url: str
|
url: str
|
||||||
status: CrawlStatus
|
status: CrawlStatus
|
||||||
start_time: Optional[datetime] = None
|
start_time: Optional[Union[datetime, float]] = None
|
||||||
end_time: Optional[datetime] = None
|
end_time: Optional[Union[datetime, float]] = None
|
||||||
memory_usage: float = 0.0
|
memory_usage: float = 0.0
|
||||||
peak_memory: float = 0.0
|
peak_memory: float = 0.0
|
||||||
error_message: str = ""
|
error_message: str = ""
|
||||||
@@ -52,10 +72,20 @@ class CrawlStats:
|
|||||||
def duration(self) -> str:
|
def duration(self) -> str:
|
||||||
if not self.start_time:
|
if not self.start_time:
|
||||||
return "0:00"
|
return "0:00"
|
||||||
end = self.end_time or datetime.now()
|
|
||||||
duration = end - self.start_time
|
|
||||||
return str(timedelta(seconds=int(duration.total_seconds())))
|
|
||||||
|
|
||||||
|
# Convert start_time to datetime if it's a float
|
||||||
|
start = self.start_time
|
||||||
|
if isinstance(start, float):
|
||||||
|
start = datetime.fromtimestamp(start)
|
||||||
|
|
||||||
|
# Get end time or use current time
|
||||||
|
end = self.end_time or datetime.now()
|
||||||
|
# Convert end_time to datetime if it's a float
|
||||||
|
if isinstance(end, float):
|
||||||
|
end = datetime.fromtimestamp(end)
|
||||||
|
|
||||||
|
duration = end - start
|
||||||
|
return str(timedelta(seconds=int(duration.total_seconds())))
|
||||||
|
|
||||||
class DisplayMode(Enum):
|
class DisplayMode(Enum):
|
||||||
DETAILED = "DETAILED"
|
DETAILED = "DETAILED"
|
||||||
|
|||||||
@@ -1,5 +1,4 @@
|
|||||||
import time
|
import time
|
||||||
from urllib.parse import urlparse
|
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
from bs4 import BeautifulSoup, Comment, element, Tag, NavigableString
|
from bs4 import BeautifulSoup, Comment, element, Tag, NavigableString
|
||||||
import json
|
import json
|
||||||
@@ -33,6 +32,8 @@ import hashlib
|
|||||||
|
|
||||||
from urllib.robotparser import RobotFileParser
|
from urllib.robotparser import RobotFileParser
|
||||||
import aiohttp
|
import aiohttp
|
||||||
|
from urllib.parse import urlparse, urlunparse
|
||||||
|
from functools import lru_cache
|
||||||
|
|
||||||
from packaging import version
|
from packaging import version
|
||||||
from . import __version__
|
from . import __version__
|
||||||
@@ -1962,6 +1963,82 @@ def normalize_url(href, base_url):
|
|||||||
return normalized
|
return normalized
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_url_for_deep_crawl(href, base_url):
    """Resolve ``href`` against ``base_url`` and return a canonical URL.

    The canonical form is used to recognise the same page reached through
    differently written links:

    * relative links are resolved with ``urljoin``;
    * the host (and port) is lower-cased, while any ``user:password@``
      prefix is left untouched because credentials are case-sensitive;
    * the fragment is removed;
    * common tracking parameters are dropped and the remaining query
      parameters are sorted by name, keeping blank values such as ``?q=``;
    * a trailing slash is stripped from the path (the root stays ``/``).

    Args:
        href: Link target as found in the page; may be relative.
        base_url: URL of the page the link was found on.

    Returns:
        The normalized absolute URL, or ``None`` when ``href`` is empty or
        ``None``.
    """
    from urllib.parse import urljoin, urlparse, urlunparse, parse_qs, urlencode

    # Handle None or empty values
    if not href:
        return None

    # Use urljoin to handle relative URLs
    parsed = urlparse(urljoin(base_url, href.strip()))

    # Lowercase only the host[:port] part; userinfo before "@" keeps its case.
    userinfo, at_sign, host_port = parsed.netloc.rpartition("@")
    netloc = f"{userinfo}{at_sign}{host_port.lower()}"

    query = parsed.query
    if query:
        # keep_blank_values=True: "?q=" is a real parameter and dropping it
        # could change which resource the URL addresses.
        params = parse_qs(query, keep_blank_values=True)

        # Remove tracking parameters (example - customize as needed)
        for tracking_param in ('utm_source', 'utm_medium', 'utm_campaign', 'ref', 'fbclid'):
            params.pop(tracking_param, None)

        # Rebuild the query string sorted by name so that "?a=1&b=2" and
        # "?b=2&a=1" normalize to the same URL.
        query = urlencode(sorted(params.items()), doseq=True) if params else ''

    return urlunparse((
        parsed.scheme,
        netloc,
        parsed.path.rstrip('/') or '/',  # Normalize trailing slash
        parsed.params,
        query,
        '',  # Fragment removed entirely
    ))
|
||||||
|
|
||||||
|
@lru_cache(maxsize=10000)
def efficient_normalize_url_for_deep_crawl(href, base_url):
    """Cheap, cached URL normalization for deep crawling.

    Only the two most important normalizations are applied: the host (and
    port) is lower-cased and the fragment is removed. Path, parameters and
    query are passed through unchanged. Results are memoized per
    ``(href, base_url)`` pair by ``lru_cache``.

    Args:
        href: Link target as found in the page; may be relative.
        base_url: URL of the page the link was found on.

    Returns:
        The normalized absolute URL, or ``None`` when ``href`` is empty or
        ``None``.
    """
    from urllib.parse import urljoin, urlparse, urlunparse

    if not href:
        return None

    # Resolve relative URLs, then split into components
    parsed = urlparse(urljoin(base_url, href.strip()))

    # Lowercase only host[:port]; a "user:password@" prefix is case-sensitive
    # and must not be altered.
    userinfo, at_sign, host_port = parsed.netloc.rpartition("@")

    return urlunparse((
        parsed.scheme,
        f"{userinfo}{at_sign}{host_port.lower()}",
        parsed.path,
        parsed.params,
        parsed.query,
        '',  # Remove fragment
    ))
|
||||||
|
|
||||||
|
|
||||||
def normalize_url_tmp(href, base_url):
|
def normalize_url_tmp(href, base_url):
|
||||||
"""Normalize URLs to ensure consistent format"""
|
"""Normalize URLs to ensure consistent format"""
|
||||||
# Extract protocol and domain from base URL
|
# Extract protocol and domain from base URL
|
||||||
|
|||||||
78
docs/snippets/deep_crawl/intro.py
Normal file
78
docs/snippets/deep_crawl/intro.py
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
import asyncio
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from crawl4ai import (
|
||||||
|
AsyncWebCrawler,
|
||||||
|
CrawlerRunConfig,
|
||||||
|
BFSDeepCrawlStrategy,
|
||||||
|
CrawlResult,
|
||||||
|
FilterChain,
|
||||||
|
DomainFilter,
|
||||||
|
URLPatternFilter,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Import necessary classes from crawl4ai library:
|
||||||
|
# - AsyncWebCrawler: The main class for web crawling.
|
||||||
|
# - CrawlerRunConfig: Configuration class for crawler behavior.
|
||||||
|
# - BFSDeepCrawlStrategy: Breadth-First Search deep crawling strategy.
|
||||||
|
# - CrawlResult: Data model for individual crawl results.
|
||||||
|
# - FilterChain: Used to chain multiple URL filters.
|
||||||
|
# - URLPatternFilter: Filter URLs based on patterns.
|
||||||
|
# NOTE: FilterChain and URLPatternFilter can also be imported from crawl4ai.deep_crawling.filters;
|
||||||
|
# this example imports them from the top-level crawl4ai package, which re-exports them in __init__.py.
|
||||||
|
|
||||||
|
async def basic_deep_crawl():
    """
    Run a small breadth-first deep crawl and print what was visited.

    The example shows how to:
    - restrict a crawl with URL-pattern and domain filters,
    - plug a BFSDeepCrawlStrategy into CrawlerRunConfig,
    - read the URL and crawl depth from each returned result.
    """

    # Filters: keep URLs matching "*text*", allow the "groq.com" domain and
    # block "example.com".
    filters = FilterChain(
        filters=[
            URLPatternFilter(patterns=["*text*"]),
            DomainFilter(
                allowed_domains=["groq.com"],
                blocked_domains=["example.com"],
            ),
        ]
    )

    # Strategy: breadth-first, at most 2 levels deep and 10 pages in total,
    # without following external links.
    strategy = BFSDeepCrawlStrategy(
        max_depth=2,
        max_pages=10,
        include_external=False,
        filter_chain=filters,
    )

    # verbose=True turns on detailed logging while crawling.
    config = CrawlerRunConfig(deep_crawl_strategy=strategy, verbose=True)

    # The context manager starts the crawler and closes it afterwards.
    # Alternative start URL: "https://docs.crawl4ai.com".
    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun(
            url="https://console.groq.com/docs",
            config=config,
        )

        # One line per crawled page: its URL and the depth stored in metadata.
        for page in results:
            print(f"URL: {page.url}, Depth: {page.metadata.get('depth', 0)}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # asyncio is already imported at the top of this file, so the example
    # can be run directly without importing it again here.
    asyncio.run(basic_deep_crawl())
|
||||||
Reference in New Issue
Block a user