Compare commits
3 Commits
next
...
feat/undet
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
805c498adf | ||
|
|
6a728cbe5b | ||
|
|
5c33cbcca2 |
@@ -216,7 +216,7 @@ Under certain assumptions about link preview accuracy:
|
||||
|
||||
### 8.1 Core Components
|
||||
|
||||
1. **AdaptiveCrawlResult**: Maintains crawl history and metrics
|
||||
1. **CrawlState**: Maintains crawl history and metrics
|
||||
2. **AdaptiveConfig**: Configuration parameters
|
||||
3. **CrawlStrategy**: Pluggable strategy interface
|
||||
4. **AdaptiveCrawler**: Main orchestrator
|
||||
|
||||
17
README.md
17
README.md
@@ -523,18 +523,15 @@ async def test_news_crawl():
|
||||
- **🧠 Adaptive Crawling**: Your crawler now learns and adapts to website patterns automatically:
|
||||
```python
|
||||
config = AdaptiveConfig(
|
||||
confidence_threshold=0.7, # Min confidence to stop crawling
|
||||
max_depth=5, # Maximum crawl depth
|
||||
max_pages=20, # Maximum number of pages to crawl
|
||||
strategy="statistical"
|
||||
confidence_threshold=0.7,
|
||||
max_history=100,
|
||||
learning_rate=0.2
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
adaptive_crawler = AdaptiveCrawler(crawler, config)
|
||||
state = await adaptive_crawler.digest(
|
||||
start_url="https://news.example.com",
|
||||
query="latest news content"
|
||||
)
|
||||
result = await crawler.arun(
|
||||
"https://news.example.com",
|
||||
config=CrawlerRunConfig(adaptive_config=config)
|
||||
)
|
||||
# Crawler learns patterns and improves extraction over time
|
||||
```
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@ import warnings
|
||||
|
||||
from .async_webcrawler import AsyncWebCrawler, CacheMode
|
||||
# MODIFIED: Add SeedingConfig and VirtualScrollConfig here
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig, LinkPreviewConfig
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig
|
||||
|
||||
from .content_scraping_strategy import (
|
||||
ContentScrapingStrategy,
|
||||
@@ -73,7 +73,7 @@ from .async_url_seeder import AsyncUrlSeeder
|
||||
from .adaptive_crawler import (
|
||||
AdaptiveCrawler,
|
||||
AdaptiveConfig,
|
||||
AdaptiveCrawlResult,
|
||||
CrawlState,
|
||||
CrawlStrategy,
|
||||
StatisticalStrategy
|
||||
)
|
||||
@@ -88,6 +88,13 @@ from .script import (
|
||||
ErrorDetail
|
||||
)
|
||||
|
||||
# Browser Adapters
|
||||
from .browser_adapter import (
|
||||
BrowserAdapter,
|
||||
PlaywrightAdapter,
|
||||
UndetectedAdapter
|
||||
)
|
||||
|
||||
from .utils import (
|
||||
start_colab_display_server,
|
||||
setup_colab_environment
|
||||
@@ -108,7 +115,7 @@ __all__ = [
|
||||
# Adaptive Crawler
|
||||
"AdaptiveCrawler",
|
||||
"AdaptiveConfig",
|
||||
"AdaptiveCrawlResult",
|
||||
"CrawlState",
|
||||
"CrawlStrategy",
|
||||
"StatisticalStrategy",
|
||||
"DeepCrawlStrategy",
|
||||
@@ -173,7 +180,10 @@ __all__ = [
|
||||
"CompilationResult",
|
||||
"ValidationResult",
|
||||
"ErrorDetail",
|
||||
"LinkPreviewConfig"
|
||||
# Browser Adapters
|
||||
"BrowserAdapter",
|
||||
"PlaywrightAdapter",
|
||||
"UndetectedAdapter",
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# crawl4ai/__version__.py
|
||||
|
||||
# This is the version that will be used for stable releases
|
||||
__version__ = "0.7.1"
|
||||
__version__ = "0.7.0"
|
||||
|
||||
# For nightly builds, this gets set during build process
|
||||
__nightly_version__ = None
|
||||
|
||||
1847
crawl4ai/adaptive_crawler copy.py
Normal file
1847
crawl4ai/adaptive_crawler copy.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -24,7 +24,7 @@ from crawl4ai.models import Link, CrawlResult
|
||||
import numpy as np
|
||||
|
||||
@dataclass
|
||||
class AdaptiveCrawlResult:
|
||||
class CrawlState:
|
||||
"""Tracks the current state of adaptive crawling"""
|
||||
crawled_urls: Set[str] = field(default_factory=set)
|
||||
knowledge_base: List[CrawlResult] = field(default_factory=list)
|
||||
@@ -80,7 +80,7 @@ class AdaptiveCrawlResult:
|
||||
json.dump(state_dict, f, indent=2)
|
||||
|
||||
@classmethod
|
||||
def load(cls, path: Union[str, Path]) -> 'AdaptiveCrawlResult':
|
||||
def load(cls, path: Union[str, Path]) -> 'CrawlState':
|
||||
"""Load state from disk"""
|
||||
path = Path(path)
|
||||
with open(path, 'r') as f:
|
||||
@@ -256,22 +256,22 @@ class CrawlStrategy(ABC):
|
||||
"""Abstract base class for crawling strategies"""
|
||||
|
||||
@abstractmethod
|
||||
async def calculate_confidence(self, state: AdaptiveCrawlResult) -> float:
|
||||
async def calculate_confidence(self, state: CrawlState) -> float:
|
||||
"""Calculate overall confidence that we have sufficient information"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def rank_links(self, state: AdaptiveCrawlResult, config: AdaptiveConfig) -> List[Tuple[Link, float]]:
|
||||
async def rank_links(self, state: CrawlState, config: AdaptiveConfig) -> List[Tuple[Link, float]]:
|
||||
"""Rank pending links by expected information gain"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def should_stop(self, state: AdaptiveCrawlResult, config: AdaptiveConfig) -> bool:
|
||||
async def should_stop(self, state: CrawlState, config: AdaptiveConfig) -> bool:
|
||||
"""Determine if crawling should stop"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def update_state(self, state: AdaptiveCrawlResult, new_results: List[CrawlResult]) -> None:
|
||||
async def update_state(self, state: CrawlState, new_results: List[CrawlResult]) -> None:
|
||||
"""Update state with new crawl results"""
|
||||
pass
|
||||
|
||||
@@ -284,7 +284,7 @@ class StatisticalStrategy(CrawlStrategy):
|
||||
self.bm25_k1 = 1.2 # BM25 parameter
|
||||
self.bm25_b = 0.75 # BM25 parameter
|
||||
|
||||
async def calculate_confidence(self, state: AdaptiveCrawlResult) -> float:
|
||||
async def calculate_confidence(self, state: CrawlState) -> float:
|
||||
"""Calculate confidence using coverage, consistency, and saturation"""
|
||||
if not state.knowledge_base:
|
||||
return 0.0
|
||||
@@ -303,7 +303,7 @@ class StatisticalStrategy(CrawlStrategy):
|
||||
|
||||
return confidence
|
||||
|
||||
def _calculate_coverage(self, state: AdaptiveCrawlResult) -> float:
|
||||
def _calculate_coverage(self, state: CrawlState) -> float:
|
||||
"""Coverage scoring - measures query term presence across knowledge base
|
||||
|
||||
Returns a score between 0 and 1, where:
|
||||
@@ -344,7 +344,7 @@ class StatisticalStrategy(CrawlStrategy):
|
||||
# This helps differentiate between partial and good coverage
|
||||
return min(1.0, math.sqrt(coverage))
|
||||
|
||||
def _calculate_consistency(self, state: AdaptiveCrawlResult) -> float:
|
||||
def _calculate_consistency(self, state: CrawlState) -> float:
|
||||
"""Information overlap between pages - high overlap suggests coherent topic coverage"""
|
||||
if len(state.knowledge_base) < 2:
|
||||
return 1.0 # Single or no documents are perfectly consistent
|
||||
@@ -371,7 +371,7 @@ class StatisticalStrategy(CrawlStrategy):
|
||||
|
||||
return consistency
|
||||
|
||||
def _calculate_saturation(self, state: AdaptiveCrawlResult) -> float:
|
||||
def _calculate_saturation(self, state: CrawlState) -> float:
|
||||
"""Diminishing returns indicator - are we still discovering new information?"""
|
||||
if not state.new_terms_history:
|
||||
return 0.0
|
||||
@@ -388,7 +388,7 @@ class StatisticalStrategy(CrawlStrategy):
|
||||
|
||||
return max(0.0, min(saturation, 1.0))
|
||||
|
||||
async def rank_links(self, state: AdaptiveCrawlResult, config: AdaptiveConfig) -> List[Tuple[Link, float]]:
|
||||
async def rank_links(self, state: CrawlState, config: AdaptiveConfig) -> List[Tuple[Link, float]]:
|
||||
"""Rank links by expected information gain"""
|
||||
scored_links = []
|
||||
|
||||
@@ -415,7 +415,7 @@ class StatisticalStrategy(CrawlStrategy):
|
||||
|
||||
return scored_links
|
||||
|
||||
def _calculate_relevance(self, link: Link, state: AdaptiveCrawlResult) -> float:
|
||||
def _calculate_relevance(self, link: Link, state: CrawlState) -> float:
|
||||
"""BM25 relevance score between link preview and query"""
|
||||
if not state.query or not link:
|
||||
return 0.0
|
||||
@@ -447,7 +447,7 @@ class StatisticalStrategy(CrawlStrategy):
|
||||
overlap = len(query_terms & link_terms) / len(query_terms)
|
||||
return overlap
|
||||
|
||||
def _calculate_novelty(self, link: Link, state: AdaptiveCrawlResult) -> float:
|
||||
def _calculate_novelty(self, link: Link, state: CrawlState) -> float:
|
||||
"""Estimate how much new information this link might provide"""
|
||||
if not state.knowledge_base:
|
||||
return 1.0 # First links are maximally novel
|
||||
@@ -502,7 +502,7 @@ class StatisticalStrategy(CrawlStrategy):
|
||||
|
||||
return min(score, 1.0)
|
||||
|
||||
async def should_stop(self, state: AdaptiveCrawlResult, config: AdaptiveConfig) -> bool:
|
||||
async def should_stop(self, state: CrawlState, config: AdaptiveConfig) -> bool:
|
||||
"""Determine if crawling should stop"""
|
||||
# Check confidence threshold
|
||||
confidence = state.metrics.get('confidence', 0.0)
|
||||
@@ -523,7 +523,7 @@ class StatisticalStrategy(CrawlStrategy):
|
||||
|
||||
return False
|
||||
|
||||
async def update_state(self, state: AdaptiveCrawlResult, new_results: List[CrawlResult]) -> None:
|
||||
async def update_state(self, state: CrawlState, new_results: List[CrawlResult]) -> None:
|
||||
"""Update state with new crawl results"""
|
||||
for result in new_results:
|
||||
# Track new terms
|
||||
@@ -921,7 +921,7 @@ class EmbeddingStrategy(CrawlStrategy):
|
||||
|
||||
return sorted(scored_links, key=lambda x: x[1], reverse=True)
|
||||
|
||||
async def calculate_confidence(self, state: AdaptiveCrawlResult) -> float:
|
||||
async def calculate_confidence(self, state: CrawlState) -> float:
|
||||
"""Coverage-based learning score (0–1)."""
|
||||
# Guard clauses
|
||||
if state.kb_embeddings is None or state.query_embeddings is None:
|
||||
@@ -951,7 +951,7 @@ class EmbeddingStrategy(CrawlStrategy):
|
||||
|
||||
|
||||
|
||||
# async def calculate_confidence(self, state: AdaptiveCrawlResult) -> float:
|
||||
# async def calculate_confidence(self, state: CrawlState) -> float:
|
||||
# """Calculate learning score for adaptive crawling (used for stopping)"""
|
||||
#
|
||||
|
||||
@@ -1021,7 +1021,7 @@ class EmbeddingStrategy(CrawlStrategy):
|
||||
# # For stopping criteria, return learning score
|
||||
# return float(learning_score)
|
||||
|
||||
async def rank_links(self, state: AdaptiveCrawlResult, config: AdaptiveConfig) -> List[Tuple[Link, float]]:
|
||||
async def rank_links(self, state: CrawlState, config: AdaptiveConfig) -> List[Tuple[Link, float]]:
|
||||
"""Main entry point for link ranking"""
|
||||
# Store config for use in other methods
|
||||
self.config = config
|
||||
@@ -1052,7 +1052,7 @@ class EmbeddingStrategy(CrawlStrategy):
|
||||
state.kb_embeddings
|
||||
)
|
||||
|
||||
async def validate_coverage(self, state: AdaptiveCrawlResult) -> float:
|
||||
async def validate_coverage(self, state: CrawlState) -> float:
|
||||
"""Validate coverage using held-out queries with caching"""
|
||||
if not hasattr(self, '_validation_queries') or not self._validation_queries:
|
||||
return state.metrics.get('confidence', 0.0)
|
||||
@@ -1088,7 +1088,7 @@ class EmbeddingStrategy(CrawlStrategy):
|
||||
|
||||
return validation_confidence
|
||||
|
||||
async def should_stop(self, state: AdaptiveCrawlResult, config: AdaptiveConfig) -> bool:
|
||||
async def should_stop(self, state: CrawlState, config: AdaptiveConfig) -> bool:
|
||||
"""Stop based on learning curve convergence"""
|
||||
confidence = state.metrics.get('confidence', 0.0)
|
||||
|
||||
@@ -1139,7 +1139,7 @@ class EmbeddingStrategy(CrawlStrategy):
|
||||
|
||||
return False
|
||||
|
||||
def get_quality_confidence(self, state: AdaptiveCrawlResult) -> float:
|
||||
def get_quality_confidence(self, state: CrawlState) -> float:
|
||||
"""Calculate quality-based confidence score for display"""
|
||||
learning_score = state.metrics.get('learning_score', 0.0)
|
||||
validation_score = state.metrics.get('validation_confidence', 0.0)
|
||||
@@ -1166,7 +1166,7 @@ class EmbeddingStrategy(CrawlStrategy):
|
||||
|
||||
return confidence
|
||||
|
||||
async def update_state(self, state: AdaptiveCrawlResult, new_results: List[CrawlResult]) -> None:
|
||||
async def update_state(self, state: CrawlState, new_results: List[CrawlResult]) -> None:
|
||||
"""Update embeddings and coverage metrics with deduplication"""
|
||||
from .utils import get_text_embeddings
|
||||
|
||||
@@ -1246,7 +1246,7 @@ class AdaptiveCrawler:
|
||||
self.strategy = self._create_strategy(self.config.strategy)
|
||||
|
||||
# Initialize state
|
||||
self.state: Optional[AdaptiveCrawlResult] = None
|
||||
self.state: Optional[CrawlState] = None
|
||||
|
||||
# Track if we own the crawler (for cleanup)
|
||||
self._owns_crawler = crawler is None
|
||||
@@ -1266,14 +1266,14 @@ class AdaptiveCrawler:
|
||||
async def digest(self,
|
||||
start_url: str,
|
||||
query: str,
|
||||
resume_from: Optional[str] = None) -> AdaptiveCrawlResult:
|
||||
resume_from: Optional[str] = None) -> CrawlState:
|
||||
"""Main entry point for adaptive crawling"""
|
||||
# Initialize or resume state
|
||||
if resume_from:
|
||||
self.state = AdaptiveCrawlResult.load(resume_from)
|
||||
self.state = CrawlState.load(resume_from)
|
||||
self.state.query = query # Update query in case it changed
|
||||
else:
|
||||
self.state = AdaptiveCrawlResult(
|
||||
self.state = CrawlState(
|
||||
crawled_urls=set(),
|
||||
knowledge_base=[],
|
||||
pending_links=[],
|
||||
@@ -1803,7 +1803,7 @@ class AdaptiveCrawler:
|
||||
|
||||
# Initialize state if needed
|
||||
if not self.state:
|
||||
self.state = AdaptiveCrawlResult()
|
||||
self.state = CrawlState()
|
||||
|
||||
# Add imported results
|
||||
self.state.knowledge_base.extend(imported_results)
|
||||
|
||||
@@ -383,6 +383,8 @@ class BrowserConfig:
|
||||
light_mode (bool): Disables certain background features for performance gains. Default: False.
|
||||
extra_args (list): Additional command-line arguments passed to the browser.
|
||||
Default: [].
|
||||
enable_stealth (bool): If True, applies playwright-stealth to bypass basic bot detection.
|
||||
Cannot be used with use_undetected browser mode. Default: False.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@@ -423,6 +425,7 @@ class BrowserConfig:
|
||||
extra_args: list = None,
|
||||
debugging_port: int = 9222,
|
||||
host: str = "localhost",
|
||||
enable_stealth: bool = False,
|
||||
):
|
||||
self.browser_type = browser_type
|
||||
self.headless = headless
|
||||
@@ -463,6 +466,7 @@ class BrowserConfig:
|
||||
self.verbose = verbose
|
||||
self.debugging_port = debugging_port
|
||||
self.host = host
|
||||
self.enable_stealth = enable_stealth
|
||||
|
||||
fa_user_agenr_generator = ValidUAGenerator()
|
||||
if self.user_agent_mode == "random":
|
||||
@@ -494,6 +498,13 @@ class BrowserConfig:
|
||||
# If persistent context is requested, ensure managed browser is enabled
|
||||
if self.use_persistent_context:
|
||||
self.use_managed_browser = True
|
||||
|
||||
# Validate stealth configuration
|
||||
if self.enable_stealth and self.use_managed_browser and self.browser_mode == "builtin":
|
||||
raise ValueError(
|
||||
"enable_stealth cannot be used with browser_mode='builtin'. "
|
||||
"Stealth mode requires a dedicated browser instance."
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def from_kwargs(kwargs: dict) -> "BrowserConfig":
|
||||
@@ -530,6 +541,7 @@ class BrowserConfig:
|
||||
extra_args=kwargs.get("extra_args", []),
|
||||
debugging_port=kwargs.get("debugging_port", 9222),
|
||||
host=kwargs.get("host", "localhost"),
|
||||
enable_stealth=kwargs.get("enable_stealth", False),
|
||||
)
|
||||
|
||||
def to_dict(self):
|
||||
@@ -564,6 +576,7 @@ class BrowserConfig:
|
||||
"verbose": self.verbose,
|
||||
"debugging_port": self.debugging_port,
|
||||
"host": self.host,
|
||||
"enable_stealth": self.enable_stealth,
|
||||
}
|
||||
|
||||
|
||||
|
||||
2450
crawl4ai/async_crawler_strategy.back.py
Normal file
2450
crawl4ai/async_crawler_strategy.back.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -21,6 +21,7 @@ from .async_logger import AsyncLogger
|
||||
from .ssl_certificate import SSLCertificate
|
||||
from .user_agent_generator import ValidUAGenerator
|
||||
from .browser_manager import BrowserManager
|
||||
from .browser_adapter import BrowserAdapter, PlaywrightAdapter, UndetectedAdapter
|
||||
|
||||
import aiofiles
|
||||
import aiohttp
|
||||
@@ -71,7 +72,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, browser_config: BrowserConfig = None, logger: AsyncLogger = None, **kwargs
|
||||
self, browser_config: BrowserConfig = None, logger: AsyncLogger = None, browser_adapter: BrowserAdapter = None, **kwargs
|
||||
):
|
||||
"""
|
||||
Initialize the AsyncPlaywrightCrawlerStrategy with a browser configuration.
|
||||
@@ -80,11 +81,16 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
browser_config (BrowserConfig): Configuration object containing browser settings.
|
||||
If None, will be created from kwargs for backwards compatibility.
|
||||
logger: Logger instance for recording events and errors.
|
||||
browser_adapter (BrowserAdapter): Browser adapter for handling browser-specific operations.
|
||||
If None, defaults to PlaywrightAdapter.
|
||||
**kwargs: Additional arguments for backwards compatibility and extending functionality.
|
||||
"""
|
||||
# Initialize browser config, either from provided object or kwargs
|
||||
self.browser_config = browser_config or BrowserConfig.from_kwargs(kwargs)
|
||||
self.logger = logger
|
||||
|
||||
# Initialize browser adapter
|
||||
self.adapter = browser_adapter or PlaywrightAdapter()
|
||||
|
||||
# Initialize session management
|
||||
self._downloaded_files = []
|
||||
@@ -104,7 +110,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
|
||||
# Initialize browser manager with config
|
||||
self.browser_manager = BrowserManager(
|
||||
browser_config=self.browser_config, logger=self.logger
|
||||
browser_config=self.browser_config,
|
||||
logger=self.logger,
|
||||
use_undetected=isinstance(self.adapter, UndetectedAdapter)
|
||||
)
|
||||
|
||||
async def __aenter__(self):
|
||||
@@ -322,7 +330,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
"""
|
||||
|
||||
try:
|
||||
result = await page.evaluate(wrapper_js)
|
||||
result = await self.adapter.evaluate(page, wrapper_js)
|
||||
return result
|
||||
except Exception as e:
|
||||
if "Error evaluating condition" in str(e):
|
||||
@@ -367,7 +375,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
|
||||
# Replace the iframe with a div containing the extracted content
|
||||
_iframe = iframe_content.replace("`", "\\`")
|
||||
await page.evaluate(
|
||||
await self.adapter.evaluate(page,
|
||||
f"""
|
||||
() => {{
|
||||
const iframe = document.getElementById('iframe-{i}');
|
||||
@@ -628,91 +636,16 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
page.on("requestfailed", handle_request_failed_capture)
|
||||
|
||||
# Console Message Capturing
|
||||
handle_console = None
|
||||
handle_error = None
|
||||
if config.capture_console_messages:
|
||||
def handle_console_capture(msg):
|
||||
try:
|
||||
message_type = "unknown"
|
||||
try:
|
||||
message_type = msg.type
|
||||
except:
|
||||
pass
|
||||
|
||||
message_text = "unknown"
|
||||
try:
|
||||
message_text = msg.text
|
||||
except:
|
||||
pass
|
||||
|
||||
# Basic console message with minimal content
|
||||
entry = {
|
||||
"type": message_type,
|
||||
"text": message_text,
|
||||
"timestamp": time.time()
|
||||
}
|
||||
|
||||
captured_console.append(entry)
|
||||
|
||||
except Exception as e:
|
||||
if self.logger:
|
||||
self.logger.warning(f"Error capturing console message: {e}", tag="CAPTURE")
|
||||
# Still add something to the list even on error
|
||||
captured_console.append({
|
||||
"type": "console_capture_error",
|
||||
"error": str(e),
|
||||
"timestamp": time.time()
|
||||
})
|
||||
|
||||
def handle_pageerror_capture(err):
|
||||
try:
|
||||
error_message = "Unknown error"
|
||||
try:
|
||||
error_message = err.message
|
||||
except:
|
||||
pass
|
||||
|
||||
error_stack = ""
|
||||
try:
|
||||
error_stack = err.stack
|
||||
except:
|
||||
pass
|
||||
|
||||
captured_console.append({
|
||||
"type": "error",
|
||||
"text": error_message,
|
||||
"stack": error_stack,
|
||||
"timestamp": time.time()
|
||||
})
|
||||
except Exception as e:
|
||||
if self.logger:
|
||||
self.logger.warning(f"Error capturing page error: {e}", tag="CAPTURE")
|
||||
captured_console.append({
|
||||
"type": "pageerror_capture_error",
|
||||
"error": str(e),
|
||||
"timestamp": time.time()
|
||||
})
|
||||
|
||||
# Add event listeners directly
|
||||
page.on("console", handle_console_capture)
|
||||
page.on("pageerror", handle_pageerror_capture)
|
||||
# Set up console capture using adapter
|
||||
handle_console = await self.adapter.setup_console_capture(page, captured_console)
|
||||
handle_error = await self.adapter.setup_error_capture(page, captured_console)
|
||||
|
||||
# Set up console logging if requested
|
||||
if config.log_console:
|
||||
def log_consol(
|
||||
msg, console_log_type="debug"
|
||||
): # Corrected the parameter syntax
|
||||
if console_log_type == "error":
|
||||
self.logger.error(
|
||||
message=f"Console error: {msg}", # Use f-string for variable interpolation
|
||||
tag="CONSOLE"
|
||||
)
|
||||
elif console_log_type == "debug":
|
||||
self.logger.debug(
|
||||
message=f"Console: {msg}", # Use f-string for variable interpolation
|
||||
tag="CONSOLE"
|
||||
)
|
||||
|
||||
page.on("console", log_consol)
|
||||
page.on("pageerror", lambda e: log_consol(e, "error"))
|
||||
# Note: For undetected browsers, console logging won't work directly
|
||||
# but captured messages can still be logged after retrieval
|
||||
|
||||
try:
|
||||
# Get SSL certificate information if requested and URL is HTTPS
|
||||
@@ -998,7 +931,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
await page.wait_for_load_state("domcontentloaded", timeout=5)
|
||||
except PlaywrightTimeoutError:
|
||||
pass
|
||||
await page.evaluate(update_image_dimensions_js)
|
||||
await self.adapter.evaluate(page, update_image_dimensions_js)
|
||||
except Exception as e:
|
||||
self.logger.error(
|
||||
message="Error updating image dimensions: {error}",
|
||||
@@ -1027,7 +960,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
|
||||
for selector in selectors:
|
||||
try:
|
||||
content = await page.evaluate(
|
||||
content = await self.adapter.evaluate(page,
|
||||
f"""Array.from(document.querySelectorAll("{selector}"))
|
||||
.map(el => el.outerHTML)
|
||||
.join('')"""
|
||||
@@ -1085,6 +1018,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
await asyncio.sleep(delay)
|
||||
return await page.content()
|
||||
|
||||
# For undetected browsers, retrieve console messages before returning
|
||||
if config.capture_console_messages and hasattr(self.adapter, 'retrieve_console_messages'):
|
||||
final_messages = await self.adapter.retrieve_console_messages(page)
|
||||
captured_console.extend(final_messages)
|
||||
|
||||
# Return complete response
|
||||
return AsyncCrawlResponse(
|
||||
html=html,
|
||||
@@ -1123,8 +1061,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
page.remove_listener("response", handle_response_capture)
|
||||
page.remove_listener("requestfailed", handle_request_failed_capture)
|
||||
if config.capture_console_messages:
|
||||
page.remove_listener("console", handle_console_capture)
|
||||
page.remove_listener("pageerror", handle_pageerror_capture)
|
||||
# Retrieve any final console messages for undetected browsers
|
||||
if hasattr(self.adapter, 'retrieve_console_messages'):
|
||||
final_messages = await self.adapter.retrieve_console_messages(page)
|
||||
captured_console.extend(final_messages)
|
||||
|
||||
# Clean up console capture
|
||||
await self.adapter.cleanup_console_capture(page, handle_console, handle_error)
|
||||
|
||||
# Close the page
|
||||
await page.close()
|
||||
@@ -1354,7 +1297,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
"""
|
||||
|
||||
# Execute virtual scroll capture
|
||||
result = await page.evaluate(virtual_scroll_js, config.to_dict())
|
||||
result = await self.adapter.evaluate(page, virtual_scroll_js, config.to_dict())
|
||||
|
||||
if result.get("replaced", False):
|
||||
self.logger.success(
|
||||
@@ -1438,7 +1381,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
remove_overlays_js = load_js_script("remove_overlay_elements")
|
||||
|
||||
try:
|
||||
await page.evaluate(
|
||||
await self.adapter.evaluate(page,
|
||||
f"""
|
||||
(() => {{
|
||||
try {{
|
||||
@@ -1843,7 +1786,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
# When {script} contains statements (e.g., const link = …; link.click();),
|
||||
# this forms invalid JavaScript, causing Playwright execution error: SyntaxError: Unexpected token 'const'.
|
||||
# """
|
||||
result = await page.evaluate(
|
||||
result = await self.adapter.evaluate(page,
|
||||
f"""
|
||||
(async () => {{
|
||||
try {{
|
||||
@@ -1965,7 +1908,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
for script in scripts:
|
||||
try:
|
||||
# Execute the script and wait for network idle
|
||||
result = await page.evaluate(
|
||||
result = await self.adapter.evaluate(page,
|
||||
f"""
|
||||
(() => {{
|
||||
return new Promise((resolve) => {{
|
||||
@@ -2049,7 +1992,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
Returns:
|
||||
Boolean indicating visibility
|
||||
"""
|
||||
return await page.evaluate(
|
||||
return await self.adapter.evaluate(page,
|
||||
"""
|
||||
() => {
|
||||
const element = document.body;
|
||||
@@ -2090,7 +2033,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
Dict containing scroll status and position information
|
||||
"""
|
||||
try:
|
||||
result = await page.evaluate(
|
||||
result = await self.adapter.evaluate(page,
|
||||
f"""() => {{
|
||||
try {{
|
||||
const startX = window.scrollX;
|
||||
@@ -2147,7 +2090,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
Returns:
|
||||
Dict containing width and height of the page
|
||||
"""
|
||||
return await page.evaluate(
|
||||
return await self.adapter.evaluate(page,
|
||||
"""
|
||||
() => {
|
||||
const {scrollWidth, scrollHeight} = document.documentElement;
|
||||
@@ -2167,7 +2110,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
bool: True if page needs scrolling
|
||||
"""
|
||||
try:
|
||||
need_scroll = await page.evaluate(
|
||||
need_scroll = await self.adapter.evaluate(page,
|
||||
"""
|
||||
() => {
|
||||
const scrollHeight = document.documentElement.scrollHeight;
|
||||
@@ -2186,265 +2129,3 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
return True # Default to scrolling if check fails
|
||||
|
||||
|
||||
####################################################################################################
|
||||
# HTTP Crawler Strategy
|
||||
####################################################################################################
|
||||
|
||||
class HTTPCrawlerError(Exception):
|
||||
"""Base error class for HTTP crawler specific exceptions"""
|
||||
pass
|
||||
|
||||
|
||||
class ConnectionTimeoutError(HTTPCrawlerError):
|
||||
"""Raised when connection timeout occurs"""
|
||||
pass
|
||||
|
||||
|
||||
class HTTPStatusError(HTTPCrawlerError):
|
||||
"""Raised for unexpected status codes"""
|
||||
def __init__(self, status_code: int, message: str):
|
||||
self.status_code = status_code
|
||||
super().__init__(f"HTTP {status_code}: {message}")
|
||||
|
||||
|
||||
class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
"""
|
||||
Fast, lightweight HTTP-only crawler strategy optimized for memory efficiency.
|
||||
"""
|
||||
|
||||
__slots__ = ('logger', 'max_connections', 'dns_cache_ttl', 'chunk_size', '_session', 'hooks', 'browser_config')
|
||||
|
||||
DEFAULT_TIMEOUT: Final[int] = 30
|
||||
DEFAULT_CHUNK_SIZE: Final[int] = 64 * 1024
|
||||
DEFAULT_MAX_CONNECTIONS: Final[int] = min(32, (os.cpu_count() or 1) * 4)
|
||||
DEFAULT_DNS_CACHE_TTL: Final[int] = 300
|
||||
VALID_SCHEMES: Final = frozenset({'http', 'https', 'file', 'raw'})
|
||||
|
||||
_BASE_HEADERS: Final = MappingProxyType({
|
||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
||||
'Accept-Language': 'en-US,en;q=0.5',
|
||||
'Accept-Encoding': 'gzip, deflate, br',
|
||||
'Connection': 'keep-alive',
|
||||
'Upgrade-Insecure-Requests': '1',
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
|
||||
})
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
browser_config: Optional[HTTPCrawlerConfig] = None,
|
||||
logger: Optional[AsyncLogger] = None,
|
||||
max_connections: int = DEFAULT_MAX_CONNECTIONS,
|
||||
dns_cache_ttl: int = DEFAULT_DNS_CACHE_TTL,
|
||||
chunk_size: int = DEFAULT_CHUNK_SIZE
|
||||
):
|
||||
"""Initialize the HTTP crawler with config"""
|
||||
self.browser_config = browser_config or HTTPCrawlerConfig()
|
||||
self.logger = logger
|
||||
self.max_connections = max_connections
|
||||
self.dns_cache_ttl = dns_cache_ttl
|
||||
self.chunk_size = chunk_size
|
||||
self._session: Optional[aiohttp.ClientSession] = None
|
||||
|
||||
self.hooks = {
|
||||
k: partial(self._execute_hook, k)
|
||||
for k in ('before_request', 'after_request', 'on_error')
|
||||
}
|
||||
|
||||
# Set default hooks
|
||||
self.set_hook('before_request', lambda *args, **kwargs: None)
|
||||
self.set_hook('after_request', lambda *args, **kwargs: None)
|
||||
self.set_hook('on_error', lambda *args, **kwargs: None)
|
||||
|
||||
|
||||
async def __aenter__(self) -> AsyncHTTPCrawlerStrategy:
|
||||
await self.start()
|
||||
return self
|
||||
|
||||
async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
|
||||
await self.close()
|
||||
|
||||
@contextlib.asynccontextmanager
|
||||
async def _session_context(self):
|
||||
try:
|
||||
if not self._session:
|
||||
await self.start()
|
||||
yield self._session
|
||||
finally:
|
||||
pass
|
||||
|
||||
def set_hook(self, hook_type: str, hook_func: Callable) -> None:
|
||||
if hook_type in self.hooks:
|
||||
self.hooks[hook_type] = partial(self._execute_hook, hook_type, hook_func)
|
||||
else:
|
||||
raise ValueError(f"Invalid hook type: {hook_type}")
|
||||
|
||||
async def _execute_hook(
|
||||
self,
|
||||
hook_type: str,
|
||||
hook_func: Callable,
|
||||
*args: Any,
|
||||
**kwargs: Any
|
||||
) -> Any:
|
||||
if asyncio.iscoroutinefunction(hook_func):
|
||||
return await hook_func(*args, **kwargs)
|
||||
return hook_func(*args, **kwargs)
|
||||
|
||||
async def start(self) -> None:
|
||||
if not self._session:
|
||||
connector = aiohttp.TCPConnector(
|
||||
limit=self.max_connections,
|
||||
ttl_dns_cache=self.dns_cache_ttl,
|
||||
use_dns_cache=True,
|
||||
force_close=False
|
||||
)
|
||||
self._session = aiohttp.ClientSession(
|
||||
headers=dict(self._BASE_HEADERS),
|
||||
connector=connector,
|
||||
timeout=ClientTimeout(total=self.DEFAULT_TIMEOUT)
|
||||
)
|
||||
|
||||
async def close(self) -> None:
|
||||
if self._session and not self._session.closed:
|
||||
try:
|
||||
await asyncio.wait_for(self._session.close(), timeout=5.0)
|
||||
except asyncio.TimeoutError:
|
||||
if self.logger:
|
||||
self.logger.warning(
|
||||
message="Session cleanup timed out",
|
||||
tag="CLEANUP"
|
||||
)
|
||||
finally:
|
||||
self._session = None
|
||||
|
||||
async def _stream_file(self, path: str) -> AsyncGenerator[memoryview, None]:
|
||||
async with aiofiles.open(path, mode='rb') as f:
|
||||
while chunk := await f.read(self.chunk_size):
|
||||
yield memoryview(chunk)
|
||||
|
||||
async def _handle_file(self, path: str) -> AsyncCrawlResponse:
|
||||
if not os.path.exists(path):
|
||||
raise FileNotFoundError(f"Local file not found: {path}")
|
||||
|
||||
chunks = []
|
||||
async for chunk in self._stream_file(path):
|
||||
chunks.append(chunk.tobytes().decode('utf-8', errors='replace'))
|
||||
|
||||
return AsyncCrawlResponse(
|
||||
html=''.join(chunks),
|
||||
response_headers={},
|
||||
status_code=200
|
||||
)
|
||||
|
||||
async def _handle_raw(self, content: str) -> AsyncCrawlResponse:
|
||||
return AsyncCrawlResponse(
|
||||
html=content,
|
||||
response_headers={},
|
||||
status_code=200
|
||||
)
|
||||
|
||||
|
||||
async def _handle_http(
|
||||
self,
|
||||
url: str,
|
||||
config: CrawlerRunConfig
|
||||
) -> AsyncCrawlResponse:
|
||||
async with self._session_context() as session:
|
||||
timeout = ClientTimeout(
|
||||
total=config.page_timeout or self.DEFAULT_TIMEOUT,
|
||||
connect=10,
|
||||
sock_read=30
|
||||
)
|
||||
|
||||
headers = dict(self._BASE_HEADERS)
|
||||
if self.browser_config.headers:
|
||||
headers.update(self.browser_config.headers)
|
||||
|
||||
request_kwargs = {
|
||||
'timeout': timeout,
|
||||
'allow_redirects': self.browser_config.follow_redirects,
|
||||
'ssl': self.browser_config.verify_ssl,
|
||||
'headers': headers
|
||||
}
|
||||
|
||||
if self.browser_config.method == "POST":
|
||||
if self.browser_config.data:
|
||||
request_kwargs['data'] = self.browser_config.data
|
||||
if self.browser_config.json:
|
||||
request_kwargs['json'] = self.browser_config.json
|
||||
|
||||
await self.hooks['before_request'](url, request_kwargs)
|
||||
|
||||
try:
|
||||
async with session.request(self.browser_config.method, url, **request_kwargs) as response:
|
||||
content = memoryview(await response.read())
|
||||
|
||||
if not (200 <= response.status < 300):
|
||||
raise HTTPStatusError(
|
||||
response.status,
|
||||
f"Unexpected status code for {url}"
|
||||
)
|
||||
|
||||
encoding = response.charset
|
||||
if not encoding:
|
||||
encoding = chardet.detect(content.tobytes())['encoding'] or 'utf-8'
|
||||
|
||||
result = AsyncCrawlResponse(
|
||||
html=content.tobytes().decode(encoding, errors='replace'),
|
||||
response_headers=dict(response.headers),
|
||||
status_code=response.status,
|
||||
redirected_url=str(response.url)
|
||||
)
|
||||
|
||||
await self.hooks['after_request'](result)
|
||||
return result
|
||||
|
||||
except aiohttp.ServerTimeoutError as e:
|
||||
await self.hooks['on_error'](e)
|
||||
raise ConnectionTimeoutError(f"Request timed out: {str(e)}")
|
||||
|
||||
except aiohttp.ClientConnectorError as e:
|
||||
await self.hooks['on_error'](e)
|
||||
raise ConnectionError(f"Connection failed: {str(e)}")
|
||||
|
||||
except aiohttp.ClientError as e:
|
||||
await self.hooks['on_error'](e)
|
||||
raise HTTPCrawlerError(f"HTTP client error: {str(e)}")
|
||||
|
||||
except asyncio.exceptions.TimeoutError as e:
|
||||
await self.hooks['on_error'](e)
|
||||
raise ConnectionTimeoutError(f"Request timed out: {str(e)}")
|
||||
|
||||
except Exception as e:
|
||||
await self.hooks['on_error'](e)
|
||||
raise HTTPCrawlerError(f"HTTP request failed: {str(e)}")
|
||||
|
||||
async def crawl(
|
||||
self,
|
||||
url: str,
|
||||
config: Optional[CrawlerRunConfig] = None,
|
||||
**kwargs
|
||||
) -> AsyncCrawlResponse:
|
||||
config = config or CrawlerRunConfig.from_kwargs(kwargs)
|
||||
|
||||
parsed = urlparse(url)
|
||||
scheme = parsed.scheme.rstrip('/')
|
||||
|
||||
if scheme not in self.VALID_SCHEMES:
|
||||
raise ValueError(f"Unsupported URL scheme: {scheme}")
|
||||
|
||||
try:
|
||||
if scheme == 'file':
|
||||
return await self._handle_file(parsed.path)
|
||||
elif scheme == 'raw':
|
||||
return await self._handle_raw(parsed.path)
|
||||
else: # http or https
|
||||
return await self._handle_http(url, config)
|
||||
|
||||
except Exception as e:
|
||||
if self.logger:
|
||||
self.logger.error(
|
||||
message="Crawl failed: {error}",
|
||||
tag="CRAWL",
|
||||
params={"error": str(e), "url": url}
|
||||
)
|
||||
raise
|
||||
@@ -47,7 +47,6 @@ from .utils import (
|
||||
get_error_context,
|
||||
RobotsParser,
|
||||
preprocess_html_for_schema,
|
||||
should_crawl_based_on_head,
|
||||
)
|
||||
|
||||
|
||||
@@ -269,56 +268,31 @@ class AsyncWebCrawler:
|
||||
cached_result = await async_db_manager.aget_cached_url(url)
|
||||
|
||||
if cached_result:
|
||||
# Check if SMART mode requires validation
|
||||
if cache_context.cache_mode == CacheMode.SMART:
|
||||
# Perform HEAD check to see if content has changed
|
||||
user_agent = self.crawler_strategy.user_agent if hasattr(self.crawler_strategy, 'user_agent') else "Mozilla/5.0"
|
||||
should_crawl, reason = await should_crawl_based_on_head(
|
||||
url=url,
|
||||
cached_headers=cached_result.response_headers or {},
|
||||
user_agent=user_agent,
|
||||
timeout=5
|
||||
)
|
||||
|
||||
if should_crawl:
|
||||
self.logger.info(
|
||||
f"SMART cache: {reason} - Re-crawling {url}",
|
||||
tag="SMART"
|
||||
)
|
||||
cached_result = None # Force re-crawl
|
||||
else:
|
||||
self.logger.info(
|
||||
f"SMART cache: {reason} - Using cache for {url}",
|
||||
tag="SMART"
|
||||
)
|
||||
|
||||
# Process cached result if still valid
|
||||
if cached_result:
|
||||
html = sanitize_input_encode(cached_result.html)
|
||||
extracted_content = sanitize_input_encode(
|
||||
cached_result.extracted_content or ""
|
||||
)
|
||||
extracted_content = (
|
||||
None
|
||||
if not extracted_content or extracted_content == "[]"
|
||||
else extracted_content
|
||||
)
|
||||
# If screenshot is requested but its not in cache, then set cache_result to None
|
||||
screenshot_data = cached_result.screenshot
|
||||
pdf_data = cached_result.pdf
|
||||
# if config.screenshot and not screenshot or config.pdf and not pdf:
|
||||
if config.screenshot and not screenshot_data:
|
||||
cached_result = None
|
||||
html = sanitize_input_encode(cached_result.html)
|
||||
extracted_content = sanitize_input_encode(
|
||||
cached_result.extracted_content or ""
|
||||
)
|
||||
extracted_content = (
|
||||
None
|
||||
if not extracted_content or extracted_content == "[]"
|
||||
else extracted_content
|
||||
)
|
||||
# If screenshot is requested but its not in cache, then set cache_result to None
|
||||
screenshot_data = cached_result.screenshot
|
||||
pdf_data = cached_result.pdf
|
||||
# if config.screenshot and not screenshot or config.pdf and not pdf:
|
||||
if config.screenshot and not screenshot_data:
|
||||
cached_result = None
|
||||
|
||||
if config.pdf and not pdf_data:
|
||||
cached_result = None
|
||||
if config.pdf and not pdf_data:
|
||||
cached_result = None
|
||||
|
||||
self.logger.url_status(
|
||||
url=cache_context.display_url,
|
||||
success=bool(html),
|
||||
timing=time.perf_counter() - start_time,
|
||||
tag="FETCH",
|
||||
)
|
||||
self.logger.url_status(
|
||||
url=cache_context.display_url,
|
||||
success=bool(html),
|
||||
timing=time.perf_counter() - start_time,
|
||||
tag="FETCH",
|
||||
)
|
||||
|
||||
# Update proxy configuration from rotation strategy if available
|
||||
if config and config.proxy_rotation_strategy:
|
||||
|
||||
293
crawl4ai/browser_adapter.py
Normal file
293
crawl4ai/browser_adapter.py
Normal file
@@ -0,0 +1,293 @@
|
||||
# browser_adapter.py
|
||||
"""
|
||||
Browser adapter for Crawl4AI to support both Playwright and undetected browsers
|
||||
with minimal changes to existing codebase.
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import List, Dict, Any, Optional, Callable
|
||||
import time
|
||||
import json
|
||||
|
||||
# Import both, but use conditionally
|
||||
try:
|
||||
from playwright.async_api import Page
|
||||
except ImportError:
|
||||
Page = Any
|
||||
|
||||
try:
|
||||
from patchright.async_api import Page as UndetectedPage
|
||||
except ImportError:
|
||||
UndetectedPage = Any
|
||||
|
||||
|
||||
class BrowserAdapter(ABC):
|
||||
"""Abstract adapter for browser-specific operations"""
|
||||
|
||||
@abstractmethod
|
||||
async def evaluate(self, page: Page, expression: str, arg: Any = None) -> Any:
|
||||
"""Execute JavaScript in the page"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def setup_console_capture(self, page: Page, captured_console: List[Dict]) -> Optional[Callable]:
|
||||
"""Setup console message capturing, returns handler function if needed"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def setup_error_capture(self, page: Page, captured_console: List[Dict]) -> Optional[Callable]:
|
||||
"""Setup error capturing, returns handler function if needed"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def retrieve_console_messages(self, page: Page) -> List[Dict]:
|
||||
"""Retrieve captured console messages (for undetected browsers)"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def cleanup_console_capture(self, page: Page, handle_console: Optional[Callable], handle_error: Optional[Callable]):
|
||||
"""Clean up console event listeners"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_imports(self) -> tuple:
|
||||
"""Get the appropriate imports for this adapter"""
|
||||
pass
|
||||
|
||||
|
||||
class PlaywrightAdapter(BrowserAdapter):
|
||||
"""Adapter for standard Playwright"""
|
||||
|
||||
async def evaluate(self, page: Page, expression: str, arg: Any = None) -> Any:
|
||||
"""Standard Playwright evaluate"""
|
||||
if arg is not None:
|
||||
return await page.evaluate(expression, arg)
|
||||
return await page.evaluate(expression)
|
||||
|
||||
async def setup_console_capture(self, page: Page, captured_console: List[Dict]) -> Optional[Callable]:
|
||||
"""Setup console capture using Playwright's event system"""
|
||||
def handle_console_capture(msg):
|
||||
try:
|
||||
message_type = "unknown"
|
||||
try:
|
||||
message_type = msg.type
|
||||
except:
|
||||
pass
|
||||
|
||||
message_text = "unknown"
|
||||
try:
|
||||
message_text = msg.text
|
||||
except:
|
||||
pass
|
||||
|
||||
entry = {
|
||||
"type": message_type,
|
||||
"text": message_text,
|
||||
"timestamp": time.time()
|
||||
}
|
||||
|
||||
captured_console.append(entry)
|
||||
|
||||
except Exception as e:
|
||||
captured_console.append({
|
||||
"type": "console_capture_error",
|
||||
"error": str(e),
|
||||
"timestamp": time.time()
|
||||
})
|
||||
|
||||
page.on("console", handle_console_capture)
|
||||
return handle_console_capture
|
||||
|
||||
async def setup_error_capture(self, page: Page, captured_console: List[Dict]) -> Optional[Callable]:
|
||||
"""Setup error capture using Playwright's event system"""
|
||||
def handle_pageerror_capture(err):
|
||||
try:
|
||||
error_message = "Unknown error"
|
||||
try:
|
||||
error_message = err.message
|
||||
except:
|
||||
pass
|
||||
|
||||
error_stack = ""
|
||||
try:
|
||||
error_stack = err.stack
|
||||
except:
|
||||
pass
|
||||
|
||||
captured_console.append({
|
||||
"type": "error",
|
||||
"text": error_message,
|
||||
"stack": error_stack,
|
||||
"timestamp": time.time()
|
||||
})
|
||||
except Exception as e:
|
||||
captured_console.append({
|
||||
"type": "pageerror_capture_error",
|
||||
"error": str(e),
|
||||
"timestamp": time.time()
|
||||
})
|
||||
|
||||
page.on("pageerror", handle_pageerror_capture)
|
||||
return handle_pageerror_capture
|
||||
|
||||
async def retrieve_console_messages(self, page: Page) -> List[Dict]:
|
||||
"""Not needed for Playwright - messages are captured via events"""
|
||||
return []
|
||||
|
||||
async def cleanup_console_capture(self, page: Page, handle_console: Optional[Callable], handle_error: Optional[Callable]):
|
||||
"""Remove event listeners"""
|
||||
if handle_console:
|
||||
page.remove_listener("console", handle_console)
|
||||
if handle_error:
|
||||
page.remove_listener("pageerror", handle_error)
|
||||
|
||||
def get_imports(self) -> tuple:
|
||||
"""Return Playwright imports"""
|
||||
from playwright.async_api import Page, Error
|
||||
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
|
||||
return Page, Error, PlaywrightTimeoutError
|
||||
|
||||
|
||||
class UndetectedAdapter(BrowserAdapter):
|
||||
"""Adapter for undetected browser automation with stealth features"""
|
||||
|
||||
def __init__(self):
|
||||
self._console_script_injected = {}
|
||||
|
||||
async def evaluate(self, page: UndetectedPage, expression: str, arg: Any = None) -> Any:
|
||||
"""Undetected browser evaluate with isolated context"""
|
||||
# For most evaluations, use isolated context for stealth
|
||||
# Only use non-isolated when we need to access our injected console capture
|
||||
isolated = not (
|
||||
"__console" in expression or
|
||||
"__captured" in expression or
|
||||
"__error" in expression or
|
||||
"window.__" in expression
|
||||
)
|
||||
|
||||
if arg is not None:
|
||||
return await page.evaluate(expression, arg, isolated_context=isolated)
|
||||
return await page.evaluate(expression, isolated_context=isolated)
|
||||
|
||||
async def setup_console_capture(self, page: UndetectedPage, captured_console: List[Dict]) -> Optional[Callable]:
|
||||
"""Setup console capture using JavaScript injection for undetected browsers"""
|
||||
if not self._console_script_injected.get(page, False):
|
||||
await page.add_init_script("""
|
||||
// Initialize console capture
|
||||
window.__capturedConsole = [];
|
||||
window.__capturedErrors = [];
|
||||
|
||||
// Store original console methods
|
||||
const originalConsole = {};
|
||||
['log', 'info', 'warn', 'error', 'debug'].forEach(method => {
|
||||
originalConsole[method] = console[method];
|
||||
console[method] = function(...args) {
|
||||
try {
|
||||
window.__capturedConsole.push({
|
||||
type: method,
|
||||
text: args.map(arg => {
|
||||
try {
|
||||
if (typeof arg === 'object') {
|
||||
return JSON.stringify(arg);
|
||||
}
|
||||
return String(arg);
|
||||
} catch (e) {
|
||||
return '[Object]';
|
||||
}
|
||||
}).join(' '),
|
||||
timestamp: Date.now()
|
||||
});
|
||||
} catch (e) {
|
||||
// Fail silently to avoid detection
|
||||
}
|
||||
|
||||
// Call original method
|
||||
originalConsole[method].apply(console, args);
|
||||
};
|
||||
});
|
||||
""")
|
||||
self._console_script_injected[page] = True
|
||||
|
||||
return None # No handler function needed for undetected browser
|
||||
|
||||
async def setup_error_capture(self, page: UndetectedPage, captured_console: List[Dict]) -> Optional[Callable]:
|
||||
"""Setup error capture using JavaScript injection for undetected browsers"""
|
||||
if not self._console_script_injected.get(page, False):
|
||||
await page.add_init_script("""
|
||||
// Capture errors
|
||||
window.addEventListener('error', (event) => {
|
||||
try {
|
||||
window.__capturedErrors.push({
|
||||
type: 'error',
|
||||
text: event.message,
|
||||
stack: event.error ? event.error.stack : '',
|
||||
filename: event.filename,
|
||||
lineno: event.lineno,
|
||||
colno: event.colno,
|
||||
timestamp: Date.now()
|
||||
});
|
||||
} catch (e) {
|
||||
// Fail silently
|
||||
}
|
||||
});
|
||||
|
||||
// Capture unhandled promise rejections
|
||||
window.addEventListener('unhandledrejection', (event) => {
|
||||
try {
|
||||
window.__capturedErrors.push({
|
||||
type: 'unhandledrejection',
|
||||
text: event.reason ? String(event.reason) : 'Unhandled Promise Rejection',
|
||||
stack: event.reason && event.reason.stack ? event.reason.stack : '',
|
||||
timestamp: Date.now()
|
||||
});
|
||||
} catch (e) {
|
||||
// Fail silently
|
||||
}
|
||||
});
|
||||
""")
|
||||
self._console_script_injected[page] = True
|
||||
|
||||
return None # No handler function needed for undetected browser
|
||||
|
||||
async def retrieve_console_messages(self, page: UndetectedPage) -> List[Dict]:
|
||||
"""Retrieve captured console messages and errors from the page"""
|
||||
messages = []
|
||||
|
||||
try:
|
||||
# Get console messages
|
||||
console_messages = await page.evaluate(
|
||||
"() => { const msgs = window.__capturedConsole || []; window.__capturedConsole = []; return msgs; }",
|
||||
isolated_context=False
|
||||
)
|
||||
messages.extend(console_messages)
|
||||
|
||||
# Get errors
|
||||
errors = await page.evaluate(
|
||||
"() => { const errs = window.__capturedErrors || []; window.__capturedErrors = []; return errs; }",
|
||||
isolated_context=False
|
||||
)
|
||||
messages.extend(errors)
|
||||
|
||||
# Convert timestamps from JS to Python format
|
||||
for msg in messages:
|
||||
if 'timestamp' in msg and isinstance(msg['timestamp'], (int, float)):
|
||||
msg['timestamp'] = msg['timestamp'] / 1000.0 # Convert from ms to seconds
|
||||
|
||||
except Exception:
|
||||
# If retrieval fails, return empty list
|
||||
pass
|
||||
|
||||
return messages
|
||||
|
||||
async def cleanup_console_capture(self, page: UndetectedPage, handle_console: Optional[Callable], handle_error: Optional[Callable]):
|
||||
"""Clean up for undetected browser - retrieve final messages"""
|
||||
# For undetected browser, we don't have event listeners to remove
|
||||
# but we should retrieve any final messages
|
||||
final_messages = await self.retrieve_console_messages(page)
|
||||
return final_messages
|
||||
|
||||
def get_imports(self) -> tuple:
|
||||
"""Return undetected browser imports"""
|
||||
from patchright.async_api import Page, Error
|
||||
from patchright.async_api import TimeoutError as PlaywrightTimeoutError
|
||||
return Page, Error, PlaywrightTimeoutError
|
||||
@@ -16,7 +16,6 @@ from .config import DOWNLOAD_PAGE_TIMEOUT
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from .utils import get_chromium_path
|
||||
|
||||
|
||||
BROWSER_DISABLE_OPTIONS = [
|
||||
"--disable-background-networking",
|
||||
"--disable-background-timer-throttling",
|
||||
@@ -573,21 +572,26 @@ class BrowserManager:
|
||||
_playwright_instance = None
|
||||
|
||||
@classmethod
|
||||
async def get_playwright(cls):
|
||||
from playwright.async_api import async_playwright
|
||||
async def get_playwright(cls, use_undetected: bool = False):
|
||||
if use_undetected:
|
||||
from patchright.async_api import async_playwright
|
||||
else:
|
||||
from playwright.async_api import async_playwright
|
||||
cls._playwright_instance = await async_playwright().start()
|
||||
return cls._playwright_instance
|
||||
|
||||
def __init__(self, browser_config: BrowserConfig, logger=None):
|
||||
def __init__(self, browser_config: BrowserConfig, logger=None, use_undetected: bool = False):
|
||||
"""
|
||||
Initialize the BrowserManager with a browser configuration.
|
||||
|
||||
Args:
|
||||
browser_config (BrowserConfig): Configuration object containing all browser settings
|
||||
logger: Logger instance for recording events and errors
|
||||
use_undetected (bool): Whether to use undetected browser (Patchright)
|
||||
"""
|
||||
self.config: BrowserConfig = browser_config
|
||||
self.logger = logger
|
||||
self.use_undetected = use_undetected
|
||||
|
||||
# Browser state
|
||||
self.browser = None
|
||||
@@ -601,7 +605,11 @@ class BrowserManager:
|
||||
|
||||
# Keep track of contexts by a "config signature," so each unique config reuses a single context
|
||||
self.contexts_by_config = {}
|
||||
self._contexts_lock = asyncio.Lock()
|
||||
self._contexts_lock = asyncio.Lock()
|
||||
|
||||
# Stealth-related attributes
|
||||
self._stealth_instance = None
|
||||
self._stealth_cm = None
|
||||
|
||||
# Initialize ManagedBrowser if needed
|
||||
if self.config.use_managed_browser:
|
||||
@@ -630,9 +638,21 @@ class BrowserManager:
|
||||
if self.playwright is not None:
|
||||
await self.close()
|
||||
|
||||
from playwright.async_api import async_playwright
|
||||
if self.use_undetected:
|
||||
from patchright.async_api import async_playwright
|
||||
else:
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
self.playwright = await async_playwright().start()
|
||||
# Initialize playwright with or without stealth
|
||||
if self.config.enable_stealth and not self.use_undetected:
|
||||
# Import stealth only when needed
|
||||
from playwright_stealth import Stealth
|
||||
# Use the recommended stealth wrapper approach
|
||||
self._stealth_instance = Stealth()
|
||||
self._stealth_cm = self._stealth_instance.use_async(async_playwright())
|
||||
self.playwright = await self._stealth_cm.__aenter__()
|
||||
else:
|
||||
self.playwright = await async_playwright().start()
|
||||
|
||||
if self.config.cdp_url or self.config.use_managed_browser:
|
||||
self.config.use_managed_browser = True
|
||||
@@ -1094,5 +1114,19 @@ class BrowserManager:
|
||||
self.managed_browser = None
|
||||
|
||||
if self.playwright:
|
||||
await self.playwright.stop()
|
||||
# Handle stealth context manager cleanup if it exists
|
||||
if hasattr(self, '_stealth_cm') and self._stealth_cm is not None:
|
||||
try:
|
||||
await self._stealth_cm.__aexit__(None, None, None)
|
||||
except Exception as e:
|
||||
if self.logger:
|
||||
self.logger.error(
|
||||
message="Error closing stealth context: {error}",
|
||||
tag="ERROR",
|
||||
params={"error": str(e)}
|
||||
)
|
||||
self._stealth_cm = None
|
||||
self._stealth_instance = None
|
||||
else:
|
||||
await self.playwright.stop()
|
||||
self.playwright = None
|
||||
|
||||
@@ -11,7 +11,6 @@ class CacheMode(Enum):
|
||||
- READ_ONLY: Only read from cache, don't write
|
||||
- WRITE_ONLY: Only write to cache, don't read
|
||||
- BYPASS: Bypass cache for this operation
|
||||
- SMART: Validate cache with HEAD request before using
|
||||
"""
|
||||
|
||||
ENABLED = "enabled"
|
||||
@@ -19,7 +18,6 @@ class CacheMode(Enum):
|
||||
READ_ONLY = "read_only"
|
||||
WRITE_ONLY = "write_only"
|
||||
BYPASS = "bypass"
|
||||
SMART = "smart"
|
||||
|
||||
|
||||
class CacheContext:
|
||||
@@ -64,14 +62,14 @@ class CacheContext:
|
||||
|
||||
How it works:
|
||||
1. If always_bypass is True or is_cacheable is False, return False.
|
||||
2. If cache_mode is ENABLED, READ_ONLY, or SMART, return True.
|
||||
2. If cache_mode is ENABLED or READ_ONLY, return True.
|
||||
|
||||
Returns:
|
||||
bool: True if cache should be read, False otherwise.
|
||||
"""
|
||||
if self.always_bypass or not self.is_cacheable:
|
||||
return False
|
||||
return self.cache_mode in [CacheMode.ENABLED, CacheMode.READ_ONLY, CacheMode.SMART]
|
||||
return self.cache_mode in [CacheMode.ENABLED, CacheMode.READ_ONLY]
|
||||
|
||||
def should_write(self) -> bool:
|
||||
"""
|
||||
@@ -79,14 +77,14 @@ class CacheContext:
|
||||
|
||||
How it works:
|
||||
1. If always_bypass is True or is_cacheable is False, return False.
|
||||
2. If cache_mode is ENABLED, WRITE_ONLY, or SMART, return True.
|
||||
2. If cache_mode is ENABLED or WRITE_ONLY, return True.
|
||||
|
||||
Returns:
|
||||
bool: True if cache should be written, False otherwise.
|
||||
"""
|
||||
if self.always_bypass or not self.is_cacheable:
|
||||
return False
|
||||
return self.cache_mode in [CacheMode.ENABLED, CacheMode.WRITE_ONLY, CacheMode.SMART]
|
||||
return self.cache_mode in [CacheMode.ENABLED, CacheMode.WRITE_ONLY]
|
||||
|
||||
@property
|
||||
def display_url(self) -> str:
|
||||
|
||||
@@ -1145,10 +1145,10 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
||||
link_data["intrinsic_score"] = intrinsic_score
|
||||
except Exception:
|
||||
# Fail gracefully - assign default score
|
||||
link_data["intrinsic_score"] = 0
|
||||
link_data["intrinsic_score"] = float('inf')
|
||||
else:
|
||||
# No scoring enabled - assign infinity (all links equal priority)
|
||||
link_data["intrinsic_score"] = 0
|
||||
link_data["intrinsic_score"] = float('inf')
|
||||
|
||||
is_external = is_external_url(normalized_href, base_domain)
|
||||
if is_external:
|
||||
|
||||
@@ -1088,147 +1088,111 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
|
||||
@staticmethod
|
||||
def generate_schema(
|
||||
html: str,
|
||||
*,
|
||||
schema_type: str = "CSS", # "CSS" or "XPATH"
|
||||
query: str | None = None,
|
||||
target_json_example: str | None = None,
|
||||
last_instruction: str | None = None, # extra “IMPORTANT” notes
|
||||
llm_config: "LLMConfig" = create_llm_config(),
|
||||
token_usages: Optional[list["TokenUsage"]] = None,
|
||||
prompt: str | None = None,
|
||||
**kwargs,
|
||||
schema_type: str = "CSS", # or XPATH
|
||||
query: str = None,
|
||||
target_json_example: str = None,
|
||||
llm_config: 'LLMConfig' = create_llm_config(),
|
||||
provider: str = None,
|
||||
api_token: str = None,
|
||||
**kwargs
|
||||
) -> dict:
|
||||
"""
|
||||
Produce a JSON extraction schema from raw HTML.
|
||||
|
||||
- If `query` is given, the task section echoes it.
|
||||
- If no `query` but `target_json_example` exists,
|
||||
we instruct the model to fit the schema to that example.
|
||||
- If neither is provided, we ask the model to detect
|
||||
the most obvious repeating data and build a schema.
|
||||
|
||||
Returns
|
||||
-------
|
||||
dict
|
||||
A schema compliant with JsonElementExtractionStrategy.
|
||||
Generate extraction schema from HTML content and optional query.
|
||||
|
||||
Args:
|
||||
html (str): The HTML content to analyze
|
||||
query (str, optional): Natural language description of what data to extract
|
||||
provider (str): Legacy Parameter. LLM provider to use
|
||||
api_token (str): Legacy Parameter. API token for LLM provider
|
||||
llm_config (LLMConfig): LLM configuration object
|
||||
prompt (str, optional): Custom prompt template to use
|
||||
**kwargs: Additional args passed to LLM processor
|
||||
|
||||
Returns:
|
||||
dict: Generated schema following the JsonElementExtractionStrategy format
|
||||
"""
|
||||
import json, re, textwrap
|
||||
from .prompts import JSON_SCHEMA_BUILDER, JSON_SCHEMA_BUILDER_XPATH
|
||||
from .prompts import JSON_SCHEMA_BUILDER
|
||||
from .utils import perform_completion_with_backoff
|
||||
|
||||
# ─── basic validation ────────────────────────────────────
|
||||
if not html or not html.strip():
|
||||
raise ValueError("html must be non-empty")
|
||||
if schema_type not in {"CSS", "XPATH"}:
|
||||
raise ValueError("schema_type must be 'CSS' or 'XPATH'")
|
||||
for name, msg in JsonElementExtractionStrategy._GENERATE_SCHEMA_UNWANTED_PROPS.items():
|
||||
if locals().get(name) is not None:
|
||||
raise AttributeError(f"Setting '{name}' is deprecated. {msg}")
|
||||
|
||||
# ─── prompt selection ────────────────────────────────────
|
||||
prompt_template = (
|
||||
prompt
|
||||
if prompt is not None
|
||||
else (JSON_SCHEMA_BUILDER if schema_type == "CSS" else JSON_SCHEMA_BUILDER_XPATH)
|
||||
)
|
||||
|
||||
# ─── derive task description ─────────────────────────────
|
||||
if query:
|
||||
task_line = query.strip()
|
||||
elif target_json_example:
|
||||
task_line = (
|
||||
"Use the example JSON below to infer all required fields, "
|
||||
"then generate a schema that extracts matching data."
|
||||
)
|
||||
else:
|
||||
task_line = (
|
||||
"Detect the most obvious repeating data on this page and "
|
||||
"generate a schema that captures it completely."
|
||||
)
|
||||
|
||||
# ─── build user prompt body ──────────────────────────────
|
||||
html_clean = re.sub(r"\s{2,}", " ", textwrap.dedent(html).strip())
|
||||
|
||||
parts: list[str] = [
|
||||
f"{prompt_template}",
|
||||
"\n\n## Extracted HTML\n"
|
||||
"==================== Beginning of Html ====================\n",
|
||||
html_clean,
|
||||
"\n==================== End of Html ====================\n",
|
||||
]
|
||||
|
||||
if target_json_example:
|
||||
parts.extend(
|
||||
[
|
||||
"\n## Example of end result\n",
|
||||
target_json_example.strip(),
|
||||
"\n",
|
||||
]
|
||||
)
|
||||
|
||||
if last_instruction:
|
||||
parts.extend(
|
||||
[
|
||||
"\n## Important\n",
|
||||
last_instruction.strip(),
|
||||
"\n",
|
||||
]
|
||||
)
|
||||
|
||||
parts.extend(
|
||||
[
|
||||
"\n## Task:\n",
|
||||
task_line,
|
||||
]
|
||||
)
|
||||
|
||||
user_message = {"role": "user", "content": "".join(parts)}
|
||||
|
||||
# slim system message, JSON_SCHEMA_BUILDER already holds heavy guidance
|
||||
for name, message in JsonElementExtractionStrategy._GENERATE_SCHEMA_UNWANTED_PROPS.items():
|
||||
if locals()[name] is not None:
|
||||
raise AttributeError(f"Setting '{name}' is deprecated. {message}")
|
||||
|
||||
# Use default or custom prompt
|
||||
prompt_template = JSON_SCHEMA_BUILDER if schema_type == "CSS" else JSON_SCHEMA_BUILDER_XPATH
|
||||
|
||||
# Build the prompt
|
||||
system_message = {
|
||||
"role": "system",
|
||||
"content": (
|
||||
"You generate reliable JSON schemas for structured extraction. "
|
||||
"Return valid JSON only."
|
||||
),
|
||||
"role": "system",
|
||||
"content": f"""You specialize in generating special JSON schemas for web scraping. This schema uses CSS or XPATH selectors to present a repetitive pattern in crawled HTML, such as a product in a product list or a search result item in a list of search results. We use this JSON schema to pass to a language model along with the HTML content to extract structured data from the HTML. The language model uses the JSON schema to extract data from the HTML and retrieve values for fields in the JSON schema, following the schema.
|
||||
|
||||
Generating this HTML manually is not feasible, so you need to generate the JSON schema using the HTML content. The HTML copied from the crawled website is provided below, which we believe contains the repetitive pattern.
|
||||
|
||||
# Schema main keys:
|
||||
- name: This is the name of the schema.
|
||||
- baseSelector: This is the CSS or XPATH selector that identifies the base element that contains all the repetitive patterns.
|
||||
- baseFields: This is a list of fields that you extract from the base element itself.
|
||||
- fields: This is a list of fields that you extract from the children of the base element. {{name, selector, type}} based on the type, you may have extra keys such as "attribute" when the type is "attribute".
|
||||
|
||||
# Extra Context:
|
||||
In this context, the following items may or may not be present:
|
||||
- Example of target JSON object: This is a sample of the final JSON object that we hope to extract from the HTML using the schema you are generating.
|
||||
- Extra Instructions: This is optional instructions to consider when generating the schema provided by the user.
|
||||
- Query or explanation of target/goal data item: This is a description of what data we are trying to extract from the HTML. This explanation means we're not sure about the rigid schema of the structures we want, so we leave it to you to use your expertise to create the best and most comprehensive structures aimed at maximizing data extraction from this page. You must ensure that you do not pick up nuances that may exist on a particular page. The focus should be on the data we are extracting, and it must be valid, safe, and robust based on the given HTML.
|
||||
|
||||
# What if there is no example of target JSON object and also no extra instructions or even no explanation of target/goal data item?
|
||||
In this scenario, use your best judgment to generate the schema. You need to examine the content of the page and understand the data it provides. If the page contains repetitive data, such as lists of items, products, jobs, places, books, or movies, focus on one single item that repeats. If the page is a detailed page about one product or item, create a schema to extract the entire structured data. At this stage, you must think and decide for yourself. Try to maximize the number of fields that you can extract from the HTML.
|
||||
|
||||
# What are the instructions and details for this schema generation?
|
||||
{prompt_template}"""
|
||||
}
|
||||
|
||||
user_message = {
|
||||
"role": "user",
|
||||
"content": f"""
|
||||
HTML to analyze:
|
||||
```html
|
||||
{html}
|
||||
```
|
||||
"""
|
||||
}
|
||||
|
||||
# ─── call LLM ─────────────────────────────────────────────
|
||||
response = perform_completion_with_backoff(
|
||||
provider=llm_config.provider,
|
||||
prompt_with_variables="\n\n".join(
|
||||
[system_message["content"], user_message["content"]]
|
||||
),
|
||||
json_response=True,
|
||||
api_token=llm_config.api_token,
|
||||
base_url=llm_config.base_url,
|
||||
extra_args=kwargs,
|
||||
)
|
||||
if query:
|
||||
user_message["content"] += f"\n\n## Query or explanation of target/goal data item:\n{query}"
|
||||
if target_json_example:
|
||||
user_message["content"] += f"\n\n## Example of target JSON object:\n```json\n{target_json_example}\n```"
|
||||
|
||||
# ─── token usage accounting ──────────────────────────────
|
||||
if token_usages is not None and hasattr(response, "usage"):
|
||||
token_usages.append(
|
||||
TokenUsage(
|
||||
completion_tokens=getattr(response.usage, "completion_tokens", 0),
|
||||
prompt_tokens=getattr(response.usage, "prompt_tokens", 0),
|
||||
total_tokens=getattr(response.usage, "total_tokens", 0),
|
||||
)
|
||||
)
|
||||
if query and not target_json_example:
|
||||
user_message["content"] += """IMPORTANT: To remind you, in this process, we are not providing a rigid example of the adjacent objects we seek. We rely on your understanding of the explanation provided in the above section. Make sure to grasp what we are looking for and, based on that, create the best schema.."""
|
||||
elif not query and target_json_example:
|
||||
user_message["content"] += """IMPORTANT: Please remember that in this process, we provided a proper example of a target JSON object. Make sure to adhere to the structure and create a schema that exactly fits this example. If you find that some elements on the page do not match completely, vote for the majority."""
|
||||
elif not query and not target_json_example:
|
||||
user_message["content"] += """IMPORTANT: Since we neither have a query nor an example, it is crucial to rely solely on the HTML content provided. Leverage your expertise to determine the schema based on the repetitive patterns observed in the content."""
|
||||
|
||||
user_message["content"] += """IMPORTANT:
|
||||
0/ Ensure your schema remains reliable by avoiding selectors that appear to generate dynamically and are not dependable. You want a reliable schema, as it consistently returns the same data even after many page reloads.
|
||||
1/ DO NOT USE use base64 kind of classes, they are temporary and not reliable.
|
||||
2/ Every selector must refer to only one unique element. You should ensure your selector points to a single element and is unique to the place that contains the information. You have to use available techniques based on CSS or XPATH requested schema to make sure your selector is unique and also not fragile, meaning if we reload the page now or in the future, the selector should remain reliable.
|
||||
3/ Do not use Regex as much as possible.
|
||||
|
||||
Analyze the HTML and generate a JSON schema that follows the specified format. Only output valid JSON schema, nothing else.
|
||||
"""
|
||||
|
||||
# ─── parse and validate JSON answer ──────────────────────
|
||||
try:
|
||||
schema = json.loads(response.choices[0].message.content)
|
||||
except Exception as exc:
|
||||
raise ValueError(f"LLM returned invalid JSON: {exc}") from exc
|
||||
|
||||
required = {"name", "baseSelector", "fields"}
|
||||
if not required.issubset(schema):
|
||||
missing = required - set(schema)
|
||||
raise ValueError(f"Generated schema missing required keys: {missing}")
|
||||
|
||||
return schema
|
||||
|
||||
|
||||
# Call LLM with backoff handling
|
||||
response = perform_completion_with_backoff(
|
||||
provider=llm_config.provider,
|
||||
prompt_with_variables="\n\n".join([system_message["content"], user_message["content"]]),
|
||||
json_response = True,
|
||||
api_token=llm_config.api_token,
|
||||
base_url=llm_config.base_url,
|
||||
extra_args=kwargs
|
||||
)
|
||||
|
||||
# Extract and return schema
|
||||
return json.loads(response.choices[0].message.content)
|
||||
|
||||
except Exception as e:
|
||||
raise Exception(f"Failed to generate schema: {str(e)}")
|
||||
|
||||
class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
|
||||
"""
|
||||
|
||||
@@ -119,6 +119,32 @@ def install_playwright():
|
||||
logger.warning(
|
||||
f"Please run '{sys.executable} -m playwright install --with-deps' manually after the installation."
|
||||
)
|
||||
|
||||
# Install Patchright browsers for undetected browser support
|
||||
logger.info("Installing Patchright browsers for undetected mode...", tag="INIT")
|
||||
try:
|
||||
subprocess.check_call(
|
||||
[
|
||||
sys.executable,
|
||||
"-m",
|
||||
"patchright",
|
||||
"install",
|
||||
"--with-deps",
|
||||
"--force",
|
||||
"chromium",
|
||||
]
|
||||
)
|
||||
logger.success(
|
||||
"Patchright installation completed successfully.", tag="COMPLETE"
|
||||
)
|
||||
except subprocess.CalledProcessError:
|
||||
logger.warning(
|
||||
f"Please run '{sys.executable} -m patchright install --with-deps' manually after the installation."
|
||||
)
|
||||
except Exception:
|
||||
logger.warning(
|
||||
f"Please run '{sys.executable} -m patchright install --with-deps' manually after the installation."
|
||||
)
|
||||
|
||||
|
||||
def run_migration():
|
||||
|
||||
@@ -3387,90 +3387,3 @@ def cosine_distance(vec1: np.ndarray, vec2: np.ndarray) -> float:
|
||||
"""Calculate cosine distance (1 - similarity) between two vectors"""
|
||||
return 1 - cosine_similarity(vec1, vec2)
|
||||
|
||||
|
||||
async def should_crawl_based_on_head(
|
||||
url: str,
|
||||
cached_headers: Dict[str, str],
|
||||
user_agent: str = "Mozilla/5.0",
|
||||
timeout: int = 5
|
||||
) -> tuple[bool, str]:
|
||||
"""
|
||||
Check if content has changed using HEAD request.
|
||||
|
||||
Args:
|
||||
url: The URL to check
|
||||
cached_headers: The cached response headers from previous crawl
|
||||
user_agent: User agent string to use for the HEAD request
|
||||
timeout: Timeout in seconds for the HEAD request
|
||||
|
||||
Returns:
|
||||
Tuple of (should_crawl: bool, reason: str)
|
||||
- should_crawl: True if content has changed and should be re-crawled, False otherwise
|
||||
- reason: Explanation of the decision
|
||||
"""
|
||||
import email.utils
|
||||
|
||||
if not cached_headers:
|
||||
return True, "No cached headers available, must crawl"
|
||||
|
||||
headers = {
|
||||
"Accept-Encoding": "identity",
|
||||
"User-Agent": user_agent,
|
||||
"Want-Content-Digest": "sha-256", # Request RFC 9530 digest
|
||||
}
|
||||
|
||||
# Add conditional headers if available in cache
|
||||
if cached_headers.get("etag"):
|
||||
headers["If-None-Match"] = cached_headers["etag"]
|
||||
if cached_headers.get("last-modified"):
|
||||
headers["If-Modified-Since"] = cached_headers["last-modified"]
|
||||
|
||||
try:
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.head(
|
||||
url,
|
||||
headers=headers,
|
||||
timeout=aiohttp.ClientTimeout(total=timeout),
|
||||
allow_redirects=True
|
||||
) as response:
|
||||
# 304 Not Modified - content hasn't changed
|
||||
if response.status == 304:
|
||||
return False, "304 Not Modified - Content unchanged"
|
||||
|
||||
# Check other headers if no 304 response
|
||||
new_headers = dict(response.headers)
|
||||
|
||||
# Check Content-Digest (most reliable)
|
||||
if new_headers.get("content-digest") and cached_headers.get("content-digest"):
|
||||
if new_headers["content-digest"] == cached_headers["content-digest"]:
|
||||
return False, "Content-Digest matches - Content unchanged"
|
||||
|
||||
# Check strong ETag
|
||||
if new_headers.get("etag") and cached_headers.get("etag"):
|
||||
# Strong ETags start with '"'
|
||||
if (new_headers["etag"].startswith('"') and
|
||||
new_headers["etag"] == cached_headers["etag"]):
|
||||
return False, "Strong ETag matches - Content unchanged"
|
||||
|
||||
# Check Last-Modified
|
||||
if new_headers.get("last-modified") and cached_headers.get("last-modified"):
|
||||
try:
|
||||
new_lm = email.utils.parsedate_to_datetime(new_headers["last-modified"])
|
||||
cached_lm = email.utils.parsedate_to_datetime(cached_headers["last-modified"])
|
||||
if new_lm <= cached_lm:
|
||||
return False, "Last-Modified not newer - Content unchanged"
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Content-Length changed is a positive signal
|
||||
if (new_headers.get("content-length") and cached_headers.get("content-length") and
|
||||
new_headers["content-length"] != cached_headers["content-length"]):
|
||||
return True, f"Content-Length changed ({cached_headers['content-length']} -> {new_headers['content-length']})"
|
||||
|
||||
# Default: assume content has changed
|
||||
return True, "No definitive cache headers matched - Assuming content changed"
|
||||
|
||||
except Exception as e:
|
||||
# On error, assume content has changed (safe default)
|
||||
return True, f"HEAD request failed: {str(e)} - Assuming content changed"
|
||||
|
||||
|
||||
@@ -10,8 +10,9 @@ Today I'm releasing Crawl4AI v0.7.0—the Adaptive Intelligence Update. This rel
|
||||
|
||||
- **Adaptive Crawling**: Your crawler now learns and adapts to website patterns
|
||||
- **Virtual Scroll Support**: Complete content extraction from infinite scroll pages
|
||||
- **Link Preview with Intelligent Scoring**: Intelligent link analysis and prioritization
|
||||
- **Link Preview with 3-Layer Scoring**: Intelligent link analysis and prioritization
|
||||
- **Async URL Seeder**: Discover thousands of URLs in seconds with intelligent filtering
|
||||
- **PDF Parsing**: Extract data from PDF documents
|
||||
- **Performance Optimizations**: Significant speed and memory improvements
|
||||
|
||||
## 🧠 Adaptive Crawling: Intelligence Through Pattern Learning
|
||||
@@ -29,41 +30,44 @@ The Adaptive Crawler maintains a persistent state for each domain, tracking:
|
||||
- Extraction confidence scores
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
|
||||
import asyncio
|
||||
from crawl4ai import AdaptiveCrawler, AdaptiveConfig, CrawlState
|
||||
|
||||
async def main():
|
||||
|
||||
# Configure adaptive crawler
|
||||
config = AdaptiveConfig(
|
||||
strategy="statistical", # or "embedding" for semantic understanding
|
||||
max_pages=10,
|
||||
confidence_threshold=0.7, # Stop at 70% confidence
|
||||
top_k_links=3, # Follow top 3 links per page
|
||||
min_gain_threshold=0.05 # Need 5% information gain to continue
|
||||
# Initialize with custom learning parameters
|
||||
config = AdaptiveConfig(
|
||||
confidence_threshold=0.7, # Min confidence to use learned patterns
|
||||
max_history=100, # Remember last 100 crawls per domain
|
||||
learning_rate=0.2, # How quickly to adapt to changes
|
||||
patterns_per_page=3, # Patterns to learn per page type
|
||||
extraction_strategy='css' # 'css' or 'xpath'
|
||||
)
|
||||
|
||||
adaptive_crawler = AdaptiveCrawler(config)
|
||||
|
||||
# First crawl - crawler learns the structure
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
"https://news.example.com/article/12345",
|
||||
config=CrawlerRunConfig(
|
||||
adaptive_config=config,
|
||||
extraction_hints={ # Optional hints to speed up learning
|
||||
"title": "article h1",
|
||||
"content": "article .body-content"
|
||||
}
|
||||
)
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(verbose=False) as crawler:
|
||||
adaptive = AdaptiveCrawler(crawler, config)
|
||||
|
||||
print("Starting adaptive crawl about Python decorators...")
|
||||
result = await adaptive.digest(
|
||||
start_url="https://docs.python.org/3/glossary.html",
|
||||
query="python decorators functions wrapping"
|
||||
)
|
||||
|
||||
print(f"\n✅ Crawling Complete!")
|
||||
print(f"• Confidence Level: {adaptive.confidence:.0%}")
|
||||
print(f"• Pages Crawled: {len(result.crawled_urls)}")
|
||||
print(f"• Knowledge Base: {len(adaptive.state.knowledge_base)} documents")
|
||||
|
||||
# Get most relevant content
|
||||
relevant = adaptive.get_relevant_content(top_k=3)
|
||||
print(f"\nMost Relevant Pages:")
|
||||
for i, page in enumerate(relevant, 1):
|
||||
print(f"{i}. {page['url']} (relevance: {page['score']:.2%})")
|
||||
# Crawler identifies and stores patterns
|
||||
if result.success:
|
||||
state = adaptive_crawler.get_state("news.example.com")
|
||||
print(f"Learned {len(state.patterns)} patterns")
|
||||
print(f"Confidence: {state.avg_confidence:.2%}")
|
||||
|
||||
asyncio.run(main())
|
||||
# Subsequent crawls - uses learned patterns
|
||||
result2 = await crawler.arun(
|
||||
"https://news.example.com/article/67890",
|
||||
config=CrawlerRunConfig(adaptive_config=config)
|
||||
)
|
||||
# Automatically extracts using learned patterns!
|
||||
```
|
||||
|
||||
**Expected Real-World Impact:**
|
||||
@@ -88,7 +92,9 @@ twitter_config = VirtualScrollConfig(
|
||||
container_selector="[data-testid='primaryColumn']",
|
||||
scroll_count=20, # Number of scrolls
|
||||
scroll_by="container_height", # Smart scrolling by container size
|
||||
wait_after_scroll=1.0 # Let content load
|
||||
wait_after_scroll=1.0, # Let content load
|
||||
capture_method="incremental", # Capture new content on each scroll
|
||||
deduplicate=True # Remove duplicate elements
|
||||
)
|
||||
|
||||
# For e-commerce product grids (Instagram style)
|
||||
@@ -96,7 +102,8 @@ grid_config = VirtualScrollConfig(
|
||||
container_selector="main .product-grid",
|
||||
scroll_count=30,
|
||||
scroll_by=800, # Fixed pixel scrolling
|
||||
wait_after_scroll=1.5 # Images need time
|
||||
wait_after_scroll=1.5, # Images need time
|
||||
stop_on_no_change=True # Smart stopping
|
||||
)
|
||||
|
||||
# For news feeds with lazy loading
|
||||
@@ -104,7 +111,9 @@ news_config = VirtualScrollConfig(
|
||||
container_selector=".article-feed",
|
||||
scroll_count=50,
|
||||
scroll_by="page_height", # Viewport-based scrolling
|
||||
wait_after_scroll=0.5 # Wait for content to load
|
||||
wait_after_scroll=0.5,
|
||||
wait_for_selector=".article-card", # Wait for specific elements
|
||||
timeout=30000 # Max 30 seconds total
|
||||
)
|
||||
|
||||
# Use it in your crawl
|
||||
@@ -148,63 +157,68 @@ async with AsyncWebCrawler() as crawler:
|
||||
|
||||
**My Solution:** I implemented a three-layer scoring system that analyzes links like a human would—considering their position, context, and relevance to your goals.
|
||||
|
||||
### Intelligent Link Analysis and Scoring
|
||||
### The Three-Layer Scoring System
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import CrawlerRunConfig, CacheMode, AsyncWebCrawler
|
||||
from crawl4ai.adaptive_crawler import LinkPreviewConfig
|
||||
from crawl4ai import LinkPreviewConfig
|
||||
|
||||
async def main():
|
||||
# Configure intelligent link analysis
|
||||
link_config = LinkPreviewConfig(
|
||||
include_internal=True,
|
||||
include_external=False,
|
||||
max_links=10,
|
||||
concurrency=5,
|
||||
query="python tutorial", # For contextual scoring
|
||||
score_threshold=0.3,
|
||||
verbose=True
|
||||
# Configure intelligent link analysis
|
||||
link_config = LinkPreviewConfig(
|
||||
# What to analyze
|
||||
include_internal=True,
|
||||
include_external=True,
|
||||
max_links=100, # Analyze top 100 links
|
||||
|
||||
# Relevance scoring
|
||||
query="machine learning tutorials", # Your interest
|
||||
score_threshold=0.3, # Minimum relevance score
|
||||
|
||||
# Performance
|
||||
concurrent_requests=10, # Parallel processing
|
||||
timeout_per_link=5000, # 5s per link
|
||||
|
||||
# Advanced scoring weights
|
||||
scoring_weights={
|
||||
"intrinsic": 0.3, # Link quality indicators
|
||||
"contextual": 0.5, # Relevance to query
|
||||
"popularity": 0.2 # Link prominence
|
||||
}
|
||||
)
|
||||
|
||||
# Use in your crawl
|
||||
result = await crawler.arun(
|
||||
"https://tech-blog.example.com",
|
||||
config=CrawlerRunConfig(
|
||||
link_preview_config=link_config,
|
||||
score_links=True
|
||||
)
|
||||
# Use in your crawl
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
"https://www.geeksforgeeks.org/",
|
||||
config=CrawlerRunConfig(
|
||||
link_preview_config=link_config,
|
||||
score_links=True, # Enable intrinsic scoring
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
# Access scored and sorted links
|
||||
if result.success and result.links:
|
||||
for link in result.links.get("internal", []):
|
||||
text = link.get('text', 'No text')[:40]
|
||||
print(
|
||||
text,
|
||||
f"{link.get('intrinsic_score', 0):.1f}/10" if link.get('intrinsic_score') is not None else "0.0/10",
|
||||
f"{link.get('contextual_score', 0):.2f}/1" if link.get('contextual_score') is not None else "0.00/1",
|
||||
f"{link.get('total_score', 0):.3f}" if link.get('total_score') is not None else "0.000"
|
||||
)
|
||||
|
||||
asyncio.run(main())
|
||||
# Access scored and sorted links
|
||||
for link in result.links["internal"][:10]: # Top 10 internal links
|
||||
print(f"Score: {link['total_score']:.3f}")
|
||||
print(f" Intrinsic: {link['intrinsic_score']:.1f}/10") # Position, attributes
|
||||
print(f" Contextual: {link['contextual_score']:.1f}/1") # Relevance to query
|
||||
print(f" URL: {link['href']}")
|
||||
print(f" Title: {link['head_data']['title']}")
|
||||
print(f" Description: {link['head_data']['meta']['description'][:100]}...")
|
||||
```
|
||||
|
||||
**Scoring Components:**
|
||||
|
||||
1. **Intrinsic Score**: Based on link quality indicators
|
||||
1. **Intrinsic Score (0-10)**: Based on link quality indicators
|
||||
- Position on page (navigation, content, footer)
|
||||
- Link attributes (rel, title, class names)
|
||||
- Anchor text quality and length
|
||||
- URL structure and depth
|
||||
|
||||
2. **Contextual Score**: Relevance to your query using BM25 algorithm
|
||||
2. **Contextual Score (0-1)**: Relevance to your query
|
||||
- Semantic similarity using embeddings
|
||||
- Keyword matching in link text and title
|
||||
- Meta description analysis
|
||||
- Content preview scoring
|
||||
|
||||
3. **Total Score**: Combined score for final ranking
|
||||
3. **Total Score**: Weighted combination for final ranking
|
||||
|
||||
**Expected Real-World Impact:**
|
||||
- **Research Efficiency**: Find relevant papers 10x faster by following only high-score links
|
||||
@@ -221,34 +235,58 @@ asyncio.run(main())
|
||||
### Technical Architecture
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncUrlSeeder, SeedingConfig
|
||||
|
||||
async def main():
|
||||
async with AsyncUrlSeeder() as seeder:
|
||||
# Discover Python tutorial URLs
|
||||
config = SeedingConfig(
|
||||
source="sitemap", # Use sitemap
|
||||
pattern="*python*", # URL pattern filter
|
||||
extract_head=True, # Get metadata
|
||||
query="python tutorial", # For relevance scoring
|
||||
scoring_method="bm25",
|
||||
score_threshold=0.2,
|
||||
max_urls=10
|
||||
)
|
||||
|
||||
print("Discovering Python async tutorial URLs...")
|
||||
urls = await seeder.urls("https://www.geeksforgeeks.org/", config)
|
||||
|
||||
print(f"\n✅ Found {len(urls)} relevant URLs:")
|
||||
for i, url_info in enumerate(urls[:5], 1):
|
||||
print(f"\n{i}. {url_info['url']}")
|
||||
if url_info.get('relevance_score'):
|
||||
print(f" Relevance: {url_info['relevance_score']:.3f}")
|
||||
if url_info.get('head_data', {}).get('title'):
|
||||
print(f" Title: {url_info['head_data']['title'][:60]}...")
|
||||
# Basic discovery - find all product pages
|
||||
seeder_config = SeedingConfig(
|
||||
# Discovery sources
|
||||
source="sitemap+cc", # Sitemap + Common Crawl
|
||||
|
||||
# Filtering
|
||||
pattern="*/product/*", # URL pattern matching
|
||||
ignore_patterns=["*/reviews/*", "*/questions/*"],
|
||||
|
||||
# Validation
|
||||
live_check=True, # Verify URLs are alive
|
||||
max_urls=5000, # Stop at 5000 URLs
|
||||
|
||||
# Performance
|
||||
concurrency=100, # Parallel requests
|
||||
hits_per_sec=10 # Rate limiting
|
||||
)
|
||||
|
||||
asyncio.run(main())
|
||||
seeder = AsyncUrlSeeder(seeder_config)
|
||||
urls = await seeder.discover("https://shop.example.com")
|
||||
|
||||
# Advanced: Relevance-based discovery
|
||||
research_config = SeedingConfig(
|
||||
source="crawl+sitemap", # Deep crawl + sitemap
|
||||
pattern="*/blog/*", # Blog posts only
|
||||
|
||||
# Content relevance
|
||||
extract_head=True, # Get meta tags
|
||||
query="quantum computing tutorials",
|
||||
scoring_method="bm25", # Or "semantic" (coming soon)
|
||||
score_threshold=0.4, # High relevance only
|
||||
|
||||
# Smart filtering
|
||||
filter_nonsense_urls=True, # Remove .xml, .txt, etc.
|
||||
min_content_length=500, # Skip thin content
|
||||
|
||||
force=True # Bypass cache
|
||||
)
|
||||
|
||||
# Discover with progress tracking
|
||||
discovered = []
|
||||
async for batch in seeder.discover_iter("https://physics-blog.com", research_config):
|
||||
discovered.extend(batch)
|
||||
print(f"Found {len(discovered)} relevant URLs so far...")
|
||||
|
||||
# Results include scores and metadata
|
||||
for url_data in discovered[:5]:
|
||||
print(f"URL: {url_data['url']}")
|
||||
print(f"Score: {url_data['score']:.3f}")
|
||||
print(f"Title: {url_data['title']}")
|
||||
```
|
||||
|
||||
**Discovery Methods:**
|
||||
@@ -271,18 +309,35 @@ This release includes significant performance improvements through optimized res
|
||||
### What We Optimized
|
||||
|
||||
```python
|
||||
# Optimized crawling with v0.7.0 improvements
|
||||
# Before v0.7.0 (slow)
|
||||
results = []
|
||||
for url in urls:
|
||||
result = await crawler.arun(
|
||||
url,
|
||||
config=CrawlerRunConfig(
|
||||
# Performance optimizations
|
||||
wait_until="domcontentloaded", # Faster than networkidle
|
||||
cache_mode=CacheMode.ENABLED # Enable caching
|
||||
)
|
||||
)
|
||||
result = await crawler.arun(url)
|
||||
results.append(result)
|
||||
|
||||
# After v0.7.0 (fast)
|
||||
# Automatic batching and connection pooling
|
||||
results = await crawler.arun_batch(
|
||||
urls,
|
||||
config=CrawlerRunConfig(
|
||||
# New performance options
|
||||
batch_size=10, # Process 10 URLs concurrently
|
||||
reuse_browser=True, # Keep browser warm
|
||||
eager_loading=False, # Load only what's needed
|
||||
streaming_extraction=True, # Stream large extractions
|
||||
|
||||
# Optimized defaults
|
||||
wait_until="domcontentloaded", # Faster than networkidle
|
||||
exclude_external_resources=True, # Skip third-party assets
|
||||
block_ads=True # Ad blocking built-in
|
||||
)
|
||||
)
|
||||
|
||||
# Memory-efficient streaming for large crawls
|
||||
async for result in crawler.arun_stream(large_url_list):
|
||||
# Process results as they complete
|
||||
await process_result(result)
|
||||
# Memory is freed after each iteration
|
||||
```
|
||||
|
||||
**Performance Gains:**
|
||||
@@ -292,6 +347,24 @@ for url in urls:
|
||||
- **Memory Usage**: 60% reduction with streaming processing
|
||||
- **Concurrent Crawls**: Handle 5x more parallel requests
|
||||
|
||||
## 📄 PDF Support
|
||||
|
||||
PDF extraction is now natively supported in Crawl4AI.
|
||||
|
||||
```python
|
||||
# Extract data from PDF documents
|
||||
result = await crawler.arun(
|
||||
"https://example.com/report.pdf",
|
||||
config=CrawlerRunConfig(
|
||||
pdf_extraction=True,
|
||||
extraction_strategy=JsonCssExtractionStrategy({
|
||||
# Works on converted PDF structure
|
||||
"title": {"selector": "h1", "type": "text"},
|
||||
"sections": {"selector": "h2", "type": "list"}
|
||||
})
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
## 🔧 Important Changes
|
||||
|
||||
|
||||
@@ -1,43 +0,0 @@
|
||||
# 🛠️ Crawl4AI v0.7.1: Minor Cleanup Update
|
||||
|
||||
*July 17, 2025 • 2 min read*
|
||||
|
||||
---
|
||||
|
||||
A small maintenance release that removes unused code and improves documentation.
|
||||
|
||||
## 🎯 What's Changed
|
||||
|
||||
- **Removed unused StealthConfig** from `crawl4ai/browser_manager.py`
|
||||
- **Updated documentation** with better examples and parameter explanations
|
||||
- **Fixed virtual scroll configuration** examples in docs
|
||||
|
||||
## 🧹 Code Cleanup
|
||||
|
||||
Removed unused `StealthConfig` import and configuration that wasn't being used anywhere in the codebase. The project uses its own custom stealth implementation through JavaScript injection instead.
|
||||
|
||||
```python
|
||||
# Removed unused code:
|
||||
from playwright_stealth import StealthConfig
|
||||
stealth_config = StealthConfig(...) # This was never used
|
||||
```
|
||||
|
||||
## 📖 Documentation Updates
|
||||
|
||||
- Fixed adaptive crawling parameter examples
|
||||
- Updated session management documentation
|
||||
- Corrected virtual scroll configuration examples
|
||||
|
||||
## 🚀 Installation
|
||||
|
||||
```bash
|
||||
pip install crawl4ai==0.7.1
|
||||
```
|
||||
|
||||
No breaking changes - upgrade directly from v0.7.0.
|
||||
|
||||
---
|
||||
|
||||
Questions? Issues?
|
||||
- GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
|
||||
- Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)
|
||||
@@ -9,7 +9,7 @@ import asyncio
|
||||
import re
|
||||
from typing import List, Dict, Set
|
||||
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
|
||||
from crawl4ai.adaptive_crawler import AdaptiveCrawlResult, Link
|
||||
from crawl4ai.adaptive_crawler import CrawlState, Link
|
||||
import math
|
||||
|
||||
|
||||
@@ -45,7 +45,7 @@ class APIDocumentationStrategy:
|
||||
r'/legal/'
|
||||
]
|
||||
|
||||
def score_link(self, link: Link, query: str, state: AdaptiveCrawlResult) -> float:
|
||||
def score_link(self, link: Link, query: str, state: CrawlState) -> float:
|
||||
"""Custom link scoring for API documentation"""
|
||||
score = 1.0
|
||||
url = link.href.lower()
|
||||
@@ -77,7 +77,7 @@ class APIDocumentationStrategy:
|
||||
|
||||
return score
|
||||
|
||||
def calculate_api_coverage(self, state: AdaptiveCrawlResult, query: str) -> Dict[str, float]:
|
||||
def calculate_api_coverage(self, state: CrawlState, query: str) -> Dict[str, float]:
|
||||
"""Calculate specialized coverage metrics for API documentation"""
|
||||
metrics = {
|
||||
'endpoint_coverage': 0.0,
|
||||
|
||||
@@ -3,8 +3,8 @@ C4A-Script API Usage Examples
|
||||
Shows how to use the new Result-based API in various scenarios
|
||||
"""
|
||||
|
||||
from c4a_compile import compile, validate, compile_file
|
||||
from c4a_result import CompilationResult, ValidationResult
|
||||
from crawl4ai.script.c4a_compile import compile, validate, compile_file
|
||||
from crawl4ai.script.c4a_result import CompilationResult, ValidationResult
|
||||
import json
|
||||
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@ C4A-Script Hello World
|
||||
A concise example showing how to use the C4A-Script compiler
|
||||
"""
|
||||
|
||||
from c4a_compile import compile
|
||||
from crawl4ai.script.c4a_compile import compile
|
||||
|
||||
# Define your C4A-Script
|
||||
script = """
|
||||
|
||||
@@ -3,7 +3,7 @@ C4A-Script Hello World - Error Example
|
||||
Shows how error handling works
|
||||
"""
|
||||
|
||||
from c4a_compile import compile
|
||||
from crawl4ai.script.c4a_compile import compile
|
||||
|
||||
# Define a script with an error (missing THEN)
|
||||
script = """
|
||||
|
||||
@@ -8,8 +8,6 @@ from crawl4ai import (
|
||||
CrawlResult
|
||||
)
|
||||
|
||||
from crawl4ai.prompts import GENERATE_SCRIPT_PROMPT
|
||||
|
||||
|
||||
async def main():
|
||||
browser_config = BrowserConfig(
|
||||
|
||||
57
docs/examples/hello_world_undetected.py
Normal file
57
docs/examples/hello_world_undetected.py
Normal file
@@ -0,0 +1,57 @@
|
||||
import asyncio
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
DefaultMarkdownGenerator,
|
||||
PruningContentFilter,
|
||||
CrawlResult,
|
||||
UndetectedAdapter
|
||||
)
|
||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||
|
||||
|
||||
async def main():
|
||||
# Create browser config
|
||||
browser_config = BrowserConfig(
|
||||
headless=False,
|
||||
verbose=True,
|
||||
)
|
||||
|
||||
# Create the undetected adapter
|
||||
undetected_adapter = UndetectedAdapter()
|
||||
|
||||
# Create the crawler strategy with the undetected adapter
|
||||
crawler_strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=undetected_adapter
|
||||
)
|
||||
|
||||
# Create the crawler with our custom strategy
|
||||
async with AsyncWebCrawler(
|
||||
crawler_strategy=crawler_strategy,
|
||||
config=browser_config
|
||||
) as crawler:
|
||||
# Configure the crawl
|
||||
crawler_config = CrawlerRunConfig(
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
content_filter=PruningContentFilter()
|
||||
),
|
||||
capture_console_messages=True, # Enable console capture to test adapter
|
||||
)
|
||||
|
||||
# Test on a site that typically detects bots
|
||||
print("Testing undetected adapter...")
|
||||
result: CrawlResult = await crawler.arun(
|
||||
url="https://www.helloworld.org",
|
||||
config=crawler_config
|
||||
)
|
||||
|
||||
print(f"Status: {result.status_code}")
|
||||
print(f"Success: {result.success}")
|
||||
print(f"Console messages captured: {len(result.console_messages or [])}")
|
||||
print(f"Markdown content (first 500 chars):\n{result.markdown.raw_markdown[:500]}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -18,7 +18,7 @@ Usage:
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
from crawl4ai import LinkPreviewConfig
|
||||
from crawl4ai.async_configs import LinkPreviewConfig
|
||||
|
||||
|
||||
async def basic_link_head_extraction():
|
||||
|
||||
59
docs/examples/simple_anti_bot_examples.py
Normal file
59
docs/examples/simple_anti_bot_examples.py
Normal file
@@ -0,0 +1,59 @@
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, UndetectedAdapter
|
||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||
|
||||
# Example 1: Stealth Mode
|
||||
async def stealth_mode_example():
|
||||
browser_config = BrowserConfig(
|
||||
enable_stealth=True,
|
||||
headless=False
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun("https://example.com")
|
||||
return result.html[:500]
|
||||
|
||||
# Example 2: Undetected Browser
|
||||
async def undetected_browser_example():
|
||||
browser_config = BrowserConfig(
|
||||
headless=False
|
||||
)
|
||||
|
||||
adapter = UndetectedAdapter()
|
||||
strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=adapter
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(
|
||||
crawler_strategy=strategy,
|
||||
config=browser_config
|
||||
) as crawler:
|
||||
result = await crawler.arun("https://example.com")
|
||||
return result.html[:500]
|
||||
|
||||
# Example 3: Both Combined
|
||||
async def combined_example():
|
||||
browser_config = BrowserConfig(
|
||||
enable_stealth=True,
|
||||
headless=False
|
||||
)
|
||||
|
||||
adapter = UndetectedAdapter()
|
||||
strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=adapter
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(
|
||||
crawler_strategy=strategy,
|
||||
config=browser_config
|
||||
) as crawler:
|
||||
result = await crawler.arun("https://example.com")
|
||||
return result.html[:500]
|
||||
|
||||
# Run examples
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(stealth_mode_example())
|
||||
asyncio.run(undetected_browser_example())
|
||||
asyncio.run(combined_example())
|
||||
@@ -1,202 +0,0 @@
|
||||
"""
|
||||
SMART Cache Mode Example for Crawl4AI
|
||||
|
||||
This example demonstrates how to use the SMART cache mode to intelligently
|
||||
validate cached content before using it. SMART mode can save 70-95% bandwidth
|
||||
on unchanged content while ensuring you always get fresh data when it changes.
|
||||
|
||||
SMART Cache Mode: Only Crawl When Changes
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.cache_context import CacheMode
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
|
||||
|
||||
async def basic_smart_cache_example():
|
||||
"""Basic example showing SMART cache mode in action"""
|
||||
print("=== Basic SMART Cache Example ===\n")
|
||||
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
url = "https://example.com"
|
||||
|
||||
# First crawl: Cache the content
|
||||
print("1. Initial crawl to cache the content:")
|
||||
config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
|
||||
result1 = await crawler.arun(url=url, config=config)
|
||||
print(f" Initial crawl: {len(result1.html)} bytes\n")
|
||||
|
||||
# Second crawl: Use SMART mode
|
||||
print("2. SMART mode crawl (should use cache for static content):")
|
||||
smart_config = CrawlerRunConfig(cache_mode=CacheMode.SMART)
|
||||
start_time = time.time()
|
||||
result2 = await crawler.arun(url=url, config=smart_config)
|
||||
elapsed = time.time() - start_time
|
||||
print(f" SMART crawl: {len(result2.html)} bytes in {elapsed:.2f}s")
|
||||
print(f" Content identical: {result1.html == result2.html}\n")
|
||||
|
||||
|
||||
async def news_site_monitoring():
|
||||
"""Monitor a news site for changes using SMART cache mode"""
|
||||
print("=== News Site Monitoring Example ===\n")
|
||||
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
config = CrawlerRunConfig(cache_mode=CacheMode.SMART)
|
||||
url = "https://news.ycombinator.com"
|
||||
|
||||
print("Monitoring Hacker News for changes...\n")
|
||||
|
||||
previous_length = 0
|
||||
for i in range(3):
|
||||
result = await crawler.arun(url=url, config=config)
|
||||
current_length = len(result.html)
|
||||
|
||||
if i == 0:
|
||||
print(f"Check {i+1}: Initial crawl - {current_length} bytes")
|
||||
else:
|
||||
if current_length != previous_length:
|
||||
print(f"Check {i+1}: Content changed! {previous_length} -> {current_length} bytes")
|
||||
else:
|
||||
print(f"Check {i+1}: Content unchanged - {current_length} bytes")
|
||||
|
||||
previous_length = current_length
|
||||
|
||||
if i < 2: # Don't wait after last check
|
||||
print(" Waiting 10 seconds before next check...")
|
||||
await asyncio.sleep(10)
|
||||
|
||||
print()
|
||||
|
||||
|
||||
async def compare_cache_modes():
|
||||
"""Compare different cache modes to understand SMART mode benefits"""
|
||||
print("=== Cache Mode Comparison ===\n")
|
||||
|
||||
async with AsyncWebCrawler(verbose=False) as crawler:
|
||||
url = "https://www.wikipedia.org"
|
||||
|
||||
# First, populate the cache
|
||||
config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
|
||||
await crawler.arun(url=url, config=config)
|
||||
print("Cache populated.\n")
|
||||
|
||||
# Test different cache modes
|
||||
modes = [
|
||||
(CacheMode.ENABLED, "ENABLED (always uses cache if available)"),
|
||||
(CacheMode.BYPASS, "BYPASS (never uses cache)"),
|
||||
(CacheMode.SMART, "SMART (validates cache before using)")
|
||||
]
|
||||
|
||||
for mode, description in modes:
|
||||
config = CrawlerRunConfig(cache_mode=mode)
|
||||
start_time = time.time()
|
||||
result = await crawler.arun(url=url, config=config)
|
||||
elapsed = time.time() - start_time
|
||||
|
||||
print(f"{description}:")
|
||||
print(f" Time: {elapsed:.2f}s")
|
||||
print(f" Size: {len(result.html)} bytes\n")
|
||||
|
||||
|
||||
async def dynamic_content_example():
|
||||
"""Show how SMART mode handles dynamic content"""
|
||||
print("=== Dynamic Content Example ===\n")
|
||||
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
# URL that returns different content each time
|
||||
dynamic_url = "https://httpbin.org/uuid"
|
||||
|
||||
print("Testing with dynamic content (changes every request):\n")
|
||||
|
||||
# First crawl
|
||||
config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
|
||||
result1 = await crawler.arun(url=dynamic_url, config=config)
|
||||
|
||||
# Extract UUID from the response
|
||||
import re
|
||||
uuid1 = re.search(r'"uuid":\s*"([^"]+)"', result1.html)
|
||||
if uuid1:
|
||||
print(f"1. First crawl UUID: {uuid1.group(1)}")
|
||||
|
||||
# SMART mode crawl - should detect change and re-crawl
|
||||
smart_config = CrawlerRunConfig(cache_mode=CacheMode.SMART)
|
||||
result2 = await crawler.arun(url=dynamic_url, config=smart_config)
|
||||
|
||||
uuid2 = re.search(r'"uuid":\s*"([^"]+)"', result2.html)
|
||||
if uuid2:
|
||||
print(f"2. SMART crawl UUID: {uuid2.group(1)}")
|
||||
print(f" Different UUIDs: {uuid1.group(1) != uuid2.group(1)} (should be True)")
|
||||
|
||||
|
||||
async def bandwidth_savings_demo():
|
||||
"""Demonstrate bandwidth savings with SMART mode"""
|
||||
print("=== Bandwidth Savings Demo ===\n")
|
||||
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
# List of URLs to crawl
|
||||
urls = [
|
||||
"https://example.com",
|
||||
"https://www.python.org",
|
||||
"https://docs.python.org/3/",
|
||||
]
|
||||
|
||||
print("Crawling multiple URLs twice to show bandwidth savings:\n")
|
||||
|
||||
# First pass: Cache all URLs
|
||||
print("First pass - Caching all URLs:")
|
||||
total_bytes_pass1 = 0
|
||||
config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
|
||||
|
||||
for url in urls:
|
||||
result = await crawler.arun(url=url, config=config)
|
||||
total_bytes_pass1 += len(result.html)
|
||||
print(f" {url}: {len(result.html)} bytes")
|
||||
|
||||
print(f"\nTotal downloaded in first pass: {total_bytes_pass1} bytes")
|
||||
|
||||
# Second pass: Use SMART mode
|
||||
print("\nSecond pass - Using SMART mode:")
|
||||
total_bytes_pass2 = 0
|
||||
smart_config = CrawlerRunConfig(cache_mode=CacheMode.SMART)
|
||||
|
||||
for url in urls:
|
||||
result = await crawler.arun(url=url, config=smart_config)
|
||||
# In SMART mode, unchanged content uses cache (minimal bandwidth)
|
||||
print(f" {url}: Using {'cache' if result else 'fresh crawl'}")
|
||||
|
||||
print(f"\nBandwidth saved: ~{total_bytes_pass1} bytes (only HEAD requests sent)")
|
||||
|
||||
|
||||
async def main():
|
||||
"""Run all examples"""
|
||||
examples = [
|
||||
basic_smart_cache_example,
|
||||
news_site_monitoring,
|
||||
compare_cache_modes,
|
||||
dynamic_content_example,
|
||||
bandwidth_savings_demo
|
||||
]
|
||||
|
||||
for example in examples:
|
||||
await example()
|
||||
print("\n" + "="*50 + "\n")
|
||||
await asyncio.sleep(2) # Brief pause between examples
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
print("""
|
||||
Crawl4AI SMART Cache Mode Examples
|
||||
==================================
|
||||
|
||||
These examples demonstrate the SMART cache mode that intelligently
|
||||
validates cached content using HEAD requests before deciding whether
|
||||
to use cache or perform a fresh crawl.
|
||||
|
||||
""")
|
||||
asyncio.run(main())
|
||||
522
docs/examples/stealth_mode_example.py
Normal file
522
docs/examples/stealth_mode_example.py
Normal file
@@ -0,0 +1,522 @@
|
||||
"""
|
||||
Stealth Mode Example with Crawl4AI
|
||||
|
||||
This example demonstrates how to use the stealth mode feature to bypass basic bot detection.
|
||||
The stealth mode uses playwright-stealth to modify browser fingerprints and behaviors
|
||||
that are commonly used to detect automated browsers.
|
||||
|
||||
Key features demonstrated:
|
||||
1. Comparing crawling with and without stealth mode
|
||||
2. Testing against bot detection sites
|
||||
3. Accessing sites that block automated browsers
|
||||
4. Best practices for stealth crawling
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from typing import Dict, Any
|
||||
from colorama import Fore, Style, init
|
||||
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
|
||||
# Initialize colorama for colored output
|
||||
init()
|
||||
|
||||
# Create a logger for better output
|
||||
logger = AsyncLogger(verbose=True)
|
||||
|
||||
|
||||
async def test_bot_detection(use_stealth: bool = False) -> Dict[str, Any]:
|
||||
"""Test against a bot detection service"""
|
||||
|
||||
logger.info(
|
||||
f"Testing bot detection with stealth={'ON' if use_stealth else 'OFF'}",
|
||||
tag="STEALTH"
|
||||
)
|
||||
|
||||
# Configure browser with or without stealth
|
||||
browser_config = BrowserConfig(
|
||||
headless=False, # Use False to see the browser in action
|
||||
enable_stealth=use_stealth,
|
||||
viewport_width=1280,
|
||||
viewport_height=800
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
# JavaScript to extract bot detection results
|
||||
detection_script = """
|
||||
// Comprehensive bot detection checks
|
||||
(() => {
|
||||
const detectionResults = {
|
||||
// Basic WebDriver detection
|
||||
webdriver: navigator.webdriver,
|
||||
|
||||
// Chrome specific
|
||||
chrome: !!window.chrome,
|
||||
chromeRuntime: !!window.chrome?.runtime,
|
||||
|
||||
// Automation indicators
|
||||
automationControlled: navigator.webdriver,
|
||||
|
||||
// Permissions API
|
||||
permissionsPresent: !!navigator.permissions?.query,
|
||||
|
||||
// Plugins
|
||||
pluginsLength: navigator.plugins.length,
|
||||
pluginsArray: Array.from(navigator.plugins).map(p => p.name),
|
||||
|
||||
// Languages
|
||||
languages: navigator.languages,
|
||||
language: navigator.language,
|
||||
|
||||
// User agent
|
||||
userAgent: navigator.userAgent,
|
||||
|
||||
// Screen and window properties
|
||||
screen: {
|
||||
width: screen.width,
|
||||
height: screen.height,
|
||||
availWidth: screen.availWidth,
|
||||
availHeight: screen.availHeight,
|
||||
colorDepth: screen.colorDepth,
|
||||
pixelDepth: screen.pixelDepth
|
||||
},
|
||||
|
||||
// WebGL vendor
|
||||
webglVendor: (() => {
|
||||
try {
|
||||
const canvas = document.createElement('canvas');
|
||||
const gl = canvas.getContext('webgl') || canvas.getContext('experimental-webgl');
|
||||
const ext = gl.getExtension('WEBGL_debug_renderer_info');
|
||||
return gl.getParameter(ext.UNMASKED_VENDOR_WEBGL);
|
||||
} catch (e) {
|
||||
return 'Error';
|
||||
}
|
||||
})(),
|
||||
|
||||
// Platform
|
||||
platform: navigator.platform,
|
||||
|
||||
// Hardware concurrency
|
||||
hardwareConcurrency: navigator.hardwareConcurrency,
|
||||
|
||||
// Device memory
|
||||
deviceMemory: navigator.deviceMemory,
|
||||
|
||||
// Connection
|
||||
connection: navigator.connection?.effectiveType
|
||||
};
|
||||
|
||||
// Log results for console capture
|
||||
console.log('DETECTION_RESULTS:', JSON.stringify(detectionResults, null, 2));
|
||||
|
||||
// Return results
|
||||
return detectionResults;
|
||||
})();
|
||||
"""
|
||||
|
||||
# Crawl bot detection test page
|
||||
config = CrawlerRunConfig(
|
||||
js_code=detection_script,
|
||||
capture_console_messages=True,
|
||||
wait_until="networkidle",
|
||||
delay_before_return_html=2.0 # Give time for all checks to complete
|
||||
)
|
||||
|
||||
result = await crawler.arun(
|
||||
url="https://bot.sannysoft.com",
|
||||
config=config
|
||||
)
|
||||
|
||||
if result.success:
|
||||
# Extract detection results from console
|
||||
detection_data = None
|
||||
for msg in result.console_messages or []:
|
||||
if "DETECTION_RESULTS:" in msg.get("text", ""):
|
||||
try:
|
||||
json_str = msg["text"].replace("DETECTION_RESULTS:", "").strip()
|
||||
detection_data = json.loads(json_str)
|
||||
except:
|
||||
pass
|
||||
|
||||
# Also try to get from JavaScript execution result
|
||||
if not detection_data and result.js_execution_result:
|
||||
detection_data = result.js_execution_result
|
||||
|
||||
return {
|
||||
"success": True,
|
||||
"url": result.url,
|
||||
"detection_data": detection_data,
|
||||
"page_title": result.metadata.get("title", ""),
|
||||
"stealth_enabled": use_stealth
|
||||
}
|
||||
else:
|
||||
return {
|
||||
"success": False,
|
||||
"error": result.error_message,
|
||||
"stealth_enabled": use_stealth
|
||||
}
|
||||
|
||||
|
||||
async def test_cloudflare_site(use_stealth: bool = False) -> Dict[str, Any]:
|
||||
"""Test accessing a Cloudflare-protected site"""
|
||||
|
||||
logger.info(
|
||||
f"Testing Cloudflare site with stealth={'ON' if use_stealth else 'OFF'}",
|
||||
tag="STEALTH"
|
||||
)
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=True, # Cloudflare detection works better in headless mode with stealth
|
||||
enable_stealth=use_stealth,
|
||||
viewport_width=1920,
|
||||
viewport_height=1080
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
config = CrawlerRunConfig(
|
||||
wait_until="networkidle",
|
||||
page_timeout=30000, # 30 seconds
|
||||
delay_before_return_html=3.0
|
||||
)
|
||||
|
||||
# Test on a site that often shows Cloudflare challenges
|
||||
result = await crawler.arun(
|
||||
url="https://nowsecure.nl",
|
||||
config=config
|
||||
)
|
||||
|
||||
# Check if we hit Cloudflare challenge
|
||||
cloudflare_detected = False
|
||||
if result.html:
|
||||
cloudflare_indicators = [
|
||||
"Checking your browser",
|
||||
"Just a moment",
|
||||
"cf-browser-verification",
|
||||
"cf-challenge",
|
||||
"ray ID"
|
||||
]
|
||||
cloudflare_detected = any(indicator in result.html for indicator in cloudflare_indicators)
|
||||
|
||||
return {
|
||||
"success": result.success,
|
||||
"url": result.url,
|
||||
"cloudflare_challenge": cloudflare_detected,
|
||||
"status_code": result.status_code,
|
||||
"page_title": result.metadata.get("title", "") if result.metadata else "",
|
||||
"stealth_enabled": use_stealth,
|
||||
"html_snippet": result.html[:500] if result.html else ""
|
||||
}
|
||||
|
||||
|
||||
async def test_anti_bot_site(use_stealth: bool = False) -> Dict[str, Any]:
|
||||
"""Test against sites with anti-bot measures"""
|
||||
|
||||
logger.info(
|
||||
f"Testing anti-bot site with stealth={'ON' if use_stealth else 'OFF'}",
|
||||
tag="STEALTH"
|
||||
)
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=False,
|
||||
enable_stealth=use_stealth,
|
||||
# Additional browser arguments that help with stealth
|
||||
extra_args=[
|
||||
"--disable-blink-features=AutomationControlled",
|
||||
"--disable-features=site-per-process"
|
||||
] if not use_stealth else [] # These are automatically applied with stealth
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
# Some sites check for specific behaviors
|
||||
behavior_script = """
|
||||
(async () => {
|
||||
// Simulate human-like behavior
|
||||
const sleep = ms => new Promise(resolve => setTimeout(resolve, ms));
|
||||
|
||||
// Random mouse movement
|
||||
const moveX = Math.random() * 100;
|
||||
const moveY = Math.random() * 100;
|
||||
|
||||
// Simulate reading time
|
||||
await sleep(1000 + Math.random() * 2000);
|
||||
|
||||
// Scroll slightly
|
||||
window.scrollBy(0, 100 + Math.random() * 200);
|
||||
|
||||
console.log('Human behavior simulation complete');
|
||||
return true;
|
||||
})()
|
||||
"""
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
js_code=behavior_script,
|
||||
wait_until="networkidle",
|
||||
delay_before_return_html=5.0, # Longer delay to appear more human
|
||||
capture_console_messages=True
|
||||
)
|
||||
|
||||
# Test on a site that implements anti-bot measures
|
||||
result = await crawler.arun(
|
||||
url="https://www.g2.com/",
|
||||
config=config
|
||||
)
|
||||
|
||||
# Check for common anti-bot blocks
|
||||
blocked_indicators = [
|
||||
"Access Denied",
|
||||
"403 Forbidden",
|
||||
"Security Check",
|
||||
"Verify you are human",
|
||||
"captcha",
|
||||
"challenge"
|
||||
]
|
||||
|
||||
blocked = False
|
||||
if result.html:
|
||||
blocked = any(indicator.lower() in result.html.lower() for indicator in blocked_indicators)
|
||||
|
||||
return {
|
||||
"success": result.success and not blocked,
|
||||
"url": result.url,
|
||||
"blocked": blocked,
|
||||
"status_code": result.status_code,
|
||||
"page_title": result.metadata.get("title", "") if result.metadata else "",
|
||||
"stealth_enabled": use_stealth
|
||||
}
|
||||
|
||||
|
||||
async def compare_results():
|
||||
"""Run all tests with and without stealth mode and compare results"""
|
||||
|
||||
print(f"\n{Fore.CYAN}{'='*60}{Style.RESET_ALL}")
|
||||
print(f"{Fore.CYAN}Crawl4AI Stealth Mode Comparison{Style.RESET_ALL}")
|
||||
print(f"{Fore.CYAN}{'='*60}{Style.RESET_ALL}\n")
|
||||
|
||||
# Test 1: Bot Detection
|
||||
print(f"{Fore.YELLOW}1. Bot Detection Test (bot.sannysoft.com){Style.RESET_ALL}")
|
||||
print("-" * 40)
|
||||
|
||||
# Without stealth
|
||||
regular_detection = await test_bot_detection(use_stealth=False)
|
||||
if regular_detection["success"] and regular_detection["detection_data"]:
|
||||
print(f"{Fore.RED}Without Stealth:{Style.RESET_ALL}")
|
||||
data = regular_detection["detection_data"]
|
||||
print(f" • WebDriver detected: {data.get('webdriver', 'Unknown')}")
|
||||
print(f" • Chrome: {data.get('chrome', 'Unknown')}")
|
||||
print(f" • Languages: {data.get('languages', 'Unknown')}")
|
||||
print(f" • Plugins: {data.get('pluginsLength', 'Unknown')}")
|
||||
print(f" • User Agent: {data.get('userAgent', 'Unknown')[:60]}...")
|
||||
|
||||
# With stealth
|
||||
stealth_detection = await test_bot_detection(use_stealth=True)
|
||||
if stealth_detection["success"] and stealth_detection["detection_data"]:
|
||||
print(f"\n{Fore.GREEN}With Stealth:{Style.RESET_ALL}")
|
||||
data = stealth_detection["detection_data"]
|
||||
print(f" • WebDriver detected: {data.get('webdriver', 'Unknown')}")
|
||||
print(f" • Chrome: {data.get('chrome', 'Unknown')}")
|
||||
print(f" • Languages: {data.get('languages', 'Unknown')}")
|
||||
print(f" • Plugins: {data.get('pluginsLength', 'Unknown')}")
|
||||
print(f" • User Agent: {data.get('userAgent', 'Unknown')[:60]}...")
|
||||
|
||||
# Test 2: Cloudflare Site
|
||||
print(f"\n\n{Fore.YELLOW}2. Cloudflare Protected Site Test{Style.RESET_ALL}")
|
||||
print("-" * 40)
|
||||
|
||||
# Without stealth
|
||||
regular_cf = await test_cloudflare_site(use_stealth=False)
|
||||
print(f"{Fore.RED}Without Stealth:{Style.RESET_ALL}")
|
||||
print(f" • Success: {regular_cf['success']}")
|
||||
print(f" • Cloudflare Challenge: {regular_cf['cloudflare_challenge']}")
|
||||
print(f" • Status Code: {regular_cf['status_code']}")
|
||||
print(f" • Page Title: {regular_cf['page_title']}")
|
||||
|
||||
# With stealth
|
||||
stealth_cf = await test_cloudflare_site(use_stealth=True)
|
||||
print(f"\n{Fore.GREEN}With Stealth:{Style.RESET_ALL}")
|
||||
print(f" • Success: {stealth_cf['success']}")
|
||||
print(f" • Cloudflare Challenge: {stealth_cf['cloudflare_challenge']}")
|
||||
print(f" • Status Code: {stealth_cf['status_code']}")
|
||||
print(f" • Page Title: {stealth_cf['page_title']}")
|
||||
|
||||
# Test 3: Anti-bot Site
|
||||
print(f"\n\n{Fore.YELLOW}3. Anti-Bot Site Test{Style.RESET_ALL}")
|
||||
print("-" * 40)
|
||||
|
||||
# Without stealth
|
||||
regular_antibot = await test_anti_bot_site(use_stealth=False)
|
||||
print(f"{Fore.RED}Without Stealth:{Style.RESET_ALL}")
|
||||
print(f" • Success: {regular_antibot['success']}")
|
||||
print(f" • Blocked: {regular_antibot['blocked']}")
|
||||
print(f" • Status Code: {regular_antibot['status_code']}")
|
||||
print(f" • Page Title: {regular_antibot['page_title']}")
|
||||
|
||||
# With stealth
|
||||
stealth_antibot = await test_anti_bot_site(use_stealth=True)
|
||||
print(f"\n{Fore.GREEN}With Stealth:{Style.RESET_ALL}")
|
||||
print(f" • Success: {stealth_antibot['success']}")
|
||||
print(f" • Blocked: {stealth_antibot['blocked']}")
|
||||
print(f" • Status Code: {stealth_antibot['status_code']}")
|
||||
print(f" • Page Title: {stealth_antibot['page_title']}")
|
||||
|
||||
# Summary
|
||||
print(f"\n{Fore.CYAN}{'='*60}{Style.RESET_ALL}")
|
||||
print(f"{Fore.CYAN}Summary:{Style.RESET_ALL}")
|
||||
print(f"{Fore.CYAN}{'='*60}{Style.RESET_ALL}")
|
||||
print(f"\nStealth mode helps bypass basic bot detection by:")
|
||||
print(f" • Hiding webdriver property")
|
||||
print(f" • Modifying browser fingerprints")
|
||||
print(f" • Adjusting navigator properties")
|
||||
print(f" • Emulating real browser plugin behavior")
|
||||
print(f"\n{Fore.YELLOW}Note:{Style.RESET_ALL} Stealth mode is not a silver bullet.")
|
||||
print(f"Advanced anti-bot systems may still detect automation.")
|
||||
print(f"Always respect robots.txt and website terms of service.")
|
||||
|
||||
|
||||
async def stealth_best_practices():
|
||||
"""Demonstrate best practices for using stealth mode"""
|
||||
|
||||
print(f"\n\n{Fore.CYAN}{'='*60}{Style.RESET_ALL}")
|
||||
print(f"{Fore.CYAN}Stealth Mode Best Practices{Style.RESET_ALL}")
|
||||
print(f"{Fore.CYAN}{'='*60}{Style.RESET_ALL}\n")
|
||||
|
||||
# Best Practice 1: Combine with realistic behavior
|
||||
print(f"{Fore.YELLOW}1. Combine with Realistic Behavior:{Style.RESET_ALL}")
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=False,
|
||||
enable_stealth=True,
|
||||
viewport_width=1920,
|
||||
viewport_height=1080
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
# Simulate human-like behavior
|
||||
human_behavior_script = """
|
||||
(async () => {
|
||||
// Wait random time between actions
|
||||
const randomWait = () => Math.random() * 2000 + 1000;
|
||||
|
||||
// Simulate reading
|
||||
await new Promise(resolve => setTimeout(resolve, randomWait()));
|
||||
|
||||
// Smooth scroll
|
||||
const smoothScroll = async () => {
|
||||
const totalHeight = document.body.scrollHeight;
|
||||
const viewHeight = window.innerHeight;
|
||||
let currentPosition = 0;
|
||||
|
||||
while (currentPosition < totalHeight - viewHeight) {
|
||||
const scrollAmount = Math.random() * 300 + 100;
|
||||
window.scrollBy({
|
||||
top: scrollAmount,
|
||||
behavior: 'smooth'
|
||||
});
|
||||
currentPosition += scrollAmount;
|
||||
await new Promise(resolve => setTimeout(resolve, randomWait()));
|
||||
}
|
||||
};
|
||||
|
||||
await smoothScroll();
|
||||
console.log('Human-like behavior simulation completed');
|
||||
return true;
|
||||
})()
|
||||
"""
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
js_code=human_behavior_script,
|
||||
wait_until="networkidle",
|
||||
delay_before_return_html=3.0,
|
||||
capture_console_messages=True
|
||||
)
|
||||
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
config=config
|
||||
)
|
||||
|
||||
print(f" ✓ Simulated human-like scrolling and reading patterns")
|
||||
print(f" ✓ Added random delays between actions")
|
||||
print(f" ✓ Result: {result.success}")
|
||||
|
||||
# Best Practice 2: Use appropriate viewport and user agent
|
||||
print(f"\n{Fore.YELLOW}2. Use Realistic Viewport and User Agent:{Style.RESET_ALL}")
|
||||
|
||||
# Get a realistic user agent
|
||||
from crawl4ai.user_agent_generator import UserAgentGenerator
|
||||
ua_generator = UserAgentGenerator()
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=True,
|
||||
enable_stealth=True,
|
||||
viewport_width=1920,
|
||||
viewport_height=1080,
|
||||
user_agent=ua_generator.generate(device_type="desktop", browser_type="chrome")
|
||||
)
|
||||
|
||||
print(f" ✓ Using realistic viewport: 1920x1080")
|
||||
print(f" ✓ Using current Chrome user agent")
|
||||
print(f" ✓ Stealth mode will ensure consistency")
|
||||
|
||||
# Best Practice 3: Manage request rate
|
||||
print(f"\n{Fore.YELLOW}3. Manage Request Rate:{Style.RESET_ALL}")
|
||||
print(f" ✓ Add delays between requests")
|
||||
print(f" ✓ Randomize timing patterns")
|
||||
print(f" ✓ Respect robots.txt")
|
||||
|
||||
# Best Practice 4: Session management
|
||||
print(f"\n{Fore.YELLOW}4. Use Session Management:{Style.RESET_ALL}")
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=False,
|
||||
enable_stealth=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
# Create a session for multiple requests
|
||||
session_id = "stealth_session_1"
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
session_id=session_id,
|
||||
wait_until="domcontentloaded"
|
||||
)
|
||||
|
||||
# First request
|
||||
result1 = await crawler.arun(
|
||||
url="https://example.com",
|
||||
config=config
|
||||
)
|
||||
|
||||
# Subsequent request reuses the same browser context
|
||||
result2 = await crawler.arun(
|
||||
url="https://example.com/about",
|
||||
config=config
|
||||
)
|
||||
|
||||
print(f" ✓ Reused browser session for multiple requests")
|
||||
print(f" ✓ Maintains cookies and state between requests")
|
||||
print(f" ✓ More efficient and realistic browsing pattern")
|
||||
|
||||
print(f"\n{Fore.CYAN}{'='*60}{Style.RESET_ALL}")
|
||||
|
||||
|
||||
async def main():
|
||||
"""Run all examples"""
|
||||
|
||||
# Run comparison tests
|
||||
await compare_results()
|
||||
|
||||
# Show best practices
|
||||
await stealth_best_practices()
|
||||
|
||||
print(f"\n{Fore.GREEN}Examples completed!{Style.RESET_ALL}")
|
||||
print(f"\n{Fore.YELLOW}Remember:{Style.RESET_ALL}")
|
||||
print(f"• Stealth mode helps with basic bot detection")
|
||||
print(f"• Always respect website terms of service")
|
||||
print(f"• Consider rate limiting and ethical scraping practices")
|
||||
print(f"• For advanced protection, consider additional measures")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
215
docs/examples/stealth_mode_quick_start.py
Normal file
215
docs/examples/stealth_mode_quick_start.py
Normal file
@@ -0,0 +1,215 @@
|
||||
"""
|
||||
Quick Start: Using Stealth Mode in Crawl4AI
|
||||
|
||||
This example shows practical use cases for the stealth mode feature.
|
||||
Stealth mode helps bypass basic bot detection mechanisms.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
|
||||
|
||||
async def example_1_basic_stealth():
|
||||
"""Example 1: Basic stealth mode usage"""
|
||||
print("\n=== Example 1: Basic Stealth Mode ===")
|
||||
|
||||
# Enable stealth mode in browser config
|
||||
browser_config = BrowserConfig(
|
||||
enable_stealth=True, # This is the key parameter
|
||||
headless=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(url="https://example.com")
|
||||
print(f"✓ Crawled {result.url} successfully")
|
||||
print(f"✓ Title: {result.metadata.get('title', 'N/A')}")
|
||||
|
||||
|
||||
async def example_2_stealth_with_screenshot():
|
||||
"""Example 2: Stealth mode with screenshot to show detection results"""
|
||||
print("\n=== Example 2: Stealth Mode Visual Verification ===")
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
enable_stealth=True,
|
||||
headless=False # Set to False to see the browser
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
config = CrawlerRunConfig(
|
||||
screenshot=True,
|
||||
wait_until="networkidle"
|
||||
)
|
||||
|
||||
result = await crawler.arun(
|
||||
url="https://bot.sannysoft.com",
|
||||
config=config
|
||||
)
|
||||
|
||||
if result.success:
|
||||
print(f"✓ Successfully crawled bot detection site")
|
||||
print(f"✓ With stealth enabled, many detection tests should show as passed")
|
||||
|
||||
if result.screenshot:
|
||||
# Save screenshot for verification
|
||||
import base64
|
||||
with open("stealth_detection_results.png", "wb") as f:
|
||||
f.write(base64.b64decode(result.screenshot))
|
||||
print(f"✓ Screenshot saved as 'stealth_detection_results.png'")
|
||||
print(f" Check the screenshot to see detection results!")
|
||||
|
||||
|
||||
async def example_3_stealth_for_protected_sites():
|
||||
"""Example 3: Using stealth for sites with bot protection"""
|
||||
print("\n=== Example 3: Stealth for Protected Sites ===")
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
enable_stealth=True,
|
||||
headless=True,
|
||||
viewport_width=1920,
|
||||
viewport_height=1080
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
# Add human-like behavior
|
||||
config = CrawlerRunConfig(
|
||||
wait_until="networkidle",
|
||||
delay_before_return_html=2.0, # Wait 2 seconds
|
||||
js_code="""
|
||||
// Simulate human-like scrolling
|
||||
window.scrollTo({
|
||||
top: document.body.scrollHeight / 2,
|
||||
behavior: 'smooth'
|
||||
});
|
||||
"""
|
||||
)
|
||||
|
||||
# Try accessing a site that might have bot protection
|
||||
result = await crawler.arun(
|
||||
url="https://www.g2.com/products/slack/reviews",
|
||||
config=config
|
||||
)
|
||||
|
||||
if result.success:
|
||||
print(f"✓ Successfully accessed protected site")
|
||||
print(f"✓ Retrieved {len(result.html)} characters of HTML")
|
||||
else:
|
||||
print(f"✗ Failed to access site: {result.error_message}")
|
||||
|
||||
|
||||
async def example_4_stealth_with_sessions():
|
||||
"""Example 4: Stealth mode with session management"""
|
||||
print("\n=== Example 4: Stealth + Session Management ===")
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
enable_stealth=True,
|
||||
headless=False
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
session_id = "my_stealth_session"
|
||||
|
||||
# First request - establish session
|
||||
config = CrawlerRunConfig(
|
||||
session_id=session_id,
|
||||
wait_until="domcontentloaded"
|
||||
)
|
||||
|
||||
result1 = await crawler.arun(
|
||||
url="https://news.ycombinator.com",
|
||||
config=config
|
||||
)
|
||||
print(f"✓ First request completed: {result1.url}")
|
||||
|
||||
# Second request - reuse session
|
||||
await asyncio.sleep(2) # Brief delay between requests
|
||||
|
||||
result2 = await crawler.arun(
|
||||
url="https://news.ycombinator.com/best",
|
||||
config=config
|
||||
)
|
||||
print(f"✓ Second request completed: {result2.url}")
|
||||
print(f"✓ Session reused, maintaining cookies and state")
|
||||
|
||||
|
||||
async def example_5_stealth_comparison():
|
||||
"""Example 5: Compare results with and without stealth using screenshots"""
|
||||
print("\n=== Example 5: Stealth Mode Comparison ===")
|
||||
|
||||
test_url = "https://bot.sannysoft.com"
|
||||
|
||||
# First test WITHOUT stealth
|
||||
print("\nWithout stealth:")
|
||||
regular_config = BrowserConfig(
|
||||
enable_stealth=False,
|
||||
headless=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=regular_config) as crawler:
|
||||
config = CrawlerRunConfig(
|
||||
screenshot=True,
|
||||
wait_until="networkidle"
|
||||
)
|
||||
result = await crawler.arun(url=test_url, config=config)
|
||||
|
||||
if result.success and result.screenshot:
|
||||
import base64
|
||||
with open("comparison_without_stealth.png", "wb") as f:
|
||||
f.write(base64.b64decode(result.screenshot))
|
||||
print(f" ✓ Screenshot saved: comparison_without_stealth.png")
|
||||
print(f" Many tests will show as FAILED (red)")
|
||||
|
||||
# Then test WITH stealth
|
||||
print("\nWith stealth:")
|
||||
stealth_config = BrowserConfig(
|
||||
enable_stealth=True,
|
||||
headless=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=stealth_config) as crawler:
|
||||
config = CrawlerRunConfig(
|
||||
screenshot=True,
|
||||
wait_until="networkidle"
|
||||
)
|
||||
result = await crawler.arun(url=test_url, config=config)
|
||||
|
||||
if result.success and result.screenshot:
|
||||
import base64
|
||||
with open("comparison_with_stealth.png", "wb") as f:
|
||||
f.write(base64.b64decode(result.screenshot))
|
||||
print(f" ✓ Screenshot saved: comparison_with_stealth.png")
|
||||
print(f" More tests should show as PASSED (green)")
|
||||
|
||||
print("\nCompare the two screenshots to see the difference!")
|
||||
|
||||
|
||||
async def main():
|
||||
"""Run all examples"""
|
||||
print("Crawl4AI Stealth Mode Examples")
|
||||
print("==============================")
|
||||
|
||||
# Run basic example
|
||||
await example_1_basic_stealth()
|
||||
|
||||
# Run screenshot verification example
|
||||
await example_2_stealth_with_screenshot()
|
||||
|
||||
# Run protected site example
|
||||
await example_3_stealth_for_protected_sites()
|
||||
|
||||
# Run session example
|
||||
await example_4_stealth_with_sessions()
|
||||
|
||||
# Run comparison example
|
||||
await example_5_stealth_comparison()
|
||||
|
||||
print("\n" + "="*50)
|
||||
print("Tips for using stealth mode effectively:")
|
||||
print("- Use realistic viewport sizes (1920x1080, 1366x768)")
|
||||
print("- Add delays between requests to appear more human")
|
||||
print("- Combine with session management for better results")
|
||||
print("- Remember: stealth mode is for legitimate scraping only")
|
||||
print("="*50)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
62
docs/examples/stealth_test_simple.py
Normal file
62
docs/examples/stealth_test_simple.py
Normal file
@@ -0,0 +1,62 @@
|
||||
"""
|
||||
Simple test to verify stealth mode is working
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
|
||||
|
||||
async def test_stealth():
|
||||
"""Test stealth mode effectiveness"""
|
||||
|
||||
# Test WITHOUT stealth
|
||||
print("=== WITHOUT Stealth ===")
|
||||
config1 = BrowserConfig(
|
||||
headless=False,
|
||||
enable_stealth=False
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=config1) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://bot.sannysoft.com",
|
||||
config=CrawlerRunConfig(
|
||||
wait_until="networkidle",
|
||||
screenshot=True
|
||||
)
|
||||
)
|
||||
print(f"Success: {result.success}")
|
||||
# Take screenshot
|
||||
if result.screenshot:
|
||||
with open("without_stealth.png", "wb") as f:
|
||||
import base64
|
||||
f.write(base64.b64decode(result.screenshot))
|
||||
print("Screenshot saved: without_stealth.png")
|
||||
|
||||
# Test WITH stealth
|
||||
print("\n=== WITH Stealth ===")
|
||||
config2 = BrowserConfig(
|
||||
headless=False,
|
||||
enable_stealth=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=config2) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://bot.sannysoft.com",
|
||||
config=CrawlerRunConfig(
|
||||
wait_until="networkidle",
|
||||
screenshot=True
|
||||
)
|
||||
)
|
||||
print(f"Success: {result.success}")
|
||||
# Take screenshot
|
||||
if result.screenshot:
|
||||
with open("with_stealth.png", "wb") as f:
|
||||
import base64
|
||||
f.write(base64.b64decode(result.screenshot))
|
||||
print("Screenshot saved: with_stealth.png")
|
||||
|
||||
print("\nCheck the screenshots to see the difference in bot detection results!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(test_stealth())
|
||||
74
docs/examples/undetectability/undetected_basic_test.py
Normal file
74
docs/examples/undetectability/undetected_basic_test.py
Normal file
@@ -0,0 +1,74 @@
|
||||
"""
|
||||
Basic Undetected Browser Test
|
||||
Simple example to test if undetected mode works
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig
|
||||
|
||||
async def test_regular_mode():
|
||||
"""Test with regular browser"""
|
||||
print("Testing Regular Browser Mode...")
|
||||
browser_config = BrowserConfig(
|
||||
headless=False,
|
||||
verbose=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(url="https://www.example.com")
|
||||
print(f"Regular Mode - Success: {result.success}")
|
||||
print(f"Regular Mode - Status: {result.status_code}")
|
||||
print(f"Regular Mode - Content length: {len(result.markdown.raw_markdown)}")
|
||||
print(f"Regular Mode - First 100 chars: {result.markdown.raw_markdown[:100]}...")
|
||||
return result.success
|
||||
|
||||
async def test_undetected_mode():
|
||||
"""Test with undetected browser"""
|
||||
print("\nTesting Undetected Browser Mode...")
|
||||
from crawl4ai import UndetectedAdapter
|
||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=False,
|
||||
verbose=True
|
||||
)
|
||||
|
||||
# Create undetected adapter
|
||||
undetected_adapter = UndetectedAdapter()
|
||||
|
||||
# Create strategy with undetected adapter
|
||||
crawler_strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=undetected_adapter
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(
|
||||
crawler_strategy=crawler_strategy,
|
||||
config=browser_config
|
||||
) as crawler:
|
||||
result = await crawler.arun(url="https://www.example.com")
|
||||
print(f"Undetected Mode - Success: {result.success}")
|
||||
print(f"Undetected Mode - Status: {result.status_code}")
|
||||
print(f"Undetected Mode - Content length: {len(result.markdown.raw_markdown)}")
|
||||
print(f"Undetected Mode - First 100 chars: {result.markdown.raw_markdown[:100]}...")
|
||||
return result.success
|
||||
|
||||
async def main():
|
||||
"""Run both tests"""
|
||||
print("🤖 Crawl4AI Basic Adapter Test\n")
|
||||
|
||||
# Test regular mode
|
||||
regular_success = await test_regular_mode()
|
||||
|
||||
# Test undetected mode
|
||||
undetected_success = await test_undetected_mode()
|
||||
|
||||
# Summary
|
||||
print("\n" + "="*50)
|
||||
print("Summary:")
|
||||
print(f"Regular Mode: {'✅ Success' if regular_success else '❌ Failed'}")
|
||||
print(f"Undetected Mode: {'✅ Success' if undetected_success else '❌ Failed'}")
|
||||
print("="*50)
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
155
docs/examples/undetectability/undetected_bot_test.py
Normal file
155
docs/examples/undetectability/undetected_bot_test.py
Normal file
@@ -0,0 +1,155 @@
|
||||
"""
|
||||
Bot Detection Test - Compare Regular vs Undetected
|
||||
Tests browser fingerprinting differences at bot.sannysoft.com
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
UndetectedAdapter,
|
||||
CrawlResult
|
||||
)
|
||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||
|
||||
# Bot detection test site
|
||||
TEST_URL = "https://bot.sannysoft.com"
|
||||
|
||||
def analyze_bot_detection(result: CrawlResult) -> dict:
|
||||
"""Analyze bot detection results from the page"""
|
||||
detections = {
|
||||
"webdriver": False,
|
||||
"headless": False,
|
||||
"automation": False,
|
||||
"user_agent": False,
|
||||
"total_tests": 0,
|
||||
"failed_tests": 0
|
||||
}
|
||||
|
||||
if not result.success or not result.html:
|
||||
return detections
|
||||
|
||||
# Look for specific test results in the HTML
|
||||
html_lower = result.html.lower()
|
||||
|
||||
# Check for common bot indicators
|
||||
if "webdriver" in html_lower and ("fail" in html_lower or "true" in html_lower):
|
||||
detections["webdriver"] = True
|
||||
detections["failed_tests"] += 1
|
||||
|
||||
if "headless" in html_lower and ("fail" in html_lower or "true" in html_lower):
|
||||
detections["headless"] = True
|
||||
detections["failed_tests"] += 1
|
||||
|
||||
if "automation" in html_lower and "detected" in html_lower:
|
||||
detections["automation"] = True
|
||||
detections["failed_tests"] += 1
|
||||
|
||||
# Count total tests (approximate)
|
||||
detections["total_tests"] = html_lower.count("test") + html_lower.count("check")
|
||||
|
||||
return detections
|
||||
|
||||
async def test_browser_mode(adapter_name: str, adapter=None):
|
||||
"""Test a browser mode and return results"""
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Testing: {adapter_name}")
|
||||
print(f"{'='*60}")
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=False, # Run in headed mode for better results
|
||||
verbose=True,
|
||||
viewport_width=1920,
|
||||
viewport_height=1080,
|
||||
)
|
||||
|
||||
if adapter:
|
||||
# Use undetected mode
|
||||
crawler_strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=adapter
|
||||
)
|
||||
crawler = AsyncWebCrawler(
|
||||
crawler_strategy=crawler_strategy,
|
||||
config=browser_config
|
||||
)
|
||||
else:
|
||||
# Use regular mode
|
||||
crawler = AsyncWebCrawler(config=browser_config)
|
||||
|
||||
async with crawler:
|
||||
config = CrawlerRunConfig(
|
||||
delay_before_return_html=3.0, # Let detection scripts run
|
||||
wait_for_images=True,
|
||||
screenshot=True,
|
||||
simulate_user=False, # Don't simulate for accurate detection
|
||||
)
|
||||
|
||||
result = await crawler.arun(url=TEST_URL, config=config)
|
||||
|
||||
print(f"\n✓ Success: {result.success}")
|
||||
print(f"✓ Status Code: {result.status_code}")
|
||||
|
||||
if result.success:
|
||||
# Analyze detection results
|
||||
detections = analyze_bot_detection(result)
|
||||
|
||||
print(f"\n🔍 Bot Detection Analysis:")
|
||||
print(f" - WebDriver Detected: {'❌ Yes' if detections['webdriver'] else '✅ No'}")
|
||||
print(f" - Headless Detected: {'❌ Yes' if detections['headless'] else '✅ No'}")
|
||||
print(f" - Automation Detected: {'❌ Yes' if detections['automation'] else '✅ No'}")
|
||||
print(f" - Failed Tests: {detections['failed_tests']}")
|
||||
|
||||
# Show some content
|
||||
if result.markdown.raw_markdown:
|
||||
print(f"\nContent preview:")
|
||||
lines = result.markdown.raw_markdown.split('\n')
|
||||
for line in lines[:20]: # Show first 20 lines
|
||||
if any(keyword in line.lower() for keyword in ['test', 'pass', 'fail', 'yes', 'no']):
|
||||
print(f" {line.strip()}")
|
||||
|
||||
return result, detections if result.success else {}
|
||||
|
||||
async def main():
|
||||
"""Run the comparison"""
|
||||
print("🤖 Crawl4AI - Bot Detection Test")
|
||||
print(f"Testing at: {TEST_URL}")
|
||||
print("This site runs various browser fingerprinting tests\n")
|
||||
|
||||
# Test regular browser
|
||||
regular_result, regular_detections = await test_browser_mode("Regular Browser")
|
||||
|
||||
# Small delay
|
||||
await asyncio.sleep(2)
|
||||
|
||||
# Test undetected browser
|
||||
undetected_adapter = UndetectedAdapter()
|
||||
undetected_result, undetected_detections = await test_browser_mode(
|
||||
"Undetected Browser",
|
||||
undetected_adapter
|
||||
)
|
||||
|
||||
# Summary comparison
|
||||
print(f"\n{'='*60}")
|
||||
print("COMPARISON SUMMARY")
|
||||
print(f"{'='*60}")
|
||||
|
||||
print(f"\n{'Test':<25} {'Regular':<15} {'Undetected':<15}")
|
||||
print(f"{'-'*55}")
|
||||
|
||||
if regular_detections and undetected_detections:
|
||||
print(f"{'WebDriver Detection':<25} {'❌ Detected' if regular_detections['webdriver'] else '✅ Passed':<15} {'❌ Detected' if undetected_detections['webdriver'] else '✅ Passed':<15}")
|
||||
print(f"{'Headless Detection':<25} {'❌ Detected' if regular_detections['headless'] else '✅ Passed':<15} {'❌ Detected' if undetected_detections['headless'] else '✅ Passed':<15}")
|
||||
print(f"{'Automation Detection':<25} {'❌ Detected' if regular_detections['automation'] else '✅ Passed':<15} {'❌ Detected' if undetected_detections['automation'] else '✅ Passed':<15}")
|
||||
print(f"{'Failed Tests':<25} {regular_detections['failed_tests']:<15} {undetected_detections['failed_tests']:<15}")
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
|
||||
if undetected_detections.get('failed_tests', 0) < regular_detections.get('failed_tests', 1):
|
||||
print("✅ Undetected browser performed better at evading detection!")
|
||||
else:
|
||||
print("ℹ️ Both browsers had similar detection results")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
164
docs/examples/undetectability/undetected_cloudflare_test.py
Normal file
164
docs/examples/undetectability/undetected_cloudflare_test.py
Normal file
@@ -0,0 +1,164 @@
|
||||
"""
|
||||
Undetected Browser Test - Cloudflare Protected Site
|
||||
Tests the difference between regular and undetected modes on a Cloudflare-protected site
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
UndetectedAdapter
|
||||
)
|
||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||
|
||||
# Test URL with Cloudflare protection
|
||||
TEST_URL = "https://nowsecure.nl"
|
||||
|
||||
async def test_regular_browser():
|
||||
"""Test with regular browser - likely to be blocked"""
|
||||
print("=" * 60)
|
||||
print("Testing with Regular Browser")
|
||||
print("=" * 60)
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=False,
|
||||
verbose=True,
|
||||
viewport_width=1920,
|
||||
viewport_height=1080,
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
config = CrawlerRunConfig(
|
||||
delay_before_return_html=2.0,
|
||||
simulate_user=True,
|
||||
magic=True, # Try with magic mode too
|
||||
)
|
||||
|
||||
result = await crawler.arun(url=TEST_URL, config=config)
|
||||
|
||||
print(f"\n✓ Success: {result.success}")
|
||||
print(f"✓ Status Code: {result.status_code}")
|
||||
print(f"✓ HTML Length: {len(result.html)}")
|
||||
|
||||
# Check for Cloudflare challenge
|
||||
if result.html:
|
||||
cf_indicators = [
|
||||
"Checking your browser",
|
||||
"Please stand by",
|
||||
"cloudflare",
|
||||
"cf-browser-verification",
|
||||
"Access denied",
|
||||
"Ray ID"
|
||||
]
|
||||
|
||||
detected = False
|
||||
for indicator in cf_indicators:
|
||||
if indicator.lower() in result.html.lower():
|
||||
print(f"⚠️ Cloudflare Challenge Detected: '{indicator}' found")
|
||||
detected = True
|
||||
break
|
||||
|
||||
if not detected and len(result.markdown.raw_markdown) > 100:
|
||||
print("✅ Successfully bypassed Cloudflare!")
|
||||
print(f"Content preview: {result.markdown.raw_markdown[:200]}...")
|
||||
elif not detected:
|
||||
print("⚠️ Page loaded but content seems minimal")
|
||||
|
||||
return result
|
||||
|
||||
async def test_undetected_browser():
|
||||
"""Test with undetected browser - should bypass Cloudflare"""
|
||||
print("\n" + "=" * 60)
|
||||
print("Testing with Undetected Browser")
|
||||
print("=" * 60)
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=False, # Headless is easier to detect
|
||||
verbose=True,
|
||||
viewport_width=1920,
|
||||
viewport_height=1080,
|
||||
)
|
||||
|
||||
# Create undetected adapter
|
||||
undetected_adapter = UndetectedAdapter()
|
||||
|
||||
# Create strategy with undetected adapter
|
||||
crawler_strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=undetected_adapter
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(
|
||||
crawler_strategy=crawler_strategy,
|
||||
config=browser_config
|
||||
) as crawler:
|
||||
config = CrawlerRunConfig(
|
||||
delay_before_return_html=2.0,
|
||||
simulate_user=True,
|
||||
)
|
||||
|
||||
result = await crawler.arun(url=TEST_URL, config=config)
|
||||
|
||||
print(f"\n✓ Success: {result.success}")
|
||||
print(f"✓ Status Code: {result.status_code}")
|
||||
print(f"✓ HTML Length: {len(result.html)}")
|
||||
|
||||
# Check for Cloudflare challenge
|
||||
if result.html:
|
||||
cf_indicators = [
|
||||
"Checking your browser",
|
||||
"Please stand by",
|
||||
"cloudflare",
|
||||
"cf-browser-verification",
|
||||
"Access denied",
|
||||
"Ray ID"
|
||||
]
|
||||
|
||||
detected = False
|
||||
for indicator in cf_indicators:
|
||||
if indicator.lower() in result.html.lower():
|
||||
print(f"⚠️ Cloudflare Challenge Detected: '{indicator}' found")
|
||||
detected = True
|
||||
break
|
||||
|
||||
if not detected and len(result.markdown.raw_markdown) > 100:
|
||||
print("✅ Successfully bypassed Cloudflare!")
|
||||
print(f"Content preview: {result.markdown.raw_markdown[:200]}...")
|
||||
elif not detected:
|
||||
print("⚠️ Page loaded but content seems minimal")
|
||||
|
||||
return result
|
||||
|
||||
async def main():
|
||||
"""Compare regular vs undetected browser"""
|
||||
print("🤖 Crawl4AI - Cloudflare Bypass Test")
|
||||
print(f"Testing URL: {TEST_URL}\n")
|
||||
|
||||
# Test regular browser
|
||||
regular_result = await test_regular_browser()
|
||||
|
||||
# Small delay
|
||||
await asyncio.sleep(2)
|
||||
|
||||
# Test undetected browser
|
||||
undetected_result = await test_undetected_browser()
|
||||
|
||||
# Summary
|
||||
print("\n" + "=" * 60)
|
||||
print("SUMMARY")
|
||||
print("=" * 60)
|
||||
print(f"Regular Browser:")
|
||||
print(f" - Success: {regular_result.success}")
|
||||
print(f" - Content Length: {len(regular_result.markdown.raw_markdown) if regular_result.markdown else 0}")
|
||||
|
||||
print(f"\nUndetected Browser:")
|
||||
print(f" - Success: {undetected_result.success}")
|
||||
print(f" - Content Length: {len(undetected_result.markdown.raw_markdown) if undetected_result.markdown else 0}")
|
||||
|
||||
if undetected_result.success and len(undetected_result.markdown.raw_markdown) > len(regular_result.markdown.raw_markdown):
|
||||
print("\n✅ Undetected browser successfully bypassed protection!")
|
||||
print("=" * 60)
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -0,0 +1,184 @@
|
||||
"""
|
||||
Undetected vs Regular Browser Comparison
|
||||
This example demonstrates the difference between regular and undetected browser modes
|
||||
when accessing sites with bot detection services.
|
||||
|
||||
Based on tested anti-bot services:
|
||||
- Cloudflare
|
||||
- Kasada
|
||||
- Akamai
|
||||
- DataDome
|
||||
- Bet365
|
||||
- And others
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
PlaywrightAdapter,
|
||||
UndetectedAdapter,
|
||||
CrawlResult
|
||||
)
|
||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||
|
||||
|
||||
# Test URLs for various bot detection services
|
||||
TEST_SITES = {
|
||||
"Cloudflare Protected": "https://nowsecure.nl",
|
||||
# "Bot Detection Test": "https://bot.sannysoft.com",
|
||||
# "Fingerprint Test": "https://fingerprint.com/products/bot-detection",
|
||||
# "Browser Scan": "https://browserscan.net",
|
||||
# "CreepJS": "https://abrahamjuliot.github.io/creepjs",
|
||||
}
|
||||
|
||||
|
||||
async def test_with_adapter(url: str, adapter_name: str, adapter):
|
||||
"""Test a URL with a specific adapter"""
|
||||
browser_config = BrowserConfig(
|
||||
headless=False, # Better for avoiding detection
|
||||
viewport_width=1920,
|
||||
viewport_height=1080,
|
||||
verbose=True,
|
||||
)
|
||||
|
||||
# Create the crawler strategy with the adapter
|
||||
crawler_strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=adapter
|
||||
)
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Testing with {adapter_name} adapter")
|
||||
print(f"URL: {url}")
|
||||
print(f"{'='*60}")
|
||||
|
||||
try:
|
||||
async with AsyncWebCrawler(
|
||||
crawler_strategy=crawler_strategy,
|
||||
config=browser_config
|
||||
) as crawler:
|
||||
crawler_config = CrawlerRunConfig(
|
||||
delay_before_return_html=3.0, # Give page time to load
|
||||
wait_for_images=True,
|
||||
screenshot=True,
|
||||
simulate_user=True, # Add user simulation
|
||||
)
|
||||
|
||||
result: CrawlResult = await crawler.arun(
|
||||
url=url,
|
||||
config=crawler_config
|
||||
)
|
||||
|
||||
# Check results
|
||||
print(f"✓ Status Code: {result.status_code}")
|
||||
print(f"✓ Success: {result.success}")
|
||||
print(f"✓ HTML Length: {len(result.html)}")
|
||||
print(f"✓ Markdown Length: {len(result.markdown.raw_markdown)}")
|
||||
|
||||
# Check for common bot detection indicators
|
||||
detection_indicators = [
|
||||
"Access denied",
|
||||
"Please verify you are human",
|
||||
"Checking your browser",
|
||||
"Enable JavaScript",
|
||||
"captcha",
|
||||
"403 Forbidden",
|
||||
"Bot detection",
|
||||
"Security check"
|
||||
]
|
||||
|
||||
content_lower = result.markdown.raw_markdown.lower()
|
||||
detected = False
|
||||
for indicator in detection_indicators:
|
||||
if indicator.lower() in content_lower:
|
||||
print(f"⚠️ Possible detection: Found '{indicator}'")
|
||||
detected = True
|
||||
break
|
||||
|
||||
if not detected:
|
||||
print("✅ No obvious bot detection triggered!")
|
||||
# Show first 200 chars of content
|
||||
print(f"Content preview: {result.markdown.raw_markdown[:200]}...")
|
||||
|
||||
return result.success and not detected
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Error: {str(e)}")
|
||||
return False
|
||||
|
||||
|
||||
async def compare_adapters(url: str, site_name: str):
|
||||
"""Compare regular and undetected adapters on the same URL"""
|
||||
print(f"\n{'#'*60}")
|
||||
print(f"# Testing: {site_name}")
|
||||
print(f"{'#'*60}")
|
||||
|
||||
# Test with regular adapter
|
||||
regular_adapter = PlaywrightAdapter()
|
||||
regular_success = await test_with_adapter(url, "Regular", regular_adapter)
|
||||
|
||||
# Small delay between tests
|
||||
await asyncio.sleep(2)
|
||||
|
||||
# Test with undetected adapter
|
||||
undetected_adapter = UndetectedAdapter()
|
||||
undetected_success = await test_with_adapter(url, "Undetected", undetected_adapter)
|
||||
|
||||
# Summary
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Summary for {site_name}:")
|
||||
print(f"Regular Adapter: {'✅ Passed' if regular_success else '❌ Blocked/Detected'}")
|
||||
print(f"Undetected Adapter: {'✅ Passed' if undetected_success else '❌ Blocked/Detected'}")
|
||||
print(f"{'='*60}")
|
||||
|
||||
return regular_success, undetected_success
|
||||
|
||||
|
||||
async def main():
|
||||
"""Run comparison tests on multiple sites"""
|
||||
print("🤖 Crawl4AI Browser Adapter Comparison")
|
||||
print("Testing regular vs undetected browser modes\n")
|
||||
|
||||
results = {}
|
||||
|
||||
# Test each site
|
||||
for site_name, url in TEST_SITES.items():
|
||||
regular, undetected = await compare_adapters(url, site_name)
|
||||
results[site_name] = {
|
||||
"regular": regular,
|
||||
"undetected": undetected
|
||||
}
|
||||
|
||||
# Delay between different sites
|
||||
await asyncio.sleep(3)
|
||||
|
||||
# Final summary
|
||||
print(f"\n{'#'*60}")
|
||||
print("# FINAL RESULTS")
|
||||
print(f"{'#'*60}")
|
||||
print(f"{'Site':<30} {'Regular':<15} {'Undetected':<15}")
|
||||
print(f"{'-'*60}")
|
||||
|
||||
for site, result in results.items():
|
||||
regular_status = "✅ Passed" if result["regular"] else "❌ Blocked"
|
||||
undetected_status = "✅ Passed" if result["undetected"] else "❌ Blocked"
|
||||
print(f"{site:<30} {regular_status:<15} {undetected_status:<15}")
|
||||
|
||||
# Calculate success rates
|
||||
regular_success = sum(1 for r in results.values() if r["regular"])
|
||||
undetected_success = sum(1 for r in results.values() if r["undetected"])
|
||||
total = len(results)
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Success Rates:")
|
||||
print(f"Regular Adapter: {regular_success}/{total} ({regular_success/total*100:.1f}%)")
|
||||
print(f"Undetected Adapter: {undetected_success}/{total} ({undetected_success/total*100:.1f}%)")
|
||||
print(f"{'='*60}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Note: This example may take a while to run as it tests multiple sites
|
||||
# You can comment out sites in TEST_SITES to run faster tests
|
||||
asyncio.run(main())
|
||||
118
docs/examples/undetected_simple_demo.py
Normal file
118
docs/examples/undetected_simple_demo.py
Normal file
@@ -0,0 +1,118 @@
|
||||
"""
|
||||
Simple Undetected Browser Demo
|
||||
Demonstrates the basic usage of undetected browser mode
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
UndetectedAdapter
|
||||
)
|
||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||
|
||||
async def crawl_with_regular_browser(url: str):
|
||||
"""Crawl with regular browser"""
|
||||
print("\n[Regular Browser Mode]")
|
||||
browser_config = BrowserConfig(
|
||||
headless=False,
|
||||
verbose=True,
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(
|
||||
url=url,
|
||||
config=CrawlerRunConfig(
|
||||
delay_before_return_html=2.0
|
||||
)
|
||||
)
|
||||
|
||||
print(f"Success: {result.success}")
|
||||
print(f"Status: {result.status_code}")
|
||||
print(f"Content length: {len(result.markdown.raw_markdown)}")
|
||||
|
||||
# Check for bot detection keywords
|
||||
content = result.markdown.raw_markdown.lower()
|
||||
if any(word in content for word in ["cloudflare", "checking your browser", "please wait"]):
|
||||
print("⚠️ Bot detection triggered!")
|
||||
else:
|
||||
print("✅ Page loaded successfully")
|
||||
|
||||
return result
|
||||
|
||||
async def crawl_with_undetected_browser(url: str):
|
||||
"""Crawl with undetected browser"""
|
||||
print("\n[Undetected Browser Mode]")
|
||||
browser_config = BrowserConfig(
|
||||
headless=False,
|
||||
verbose=True,
|
||||
)
|
||||
|
||||
# Create undetected adapter and strategy
|
||||
undetected_adapter = UndetectedAdapter()
|
||||
crawler_strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=undetected_adapter
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(
|
||||
crawler_strategy=crawler_strategy,
|
||||
config=browser_config
|
||||
) as crawler:
|
||||
result = await crawler.arun(
|
||||
url=url,
|
||||
config=CrawlerRunConfig(
|
||||
delay_before_return_html=2.0
|
||||
)
|
||||
)
|
||||
|
||||
print(f"Success: {result.success}")
|
||||
print(f"Status: {result.status_code}")
|
||||
print(f"Content length: {len(result.markdown.raw_markdown)}")
|
||||
|
||||
# Check for bot detection keywords
|
||||
content = result.markdown.raw_markdown.lower()
|
||||
if any(word in content for word in ["cloudflare", "checking your browser", "please wait"]):
|
||||
print("⚠️ Bot detection triggered!")
|
||||
else:
|
||||
print("✅ Page loaded successfully")
|
||||
|
||||
return result
|
||||
|
||||
async def main():
|
||||
"""Demo comparing regular vs undetected modes"""
|
||||
print("🤖 Crawl4AI Undetected Browser Demo")
|
||||
print("="*50)
|
||||
|
||||
# Test URLs - you can change these
|
||||
test_urls = [
|
||||
"https://www.example.com", # Simple site
|
||||
"https://httpbin.org/headers", # Shows request headers
|
||||
]
|
||||
|
||||
for url in test_urls:
|
||||
print(f"\n📍 Testing URL: {url}")
|
||||
|
||||
# Test with regular browser
|
||||
regular_result = await crawl_with_regular_browser(url)
|
||||
|
||||
# Small delay
|
||||
await asyncio.sleep(2)
|
||||
|
||||
# Test with undetected browser
|
||||
undetected_result = await crawl_with_undetected_browser(url)
|
||||
|
||||
# Compare results
|
||||
print(f"\n📊 Comparison for {url}:")
|
||||
print(f"Regular browser content: {len(regular_result.markdown.raw_markdown)} chars")
|
||||
print(f"Undetected browser content: {len(undetected_result.markdown.raw_markdown)} chars")
|
||||
|
||||
if url == "https://httpbin.org/headers":
|
||||
# Show headers for comparison
|
||||
print("\nHeaders seen by server:")
|
||||
print("Regular:", regular_result.markdown.raw_markdown[:500])
|
||||
print("\nUndetected:", undetected_result.markdown.raw_markdown[:500])
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -130,7 +130,7 @@ Factors:
|
||||
|
||||
```python
|
||||
class CustomLinkScorer:
|
||||
def score(self, link: Link, query: str, state: AdaptiveCrawlResult) -> float:
|
||||
def score(self, link: Link, query: str, state: CrawlState) -> float:
|
||||
# Prioritize specific URL patterns
|
||||
if "/api/reference/" in link.href:
|
||||
return 2.0 # Double the score
|
||||
@@ -325,17 +325,17 @@ with open("crawl_analysis.json", "w") as f:
|
||||
from crawl4ai.adaptive_crawler import BaseStrategy
|
||||
|
||||
class DomainSpecificStrategy(BaseStrategy):
|
||||
def calculate_coverage(self, state: AdaptiveCrawlResult) -> float:
|
||||
def calculate_coverage(self, state: CrawlState) -> float:
|
||||
# Custom coverage calculation
|
||||
# e.g., weight certain terms more heavily
|
||||
pass
|
||||
|
||||
def calculate_consistency(self, state: AdaptiveCrawlResult) -> float:
|
||||
def calculate_consistency(self, state: CrawlState) -> float:
|
||||
# Custom consistency logic
|
||||
# e.g., domain-specific validation
|
||||
pass
|
||||
|
||||
def rank_links(self, links: List[Link], state: AdaptiveCrawlResult) -> List[Link]:
|
||||
def rank_links(self, links: List[Link], state: CrawlState) -> List[Link]:
|
||||
# Custom link ranking
|
||||
# e.g., prioritize specific URL patterns
|
||||
pass
|
||||
@@ -359,7 +359,7 @@ class HybridStrategy(BaseStrategy):
|
||||
URLPatternStrategy()
|
||||
]
|
||||
|
||||
def calculate_confidence(self, state: AdaptiveCrawlResult) -> float:
|
||||
def calculate_confidence(self, state: CrawlState) -> float:
|
||||
# Weighted combination of strategies
|
||||
scores = [s.calculate_confidence(state) for s in self.strategies]
|
||||
weights = [0.5, 0.3, 0.2]
|
||||
|
||||
@@ -358,9 +358,77 @@ if __name__ == "__main__":
|
||||
|
||||
---
|
||||
|
||||
---
|
||||
|
||||
## 7. Anti-Bot Features (Stealth Mode & Undetected Browser)
|
||||
|
||||
Crawl4AI provides two powerful features to bypass bot detection:
|
||||
|
||||
### 7.1 Stealth Mode
|
||||
|
||||
Stealth mode uses playwright-stealth to modify browser fingerprints and behaviors. Enable it with a simple flag:
|
||||
|
||||
```python
|
||||
browser_config = BrowserConfig(
|
||||
enable_stealth=True, # Activates stealth mode
|
||||
headless=False
|
||||
)
|
||||
```
|
||||
|
||||
**When to use**: Sites with basic bot detection (checking navigator.webdriver, plugins, etc.)
|
||||
|
||||
### 7.2 Undetected Browser
|
||||
|
||||
For advanced bot detection, use the undetected browser adapter:
|
||||
|
||||
```python
|
||||
from crawl4ai import UndetectedAdapter
|
||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||
|
||||
# Create undetected adapter
|
||||
adapter = UndetectedAdapter()
|
||||
strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=adapter
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(crawler_strategy=strategy, config=browser_config) as crawler:
|
||||
# Your crawling code
|
||||
```
|
||||
|
||||
**When to use**: Sites with sophisticated bot detection (Cloudflare, DataDome, etc.)
|
||||
|
||||
### 7.3 Combining Both
|
||||
|
||||
For maximum evasion, combine stealth mode with undetected browser:
|
||||
|
||||
```python
|
||||
browser_config = BrowserConfig(
|
||||
enable_stealth=True, # Enable stealth
|
||||
headless=False
|
||||
)
|
||||
|
||||
adapter = UndetectedAdapter() # Use undetected browser
|
||||
```
|
||||
|
||||
### Choosing the Right Approach
|
||||
|
||||
| Detection Level | Recommended Approach |
|
||||
|----------------|---------------------|
|
||||
| No protection | Regular browser |
|
||||
| Basic checks | Regular + Stealth mode |
|
||||
| Advanced protection | Undetected browser |
|
||||
| Maximum evasion | Undetected + Stealth mode |
|
||||
|
||||
**Best Practice**: Start with regular browser + stealth mode. Only use undetected browser if needed, as it may be slightly slower.
|
||||
|
||||
See [Undetected Browser Mode](undetected-browser.md) for detailed examples.
|
||||
|
||||
---
|
||||
|
||||
## Conclusion & Next Steps
|
||||
|
||||
You’ve now explored several **advanced** features:
|
||||
You've now explored several **advanced** features:
|
||||
|
||||
- **Proxy Usage**
|
||||
- **PDF & Screenshot** capturing for large or critical pages
|
||||
@@ -368,7 +436,10 @@ You’ve now explored several **advanced** features:
|
||||
- **Custom Headers** for language or specialized requests
|
||||
- **Session Persistence** via storage state
|
||||
- **Robots.txt Compliance**
|
||||
- **Anti-Bot Features** (Stealth Mode & Undetected Browser)
|
||||
|
||||
With these power tools, you can build robust scraping workflows that mimic real user behavior, handle secure sites, capture detailed snapshots, and manage sessions across multiple runs—streamlining your entire data collection pipeline.
|
||||
With these power tools, you can build robust scraping workflows that mimic real user behavior, handle secure sites, capture detailed snapshots, manage sessions across multiple runs, and bypass bot detection—streamlining your entire data collection pipeline.
|
||||
|
||||
**Last Updated**: 2025-01-01
|
||||
**Note**: In future versions, we may enable stealth mode and undetected browser by default. For now, users should explicitly enable these features when needed.
|
||||
|
||||
**Last Updated**: 2025-01-17
|
||||
@@ -49,75 +49,46 @@ from crawl4ai import JsonCssExtractionStrategy
|
||||
from crawl4ai.cache_context import CacheMode
|
||||
|
||||
async def crawl_dynamic_content():
|
||||
url = "https://github.com/microsoft/TypeScript/commits/main"
|
||||
session_id = "wait_for_session"
|
||||
all_commits = []
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
session_id = "github_commits_session"
|
||||
url = "https://github.com/microsoft/TypeScript/commits/main"
|
||||
all_commits = []
|
||||
|
||||
js_next_page = """
|
||||
const commits = document.querySelectorAll('li[data-testid="commit-row-item"] h4');
|
||||
if (commits.length > 0) {
|
||||
window.lastCommit = commits[0].textContent.trim();
|
||||
}
|
||||
const button = document.querySelector('a[data-testid="pagination-next-button"]');
|
||||
if (button) {button.click(); console.log('button clicked') }
|
||||
"""
|
||||
# Define extraction schema
|
||||
schema = {
|
||||
"name": "Commit Extractor",
|
||||
"baseSelector": "li.Box-sc-g0xbh4-0",
|
||||
"fields": [{
|
||||
"name": "title", "selector": "h4.markdown-title", "type": "text"
|
||||
}],
|
||||
}
|
||||
extraction_strategy = JsonCssExtractionStrategy(schema)
|
||||
|
||||
wait_for = """() => {
|
||||
const commits = document.querySelectorAll('li[data-testid="commit-row-item"] h4');
|
||||
if (commits.length === 0) return false;
|
||||
const firstCommit = commits[0].textContent.trim();
|
||||
return firstCommit !== window.lastCommit;
|
||||
}"""
|
||||
|
||||
schema = {
|
||||
"name": "Commit Extractor",
|
||||
"baseSelector": "li[data-testid='commit-row-item']",
|
||||
"fields": [
|
||||
{
|
||||
"name": "title",
|
||||
"selector": "h4 a",
|
||||
"type": "text",
|
||||
"transform": "strip",
|
||||
},
|
||||
],
|
||||
}
|
||||
extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
|
||||
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
verbose=True,
|
||||
headless=False,
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
# JavaScript and wait configurations
|
||||
js_next_page = """document.querySelector('a[data-testid="pagination-next-button"]').click();"""
|
||||
wait_for = """() => document.querySelectorAll('li.Box-sc-g0xbh4-0').length > 0"""
|
||||
|
||||
# Crawl multiple pages
|
||||
for page in range(3):
|
||||
crawler_config = CrawlerRunConfig(
|
||||
config = CrawlerRunConfig(
|
||||
url=url,
|
||||
session_id=session_id,
|
||||
css_selector="li[data-testid='commit-row-item']",
|
||||
extraction_strategy=extraction_strategy,
|
||||
js_code=js_next_page if page > 0 else None,
|
||||
wait_for=wait_for if page > 0 else None,
|
||||
js_only=page > 0,
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
capture_console_messages=True,
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
|
||||
result = await crawler.arun(url=url, config=crawler_config)
|
||||
|
||||
if result.console_messages:
|
||||
print(f"Page {page + 1} console messages:", result.console_messages)
|
||||
|
||||
if result.extracted_content:
|
||||
# print(f"Page {page + 1} result:", result.extracted_content)
|
||||
|
||||
result = await crawler.arun(config=config)
|
||||
if result.success:
|
||||
commits = json.loads(result.extracted_content)
|
||||
all_commits.extend(commits)
|
||||
print(f"Page {page + 1}: Found {len(commits)} commits")
|
||||
else:
|
||||
print(f"Page {page + 1}: No content extracted")
|
||||
|
||||
print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
|
||||
# Clean up session
|
||||
await crawler.crawler_strategy.kill_session(session_id)
|
||||
return all_commits
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
394
docs/md_v2/advanced/undetected-browser.md
Normal file
394
docs/md_v2/advanced/undetected-browser.md
Normal file
@@ -0,0 +1,394 @@
|
||||
# Undetected Browser Mode
|
||||
|
||||
## Overview
|
||||
|
||||
Crawl4AI offers two powerful anti-bot features to help you access websites with bot detection:
|
||||
|
||||
1. **Stealth Mode** - Uses playwright-stealth to modify browser fingerprints and behaviors
|
||||
2. **Undetected Browser Mode** - Advanced browser adapter with deep-level patches for sophisticated bot detection
|
||||
|
||||
This guide covers both features and helps you choose the right approach for your needs.
|
||||
|
||||
## Anti-Bot Features Comparison
|
||||
|
||||
| Feature | Regular Browser | Stealth Mode | Undetected Browser |
|
||||
|---------|----------------|--------------|-------------------|
|
||||
| WebDriver Detection | ❌ | ✅ | ✅ |
|
||||
| Navigator Properties | ❌ | ✅ | ✅ |
|
||||
| Plugin Emulation | ❌ | ✅ | ✅ |
|
||||
| CDP Detection | ❌ | Partial | ✅ |
|
||||
| Deep Browser Patches | ❌ | ❌ | ✅ |
|
||||
| Performance Impact | None | Minimal | Moderate |
|
||||
| Setup Complexity | None | None | Minimal |
|
||||
|
||||
## When to Use Each Approach
|
||||
|
||||
### Use Regular Browser + Stealth Mode When:
|
||||
- Sites have basic bot detection (checking navigator.webdriver, plugins, etc.)
|
||||
- You need good performance with basic protection
|
||||
- Sites check for common automation indicators
|
||||
|
||||
### Use Undetected Browser When:
|
||||
- Sites employ sophisticated bot detection services (Cloudflare, DataDome, etc.)
|
||||
- Stealth mode alone isn't sufficient
|
||||
- You're willing to trade some performance for better evasion
|
||||
|
||||
### Best Practice: Progressive Enhancement
|
||||
1. **Start with**: Regular browser + Stealth mode
|
||||
2. **If blocked**: Switch to Undetected browser
|
||||
3. **If still blocked**: Combine Undetected browser + Stealth mode
|
||||
|
||||
## Stealth Mode
|
||||
|
||||
Stealth mode is the simpler anti-bot solution that works with both regular and undetected browsers:
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig
|
||||
|
||||
# Enable stealth mode with regular browser
|
||||
browser_config = BrowserConfig(
|
||||
enable_stealth=True, # Simple flag to enable
|
||||
headless=False # Better for avoiding detection
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun("https://example.com")
|
||||
```
|
||||
|
||||
### What Stealth Mode Does:
|
||||
- Removes `navigator.webdriver` flag
|
||||
- Modifies browser fingerprints
|
||||
- Emulates realistic plugin behavior
|
||||
- Adjusts navigator properties
|
||||
- Fixes common automation leaks
|
||||
|
||||
## Undetected Browser Mode
|
||||
|
||||
For sites with sophisticated bot detection that stealth mode can't bypass, use the undetected browser adapter:
|
||||
|
||||
### Key Features
|
||||
|
||||
- **Drop-in Replacement**: Uses the same API as regular browser mode
|
||||
- **Enhanced Stealth**: Built-in patches to evade common detection methods
|
||||
- **Browser Adapter Pattern**: Seamlessly switch between regular and undetected modes
|
||||
- **Automatic Installation**: `crawl4ai-setup` installs all necessary browser dependencies
|
||||
|
||||
### Quick Start
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
UndetectedAdapter
|
||||
)
|
||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||
|
||||
async def main():
|
||||
# Create the undetected adapter
|
||||
undetected_adapter = UndetectedAdapter()
|
||||
|
||||
# Create browser config
|
||||
browser_config = BrowserConfig(
|
||||
headless=False, # Headless mode can be detected easier
|
||||
verbose=True,
|
||||
)
|
||||
|
||||
# Create the crawler strategy with undetected adapter
|
||||
crawler_strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=undetected_adapter
|
||||
)
|
||||
|
||||
# Create the crawler with our custom strategy
|
||||
async with AsyncWebCrawler(
|
||||
crawler_strategy=crawler_strategy,
|
||||
config=browser_config
|
||||
) as crawler:
|
||||
# Your crawling code here
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
config=CrawlerRunConfig()
|
||||
)
|
||||
print(result.markdown[:500])
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
## Combining Both Features
|
||||
|
||||
For maximum evasion, combine stealth mode with undetected browser:
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, UndetectedAdapter
|
||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||
|
||||
# Create browser config with stealth enabled
|
||||
browser_config = BrowserConfig(
|
||||
enable_stealth=True, # Enable stealth mode
|
||||
headless=False
|
||||
)
|
||||
|
||||
# Create undetected adapter
|
||||
adapter = UndetectedAdapter()
|
||||
|
||||
# Create strategy with both features
|
||||
strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=adapter
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(
|
||||
crawler_strategy=strategy,
|
||||
config=browser_config
|
||||
) as crawler:
|
||||
result = await crawler.arun("https://protected-site.com")
|
||||
```
|
||||
|
||||
## Examples
|
||||
|
||||
### Example 1: Basic Stealth Mode
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
|
||||
async def test_stealth_mode():
|
||||
# Simple stealth mode configuration
|
||||
browser_config = BrowserConfig(
|
||||
enable_stealth=True,
|
||||
headless=False
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://bot.sannysoft.com",
|
||||
config=CrawlerRunConfig(screenshot=True)
|
||||
)
|
||||
|
||||
if result.success:
|
||||
print("✓ Successfully accessed bot detection test site")
|
||||
# Save screenshot to verify detection results
|
||||
if result.screenshot:
|
||||
import base64
|
||||
with open("stealth_test.png", "wb") as f:
|
||||
f.write(base64.b64decode(result.screenshot))
|
||||
print("✓ Screenshot saved - check for green (passed) tests")
|
||||
|
||||
asyncio.run(test_stealth_mode())
|
||||
```
|
||||
|
||||
### Example 2: Undetected Browser Mode
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
UndetectedAdapter
|
||||
)
|
||||
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
|
||||
|
||||
|
||||
async def main():
|
||||
# Create browser config
|
||||
browser_config = BrowserConfig(
|
||||
headless=False,
|
||||
verbose=True,
|
||||
)
|
||||
|
||||
# Create the undetected adapter
|
||||
undetected_adapter = UndetectedAdapter()
|
||||
|
||||
# Create the crawler strategy with the undetected adapter
|
||||
crawler_strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=undetected_adapter
|
||||
)
|
||||
|
||||
# Create the crawler with our custom strategy
|
||||
async with AsyncWebCrawler(
|
||||
crawler_strategy=crawler_strategy,
|
||||
config=browser_config
|
||||
) as crawler:
|
||||
# Configure the crawl
|
||||
crawler_config = CrawlerRunConfig(
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
content_filter=PruningContentFilter()
|
||||
),
|
||||
capture_console_messages=True, # Test adapter console capture
|
||||
)
|
||||
|
||||
# Test on a site that typically detects bots
|
||||
print("Testing undetected adapter...")
|
||||
result: CrawlResult = await crawler.arun(
|
||||
url="https://www.helloworld.org",
|
||||
config=crawler_config
|
||||
)
|
||||
|
||||
print(f"Status: {result.status_code}")
|
||||
print(f"Success: {result.success}")
|
||||
print(f"Console messages captured: {len(result.console_messages or [])}")
|
||||
print(f"Markdown content (first 500 chars):\n{result.markdown.raw_markdown[:500]}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
## Browser Adapter Pattern
|
||||
|
||||
The undetected browser support is implemented using an adapter pattern, allowing seamless switching between different browser implementations:
|
||||
|
||||
```python
|
||||
# Regular browser adapter (default)
|
||||
from crawl4ai import PlaywrightAdapter
|
||||
regular_adapter = PlaywrightAdapter()
|
||||
|
||||
# Undetected browser adapter
|
||||
from crawl4ai import UndetectedAdapter
|
||||
undetected_adapter = UndetectedAdapter()
|
||||
```
|
||||
|
||||
The adapter handles:
|
||||
- JavaScript execution
|
||||
- Console message capture
|
||||
- Error handling
|
||||
- Browser-specific optimizations
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Avoid Headless Mode**: Detection is easier in headless mode
|
||||
```python
|
||||
browser_config = BrowserConfig(headless=False)
|
||||
```
|
||||
|
||||
2. **Use Reasonable Delays**: Don't rush through pages
|
||||
```python
|
||||
crawler_config = CrawlerRunConfig(
|
||||
wait_time=3.0, # Wait 3 seconds after page load
|
||||
delay_before_return_html=2.0 # Additional delay
|
||||
)
|
||||
```
|
||||
|
||||
3. **Rotate User Agents**: You can customize user agents
|
||||
```python
|
||||
browser_config = BrowserConfig(
|
||||
headers={"User-Agent": "your-user-agent"}
|
||||
)
|
||||
```
|
||||
|
||||
4. **Handle Failures Gracefully**: Some sites may still detect and block
|
||||
```python
|
||||
if not result.success:
|
||||
print(f"Crawl failed: {result.error_message}")
|
||||
```
|
||||
|
||||
## Advanced Usage Tips
|
||||
|
||||
### Progressive Detection Handling
|
||||
|
||||
```python
|
||||
async def crawl_with_progressive_evasion(url):
|
||||
# Step 1: Try regular browser with stealth
|
||||
browser_config = BrowserConfig(
|
||||
enable_stealth=True,
|
||||
headless=False
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(url)
|
||||
if result.success and "Access Denied" not in result.html:
|
||||
return result
|
||||
|
||||
# Step 2: If blocked, try undetected browser
|
||||
print("Regular + stealth blocked, trying undetected browser...")
|
||||
|
||||
adapter = UndetectedAdapter()
|
||||
strategy = AsyncPlaywrightCrawlerStrategy(
|
||||
browser_config=browser_config,
|
||||
browser_adapter=adapter
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(
|
||||
crawler_strategy=strategy,
|
||||
config=browser_config
|
||||
) as crawler:
|
||||
result = await crawler.arun(url)
|
||||
return result
|
||||
```
|
||||
|
||||
## Installation
|
||||
|
||||
The undetected browser dependencies are automatically installed when you run:
|
||||
|
||||
```bash
|
||||
crawl4ai-setup
|
||||
```
|
||||
|
||||
This command installs all necessary browser dependencies for both regular and undetected modes.
|
||||
|
||||
## Limitations
|
||||
|
||||
- **Performance**: Slightly slower than regular mode due to additional patches
|
||||
- **Headless Detection**: Some sites can still detect headless mode
|
||||
- **Resource Usage**: May use more resources than regular mode
|
||||
- **Not 100% Guaranteed**: Advanced anti-bot services are constantly evolving
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Browser Not Found
|
||||
|
||||
Run the setup command:
|
||||
```bash
|
||||
crawl4ai-setup
|
||||
```
|
||||
|
||||
### Detection Still Occurring
|
||||
|
||||
Try combining with other features:
|
||||
```python
|
||||
crawler_config = CrawlerRunConfig(
|
||||
simulate_user=True, # Add user simulation
|
||||
magic=True, # Enable magic mode
|
||||
wait_time=5.0, # Longer waits
|
||||
)
|
||||
```
|
||||
|
||||
### Performance Issues
|
||||
|
||||
If experiencing slow performance:
|
||||
```python
|
||||
# Use selective undetected mode only for protected sites
|
||||
if is_protected_site(url):
|
||||
adapter = UndetectedAdapter()
|
||||
else:
|
||||
adapter = PlaywrightAdapter() # Default adapter
|
||||
```
|
||||
|
||||
## Future Plans
|
||||
|
||||
**Note**: In future versions of Crawl4AI, we may enable stealth mode and undetected browser by default to provide better out-of-the-box success rates. For now, users should explicitly enable these features when needed.
|
||||
|
||||
## Conclusion
|
||||
|
||||
Crawl4AI provides flexible anti-bot solutions:
|
||||
|
||||
1. **Start Simple**: Use regular browser + stealth mode for most sites
|
||||
2. **Escalate if Needed**: Switch to undetected browser for sophisticated protection
|
||||
3. **Combine for Maximum Effect**: Use both features together when facing the toughest challenges
|
||||
|
||||
Remember:
|
||||
- Always respect robots.txt and website terms of service
|
||||
- Use appropriate delays to avoid overwhelming servers
|
||||
- Consider the performance trade-offs of each approach
|
||||
- Test progressively to find the minimum necessary evasion level
|
||||
|
||||
## See Also
|
||||
|
||||
- [Advanced Features](advanced-features.md) - Overview of all advanced features
|
||||
- [Proxy & Security](proxy-security.md) - Using proxies with anti-bot features
|
||||
- [Session Management](session-management.md) - Maintaining sessions across requests
|
||||
- [Identity Based Crawling](identity-based-crawling.md) - Additional anti-detection strategies
|
||||
@@ -91,12 +91,13 @@ async def crawl_twitter_timeline():
|
||||
wait_after_scroll=1.0 # Twitter needs time to load
|
||||
)
|
||||
|
||||
browser_config = BrowserConfig(headless=True) # Set to False to watch it work
|
||||
config = CrawlerRunConfig(
|
||||
virtual_scroll_config=virtual_config
|
||||
virtual_scroll_config=virtual_config,
|
||||
# Optional: Set headless=False to watch it work
|
||||
# browser_config=BrowserConfig(headless=False)
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://twitter.com/search?q=AI",
|
||||
config=config
|
||||
@@ -199,7 +200,7 @@ Use **scan_full_page** when:
|
||||
Virtual Scroll works seamlessly with extraction strategies:
|
||||
|
||||
```python
|
||||
from crawl4ai import LLMExtractionStrategy, LLMConfig
|
||||
from crawl4ai import LLMExtractionStrategy
|
||||
|
||||
# Define extraction schema
|
||||
schema = {
|
||||
@@ -221,7 +222,7 @@ config = CrawlerRunConfig(
|
||||
scroll_count=20
|
||||
),
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
llm_config=LLMConfig(provider="openai/gpt-4o-mini"),
|
||||
provider="openai/gpt-4o-mini",
|
||||
schema=schema
|
||||
)
|
||||
)
|
||||
|
||||
@@ -27,7 +27,7 @@ async def digest(
|
||||
start_url: str,
|
||||
query: str,
|
||||
resume_from: Optional[Union[str, Path]] = None
|
||||
) -> AdaptiveCrawlResult
|
||||
) -> CrawlState
|
||||
```
|
||||
|
||||
#### Parameters
|
||||
@@ -38,7 +38,7 @@ async def digest(
|
||||
|
||||
#### Returns
|
||||
|
||||
- **AdaptiveCrawlResult**: The final crawl state containing all crawled URLs, knowledge base, and metrics
|
||||
- **CrawlState**: The final crawl state containing all crawled URLs, knowledge base, and metrics
|
||||
|
||||
#### Example
|
||||
|
||||
@@ -92,7 +92,7 @@ Access to the current crawl state.
|
||||
|
||||
```python
|
||||
@property
|
||||
def state(self) -> AdaptiveCrawlResult
|
||||
def state(self) -> CrawlState
|
||||
```
|
||||
|
||||
## Methods
|
||||
|
||||
@@ -9,7 +9,7 @@ async def digest(
|
||||
start_url: str,
|
||||
query: str,
|
||||
resume_from: Optional[Union[str, Path]] = None
|
||||
) -> AdaptiveCrawlResult
|
||||
) -> CrawlState
|
||||
```
|
||||
|
||||
## Parameters
|
||||
@@ -31,7 +31,7 @@ async def digest(
|
||||
|
||||
## Return Value
|
||||
|
||||
Returns a `AdaptiveCrawlResult` object containing:
|
||||
Returns a `CrawlState` object containing:
|
||||
|
||||
- **crawled_urls** (`Set[str]`): All URLs that have been crawled
|
||||
- **knowledge_base** (`List[CrawlResult]`): Collection of crawled pages with content
|
||||
|
||||
@@ -10,8 +10,9 @@ Today I'm releasing Crawl4AI v0.7.0—the Adaptive Intelligence Update. This rel
|
||||
|
||||
- **Adaptive Crawling**: Your crawler now learns and adapts to website patterns
|
||||
- **Virtual Scroll Support**: Complete content extraction from infinite scroll pages
|
||||
- **Link Preview with Intelligent Scoring**: Intelligent link analysis and prioritization
|
||||
- **Link Preview with 3-Layer Scoring**: Intelligent link analysis and prioritization
|
||||
- **Async URL Seeder**: Discover thousands of URLs in seconds with intelligent filtering
|
||||
- **PDF Parsing**: Extract data from PDF documents
|
||||
- **Performance Optimizations**: Significant speed and memory improvements
|
||||
|
||||
## 🧠 Adaptive Crawling: Intelligence Through Pattern Learning
|
||||
@@ -29,41 +30,44 @@ The Adaptive Crawler maintains a persistent state for each domain, tracking:
|
||||
- Extraction confidence scores
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
|
||||
import asyncio
|
||||
from crawl4ai import AdaptiveCrawler, AdaptiveConfig, CrawlState
|
||||
|
||||
async def main():
|
||||
|
||||
# Configure adaptive crawler
|
||||
config = AdaptiveConfig(
|
||||
strategy="statistical", # or "embedding" for semantic understanding
|
||||
max_pages=10,
|
||||
confidence_threshold=0.7, # Stop at 70% confidence
|
||||
top_k_links=3, # Follow top 3 links per page
|
||||
min_gain_threshold=0.05 # Need 5% information gain to continue
|
||||
# Initialize with custom learning parameters
|
||||
config = AdaptiveConfig(
|
||||
confidence_threshold=0.7, # Min confidence to use learned patterns
|
||||
max_history=100, # Remember last 100 crawls per domain
|
||||
learning_rate=0.2, # How quickly to adapt to changes
|
||||
patterns_per_page=3, # Patterns to learn per page type
|
||||
extraction_strategy='css' # 'css' or 'xpath'
|
||||
)
|
||||
|
||||
adaptive_crawler = AdaptiveCrawler(config)
|
||||
|
||||
# First crawl - crawler learns the structure
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
"https://news.example.com/article/12345",
|
||||
config=CrawlerRunConfig(
|
||||
adaptive_config=config,
|
||||
extraction_hints={ # Optional hints to speed up learning
|
||||
"title": "article h1",
|
||||
"content": "article .body-content"
|
||||
}
|
||||
)
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(verbose=False) as crawler:
|
||||
adaptive = AdaptiveCrawler(crawler, config)
|
||||
|
||||
print("Starting adaptive crawl about Python decorators...")
|
||||
result = await adaptive.digest(
|
||||
start_url="https://docs.python.org/3/glossary.html",
|
||||
query="python decorators functions wrapping"
|
||||
)
|
||||
|
||||
print(f"\n✅ Crawling Complete!")
|
||||
print(f"• Confidence Level: {adaptive.confidence:.0%}")
|
||||
print(f"• Pages Crawled: {len(result.crawled_urls)}")
|
||||
print(f"• Knowledge Base: {len(adaptive.state.knowledge_base)} documents")
|
||||
|
||||
# Get most relevant content
|
||||
relevant = adaptive.get_relevant_content(top_k=3)
|
||||
print(f"\nMost Relevant Pages:")
|
||||
for i, page in enumerate(relevant, 1):
|
||||
print(f"{i}. {page['url']} (relevance: {page['score']:.2%})")
|
||||
# Crawler identifies and stores patterns
|
||||
if result.success:
|
||||
state = adaptive_crawler.get_state("news.example.com")
|
||||
print(f"Learned {len(state.patterns)} patterns")
|
||||
print(f"Confidence: {state.avg_confidence:.2%}")
|
||||
|
||||
asyncio.run(main())
|
||||
# Subsequent crawls - uses learned patterns
|
||||
result2 = await crawler.arun(
|
||||
"https://news.example.com/article/67890",
|
||||
config=CrawlerRunConfig(adaptive_config=config)
|
||||
)
|
||||
# Automatically extracts using learned patterns!
|
||||
```
|
||||
|
||||
**Expected Real-World Impact:**
|
||||
@@ -88,7 +92,9 @@ twitter_config = VirtualScrollConfig(
|
||||
container_selector="[data-testid='primaryColumn']",
|
||||
scroll_count=20, # Number of scrolls
|
||||
scroll_by="container_height", # Smart scrolling by container size
|
||||
wait_after_scroll=1.0 # Let content load
|
||||
wait_after_scroll=1.0, # Let content load
|
||||
capture_method="incremental", # Capture new content on each scroll
|
||||
deduplicate=True # Remove duplicate elements
|
||||
)
|
||||
|
||||
# For e-commerce product grids (Instagram style)
|
||||
@@ -96,7 +102,8 @@ grid_config = VirtualScrollConfig(
|
||||
container_selector="main .product-grid",
|
||||
scroll_count=30,
|
||||
scroll_by=800, # Fixed pixel scrolling
|
||||
wait_after_scroll=1.5 # Images need time
|
||||
wait_after_scroll=1.5, # Images need time
|
||||
stop_on_no_change=True # Smart stopping
|
||||
)
|
||||
|
||||
# For news feeds with lazy loading
|
||||
@@ -104,7 +111,9 @@ news_config = VirtualScrollConfig(
|
||||
container_selector=".article-feed",
|
||||
scroll_count=50,
|
||||
scroll_by="page_height", # Viewport-based scrolling
|
||||
wait_after_scroll=0.5 # Wait for content to load
|
||||
wait_after_scroll=0.5,
|
||||
wait_for_selector=".article-card", # Wait for specific elements
|
||||
timeout=30000 # Max 30 seconds total
|
||||
)
|
||||
|
||||
# Use it in your crawl
|
||||
@@ -148,63 +157,68 @@ async with AsyncWebCrawler() as crawler:
|
||||
|
||||
**My Solution:** I implemented a three-layer scoring system that analyzes links like a human would—considering their position, context, and relevance to your goals.
|
||||
|
||||
### Intelligent Link Analysis and Scoring
|
||||
### The Three-Layer Scoring System
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import CrawlerRunConfig, CacheMode, AsyncWebCrawler
|
||||
from crawl4ai.adaptive_crawler import LinkPreviewConfig
|
||||
from crawl4ai import LinkPreviewConfig
|
||||
|
||||
async def main():
|
||||
# Configure intelligent link analysis
|
||||
link_config = LinkPreviewConfig(
|
||||
include_internal=True,
|
||||
include_external=False,
|
||||
max_links=10,
|
||||
concurrency=5,
|
||||
query="python tutorial", # For contextual scoring
|
||||
score_threshold=0.3,
|
||||
verbose=True
|
||||
# Configure intelligent link analysis
|
||||
link_config = LinkPreviewConfig(
|
||||
# What to analyze
|
||||
include_internal=True,
|
||||
include_external=True,
|
||||
max_links=100, # Analyze top 100 links
|
||||
|
||||
# Relevance scoring
|
||||
query="machine learning tutorials", # Your interest
|
||||
score_threshold=0.3, # Minimum relevance score
|
||||
|
||||
# Performance
|
||||
concurrent_requests=10, # Parallel processing
|
||||
timeout_per_link=5000, # 5s per link
|
||||
|
||||
# Advanced scoring weights
|
||||
scoring_weights={
|
||||
"intrinsic": 0.3, # Link quality indicators
|
||||
"contextual": 0.5, # Relevance to query
|
||||
"popularity": 0.2 # Link prominence
|
||||
}
|
||||
)
|
||||
|
||||
# Use in your crawl
|
||||
result = await crawler.arun(
|
||||
"https://tech-blog.example.com",
|
||||
config=CrawlerRunConfig(
|
||||
link_preview_config=link_config,
|
||||
score_links=True
|
||||
)
|
||||
# Use in your crawl
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
"https://www.geeksforgeeks.org/",
|
||||
config=CrawlerRunConfig(
|
||||
link_preview_config=link_config,
|
||||
score_links=True, # Enable intrinsic scoring
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
)
|
||||
)
|
||||
|
||||
# Access scored and sorted links
|
||||
if result.success and result.links:
|
||||
for link in result.links.get("internal", []):
|
||||
text = link.get('text', 'No text')[:40]
|
||||
print(
|
||||
text,
|
||||
f"{link.get('intrinsic_score', 0):.1f}/10" if link.get('intrinsic_score') is not None else "0.0/10",
|
||||
f"{link.get('contextual_score', 0):.2f}/1" if link.get('contextual_score') is not None else "0.00/1",
|
||||
f"{link.get('total_score', 0):.3f}" if link.get('total_score') is not None else "0.000"
|
||||
)
|
||||
|
||||
asyncio.run(main())
|
||||
# Access scored and sorted links
|
||||
for link in result.links["internal"][:10]: # Top 10 internal links
|
||||
print(f"Score: {link['total_score']:.3f}")
|
||||
print(f" Intrinsic: {link['intrinsic_score']:.1f}/10") # Position, attributes
|
||||
print(f" Contextual: {link['contextual_score']:.1f}/1") # Relevance to query
|
||||
print(f" URL: {link['href']}")
|
||||
print(f" Title: {link['head_data']['title']}")
|
||||
print(f" Description: {link['head_data']['meta']['description'][:100]}...")
|
||||
```
|
||||
|
||||
**Scoring Components:**
|
||||
|
||||
1. **Intrinsic Score**: Based on link quality indicators
|
||||
1. **Intrinsic Score (0-10)**: Based on link quality indicators
|
||||
- Position on page (navigation, content, footer)
|
||||
- Link attributes (rel, title, class names)
|
||||
- Anchor text quality and length
|
||||
- URL structure and depth
|
||||
|
||||
2. **Contextual Score**: Relevance to your query using BM25 algorithm
|
||||
2. **Contextual Score (0-1)**: Relevance to your query
|
||||
- Semantic similarity using embeddings
|
||||
- Keyword matching in link text and title
|
||||
- Meta description analysis
|
||||
- Content preview scoring
|
||||
|
||||
3. **Total Score**: Combined score for final ranking
|
||||
3. **Total Score**: Weighted combination for final ranking
|
||||
|
||||
**Expected Real-World Impact:**
|
||||
- **Research Efficiency**: Find relevant papers 10x faster by following only high-score links
|
||||
@@ -221,34 +235,58 @@ asyncio.run(main())
|
||||
### Technical Architecture
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncUrlSeeder, SeedingConfig
|
||||
|
||||
async def main():
|
||||
async with AsyncUrlSeeder() as seeder:
|
||||
# Discover Python tutorial URLs
|
||||
config = SeedingConfig(
|
||||
source="sitemap", # Use sitemap
|
||||
pattern="*python*", # URL pattern filter
|
||||
extract_head=True, # Get metadata
|
||||
query="python tutorial", # For relevance scoring
|
||||
scoring_method="bm25",
|
||||
score_threshold=0.2,
|
||||
max_urls=10
|
||||
)
|
||||
|
||||
print("Discovering Python async tutorial URLs...")
|
||||
urls = await seeder.urls("https://www.geeksforgeeks.org/", config)
|
||||
|
||||
print(f"\n✅ Found {len(urls)} relevant URLs:")
|
||||
for i, url_info in enumerate(urls[:5], 1):
|
||||
print(f"\n{i}. {url_info['url']}")
|
||||
if url_info.get('relevance_score'):
|
||||
print(f" Relevance: {url_info['relevance_score']:.3f}")
|
||||
if url_info.get('head_data', {}).get('title'):
|
||||
print(f" Title: {url_info['head_data']['title'][:60]}...")
|
||||
# Basic discovery - find all product pages
|
||||
seeder_config = SeedingConfig(
|
||||
# Discovery sources
|
||||
source="sitemap+cc", # Sitemap + Common Crawl
|
||||
|
||||
# Filtering
|
||||
pattern="*/product/*", # URL pattern matching
|
||||
ignore_patterns=["*/reviews/*", "*/questions/*"],
|
||||
|
||||
# Validation
|
||||
live_check=True, # Verify URLs are alive
|
||||
max_urls=5000, # Stop at 5000 URLs
|
||||
|
||||
# Performance
|
||||
concurrency=100, # Parallel requests
|
||||
hits_per_sec=10 # Rate limiting
|
||||
)
|
||||
|
||||
asyncio.run(main())
|
||||
seeder = AsyncUrlSeeder(seeder_config)
|
||||
urls = await seeder.discover("https://shop.example.com")
|
||||
|
||||
# Advanced: Relevance-based discovery
|
||||
research_config = SeedingConfig(
|
||||
source="crawl+sitemap", # Deep crawl + sitemap
|
||||
pattern="*/blog/*", # Blog posts only
|
||||
|
||||
# Content relevance
|
||||
extract_head=True, # Get meta tags
|
||||
query="quantum computing tutorials",
|
||||
scoring_method="bm25", # Or "semantic" (coming soon)
|
||||
score_threshold=0.4, # High relevance only
|
||||
|
||||
# Smart filtering
|
||||
filter_nonsense_urls=True, # Remove .xml, .txt, etc.
|
||||
min_content_length=500, # Skip thin content
|
||||
|
||||
force=True # Bypass cache
|
||||
)
|
||||
|
||||
# Discover with progress tracking
|
||||
discovered = []
|
||||
async for batch in seeder.discover_iter("https://physics-blog.com", research_config):
|
||||
discovered.extend(batch)
|
||||
print(f"Found {len(discovered)} relevant URLs so far...")
|
||||
|
||||
# Results include scores and metadata
|
||||
for url_data in discovered[:5]:
|
||||
print(f"URL: {url_data['url']}")
|
||||
print(f"Score: {url_data['score']:.3f}")
|
||||
print(f"Title: {url_data['title']}")
|
||||
```
|
||||
|
||||
**Discovery Methods:**
|
||||
@@ -271,18 +309,35 @@ This release includes significant performance improvements through optimized res
|
||||
### What We Optimized
|
||||
|
||||
```python
|
||||
# Optimized crawling with v0.7.0 improvements
|
||||
# Before v0.7.0 (slow)
|
||||
results = []
|
||||
for url in urls:
|
||||
result = await crawler.arun(
|
||||
url,
|
||||
config=CrawlerRunConfig(
|
||||
# Performance optimizations
|
||||
wait_until="domcontentloaded", # Faster than networkidle
|
||||
cache_mode=CacheMode.ENABLED # Enable caching
|
||||
)
|
||||
)
|
||||
result = await crawler.arun(url)
|
||||
results.append(result)
|
||||
|
||||
# After v0.7.0 (fast)
|
||||
# Automatic batching and connection pooling
|
||||
results = await crawler.arun_batch(
|
||||
urls,
|
||||
config=CrawlerRunConfig(
|
||||
# New performance options
|
||||
batch_size=10, # Process 10 URLs concurrently
|
||||
reuse_browser=True, # Keep browser warm
|
||||
eager_loading=False, # Load only what's needed
|
||||
streaming_extraction=True, # Stream large extractions
|
||||
|
||||
# Optimized defaults
|
||||
wait_until="domcontentloaded", # Faster than networkidle
|
||||
exclude_external_resources=True, # Skip third-party assets
|
||||
block_ads=True # Ad blocking built-in
|
||||
)
|
||||
)
|
||||
|
||||
# Memory-efficient streaming for large crawls
|
||||
async for result in crawler.arun_stream(large_url_list):
|
||||
# Process results as they complete
|
||||
await process_result(result)
|
||||
# Memory is freed after each iteration
|
||||
```
|
||||
|
||||
**Performance Gains:**
|
||||
@@ -292,6 +347,24 @@ for url in urls:
|
||||
- **Memory Usage**: 60% reduction with streaming processing
|
||||
- **Concurrent Crawls**: Handle 5x more parallel requests
|
||||
|
||||
## 📄 PDF Support
|
||||
|
||||
PDF extraction is now natively supported in Crawl4AI.
|
||||
|
||||
```python
|
||||
# Extract data from PDF documents
|
||||
result = await crawler.arun(
|
||||
"https://example.com/report.pdf",
|
||||
config=CrawlerRunConfig(
|
||||
pdf_extraction=True,
|
||||
extraction_strategy=JsonCssExtractionStrategy({
|
||||
# Works on converted PDF structure
|
||||
"title": {"selector": "h1", "type": "text"},
|
||||
"sections": {"selector": "h2", "type": "list"}
|
||||
})
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
## 🔧 Important Changes
|
||||
|
||||
|
||||
@@ -35,7 +35,7 @@ from crawl4ai import AsyncWebCrawler, AdaptiveCrawler
|
||||
|
||||
async def main():
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Create an adaptive crawler (config is optional)
|
||||
# Create an adaptive crawler
|
||||
adaptive = AdaptiveCrawler(crawler)
|
||||
|
||||
# Start crawling with a query
|
||||
@@ -59,13 +59,13 @@ async def main():
|
||||
from crawl4ai import AdaptiveConfig
|
||||
|
||||
config = AdaptiveConfig(
|
||||
confidence_threshold=0.8, # Stop when 80% confident (default: 0.7)
|
||||
max_pages=30, # Maximum pages to crawl (default: 20)
|
||||
top_k_links=5, # Links to follow per page (default: 3)
|
||||
confidence_threshold=0.7, # Stop when 70% confident (default: 0.8)
|
||||
max_pages=20, # Maximum pages to crawl (default: 50)
|
||||
top_k_links=3, # Links to follow per page (default: 5)
|
||||
min_gain_threshold=0.05 # Minimum expected gain to continue (default: 0.1)
|
||||
)
|
||||
|
||||
adaptive = AdaptiveCrawler(crawler, config)
|
||||
adaptive = AdaptiveCrawler(crawler, config=config)
|
||||
```
|
||||
|
||||
## Crawling Strategies
|
||||
@@ -198,8 +198,8 @@ if result.metrics.get('is_irrelevant', False):
|
||||
The confidence score (0-1) indicates how sufficient the gathered information is:
|
||||
- **0.0-0.3**: Insufficient information, needs more crawling
|
||||
- **0.3-0.6**: Partial information, may answer basic queries
|
||||
- **0.6-0.7**: Good coverage, can answer most queries
|
||||
- **0.7-1.0**: Excellent coverage, comprehensive information
|
||||
- **0.6-0.8**: Good coverage, can answer most queries
|
||||
- **0.8-1.0**: Excellent coverage, comprehensive information
|
||||
|
||||
### Statistics Display
|
||||
|
||||
@@ -257,9 +257,9 @@ new_adaptive.import_knowledge_base("knowledge_base.jsonl")
|
||||
- Avoid overly broad queries
|
||||
|
||||
### 2. Threshold Tuning
|
||||
- Start with default (0.7) for general use
|
||||
- Lower to 0.5-0.6 for exploratory crawling
|
||||
- Raise to 0.8+ for exhaustive coverage
|
||||
- Start with default (0.8) for general use
|
||||
- Lower to 0.6-0.7 for exploratory crawling
|
||||
- Raise to 0.9+ for exhaustive coverage
|
||||
|
||||
### 3. Performance Optimization
|
||||
- Use appropriate `max_pages` limits
|
||||
|
||||
@@ -29,6 +29,7 @@ class BrowserConfig:
|
||||
text_mode=False,
|
||||
light_mode=False,
|
||||
extra_args=None,
|
||||
enable_stealth=False,
|
||||
# ... other advanced parameters omitted here
|
||||
):
|
||||
...
|
||||
@@ -84,6 +85,11 @@ class BrowserConfig:
|
||||
- Additional flags for the underlying browser.
|
||||
- E.g. `["--disable-extensions"]`.
|
||||
|
||||
11. **`enable_stealth`**:
|
||||
- If `True`, enables stealth mode using playwright-stealth.
|
||||
- Modifies browser fingerprints to avoid basic bot detection.
|
||||
- Default is `False`. Recommended for sites with bot protection.
|
||||
|
||||
### Helper Methods
|
||||
|
||||
Both configuration classes provide a `clone()` method to create modified copies:
|
||||
|
||||
@@ -19,7 +19,6 @@ The new system uses a single `CacheMode` enum:
|
||||
- `CacheMode.READ_ONLY`: Only read from cache
|
||||
- `CacheMode.WRITE_ONLY`: Only write to cache
|
||||
- `CacheMode.BYPASS`: Skip cache for this operation
|
||||
- `CacheMode.SMART`: **NEW** - Intelligently validate cache with HEAD requests
|
||||
|
||||
## Migration Example
|
||||
|
||||
@@ -73,128 +72,4 @@ if __name__ == "__main__":
|
||||
| `bypass_cache=True` | `cache_mode=CacheMode.BYPASS` |
|
||||
| `disable_cache=True` | `cache_mode=CacheMode.DISABLED`|
|
||||
| `no_cache_read=True` | `cache_mode=CacheMode.WRITE_ONLY` |
|
||||
| `no_cache_write=True` | `cache_mode=CacheMode.READ_ONLY` |
|
||||
|
||||
## SMART Cache Mode: Only Crawl When Changes
|
||||
|
||||
Starting from version 0.7.1, Crawl4AI introduces the **SMART cache mode** - an intelligent caching strategy that validates cached content before using it. This mode uses HTTP HEAD requests to check if content has changed, potentially saving 70-95% bandwidth on unchanged content.
|
||||
|
||||
### How SMART Mode Works
|
||||
|
||||
When you use `CacheMode.SMART`, Crawl4AI:
|
||||
|
||||
1. **Retrieves cached content** (if available)
|
||||
2. **Sends a HEAD request** with conditional headers (ETag, Last-Modified)
|
||||
3. **Validates the response**:
|
||||
- If server returns `304 Not Modified` → uses cache
|
||||
- If content changed → performs fresh crawl
|
||||
- If headers indicate changes → performs fresh crawl
|
||||
|
||||
### Benefits
|
||||
|
||||
- **Bandwidth Efficient**: Only downloads full content when necessary
|
||||
- **Always Fresh**: Ensures you get the latest content when it changes
|
||||
- **Cost Effective**: Reduces API calls and bandwidth usage
|
||||
- **Intelligent**: Uses multiple signals to detect changes (ETag, Last-Modified, Content-Length)
|
||||
|
||||
### Basic Usage
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.cache_context import CacheMode
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
|
||||
async def smart_crawl():
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
# First crawl - caches the content
|
||||
config = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
|
||||
result1 = await crawler.arun(
|
||||
url="https://example.com",
|
||||
config=config
|
||||
)
|
||||
print(f"First crawl: {len(result1.html)} bytes")
|
||||
|
||||
# Second crawl - uses SMART mode
|
||||
smart_config = CrawlerRunConfig(cache_mode=CacheMode.SMART)
|
||||
result2 = await crawler.arun(
|
||||
url="https://example.com",
|
||||
config=smart_config
|
||||
)
|
||||
print(f"SMART crawl: {len(result2.html)} bytes (from cache if unchanged)")
|
||||
|
||||
asyncio.run(smart_crawl())
|
||||
```
|
||||
|
||||
### When to Use SMART Mode
|
||||
|
||||
SMART mode is ideal for:
|
||||
|
||||
- **Periodic crawling** of websites that update irregularly
|
||||
- **News sites** where you want fresh content but avoid re-downloading unchanged pages
|
||||
- **API endpoints** that provide proper caching headers
|
||||
- **Large-scale crawling** where bandwidth costs are significant
|
||||
|
||||
### How It Detects Changes
|
||||
|
||||
SMART mode checks these signals in order:
|
||||
|
||||
1. **304 Not Modified** status (most reliable)
|
||||
2. **Content-Digest** header (RFC 9530)
|
||||
3. **Strong ETag** comparison
|
||||
4. **Last-Modified** timestamp
|
||||
5. **Content-Length** changes (as a hint)
|
||||
|
||||
### Example: News Site Monitoring
|
||||
|
||||
```python
|
||||
async def monitor_news_site():
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
config = CrawlerRunConfig(cache_mode=CacheMode.SMART)
|
||||
|
||||
# Check multiple times
|
||||
for i in range(3):
|
||||
result = await crawler.arun(
|
||||
url="https://news.ycombinator.com",
|
||||
config=config
|
||||
)
|
||||
|
||||
# SMART mode will only re-crawl if content changed
|
||||
print(f"Check {i+1}: Retrieved {len(result.html)} bytes")
|
||||
await asyncio.sleep(300) # Wait 5 minutes
|
||||
|
||||
asyncio.run(monitor_news_site())
|
||||
```
|
||||
|
||||
### Understanding SMART Mode Logs
|
||||
|
||||
When using SMART mode with `verbose=True`, you'll see informative logs:
|
||||
|
||||
```
|
||||
[SMART] ℹ SMART cache: 304 Not Modified - Content unchanged - Using cache for https://example.com
|
||||
[SMART] ℹ SMART cache: Content-Length changed (12345 -> 12789) - Re-crawling https://example.com
|
||||
[SMART] ℹ SMART cache: No definitive cache headers matched - Assuming content changed - Re-crawling https://example.com
|
||||
```
|
||||
|
||||
### Limitations
|
||||
|
||||
- Some servers don't properly support HEAD requests
|
||||
- Dynamic content without proper cache headers will always be re-crawled
|
||||
- Content changes must be reflected in HTTP headers for detection
|
||||
|
||||
### Advanced Example
|
||||
|
||||
For a complete example demonstrating SMART mode with both static and dynamic content, check out `docs/examples/smart_cache.py`.
|
||||
|
||||
## Cache Mode Reference
|
||||
|
||||
| Mode | Read from Cache | Write to Cache | Use Case |
|
||||
|------|----------------|----------------|----------|
|
||||
| `ENABLED` | ✓ | ✓ | Normal operation |
|
||||
| `DISABLED` | ✗ | ✗ | No caching needed |
|
||||
| `READ_ONLY` | ✓ | ✗ | Use existing cache only |
|
||||
| `WRITE_ONLY` | ✗ | ✓ | Refresh cache only |
|
||||
| `BYPASS` | ✗ | ✗ | Skip cache for this request |
|
||||
| `SMART` | ✓* | ✓ | Validate before using cache |
|
||||
|
||||
*SMART mode reads from cache but validates it first with a HEAD request.
|
||||
| `no_cache_write=True` | `cache_mode=CacheMode.READ_ONLY` |
|
||||
@@ -28,21 +28,12 @@ This page provides a comprehensive list of example scripts that demonstrate vari
|
||||
| Example | Description | Link |
|
||||
|---------|-------------|------|
|
||||
| Deep Crawling | An extensive tutorial on deep crawling capabilities, demonstrating BFS and BestFirst strategies, stream vs. non-stream execution, filters, scorers, and advanced configurations. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/deepcrawl_example.py) |
|
||||
<<<<<<< HEAD
|
||||
| Virtual Scroll | Comprehensive examples for handling virtualized scrolling on sites like Twitter, Instagram. Demonstrates different scrolling scenarios with local test server. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/virtual_scroll_example.py) |
|
||||
=======
|
||||
| Adaptive Crawling | Demonstrates intelligent crawling that automatically determines when sufficient information has been gathered. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/adaptive_crawling/) |
|
||||
>>>>>>> feature/progressive-crawling
|
||||
| Dispatcher | Shows how to use the crawl dispatcher for advanced workload management. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/dispatcher_example.py) |
|
||||
| Storage State | Tutorial on managing browser storage state for persistence. | [View Guide](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/storage_state_tutorial.md) |
|
||||
| Network Console Capture | Demonstrates how to capture and analyze network requests and console logs. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/network_console_capture_example.py) |
|
||||
|
||||
## Caching & Performance
|
||||
|
||||
| Example | Description | Link |
|
||||
|---------|-------------|------|
|
||||
| SMART Cache Mode | Demonstrates the intelligent SMART cache mode that validates cached content using HEAD requests, saving 70-95% bandwidth while ensuring fresh content. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/smart_cache.py) |
|
||||
|
||||
## Extraction Strategies
|
||||
|
||||
| Example | Description | Link |
|
||||
@@ -63,6 +54,16 @@ This page provides a comprehensive list of example scripts that demonstrate vari
|
||||
| Crypto Analysis | Demonstrates how to crawl and analyze cryptocurrency data. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/crypto_analysis_example.py) |
|
||||
| SERP API | Demonstrates using Crawl4AI with search engine result pages. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/serp_api_project_11_feb.py) |
|
||||
|
||||
## Anti-Bot & Stealth Features
|
||||
|
||||
| Example | Description | Link |
|
||||
|---------|-------------|------|
|
||||
| Stealth Mode Quick Start | Five practical examples showing how to use stealth mode for bypassing basic bot detection. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/stealth_mode_quick_start.py) |
|
||||
| Stealth Mode Comprehensive | Comprehensive demonstration of stealth mode features with bot detection testing and comparisons. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/stealth_mode_example.py) |
|
||||
| Undetected Browser | Simple example showing how to use the undetected browser adapter. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/hello_world_undetected.py) |
|
||||
| Undetected Browser Demo | Basic demo comparing regular and undetected browser modes. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/undetected_simple_demo.py) |
|
||||
| Undetected Tests | Advanced tests comparing regular vs undetected browsers on various bot detection services. | [View Folder](https://github.com/unclecode/crawl4ai/tree/main/docs/examples/undetectability/) |
|
||||
|
||||
## Customization & Security
|
||||
|
||||
| Example | Description | Link |
|
||||
|
||||
@@ -18,7 +18,7 @@ crawl4ai-setup
|
||||
```
|
||||
|
||||
**What does it do?**
|
||||
- Installs or updates required Playwright browsers (Chromium, Firefox, etc.)
|
||||
- Installs or updates required browser dependencies for both regular and undetected modes
|
||||
- Performs OS-level checks (e.g., missing libs on Linux)
|
||||
- Confirms your environment is ready to crawl
|
||||
|
||||
|
||||
@@ -125,7 +125,7 @@ Here's a full example you can copy, paste, and run immediately:
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
from crawl4ai import LinkPreviewConfig
|
||||
from crawl4ai.async_configs import LinkPreviewConfig
|
||||
|
||||
async def extract_link_heads_example():
|
||||
"""
|
||||
@@ -237,7 +237,7 @@ if __name__ == "__main__":
|
||||
The `LinkPreviewConfig` class supports these options:
|
||||
|
||||
```python
|
||||
from crawl4ai import LinkPreviewConfig
|
||||
from crawl4ai.async_configs import LinkPreviewConfig
|
||||
|
||||
link_preview_config = LinkPreviewConfig(
|
||||
# BASIC SETTINGS
|
||||
|
||||
@@ -79,7 +79,7 @@ if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
> IMPORTANT: By default cache mode is set to `CacheMode.ENABLED`. So to have fresh content, you need to set it to `CacheMode.BYPASS`. For intelligent caching that validates content before using cache, use the new `CacheMode.SMART` - it saves bandwidth while ensuring fresh content.
|
||||
> IMPORTANT: By default cache mode is set to `CacheMode.ENABLED`. So to have fresh content, you need to set it to `CacheMode.BYPASS`
|
||||
|
||||
We’ll explore more advanced config in later tutorials (like enabling proxies, PDF output, multi-tab sessions, etc.). For now, just note how you pass these objects to manage crawling.
|
||||
|
||||
|
||||
@@ -137,7 +137,7 @@ async def smart_blog_crawler():
|
||||
word_count_threshold=300 # Only substantial articles
|
||||
)
|
||||
|
||||
# Extract URLs and crawl them
|
||||
# Extract URLs and stream results as they come
|
||||
tutorial_urls = [t["url"] for t in tutorials[:10]]
|
||||
results = await crawler.arun_many(tutorial_urls, config=config)
|
||||
|
||||
@@ -231,7 +231,7 @@ Common Crawl is a massive public dataset that regularly crawls the entire web. I
|
||||
|
||||
```python
|
||||
# Use both sources
|
||||
config = SeedingConfig(source="sitemap+cc")
|
||||
config = SeedingConfig(source="cc+sitemap")
|
||||
urls = await seeder.urls("example.com", config)
|
||||
```
|
||||
|
||||
@@ -241,13 +241,13 @@ The `SeedingConfig` object is your control panel. Here's everything you can conf
|
||||
|
||||
| Parameter | Type | Default | Description |
|
||||
|-----------|------|---------|-------------|
|
||||
| `source` | str | "sitemap+cc" | URL source: "cc" (Common Crawl), "sitemap", or "sitemap+cc" |
|
||||
| `source` | str | "cc" | URL source: "cc" (Common Crawl), "sitemap", or "cc+sitemap" |
|
||||
| `pattern` | str | "*" | URL pattern filter (e.g., "*/blog/*", "*.html") |
|
||||
| `extract_head` | bool | False | Extract metadata from page `<head>` |
|
||||
| `live_check` | bool | False | Verify URLs are accessible |
|
||||
| `max_urls` | int | -1 | Maximum URLs to return (-1 = unlimited) |
|
||||
| `concurrency` | int | 10 | Parallel workers for fetching |
|
||||
| `hits_per_sec` | int | 5 | Rate limit for requests |
|
||||
| `hits_per_sec` | int | None | Rate limit for requests |
|
||||
| `force` | bool | False | Bypass cache, fetch fresh data |
|
||||
| `verbose` | bool | False | Show detailed progress |
|
||||
| `query` | str | None | Search query for BM25 scoring |
|
||||
@@ -522,7 +522,7 @@ urls = await seeder.urls("docs.example.com", config)
|
||||
```python
|
||||
# Find specific products
|
||||
config = SeedingConfig(
|
||||
source="sitemap+cc", # Use both sources
|
||||
source="cc+sitemap", # Use both sources
|
||||
extract_head=True,
|
||||
query="wireless headphones noise canceling",
|
||||
scoring_method="bm25",
|
||||
@@ -782,7 +782,7 @@ class ResearchAssistant:
|
||||
|
||||
# Step 1: Discover relevant URLs
|
||||
config = SeedingConfig(
|
||||
source="sitemap+cc", # Maximum coverage
|
||||
source="cc+sitemap", # Maximum coverage
|
||||
extract_head=True, # Get metadata
|
||||
query=topic, # Research topic
|
||||
scoring_method="bm25", # Smart scoring
|
||||
@@ -832,8 +832,7 @@ class ResearchAssistant:
|
||||
# Extract URLs and crawl all articles
|
||||
article_urls = [article['url'] for article in top_articles]
|
||||
results = []
|
||||
crawl_results = await crawler.arun_many(article_urls, config=config)
|
||||
async for result in crawl_results:
|
||||
async for result in await crawler.arun_many(article_urls, config=config):
|
||||
if result.success:
|
||||
results.append({
|
||||
'url': result.url,
|
||||
@@ -934,10 +933,10 @@ config = SeedingConfig(concurrency=10, hits_per_sec=5)
|
||||
# When crawling many URLs
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Assuming urls is a list of URL strings
|
||||
crawl_results = await crawler.arun_many(urls, config=config)
|
||||
results = await crawler.arun_many(urls, config=config)
|
||||
|
||||
# Process as they arrive
|
||||
async for result in crawl_results:
|
||||
async for result in results:
|
||||
process_immediately(result) # Don't wait for all
|
||||
```
|
||||
|
||||
@@ -1021,7 +1020,7 @@ config = SeedingConfig(
|
||||
|
||||
# E-commerce product discovery
|
||||
config = SeedingConfig(
|
||||
source="sitemap+cc",
|
||||
source="cc+sitemap",
|
||||
pattern="*/product/*",
|
||||
extract_head=True,
|
||||
live_check=True
|
||||
|
||||
@@ -28,7 +28,7 @@ from rich import box
|
||||
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, AdaptiveCrawler, AdaptiveConfig, BrowserConfig, CacheMode
|
||||
from crawl4ai import AsyncUrlSeeder, SeedingConfig
|
||||
from crawl4ai import LinkPreviewConfig, VirtualScrollConfig
|
||||
from crawl4ai.async_configs import LinkPreviewConfig, VirtualScrollConfig
|
||||
from crawl4ai import c4a_compile, CompilationResult
|
||||
|
||||
# Initialize Rich console for beautiful output
|
||||
|
||||
@@ -13,13 +13,14 @@ from crawl4ai import (
|
||||
BrowserConfig,
|
||||
CacheMode,
|
||||
# New imports for v0.7.0
|
||||
VirtualScrollConfig,
|
||||
LinkPreviewConfig,
|
||||
VirtualScrollConfig,
|
||||
AdaptiveCrawler,
|
||||
AdaptiveConfig,
|
||||
AsyncUrlSeeder,
|
||||
SeedingConfig,
|
||||
c4a_compile,
|
||||
CompilationResult
|
||||
)
|
||||
|
||||
|
||||
@@ -169,16 +170,16 @@ async def demo_url_seeder():
|
||||
# Discover Python tutorial URLs
|
||||
config = SeedingConfig(
|
||||
source="sitemap", # Use sitemap
|
||||
pattern="*python*", # URL pattern filter
|
||||
pattern="*tutorial*", # URL pattern filter
|
||||
extract_head=True, # Get metadata
|
||||
query="python tutorial", # For relevance scoring
|
||||
query="python async programming", # For relevance scoring
|
||||
scoring_method="bm25",
|
||||
score_threshold=0.2,
|
||||
max_urls=10
|
||||
)
|
||||
|
||||
print("Discovering Python async tutorial URLs...")
|
||||
urls = await seeder.urls("https://www.geeksforgeeks.org/", config)
|
||||
urls = await seeder.urls("docs.python.org", config)
|
||||
|
||||
print(f"\n✅ Found {len(urls)} relevant URLs:")
|
||||
for i, url_info in enumerate(urls[:5], 1):
|
||||
@@ -244,6 +245,39 @@ IF (EXISTS `.price-filter`) THEN CLICK `input[data-max-price="100"]`
|
||||
print(f"❌ Compilation error: {result.first_error.message}")
|
||||
|
||||
|
||||
async def demo_pdf_support():
|
||||
"""
|
||||
Demo 6: PDF Parsing Support
|
||||
|
||||
Shows how to extract content from PDF files.
|
||||
Note: Requires 'pip install crawl4ai[pdf]'
|
||||
"""
|
||||
print("\n" + "="*60)
|
||||
print("📄 DEMO 6: PDF Parsing Support")
|
||||
print("="*60)
|
||||
|
||||
try:
|
||||
# Check if PDF support is installed
|
||||
import PyPDF2
|
||||
|
||||
# Example: Process a PDF URL
|
||||
config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
pdf=True, # Enable PDF generation
|
||||
extract_text_from_pdf=True # Extract text content
|
||||
)
|
||||
|
||||
print("PDF parsing is available!")
|
||||
print("You can now crawl PDF URLs and extract their content.")
|
||||
print("\nExample usage:")
|
||||
print(' result = await crawler.arun("https://example.com/document.pdf")')
|
||||
print(' pdf_text = result.extracted_content # Contains extracted text')
|
||||
|
||||
except ImportError:
|
||||
print("⚠️ PDF support not installed.")
|
||||
print("Install with: pip install crawl4ai[pdf]")
|
||||
|
||||
|
||||
async def main():
|
||||
"""Run all demos"""
|
||||
print("\n🚀 Crawl4AI v0.7.0 Feature Demonstrations")
|
||||
@@ -255,6 +289,7 @@ async def main():
|
||||
("Virtual Scroll", demo_virtual_scroll),
|
||||
("URL Seeder", demo_url_seeder),
|
||||
("C4A Script", demo_c4a_script),
|
||||
("PDF Support", demo_pdf_support)
|
||||
]
|
||||
|
||||
for name, demo_func in demos:
|
||||
@@ -274,6 +309,7 @@ async def main():
|
||||
print("• Virtual Scroll: Capture all content from modern web pages")
|
||||
print("• URL Seeder: Pre-discover and filter URLs efficiently")
|
||||
print("• C4A Script: Simple language for complex automations")
|
||||
print("• PDF Support: Extract content from PDF documents")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -45,6 +45,7 @@ nav:
|
||||
- "Lazy Loading": "advanced/lazy-loading.md"
|
||||
- "Hooks & Auth": "advanced/hooks-auth.md"
|
||||
- "Proxy & Security": "advanced/proxy-security.md"
|
||||
- "Undetected Browser": "advanced/undetected-browser.md"
|
||||
- "Session Management": "advanced/session-management.md"
|
||||
- "Multi-URL Crawling": "advanced/multi-url-crawling.md"
|
||||
- "Crawl Dispatcher": "advanced/crawl-dispatcher.md"
|
||||
|
||||
@@ -13,37 +13,38 @@ authors = [
|
||||
{name = "Unclecode", email = "unclecode@kidocode.com"}
|
||||
]
|
||||
dependencies = [
|
||||
"aiofiles>=24.1.0",
|
||||
"aiohttp>=3.11.11",
|
||||
"aiosqlite~=0.20",
|
||||
"anyio>=4.0.0",
|
||||
"lxml~=5.3",
|
||||
"litellm>=1.53.1",
|
||||
"numpy>=1.26.0,<3",
|
||||
"pillow>=10.4",
|
||||
"playwright>=1.49.0",
|
||||
"patchright>=1.49.0",
|
||||
"python-dotenv~=1.0",
|
||||
"requests~=2.26",
|
||||
"beautifulsoup4~=4.12",
|
||||
"tf-playwright-stealth>=1.1.0",
|
||||
"xxhash~=3.4",
|
||||
"rank-bm25~=0.2",
|
||||
"aiofiles>=24.1.0",
|
||||
"snowballstemmer~=2.2",
|
||||
"pydantic>=2.10",
|
||||
"pyOpenSSL>=24.3.0",
|
||||
"psutil>=6.1.1",
|
||||
"PyYAML>=6.0",
|
||||
"nltk>=3.9.1",
|
||||
"rich>=13.9.4",
|
||||
"cssselect>=1.2.0",
|
||||
"httpx>=0.27.2",
|
||||
"httpx[http2]>=0.27.2",
|
||||
"fake-useragent>=2.0.3",
|
||||
"click>=8.1.7",
|
||||
"pyperclip>=1.8.2",
|
||||
"chardet>=5.2.0",
|
||||
"aiohttp>=3.11.11",
|
||||
"brotli>=1.1.0",
|
||||
"humanize>=4.10.0",
|
||||
"lark>=1.2.2",
|
||||
"alphashape>=1.3.1",
|
||||
"shapely>=2.0.0"
|
||||
"sentence-transformers>=2.2.0"
|
||||
]
|
||||
classifiers = [
|
||||
"Development Status :: 4 - Beta",
|
||||
@@ -57,20 +58,17 @@ classifiers = [
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
pdf = ["pypdf>=3.0.0"] # PyPDF2 is deprecated, use pypdf instead
|
||||
pdf = ["PyPDF2>=3.0.0"]
|
||||
torch = ["torch>=2.0.0", "nltk>=3.9.1", "scikit-learn>=1.3.0"]
|
||||
transformer = ["transformers>=4.34.0", "tokenizers>=0.15.0", "sentence-transformers>=2.2.0"]
|
||||
cosine = ["torch>=2.0.0", "transformers>=4.34.0", "nltk>=3.9.1", "sentence-transformers>=2.2.0"]
|
||||
sync = ["selenium>=4.0.0"]
|
||||
transformer = ["transformers>=4.30.0", "tokenizers>=0.13.0"]
|
||||
cosine = ["torch>=2.0.0", "transformers>=4.30.0", "nltk>=3.9.1"]
|
||||
all = [
|
||||
"pypdf>=3.0.0",
|
||||
"PyPDF2>=3.0.0",
|
||||
"torch>=2.0.0",
|
||||
"nltk>=3.9.1",
|
||||
"scikit-learn>=1.3.0",
|
||||
"transformers>=4.34.0",
|
||||
"tokenizers>=0.15.0",
|
||||
"sentence-transformers>=2.2.0",
|
||||
"selenium>=4.0.0"
|
||||
"transformers>=4.30.0",
|
||||
"tokenizers>=0.13.0"
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
|
||||
@@ -1,32 +1,33 @@
|
||||
# Note: These requirements are also specified in pyproject.toml
|
||||
# This file is kept for development environment setup and compatibility
|
||||
aiofiles>=24.1.0
|
||||
aiohttp>=3.11.11
|
||||
aiosqlite~=0.20
|
||||
anyio>=4.0.0
|
||||
lxml~=5.3
|
||||
litellm>=1.53.1
|
||||
numpy>=1.26.0,<3
|
||||
pillow>=10.4
|
||||
playwright>=1.49.0
|
||||
patchright>=1.49.0
|
||||
python-dotenv~=1.0
|
||||
requests~=2.26
|
||||
beautifulsoup4~=4.12
|
||||
tf-playwright-stealth>=1.1.0
|
||||
xxhash~=3.4
|
||||
rank-bm25~=0.2
|
||||
aiofiles>=24.1.0
|
||||
colorama~=0.4
|
||||
snowballstemmer~=2.2
|
||||
pydantic>=2.10
|
||||
pyOpenSSL>=24.3.0
|
||||
psutil>=6.1.1
|
||||
PyYAML>=6.0
|
||||
nltk>=3.9.1
|
||||
rich>=13.9.4
|
||||
cssselect>=1.2.0
|
||||
chardet>=5.2.0
|
||||
brotli>=1.1.0
|
||||
httpx[http2]>=0.27.2
|
||||
sentence-transformers>=2.2.0
|
||||
alphashape>=1.3.1
|
||||
shapely>=2.0.0
|
||||
|
||||
fake-useragent>=2.2.0
|
||||
pdf2image>=1.17.0
|
||||
|
||||
@@ -23,7 +23,7 @@ from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
AdaptiveCrawler,
|
||||
AdaptiveConfig,
|
||||
AdaptiveCrawlResult
|
||||
CrawlState
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -13,7 +13,7 @@ import math
|
||||
sys.path.append(str(Path(__file__).parent.parent))
|
||||
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.adaptive_crawler import AdaptiveCrawlResult, StatisticalStrategy
|
||||
from crawl4ai.adaptive_crawler import CrawlState, StatisticalStrategy
|
||||
from crawl4ai.models import CrawlResult
|
||||
|
||||
|
||||
@@ -37,7 +37,7 @@ class ConfidenceTestHarness:
|
||||
print("=" * 80)
|
||||
|
||||
# Initialize state
|
||||
state = AdaptiveCrawlResult(query=self.query)
|
||||
state = CrawlState(query=self.query)
|
||||
|
||||
# Create crawler
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
@@ -107,7 +107,7 @@ class ConfidenceTestHarness:
|
||||
|
||||
state.metrics['prev_confidence'] = confidence
|
||||
|
||||
def _debug_coverage_calculation(self, state: AdaptiveCrawlResult, query_terms: List[str]):
|
||||
def _debug_coverage_calculation(self, state: CrawlState, query_terms: List[str]):
|
||||
"""Debug coverage calculation step by step"""
|
||||
coverage_score = 0.0
|
||||
max_possible_score = 0.0
|
||||
@@ -136,7 +136,7 @@ class ConfidenceTestHarness:
|
||||
new_coverage = self._calculate_coverage_new(state, query_terms)
|
||||
print(f" → New Coverage: {new_coverage:.3f}")
|
||||
|
||||
def _calculate_coverage_new(self, state: AdaptiveCrawlResult, query_terms: List[str]) -> float:
|
||||
def _calculate_coverage_new(self, state: CrawlState, query_terms: List[str]) -> float:
|
||||
"""New coverage calculation without IDF"""
|
||||
if not query_terms or state.total_documents == 0:
|
||||
return 0.0
|
||||
|
||||
@@ -15,7 +15,7 @@ import os
|
||||
sys.path.append(str(Path(__file__).parent.parent.parent))
|
||||
|
||||
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
|
||||
from crawl4ai.adaptive_crawler import EmbeddingStrategy, AdaptiveCrawlResult
|
||||
from crawl4ai.adaptive_crawler import EmbeddingStrategy, CrawlState
|
||||
from crawl4ai.models import CrawlResult
|
||||
|
||||
|
||||
@@ -132,7 +132,7 @@ async def test_embedding_performance():
|
||||
strategy.config = config
|
||||
|
||||
# Initialize state
|
||||
state = AdaptiveCrawlResult()
|
||||
state = CrawlState()
|
||||
state.query = "async await coroutines event loops tasks"
|
||||
|
||||
# Start performance monitoring
|
||||
|
||||
@@ -20,7 +20,7 @@ from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
AdaptiveCrawler,
|
||||
AdaptiveConfig,
|
||||
AdaptiveCrawlResult
|
||||
CrawlState
|
||||
)
|
||||
|
||||
console = Console()
|
||||
|
||||
344
tests/check_dependencies.py
Executable file
344
tests/check_dependencies.py
Executable file
@@ -0,0 +1,344 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Dependency checker for Crawl4AI
|
||||
Analyzes imports in the codebase and shows which files use them
|
||||
"""
|
||||
|
||||
import ast
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Set, Dict, List, Tuple
|
||||
from collections import defaultdict
|
||||
import re
|
||||
import toml
|
||||
|
||||
# Standard library modules to ignore
|
||||
STDLIB_MODULES = {
|
||||
'abc', 'argparse', 'asyncio', 'base64', 'collections', 'concurrent', 'contextlib',
|
||||
'copy', 'datetime', 'decimal', 'email', 'enum', 'functools', 'glob', 'hashlib',
|
||||
'http', 'importlib', 'io', 'itertools', 'json', 'logging', 'math', 'mimetypes',
|
||||
'multiprocessing', 'os', 'pathlib', 'pickle', 'platform', 'pprint', 'random',
|
||||
're', 'shutil', 'signal', 'socket', 'sqlite3', 'string', 'subprocess', 'sys',
|
||||
'tempfile', 'threading', 'time', 'traceback', 'typing', 'unittest', 'urllib',
|
||||
'uuid', 'warnings', 'weakref', 'xml', 'zipfile', 'dataclasses', 'secrets',
|
||||
'statistics', 'textwrap', 'queue', 'csv', 'gzip', 'tarfile', 'configparser',
|
||||
'inspect', 'operator', 'struct', 'binascii', 'codecs', 'locale', 'gc',
|
||||
'atexit', 'builtins', 'html', 'errno', 'fcntl', 'pwd', 'grp', 'resource',
|
||||
'termios', 'tty', 'pty', 'select', 'selectors', 'ssl', 'zlib', 'bz2',
|
||||
'lzma', 'types', 'copy', 'pydoc', 'profile', 'cProfile', 'timeit',
|
||||
'trace', 'doctest', 'pdb', 'contextvars', 'dataclasses', 'graphlib',
|
||||
'zoneinfo', 'tomllib', 'cgi', 'wsgiref', 'fileinput', 'linecache',
|
||||
'tokenize', 'tabnanny', 'compileall', 'dis', 'pickletools', 'formatter',
|
||||
'__future__', 'array', 'ctypes', 'heapq', 'bisect', 'array', 'weakref',
|
||||
'types', 'copy', 'pprint', 'repr', 'numbers', 'cmath', 'fractions',
|
||||
'statistics', 'itertools', 'functools', 'operator', 'pathlib', 'fileinput',
|
||||
'stat', 'filecmp', 'tempfile', 'glob', 'fnmatch', 'linecache', 'shutil',
|
||||
'pickle', 'copyreg', 'shelve', 'marshal', 'dbm', 'sqlite3', 'zlib', 'gzip',
|
||||
'bz2', 'lzma', 'zipfile', 'tarfile', 'configparser', 'netrc', 'xdrlib',
|
||||
'plistlib', 'hashlib', 'hmac', 'secrets', 'os', 'io', 'time', 'argparse',
|
||||
'getopt', 'logging', 'getpass', 'curses', 'platform', 'errno', 'ctypes',
|
||||
'threading', 'multiprocessing', 'concurrent', 'subprocess', 'sched', 'queue',
|
||||
'contextvars', 'asyncio', 'socket', 'ssl', 'email', 'json', 'mailcap',
|
||||
'mailbox', 'mimetypes', 'base64', 'binhex', 'binascii', 'quopri', 'uu',
|
||||
'html', 'xml', 'webbrowser', 'cgi', 'cgitb', 'wsgiref', 'urllib', 'http',
|
||||
'ftplib', 'poplib', 'imaplib', 'nntplib', 'smtplib', 'smtpd', 'telnetlib',
|
||||
'uuid', 'socketserver', 'xmlrpc', 'ipaddress', 'audioop', 'aifc', 'sunau',
|
||||
'wave', 'chunk', 'colorsys', 'imghdr', 'sndhdr', 'ossaudiodev', 'gettext',
|
||||
'locale', 'turtle', 'cmd', 'shlex', 'tkinter', 'typing', 'pydoc', 'doctest',
|
||||
'unittest', 'test', '2to3', 'distutils', 'venv', 'ensurepip', 'zipapp',
|
||||
'py_compile', 'compileall', 'dis', 'pickletools', 'pdb', 'timeit', 'trace',
|
||||
'tracemalloc', 'warnings', 'faulthandler', 'pdb', 'dataclasses', 'cgi',
|
||||
'cgitb', 'chunk', 'crypt', 'imghdr', 'mailcap', 'nis', 'nntplib', 'optparse',
|
||||
'ossaudiodev', 'pipes', 'smtpd', 'sndhdr', 'spwd', 'sunau', 'telnetlib',
|
||||
'uu', 'xdrlib', 'msilib', 'pstats', 'rlcompleter', 'tkinter', 'ast'
|
||||
}
|
||||
|
||||
# Known package name mappings (import name -> package name)
|
||||
PACKAGE_MAPPINGS = {
|
||||
'bs4': 'beautifulsoup4',
|
||||
'PIL': 'pillow',
|
||||
'cv2': 'opencv-python',
|
||||
'sklearn': 'scikit-learn',
|
||||
'yaml': 'PyYAML',
|
||||
'OpenSSL': 'pyOpenSSL',
|
||||
'sqlalchemy': 'SQLAlchemy',
|
||||
'playwright': 'playwright',
|
||||
'patchright': 'patchright',
|
||||
'dotenv': 'python-dotenv',
|
||||
'fake_useragent': 'fake-useragent',
|
||||
'playwright_stealth': 'tf-playwright-stealth',
|
||||
'sentence_transformers': 'sentence-transformers',
|
||||
'rank_bm25': 'rank-bm25',
|
||||
'snowballstemmer': 'snowballstemmer',
|
||||
'PyPDF2': 'PyPDF2',
|
||||
'pdf2image': 'pdf2image',
|
||||
}
|
||||
|
||||
|
||||
class ImportVisitor(ast.NodeVisitor):
|
||||
"""AST visitor to extract imports from Python files"""
|
||||
|
||||
def __init__(self):
|
||||
self.imports = {} # Changed to dict to store line numbers
|
||||
self.from_imports = {}
|
||||
|
||||
def visit_Import(self, node):
|
||||
for alias in node.names:
|
||||
module_name = alias.name.split('.')[0]
|
||||
if module_name not in self.imports:
|
||||
self.imports[module_name] = []
|
||||
self.imports[module_name].append(node.lineno)
|
||||
|
||||
def visit_ImportFrom(self, node):
|
||||
if node.module and node.level == 0: # absolute imports only
|
||||
module_name = node.module.split('.')[0]
|
||||
if module_name not in self.from_imports:
|
||||
self.from_imports[module_name] = []
|
||||
self.from_imports[module_name].append(node.lineno)
|
||||
|
||||
|
||||
def extract_imports_from_file(filepath: Path) -> Dict[str, List[int]]:
|
||||
"""Extract all imports from a Python file with line numbers"""
|
||||
all_imports = {}
|
||||
|
||||
try:
|
||||
with open(filepath, 'r', encoding='utf-8') as f:
|
||||
content = f.read()
|
||||
|
||||
tree = ast.parse(content)
|
||||
visitor = ImportVisitor()
|
||||
visitor.visit(tree)
|
||||
|
||||
# Merge imports and from_imports
|
||||
for module, lines in visitor.imports.items():
|
||||
if module not in all_imports:
|
||||
all_imports[module] = []
|
||||
all_imports[module].extend(lines)
|
||||
|
||||
for module, lines in visitor.from_imports.items():
|
||||
if module not in all_imports:
|
||||
all_imports[module] = []
|
||||
all_imports[module].extend(lines)
|
||||
|
||||
except Exception as e:
|
||||
# Silently skip files that can't be parsed
|
||||
pass
|
||||
|
||||
return all_imports
|
||||
|
||||
|
||||
def get_codebase_imports_with_files(root_dir: Path) -> Dict[str, List[Tuple[str, List[int]]]]:
|
||||
"""Get all imports from the crawl4ai library and docs folders with file locations and line numbers"""
|
||||
import_to_files = defaultdict(list)
|
||||
|
||||
# Only scan crawl4ai library folder and docs folder
|
||||
target_dirs = [
|
||||
root_dir / 'crawl4ai',
|
||||
root_dir / 'docs'
|
||||
]
|
||||
|
||||
for target_dir in target_dirs:
|
||||
if not target_dir.exists():
|
||||
continue
|
||||
|
||||
for py_file in target_dir.rglob('*.py'):
|
||||
# Skip __pycache__ directories
|
||||
if '__pycache__' in py_file.parts:
|
||||
continue
|
||||
|
||||
# Skip setup.py and similar files
|
||||
if py_file.name in ['setup.py', 'setup.cfg', 'conf.py']:
|
||||
continue
|
||||
|
||||
imports = extract_imports_from_file(py_file)
|
||||
|
||||
# Map each import to the file and line numbers
|
||||
for imp, line_numbers in imports.items():
|
||||
relative_path = py_file.relative_to(root_dir)
|
||||
import_to_files[imp].append((str(relative_path), sorted(line_numbers)))
|
||||
|
||||
return dict(import_to_files)
|
||||
|
||||
|
||||
def get_declared_dependencies() -> Set[str]:
|
||||
"""Get declared dependencies from pyproject.toml and requirements.txt"""
|
||||
declared = set()
|
||||
|
||||
# Read from pyproject.toml
|
||||
if Path('pyproject.toml').exists():
|
||||
with open('pyproject.toml', 'r') as f:
|
||||
data = toml.load(f)
|
||||
|
||||
# Get main dependencies
|
||||
deps = data.get('project', {}).get('dependencies', [])
|
||||
for dep in deps:
|
||||
# Parse dependency string (e.g., "numpy>=1.26.0,<3")
|
||||
match = re.match(r'^([a-zA-Z0-9_-]+)', dep)
|
||||
if match:
|
||||
pkg_name = match.group(1).lower()
|
||||
declared.add(pkg_name)
|
||||
|
||||
# Get optional dependencies
|
||||
optional = data.get('project', {}).get('optional-dependencies', {})
|
||||
for group, deps in optional.items():
|
||||
for dep in deps:
|
||||
match = re.match(r'^([a-zA-Z0-9_-]+)', dep)
|
||||
if match:
|
||||
pkg_name = match.group(1).lower()
|
||||
declared.add(pkg_name)
|
||||
|
||||
# Also check requirements.txt as backup
|
||||
if Path('requirements.txt').exists():
|
||||
with open('requirements.txt', 'r') as f:
|
||||
for line in f:
|
||||
line = line.strip()
|
||||
if line and not line.startswith('#'):
|
||||
match = re.match(r'^([a-zA-Z0-9_-]+)', line)
|
||||
if match:
|
||||
pkg_name = match.group(1).lower()
|
||||
declared.add(pkg_name)
|
||||
|
||||
return declared
|
||||
|
||||
|
||||
def normalize_package_name(name: str) -> str:
|
||||
"""Normalize package name for comparison"""
|
||||
# Handle known mappings first
|
||||
if name in PACKAGE_MAPPINGS:
|
||||
return PACKAGE_MAPPINGS[name].lower()
|
||||
|
||||
# Basic normalization
|
||||
return name.lower().replace('_', '-')
|
||||
|
||||
|
||||
def check_missing_dependencies():
|
||||
"""Main function to check for missing dependencies"""
|
||||
print("🔍 Analyzing crawl4ai library and docs folders...\n")
|
||||
|
||||
# Get all imports with their file locations
|
||||
root_dir = Path('.')
|
||||
import_to_files = get_codebase_imports_with_files(root_dir)
|
||||
|
||||
# Get declared dependencies
|
||||
declared_deps = get_declared_dependencies()
|
||||
|
||||
# Normalize declared dependencies
|
||||
normalized_declared = {normalize_package_name(dep) for dep in declared_deps}
|
||||
|
||||
# Categorize imports
|
||||
external_imports = {}
|
||||
local_imports = {}
|
||||
|
||||
# Known local packages
|
||||
local_packages = {'crawl4ai'}
|
||||
|
||||
for imp, file_info in import_to_files.items():
|
||||
# Skip standard library
|
||||
if imp in STDLIB_MODULES:
|
||||
continue
|
||||
|
||||
# Check if it's a local import
|
||||
if any(imp.startswith(local) for local in local_packages):
|
||||
local_imports[imp] = file_info
|
||||
else:
|
||||
external_imports[imp] = file_info
|
||||
|
||||
# Check which external imports are not declared
|
||||
not_declared = {}
|
||||
declared_imports = {}
|
||||
|
||||
for imp, file_info in external_imports.items():
|
||||
normalized_imp = normalize_package_name(imp)
|
||||
|
||||
# Check if import is covered by declared dependencies
|
||||
found = False
|
||||
for declared in normalized_declared:
|
||||
if normalized_imp == declared or normalized_imp.startswith(declared + '.') or declared.startswith(normalized_imp):
|
||||
found = True
|
||||
break
|
||||
|
||||
if found:
|
||||
declared_imports[imp] = file_info
|
||||
else:
|
||||
not_declared[imp] = file_info
|
||||
|
||||
# Print results
|
||||
print(f"📊 Summary:")
|
||||
print(f" - Total unique imports: {len(import_to_files)}")
|
||||
print(f" - External imports: {len(external_imports)}")
|
||||
print(f" - Declared dependencies: {len(declared_deps)}")
|
||||
print(f" - External imports NOT in dependencies: {len(not_declared)}\n")
|
||||
|
||||
if not_declared:
|
||||
print("❌ External imports NOT declared in pyproject.toml or requirements.txt:\n")
|
||||
|
||||
# Sort by import name
|
||||
for imp in sorted(not_declared.keys()):
|
||||
file_info = not_declared[imp]
|
||||
print(f" 📦 {imp}")
|
||||
if imp in PACKAGE_MAPPINGS:
|
||||
print(f" → Package name: {PACKAGE_MAPPINGS[imp]}")
|
||||
|
||||
# Show up to 3 files that use this import
|
||||
for i, (file_path, line_numbers) in enumerate(file_info[:3]):
|
||||
# Format line numbers for clickable output
|
||||
if len(line_numbers) == 1:
|
||||
print(f" - {file_path}:{line_numbers[0]}")
|
||||
else:
|
||||
# Show first few line numbers
|
||||
line_str = ','.join(str(ln) for ln in line_numbers[:3])
|
||||
if len(line_numbers) > 3:
|
||||
line_str += f"... ({len(line_numbers)} imports)"
|
||||
print(f" - {file_path}: lines {line_str}")
|
||||
|
||||
if len(file_info) > 3:
|
||||
print(f" ... and {len(file_info) - 3} more files")
|
||||
print()
|
||||
|
||||
# Check for potentially unused dependencies
|
||||
print("\n🔎 Checking declared dependencies usage...\n")
|
||||
|
||||
# Get all used external packages
|
||||
used_packages = set()
|
||||
for imp in external_imports.keys():
|
||||
normalized = normalize_package_name(imp)
|
||||
used_packages.add(normalized)
|
||||
|
||||
# Find unused
|
||||
unused = []
|
||||
for dep in declared_deps:
|
||||
normalized_dep = normalize_package_name(dep)
|
||||
|
||||
# Check if any import uses this dependency
|
||||
found_usage = False
|
||||
for used in used_packages:
|
||||
if used == normalized_dep or used.startswith(normalized_dep) or normalized_dep.startswith(used):
|
||||
found_usage = True
|
||||
break
|
||||
|
||||
if not found_usage:
|
||||
# Some packages are commonly unused directly
|
||||
indirect_deps = {'wheel', 'setuptools', 'pip', 'colorama', 'certifi', 'packaging', 'urllib3'}
|
||||
if normalized_dep not in indirect_deps:
|
||||
unused.append(dep)
|
||||
|
||||
if unused:
|
||||
print("⚠️ Declared dependencies with NO imports found:")
|
||||
for dep in sorted(unused):
|
||||
print(f" - {dep}")
|
||||
print("\n Note: These might be used indirectly or by other dependencies")
|
||||
else:
|
||||
print("✅ All declared dependencies have corresponding imports")
|
||||
|
||||
print("\n" + "="*60)
|
||||
print("💡 How to use this report:")
|
||||
print(" 1. Check each ❌ import to see if it's legitimate")
|
||||
print(" 2. If legitimate, add the package to pyproject.toml")
|
||||
print(" 3. If it's an internal module or typo, fix the import")
|
||||
print(" 4. Review unused dependencies - remove if truly not needed")
|
||||
print("="*60)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
check_missing_dependencies()
|
||||
@@ -1,345 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Simple API Test for Crawl4AI Docker Server v0.7.0
|
||||
Uses only built-in Python modules to test all endpoints.
|
||||
"""
|
||||
|
||||
import urllib.request
|
||||
import urllib.parse
|
||||
import json
|
||||
import time
|
||||
import sys
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
# Configuration
|
||||
BASE_URL = "http://localhost:11234" # Change to your server URL
|
||||
TEST_TIMEOUT = 30
|
||||
|
||||
class SimpleApiTester:
|
||||
def __init__(self, base_url: str = BASE_URL):
|
||||
self.base_url = base_url
|
||||
self.token = None
|
||||
self.results = []
|
||||
|
||||
def log(self, message: str):
|
||||
print(f"[INFO] {message}")
|
||||
|
||||
def test_get_endpoint(self, endpoint: str) -> Dict:
|
||||
"""Test a GET endpoint"""
|
||||
url = f"{self.base_url}{endpoint}"
|
||||
start_time = time.time()
|
||||
|
||||
try:
|
||||
req = urllib.request.Request(url)
|
||||
if self.token:
|
||||
req.add_header('Authorization', f'Bearer {self.token}')
|
||||
|
||||
with urllib.request.urlopen(req, timeout=TEST_TIMEOUT) as response:
|
||||
response_time = time.time() - start_time
|
||||
status_code = response.getcode()
|
||||
content = response.read().decode('utf-8')
|
||||
|
||||
# Try to parse JSON
|
||||
try:
|
||||
data = json.loads(content)
|
||||
except:
|
||||
data = {"raw_response": content[:200]}
|
||||
|
||||
return {
|
||||
"endpoint": endpoint,
|
||||
"method": "GET",
|
||||
"status": "PASS" if status_code < 400 else "FAIL",
|
||||
"status_code": status_code,
|
||||
"response_time": response_time,
|
||||
"data": data
|
||||
}
|
||||
except Exception as e:
|
||||
response_time = time.time() - start_time
|
||||
return {
|
||||
"endpoint": endpoint,
|
||||
"method": "GET",
|
||||
"status": "FAIL",
|
||||
"status_code": None,
|
||||
"response_time": response_time,
|
||||
"error": str(e)
|
||||
}
|
||||
|
||||
def test_post_endpoint(self, endpoint: str, payload: Dict) -> Dict:
|
||||
"""Test a POST endpoint"""
|
||||
url = f"{self.base_url}{endpoint}"
|
||||
start_time = time.time()
|
||||
|
||||
try:
|
||||
data = json.dumps(payload).encode('utf-8')
|
||||
req = urllib.request.Request(url, data=data, method='POST')
|
||||
req.add_header('Content-Type', 'application/json')
|
||||
|
||||
if self.token:
|
||||
req.add_header('Authorization', f'Bearer {self.token}')
|
||||
|
||||
with urllib.request.urlopen(req, timeout=TEST_TIMEOUT) as response:
|
||||
response_time = time.time() - start_time
|
||||
status_code = response.getcode()
|
||||
content = response.read().decode('utf-8')
|
||||
|
||||
# Try to parse JSON
|
||||
try:
|
||||
data = json.loads(content)
|
||||
except:
|
||||
data = {"raw_response": content[:200]}
|
||||
|
||||
return {
|
||||
"endpoint": endpoint,
|
||||
"method": "POST",
|
||||
"status": "PASS" if status_code < 400 else "FAIL",
|
||||
"status_code": status_code,
|
||||
"response_time": response_time,
|
||||
"data": data
|
||||
}
|
||||
except Exception as e:
|
||||
response_time = time.time() - start_time
|
||||
return {
|
||||
"endpoint": endpoint,
|
||||
"method": "POST",
|
||||
"status": "FAIL",
|
||||
"status_code": None,
|
||||
"response_time": response_time,
|
||||
"error": str(e)
|
||||
}
|
||||
|
||||
def print_result(self, result: Dict):
|
||||
"""Print a formatted test result"""
|
||||
status_color = {
|
||||
"PASS": "✅",
|
||||
"FAIL": "❌",
|
||||
"SKIP": "⏭️"
|
||||
}
|
||||
|
||||
print(f"{status_color[result['status']]} {result['method']} {result['endpoint']} "
|
||||
f"| {result['response_time']:.3f}s | Status: {result['status_code'] or 'N/A'}")
|
||||
|
||||
if result['status'] == 'FAIL' and 'error' in result:
|
||||
print(f" Error: {result['error']}")
|
||||
|
||||
self.results.append(result)
|
||||
|
||||
def run_all_tests(self):
|
||||
"""Run all API tests"""
|
||||
print("🚀 Starting Crawl4AI v0.7.0 API Test Suite")
|
||||
print(f"📡 Testing server at: {self.base_url}")
|
||||
print("=" * 60)
|
||||
|
||||
# # Test basic endpoints
|
||||
# print("\n=== BASIC ENDPOINTS ===")
|
||||
|
||||
# # Health check
|
||||
# result = self.test_get_endpoint("/health")
|
||||
# self.print_result(result)
|
||||
|
||||
|
||||
# # Schema endpoint
|
||||
# result = self.test_get_endpoint("/schema")
|
||||
# self.print_result(result)
|
||||
|
||||
# # Metrics endpoint
|
||||
# result = self.test_get_endpoint("/metrics")
|
||||
# self.print_result(result)
|
||||
|
||||
# # Root redirect
|
||||
# result = self.test_get_endpoint("/")
|
||||
# self.print_result(result)
|
||||
|
||||
# # Test authentication
|
||||
# print("\n=== AUTHENTICATION ===")
|
||||
|
||||
# # Get token
|
||||
# token_payload = {"email": "test@example.com"}
|
||||
# result = self.test_post_endpoint("/token", token_payload)
|
||||
# self.print_result(result)
|
||||
|
||||
# # Extract token if successful
|
||||
# if result['status'] == 'PASS' and 'data' in result:
|
||||
# token = result['data'].get('access_token')
|
||||
# if token:
|
||||
# self.token = token
|
||||
# self.log(f"Successfully obtained auth token: {token[:20]}...")
|
||||
|
||||
# Test core APIs
|
||||
print("\n=== CORE APIs ===")
|
||||
|
||||
test_url = "https://example.com"
|
||||
|
||||
# Test markdown endpoint
|
||||
md_payload = {
|
||||
"url": test_url,
|
||||
"f": "fit",
|
||||
"q": "test query",
|
||||
"c": "0"
|
||||
}
|
||||
result = self.test_post_endpoint("/md", md_payload)
|
||||
# print(result['data'].get('markdown', ''))
|
||||
self.print_result(result)
|
||||
|
||||
# Test HTML endpoint
|
||||
html_payload = {"url": test_url}
|
||||
result = self.test_post_endpoint("/html", html_payload)
|
||||
self.print_result(result)
|
||||
|
||||
# Test screenshot endpoint
|
||||
screenshot_payload = {
|
||||
"url": test_url,
|
||||
"screenshot_wait_for": 2
|
||||
}
|
||||
result = self.test_post_endpoint("/screenshot", screenshot_payload)
|
||||
self.print_result(result)
|
||||
|
||||
# Test PDF endpoint
|
||||
pdf_payload = {"url": test_url}
|
||||
result = self.test_post_endpoint("/pdf", pdf_payload)
|
||||
self.print_result(result)
|
||||
|
||||
# Test JavaScript execution
|
||||
js_payload = {
|
||||
"url": test_url,
|
||||
"scripts": ["(() => document.title)()"]
|
||||
}
|
||||
result = self.test_post_endpoint("/execute_js", js_payload)
|
||||
self.print_result(result)
|
||||
|
||||
# Test crawl endpoint
|
||||
crawl_payload = {
|
||||
"urls": [test_url],
|
||||
"browser_config": {},
|
||||
"crawler_config": {}
|
||||
}
|
||||
result = self.test_post_endpoint("/crawl", crawl_payload)
|
||||
self.print_result(result)
|
||||
|
||||
# Test config dump
|
||||
config_payload = {"code": "CrawlerRunConfig()"}
|
||||
result = self.test_post_endpoint("/config/dump", config_payload)
|
||||
self.print_result(result)
|
||||
|
||||
# Test LLM endpoint
|
||||
llm_endpoint = f"/llm/{test_url}?q=Extract%20main%20content"
|
||||
result = self.test_get_endpoint(llm_endpoint)
|
||||
self.print_result(result)
|
||||
|
||||
# Test ask endpoint
|
||||
ask_endpoint = "/ask?context_type=all&query=crawl4ai&max_results=5"
|
||||
result = self.test_get_endpoint(ask_endpoint)
|
||||
print(result)
|
||||
self.print_result(result)
|
||||
|
||||
# Test job APIs
|
||||
print("\n=== JOB APIs ===")
|
||||
|
||||
# Test LLM job
|
||||
llm_job_payload = {
|
||||
"url": test_url,
|
||||
"q": "Extract main content",
|
||||
"cache": False
|
||||
}
|
||||
result = self.test_post_endpoint("/llm/job", llm_job_payload)
|
||||
self.print_result(result)
|
||||
|
||||
# Test crawl job
|
||||
crawl_job_payload = {
|
||||
"urls": [test_url],
|
||||
"browser_config": {},
|
||||
"crawler_config": {}
|
||||
}
|
||||
result = self.test_post_endpoint("/crawl/job", crawl_job_payload)
|
||||
self.print_result(result)
|
||||
|
||||
# Test MCP
|
||||
print("\n=== MCP APIs ===")
|
||||
|
||||
# Test MCP schema
|
||||
result = self.test_get_endpoint("/mcp/schema")
|
||||
self.print_result(result)
|
||||
|
||||
# Test error handling
|
||||
print("\n=== ERROR HANDLING ===")
|
||||
|
||||
# Test invalid URL
|
||||
invalid_payload = {"url": "invalid-url", "f": "fit"}
|
||||
result = self.test_post_endpoint("/md", invalid_payload)
|
||||
self.print_result(result)
|
||||
|
||||
# Test invalid endpoint
|
||||
result = self.test_get_endpoint("/nonexistent")
|
||||
self.print_result(result)
|
||||
|
||||
# Print summary
|
||||
self.print_summary()
|
||||
|
||||
def print_summary(self):
|
||||
"""Print test results summary"""
|
||||
print("\n" + "=" * 60)
|
||||
print("📊 TEST RESULTS SUMMARY")
|
||||
print("=" * 60)
|
||||
|
||||
total = len(self.results)
|
||||
passed = sum(1 for r in self.results if r['status'] == 'PASS')
|
||||
failed = sum(1 for r in self.results if r['status'] == 'FAIL')
|
||||
|
||||
print(f"Total Tests: {total}")
|
||||
print(f"✅ Passed: {passed}")
|
||||
print(f"❌ Failed: {failed}")
|
||||
print(f"📈 Success Rate: {(passed/total)*100:.1f}%")
|
||||
|
||||
if failed > 0:
|
||||
print("\n❌ FAILED TESTS:")
|
||||
for result in self.results:
|
||||
if result['status'] == 'FAIL':
|
||||
print(f" • {result['method']} {result['endpoint']}")
|
||||
if 'error' in result:
|
||||
print(f" Error: {result['error']}")
|
||||
|
||||
# Performance statistics
|
||||
response_times = [r['response_time'] for r in self.results if r['response_time'] > 0]
|
||||
if response_times:
|
||||
avg_time = sum(response_times) / len(response_times)
|
||||
max_time = max(response_times)
|
||||
print(f"\n⏱️ Average Response Time: {avg_time:.3f}s")
|
||||
print(f"⏱️ Max Response Time: {max_time:.3f}s")
|
||||
|
||||
# Save detailed report
|
||||
report_file = f"crawl4ai_test_report_{int(time.time())}.json"
|
||||
with open(report_file, 'w') as f:
|
||||
json.dump({
|
||||
"timestamp": time.time(),
|
||||
"server_url": self.base_url,
|
||||
"version": "0.7.0",
|
||||
"summary": {
|
||||
"total": total,
|
||||
"passed": passed,
|
||||
"failed": failed
|
||||
},
|
||||
"results": self.results
|
||||
}, f, indent=2)
|
||||
|
||||
print(f"\n📄 Detailed report saved to: {report_file}")
|
||||
|
||||
def main():
|
||||
"""Main test runner"""
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(description='Crawl4AI v0.7.0 API Test Suite')
|
||||
parser.add_argument('--url', default=BASE_URL, help='Base URL of the server')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
tester = SimpleApiTester(args.url)
|
||||
|
||||
try:
|
||||
tester.run_all_tests()
|
||||
except KeyboardInterrupt:
|
||||
print("\n🛑 Test suite interrupted by user")
|
||||
except Exception as e:
|
||||
print(f"\n💥 Test suite failed with error: {e}")
|
||||
sys.exit(1)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -5,7 +5,7 @@ Test script for Link Extractor functionality
|
||||
|
||||
from crawl4ai.models import Link
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
from crawl4ai import LinkPreviewConfig
|
||||
from crawl4ai.async_configs import LinkPreviewConfig
|
||||
import asyncio
|
||||
import sys
|
||||
import os
|
||||
@@ -237,7 +237,7 @@ def test_config_examples():
|
||||
print(f" {key}: {value}")
|
||||
|
||||
print(" Usage:")
|
||||
print(" from crawl4ai import LinkPreviewConfig")
|
||||
print(" from crawl4ai.async_configs import LinkPreviewConfig")
|
||||
print(" config = CrawlerRunConfig(")
|
||||
print(" link_preview_config=LinkPreviewConfig(")
|
||||
for key, value in config_dict.items():
|
||||
|
||||
@@ -1,211 +0,0 @@
|
||||
import asyncio
|
||||
import httpx
|
||||
import email.utils
|
||||
from datetime import datetime
|
||||
import json
|
||||
from typing import Dict, Optional
|
||||
import time
|
||||
|
||||
|
||||
async def should_crawl(url: str, cache: Optional[Dict[str, str]] = None) -> bool:
|
||||
"""
|
||||
Check if a URL should be crawled based on HEAD request headers.
|
||||
|
||||
Args:
|
||||
url: The URL to check
|
||||
cache: Previous cache data containing etag, last_modified, digest, content_length
|
||||
|
||||
Returns:
|
||||
True if the page has changed and should be crawled, False otherwise
|
||||
"""
|
||||
if cache is None:
|
||||
cache = {}
|
||||
|
||||
headers = {
|
||||
"Accept-Encoding": "identity",
|
||||
"Want-Content-Digest": "sha-256",
|
||||
}
|
||||
|
||||
if cache.get("etag"):
|
||||
headers["If-None-Match"] = cache["etag"]
|
||||
if cache.get("last_modified"):
|
||||
headers["If-Modified-Since"] = cache["last_modified"]
|
||||
|
||||
try:
|
||||
async with httpx.AsyncClient(follow_redirects=True, timeout=5) as client:
|
||||
response = await client.head(url, headers=headers)
|
||||
|
||||
# 304 Not Modified - content hasn't changed
|
||||
if response.status_code == 304:
|
||||
print(f"✓ 304 Not Modified - No need to crawl {url}")
|
||||
return False
|
||||
|
||||
h = response.headers
|
||||
|
||||
# Check Content-Digest (most reliable)
|
||||
if h.get("content-digest") and h["content-digest"] == cache.get("digest"):
|
||||
print(f"✓ Content-Digest matches - No need to crawl {url}")
|
||||
return False
|
||||
|
||||
# Check strong ETag
|
||||
if h.get("etag") and h["etag"].startswith('"') and h["etag"] == cache.get("etag"):
|
||||
print(f"✓ Strong ETag matches - No need to crawl {url}")
|
||||
return False
|
||||
|
||||
# Check Last-Modified
|
||||
if h.get("last-modified") and cache.get("last_modified"):
|
||||
try:
|
||||
lm_new = email.utils.parsedate_to_datetime(h["last-modified"])
|
||||
lm_old = email.utils.parsedate_to_datetime(cache["last_modified"])
|
||||
if lm_new <= lm_old:
|
||||
print(f"✓ Last-Modified not newer - No need to crawl {url}")
|
||||
return False
|
||||
except:
|
||||
pass
|
||||
|
||||
# Check Content-Length (weakest signal - only as a hint, not definitive)
|
||||
# Note: Same content length doesn't mean same content!
|
||||
# This should only be used when no other signals are available
|
||||
if h.get("content-length") and cache.get("content_length"):
|
||||
try:
|
||||
if int(h["content-length"]) != cache.get("content_length"):
|
||||
print(f"✗ Content-Length changed - Should crawl {url}")
|
||||
return True
|
||||
else:
|
||||
print(f"⚠️ Content-Length unchanged but content might have changed - Should crawl {url}")
|
||||
return True # When in doubt, crawl!
|
||||
except:
|
||||
pass
|
||||
|
||||
print(f"✗ Content has changed - Should crawl {url}")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
print(f"✗ Error checking {url}: {e}")
|
||||
return True # On error, assume we should crawl
|
||||
|
||||
|
||||
async def crawl_page(url: str) -> Dict[str, str]:
|
||||
"""
|
||||
Simulate crawling a page and extracting cache headers.
|
||||
"""
|
||||
print(f"\n🕷️ Crawling {url}...")
|
||||
|
||||
async with httpx.AsyncClient(follow_redirects=True, timeout=10) as client:
|
||||
response = await client.get(url)
|
||||
|
||||
cache_data = {}
|
||||
h = response.headers
|
||||
|
||||
if h.get("etag"):
|
||||
cache_data["etag"] = h["etag"]
|
||||
print(f" Stored ETag: {h['etag']}")
|
||||
|
||||
if h.get("last-modified"):
|
||||
cache_data["last_modified"] = h["last-modified"]
|
||||
print(f" Stored Last-Modified: {h['last-modified']}")
|
||||
|
||||
if h.get("content-digest"):
|
||||
cache_data["digest"] = h["content-digest"]
|
||||
print(f" Stored Content-Digest: {h['content-digest']}")
|
||||
|
||||
if h.get("content-length"):
|
||||
cache_data["content_length"] = int(h["content-length"])
|
||||
print(f" Stored Content-Length: {h['content-length']}")
|
||||
|
||||
print(f" Response size: {len(response.content)} bytes")
|
||||
return cache_data
|
||||
|
||||
|
||||
async def test_static_site():
|
||||
"""Test with a static website (example.com)"""
|
||||
print("=" * 60)
|
||||
print("Testing with static site: example.com")
|
||||
print("=" * 60)
|
||||
|
||||
url = "https://example.com"
|
||||
|
||||
# First crawl - always happens
|
||||
cache = await crawl_page(url)
|
||||
|
||||
# Wait a bit
|
||||
await asyncio.sleep(2)
|
||||
|
||||
# Second check - should not need to crawl
|
||||
print(f"\n📊 Checking if we need to re-crawl...")
|
||||
needs_crawl = await should_crawl(url, cache)
|
||||
|
||||
if not needs_crawl:
|
||||
print("✅ Correctly identified: No need to re-crawl static content")
|
||||
else:
|
||||
print("❌ Unexpected: Static content flagged as changed")
|
||||
|
||||
|
||||
async def test_dynamic_site():
|
||||
"""Test with dynamic websites that change frequently"""
|
||||
print("\n" + "=" * 60)
|
||||
print("Testing with dynamic sites")
|
||||
print("=" * 60)
|
||||
|
||||
# Test with a few dynamic sites
|
||||
dynamic_sites = [
|
||||
"https://api.github.com/", # GitHub API root (changes with rate limit info)
|
||||
"https://worldtimeapi.org/api/timezone/UTC", # Current time API
|
||||
"https://httpbin.org/uuid", # Generates new UUID each request
|
||||
]
|
||||
|
||||
for url in dynamic_sites:
|
||||
print(f"\n🔄 Testing {url}")
|
||||
try:
|
||||
# First crawl
|
||||
cache = await crawl_page(url)
|
||||
|
||||
# Wait a bit
|
||||
await asyncio.sleep(2)
|
||||
|
||||
# Check if content changed
|
||||
print(f"\n📊 Checking if we need to re-crawl...")
|
||||
needs_crawl = await should_crawl(url, cache)
|
||||
|
||||
if needs_crawl:
|
||||
print("✅ Correctly identified: Dynamic content has changed")
|
||||
else:
|
||||
print("⚠️ Note: Dynamic content appears unchanged (might have caching)")
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Error testing {url}: {e}")
|
||||
|
||||
|
||||
async def test_conditional_get():
|
||||
"""Test conditional GET fallback when HEAD doesn't provide enough info"""
|
||||
print("\n" + "=" * 60)
|
||||
print("Testing conditional GET scenario")
|
||||
print("=" * 60)
|
||||
|
||||
url = "https://httpbin.org/etag/test-etag-123"
|
||||
|
||||
# Simulate a scenario where we have an ETag
|
||||
cache = {"etag": '"test-etag-123"'}
|
||||
|
||||
print(f"Testing with cached ETag: {cache['etag']}")
|
||||
needs_crawl = await should_crawl(url, cache)
|
||||
|
||||
if not needs_crawl:
|
||||
print("✅ ETag matched - no crawl needed")
|
||||
else:
|
||||
print("✅ ETag didn't match - crawl needed")
|
||||
|
||||
|
||||
async def main():
|
||||
"""Run all tests"""
|
||||
print("🚀 Starting HEAD request change detection tests\n")
|
||||
|
||||
await test_static_site()
|
||||
await test_dynamic_site()
|
||||
await test_conditional_get()
|
||||
|
||||
print("\n✨ All tests completed!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -1,186 +0,0 @@
|
||||
import asyncio
|
||||
import httpx
|
||||
import email.utils
|
||||
from datetime import datetime
|
||||
import json
|
||||
from typing import Dict, Optional
|
||||
import time
|
||||
|
||||
|
||||
async def should_crawl(url: str, cache: Optional[Dict[str, str]] = None) -> bool:
|
||||
"""
|
||||
Check if a URL should be crawled based on HEAD request headers.
|
||||
"""
|
||||
if cache is None:
|
||||
cache = {}
|
||||
|
||||
headers = {
|
||||
"Accept-Encoding": "identity",
|
||||
"Want-Content-Digest": "sha-256",
|
||||
"User-Agent": "Mozilla/5.0 (compatible; crawl4ai/1.0)"
|
||||
}
|
||||
|
||||
if cache.get("etag"):
|
||||
headers["If-None-Match"] = cache["etag"]
|
||||
if cache.get("last_modified"):
|
||||
headers["If-Modified-Since"] = cache["last_modified"]
|
||||
|
||||
try:
|
||||
async with httpx.AsyncClient(follow_redirects=True, timeout=5) as client:
|
||||
response = await client.head(url, headers=headers)
|
||||
|
||||
print(f"\nHEAD Response Status: {response.status_code}")
|
||||
print(f"Headers received: {dict(response.headers)}")
|
||||
|
||||
# 304 Not Modified
|
||||
if response.status_code == 304:
|
||||
return False
|
||||
|
||||
h = response.headers
|
||||
|
||||
# Check headers in order of reliability
|
||||
if h.get("content-digest") and h["content-digest"] == cache.get("digest"):
|
||||
return False
|
||||
|
||||
if h.get("etag") and h["etag"].startswith('"') and h["etag"] == cache.get("etag"):
|
||||
return False
|
||||
|
||||
if h.get("last-modified") and cache.get("last_modified"):
|
||||
try:
|
||||
lm_new = email.utils.parsedate_to_datetime(h["last-modified"])
|
||||
lm_old = email.utils.parsedate_to_datetime(cache["last_modified"])
|
||||
if lm_new <= lm_old:
|
||||
return False
|
||||
except:
|
||||
pass
|
||||
|
||||
# Check Content-Length (weakest signal - only as a hint, not definitive)
|
||||
# Note: Same content length doesn't mean same content!
|
||||
if h.get("content-length") and cache.get("content_length"):
|
||||
try:
|
||||
if int(h["content-length"]) != cache.get("content_length"):
|
||||
return True # Length changed, likely content changed
|
||||
# If length is same, we can't be sure - default to crawling
|
||||
except:
|
||||
pass
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error during HEAD request: {e}")
|
||||
return True
|
||||
|
||||
|
||||
async def test_with_changing_content():
|
||||
"""Test with a real changing website"""
|
||||
print("=" * 60)
|
||||
print("Testing with real changing content")
|
||||
print("=" * 60)
|
||||
|
||||
# Using httpbin's cache endpoint that changes after specified seconds
|
||||
url = "https://httpbin.org/cache/1" # Cache for 1 second
|
||||
|
||||
print(f"\n1️⃣ First request to {url}")
|
||||
async with httpx.AsyncClient() as client:
|
||||
response1 = await client.get(url)
|
||||
cache = {}
|
||||
if response1.headers.get("etag"):
|
||||
cache["etag"] = response1.headers["etag"]
|
||||
if response1.headers.get("last-modified"):
|
||||
cache["last_modified"] = response1.headers["last-modified"]
|
||||
print(f"Cached ETag: {cache.get('etag', 'None')}")
|
||||
print(f"Cached Last-Modified: {cache.get('last_modified', 'None')}")
|
||||
|
||||
# Check immediately (should not need crawl)
|
||||
print(f"\n2️⃣ Checking immediately after first request...")
|
||||
needs_crawl = await should_crawl(url, cache)
|
||||
print(f"Result: {'NEED TO CRAWL' if needs_crawl else 'NO NEED TO CRAWL'}")
|
||||
|
||||
# Wait for cache to expire
|
||||
print(f"\n⏳ Waiting 2 seconds for cache to expire...")
|
||||
await asyncio.sleep(2)
|
||||
|
||||
# Check again (should need crawl now)
|
||||
print(f"\n3️⃣ Checking after cache expiry...")
|
||||
needs_crawl = await should_crawl(url, cache)
|
||||
print(f"Result: {'NEED TO CRAWL' if needs_crawl else 'NO NEED TO CRAWL'}")
|
||||
|
||||
|
||||
async def test_news_website():
|
||||
"""Test with a news website that updates frequently"""
|
||||
print("\n" + "=" * 60)
|
||||
print("Testing with news website (BBC)")
|
||||
print("=" * 60)
|
||||
|
||||
url = "https://www.bbc.com"
|
||||
|
||||
print(f"\n1️⃣ First crawl of {url}")
|
||||
async with httpx.AsyncClient() as client:
|
||||
response1 = await client.get(url)
|
||||
cache = {}
|
||||
h = response1.headers
|
||||
|
||||
if h.get("etag"):
|
||||
cache["etag"] = h["etag"]
|
||||
print(f"Stored ETag: {h['etag'][:50]}...")
|
||||
if h.get("last-modified"):
|
||||
cache["last_modified"] = h["last-modified"]
|
||||
print(f"Stored Last-Modified: {h['last-modified']}")
|
||||
if h.get("content-length"):
|
||||
cache["content_length"] = int(h["content-length"])
|
||||
print(f"Stored Content-Length: {h['content-length']}")
|
||||
|
||||
# Check multiple times
|
||||
for i in range(3):
|
||||
await asyncio.sleep(5)
|
||||
print(f"\n📊 Check #{i+2} - {datetime.now().strftime('%H:%M:%S')}")
|
||||
needs_crawl = await should_crawl(url, cache)
|
||||
print(f"Result: {'NEED TO CRAWL ✓' if needs_crawl else 'NO NEED TO CRAWL ✗'}")
|
||||
|
||||
|
||||
async def test_api_endpoint():
|
||||
"""Test with an API that provides proper caching headers"""
|
||||
print("\n" + "=" * 60)
|
||||
print("Testing with GitHub API")
|
||||
print("=" * 60)
|
||||
|
||||
# GitHub user API (updates when user data changes)
|
||||
url = "https://api.github.com/users/github"
|
||||
|
||||
headers = {"User-Agent": "crawl4ai-test"}
|
||||
|
||||
print(f"\n1️⃣ First request to {url}")
|
||||
async with httpx.AsyncClient() as client:
|
||||
response1 = await client.get(url, headers=headers)
|
||||
cache = {}
|
||||
h = response1.headers
|
||||
|
||||
if h.get("etag"):
|
||||
cache["etag"] = h["etag"]
|
||||
print(f"Stored ETag: {h['etag']}")
|
||||
if h.get("last-modified"):
|
||||
cache["last_modified"] = h["last-modified"]
|
||||
print(f"Stored Last-Modified: {h['last-modified']}")
|
||||
|
||||
# Print rate limit info
|
||||
print(f"Rate Limit Remaining: {h.get('x-ratelimit-remaining', 'N/A')}")
|
||||
|
||||
# Check if content changed
|
||||
print(f"\n2️⃣ Checking if content changed...")
|
||||
needs_crawl = await should_crawl(url, cache)
|
||||
print(f"Result: {'NEED TO CRAWL' if needs_crawl else 'NO NEED TO CRAWL (content unchanged)'}")
|
||||
|
||||
|
||||
async def main():
|
||||
"""Run all tests"""
|
||||
print("🚀 Testing HEAD request change detection with real websites\n")
|
||||
|
||||
await test_with_changing_content()
|
||||
await test_news_website()
|
||||
await test_api_endpoint()
|
||||
|
||||
print("\n✨ All tests completed!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -1,196 +0,0 @@
|
||||
"""
|
||||
Test SMART cache mode functionality in crawl4ai.
|
||||
|
||||
This test demonstrates:
|
||||
1. Initial crawl with caching enabled
|
||||
2. Re-crawl with SMART mode on static content (should use cache)
|
||||
3. Re-crawl with SMART mode on dynamic content (should re-crawl)
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
from crawl4ai.cache_context import CacheMode
|
||||
import time
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
async def test_smart_cache_mode():
|
||||
"""Test the SMART cache mode with both static and dynamic URLs"""
|
||||
|
||||
print("=" * 60)
|
||||
print("Testing SMART Cache Mode")
|
||||
print("=" * 60)
|
||||
|
||||
# URLs for testing
|
||||
static_url = "https://example.com" # Rarely changes
|
||||
dynamic_url = "https://httpbin.org/uuid" # Changes every request
|
||||
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
|
||||
# Test 1: Initial crawl with caching enabled
|
||||
print("\n1️⃣ Initial crawl with ENABLED cache mode")
|
||||
print("-" * 40)
|
||||
|
||||
# Crawl static URL
|
||||
config_static = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.ENABLED,
|
||||
verbose=True
|
||||
)
|
||||
result_static_1 = await crawler.arun(url=static_url, config=config_static)
|
||||
print(f"✓ Static URL crawled: {len(result_static_1.html)} bytes")
|
||||
print(f" Response headers: {list(result_static_1.response_headers.keys())[:5]}...")
|
||||
|
||||
# Crawl dynamic URL
|
||||
config_dynamic = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.ENABLED,
|
||||
verbose=True
|
||||
)
|
||||
result_dynamic_1 = await crawler.arun(url=dynamic_url, config=config_dynamic)
|
||||
print(f"✓ Dynamic URL crawled: {len(result_dynamic_1.html)} bytes")
|
||||
dynamic_content_1 = result_dynamic_1.html
|
||||
|
||||
# Wait a bit
|
||||
await asyncio.sleep(2)
|
||||
|
||||
# Test 2: Re-crawl static URL with SMART mode
|
||||
print("\n2️⃣ Re-crawl static URL with SMART cache mode")
|
||||
print("-" * 40)
|
||||
|
||||
config_smart = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.SMART, # This will be our new mode
|
||||
verbose=True
|
||||
)
|
||||
|
||||
start_time = time.time()
|
||||
result_static_2 = await crawler.arun(url=static_url, config=config_smart)
|
||||
elapsed = time.time() - start_time
|
||||
|
||||
print(f"✓ Static URL with SMART mode completed in {elapsed:.2f}s")
|
||||
print(f" Should use cache (content unchanged)")
|
||||
print(f" HTML length: {len(result_static_2.html)} bytes")
|
||||
|
||||
# Test 3: Re-crawl dynamic URL with SMART mode
|
||||
print("\n3️⃣ Re-crawl dynamic URL with SMART cache mode")
|
||||
print("-" * 40)
|
||||
|
||||
start_time = time.time()
|
||||
result_dynamic_2 = await crawler.arun(url=dynamic_url, config=config_smart)
|
||||
elapsed = time.time() - start_time
|
||||
dynamic_content_2 = result_dynamic_2.html
|
||||
|
||||
print(f"✓ Dynamic URL with SMART mode completed in {elapsed:.2f}s")
|
||||
print(f" Should re-crawl (content changes every request)")
|
||||
print(f" HTML length: {len(result_dynamic_2.html)} bytes")
|
||||
print(f" Content changed: {dynamic_content_1 != dynamic_content_2}")
|
||||
|
||||
# Test 4: Test with a news website (content changes frequently)
|
||||
print("\n4️⃣ Testing with news website")
|
||||
print("-" * 40)
|
||||
|
||||
news_url = "https://news.ycombinator.com"
|
||||
|
||||
# First crawl
|
||||
result_news_1 = await crawler.arun(
|
||||
url=news_url,
|
||||
config=CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
|
||||
)
|
||||
print(f"✓ News site initial crawl: {len(result_news_1.html)} bytes")
|
||||
|
||||
# Wait a bit
|
||||
await asyncio.sleep(5)
|
||||
|
||||
# Re-crawl with SMART mode
|
||||
start_time = time.time()
|
||||
result_news_2 = await crawler.arun(
|
||||
url=news_url,
|
||||
config=CrawlerRunConfig(cache_mode=CacheMode.SMART)
|
||||
)
|
||||
elapsed = time.time() - start_time
|
||||
|
||||
print(f"✓ News site SMART mode completed in {elapsed:.2f}s")
|
||||
print(f" Content length changed: {len(result_news_1.html) != len(result_news_2.html)}")
|
||||
|
||||
# Summary
|
||||
print("\n" + "=" * 60)
|
||||
print("Summary")
|
||||
print("=" * 60)
|
||||
print("✅ SMART cache mode should:")
|
||||
print(" - Use cache for static content (example.com)")
|
||||
print(" - Re-crawl dynamic content (httpbin.org/uuid)")
|
||||
print(" - Make intelligent decisions based on HEAD requests")
|
||||
print(" - Save bandwidth on unchanged content")
|
||||
|
||||
|
||||
async def test_smart_cache_edge_cases():
|
||||
"""Test edge cases for SMART cache mode"""
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
print("Testing SMART Cache Mode Edge Cases")
|
||||
print("=" * 60)
|
||||
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
|
||||
# Test with URL that doesn't support HEAD
|
||||
print("\n🔧 Testing URL with potential HEAD issues")
|
||||
print("-" * 40)
|
||||
|
||||
# Some servers don't handle HEAD well
|
||||
problematic_url = "https://httpbin.org/status/200"
|
||||
|
||||
# Initial crawl
|
||||
await crawler.arun(
|
||||
url=problematic_url,
|
||||
config=CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
|
||||
)
|
||||
|
||||
# Try SMART mode
|
||||
result = await crawler.arun(
|
||||
url=problematic_url,
|
||||
config=CrawlerRunConfig(cache_mode=CacheMode.SMART)
|
||||
)
|
||||
print(f"✓ Handled potentially problematic URL: {result.success}")
|
||||
|
||||
# Test with URL that has no caching headers
|
||||
print("\n🔧 Testing URL with no cache headers")
|
||||
print("-" * 40)
|
||||
|
||||
no_cache_url = "https://httpbin.org/html"
|
||||
|
||||
# Initial crawl
|
||||
await crawler.arun(
|
||||
url=no_cache_url,
|
||||
config=CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
|
||||
)
|
||||
|
||||
# SMART mode should handle gracefully
|
||||
result = await crawler.arun(
|
||||
url=no_cache_url,
|
||||
config=CrawlerRunConfig(cache_mode=CacheMode.SMART)
|
||||
)
|
||||
print(f"✓ Handled URL with no cache headers: {result.success}")
|
||||
|
||||
|
||||
async def main():
|
||||
"""Run all tests"""
|
||||
try:
|
||||
# Run main test
|
||||
await test_smart_cache_mode()
|
||||
|
||||
# Run edge case tests
|
||||
await test_smart_cache_edge_cases()
|
||||
|
||||
print("\n✨ All tests completed!")
|
||||
|
||||
except Exception as e:
|
||||
print(f"\n❌ Error during testing: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Note: This test will fail until SMART mode is implemented
|
||||
print("⚠️ Note: This test expects CacheMode.SMART to be implemented")
|
||||
print("⚠️ It will fail with AttributeError until the feature is added\n")
|
||||
|
||||
asyncio.run(main())
|
||||
@@ -1,69 +0,0 @@
|
||||
"""
|
||||
Simple test for SMART cache mode functionality.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
from crawl4ai.cache_context import CacheMode
|
||||
import time
|
||||
|
||||
|
||||
async def test_smart_cache():
|
||||
"""Test SMART cache mode with a simple example"""
|
||||
|
||||
print("Testing SMART Cache Mode")
|
||||
print("-" * 40)
|
||||
|
||||
# Test URL
|
||||
url = "https://example.com"
|
||||
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
# First crawl with normal caching
|
||||
print("\n1. Initial crawl with ENABLED mode:")
|
||||
config1 = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
|
||||
result1 = await crawler.arun(url=url, config=config1)
|
||||
print(f" Crawled: {len(result1.html)} bytes")
|
||||
print(f" Headers: {list(result1.response_headers.keys())[:3]}...")
|
||||
|
||||
# Wait a moment
|
||||
await asyncio.sleep(2)
|
||||
|
||||
# Re-crawl with SMART mode
|
||||
print("\n2. Re-crawl with SMART mode:")
|
||||
config2 = CrawlerRunConfig(cache_mode=CacheMode.SMART)
|
||||
start = time.time()
|
||||
result2 = await crawler.arun(url=url, config=config2)
|
||||
elapsed = time.time() - start
|
||||
|
||||
print(f" Time: {elapsed:.2f}s")
|
||||
print(f" Result: {len(result2.html)} bytes")
|
||||
print(f" Should use cache (content unchanged)")
|
||||
|
||||
# Test with dynamic content
|
||||
print("\n3. Testing with dynamic URL:")
|
||||
dynamic_url = "https://httpbin.org/uuid"
|
||||
|
||||
# First crawl
|
||||
config3 = CrawlerRunConfig(cache_mode=CacheMode.ENABLED)
|
||||
result3 = await crawler.arun(url=dynamic_url, config=config3)
|
||||
content1 = result3.html
|
||||
|
||||
# Re-crawl with SMART
|
||||
config4 = CrawlerRunConfig(cache_mode=CacheMode.SMART)
|
||||
result4 = await crawler.arun(url=dynamic_url, config=config4)
|
||||
content2 = result4.html
|
||||
|
||||
print(f" Content changed: {content1 != content2}")
|
||||
print(f" Should re-crawl (dynamic content)")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
print(f"Python path: {sys.path[0]}")
|
||||
print(f"CacheMode values: {[e.value for e in CacheMode]}")
|
||||
print()
|
||||
asyncio.run(test_smart_cache())
|
||||
Reference in New Issue
Block a user