Compare commits
14 Commits
fix/reques
...
docker/bas
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a1950afd98 | ||
|
|
d0eb5a6ffe | ||
|
|
77559f3373 | ||
|
|
3899ac3d3b | ||
|
|
23431d8109 | ||
|
|
1717827732 | ||
|
|
f8eaf01ed1 | ||
|
|
14b42b1f9a | ||
|
|
3bc56dd028 | ||
|
|
0482c1eafc | ||
|
|
6e728096fa | ||
|
|
4ed33fce9e | ||
|
|
f7a3366f72 | ||
|
|
88a9fbbb7e |
2
.gitignore
vendored
2
.gitignore
vendored
@@ -265,7 +265,7 @@ CLAUDE.md
|
||||
tests/**/test_site
|
||||
tests/**/reports
|
||||
tests/**/benchmark_reports
|
||||
|
||||
test_scripts/
|
||||
docs/**/data
|
||||
.codecat/
|
||||
|
||||
|
||||
@@ -19,7 +19,7 @@ import re
|
||||
from pathlib import Path
|
||||
|
||||
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
||||
from crawl4ai.async_configs import CrawlerRunConfig, LinkPreviewConfig
|
||||
from crawl4ai.async_configs import CrawlerRunConfig, LinkPreviewConfig, LLMConfig
|
||||
from crawl4ai.models import Link, CrawlResult
|
||||
import numpy as np
|
||||
|
||||
@@ -178,7 +178,7 @@ class AdaptiveConfig:
|
||||
|
||||
# Embedding strategy parameters
|
||||
embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"
|
||||
embedding_llm_config: Optional[Dict] = None # Separate config for embeddings
|
||||
embedding_llm_config: Optional[Union[LLMConfig, Dict]] = None # Separate config for embeddings
|
||||
n_query_variations: int = 10
|
||||
coverage_threshold: float = 0.85
|
||||
alpha_shape_alpha: float = 0.5
|
||||
@@ -250,6 +250,30 @@ class AdaptiveConfig:
|
||||
assert 0 <= self.embedding_quality_max_confidence <= 1, "embedding_quality_max_confidence must be between 0 and 1"
|
||||
assert self.embedding_quality_scale_factor > 0, "embedding_quality_scale_factor must be positive"
|
||||
assert 0 <= self.embedding_min_confidence_threshold <= 1, "embedding_min_confidence_threshold must be between 0 and 1"
|
||||
|
||||
@property
|
||||
def _embedding_llm_config_dict(self) -> Optional[Dict]:
|
||||
"""Convert LLMConfig to dict format for backward compatibility."""
|
||||
if self.embedding_llm_config is None:
|
||||
return None
|
||||
|
||||
if isinstance(self.embedding_llm_config, dict):
|
||||
# Already a dict - return as-is for backward compatibility
|
||||
return self.embedding_llm_config
|
||||
|
||||
# Convert LLMConfig object to dict format
|
||||
return {
|
||||
'provider': self.embedding_llm_config.provider,
|
||||
'api_token': self.embedding_llm_config.api_token,
|
||||
'base_url': getattr(self.embedding_llm_config, 'base_url', None),
|
||||
'temperature': getattr(self.embedding_llm_config, 'temperature', None),
|
||||
'max_tokens': getattr(self.embedding_llm_config, 'max_tokens', None),
|
||||
'top_p': getattr(self.embedding_llm_config, 'top_p', None),
|
||||
'frequency_penalty': getattr(self.embedding_llm_config, 'frequency_penalty', None),
|
||||
'presence_penalty': getattr(self.embedding_llm_config, 'presence_penalty', None),
|
||||
'stop': getattr(self.embedding_llm_config, 'stop', None),
|
||||
'n': getattr(self.embedding_llm_config, 'n', None),
|
||||
}
|
||||
|
||||
|
||||
class CrawlStrategy(ABC):
|
||||
@@ -593,7 +617,7 @@ class StatisticalStrategy(CrawlStrategy):
|
||||
class EmbeddingStrategy(CrawlStrategy):
|
||||
"""Embedding-based adaptive crawling using semantic space coverage"""
|
||||
|
||||
def __init__(self, embedding_model: str = None, llm_config: Dict = None):
|
||||
def __init__(self, embedding_model: str = None, llm_config: Union[LLMConfig, Dict] = None):
|
||||
self.embedding_model = embedding_model or "sentence-transformers/all-MiniLM-L6-v2"
|
||||
self.llm_config = llm_config
|
||||
self._embedding_cache = {}
|
||||
@@ -605,14 +629,24 @@ class EmbeddingStrategy(CrawlStrategy):
|
||||
self._kb_embeddings_hash = None # Track KB changes
|
||||
self._validation_embeddings_cache = None # Cache validation query embeddings
|
||||
self._kb_similarity_threshold = 0.95 # Threshold for deduplication
|
||||
|
||||
def _get_embedding_llm_config_dict(self) -> Dict:
|
||||
"""Get embedding LLM config as dict with fallback to default."""
|
||||
if hasattr(self, 'config') and self.config:
|
||||
config_dict = self.config._embedding_llm_config_dict
|
||||
if config_dict:
|
||||
return config_dict
|
||||
|
||||
# Fallback to default if no config provided
|
||||
return {
|
||||
'provider': 'openai/text-embedding-3-small',
|
||||
'api_token': os.getenv('OPENAI_API_KEY')
|
||||
}
|
||||
|
||||
async def _get_embeddings(self, texts: List[str]) -> Any:
|
||||
"""Get embeddings using configured method"""
|
||||
from .utils import get_text_embeddings
|
||||
embedding_llm_config = {
|
||||
'provider': 'openai/text-embedding-3-small',
|
||||
'api_token': os.getenv('OPENAI_API_KEY')
|
||||
}
|
||||
embedding_llm_config = self._get_embedding_llm_config_dict()
|
||||
return await get_text_embeddings(
|
||||
texts,
|
||||
embedding_llm_config,
|
||||
@@ -679,8 +713,20 @@ class EmbeddingStrategy(CrawlStrategy):
|
||||
Return as a JSON array of strings."""
|
||||
|
||||
# Use the LLM for query generation
|
||||
provider = self.llm_config.get('provider', 'openai/gpt-4o-mini') if self.llm_config else 'openai/gpt-4o-mini'
|
||||
api_token = self.llm_config.get('api_token') if self.llm_config else None
|
||||
# Convert LLMConfig to dict if needed
|
||||
llm_config_dict = None
|
||||
if self.llm_config:
|
||||
if isinstance(self.llm_config, dict):
|
||||
llm_config_dict = self.llm_config
|
||||
else:
|
||||
# Convert LLMConfig object to dict
|
||||
llm_config_dict = {
|
||||
'provider': self.llm_config.provider,
|
||||
'api_token': self.llm_config.api_token
|
||||
}
|
||||
|
||||
provider = llm_config_dict.get('provider', 'openai/gpt-4o-mini') if llm_config_dict else 'openai/gpt-4o-mini'
|
||||
api_token = llm_config_dict.get('api_token') if llm_config_dict else None
|
||||
|
||||
# response = perform_completion_with_backoff(
|
||||
# provider=provider,
|
||||
@@ -843,10 +889,7 @@ class EmbeddingStrategy(CrawlStrategy):
|
||||
|
||||
# Batch embed only uncached links
|
||||
if texts_to_embed:
|
||||
embedding_llm_config = {
|
||||
'provider': 'openai/text-embedding-3-small',
|
||||
'api_token': os.getenv('OPENAI_API_KEY')
|
||||
}
|
||||
embedding_llm_config = self._get_embedding_llm_config_dict()
|
||||
new_embeddings = await get_text_embeddings(texts_to_embed, embedding_llm_config, self.embedding_model)
|
||||
|
||||
# Cache the new embeddings
|
||||
@@ -1184,10 +1227,7 @@ class EmbeddingStrategy(CrawlStrategy):
|
||||
return
|
||||
|
||||
# Get embeddings for new texts
|
||||
embedding_llm_config = {
|
||||
'provider': 'openai/text-embedding-3-small',
|
||||
'api_token': os.getenv('OPENAI_API_KEY')
|
||||
}
|
||||
embedding_llm_config = self._get_embedding_llm_config_dict()
|
||||
new_embeddings = await get_text_embeddings(new_texts, embedding_llm_config, self.embedding_model)
|
||||
|
||||
# Deduplicate embeddings before adding to KB
|
||||
@@ -1256,10 +1296,12 @@ class AdaptiveCrawler:
|
||||
if strategy_name == "statistical":
|
||||
return StatisticalStrategy()
|
||||
elif strategy_name == "embedding":
|
||||
return EmbeddingStrategy(
|
||||
strategy = EmbeddingStrategy(
|
||||
embedding_model=self.config.embedding_model,
|
||||
llm_config=self.config.embedding_llm_config
|
||||
)
|
||||
strategy.config = self.config # Pass config to strategy
|
||||
return strategy
|
||||
else:
|
||||
raise ValueError(f"Unknown strategy: {strategy_name}")
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import os
|
||||
from typing import Union
|
||||
import warnings
|
||||
from .config import (
|
||||
DEFAULT_PROVIDER,
|
||||
DEFAULT_PROVIDER_API_KEY,
|
||||
@@ -257,24 +258,39 @@ class ProxyConfig:
|
||||
|
||||
@staticmethod
|
||||
def from_string(proxy_str: str) -> "ProxyConfig":
|
||||
"""Create a ProxyConfig from a string in the format 'ip:port:username:password'."""
|
||||
parts = proxy_str.split(":")
|
||||
if len(parts) == 4: # ip:port:username:password
|
||||
"""Create a ProxyConfig from a string.
|
||||
|
||||
Supported formats:
|
||||
- 'http://username:password@ip:port'
|
||||
- 'http://ip:port'
|
||||
- 'socks5://ip:port'
|
||||
- 'ip:port:username:password'
|
||||
- 'ip:port'
|
||||
"""
|
||||
s = (proxy_str or "").strip()
|
||||
# URL with credentials
|
||||
if "@" in s and "://" in s:
|
||||
auth_part, server_part = s.split("@", 1)
|
||||
protocol, credentials = auth_part.split("://", 1)
|
||||
if ":" in credentials:
|
||||
username, password = credentials.split(":", 1)
|
||||
return ProxyConfig(
|
||||
server=f"{protocol}://{server_part}",
|
||||
username=username,
|
||||
password=password,
|
||||
)
|
||||
# URL without credentials (keep scheme)
|
||||
if "://" in s and "@" not in s:
|
||||
return ProxyConfig(server=s)
|
||||
# Colon separated forms
|
||||
parts = s.split(":")
|
||||
if len(parts) == 4:
|
||||
ip, port, username, password = parts
|
||||
return ProxyConfig(
|
||||
server=f"http://{ip}:{port}",
|
||||
username=username,
|
||||
password=password,
|
||||
ip=ip
|
||||
)
|
||||
elif len(parts) == 2: # ip:port only
|
||||
return ProxyConfig(server=f"http://{ip}:{port}", username=username, password=password)
|
||||
if len(parts) == 2:
|
||||
ip, port = parts
|
||||
return ProxyConfig(
|
||||
server=f"http://{ip}:{port}",
|
||||
ip=ip
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Invalid proxy string format: {proxy_str}")
|
||||
return ProxyConfig(server=f"http://{ip}:{port}")
|
||||
raise ValueError(f"Invalid proxy string format: {proxy_str}")
|
||||
|
||||
@staticmethod
|
||||
def from_dict(proxy_dict: Dict) -> "ProxyConfig":
|
||||
@@ -438,6 +454,7 @@ class BrowserConfig:
|
||||
host: str = "localhost",
|
||||
enable_stealth: bool = False,
|
||||
):
|
||||
|
||||
self.browser_type = browser_type
|
||||
self.headless = headless
|
||||
self.browser_mode = browser_mode
|
||||
@@ -450,13 +467,22 @@ class BrowserConfig:
|
||||
if self.browser_type in ["firefox", "webkit"]:
|
||||
self.channel = ""
|
||||
self.chrome_channel = ""
|
||||
if proxy:
|
||||
warnings.warn("The 'proxy' parameter is deprecated and will be removed in a future release. Use 'proxy_config' instead.", UserWarning)
|
||||
self.proxy = proxy
|
||||
self.proxy_config = proxy_config
|
||||
if isinstance(self.proxy_config, dict):
|
||||
self.proxy_config = ProxyConfig.from_dict(self.proxy_config)
|
||||
if isinstance(self.proxy_config, str):
|
||||
self.proxy_config = ProxyConfig.from_string(self.proxy_config)
|
||||
|
||||
|
||||
if self.proxy and self.proxy_config:
|
||||
warnings.warn("Both 'proxy' and 'proxy_config' are provided. 'proxy_config' will take precedence.", UserWarning)
|
||||
self.proxy = None
|
||||
elif self.proxy:
|
||||
# Convert proxy string to ProxyConfig if proxy_config is not provided
|
||||
self.proxy_config = ProxyConfig.from_string(self.proxy)
|
||||
self.proxy = None
|
||||
|
||||
self.viewport_width = viewport_width
|
||||
self.viewport_height = viewport_height
|
||||
|
||||
@@ -148,6 +148,134 @@ class PlaywrightAdapter(BrowserAdapter):
|
||||
return Page, Error, PlaywrightTimeoutError
|
||||
|
||||
|
||||
class StealthAdapter(BrowserAdapter):
|
||||
"""Adapter for Playwright with stealth features using playwright_stealth"""
|
||||
|
||||
def __init__(self):
|
||||
self._console_script_injected = {}
|
||||
self._stealth_available = self._check_stealth_availability()
|
||||
|
||||
def _check_stealth_availability(self) -> bool:
|
||||
"""Check if playwright_stealth is available and get the correct function"""
|
||||
try:
|
||||
from playwright_stealth import stealth_async
|
||||
self._stealth_function = stealth_async
|
||||
return True
|
||||
except ImportError:
|
||||
try:
|
||||
from playwright_stealth import stealth_sync
|
||||
self._stealth_function = stealth_sync
|
||||
return True
|
||||
except ImportError:
|
||||
self._stealth_function = None
|
||||
return False
|
||||
|
||||
async def apply_stealth(self, page: Page):
|
||||
"""Apply stealth to a page if available"""
|
||||
if self._stealth_available and self._stealth_function:
|
||||
try:
|
||||
if hasattr(self._stealth_function, '__call__'):
|
||||
if 'async' in getattr(self._stealth_function, '__name__', ''):
|
||||
await self._stealth_function(page)
|
||||
else:
|
||||
self._stealth_function(page)
|
||||
except Exception as e:
|
||||
# Fail silently or log error depending on requirements
|
||||
pass
|
||||
|
||||
async def evaluate(self, page: Page, expression: str, arg: Any = None) -> Any:
|
||||
"""Standard Playwright evaluate with stealth applied"""
|
||||
if arg is not None:
|
||||
return await page.evaluate(expression, arg)
|
||||
return await page.evaluate(expression)
|
||||
|
||||
async def setup_console_capture(self, page: Page, captured_console: List[Dict]) -> Optional[Callable]:
|
||||
"""Setup console capture using Playwright's event system with stealth"""
|
||||
# Apply stealth to the page first
|
||||
await self.apply_stealth(page)
|
||||
|
||||
def handle_console_capture(msg):
|
||||
try:
|
||||
message_type = "unknown"
|
||||
try:
|
||||
message_type = msg.type
|
||||
except:
|
||||
pass
|
||||
|
||||
message_text = "unknown"
|
||||
try:
|
||||
message_text = msg.text
|
||||
except:
|
||||
pass
|
||||
|
||||
entry = {
|
||||
"type": message_type,
|
||||
"text": message_text,
|
||||
"timestamp": time.time()
|
||||
}
|
||||
|
||||
captured_console.append(entry)
|
||||
|
||||
except Exception as e:
|
||||
captured_console.append({
|
||||
"type": "console_capture_error",
|
||||
"error": str(e),
|
||||
"timestamp": time.time()
|
||||
})
|
||||
|
||||
page.on("console", handle_console_capture)
|
||||
return handle_console_capture
|
||||
|
||||
async def setup_error_capture(self, page: Page, captured_console: List[Dict]) -> Optional[Callable]:
|
||||
"""Setup error capture using Playwright's event system"""
|
||||
def handle_pageerror_capture(err):
|
||||
try:
|
||||
error_message = "Unknown error"
|
||||
try:
|
||||
error_message = err.message
|
||||
except:
|
||||
pass
|
||||
|
||||
error_stack = ""
|
||||
try:
|
||||
error_stack = err.stack
|
||||
except:
|
||||
pass
|
||||
|
||||
captured_console.append({
|
||||
"type": "error",
|
||||
"text": error_message,
|
||||
"stack": error_stack,
|
||||
"timestamp": time.time()
|
||||
})
|
||||
except Exception as e:
|
||||
captured_console.append({
|
||||
"type": "pageerror_capture_error",
|
||||
"error": str(e),
|
||||
"timestamp": time.time()
|
||||
})
|
||||
|
||||
page.on("pageerror", handle_pageerror_capture)
|
||||
return handle_pageerror_capture
|
||||
|
||||
async def retrieve_console_messages(self, page: Page) -> List[Dict]:
|
||||
"""Not needed for Playwright - messages are captured via events"""
|
||||
return []
|
||||
|
||||
async def cleanup_console_capture(self, page: Page, handle_console: Optional[Callable], handle_error: Optional[Callable]):
|
||||
"""Remove event listeners"""
|
||||
if handle_console:
|
||||
page.remove_listener("console", handle_console)
|
||||
if handle_error:
|
||||
page.remove_listener("pageerror", handle_error)
|
||||
|
||||
def get_imports(self) -> tuple:
|
||||
"""Return Playwright imports"""
|
||||
from playwright.async_api import Page, Error
|
||||
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
|
||||
return Page, Error, PlaywrightTimeoutError
|
||||
|
||||
|
||||
class UndetectedAdapter(BrowserAdapter):
|
||||
"""Adapter for undetected browser automation with stealth features"""
|
||||
|
||||
|
||||
@@ -15,6 +15,7 @@ from .js_snippet import load_js_script
|
||||
from .config import DOWNLOAD_PAGE_TIMEOUT
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from .utils import get_chromium_path
|
||||
import warnings
|
||||
|
||||
|
||||
BROWSER_DISABLE_OPTIONS = [
|
||||
@@ -613,9 +614,11 @@ class BrowserManager:
|
||||
# for all racers). Prevents 'Target page/context closed' errors.
|
||||
self._page_lock = asyncio.Lock()
|
||||
|
||||
# Stealth-related attributes
|
||||
self._stealth_instance = None
|
||||
self._stealth_cm = None
|
||||
# Stealth adapter for stealth mode
|
||||
self._stealth_adapter = None
|
||||
if self.config.enable_stealth and not self.use_undetected:
|
||||
from .browser_adapter import StealthAdapter
|
||||
self._stealth_adapter = StealthAdapter()
|
||||
|
||||
# Initialize ManagedBrowser if needed
|
||||
if self.config.use_managed_browser:
|
||||
@@ -649,16 +652,8 @@ class BrowserManager:
|
||||
else:
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
# Initialize playwright with or without stealth
|
||||
if self.config.enable_stealth and not self.use_undetected:
|
||||
# Import stealth only when needed
|
||||
from playwright_stealth import Stealth
|
||||
# Use the recommended stealth wrapper approach
|
||||
self._stealth_instance = Stealth()
|
||||
self._stealth_cm = self._stealth_instance.use_async(async_playwright())
|
||||
self.playwright = await self._stealth_cm.__aenter__()
|
||||
else:
|
||||
self.playwright = await async_playwright().start()
|
||||
# Initialize playwright
|
||||
self.playwright = await async_playwright().start()
|
||||
|
||||
if self.config.cdp_url or self.config.use_managed_browser:
|
||||
self.config.use_managed_browser = True
|
||||
@@ -741,17 +736,18 @@ class BrowserManager:
|
||||
)
|
||||
os.makedirs(browser_args["downloads_path"], exist_ok=True)
|
||||
|
||||
if self.config.proxy or self.config.proxy_config:
|
||||
if self.config.proxy:
|
||||
warnings.warn(
|
||||
"BrowserConfig.proxy is deprecated and ignored. Use proxy_config instead.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
if self.config.proxy_config:
|
||||
from playwright.async_api import ProxySettings
|
||||
|
||||
proxy_settings = (
|
||||
ProxySettings(server=self.config.proxy)
|
||||
if self.config.proxy
|
||||
else ProxySettings(
|
||||
server=self.config.proxy_config.server,
|
||||
username=self.config.proxy_config.username,
|
||||
password=self.config.proxy_config.password,
|
||||
)
|
||||
proxy_settings = ProxySettings(
|
||||
server=self.config.proxy_config.server,
|
||||
username=self.config.proxy_config.username,
|
||||
password=self.config.proxy_config.password,
|
||||
)
|
||||
browser_args["proxy"] = proxy_settings
|
||||
|
||||
@@ -1007,6 +1003,19 @@ class BrowserManager:
|
||||
signature_hash = hashlib.sha256(signature_json.encode("utf-8")).hexdigest()
|
||||
return signature_hash
|
||||
|
||||
async def _apply_stealth_to_page(self, page):
|
||||
"""Apply stealth to a page if stealth mode is enabled"""
|
||||
if self._stealth_adapter:
|
||||
try:
|
||||
await self._stealth_adapter.apply_stealth(page)
|
||||
except Exception as e:
|
||||
if self.logger:
|
||||
self.logger.warning(
|
||||
message="Failed to apply stealth to page: {error}",
|
||||
tag="STEALTH",
|
||||
params={"error": str(e)}
|
||||
)
|
||||
|
||||
async def get_page(self, crawlerRunConfig: CrawlerRunConfig):
|
||||
"""
|
||||
Get a page for the given session ID, creating a new one if needed.
|
||||
@@ -1036,6 +1045,7 @@ class BrowserManager:
|
||||
# See GH-1198: context.pages can be empty under races
|
||||
async with self._page_lock:
|
||||
page = await ctx.new_page()
|
||||
await self._apply_stealth_to_page(page)
|
||||
else:
|
||||
context = self.default_context
|
||||
pages = context.pages
|
||||
@@ -1052,6 +1062,7 @@ class BrowserManager:
|
||||
page = pages[0]
|
||||
else:
|
||||
page = await context.new_page()
|
||||
await self._apply_stealth_to_page(page)
|
||||
else:
|
||||
# Otherwise, check if we have an existing context for this config
|
||||
config_signature = self._make_config_signature(crawlerRunConfig)
|
||||
@@ -1067,6 +1078,7 @@ class BrowserManager:
|
||||
|
||||
# Create a new page from the chosen context
|
||||
page = await context.new_page()
|
||||
await self._apply_stealth_to_page(page)
|
||||
|
||||
# If a session_id is specified, store this session so we can reuse later
|
||||
if crawlerRunConfig.session_id:
|
||||
@@ -1133,19 +1145,5 @@ class BrowserManager:
|
||||
self.managed_browser = None
|
||||
|
||||
if self.playwright:
|
||||
# Handle stealth context manager cleanup if it exists
|
||||
if hasattr(self, '_stealth_cm') and self._stealth_cm is not None:
|
||||
try:
|
||||
await self._stealth_cm.__aexit__(None, None, None)
|
||||
except Exception as e:
|
||||
if self.logger:
|
||||
self.logger.error(
|
||||
message="Error closing stealth context: {error}",
|
||||
tag="ERROR",
|
||||
params={"error": str(e)}
|
||||
)
|
||||
self._stealth_cm = None
|
||||
self._stealth_instance = None
|
||||
else:
|
||||
await self.playwright.stop()
|
||||
await self.playwright.stop()
|
||||
self.playwright = None
|
||||
|
||||
@@ -122,11 +122,6 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
|
||||
|
||||
valid_links.append(base_url)
|
||||
|
||||
# If we have more valid links than capacity, limit them
|
||||
if len(valid_links) > remaining_capacity:
|
||||
valid_links = valid_links[:remaining_capacity]
|
||||
self.logger.info(f"Limiting to {remaining_capacity} URLs due to max_pages limit")
|
||||
|
||||
# Record the new depths and add to next_links
|
||||
for url in valid_links:
|
||||
depths[url] = new_depth
|
||||
@@ -146,7 +141,8 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
|
||||
"""
|
||||
queue: asyncio.PriorityQueue = asyncio.PriorityQueue()
|
||||
# Push the initial URL with score 0 and depth 0.
|
||||
await queue.put((0, 0, start_url, None))
|
||||
initial_score = self.url_scorer.score(start_url) if self.url_scorer else 0
|
||||
await queue.put((-initial_score, 0, start_url, None))
|
||||
visited: Set[str] = set()
|
||||
depths: Dict[str, int] = {start_url: 0}
|
||||
|
||||
@@ -193,7 +189,7 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
|
||||
result.metadata = result.metadata or {}
|
||||
result.metadata["depth"] = depth
|
||||
result.metadata["parent_url"] = parent_url
|
||||
result.metadata["score"] = score
|
||||
result.metadata["score"] = -score
|
||||
|
||||
# Count only successful crawls toward max_pages limit
|
||||
if result.success:
|
||||
@@ -214,7 +210,7 @@ class BestFirstCrawlingStrategy(DeepCrawlStrategy):
|
||||
for new_url, new_parent in new_links:
|
||||
new_depth = depths.get(new_url, depth + 1)
|
||||
new_score = self.url_scorer.score(new_url) if self.url_scorer else 0
|
||||
await queue.put((new_score, new_depth, new_url, new_parent))
|
||||
await queue.put((-new_score, new_depth, new_url, new_parent))
|
||||
|
||||
# End of crawl.
|
||||
|
||||
|
||||
@@ -469,10 +469,13 @@ async def handle_crawl_request(
|
||||
# await crawler.start()
|
||||
|
||||
base_config = config["crawler"]["base_config"]
|
||||
# Iterate on key-value pairs in global_config then use haseattr to set them
|
||||
# Iterate on key-value pairs in global_config then use hasattr to set them
|
||||
for key, value in base_config.items():
|
||||
if hasattr(crawler_config, key):
|
||||
setattr(crawler_config, key, value)
|
||||
current_value = getattr(crawler_config, key)
|
||||
# Only set base config if user didn't provide a value
|
||||
if current_value is None or current_value == "":
|
||||
setattr(crawler_config, key, value)
|
||||
|
||||
results = []
|
||||
func = getattr(crawler, "arun" if len(urls) == 1 else "arun_many")
|
||||
|
||||
@@ -28,25 +28,43 @@ def create_access_token(data: dict, expires_delta: Optional[timedelta] = None) -
|
||||
signing_key = get_jwk_from_secret(SECRET_KEY)
|
||||
return instance.encode(to_encode, signing_key, alg='HS256')
|
||||
|
||||
def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)) -> Dict:
|
||||
def verify_token(credentials: HTTPAuthorizationCredentials) -> Dict:
|
||||
"""Verify the JWT token from the Authorization header."""
|
||||
|
||||
if credentials is None:
|
||||
return None
|
||||
|
||||
if not credentials or not credentials.credentials:
|
||||
raise HTTPException(
|
||||
status_code=401,
|
||||
detail="No token provided",
|
||||
headers={"WWW-Authenticate": "Bearer"}
|
||||
)
|
||||
|
||||
token = credentials.credentials
|
||||
verifying_key = get_jwk_from_secret(SECRET_KEY)
|
||||
try:
|
||||
payload = instance.decode(token, verifying_key, do_time_check=True, algorithms='HS256')
|
||||
return payload
|
||||
except Exception:
|
||||
raise HTTPException(status_code=401, detail="Invalid or expired token")
|
||||
except Exception as e:
|
||||
raise HTTPException(
|
||||
status_code=401,
|
||||
detail=f"Invalid or expired token: {str(e)}",
|
||||
headers={"WWW-Authenticate": "Bearer"}
|
||||
)
|
||||
|
||||
|
||||
def get_token_dependency(config: Dict):
|
||||
"""Return the token dependency if JWT is enabled, else a function that returns None."""
|
||||
|
||||
|
||||
if config.get("security", {}).get("jwt_enabled", False):
|
||||
return verify_token
|
||||
def jwt_required(credentials: HTTPAuthorizationCredentials = Depends(security)) -> Dict:
|
||||
"""Enforce JWT authentication when enabled."""
|
||||
if credentials is None:
|
||||
raise HTTPException(
|
||||
status_code=401,
|
||||
detail="Authentication required. Please provide a valid Bearer token.",
|
||||
headers={"WWW-Authenticate": "Bearer"}
|
||||
)
|
||||
return verify_token(credentials)
|
||||
return jwt_required
|
||||
else:
|
||||
return lambda: None
|
||||
|
||||
|
||||
@@ -7520,17 +7520,18 @@ class BrowserManager:
|
||||
)
|
||||
os.makedirs(browser_args["downloads_path"], exist_ok=True)
|
||||
|
||||
if self.config.proxy or self.config.proxy_config:
|
||||
if self.config.proxy:
|
||||
warnings.warn(
|
||||
"BrowserConfig.proxy is deprecated and ignored. Use proxy_config instead.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
if self.config.proxy_config:
|
||||
from playwright.async_api import ProxySettings
|
||||
|
||||
proxy_settings = (
|
||||
ProxySettings(server=self.config.proxy)
|
||||
if self.config.proxy
|
||||
else ProxySettings(
|
||||
server=self.config.proxy_config.server,
|
||||
username=self.config.proxy_config.username,
|
||||
password=self.config.proxy_config.password,
|
||||
)
|
||||
proxy_settings = ProxySettings(
|
||||
server=self.config.proxy_config.server,
|
||||
username=self.config.proxy_config.username,
|
||||
password=self.config.proxy_config.password,
|
||||
)
|
||||
browser_args["proxy"] = proxy_settings
|
||||
|
||||
|
||||
@@ -38,8 +38,8 @@ rate_limiting:
|
||||
|
||||
# Security Configuration
|
||||
security:
|
||||
enabled: false
|
||||
jwt_enabled: false
|
||||
enabled: false
|
||||
jwt_enabled: false
|
||||
https_redirect: false
|
||||
trusted_hosts: ["*"]
|
||||
headers:
|
||||
|
||||
154
docs/examples/adaptive_crawling/llm_config_example.py
Normal file
154
docs/examples/adaptive_crawling/llm_config_example.py
Normal file
@@ -0,0 +1,154 @@
|
||||
import asyncio
|
||||
import os
|
||||
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig, LLMConfig
|
||||
|
||||
|
||||
async def test_configuration(name: str, config: AdaptiveConfig, url: str, query: str):
|
||||
"""Test a specific configuration"""
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Configuration: {name}")
|
||||
print(f"{'='*60}")
|
||||
|
||||
async with AsyncWebCrawler(verbose=False) as crawler:
|
||||
adaptive = AdaptiveCrawler(crawler, config)
|
||||
result = await adaptive.digest(start_url=url, query=query)
|
||||
|
||||
print("\n" + "="*50)
|
||||
print("CRAWL STATISTICS")
|
||||
print("="*50)
|
||||
adaptive.print_stats(detailed=False)
|
||||
|
||||
# Get the most relevant content found
|
||||
print("\n" + "="*50)
|
||||
print("MOST RELEVANT PAGES")
|
||||
print("="*50)
|
||||
|
||||
relevant_pages = adaptive.get_relevant_content(top_k=5)
|
||||
for i, page in enumerate(relevant_pages, 1):
|
||||
print(f"\n{i}. {page['url']}")
|
||||
print(f" Relevance Score: {page['score']:.2%}")
|
||||
|
||||
# Show a snippet of the content
|
||||
content = page['content'] or ""
|
||||
if content:
|
||||
snippet = content[:200].replace('\n', ' ')
|
||||
if len(content) > 200:
|
||||
snippet += "..."
|
||||
print(f" Preview: {snippet}")
|
||||
|
||||
print(f"\n{'='*50}")
|
||||
print(f"Pages crawled: {len(result.crawled_urls)}")
|
||||
print(f"Final confidence: {adaptive.confidence:.1%}")
|
||||
print(f"Stopped reason: {result.metrics.get('stopped_reason', 'max_pages')}")
|
||||
|
||||
if result.metrics.get('is_irrelevant', False):
|
||||
print("⚠️ Query detected as irrelevant!")
|
||||
|
||||
return result
|
||||
|
||||
|
||||
async def llm_embedding():
|
||||
"""Demonstrate various embedding configurations"""
|
||||
|
||||
print("EMBEDDING STRATEGY CONFIGURATION EXAMPLES")
|
||||
print("=" * 60)
|
||||
|
||||
# Base URL and query for testing
|
||||
test_url = "https://docs.python.org/3/library/asyncio.html"
|
||||
|
||||
openai_llm_config = LLMConfig(
|
||||
provider='openai/text-embedding-3-small',
|
||||
api_token=os.getenv('OPENAI_API_KEY'),
|
||||
temperature=0.7,
|
||||
max_tokens=2000
|
||||
)
|
||||
config_openai = AdaptiveConfig(
|
||||
strategy="embedding",
|
||||
max_pages=10,
|
||||
|
||||
# Use OpenAI embeddings
|
||||
embedding_llm_config=openai_llm_config,
|
||||
# embedding_llm_config={
|
||||
# 'provider': 'openai/text-embedding-3-small',
|
||||
# 'api_token': os.getenv('OPENAI_API_KEY')
|
||||
# },
|
||||
|
||||
# OpenAI embeddings are high quality, can be stricter
|
||||
embedding_k_exp=4.0,
|
||||
n_query_variations=12
|
||||
)
|
||||
|
||||
await test_configuration(
|
||||
"OpenAI Embeddings",
|
||||
config_openai,
|
||||
test_url,
|
||||
# "event-driven architecture patterns"
|
||||
"async await context managers coroutines"
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
|
||||
async def basic_adaptive_crawling():
|
||||
"""Basic adaptive crawling example"""
|
||||
|
||||
# Initialize the crawler
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
# Create an adaptive crawler with default settings (statistical strategy)
|
||||
adaptive = AdaptiveCrawler(crawler)
|
||||
|
||||
# Note: You can also use embedding strategy for semantic understanding:
|
||||
# from crawl4ai import AdaptiveConfig
|
||||
# config = AdaptiveConfig(strategy="embedding")
|
||||
# adaptive = AdaptiveCrawler(crawler, config)
|
||||
|
||||
# Start adaptive crawling
|
||||
print("Starting adaptive crawl for Python async programming information...")
|
||||
result = await adaptive.digest(
|
||||
start_url="https://docs.python.org/3/library/asyncio.html",
|
||||
query="async await context managers coroutines"
|
||||
)
|
||||
|
||||
# Display crawl statistics
|
||||
print("\n" + "="*50)
|
||||
print("CRAWL STATISTICS")
|
||||
print("="*50)
|
||||
adaptive.print_stats(detailed=False)
|
||||
|
||||
# Get the most relevant content found
|
||||
print("\n" + "="*50)
|
||||
print("MOST RELEVANT PAGES")
|
||||
print("="*50)
|
||||
|
||||
relevant_pages = adaptive.get_relevant_content(top_k=5)
|
||||
for i, page in enumerate(relevant_pages, 1):
|
||||
print(f"\n{i}. {page['url']}")
|
||||
print(f" Relevance Score: {page['score']:.2%}")
|
||||
|
||||
# Show a snippet of the content
|
||||
content = page['content'] or ""
|
||||
if content:
|
||||
snippet = content[:200].replace('\n', ' ')
|
||||
if len(content) > 200:
|
||||
snippet += "..."
|
||||
print(f" Preview: {snippet}")
|
||||
|
||||
# Show final confidence
|
||||
print(f"\n{'='*50}")
|
||||
print(f"Final Confidence: {adaptive.confidence:.2%}")
|
||||
print(f"Total Pages Crawled: {len(result.crawled_urls)}")
|
||||
print(f"Knowledge Base Size: {len(adaptive.state.knowledge_base)} documents")
|
||||
|
||||
|
||||
if adaptive.confidence >= 0.8:
|
||||
print("✓ High confidence - can answer detailed questions about async Python")
|
||||
elif adaptive.confidence >= 0.6:
|
||||
print("~ Moderate confidence - can answer basic questions")
|
||||
else:
|
||||
print("✗ Low confidence - need more information")
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(llm_embedding())
|
||||
# asyncio.run(basic_adaptive_crawling())
|
||||
@@ -7,13 +7,13 @@ Simple proxy configuration with `BrowserConfig`:
|
||||
```python
|
||||
from crawl4ai.async_configs import BrowserConfig
|
||||
|
||||
# Using proxy URL
|
||||
browser_config = BrowserConfig(proxy="http://proxy.example.com:8080")
|
||||
# Using HTTP proxy
|
||||
browser_config = BrowserConfig(proxy_config={"server": "http://proxy.example.com:8080"})
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(url="https://example.com")
|
||||
|
||||
# Using SOCKS proxy
|
||||
browser_config = BrowserConfig(proxy="socks5://proxy.example.com:1080")
|
||||
browser_config = BrowserConfig(proxy_config={"server": "socks5://proxy.example.com:1080"})
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(url="https://example.com")
|
||||
```
|
||||
@@ -25,7 +25,11 @@ Use an authenticated proxy with `BrowserConfig`:
|
||||
```python
|
||||
from crawl4ai.async_configs import BrowserConfig
|
||||
|
||||
browser_config = BrowserConfig(proxy="http://[username]:[password]@[host]:[port]")
|
||||
browser_config = BrowserConfig(proxy_config={
|
||||
"server": "http://[host]:[port]",
|
||||
"username": "[username]",
|
||||
"password": "[password]",
|
||||
})
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(url="https://example.com")
|
||||
```
|
||||
|
||||
@@ -23,7 +23,7 @@ browser_cfg = BrowserConfig(
|
||||
| **`headless`** | `bool` (default: `True`) | Headless means no visible UI. `False` is handy for debugging. |
|
||||
| **`viewport_width`** | `int` (default: `1080`) | Initial page width (in px). Useful for testing responsive layouts. |
|
||||
| **`viewport_height`** | `int` (default: `600`) | Initial page height (in px). |
|
||||
| **`proxy`** | `str` (default: `None`) | Single-proxy URL if you want all traffic to go through it, e.g. `"http://user:pass@proxy:8080"`. |
|
||||
| **`proxy`** | `str` (deprecated) | Deprecated. Use `proxy_config` instead. If set, it will be auto-converted internally. |
|
||||
| **`proxy_config`** | `dict` (default: `None`) | For advanced or multi-proxy needs, specify details like `{"server": "...", "username": "...", ...}`. |
|
||||
| **`use_persistent_context`** | `bool` (default: `False`) | If `True`, uses a **persistent** browser context (keep cookies, sessions across runs). Also sets `use_managed_browser=True`. |
|
||||
| **`user_data_dir`** | `str or None` (default: `None`) | Directory to store user data (profiles, cookies). Must be set if you want permanent sessions. |
|
||||
|
||||
@@ -108,7 +108,19 @@ config = AdaptiveConfig(
|
||||
embedding_min_confidence_threshold=0.1 # Stop if completely irrelevant
|
||||
)
|
||||
|
||||
# With custom embedding provider (e.g., OpenAI)
|
||||
# With custom LLM provider for query expansion (recommended)
|
||||
from crawl4ai import LLMConfig
|
||||
|
||||
config = AdaptiveConfig(
|
||||
strategy="embedding",
|
||||
embedding_llm_config=LLMConfig(
|
||||
provider='openai/text-embedding-3-small',
|
||||
api_token='your-api-key',
|
||||
temperature=0.7
|
||||
)
|
||||
)
|
||||
|
||||
# Alternative: Dictionary format (backward compatible)
|
||||
config = AdaptiveConfig(
|
||||
strategy="embedding",
|
||||
embedding_llm_config={
|
||||
|
||||
154
tests/adaptive/test_llm_embedding.py
Normal file
154
tests/adaptive/test_llm_embedding.py
Normal file
@@ -0,0 +1,154 @@
|
||||
import asyncio
|
||||
import os
|
||||
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig, LLMConfig
|
||||
|
||||
|
||||
async def test_configuration(name: str, config: AdaptiveConfig, url: str, query: str):
|
||||
"""Test a specific configuration"""
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Configuration: {name}")
|
||||
print(f"{'='*60}")
|
||||
|
||||
async with AsyncWebCrawler(verbose=False) as crawler:
|
||||
adaptive = AdaptiveCrawler(crawler, config)
|
||||
result = await adaptive.digest(start_url=url, query=query)
|
||||
|
||||
print("\n" + "="*50)
|
||||
print("CRAWL STATISTICS")
|
||||
print("="*50)
|
||||
adaptive.print_stats(detailed=False)
|
||||
|
||||
# Get the most relevant content found
|
||||
print("\n" + "="*50)
|
||||
print("MOST RELEVANT PAGES")
|
||||
print("="*50)
|
||||
|
||||
relevant_pages = adaptive.get_relevant_content(top_k=5)
|
||||
for i, page in enumerate(relevant_pages, 1):
|
||||
print(f"\n{i}. {page['url']}")
|
||||
print(f" Relevance Score: {page['score']:.2%}")
|
||||
|
||||
# Show a snippet of the content
|
||||
content = page['content'] or ""
|
||||
if content:
|
||||
snippet = content[:200].replace('\n', ' ')
|
||||
if len(content) > 200:
|
||||
snippet += "..."
|
||||
print(f" Preview: {snippet}")
|
||||
|
||||
print(f"\n{'='*50}")
|
||||
print(f"Pages crawled: {len(result.crawled_urls)}")
|
||||
print(f"Final confidence: {adaptive.confidence:.1%}")
|
||||
print(f"Stopped reason: {result.metrics.get('stopped_reason', 'max_pages')}")
|
||||
|
||||
if result.metrics.get('is_irrelevant', False):
|
||||
print("⚠️ Query detected as irrelevant!")
|
||||
|
||||
return result
|
||||
|
||||
|
||||
async def llm_embedding():
|
||||
"""Demonstrate various embedding configurations"""
|
||||
|
||||
print("EMBEDDING STRATEGY CONFIGURATION EXAMPLES")
|
||||
print("=" * 60)
|
||||
|
||||
# Base URL and query for testing
|
||||
test_url = "https://docs.python.org/3/library/asyncio.html"
|
||||
|
||||
openai_llm_config = LLMConfig(
|
||||
provider='openai/text-embedding-3-small',
|
||||
api_token=os.getenv('OPENAI_API_KEY'),
|
||||
temperature=0.7,
|
||||
max_tokens=2000
|
||||
)
|
||||
config_openai = AdaptiveConfig(
|
||||
strategy="embedding",
|
||||
max_pages=10,
|
||||
|
||||
# Use OpenAI embeddings
|
||||
embedding_llm_config=openai_llm_config,
|
||||
# embedding_llm_config={
|
||||
# 'provider': 'openai/text-embedding-3-small',
|
||||
# 'api_token': os.getenv('OPENAI_API_KEY')
|
||||
# },
|
||||
|
||||
# OpenAI embeddings are high quality, can be stricter
|
||||
embedding_k_exp=4.0,
|
||||
n_query_variations=12
|
||||
)
|
||||
|
||||
await test_configuration(
|
||||
"OpenAI Embeddings",
|
||||
config_openai,
|
||||
test_url,
|
||||
# "event-driven architecture patterns"
|
||||
"async await context managers coroutines"
|
||||
)
|
||||
return
|
||||
|
||||
|
||||
|
||||
async def basic_adaptive_crawling():
|
||||
"""Basic adaptive crawling example"""
|
||||
|
||||
# Initialize the crawler
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
# Create an adaptive crawler with default settings (statistical strategy)
|
||||
adaptive = AdaptiveCrawler(crawler)
|
||||
|
||||
# Note: You can also use embedding strategy for semantic understanding:
|
||||
# from crawl4ai import AdaptiveConfig
|
||||
# config = AdaptiveConfig(strategy="embedding")
|
||||
# adaptive = AdaptiveCrawler(crawler, config)
|
||||
|
||||
# Start adaptive crawling
|
||||
print("Starting adaptive crawl for Python async programming information...")
|
||||
result = await adaptive.digest(
|
||||
start_url="https://docs.python.org/3/library/asyncio.html",
|
||||
query="async await context managers coroutines"
|
||||
)
|
||||
|
||||
# Display crawl statistics
|
||||
print("\n" + "="*50)
|
||||
print("CRAWL STATISTICS")
|
||||
print("="*50)
|
||||
adaptive.print_stats(detailed=False)
|
||||
|
||||
# Get the most relevant content found
|
||||
print("\n" + "="*50)
|
||||
print("MOST RELEVANT PAGES")
|
||||
print("="*50)
|
||||
|
||||
relevant_pages = adaptive.get_relevant_content(top_k=5)
|
||||
for i, page in enumerate(relevant_pages, 1):
|
||||
print(f"\n{i}. {page['url']}")
|
||||
print(f" Relevance Score: {page['score']:.2%}")
|
||||
|
||||
# Show a snippet of the content
|
||||
content = page['content'] or ""
|
||||
if content:
|
||||
snippet = content[:200].replace('\n', ' ')
|
||||
if len(content) > 200:
|
||||
snippet += "..."
|
||||
print(f" Preview: {snippet}")
|
||||
|
||||
# Show final confidence
|
||||
print(f"\n{'='*50}")
|
||||
print(f"Final Confidence: {adaptive.confidence:.2%}")
|
||||
print(f"Total Pages Crawled: {len(result.crawled_urls)}")
|
||||
print(f"Knowledge Base Size: {len(adaptive.state.knowledge_base)} documents")
|
||||
|
||||
|
||||
if adaptive.confidence >= 0.8:
|
||||
print("✓ High confidence - can answer detailed questions about async Python")
|
||||
elif adaptive.confidence >= 0.6:
|
||||
print("~ Moderate confidence - can answer basic questions")
|
||||
else:
|
||||
print("✗ Low confidence - need more information")
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(llm_embedding())
|
||||
# asyncio.run(basic_adaptive_crawling())
|
||||
@@ -112,7 +112,7 @@ async def test_proxy_settings():
|
||||
headless=True,
|
||||
verbose=False,
|
||||
user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
|
||||
proxy="http://127.0.0.1:8080", # Assuming local proxy server for test
|
||||
proxy_config={"server": "http://127.0.0.1:8080"}, # Assuming local proxy server for test
|
||||
use_managed_browser=False,
|
||||
use_persistent_context=False,
|
||||
) as crawler:
|
||||
|
||||
117
tests/general/test_bff_scoring.py
Normal file
117
tests/general/test_bff_scoring.py
Normal file
@@ -0,0 +1,117 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Simple test to verify BestFirstCrawlingStrategy fixes.
|
||||
This test crawls a real website and shows that:
|
||||
1. Higher-scoring pages are crawled first (priority queue fix)
|
||||
2. Links are scored before truncation (link discovery fix)
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
|
||||
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
|
||||
|
||||
async def test_best_first_strategy():
|
||||
"""Test BestFirstCrawlingStrategy with keyword scoring"""
|
||||
|
||||
print("=" * 70)
|
||||
print("Testing BestFirstCrawlingStrategy with Real URL")
|
||||
print("=" * 70)
|
||||
print("\nThis test will:")
|
||||
print("1. Crawl Python.org documentation")
|
||||
print("2. Score pages based on keywords: 'tutorial', 'guide', 'reference'")
|
||||
print("3. Show that higher-scoring pages are crawled first")
|
||||
print("-" * 70)
|
||||
|
||||
# Create a keyword scorer that prioritizes tutorial/guide pages
|
||||
scorer = KeywordRelevanceScorer(
|
||||
keywords=["tutorial", "guide", "reference", "documentation"],
|
||||
weight=1.0,
|
||||
case_sensitive=False
|
||||
)
|
||||
|
||||
# Create the strategy with scoring
|
||||
strategy = BestFirstCrawlingStrategy(
|
||||
max_depth=2, # Crawl 2 levels deep
|
||||
max_pages=10, # Limit to 10 pages total
|
||||
url_scorer=scorer, # Use keyword scoring
|
||||
include_external=False # Only internal links
|
||||
)
|
||||
|
||||
# Configure browser and crawler
|
||||
browser_config = BrowserConfig(
|
||||
headless=True, # Run in background
|
||||
verbose=False # Reduce output noise
|
||||
)
|
||||
|
||||
crawler_config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=strategy,
|
||||
verbose=False
|
||||
)
|
||||
|
||||
print("\nStarting crawl of https://docs.python.org/3/")
|
||||
print("Looking for pages with keywords: tutorial, guide, reference, documentation")
|
||||
print("-" * 70)
|
||||
|
||||
crawled_urls = []
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
# Crawl and collect results
|
||||
results = await crawler.arun(
|
||||
url="https://docs.python.org/3/",
|
||||
config=crawler_config
|
||||
)
|
||||
|
||||
# Process results
|
||||
if isinstance(results, list):
|
||||
for result in results:
|
||||
score = result.metadata.get('score', 0) if result.metadata else 0
|
||||
depth = result.metadata.get('depth', 0) if result.metadata else 0
|
||||
crawled_urls.append({
|
||||
'url': result.url,
|
||||
'score': score,
|
||||
'depth': depth,
|
||||
'success': result.success
|
||||
})
|
||||
|
||||
print("\n" + "=" * 70)
|
||||
print("CRAWL RESULTS (in order of crawling)")
|
||||
print("=" * 70)
|
||||
|
||||
for i, item in enumerate(crawled_urls, 1):
|
||||
status = "✓" if item['success'] else "✗"
|
||||
# Highlight high-scoring pages
|
||||
if item['score'] > 0.5:
|
||||
print(f"{i:2}. [{status}] Score: {item['score']:.2f} | Depth: {item['depth']} | {item['url']}")
|
||||
print(f" ^ HIGH SCORE - Contains keywords!")
|
||||
else:
|
||||
print(f"{i:2}. [{status}] Score: {item['score']:.2f} | Depth: {item['depth']} | {item['url']}")
|
||||
|
||||
print("\n" + "=" * 70)
|
||||
print("ANALYSIS")
|
||||
print("=" * 70)
|
||||
|
||||
# Check if higher scores appear early in the crawl
|
||||
scores = [item['score'] for item in crawled_urls[1:]] # Skip initial URL
|
||||
high_score_indices = [i for i, s in enumerate(scores) if s > 0.3]
|
||||
|
||||
if high_score_indices and high_score_indices[0] < len(scores) / 2:
|
||||
print("✅ SUCCESS: Higher-scoring pages (with keywords) were crawled early!")
|
||||
print(" This confirms the priority queue fix is working.")
|
||||
else:
|
||||
print("⚠️ Check the crawl order above - higher scores should appear early")
|
||||
|
||||
# Show score distribution
|
||||
print(f"\nScore Statistics:")
|
||||
print(f" - Total pages crawled: {len(crawled_urls)}")
|
||||
print(f" - Average score: {sum(item['score'] for item in crawled_urls) / len(crawled_urls):.2f}")
|
||||
print(f" - Max score: {max(item['score'] for item in crawled_urls):.2f}")
|
||||
print(f" - Pages with keywords: {sum(1 for item in crawled_urls if item['score'] > 0.3)}")
|
||||
|
||||
print("\n" + "=" * 70)
|
||||
print("TEST COMPLETE")
|
||||
print("=" * 70)
|
||||
|
||||
if __name__ == "__main__":
|
||||
print("\n🔍 BestFirstCrawlingStrategy Simple Test\n")
|
||||
asyncio.run(test_best_first_strategy())
|
||||
@@ -24,7 +24,7 @@ CASES = [
|
||||
# --- BrowserConfig variants ---
|
||||
"BrowserConfig()",
|
||||
"BrowserConfig(headless=False, extra_args=['--disable-gpu'])",
|
||||
"BrowserConfig(browser_mode='builtin', proxy='http://1.2.3.4:8080')",
|
||||
"BrowserConfig(browser_mode='builtin', proxy_config={'server': 'http://1.2.3.4:8080'})",
|
||||
]
|
||||
|
||||
for code in CASES:
|
||||
|
||||
42
tests/proxy/test_proxy_deprecation.py
Normal file
42
tests/proxy/test_proxy_deprecation.py
Normal file
@@ -0,0 +1,42 @@
|
||||
import warnings
|
||||
|
||||
import pytest
|
||||
|
||||
from crawl4ai.async_configs import BrowserConfig, ProxyConfig
|
||||
|
||||
|
||||
def test_browser_config_proxy_string_emits_deprecation_and_autoconverts():
|
||||
warnings.simplefilter("always", DeprecationWarning)
|
||||
|
||||
proxy_str = "23.95.150.145:6114:username:password"
|
||||
with warnings.catch_warnings(record=True) as caught:
|
||||
cfg = BrowserConfig(proxy=proxy_str, headless=True)
|
||||
|
||||
dep_warnings = [w for w in caught if issubclass(w.category, DeprecationWarning)]
|
||||
assert dep_warnings, "Expected DeprecationWarning when using BrowserConfig(proxy=...)"
|
||||
|
||||
assert cfg.proxy is None, "cfg.proxy should be None after auto-conversion"
|
||||
assert isinstance(cfg.proxy_config, ProxyConfig), "cfg.proxy_config should be ProxyConfig instance"
|
||||
assert cfg.proxy_config.username == "username"
|
||||
assert cfg.proxy_config.password == "password"
|
||||
assert cfg.proxy_config.server.startswith("http://")
|
||||
assert cfg.proxy_config.server.endswith(":6114")
|
||||
|
||||
|
||||
def test_browser_config_with_proxy_config_emits_no_deprecation():
|
||||
warnings.simplefilter("always", DeprecationWarning)
|
||||
|
||||
with warnings.catch_warnings(record=True) as caught:
|
||||
cfg = BrowserConfig(
|
||||
headless=True,
|
||||
proxy_config={
|
||||
"server": "http://127.0.0.1:8080",
|
||||
"username": "u",
|
||||
"password": "p",
|
||||
},
|
||||
)
|
||||
|
||||
dep_warnings = [w for w in caught if issubclass(w.category, DeprecationWarning)]
|
||||
assert not dep_warnings, "Did not expect DeprecationWarning when using proxy_config"
|
||||
assert cfg.proxy is None
|
||||
assert isinstance(cfg.proxy_config, ProxyConfig)
|
||||
Reference in New Issue
Block a user