- Updated ProxyConfig.from_string to support multiple proxy formats, including URLs with credentials.
- Deprecated the 'proxy' parameter in BrowserConfig, replacing it with 'proxy_config' for better flexibility.
- Added warnings for deprecated usage and clarified behavior when both parameters are provided.
- Updated documentation and tests to reflect changes in proxy configuration handling.
11633 lines
430 KiB
Markdown
11633 lines
430 KiB
Markdown
# Crawl4AI Code Context
|
||
|
||
Generated on 2025-04-21
|
||
|
||
## File: crawl4ai/async_configs.py
|
||
|
||
```py
|
||
import os
|
||
from .config import (
|
||
DEFAULT_PROVIDER,
|
||
DEFAULT_PROVIDER_API_KEY,
|
||
MIN_WORD_THRESHOLD,
|
||
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
|
||
PROVIDER_MODELS,
|
||
PROVIDER_MODELS_PREFIXES,
|
||
SCREENSHOT_HEIGHT_TRESHOLD,
|
||
PAGE_TIMEOUT,
|
||
IMAGE_SCORE_THRESHOLD,
|
||
SOCIAL_MEDIA_DOMAINS,
|
||
)
|
||
|
||
from .user_agent_generator import UAGen, ValidUAGenerator # , OnlineUAGenerator
|
||
from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy
|
||
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
||
|
||
from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator
|
||
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
|
||
from .deep_crawling import DeepCrawlStrategy
|
||
|
||
from .cache_context import CacheMode
|
||
from .proxy_strategy import ProxyRotationStrategy
|
||
|
||
from typing import Union, List
|
||
import inspect
|
||
from typing import Any, Dict, Optional
|
||
from enum import Enum
|
||
|
||
# from .proxy_strategy import ProxyConfig
|
||
|
||
|
||
|
||
def to_serializable_dict(obj: Any, ignore_default_value: bool = False) -> Dict:
    """
    Recursively convert an object into a JSON-friendly structure.

    Conversion rules, applied in this order:
        * ``None`` and ``str``/``int``/``float``/``bool`` are returned unchanged.
        * ``Enum`` members become ``{"type": <class name>, "params": <value>}``.
        * Anything exposing ``isoformat`` (e.g. dates) becomes its ISO string.
        * Any non-dict iterable (list, tuple, set, frozenset, ...) becomes a
          list of converted items.
        * Dicts become ``{"type": "dict", "value": {...}}`` with string keys.
        * Every other object becomes ``{"type": <class name>, "params": {...}}``
          where ``params`` holds the constructor arguments whose current value
          differs from the constructor default.

    Args:
        obj: The object to convert.
        ignore_default_value: When True, no constructor argument is emitted
            for ``obj`` itself (values read from private ``__slots__`` entries
            are still emitted). The flag is not forwarded to nested values.

    Returns:
        The serializable representation described above.
    """
    # None and JSON primitives need no conversion.
    if obj is None or isinstance(obj, (str, int, float, bool)):
        return obj

    if isinstance(obj, Enum):
        return {"type": obj.__class__.__name__, "params": obj.value}

    # Date/time-like objects advertise themselves through ``isoformat``.
    if hasattr(obj, "isoformat"):
        return obj.isoformat()

    # Every iterable except a dict is flattened into a list.
    if isinstance(obj, (list, tuple, set)) or (
        hasattr(obj, "__iter__") and not isinstance(obj, dict)
    ):
        return [to_serializable_dict(element) for element in obj]

    # Dicts are tagged so they can be told apart from class instances on load.
    if isinstance(obj, dict):
        converted = {str(key): to_serializable_dict(val) for key, val in obj.items()}
        return {"type": "dict", "value": converted}

    # Class instance: rebuild the constructor arguments from the attributes
    # that share a name with an ``__init__`` parameter.
    init_params = inspect.signature(obj.__class__.__init__).parameters
    collected = {}
    for arg_name, arg in init_params.items():
        if arg_name == "self":
            continue

        current = getattr(obj, arg_name, arg.default)

        # Two "empty" values (None, [], {}, "", ...) count as equal.
        if is_empty_value(current) and is_empty_value(arg.default):
            continue
        if current != arg.default and not ignore_default_value:
            collected[arg_name] = to_serializable_dict(current)

    # Private slots ("_name") are exported under their public name ("name").
    for slot_name in getattr(obj, "__slots__", ()):
        if not slot_name.startswith("_"):
            continue
        slot_value = getattr(obj, slot_name, None)
        if slot_value is not None:
            collected[slot_name[1:]] = to_serializable_dict(slot_value)

    return {"type": obj.__class__.__name__, "params": collected}
|
||
|
||
|
||
def from_serializable_dict(data: Any) -> Any:
    """
    Recursively convert a serializable structure back to an object instance.

    Handled shapes:
        * ``None`` and ``str``/``int``/``float``/``bool`` are returned unchanged.
        * ``{"type": "dict", "value": {...}}`` becomes a plain dict.
        * ``{"type": <name>, "params": ...}`` where ``<name>`` is a class
          exported by the ``crawl4ai`` package becomes an instance of that
          class (``cls(params)`` for Enums, ``cls(**params)`` otherwise).
        * Lists are converted element by element.
        * Any other dict (legacy raw dictionaries, including ones that merely
          happen to contain a ``"type"`` key) is converted value by value.

    Args:
        data: The structure to convert.

    Returns:
        The reconstructed object, or ``data`` itself when no rule applies.

    Raises:
        KeyError: If ``"type"`` names an Enum class but ``"params"`` is missing.
    """
    if data is None:
        return None

    # JSON primitives are already in their final form.
    if isinstance(data, (str, int, float, bool)):
        return data

    # Handle typed data
    if isinstance(data, dict) and "type" in data:
        type_name = data["type"]

        # Plain dictionaries are wrapped as {"type": "dict", "value": {...}}.
        if type_name == "dict" and "value" in data:
            return {k: from_serializable_dict(v) for k, v in data["value"].items()}

        # Only a string can name a class; a raw dict may carry any value under
        # "type", and attribute lookup with a non-string name raises TypeError.
        if isinstance(type_name, str):
            # Imported at call time (presumably to avoid a circular import at
            # module load - confirm).
            import crawl4ai

            cls = getattr(crawl4ai, type_name, None)

            # The name may resolve to a module or function; only real classes
            # can be passed to issubclass() or instantiated.
            if inspect.isclass(cls):
                # Handle Enum
                if issubclass(cls, Enum):
                    return cls(data["params"])

                if "params" in data:
                    # Handle class instances
                    constructor_args = {
                        k: from_serializable_dict(v) for k, v in data["params"].items()
                    }
                    return cls(**constructor_args)

    # Handle lists
    if isinstance(data, list):
        return [from_serializable_dict(item) for item in data]

    # Handle raw dictionaries (legacy support)
    if isinstance(data, dict):
        return {k: from_serializable_dict(v) for k, v in data.items()}

    return data
|
||
|
||
|
||
def is_empty_value(value: Any) -> bool:
    """Return True for None and for a zero-length list, tuple, set, dict or str."""
    if value is None:
        return True
    sized_kinds = (list, tuple, set, dict, str)
    return isinstance(value, sized_kinds) and len(value) == 0
|
||
|
||
class ProxyConfig:
    """Connection settings (server, credentials, host/IP) for a single proxy."""

    def __init__(
        self,
        server: str,
        username: Optional[str] = None,
        password: Optional[str] = None,
        ip: Optional[str] = None,
    ):
        """Configuration class for a single proxy.

        Args:
            server: Proxy server URL (e.g., "http://127.0.0.1:8080")
            username: Optional username for proxy authentication
            password: Optional password for proxy authentication
            ip: Optional IP address for verification purposes
        """
        self.server = server
        self.username = username
        self.password = password

        # Extract IP from server if not explicitly provided
        self.ip = ip or self._extract_ip_from_server()

    def _extract_ip_from_server(self) -> Optional[str]:
        """Return the host part of ``self.server``.

        The scheme, any path, any ``user:pass@`` prefix and the port are
        stripped in that order. Bracketed IPv6 literals are not handled.

        Returns:
            The host/IP string, or None when ``self.server`` is not a string.
        """
        if not isinstance(self.server, str):
            return None
        host = self.server.split("://", 1)[-1]
        host = host.split("/", 1)[0]
        # Credentials embedded in the URL must not be mistaken for the host.
        host = host.rsplit("@", 1)[-1]
        return host.split(":", 1)[0]

    @staticmethod
    def from_string(proxy_str: str) -> "ProxyConfig":
        """Create a ProxyConfig from a proxy string.

        Supported formats (surrounding whitespace is ignored):
            * ``scheme://username:password@host:port``
            * ``scheme://host:port``
            * ``ip:port:username:password``
            * ``ip:port``

        For the URL forms the credentials are moved out of the URL into
        ``username``/``password`` and ``server`` keeps the given scheme. The
        colon-separated forms always produce an ``http://`` server.

        Raises:
            ValueError: If the string matches none of the formats above.
        """
        text = proxy_str.strip()

        if "://" in text:
            scheme, _, remainder = text.partition("://")
            # rpartition: a password may itself contain "@".
            credentials, at_sign, host_port = remainder.rpartition("@")
            if not scheme or not host_port:
                raise ValueError(f"Invalid proxy string format: {proxy_str}")
            server = f"{scheme}://{host_port}"
            if not at_sign:
                return ProxyConfig(server=server)
            # partition: a password may itself contain ":".
            username, colon, password = credentials.partition(":")
            return ProxyConfig(
                server=server,
                username=username,
                password=password if colon else None,
            )

        parts = text.split(":")
        if len(parts) == 4:  # ip:port:username:password
            ip, port, username, password = parts
            return ProxyConfig(
                server=f"http://{ip}:{port}",
                username=username,
                password=password,
                ip=ip,
            )
        if len(parts) == 2:  # ip:port only
            ip, port = parts
            return ProxyConfig(server=f"http://{ip}:{port}", ip=ip)

        raise ValueError(f"Invalid proxy string format: {proxy_str}")

    @staticmethod
    def from_dict(proxy_dict: Dict) -> "ProxyConfig":
        """Create a ProxyConfig from a dictionary.

        Reads the keys ``server``, ``username``, ``password`` and ``ip``;
        missing keys are treated as None.
        """
        return ProxyConfig(
            server=proxy_dict.get("server"),
            username=proxy_dict.get("username"),
            password=proxy_dict.get("password"),
            ip=proxy_dict.get("ip"),
        )

    @staticmethod
    def from_env(env_var: str = "PROXIES") -> List["ProxyConfig"]:
        """Load proxies from environment variable.

        Args:
            env_var: Name of environment variable containing comma-separated proxy strings

        Returns:
            List of ProxyConfig objects. Blank entries are ignored. A
            malformed entry is reported on stdout and skipped, so the
            remaining entries are still loaded.
        """
        proxies = []
        for entry in os.getenv(env_var, "").split(","):
            entry = entry.strip()
            if not entry:
                continue
            try:
                proxies.append(ProxyConfig.from_string(entry))
            except ValueError as e:
                # Best effort: one bad entry must not discard the others.
                print(f"Error loading proxies from environment: {e}")
        return proxies

    def to_dict(self) -> Dict:
        """Convert to dictionary representation."""
        return {
            "server": self.server,
            "username": self.username,
            "password": self.password,
            "ip": self.ip,
        }

    def clone(self, **kwargs) -> "ProxyConfig":
        """Create a copy of this configuration with updated values.

        Note that ``ip`` is copied as-is: pass ``ip=`` explicitly when
        overriding ``server``, otherwise the copy keeps the previous value.

        Args:
            **kwargs: Key-value pairs of configuration options to update

        Returns:
            ProxyConfig: A new instance with the specified updates
        """
        config_dict = self.to_dict()
        config_dict.update(kwargs)
        return ProxyConfig.from_dict(config_dict)
|
||
|
||
|
||
|
||
class BrowserConfig:
    """
    Configuration class for setting up a browser instance and its context in AsyncPlaywrightCrawlerStrategy.

    This class centralizes all parameters that affect browser and context creation. Instead of passing
    scattered keyword arguments, users can instantiate and modify this configuration object. The crawler
    code will then reference these settings to initialize the browser in a consistent, documented manner.

    Attributes:
        browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit".
                            Default: "chromium".
        headless (bool): Whether to run the browser in headless mode (no visible GUI). None is treated as True.
                         Default: True.
        browser_mode (str): Determines how the browser should be initialized:
                            "builtin" - use the builtin CDP browser running in background
                            "dedicated" - create a new dedicated browser instance each time
                            "cdp" / "custom" - use explicit CDP settings provided in cdp_url
                            "docker" - run browser in Docker container with isolation
                            Default: "dedicated"
        use_managed_browser (bool): Launch the browser using a managed approach (e.g., via CDP), allowing
                                    advanced manipulation. Forced to True when browser_mode is "builtin" or
                                    "docker", when browser_mode is "cdp"/"custom" and cdp_url is set, or when
                                    use_persistent_context is True. Default: False.
        cdp_url (str or None): URL for the Chrome DevTools Protocol (CDP) endpoint,
                               e.g. "ws://localhost:9222/devtools/browser/". Default: None.
        debugging_port (int): Port for the browser debugging protocol. Default: 9222.
        host (str): Host name stored next to debugging_port (presumably the debugging host - confirm
                    against the browser manager). Default: "localhost".
        use_persistent_context (bool): Use a persistent browser context (like a persistent profile).
                                       Automatically sets use_managed_browser=True. Default: False.
        user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a
                                     temporary directory may be used. Default: None.
        chrome_channel (str): The Chrome channel to launch (e.g., "chrome", "msedge"). Only applies if browser_type
                              is "chromium"; reset to "" for "firefox" and "webkit". Default: "chromium".
        channel (str): The channel to launch (e.g., "chromium", "chrome", "msedge"). Only applies if browser_type
                       is "chromium"; reset to "" for "firefox" and "webkit". Default: "chromium".
        proxy (Optional[str]): Proxy server URL (e.g., "http://username:password@proxy:port"). If None, no proxy is used.
                               Default: None.
        proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
                                                    If None, no additional proxy config. Default: None.
        viewport_width (int): Default viewport width for pages. Default: 1080.
        viewport_height (int): Default viewport height for pages. Default: 600.
        viewport (dict): Default viewport dimensions for pages. If set, overrides viewport_width and viewport_height
                         (missing "width"/"height" keys fall back to 1080/600). Default: None.
        verbose (bool): Enable verbose logging.
                        Default: True.
        accept_downloads (bool): Whether to allow file downloads. If True, requires a downloads_path.
                                 Default: False.
        downloads_path (str or None): Directory to store downloaded files. If None and accept_downloads is True,
                                      a default path will be created. Default: None.
        storage_state (str or dict or None): An in-memory storage state (cookies, localStorage).
                                             Default: None.
        ignore_https_errors (bool): Ignore HTTPS certificate errors. Default: True.
        java_script_enabled (bool): Enable JavaScript execution in pages. Default: True.
        sleep_on_close (bool): Stored as given; its effect is implemented outside this class
                               (NOTE(review): document once confirmed). Default: False.
        cookies (list): List of cookies to add to the browser context. Each cookie is a dict with fields like
                        {"name": "...", "value": "...", "url": "..."}.
                        Default: [].
        headers (dict): Extra HTTP headers to apply to all requests in this context. The mapping is copied and a
                        "sec-ch-ua" entry derived from the user agent is added when absent.
                        Default: {}.
        user_agent (str): Custom User-Agent string to use. Default: "Mozilla/5.0 (X11; Linux x86_64) "
                          "AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36".
        user_agent_mode (str or None): Mode for generating the user agent (e.g., "random"). Any other value uses the
                                       provided user_agent as-is. Default: "".
        user_agent_generator_config (dict or None): Configuration for user agent generation if user_agent_mode is set.
                                                    None is stored as {}. Default: None.
        text_mode (bool): If True, disables images and other rich content for potentially faster load times.
                          Default: False.
        light_mode (bool): Disables certain background features for performance gains. Default: False.
        extra_args (list): Additional command-line arguments passed to the browser.
                           Default: [].
    """

    def __init__(
        self,
        browser_type: str = "chromium",
        headless: bool = True,
        browser_mode: str = "dedicated",
        use_managed_browser: bool = False,
        cdp_url: str = None,
        use_persistent_context: bool = False,
        user_data_dir: str = None,
        chrome_channel: str = "chromium",
        channel: str = "chromium",
        proxy: str = None,
        proxy_config: Union["ProxyConfig", dict, None] = None,
        viewport_width: int = 1080,
        viewport_height: int = 600,
        viewport: dict = None,
        accept_downloads: bool = False,
        downloads_path: str = None,
        storage_state: Union[str, dict, None] = None,
        ignore_https_errors: bool = True,
        java_script_enabled: bool = True,
        sleep_on_close: bool = False,
        verbose: bool = True,
        cookies: list = None,
        headers: dict = None,
        user_agent: str = (
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36"
        ),
        user_agent_mode: str = "",
        user_agent_generator_config: dict = None,
        text_mode: bool = False,
        light_mode: bool = False,
        extra_args: list = None,
        debugging_port: int = 9222,
        host: str = "localhost",
    ):
        """Store the settings and derive the dependent ones.

        See the class docstring for the meaning of each argument. Derived
        values: channel fields (cleared for firefox/webkit), viewport size
        (taken from ``viewport`` when given), the user agent (generated when
        ``user_agent_mode == "random"``), ``browser_hint`` and the
        ``sec-ch-ua`` header, and ``use_managed_browser``.
        """
        self.browser_type = browser_type
        # `headless or True` would discard an explicit False, so only None
        # falls back to the default.
        self.headless = True if headless is None else headless
        self.browser_mode = browser_mode
        self.use_managed_browser = use_managed_browser
        self.cdp_url = cdp_url
        self.use_persistent_context = use_persistent_context
        self.user_data_dir = user_data_dir
        self.chrome_channel = chrome_channel or self.browser_type or "chromium"
        self.channel = channel or self.browser_type or "chromium"
        # Channels are a Chromium concept only.
        if self.browser_type in ["firefox", "webkit"]:
            self.channel = ""
            self.chrome_channel = ""
        self.proxy = proxy
        self.proxy_config = proxy_config

        self.viewport_width = viewport_width
        self.viewport_height = viewport_height
        self.viewport = viewport
        if self.viewport is not None:
            self.viewport_width = self.viewport.get("width", 1080)
            self.viewport_height = self.viewport.get("height", 600)
        self.accept_downloads = accept_downloads
        self.downloads_path = downloads_path
        self.storage_state = storage_state
        self.ignore_https_errors = ignore_https_errors
        self.java_script_enabled = java_script_enabled
        self.cookies = cookies if cookies is not None else []
        # Copy: "sec-ch-ua" is inserted below and must not leak into the
        # caller's dict (or into the config this one was cloned from).
        self.headers = dict(headers) if headers is not None else {}
        self.user_agent = user_agent
        self.user_agent_mode = user_agent_mode
        # A fresh dict per instance instead of one shared mutable default.
        self.user_agent_generator_config = (
            {} if user_agent_generator_config is None else user_agent_generator_config
        )
        self.text_mode = text_mode
        self.light_mode = light_mode
        self.extra_args = extra_args if extra_args is not None else []
        self.sleep_on_close = sleep_on_close
        self.verbose = verbose
        self.debugging_port = debugging_port
        self.host = host

        # The generator is only needed (and only built) in random mode.
        if self.user_agent_mode == "random":
            self.user_agent = ValidUAGenerator().generate(
                **(self.user_agent_generator_config or {})
            )

        self.browser_hint = UAGen.generate_client_hints(self.user_agent)
        self.headers.setdefault("sec-ch-ua", self.browser_hint)

        # Set appropriate browser management flags based on browser_mode
        if self.browser_mode == "builtin":
            # Builtin mode uses managed browser connecting to builtin CDP endpoint
            # cdp_url will be set later by browser_manager
            self.use_managed_browser = True
        elif self.browser_mode == "docker":
            # Docker mode uses managed browser with CDP to connect to browser in container
            # cdp_url will be set later by docker browser strategy
            self.use_managed_browser = True
        elif self.browser_mode in ("cdp", "custom") and self.cdp_url:
            # Explicit CDP URL; "cdp" is the documented name, "custom" the
            # value historically checked here, so both are honoured.
            self.use_managed_browser = True
        # "dedicated" (a new browser instance each time) needs no adjustment.

        # If persistent context is requested, ensure managed browser is enabled
        if self.use_persistent_context:
            self.use_managed_browser = True

    @staticmethod
    def from_kwargs(kwargs: dict) -> "BrowserConfig":
        """Build a BrowserConfig from a dict of constructor arguments.

        Unknown keys are ignored; missing keys fall back to the defaults
        listed below.

        NOTE(review): the ``user_agent`` fallback used here differs from the
        ``__init__`` default - confirm which one is intended.
        """
        return BrowserConfig(
            browser_type=kwargs.get("browser_type", "chromium"),
            headless=kwargs.get("headless", True),
            browser_mode=kwargs.get("browser_mode", "dedicated"),
            use_managed_browser=kwargs.get("use_managed_browser", False),
            cdp_url=kwargs.get("cdp_url"),
            use_persistent_context=kwargs.get("use_persistent_context", False),
            user_data_dir=kwargs.get("user_data_dir"),
            chrome_channel=kwargs.get("chrome_channel", "chromium"),
            channel=kwargs.get("channel", "chromium"),
            proxy=kwargs.get("proxy"),
            proxy_config=kwargs.get("proxy_config", None),
            viewport_width=kwargs.get("viewport_width", 1080),
            viewport_height=kwargs.get("viewport_height", 600),
            viewport=kwargs.get("viewport"),
            accept_downloads=kwargs.get("accept_downloads", False),
            downloads_path=kwargs.get("downloads_path"),
            storage_state=kwargs.get("storage_state"),
            ignore_https_errors=kwargs.get("ignore_https_errors", True),
            java_script_enabled=kwargs.get("java_script_enabled", True),
            # sleep_on_close and verbose are emitted by to_dict(); forwarding
            # them keeps clone() and load() from resetting them.
            sleep_on_close=kwargs.get("sleep_on_close", False),
            verbose=kwargs.get("verbose", True),
            cookies=kwargs.get("cookies", []),
            headers=kwargs.get("headers", {}),
            user_agent=kwargs.get(
                "user_agent",
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
            ),
            user_agent_mode=kwargs.get("user_agent_mode"),
            user_agent_generator_config=kwargs.get("user_agent_generator_config"),
            text_mode=kwargs.get("text_mode", False),
            light_mode=kwargs.get("light_mode", False),
            extra_args=kwargs.get("extra_args", []),
            debugging_port=kwargs.get("debugging_port", 9222),
            host=kwargs.get("host", "localhost"),
        )

    def to_dict(self):
        """Return the current settings as a flat dict keyed by argument name.

        ``viewport`` is not included: its effect is already reflected in
        ``viewport_width`` and ``viewport_height``. Values are returned by
        reference, not copied.
        """
        return {
            "browser_type": self.browser_type,
            "headless": self.headless,
            "browser_mode": self.browser_mode,
            "use_managed_browser": self.use_managed_browser,
            "cdp_url": self.cdp_url,
            "use_persistent_context": self.use_persistent_context,
            "user_data_dir": self.user_data_dir,
            "chrome_channel": self.chrome_channel,
            "channel": self.channel,
            "proxy": self.proxy,
            "proxy_config": self.proxy_config,
            "viewport_width": self.viewport_width,
            "viewport_height": self.viewport_height,
            "accept_downloads": self.accept_downloads,
            "downloads_path": self.downloads_path,
            "storage_state": self.storage_state,
            "ignore_https_errors": self.ignore_https_errors,
            "java_script_enabled": self.java_script_enabled,
            "cookies": self.cookies,
            "headers": self.headers,
            "user_agent": self.user_agent,
            "user_agent_mode": self.user_agent_mode,
            "user_agent_generator_config": self.user_agent_generator_config,
            "text_mode": self.text_mode,
            "light_mode": self.light_mode,
            "extra_args": self.extra_args,
            "sleep_on_close": self.sleep_on_close,
            "verbose": self.verbose,
            "debugging_port": self.debugging_port,
            "host": self.host,
        }

    def clone(self, **kwargs):
        """Create a copy of this configuration with updated values.

        Args:
            **kwargs: Key-value pairs of configuration options to update

        Returns:
            BrowserConfig: A new instance with the specified updates
        """
        config_dict = self.to_dict()
        config_dict.update(kwargs)
        return BrowserConfig.from_kwargs(config_dict)

    def dump(self) -> dict:
        """Serialize this configuration with ``to_serializable_dict``."""
        return to_serializable_dict(self)

    @staticmethod
    def load(data: dict) -> "BrowserConfig":
        """Deserialize a configuration produced by ``dump``.

        If deserialization does not yield a BrowserConfig, the result is
        treated as a dict of constructor arguments and passed to
        ``from_kwargs``.
        """
        config = from_serializable_dict(data)
        if isinstance(config, BrowserConfig):
            return config
        return BrowserConfig.from_kwargs(config)
|
||
|
||
|
||
class HTTPCrawlerConfig:
    """HTTP-specific crawler configuration"""

    # Class-level defaults; __init__ overrides each of them per instance.
    method: str = "GET"
    headers: Optional[Dict[str, str]] = None
    data: Optional[Dict[str, Any]] = None
    json: Optional[Dict[str, Any]] = None
    follow_redirects: bool = True
    verify_ssl: bool = True

    def __init__(
        self,
        method: str = "GET",
        headers: Optional[Dict[str, str]] = None,
        data: Optional[Dict[str, Any]] = None,
        json: Optional[Dict[str, Any]] = None,
        follow_redirects: bool = True,
        verify_ssl: bool = True,
    ):
        """Store the request settings exactly as given.

        Args:
            method: HTTP method name.
            headers: Optional request headers.
            data: Optional request body data.
            json: Optional JSON request body.
            follow_redirects: Redirect-following flag.
            verify_ssl: SSL verification flag.
        """
        self.method, self.headers = method, headers
        self.data, self.json = data, json
        self.follow_redirects, self.verify_ssl = follow_redirects, verify_ssl

    @staticmethod
    def from_kwargs(kwargs: dict) -> "HTTPCrawlerConfig":
        """Build a config from a dict; unknown keys are ignored, missing keys use defaults."""
        fallbacks = {
            "method": "GET",
            "headers": None,
            "data": None,
            "json": None,
            "follow_redirects": True,
            "verify_ssl": True,
        }
        chosen = {key: kwargs.get(key, default) for key, default in fallbacks.items()}
        return HTTPCrawlerConfig(**chosen)

    def to_dict(self):
        """Return the six settings as a dict keyed by constructor argument name."""
        keys = ("method", "headers", "data", "json", "follow_redirects", "verify_ssl")
        return {key: getattr(self, key) for key in keys}

    def clone(self, **kwargs):
        """Create a copy of this configuration with updated values.

        Args:
            **kwargs: Key-value pairs of configuration options to update

        Returns:
            HTTPCrawlerConfig: A new instance with the specified updates
        """
        merged = {**self.to_dict(), **kwargs}
        return HTTPCrawlerConfig.from_kwargs(merged)

    def dump(self) -> dict:
        """Serialize this configuration with ``to_serializable_dict``."""
        return to_serializable_dict(self)

    @staticmethod
    def load(data: dict) -> "HTTPCrawlerConfig":
        """Deserialize a configuration produced by ``dump``.

        A result that is not already an HTTPCrawlerConfig is treated as a
        dict of constructor arguments.
        """
        restored = from_serializable_dict(data)
        if not isinstance(restored, HTTPCrawlerConfig):
            restored = HTTPCrawlerConfig.from_kwargs(restored)
        return restored
|
||
|
||
class CrawlerRunConfig():
|
||
_UNWANTED_PROPS = {
|
||
'disable_cache' : 'Instead, use cache_mode=CacheMode.DISABLED',
|
||
'bypass_cache' : 'Instead, use cache_mode=CacheMode.BYPASS',
|
||
'no_cache_read' : 'Instead, use cache_mode=CacheMode.WRITE_ONLY',
|
||
'no_cache_write' : 'Instead, use cache_mode=CacheMode.READ_ONLY',
|
||
}
|
||
|
||
"""
|
||
Configuration class for controlling how the crawler runs each crawl operation.
|
||
This includes parameters for content extraction, page manipulation, waiting conditions,
|
||
caching, and other runtime behaviors.
|
||
|
||
This centralizes parameters that were previously scattered as kwargs to `arun()` and related methods.
|
||
By using this class, you have a single place to understand and adjust the crawling options.
|
||
|
||
Attributes:
|
||
# Deep Crawl Parameters
|
||
deep_crawl_strategy (DeepCrawlStrategy or None): Strategy to use for deep crawling.
|
||
|
||
# Content Processing Parameters
|
||
word_count_threshold (int): Minimum word count threshold before processing content.
|
||
Default: MIN_WORD_THRESHOLD (typically 200).
|
||
extraction_strategy (ExtractionStrategy or None): Strategy to extract structured data from crawled pages.
|
||
Default: None (NoExtractionStrategy is used if None).
|
||
chunking_strategy (ChunkingStrategy): Strategy to chunk content before extraction.
|
||
Default: RegexChunking().
|
||
markdown_generator (MarkdownGenerationStrategy): Strategy for generating markdown.
|
||
Default: None.
|
||
only_text (bool): If True, attempt to extract text-only content where applicable.
|
||
Default: False.
|
||
css_selector (str or None): CSS selector to extract a specific portion of the page.
|
||
Default: None.
|
||
|
||
target_elements (list of str or None): List of CSS selectors for specific elements for Markdown generation
|
||
and structured data extraction. When you set this, only the contents
|
||
of these elements are processed for extraction and Markdown generation.
|
||
If you do not set any value, the entire page is processed.
|
||
The difference between this and css_selector is that this will shrink
|
||
the initial raw HTML to the selected element, while this will only affect
|
||
the extraction and Markdown generation.
|
||
Default: None
|
||
excluded_tags (list of str or None): List of HTML tags to exclude from processing.
|
||
Default: None.
|
||
excluded_selector (str or None): CSS selector to exclude from processing.
|
||
Default: None.
|
||
keep_data_attributes (bool): If True, retain `data-*` attributes while removing unwanted attributes.
|
||
Default: False.
|
||
keep_attrs (list of str): List of HTML attributes to keep during processing.
|
||
Default: [].
|
||
remove_forms (bool): If True, remove all `<form>` elements from the HTML.
|
||
Default: False.
|
||
prettiify (bool): If True, apply `fast_format_html` to produce prettified HTML output.
|
||
Default: False.
|
||
parser_type (str): Type of parser to use for HTML parsing.
|
||
Default: "lxml".
|
||
scraping_strategy (ContentScrapingStrategy): Scraping strategy to use.
|
||
Default: WebScrapingStrategy.
|
||
proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
|
||
If None, no additional proxy config. Default: None.
|
||
|
||
# SSL Parameters
|
||
fetch_ssl_certificate: bool = False,
|
||
# Caching Parameters
|
||
cache_mode (CacheMode or None): Defines how caching is handled.
|
||
If None, defaults to CacheMode.ENABLED internally.
|
||
Default: CacheMode.BYPASS.
|
||
session_id (str or None): Optional session ID to persist the browser context and the created
|
||
page instance. If the ID already exists, the crawler does not
|
||
create a new page and uses the current page to preserve the state.
|
||
bypass_cache (bool): Legacy parameter, if True acts like CacheMode.BYPASS.
|
||
Default: False.
|
||
disable_cache (bool): Legacy parameter, if True acts like CacheMode.DISABLED.
|
||
Default: False.
|
||
no_cache_read (bool): Legacy parameter, if True acts like CacheMode.WRITE_ONLY.
|
||
Default: False.
|
||
no_cache_write (bool): Legacy parameter, if True acts like CacheMode.READ_ONLY.
|
||
Default: False.
|
||
shared_data (dict or None): Shared data to be passed between hooks.
|
||
Default: None.
|
||
|
||
# Page Navigation and Timing Parameters
|
||
wait_until (str): The condition to wait for when navigating, e.g. "domcontentloaded".
|
||
Default: "domcontentloaded".
|
||
page_timeout (int): Timeout in ms for page operations like navigation.
|
||
Default: 60000 (60 seconds).
|
||
wait_for (str or None): A CSS selector or JS condition to wait for before extracting content.
|
||
Default: None.
|
||
wait_for_images (bool): If True, wait for images to load before extracting content.
|
||
Default: False.
|
||
delay_before_return_html (float): Delay in seconds before retrieving final HTML.
|
||
Default: 0.1.
|
||
mean_delay (float): Mean base delay between requests when calling arun_many.
|
||
Default: 0.1.
|
||
max_range (float): Max random additional delay range for requests in arun_many.
|
||
Default: 0.3.
|
||
semaphore_count (int): Number of concurrent operations allowed.
|
||
Default: 5.
|
||
|
||
# Page Interaction Parameters
|
||
js_code (str or list of str or None): JavaScript code/snippets to run on the page.
|
||
Default: None.
|
||
js_only (bool): If True, indicates subsequent calls are JS-driven updates, not full page loads.
|
||
Default: False.
|
||
ignore_body_visibility (bool): If True, ignore whether the body is visible before proceeding.
|
||
Default: True.
|
||
scan_full_page (bool): If True, scroll through the entire page to load all content.
|
||
Default: False.
|
||
scroll_delay (float): Delay in seconds between scroll steps if scan_full_page is True.
|
||
Default: 0.2.
|
||
process_iframes (bool): If True, attempts to process and inline iframe content.
|
||
Default: False.
|
||
remove_overlay_elements (bool): If True, remove overlays/popups before extracting HTML.
|
||
Default: False.
|
||
simulate_user (bool): If True, simulate user interactions (mouse moves, clicks) for anti-bot measures.
|
||
Default: False.
|
||
override_navigator (bool): If True, overrides navigator properties for more human-like behavior.
|
||
Default: False.
|
||
magic (bool): If True, attempts automatic handling of overlays/popups.
|
||
Default: False.
|
||
adjust_viewport_to_content (bool): If True, adjust viewport according to the page content dimensions.
|
||
Default: False.
|
||
|
||
# Media Handling Parameters
|
||
screenshot (bool): Whether to take a screenshot after crawling.
|
||
Default: False.
|
||
screenshot_wait_for (float or None): Additional wait time before taking a screenshot.
|
||
Default: None.
|
||
screenshot_height_threshold (int): Threshold for page height to decide screenshot strategy.
|
||
Default: SCREENSHOT_HEIGHT_TRESHOLD (from config, e.g. 20000).
|
||
pdf (bool): Whether to generate a PDF of the page.
|
||
Default: False.
|
||
image_description_min_word_threshold (int): Minimum words for image description extraction.
|
||
Default: IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD (e.g., 50).
|
||
image_score_threshold (int): Minimum score threshold for processing an image.
|
||
Default: IMAGE_SCORE_THRESHOLD (e.g., 3).
|
||
exclude_external_images (bool): If True, exclude all external images from processing.
|
||
Default: False.
|
||
table_score_threshold (int): Minimum score threshold for processing a table.
|
||
Default: 7.
|
||
|
||
# Link and Domain Handling Parameters
|
||
exclude_social_media_domains (list of str): List of domains to exclude for social media links.
|
||
Default: SOCIAL_MEDIA_DOMAINS (from config).
|
||
exclude_external_links (bool): If True, exclude all external links from the results.
|
||
Default: False.
|
||
exclude_internal_links (bool): If True, exclude internal links from the results.
|
||
Default: False.
|
||
exclude_social_media_links (bool): If True, exclude links pointing to social media domains.
|
||
Default: False.
|
||
exclude_domains (list of str): List of specific domains to exclude from results.
|
||
Default: [].
|
||
exclude_internal_links (bool): If True, exclude internal links from the results.
|
||
Default: False.
|
||
|
||
# Debugging and Logging Parameters
|
||
verbose (bool): Enable verbose logging.
|
||
Default: True.
|
||
log_console (bool): If True, log console messages from the page.
|
||
Default: False.
|
||
|
||
# HTTP Crwler Strategy Parameters
|
||
method (str): HTTP method to use for the request, when using AsyncHTTPCrwalerStrategy.
|
||
Default: "GET".
|
||
data (dict): Data to send in the request body, when using AsyncHTTPCrwalerStrategy.
|
||
Default: None.
|
||
json (dict): JSON data to send in the request body, when using AsyncHTTPCrwalerStrategy.
|
||
|
||
# Connection Parameters
|
||
stream (bool): If True, enables streaming of crawled URLs as they are processed when used with arun_many.
|
||
Default: False.
|
||
|
||
check_robots_txt (bool): Whether to check robots.txt rules before crawling. Default: False
|
||
Default: False.
|
||
user_agent (str): Custom User-Agent string to use.
|
||
Default: None.
|
||
user_agent_mode (str or None): Mode for generating the user agent (e.g., "random"). If None, use the provided user_agent as-is.
|
||
Default: None.
|
||
user_agent_generator_config (dict or None): Configuration for user agent generation if user_agent_mode is set.
|
||
Default: None.
|
||
|
||
# Experimental Parameters
|
||
experimental (dict): Dictionary containing experimental parameters that are in beta phase.
|
||
This allows passing temporary features that are not yet fully integrated
|
||
into the main parameter set.
|
||
Default: None.
|
||
|
||
url: str = None # This is not a compulsory parameter
|
||
"""
|
||
|
||
def __init__(
    self,
    # Content Processing Parameters
    word_count_threshold: int = MIN_WORD_THRESHOLD,
    extraction_strategy: ExtractionStrategy = None,
    chunking_strategy: ChunkingStrategy = RegexChunking(),
    markdown_generator: MarkdownGenerationStrategy = DefaultMarkdownGenerator(),
    only_text: bool = False,
    css_selector: str = None,
    target_elements: List[str] = None,
    excluded_tags: list = None,
    excluded_selector: str = None,
    keep_data_attributes: bool = False,
    keep_attrs: list = None,
    remove_forms: bool = False,
    prettiify: bool = False,
    parser_type: str = "lxml",
    scraping_strategy: ContentScrapingStrategy = None,
    proxy_config: Union[ProxyConfig, dict, None] = None,
    proxy_rotation_strategy: Optional[ProxyRotationStrategy] = None,
    # SSL Parameters
    fetch_ssl_certificate: bool = False,
    # Caching Parameters
    cache_mode: CacheMode = CacheMode.BYPASS,
    session_id: str = None,
    bypass_cache: bool = False,
    disable_cache: bool = False,
    no_cache_read: bool = False,
    no_cache_write: bool = False,
    shared_data: dict = None,
    # Page Navigation and Timing Parameters
    wait_until: str = "domcontentloaded",
    page_timeout: int = PAGE_TIMEOUT,
    wait_for: str = None,
    wait_for_images: bool = False,
    delay_before_return_html: float = 0.1,
    mean_delay: float = 0.1,
    max_range: float = 0.3,
    semaphore_count: int = 5,
    # Page Interaction Parameters
    js_code: Union[str, List[str]] = None,
    js_only: bool = False,
    ignore_body_visibility: bool = True,
    scan_full_page: bool = False,
    scroll_delay: float = 0.2,
    process_iframes: bool = False,
    remove_overlay_elements: bool = False,
    simulate_user: bool = False,
    override_navigator: bool = False,
    magic: bool = False,
    adjust_viewport_to_content: bool = False,
    # Media Handling Parameters
    screenshot: bool = False,
    screenshot_wait_for: float = None,
    screenshot_height_threshold: int = SCREENSHOT_HEIGHT_TRESHOLD,
    pdf: bool = False,
    capture_mhtml: bool = False,
    image_description_min_word_threshold: int = IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
    image_score_threshold: int = IMAGE_SCORE_THRESHOLD,
    table_score_threshold: int = 7,
    exclude_external_images: bool = False,
    exclude_all_images: bool = False,
    # Link and Domain Handling Parameters
    exclude_social_media_domains: list = None,
    exclude_external_links: bool = False,
    exclude_social_media_links: bool = False,
    exclude_domains: list = None,
    exclude_internal_links: bool = False,
    # Debugging and Logging Parameters
    verbose: bool = True,
    log_console: bool = False,
    # Network and Console Capturing Parameters
    capture_network_requests: bool = False,
    capture_console_messages: bool = False,
    # Connection Parameters
    method: str = "GET",
    stream: bool = False,
    url: str = None,
    check_robots_txt: bool = False,
    user_agent: str = None,
    user_agent_mode: str = None,
    user_agent_generator_config: dict = {},
    # Deep Crawl Parameters
    deep_crawl_strategy: Optional[DeepCrawlStrategy] = None,
    # Experimental Parameters
    experimental: Dict[str, Any] = None,
):
    """Store the crawl options on the instance and normalize empty values.

    Every argument is copied to an attribute of the same name. Falsy
    container arguments (``target_elements``, ``excluded_tags``,
    ``keep_attrs``, ``exclude_domains``, ``experimental``,
    ``user_agent_generator_config``) become fresh empty containers,
    ``excluded_selector`` becomes ``""``, ``scraping_strategy`` falls back to
    a new ``WebScrapingStrategy`` and ``exclude_social_media_domains`` falls
    back to ``SOCIAL_MEDIA_DOMAINS``.

    Raises:
        ValueError: If ``extraction_strategy`` or ``chunking_strategy`` is
            given but is not an instance of its expected base class.
    """
    # TODO: Planning to set properties dynamically based on the __init__ signature
    self.url = url

    # Content Processing Parameters
    self.word_count_threshold = word_count_threshold
    self.extraction_strategy = extraction_strategy
    self.chunking_strategy = chunking_strategy
    self.markdown_generator = markdown_generator
    self.only_text = only_text
    self.css_selector = css_selector
    self.target_elements = target_elements or []
    self.excluded_tags = excluded_tags or []
    self.excluded_selector = excluded_selector or ""
    self.keep_data_attributes = keep_data_attributes
    self.keep_attrs = keep_attrs or []
    self.remove_forms = remove_forms
    self.prettiify = prettiify
    self.parser_type = parser_type
    self.scraping_strategy = scraping_strategy or WebScrapingStrategy()
    self.proxy_config = proxy_config
    self.proxy_rotation_strategy = proxy_rotation_strategy

    # SSL Parameters
    self.fetch_ssl_certificate = fetch_ssl_certificate

    # Caching Parameters
    self.cache_mode = cache_mode
    self.session_id = session_id
    self.bypass_cache = bypass_cache
    self.disable_cache = disable_cache
    self.no_cache_read = no_cache_read
    self.no_cache_write = no_cache_write
    self.shared_data = shared_data

    # Page Navigation and Timing Parameters
    self.wait_until = wait_until
    self.page_timeout = page_timeout
    self.wait_for = wait_for
    self.wait_for_images = wait_for_images
    self.delay_before_return_html = delay_before_return_html
    self.mean_delay = mean_delay
    self.max_range = max_range
    self.semaphore_count = semaphore_count

    # Page Interaction Parameters
    self.js_code = js_code
    self.js_only = js_only
    self.ignore_body_visibility = ignore_body_visibility
    self.scan_full_page = scan_full_page
    self.scroll_delay = scroll_delay
    self.process_iframes = process_iframes
    self.remove_overlay_elements = remove_overlay_elements
    self.simulate_user = simulate_user
    self.override_navigator = override_navigator
    self.magic = magic
    self.adjust_viewport_to_content = adjust_viewport_to_content

    # Media Handling Parameters
    self.screenshot = screenshot
    self.screenshot_wait_for = screenshot_wait_for
    self.screenshot_height_threshold = screenshot_height_threshold
    self.pdf = pdf
    self.capture_mhtml = capture_mhtml
    self.image_description_min_word_threshold = image_description_min_word_threshold
    self.image_score_threshold = image_score_threshold
    self.exclude_external_images = exclude_external_images
    self.exclude_all_images = exclude_all_images
    self.table_score_threshold = table_score_threshold

    # Link and Domain Handling Parameters
    self.exclude_social_media_domains = (
        exclude_social_media_domains or SOCIAL_MEDIA_DOMAINS
    )
    self.exclude_external_links = exclude_external_links
    self.exclude_social_media_links = exclude_social_media_links
    self.exclude_domains = exclude_domains or []
    self.exclude_internal_links = exclude_internal_links

    # Debugging and Logging Parameters
    self.verbose = verbose
    self.log_console = log_console

    # Network and Console Capturing Parameters
    self.capture_network_requests = capture_network_requests
    self.capture_console_messages = capture_console_messages

    # Connection Parameters
    self.stream = stream
    self.method = method

    # Robots.txt Handling Parameters
    self.check_robots_txt = check_robots_txt

    # User Agent Parameters
    self.user_agent = user_agent
    self.user_agent_mode = user_agent_mode
    # The signature default is a single shared `{}`; storing it directly would
    # let a mutation through one config leak into every other config built
    # with the default. An empty/None value therefore gets its own dict.
    self.user_agent_generator_config = user_agent_generator_config or {}

    # Validate type of extraction strategy and chunking strategy if they are provided
    if self.extraction_strategy is not None and not isinstance(
        self.extraction_strategy, ExtractionStrategy
    ):
        raise ValueError(
            "extraction_strategy must be an instance of ExtractionStrategy"
        )
    if self.chunking_strategy is not None and not isinstance(
        self.chunking_strategy, ChunkingStrategy
    ):
        raise ValueError(
            "chunking_strategy must be an instance of ChunkingStrategy"
        )

    # Set default chunking strategy if None
    if self.chunking_strategy is None:
        self.chunking_strategy = RegexChunking()

    # Deep Crawl Parameters
    self.deep_crawl_strategy = deep_crawl_strategy

    # Experimental Parameters
    self.experimental = experimental or {}
|
||
|
||
|
||
def __getattr__(self, name):
    """Report a missing attribute, with a hint when the name is deprecated.

    Python calls this hook only after normal attribute lookup has failed, so
    it always raises.

    Raises:
        AttributeError: With the deprecation message stored in
            ``_UNWANTED_PROPS`` when ``name`` is listed there, otherwise with
            a generic "has no attribute" message.
    """
    deprecated = self._UNWANTED_PROPS
    if name not in deprecated:
        raise AttributeError(f"'{self.__class__.__name__}' has no attribute '{name}'")
    raise AttributeError(f"Getting '{name}' is deprecated. {deprecated[name]}")
|
||
|
||
def __setattr__(self, name, value):
    """Set an attribute, rejecting non-default values for deprecated names.

    A name listed in ``_UNWANTED_PROPS`` may only be assigned the default
    value declared for it in the ``__init__`` signature (compared by
    identity); any other value raises.

    Raises:
        AttributeError: If ``name`` is deprecated and ``value`` is not the
            constructor default for that parameter.
    """
    # TODO: Planning to set properties dynamically based on the __init__ signature
    # The constructor default is only needed for deprecated names, so the
    # signature is inspected lazily instead of on every single assignment
    # (``__init__`` alone performs one assignment per parameter).
    if name in self._UNWANTED_PROPS:
        default = inspect.signature(self.__init__).parameters[name].default
        if value is not default:
            raise AttributeError(f"Setting '{name}' is deprecated. {self._UNWANTED_PROPS[name]}")

    super().__setattr__(name, value)
|
||
|
||
@staticmethod
def from_kwargs(kwargs: dict) -> "CrawlerRunConfig":
    """Build a ``CrawlerRunConfig`` from a plain dict of options.

    Each known option is read from ``kwargs``; a missing key takes the
    fallback listed below. Keys that are not known options are ignored.
    """
    # NOTE(review): the 200 / 60000 fallbacks are literals here while __init__
    # defaults to MIN_WORD_THRESHOLD / PAGE_TIMEOUT, and markdown_generator
    # falls back to None here but to DefaultMarkdownGenerator() in __init__ --
    # confirm these are meant to differ.
    fallbacks = {
        # Content Processing Parameters
        "word_count_threshold": 200,
        "extraction_strategy": None,
        "chunking_strategy": RegexChunking(),
        "markdown_generator": None,
        "only_text": False,
        "css_selector": None,
        "target_elements": [],
        "excluded_tags": [],
        "excluded_selector": "",
        "keep_data_attributes": False,
        "keep_attrs": [],
        "remove_forms": False,
        "prettiify": False,
        "parser_type": "lxml",
        "scraping_strategy": None,
        "proxy_config": None,
        "proxy_rotation_strategy": None,
        # SSL Parameters
        "fetch_ssl_certificate": False,
        # Caching Parameters
        "cache_mode": CacheMode.BYPASS,
        "session_id": None,
        "bypass_cache": False,
        "disable_cache": False,
        "no_cache_read": False,
        "no_cache_write": False,
        "shared_data": None,
        # Page Navigation and Timing Parameters
        "wait_until": "domcontentloaded",
        "page_timeout": 60000,
        "wait_for": None,
        "wait_for_images": False,
        "delay_before_return_html": 0.1,
        "mean_delay": 0.1,
        "max_range": 0.3,
        "semaphore_count": 5,
        # Page Interaction Parameters
        "js_code": None,
        "js_only": False,
        "ignore_body_visibility": True,
        "scan_full_page": False,
        "scroll_delay": 0.2,
        "process_iframes": False,
        "remove_overlay_elements": False,
        "simulate_user": False,
        "override_navigator": False,
        "magic": False,
        "adjust_viewport_to_content": False,
        # Media Handling Parameters
        "screenshot": False,
        "screenshot_wait_for": None,
        "screenshot_height_threshold": SCREENSHOT_HEIGHT_TRESHOLD,
        "pdf": False,
        "capture_mhtml": False,
        "image_description_min_word_threshold": IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
        "image_score_threshold": IMAGE_SCORE_THRESHOLD,
        "table_score_threshold": 7,
        "exclude_all_images": False,
        "exclude_external_images": False,
        # Link and Domain Handling Parameters
        "exclude_social_media_domains": SOCIAL_MEDIA_DOMAINS,
        "exclude_external_links": False,
        "exclude_social_media_links": False,
        "exclude_domains": [],
        "exclude_internal_links": False,
        # Debugging and Logging Parameters
        "verbose": True,
        "log_console": False,
        # Network and Console Capturing Parameters
        "capture_network_requests": False,
        "capture_console_messages": False,
        # Connection Parameters
        "method": "GET",
        "stream": False,
        "check_robots_txt": False,
        "user_agent": None,
        "user_agent_mode": None,
        "user_agent_generator_config": {},
        # Deep Crawl Parameters
        "deep_crawl_strategy": None,
        "url": None,
        # Experimental Parameters
        "experimental": None,
    }
    resolved = {
        option: kwargs.get(option, fallback)
        for option, fallback in fallbacks.items()
    }
    return CrawlerRunConfig(**resolved)
|
||
|
||
# Create a function that returns a dict representation of the object
|
||
def dump(self) -> dict:
    """Serialize this configuration by passing it to ``to_serializable_dict``."""
    serialized = to_serializable_dict(self)
    return serialized
|
||
|
||
@staticmethod
def load(data: dict) -> "CrawlerRunConfig":
    """Rebuild a ``CrawlerRunConfig`` from serialized data.

    ``data`` is passed through ``from_serializable_dict``. If that already
    yields a ``CrawlerRunConfig`` it is returned unchanged; anything else is
    handed to ``from_kwargs`` as an options mapping.
    """
    restored = from_serializable_dict(data)
    if not isinstance(restored, CrawlerRunConfig):
        restored = CrawlerRunConfig.from_kwargs(restored)
    return restored
|
||
|
||
def to_dict(self):
    """Return the configuration as a dict mapping option name to current value.

    Values are the attribute objects themselves (no copying or
    serialization), in the fixed key order listed below.
    """
    option_names = (
        "word_count_threshold",
        "extraction_strategy",
        "chunking_strategy",
        "markdown_generator",
        "only_text",
        "css_selector",
        "target_elements",
        "excluded_tags",
        "excluded_selector",
        "keep_data_attributes",
        "keep_attrs",
        "remove_forms",
        "prettiify",
        "parser_type",
        "scraping_strategy",
        "proxy_config",
        "proxy_rotation_strategy",
        "fetch_ssl_certificate",
        "cache_mode",
        "session_id",
        "bypass_cache",
        "disable_cache",
        "no_cache_read",
        "no_cache_write",
        "shared_data",
        "wait_until",
        "page_timeout",
        "wait_for",
        "wait_for_images",
        "delay_before_return_html",
        "mean_delay",
        "max_range",
        "semaphore_count",
        "js_code",
        "js_only",
        "ignore_body_visibility",
        "scan_full_page",
        "scroll_delay",
        "process_iframes",
        "remove_overlay_elements",
        "simulate_user",
        "override_navigator",
        "magic",
        "adjust_viewport_to_content",
        "screenshot",
        "screenshot_wait_for",
        "screenshot_height_threshold",
        "pdf",
        "capture_mhtml",
        "image_description_min_word_threshold",
        "image_score_threshold",
        "table_score_threshold",
        "exclude_all_images",
        "exclude_external_images",
        "exclude_social_media_domains",
        "exclude_external_links",
        "exclude_social_media_links",
        "exclude_domains",
        "exclude_internal_links",
        "verbose",
        "log_console",
        "capture_network_requests",
        "capture_console_messages",
        "method",
        "stream",
        "check_robots_txt",
        "user_agent",
        "user_agent_mode",
        "user_agent_generator_config",
        "deep_crawl_strategy",
        "url",
        "experimental",
    )
    return {option: getattr(self, option) for option in option_names}
|
||
|
||
def clone(self, **kwargs):
    """Return a new configuration equal to this one except for the overrides.

    The current options are taken from ``to_dict()``, the given keyword
    arguments replace matching entries, and the result is rebuilt through
    ``CrawlerRunConfig.from_kwargs``.

    Args:
        **kwargs: Option names and the values they should have in the copy.

    Returns:
        CrawlerRunConfig: A new instance carrying the requested updates.

    Example:
        ```python
        # Copy with streaming switched on
        stream_config = config.clone(stream=True)

        # Copy with several options changed at once
        new_config = config.clone(
            stream=True,
            cache_mode=CacheMode.BYPASS,
            verbose=True
        )
        ```
    """
    merged_options = {**self.to_dict(), **kwargs}
    return CrawlerRunConfig.from_kwargs(merged_options)
|
||
|
||
|
||
class LLMConfig:
    """Provider selection, API token and sampling options for LLM calls."""

    def __init__(
        self,
        provider: str = DEFAULT_PROVIDER,
        api_token: Optional[str] = None,
        base_url: Optional[str] = None,
        temperature: Optional[float] = None,
        max_tokens: Optional[int] = None,
        top_p: Optional[float] = None,
        frequency_penalty: Optional[float] = None,
        presence_penalty: Optional[float] = None,
        stop: Optional[List[str]] = None,
        n: Optional[int] = None,
    ):
        """Configuration class for LLM provider and API token.

        Token resolution:
          * ``api_token`` of the form ``"env:NAME"`` is read from the
            environment variable ``NAME``;
          * any other non-empty ``api_token`` is used as given;
          * with no token, the first key of ``PROVIDER_MODELS_PREFIXES`` that
            ``provider`` starts with supplies the token;
          * if no prefix matches, the provider is reset to
            ``DEFAULT_PROVIDER`` and the token is read from the environment
            variable named by ``DEFAULT_PROVIDER_API_KEY``.

        The remaining arguments are stored unchanged.
        """
        self.provider = provider
        if api_token:
            if api_token.startswith("env:"):
                # Strip the "env:" marker and look the variable up.
                self.api_token = os.getenv(api_token[4:])
            else:
                self.api_token = api_token
        else:
            # Check if given provider starts with any of key in PROVIDER_MODELS_PREFIXES
            matched_prefix = next(
                (
                    candidate
                    for candidate in PROVIDER_MODELS_PREFIXES.keys()
                    if provider.startswith(candidate)
                ),
                None,
            )
            if matched_prefix is not None:
                self.api_token = PROVIDER_MODELS_PREFIXES.get(matched_prefix)
            else:
                self.provider = DEFAULT_PROVIDER
                self.api_token = os.getenv(DEFAULT_PROVIDER_API_KEY)
        self.base_url = base_url
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.top_p = top_p
        self.frequency_penalty = frequency_penalty
        self.presence_penalty = presence_penalty
        self.stop = stop
        self.n = n

    @staticmethod
    def from_kwargs(kwargs: dict) -> "LLMConfig":
        """Build an ``LLMConfig`` from a dict; missing keys use constructor defaults."""
        optional_fields = (
            "api_token",
            "base_url",
            "temperature",
            "max_tokens",
            "top_p",
            "frequency_penalty",
            "presence_penalty",
            "stop",
            "n",
        )
        settings = {field: kwargs.get(field) for field in optional_fields}
        return LLMConfig(
            provider=kwargs.get("provider", DEFAULT_PROVIDER),
            **settings,
        )

    def to_dict(self):
        """Return the stored settings as a dict keyed by constructor argument name."""
        field_names = (
            "provider",
            "api_token",
            "base_url",
            "temperature",
            "max_tokens",
            "top_p",
            "frequency_penalty",
            "presence_penalty",
            "stop",
            "n",
        )
        return {field: getattr(self, field) for field in field_names}

    def clone(self, **kwargs):
        """Return a new configuration equal to this one except for the overrides.

        Args:
            **kwargs: Setting names and the values they should have in the copy.

        Returns:
            LLMConfig: A new instance carrying the requested updates.
        """
        merged_settings = {**self.to_dict(), **kwargs}
        return LLMConfig.from_kwargs(merged_settings)
|
||
|
||
|
||
|
||
```
|
||
|
||
|
||
## File: crawl4ai/async_webcrawler.py
|
||
|
||
```py
|
||
from .__version__ import __version__ as crawl4ai_version
|
||
import os
|
||
import sys
|
||
import time
|
||
from colorama import Fore
|
||
from pathlib import Path
|
||
from typing import Optional, List
|
||
import json
|
||
import asyncio
|
||
|
||
# from contextlib import nullcontext, asynccontextmanager
|
||
from contextlib import asynccontextmanager
|
||
from .models import (
|
||
CrawlResult,
|
||
MarkdownGenerationResult,
|
||
DispatchResult,
|
||
ScrapingResult,
|
||
CrawlResultContainer,
|
||
RunManyReturn
|
||
)
|
||
from .async_database import async_db_manager
|
||
from .chunking_strategy import * # noqa: F403
|
||
from .chunking_strategy import IdentityChunking
|
||
from .content_filter_strategy import * # noqa: F403
|
||
from .extraction_strategy import * # noqa: F403
|
||
from .extraction_strategy import NoExtractionStrategy
|
||
from .async_crawler_strategy import (
|
||
AsyncCrawlerStrategy,
|
||
AsyncPlaywrightCrawlerStrategy,
|
||
AsyncCrawlResponse,
|
||
)
|
||
from .cache_context import CacheMode, CacheContext
|
||
from .markdown_generation_strategy import (
|
||
DefaultMarkdownGenerator,
|
||
MarkdownGenerationStrategy,
|
||
)
|
||
from .deep_crawling import DeepCrawlDecorator
|
||
from .async_logger import AsyncLogger, AsyncLoggerBase
|
||
from .async_configs import BrowserConfig, CrawlerRunConfig, ProxyConfig
|
||
from .async_dispatcher import * # noqa: F403
|
||
from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter
|
||
|
||
from .utils import (
|
||
sanitize_input_encode,
|
||
InvalidCSSSelectorError,
|
||
fast_format_html,
|
||
create_box_message,
|
||
get_error_context,
|
||
RobotsParser,
|
||
preprocess_html_for_schema,
|
||
)
|
||
|
||
|
||
class AsyncWebCrawler:
|
||
"""
|
||
Asynchronous web crawler with flexible caching capabilities.
|
||
|
||
There are two ways to use the crawler:
|
||
|
||
1. Using context manager (recommended for simple cases):
|
||
```python
|
||
async with AsyncWebCrawler() as crawler:
|
||
result = await crawler.arun(url="https://example.com")
|
||
```
|
||
|
||
2. Using explicit lifecycle management (recommended for long-running applications):
|
||
```python
|
||
crawler = AsyncWebCrawler()
|
||
await crawler.start()
|
||
|
||
# Use the crawler multiple times
|
||
result1 = await crawler.arun(url="https://example.com")
|
||
result2 = await crawler.arun(url="https://another.com")
|
||
|
||
await crawler.close()
|
||
```
|
||
|
||
Attributes:
|
||
browser_config (BrowserConfig): Configuration object for browser settings.
|
||
crawler_strategy (AsyncCrawlerStrategy): Strategy for crawling web pages.
|
||
logger (AsyncLogger): Logger instance for recording events and errors.
|
||
crawl4ai_folder (str): Directory for storing cache.
|
||
base_directory (str): Base directory for storing cache.
|
||
ready (bool): Whether the crawler is ready for use.
|
||
|
||
Methods:
|
||
start(): Start the crawler explicitly without using context manager.
|
||
close(): Close the crawler explicitly without using context manager.
|
||
arun(): Run the crawler for a single source: URL (web, local file, or raw HTML).
|
||
awarmup(): Perform warmup sequence.
|
||
arun_many(): Run the crawler for multiple sources.
|
||
aprocess_html(): Process HTML content.
|
||
|
||
Typical Usage:
|
||
async with AsyncWebCrawler() as crawler:
|
||
result = await crawler.arun(url="https://example.com")
|
||
print(result.markdown)
|
||
|
||
Using configuration:
|
||
browser_config = BrowserConfig(browser_type="chromium", headless=True)
|
||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||
crawler_config = CrawlerRunConfig(
|
||
cache_mode=CacheMode.BYPASS
|
||
)
|
||
result = await crawler.arun(url="https://example.com", config=crawler_config)
|
||
print(result.markdown)
|
||
"""
|
||
|
||
_domain_last_hit = {}
|
||
|
||
def __init__(
    self,
    crawler_strategy: AsyncCrawlerStrategy = None,
    config: BrowserConfig = None,
    base_directory: str = str(
        os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),
    thread_safe: bool = False,
    logger: AsyncLoggerBase = None,
    **kwargs,
):
    """
    Initialize the AsyncWebCrawler.

    Args:
        crawler_strategy: Strategy for crawling web pages. Default AsyncPlaywrightCrawlerStrategy
        config: Configuration object for browser settings. Default BrowserConfig()
        base_directory: Base directory for storing cache
        thread_safe: Whether to guard ``arun`` with an ``asyncio.Lock``
        logger: Logger to use. Default is an AsyncLogger writing to
            ``<base_directory>/.crawl4ai/crawler.log``
        **kwargs: Additional arguments for backwards compatibility. Only a
            legacy ``browser_config`` keyword is read; it is used when
            ``config`` is not given.
    """
    # Handle browser configuration. The legacy ``browser_config=`` keyword
    # used to be forwarded to the strategy next to the explicit
    # ``browser_config=`` argument, which raised "got multiple values for
    # keyword argument"; it is now a fallback for ``config`` instead.
    browser_config = config or kwargs.get("browser_config") or BrowserConfig()

    self.browser_config = browser_config

    # Initialize logger first since other components may need it
    self.logger = logger or AsyncLogger(
        log_file=os.path.join(base_directory, ".crawl4ai", "crawler.log"),
        verbose=self.browser_config.verbose,
        tag_width=10,
    )

    # Initialize crawler strategy (built only when none was supplied)
    self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
        browser_config=browser_config,
        logger=self.logger,
    )

    # Thread safety setup
    self._lock = asyncio.Lock() if thread_safe else None

    # Initialize directories
    self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai")
    os.makedirs(self.crawl4ai_folder, exist_ok=True)
    os.makedirs(os.path.join(self.crawl4ai_folder, "cache"), exist_ok=True)

    # Initialize robots parser
    self.robots_parser = RobotsParser()

    self.ready = False

    # Decorate arun method with deep crawling capabilities
    self._deep_handler = DeepCrawlDecorator(self)
    self.arun = self._deep_handler(self.arun)
|
||
|
||
async def start(self):
    """
    Bring the crawler up without an ``async with`` block.

    Enters the crawler strategy's async context, logs the Crawl4AI version
    under the ``INIT`` tag and marks the crawler as ready.

    Returns:
        AsyncWebCrawler: This crawler instance, ready for use
    """
    strategy = self.crawler_strategy
    await strategy.__aenter__()
    self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT")
    self.ready = True
    return self
|
||
|
||
async def close(self):
    """
    Close the crawler explicitly without using context manager.
    This should be called when you're done with the crawler if you used start().

    Exits the crawler strategy's async context (with no exception info) and
    marks the crawler as not ready, so a later ``arun`` call starts it again
    instead of using a strategy that has already been shut down.
    """
    await self.crawler_strategy.__aexit__(None, None, None)
    # Without this reset ``ready`` stayed True after closing, which made
    # ``arun`` skip its auto-start check.
    self.ready = False
|
||
|
||
async def __aenter__(self):
    """Start the crawler on entering ``async with`` and return what ``start()`` returns."""
    started = await self.start()
    return started
|
||
|
||
async def __aexit__(self, exc_type, exc_val, exc_tb):
    """Close the crawler on leaving ``async with``; the exception details are not used."""
    await self.close()
    return None
|
||
|
||
@asynccontextmanager
async def nullcontext(self):
    """Asynchronous no-op context manager.

    Yields ``None`` and does nothing on entry or exit; ``arun`` uses it in
    place of the lock when ``self._lock`` is not set.
    """
    yield None
|
||
|
||
async def arun(
|
||
self,
|
||
url: str,
|
||
config: CrawlerRunConfig = None,
|
||
**kwargs,
|
||
) -> RunManyReturn:
|
||
"""
|
||
Runs the crawler for a single source: URL (web, local file, or raw HTML).
|
||
|
||
Migration Guide:
|
||
Old way (deprecated):
|
||
result = await crawler.arun(
|
||
url="https://example.com",
|
||
word_count_threshold=200,
|
||
screenshot=True,
|
||
...
|
||
)
|
||
|
||
New way (recommended):
|
||
config = CrawlerRunConfig(
|
||
word_count_threshold=200,
|
||
screenshot=True,
|
||
...
|
||
)
|
||
result = await crawler.arun(url="https://example.com", config=config)
|
||
|
||
Args:
|
||
url: The URL to crawl (http://, https://, file://, or raw:)
|
||
config: Configuration object controlling crawl behavior
|
||
[other parameters maintained for backwards compatibility]
|
||
|
||
Returns:
|
||
CrawlResult: The result of crawling and processing
|
||
"""
|
||
# Auto-start if not ready
|
||
if not self.ready:
|
||
await self.start()
|
||
|
||
config = config or CrawlerRunConfig()
|
||
if not isinstance(url, str) or not url:
|
||
raise ValueError(
|
||
"Invalid URL, make sure the URL is a non-empty string")
|
||
|
||
async with self._lock or self.nullcontext():
|
||
try:
|
||
self.logger.verbose = config.verbose
|
||
|
||
# Default to ENABLED if no cache mode specified
|
||
if config.cache_mode is None:
|
||
config.cache_mode = CacheMode.ENABLED
|
||
|
||
# Create cache context
|
||
cache_context = CacheContext(url, config.cache_mode, False)
|
||
|
||
# Initialize processing variables
|
||
async_response: AsyncCrawlResponse = None
|
||
cached_result: CrawlResult = None
|
||
screenshot_data = None
|
||
pdf_data = None
|
||
extracted_content = None
|
||
start_time = time.perf_counter()
|
||
|
||
# Try to get cached result if appropriate
|
||
if cache_context.should_read():
|
||
cached_result = await async_db_manager.aget_cached_url(url)
|
||
|
||
if cached_result:
|
||
html = sanitize_input_encode(cached_result.html)
|
||
extracted_content = sanitize_input_encode(
|
||
cached_result.extracted_content or ""
|
||
)
|
||
extracted_content = (
|
||
None
|
||
if not extracted_content or extracted_content == "[]"
|
||
else extracted_content
|
||
)
|
||
# If screenshot is requested but its not in cache, then set cache_result to None
|
||
screenshot_data = cached_result.screenshot
|
||
pdf_data = cached_result.pdf
|
||
# if config.screenshot and not screenshot or config.pdf and not pdf:
|
||
if config.screenshot and not screenshot_data:
|
||
cached_result = None
|
||
|
||
if config.pdf and not pdf_data:
|
||
cached_result = None
|
||
|
||
self.logger.url_status(
|
||
url=cache_context.display_url,
|
||
success=bool(html),
|
||
timing=time.perf_counter() - start_time,
|
||
tag="FETCH",
|
||
)
|
||
|
||
# Update proxy configuration from rotation strategy if available
|
||
if config and config.proxy_rotation_strategy:
|
||
next_proxy: ProxyConfig = await config.proxy_rotation_strategy.get_next_proxy()
|
||
if next_proxy:
|
||
self.logger.info(
|
||
message="Switch proxy: {proxy}",
|
||
tag="PROXY",
|
||
params={"proxy": next_proxy.server}
|
||
)
|
||
config.proxy_config = next_proxy
|
||
# config = config.clone(proxy_config=next_proxy)
|
||
|
||
# Fetch fresh content if needed
|
||
if not cached_result or not html:
|
||
t1 = time.perf_counter()
|
||
|
||
if config.user_agent:
|
||
self.crawler_strategy.update_user_agent(
|
||
config.user_agent)
|
||
|
||
# Check robots.txt if enabled
|
||
if config and config.check_robots_txt:
|
||
if not await self.robots_parser.can_fetch(
|
||
url, self.browser_config.user_agent
|
||
):
|
||
return CrawlResult(
|
||
url=url,
|
||
html="",
|
||
success=False,
|
||
status_code=403,
|
||
error_message="Access denied by robots.txt",
|
||
response_headers={
|
||
"X-Robots-Status": "Blocked by robots.txt"
|
||
},
|
||
)
|
||
|
||
##############################
|
||
# Call CrawlerStrategy.crawl #
|
||
##############################
|
||
async_response = await self.crawler_strategy.crawl(
|
||
url,
|
||
config=config, # Pass the entire config object
|
||
)
|
||
|
||
html = sanitize_input_encode(async_response.html)
|
||
screenshot_data = async_response.screenshot
|
||
pdf_data = async_response.pdf_data
|
||
js_execution_result = async_response.js_execution_result
|
||
|
||
t2 = time.perf_counter()
|
||
self.logger.url_status(
|
||
url=cache_context.display_url,
|
||
success=bool(html),
|
||
timing=t2 - t1,
|
||
tag="FETCH",
|
||
)
|
||
|
||
###############################################################
|
||
# Process the HTML content, Call CrawlerStrategy.process_html #
|
||
###############################################################
|
||
crawl_result: CrawlResult = await self.aprocess_html(
|
||
url=url,
|
||
html=html,
|
||
extracted_content=extracted_content,
|
||
config=config, # Pass the config object instead of individual parameters
|
||
screenshot=screenshot_data,
|
||
pdf_data=pdf_data,
|
||
verbose=config.verbose,
|
||
is_raw_html=True if url.startswith("raw:") else False,
|
||
**kwargs,
|
||
)
|
||
|
||
crawl_result.status_code = async_response.status_code
|
||
crawl_result.redirected_url = async_response.redirected_url or url
|
||
crawl_result.response_headers = async_response.response_headers
|
||
crawl_result.downloaded_files = async_response.downloaded_files
|
||
crawl_result.js_execution_result = js_execution_result
|
||
crawl_result.mhtml = async_response.mhtml_data
|
||
crawl_result.ssl_certificate = async_response.ssl_certificate
|
||
# Add captured network and console data if available
|
||
crawl_result.network_requests = async_response.network_requests
|
||
crawl_result.console_messages = async_response.console_messages
|
||
|
||
crawl_result.success = bool(html)
|
||
crawl_result.session_id = getattr(
|
||
config, "session_id", None)
|
||
|
||
self.logger.success(
|
||
message="{url:.50}... | Status: {status} | Total: {timing}",
|
||
tag="COMPLETE",
|
||
params={
|
||
"url": cache_context.display_url,
|
||
"status": crawl_result.success,
|
||
"timing": f"{time.perf_counter() - start_time:.2f}s",
|
||
},
|
||
colors={
|
||
"status": Fore.GREEN if crawl_result.success else Fore.RED,
|
||
"timing": Fore.YELLOW,
|
||
},
|
||
)
|
||
|
||
# Update cache if appropriate
|
||
if cache_context.should_write() and not bool(cached_result):
|
||
await async_db_manager.acache_url(crawl_result)
|
||
|
||
return CrawlResultContainer(crawl_result)
|
||
|
||
else:
|
||
self.logger.success(
|
||
message="{url:.50}... | Status: {status} | Total: {timing}",
|
||
tag="COMPLETE",
|
||
params={
|
||
"url": cache_context.display_url,
|
||
"status": True,
|
||
"timing": f"{time.perf_counter() - start_time:.2f}s",
|
||
},
|
||
colors={"status": Fore.GREEN, "timing": Fore.YELLOW},
|
||
)
|
||
|
||
cached_result.success = bool(html)
|
||
cached_result.session_id = getattr(
|
||
config, "session_id", None)
|
||
cached_result.redirected_url = cached_result.redirected_url or url
|
||
return CrawlResultContainer(cached_result)
|
||
|
||
except Exception as e:
|
||
error_context = get_error_context(sys.exc_info())
|
||
|
||
error_message = (
|
||
f"Unexpected error in _crawl_web at line {error_context['line_no']} "
|
||
f"in {error_context['function']} ({error_context['filename']}):\n"
|
||
f"Error: {str(e)}\n\n"
|
||
f"Code context:\n{error_context['code_context']}"
|
||
)
|
||
|
||
self.logger.error_status(
|
||
url=url,
|
||
error=create_box_message(error_message, type="error"),
|
||
tag="ERROR",
|
||
)
|
||
|
||
return CrawlResultContainer(
|
||
CrawlResult(
|
||
url=url, html="", success=False, error_message=error_message
|
||
)
|
||
)
|
||
|
||
async def aprocess_html(
    self,
    url: str,
    html: str,
    extracted_content: str,
    config: CrawlerRunConfig,
    screenshot: str,
    pdf_data: str,
    verbose: bool,
    **kwargs,
) -> CrawlResult:
    """
    Process HTML content using the provided configuration.

    Runs the configured scraping strategy, generates markdown from the
    selected HTML source and, when an extraction strategy is configured and
    no extracted content was supplied, runs structured extraction.

    Args:
        url: The URL being processed
        html: Raw HTML content
        extracted_content: Previously extracted content (if any); when truthy
            the extraction step is skipped
        config: Configuration object controlling processing behavior
        screenshot: Screenshot data (if any)
        pdf_data: PDF data (if any)
        verbose: Accepted for backwards compatibility; not read by this method
        **kwargs: Additional parameters for backwards compatibility. Keys not
            already present on the config are forwarded to the scraping
            strategy; ``is_raw_html`` only changes the URL shown in logs.

    Returns:
        CrawlResult: Processed result containing extracted and formatted content

    Raises:
        ValueError: If the scraping strategy fails, rejects the CSS selector,
            or returns None.
    """
    cleaned_html = ""
    _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
    t1 = time.perf_counter()

    try:
        # Get scraping strategy and ensure it has a logger
        scraping_strategy = config.scraping_strategy
        if not scraping_strategy.logger:
            scraping_strategy.logger = self.logger

        # Process HTML content
        params = config.__dict__.copy()
        params.pop("url", None)
        # add keys from kwargs to params that doesn't exist in params
        params.update({k: v for k, v in kwargs.items()
                       if k not in params.keys()})

        ################################
        # Scraping Strategy Execution  #
        ################################
        result: ScrapingResult = scraping_strategy.scrap(
            url, html, **params)

    except InvalidCSSSelectorError as e:
        raise ValueError(str(e)) from e
    except Exception as e:
        raise ValueError(
            f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}"
        ) from e

    # Checked outside the try block so this error is not caught by the
    # generic handler above and re-wrapped with a duplicated message.
    if result is None:
        raise ValueError(
            f"Process HTML, Failed to extract content from the website: {url}"
        )

    # Extract results - handle both dict and ScrapingResult
    if isinstance(result, dict):
        cleaned_html = sanitize_input_encode(
            result.get("cleaned_html", ""))
        media = result.get("media", {})
        links = result.get("links", {})
        metadata = result.get("metadata", {})
    else:
        cleaned_html = sanitize_input_encode(result.cleaned_html)
        media = result.media.model_dump()
        links = result.links.model_dump()
        metadata = result.metadata

    ################################
    # Generate Markdown            #
    ################################
    markdown_generator: Optional[MarkdownGenerationStrategy] = (
        config.markdown_generator or DefaultMarkdownGenerator()
    )

    # --- SELECT HTML SOURCE BASED ON CONTENT_SOURCE ---
    # Get the desired source from the generator config, default to 'cleaned_html'
    selected_html_source = getattr(markdown_generator, 'content_source', 'cleaned_html')

    # Source selection via dict dispatch; lambdas keep the (potentially
    # expensive) preprocessing lazy so only the chosen source is computed.
    html_source_selector = {
        "raw_html": lambda: html,  # The original raw HTML
        "cleaned_html": lambda: cleaned_html,  # The HTML after scraping strategy
        "fit_html": lambda: preprocess_html_for_schema(html_content=html),  # Preprocessed raw HTML
    }

    markdown_input_html = cleaned_html  # Default to cleaned_html

    try:
        # Get the appropriate lambda function, default to returning cleaned_html if key not found
        source_lambda = html_source_selector.get(selected_html_source, lambda: cleaned_html)
        # Execute the lambda to get the selected HTML
        markdown_input_html = source_lambda()

    except Exception as e:
        # Handle potential errors, especially from preprocess_html_for_schema
        if self.logger:
            self.logger.warning(
                f"Error getting/processing '{selected_html_source}' for markdown source: {e}. Falling back to cleaned_html.",
                tag="MARKDOWN_SRC"
            )
        # Ensure markdown_input_html is still the default cleaned_html in case of error
        markdown_input_html = cleaned_html
    # --- END: HTML SOURCE SELECTION ---

    # Uncomment if by default we want to use PruningContentFilter
    # if not config.content_filter and not markdown_generator.content_filter:
    #     markdown_generator.content_filter = PruningContentFilter()

    markdown_result: MarkdownGenerationResult = (
        markdown_generator.generate_markdown(
            input_html=markdown_input_html,
            base_url=url,
            # html2text_options=kwargs.get('html2text', {})
        )
    )

    # Log processing completion
    self.logger.info(
        message="{url:.50}... | Time: {timing}s",
        tag="SCRAPE",
        params={
            "url": _url,
            # Truncate (not round) the elapsed time to millisecond precision
            "timing": int((time.perf_counter() - t1) * 1000) / 1000,
        },
    )

    ################################
    # Structured Content Extraction #
    ################################
    if (
        not bool(extracted_content)
        and config.extraction_strategy
        and not isinstance(config.extraction_strategy, NoExtractionStrategy)
    ):
        t1 = time.perf_counter()
        # Choose content based on input_format
        content_format = config.extraction_strategy.input_format
        if content_format == "fit_markdown" and not markdown_result.fit_markdown:
            self.logger.warning(
                message="Fit markdown requested but not available. Falling back to raw markdown.",
                tag="EXTRACT",
                params={"url": _url},
            )
            content_format = "markdown"

        # Unknown formats fall back to the raw markdown
        content = {
            "markdown": markdown_result.raw_markdown,
            "html": html,
            "cleaned_html": cleaned_html,
            "fit_markdown": markdown_result.fit_markdown,
        }.get(content_format, markdown_result.raw_markdown)

        # Use IdentityChunking for HTML input, otherwise use provided chunking strategy
        chunking = (
            IdentityChunking()
            if content_format in ["html", "cleaned_html"]
            else config.chunking_strategy
        )
        sections = chunking.chunk(content)
        extracted_content = config.extraction_strategy.run(url, sections)
        extracted_content = json.dumps(
            extracted_content, indent=4, default=str, ensure_ascii=False
        )

        # Log extraction completion
        self.logger.info(
            message="Completed for {url:.50}... | Time: {timing}s",
            tag="EXTRACT",
            params={"url": _url, "timing": time.perf_counter() - t1},
        )

    # Handle screenshot and PDF data (normalize falsy values to None)
    screenshot_data = None if not screenshot else screenshot
    pdf_data = None if not pdf_data else pdf_data

    # Apply HTML formatting if requested
    # NOTE(review): attribute is spelled `prettiify` here; presumably it
    # matches the CrawlerRunConfig field name - confirm before renaming.
    if config.prettiify:
        cleaned_html = fast_format_html(cleaned_html)

    # Return complete crawl result
    return CrawlResult(
        url=url,
        html=html,
        cleaned_html=cleaned_html,
        markdown=markdown_result,
        media=media,
        links=links,
        metadata=metadata,
        screenshot=screenshot_data,
        pdf=pdf_data,
        extracted_content=extracted_content,
        success=True,
        error_message="",
    )
|
||
|
||
async def arun_many(
    self,
    urls: List[str],
    config: Optional[CrawlerRunConfig] = None,
    dispatcher: Optional[BaseDispatcher] = None,
    # Legacy keyword arguments are still accepted through **kwargs for
    # backwards compatibility; this method does not read them.
    **kwargs,
) -> RunManyReturn:
    """
    Crawl several URLs through a dispatcher and attach dispatch metadata to each result.

    Args:
        urls: List of URLs to crawl
        config: Configuration applied to every URL; a default
            CrawlerRunConfig is used when omitted. ``config.stream`` selects
            streaming versus batch mode.
        dispatcher: Dispatcher instance to use. When omitted, a
            MemoryAdaptiveDispatcher with a RateLimiter
            (base_delay=(1.0, 3.0), max_delay=60.0, max_retries=3) is created.
        **kwargs: Accepted for backwards compatibility; unused here.

    Returns:
        In batch mode, a list with one result per dispatcher task result.
        In streaming mode, an async generator yielding results as the
        dispatcher produces them. Every result carries a ``dispatch_result``
        attribute holding a DispatchResult built from the task's fields.

    Examples:

        # Batch processing (default)
        results = await crawler.arun_many(
            urls=["https://example1.com", "https://example2.com"],
            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
        )
        for result in results:
            print(f"Processed {result.url}: {len(result.markdown)} chars")

        # Streaming results
        async for result in await crawler.arun_many(
            urls=["https://example1.com", "https://example2.com"],
            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=True),
        ):
            print(f"Processed {result.url}: {len(result.markdown)} chars")
    """
    if not config:
        config = CrawlerRunConfig()

    if dispatcher is None:
        limiter = RateLimiter(
            base_delay=(1.0, 3.0), max_delay=60.0, max_retries=3
        )
        dispatcher = MemoryAdaptiveDispatcher(rate_limiter=limiter)

    def transform_result(task_result):
        """Attach the task's dispatch metadata to its crawl result and return that result."""
        crawl_result = task_result.result
        crawl_result.dispatch_result = DispatchResult(
            task_id=task_result.task_id,
            memory_usage=task_result.memory_usage,
            peak_memory=task_result.peak_memory,
            start_time=task_result.start_time,
            end_time=task_result.end_time,
            error_message=task_result.error_message,
        )
        return crawl_result

    if not config.stream:
        # Batch mode: wait for every task, then transform them all.
        finished = await dispatcher.run_urls(crawler=self, urls=urls, config=config)
        return [transform_result(item) for item in finished]

    async def result_transformer():
        """Yield transformed results as the dispatcher streams them."""
        async for streamed in dispatcher.run_urls_stream(
            crawler=self, urls=urls, config=config
        ):
            yield transform_result(streamed)

    return result_transformer()
|
||
|
||
```

## File: crawl4ai/cli.py

```py
|
||
import click
|
||
import os
|
||
import sys
|
||
import time
|
||
|
||
import humanize
|
||
from typing import Dict, Any, Optional, List
|
||
import json
|
||
import yaml
|
||
import anyio
|
||
from rich.console import Console
|
||
from rich.table import Table
|
||
from rich.panel import Panel
|
||
from rich.prompt import Prompt, Confirm
|
||
|
||
from crawl4ai import (
|
||
CacheMode,
|
||
AsyncWebCrawler,
|
||
CrawlResult,
|
||
BrowserConfig,
|
||
CrawlerRunConfig,
|
||
LLMExtractionStrategy,
|
||
LXMLWebScrapingStrategy,
|
||
JsonCssExtractionStrategy,
|
||
JsonXPathExtractionStrategy,
|
||
BM25ContentFilter,
|
||
PruningContentFilter,
|
||
BrowserProfiler,
|
||
DefaultMarkdownGenerator,
|
||
LLMConfig
|
||
)
|
||
from crawl4ai.config import USER_SETTINGS
|
||
from litellm import completion
|
||
from pathlib import Path
|
||
|
||
|
||
# Module-level rich Console shared by the CLI helpers in this file
# (panels, tables and status messages are printed through it).
console = Console()
|
||
|
||
def get_global_config() -> dict:
    """Load the user's global CLI settings from ``~/.crawl4ai/global.yml``.

    Returns:
        The parsed YAML content, or an empty dict when the file is missing
        or parses to a falsy value. When the file is missing, the
        ``~/.crawl4ai`` directory is created as a side effect.
    """
    settings_dir = Path.home() / ".crawl4ai"
    settings_path = settings_dir / "global.yml"

    if settings_path.exists():
        with open(settings_path) as handle:
            loaded = yaml.safe_load(handle)
        return loaded or {}

    # No settings yet: make sure the directory exists for later saves.
    settings_dir.mkdir(parents=True, exist_ok=True)
    return {}
|
||
|
||
def save_global_config(config: dict):
    """Write *config* as YAML to ``~/.crawl4ai/global.yml``.

    The parent directory is created when missing, so saving works even if
    the settings directory has never been created on this machine.

    Args:
        config: Mapping of global settings to persist.
    """
    config_file = Path.home() / ".crawl4ai" / "global.yml"
    # Without this, open(..., "w") raises FileNotFoundError on a fresh home.
    config_file.parent.mkdir(parents=True, exist_ok=True)
    with open(config_file, "w") as f:
        yaml.dump(config, f)
|
||
|
||
def setup_llm_config() -> tuple[str, str]:
    """Resolve the default LLM provider and API token, prompting when needed.

    Values are read from the global config. A missing provider is prompted
    for; a missing token is prompted for unless the provider starts with
    ``ollama/``, in which case the token is always the placeholder
    ``"no-token"``. If either value was absent from the stored config, both
    are written back to it.

    Returns:
        A ``(provider, token)`` tuple.
    """
    provider_key = "DEFAULT_LLM_PROVIDER"
    token_key = "DEFAULT_LLM_PROVIDER_TOKEN"

    settings = get_global_config()
    provider = settings.get(provider_key)
    token = settings.get(token_key)

    if not provider:
        click.echo("\nNo default LLM provider configured.")
        click.echo("Provider format: 'company/model' (e.g., 'openai/gpt-4o', 'anthropic/claude-3-sonnet')")
        click.echo("See available providers at: https://docs.litellm.ai/docs/providers")
        provider = click.prompt("Enter provider")

    if provider.startswith("ollama/"):
        token = "no-token"
    elif not token:
        token = click.prompt(f"Enter API token for {provider}", hide_input=True)

    already_stored = settings.get(provider_key) and settings.get(token_key)
    if not already_stored:
        settings[provider_key] = provider
        settings[token_key] = token
        save_global_config(settings)
        click.echo("\nConfiguration saved to ~/.crawl4ai/global.yml")

    return provider, token
|
||
|
||
async def stream_llm_response(url: str, markdown: str, query: str, provider: str, token: str):
    """Ask the LLM a question about crawled content and print the streamed answer.

    Args:
        url: Source URL of the content, mentioned in the system prompt.
        markdown: Crawled content supplied to the model as context.
        query: The user's question.
        provider: Model identifier passed to ``completion`` as ``model``.
        token: API key passed to ``completion`` as ``api_key``.

    The response chunks are iterated with a plain ``for`` loop and each
    non-empty content delta is printed to stdout immediately; a final
    newline is printed when the stream ends.
    """
    system_message = {
        "content": f"You are Crawl4ai assistant, answering user question based on the provided context which is crawled from {url}.",
        "role": "system"
    }
    user_message = {
        "content": f"<|start of context|>\n{markdown}\n<|end of context|>\n\n{query}",
        "role": "user"
    }

    chunks = completion(
        model=provider,
        api_key=token,
        messages=[system_message, user_message],
        stream=True,
    )

    for chunk in chunks:
        piece = chunk["choices"][0]["delta"].get("content")
        if piece:
            print(piece, end="", flush=True)

    print()  # New line at end
|
||
|
||
|
||
|
||
def _split_top_level_commas(text: str) -> List[str]:
    """Split *text* on commas that are not nested inside ``[...]`` or ``{...}``."""
    pieces: List[str] = []
    current: List[str] = []
    depth = 0
    for ch in text:
        if ch in "[{":
            depth += 1
        elif ch in "]}" and depth > 0:
            depth -= 1
        if ch == "," and depth == 0:
            pieces.append("".join(current))
            current = []
        else:
            current.append(ch)
    pieces.append("".join(current))
    # Unbalanced brackets: keep the historical behaviour of a plain split
    # rather than swallowing the rest of the string into one value.
    return text.split(",") if depth else pieces


def parse_key_values(ctx, param, value) -> Dict[str, Any]:
    """Click callback turning ``"k1=v1,k2=v2"`` into a dict with typed values.

    Pairs are separated by commas that sit outside any ``[...]`` / ``{...}``
    group, so list and JSON-object values may themselves contain commas
    (e.g. ``tags=[a,b],opts={"x": 1, "y": 2}``).

    Value coercion, in order:
        * ``true`` / ``false`` (case-insensitive) -> bool
        * all digits -> int
        * digits with a single ``.`` -> float
        * ``[a, b]`` -> list of stripped, non-empty strings
        * ``{...}`` -> parsed as JSON
        * anything else -> left as the raw string

    Args:
        ctx: Click context (unused).
        param: Click parameter (unused).
        value: The raw option string, or a falsy value when not supplied.

    Returns:
        Mapping of stripped keys to coerced values; empty when *value* is falsy.

    Raises:
        click.BadParameter: If a pair has no ``=``, a numeric conversion
            fails, or a ``{...}`` value is not valid JSON.
    """
    if not value:
        return {}
    result = {}
    for pair in _split_top_level_commas(value):
        try:
            k, v = pair.split('=', 1)
            # Handle common value types
            if v.lower() == 'true':
                v = True
            elif v.lower() == 'false':
                v = False
            elif v.isdigit():
                v = int(v)
            elif v.replace('.', '', 1).isdigit():
                v = float(v)
            elif v.startswith('[') and v.endswith(']'):
                v = [x.strip() for x in v[1:-1].split(',') if x.strip()]
            elif v.startswith('{') and v.endswith('}'):
                try:
                    v = json.loads(v)
                except json.JSONDecodeError:
                    raise click.BadParameter(f'Invalid JSON object: {v}')
            result[k.strip()] = v
        except ValueError:
            raise click.BadParameter(f'Invalid key=value pair: {pair}')
    return result
|
||
|
||
def load_config_file(path: Optional[str]) -> dict:
    """Load a YAML (``.yaml`` / ``.yml``) or JSON configuration file.

    Args:
        path: File path; any other extension is parsed as JSON. A falsy
            path means "no file".

    Returns:
        The parsed content. An empty dict is returned when *path* is falsy
        or when a YAML file parses to a falsy value (e.g. an empty file),
        so callers can rely on the declared return type.

    Raises:
        click.BadParameter: If the file cannot be opened or parsed; the
            original exception is chained as the cause.
    """
    if not path:
        return {}

    try:
        # Read as UTF-8 explicitly instead of the platform default encoding.
        with open(path, encoding="utf-8") as f:
            if path.endswith((".yaml", ".yml")):
                return yaml.safe_load(f) or {}
            return json.load(f)
    except Exception as e:
        raise click.BadParameter(f'Error loading config file {path}: {str(e)}') from e
|
||
|
||
def load_schema_file(path: Optional[str]) -> dict:
    """Load an extraction schema file, or return None when no path is given.

    Delegates to load_config_file for the actual parsing, so the same file
    formats and error reporting apply.
    """
    return load_config_file(path) if path else None
|
||
|
||
async def run_crawler(url: str, browser_cfg: BrowserConfig, crawler_cfg: CrawlerRunConfig, verbose: bool):
    """Crawl a single URL with the given configurations and return the result.

    Args:
        url: Target to crawl.
        browser_cfg: Browser configuration handed to AsyncWebCrawler.
        crawler_cfg: Per-run configuration handed to ``arun``.
        verbose: When true, both configurations are echoed before crawling.

    Returns:
        Whatever ``crawler.arun`` returns.

    Raises:
        click.ClickException: If ``arun`` raises any exception.
    """
    if verbose:
        click.echo("Starting crawler with configurations:")
        for label, cfg in (("Browser", browser_cfg), ("Crawler", crawler_cfg)):
            click.echo(f"{label} config: {cfg.dump()}")

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        try:
            return await crawler.arun(url=url, config=crawler_cfg)
        except Exception as e:
            raise click.ClickException(f"Crawling failed: {e}")
|
||
|
||
def show_examples():
|
||
examples = """
|
||
🚀 Crawl4AI CLI Examples
|
||
|
||
1️⃣ Basic Usage:
|
||
# Simple crawl with default settings
|
||
crwl https://example.com
|
||
|
||
# Get markdown output
|
||
crwl https://example.com -o markdown
|
||
|
||
# Verbose JSON output with cache bypass
|
||
crwl https://example.com -o json -v --bypass-cache
|
||
|
||
2️⃣ Using Config Files:
|
||
# Using browser and crawler configs
|
||
crwl https://example.com -B browser.yml -C crawler.yml
|
||
|
||
# CSS-based extraction
|
||
crwl https://example.com -e extract_css.yml -s css_schema.json -o json
|
||
|
||
# LLM-based extraction with config file
|
||
crwl https://example.com -e extract_llm.yml -s llm_schema.json -o json
|
||
|
||
# Quick LLM-based JSON extraction (prompts for LLM provider first time)
|
||
crwl https://example.com -j # Auto-extracts structured data
|
||
crwl https://example.com -j "Extract product details including name, price, and features" # With specific instructions
|
||
|
||
3️⃣ Direct Parameters:
|
||
# Browser settings
|
||
crwl https://example.com -b "headless=true,viewport_width=1280,user_agent_mode=random"
|
||
|
||
# Crawler settings
|
||
crwl https://example.com -c "css_selector=#main,delay_before_return_html=2,scan_full_page=true"
|
||
|
||
4️⃣ Profile Management for Identity-Based Crawling:
|
||
# Launch interactive profile manager
|
||
crwl profiles
|
||
|
||
# Create, list, and delete browser profiles for identity-based crawling
|
||
# Use a profile for crawling (keeps you logged in)
|
||
crwl https://example.com -p my-profile-name
|
||
|
||
# Example: Crawl a site that requires login
|
||
# 1. First create a profile and log in:
|
||
crwl profiles
|
||
# 2. Then use that profile to crawl the authenticated site:
|
||
crwl https://site-requiring-login.com/dashboard -p my-profile-name
|
||
|
||
5️⃣ CDP Mode for Browser Automation:
|
||
# Launch browser with CDP debugging on default port 9222
|
||
crwl cdp
|
||
|
||
# Use a specific profile and custom port
|
||
crwl cdp -p my-profile -P 9223
|
||
|
||
# Launch headless browser with CDP enabled
|
||
crwl cdp --headless
|
||
|
||
# Launch in incognito mode (ignores profile)
|
||
crwl cdp --incognito
|
||
|
||
# Use the CDP URL with other tools (Puppeteer, Playwright, etc.)
|
||
# The URL will be displayed in the terminal when the browser starts
|
||
|
||
|
||
6️⃣ Sample Config Files:
|
||
|
||
browser.yml:
|
||
headless: true
|
||
viewport_width: 1280
|
||
user_agent_mode: "random"
|
||
verbose: true
|
||
ignore_https_errors: true
|
||
|
||
extract_css.yml:
|
||
type: "json-css"
|
||
params:
|
||
verbose: true
|
||
|
||
css_schema.json:
|
||
{
|
||
"name": "ArticleExtractor",
|
||
"baseSelector": ".article",
|
||
"fields": [
|
||
{
|
||
"name": "title",
|
||
"selector": "h1.title",
|
||
"type": "text"
|
||
},
|
||
{
|
||
"name": "link",
|
||
"selector": "a.read-more",
|
||
"type": "attribute",
|
||
"attribute": "href"
|
||
}
|
||
]
|
||
}
|
||
|
||
extract_llm.yml:
|
||
type: "llm"
|
||
provider: "openai/gpt-4"
|
||
instruction: "Extract all articles with their titles and links"
|
||
api_token: "your-token"
|
||
params:
|
||
temperature: 0.3
|
||
max_tokens: 1000
|
||
|
||
llm_schema.json:
|
||
{
|
||
"title": "Article",
|
||
"type": "object",
|
||
"properties": {
|
||
"title": {
|
||
"type": "string",
|
||
"description": "The title of the article"
|
||
},
|
||
"link": {
|
||
"type": "string",
|
||
"description": "URL to the full article"
|
||
}
|
||
}
|
||
}
|
||
|
||
7️⃣ Advanced Usage:
|
||
# Combine configs with direct parameters
|
||
crwl https://example.com -B browser.yml -b "headless=false,viewport_width=1920"
|
||
|
||
# Full extraction pipeline with config files
|
||
crwl https://example.com \\
|
||
-B browser.yml \\
|
||
-C crawler.yml \\
|
||
-e extract_llm.yml \\
|
||
-s llm_schema.json \\
|
||
-o json \\
|
||
-v
|
||
|
||
# Quick LLM-based extraction with specific instructions
|
||
crwl https://amazon.com/dp/B01DFKC2SO \\
|
||
-j "Extract product title, current price, original price, rating, and all product specifications" \\
|
||
-b "headless=true,viewport_width=1280" \\
|
||
-v
|
||
|
||
# Content filtering with BM25
|
||
crwl https://example.com \\
|
||
-f filter_bm25.yml \\
|
||
-o markdown-fit
|
||
|
||
# Authenticated crawling with profile
|
||
crwl https://login-required-site.com \\
|
||
-p my-authenticated-profile \\
|
||
-c "css_selector=.dashboard-content" \\
|
||
-o markdown
|
||
|
||
For more documentation visit: https://github.com/unclecode/crawl4ai
|
||
|
||
8️⃣ Q&A with LLM:
|
||
# Ask a question about the content
|
||
crwl https://example.com -q "What is the main topic discussed?"
|
||
|
||
# First view content, then ask questions
|
||
crwl https://example.com -o markdown # See the crawled content first
|
||
crwl https://example.com -q "Summarize the key points"
|
||
crwl https://example.com -q "What are the conclusions?"
|
||
|
||
# Advanced crawling with Q&A
|
||
crwl https://example.com \\
|
||
-B browser.yml \\
|
||
-c "css_selector=article,scan_full_page=true" \\
|
||
-q "What are the pros and cons mentioned?"
|
||
|
||
Note: First time using -q will prompt for LLM provider and API token.
|
||
These will be saved in ~/.crawl4ai/global.yml for future use.
|
||
|
||
Supported provider format: 'company/model'
|
||
Examples:
|
||
- ollama/llama3.3
|
||
- openai/gpt-4
|
||
- anthropic/claude-3-sonnet
|
||
- cohere/command
|
||
- google/gemini-pro
|
||
|
||
See full list of providers: https://docs.litellm.ai/docs/providers
|
||
|
||
# Set default LLM provider and token in advance
|
||
crwl config set DEFAULT_LLM_PROVIDER "anthropic/claude-3-sonnet"
|
||
crwl config set DEFAULT_LLM_PROVIDER_TOKEN "your-api-token-here"
|
||
|
||
# Set default browser behavior
|
||
crwl config set BROWSER_HEADLESS false # Always show browser window
|
||
crwl config set USER_AGENT_MODE random # Use random user agent
|
||
|
||
9️⃣ Profile Management:
|
||
# Launch interactive profile manager
|
||
crwl profiles
|
||
|
||
# Create a profile and use it for crawling
|
||
crwl profiles # Create and set up your profile interactively
|
||
crwl https://example.com -p my-profile-name # Use profile for crawling
|
||
|
||
# Example workflow for authenticated site
|
||
# 1. First create a profile and log in to the site:
|
||
crwl profiles # Select "Create new profile" option
|
||
# 2. Then use that profile to crawl authenticated content:
|
||
crwl https://site-requiring-login.com/dashboard -p my-profile-name
|
||
|
||
🔄 Builtin Browser Management:
|
||
# Start a builtin browser (runs in the background)
|
||
crwl browser start
|
||
|
||
# Check builtin browser status
|
||
crwl browser status
|
||
|
||
# Open a visible window to see the browser
|
||
crwl browser view --url https://example.com
|
||
|
||
# Stop the builtin browser
|
||
crwl browser stop
|
||
|
||
# Restart with different options
|
||
crwl browser restart --browser-type chromium --port 9223 --no-headless
|
||
|
||
# Use the builtin browser in your code
|
||
# (Just set browser_mode="builtin" in your BrowserConfig)
|
||
browser_config = BrowserConfig(
|
||
browser_mode="builtin",
|
||
headless=True
|
||
)
|
||
|
||
# Usage via CLI:
|
||
crwl https://example.com -b "browser_mode=builtin"
|
||
"""
|
||
click.echo(examples)
|
||
|
||
def get_directory_size(path: str) -> int:
    """Return the combined size in bytes of all files under *path*.

    The tree is walked recursively with ``os.walk``; entries that are
    symbolic links are not counted.
    """
    file_paths = (
        os.path.join(root, name)
        for root, _dirs, names in os.walk(path)
        for name in names
    )
    return sum(
        os.path.getsize(candidate)
        for candidate in file_paths
        if not os.path.islink(candidate)
    )
|
||
|
||
def display_profiles_table(profiles: List[Dict[str, Any]]):
    """Print the browser profiles as a rich table, or a notice when there are none.

    Each profile dict is read for the keys ``name``, ``path``, ``created``
    (an object with ``strftime``) and ``type``. The on-disk size of every
    profile directory is computed at display time.
    """
    if not profiles:
        empty_notice = Panel(
            "[yellow]No profiles found. Create one with the 'create' command.[/yellow]",
            title="Browser Profiles",
            border_style="blue",
        )
        console.print(empty_notice)
        return

    table = Table(title="Browser Profiles", show_header=True, header_style="bold cyan", border_style="blue")

    # (header, add_column options) in display order
    column_specs = (
        ("#", {"style": "dim", "width": 4}),
        ("Name", {"style": "cyan", "no_wrap": True}),
        ("Path", {"style": "green"}),
        ("Created", {"style": "yellow"}),
        ("Browser", {"style": "magenta"}),
        ("Size", {"style": "blue", "justify": "right"}),
    )
    for header, options in column_specs:
        table.add_column(header, **options)

    for position, profile in enumerate(profiles, start=1):
        # Folder size, rendered in human-readable units
        human_size = humanize.naturalsize(get_directory_size(profile["path"]))
        # Creation timestamp to the minute
        created = profile["created"].strftime("%Y-%m-%d %H:%M")

        table.add_row(
            str(position),
            profile["name"],
            profile["path"],
            created,
            profile["type"].capitalize(),
            human_size,
        )

    console.print(table)
|
||
|
||
async def create_profile_interactive(profiler: BrowserProfiler):
    """Interactively create a browser profile.

    Shows an introduction panel, asks for a profile name (defaulting to
    ``profile_<unix timestamp>``), then awaits ``profiler.create_profile``
    and reports the resulting path or the failure. Exceptions raised during
    creation are printed rather than propagated.
    """
    intro = (
        "[bold cyan]Create Browser Profile[/bold cyan]\n"
        "This will open a browser window for you to set up your identity.\n"
        "Log in to sites, adjust settings, then press 'q' to save."
    )
    console.print(Panel(intro, border_style="cyan"))

    suggested_name = f"profile_{int(time.time())}"
    profile_name = Prompt.ask("[cyan]Enter profile name[/cyan]", default=suggested_name)

    console.print("[cyan]Creating profile...[/cyan]")
    console.print("[yellow]A browser window will open. After logging in to sites, press 'q' in this terminal to save.[/yellow]")

    # Create the profile
    try:
        created_at = await profiler.create_profile(profile_name)
        outcome = (
            f"[green]Profile successfully created at:[/green] {created_at}"
            if created_at
            else "[red]Failed to create profile.[/red]"
        )
        console.print(outcome)
    except Exception as e:
        console.print(f"[red]Error creating profile: {e}[/red]")
|
||
|
||
def delete_profile_interactive(profiler: BrowserProfiler):
    """Let the user pick a profile from a table and delete it after confirmation.

    Args:
        profiler: Profiler used to list and delete profiles.
    """
    profiles = profiler.list_profiles()
    if not profiles:
        console.print("[yellow]No profiles found to delete.[/yellow]")
        return

    display_profiles_table(profiles)

    # Valid answers are the 1-based row numbers shown in the table.
    valid_numbers = [str(number) for number in range(1, len(profiles) + 1)]
    answer = Prompt.ask(
        "[red]Enter number of profile to delete[/red]",
        console=console,
        choices=valid_numbers,
        show_choices=False,
    )

    try:
        chosen = profiles[int(answer) - 1]
        name = chosen["name"]

        confirmed = Confirm.ask(
            f"[red]Are you sure you want to delete profile '{name}'?[/red]"
        )
        if confirmed:
            if profiler.delete_profile(chosen["path"]):
                console.print(f"[green]Profile '{name}' deleted successfully.[/green]")
            else:
                console.print(f"[red]Failed to delete profile '{name}'.[/red]")
    except (ValueError, IndexError):
        console.print("[red]Invalid selection.[/red]")
|
||
|
||
async def crawl_with_profile_cli(profile_path, url):
    """Crawl ``url`` with a visible managed browser that uses ``profile_path``.

    Prompts for an output format, runs the crawler and prints the result in
    that format.

    Args:
        profile_path: Directory passed to the browser as ``user_data_dir``.
        url: Address to crawl.

    Returns:
        The crawl result, or ``None`` if crawling or rendering raised.
    """
    console.print(
        f"[cyan]Crawling [bold]{url}[/bold] using profile at [bold]{profile_path}[/bold][/cyan]"
    )

    # Headed on purpose so the user can watch the browser in action.
    browser_cfg = BrowserConfig(
        headless=False,
        use_managed_browser=True,
        user_data_dir=profile_path,
    )
    crawler_cfg = CrawlerRunConfig()

    output_format = Prompt.ask(
        "[cyan]Output format[/cyan]",
        choices=["all", "json", "markdown", "md", "title"],
        default="markdown",
    )

    def _as_markdown(res):
        return res.markdown.raw_markdown

    # One renderer per selectable format; each is evaluated lazily inside the
    # try block so rendering errors are reported like crawl errors.
    renderers = {
        "all": lambda res: json.dumps(res.model_dump(), indent=2),
        "json": lambda res: json.dumps(json.loads(res.extracted_content), indent=2),
        "markdown": _as_markdown,
        "md": _as_markdown,
        "title": lambda res: res.metadata.get("title", "No title found"),
    }

    try:
        # NOTE(review): the final positional argument looks like a verbose flag - confirm.
        result = await run_crawler(url, browser_cfg, crawler_cfg, True)

        render = renderers.get(output_format)
        if render is not None:
            console.print(render(result))

        console.print(f"[green]Successfully crawled[/green] {url}")
        return result
    except Exception as exc:
        console.print(f"[red]Error crawling:[/red] {exc}")
        return None
|
||
|
||
async def use_profile_to_crawl():
    """Let the user choose an existing profile and a URL, then crawl it."""
    profiler = BrowserProfiler()
    profiles = profiler.list_profiles()

    if not profiles:
        console.print("[yellow]No profiles found. Create one first.[/yellow]")
        return

    display_profiles_table(profiles)

    # Valid answers are the 1-based row numbers shown in the table.
    valid_numbers = [str(number) for number in range(1, len(profiles) + 1)]
    answer = Prompt.ask(
        "[cyan]Enter number of profile to use[/cyan]",
        console=console,
        choices=valid_numbers,
        show_choices=False,
    )

    try:
        chosen = profiles[int(answer) - 1]

        url = Prompt.ask("[cyan]Enter URL to crawl[/cyan]")
        if not url:
            console.print("[red]No URL provided[/red]")
        else:
            await crawl_with_profile_cli(chosen["path"], url)
    except (ValueError, IndexError):
        console.print("[red]Invalid selection[/red]")
|
||
|
||
async def manage_profiles():
    """Run the interactive profile-manager menu until the user chooses Exit."""
    profiler = BrowserProfiler()

    menu = {
        "1": "List profiles",
        "2": "Create new profile",
        "3": "Delete profile",
        "4": "Use a profile to crawl a website",
        "5": "Exit",
    }
    # Colour per menu key; any key not listed falls back to cyan.
    key_colors = {"1": "green", "2": "yellow", "3": "red", "4": "blue"}

    while True:
        console.print(Panel("[bold cyan]Browser Profile Manager[/bold cyan]", border_style="cyan"))

        for key, label in menu.items():
            color = key_colors.get(key, "cyan")
            console.print(f"[{color}]{key}[/{color}]. {label}")

        choice = Prompt.ask("Enter choice", choices=list(menu), default="1")

        if choice == "5":
            console.print("[cyan]Exiting profile manager.[/cyan]")
            break

        if choice == "1":
            display_profiles_table(profiler.list_profiles())
        elif choice == "2":
            await create_profile_interactive(profiler)
        elif choice == "3":
            delete_profile_interactive(profiler)
        elif choice == "4":
            await use_profile_to_crawl()

        # Visual separator between consecutive operations.
        console.print("\n")
|
||
|
||
|
||
|
||
# Root command group; "-h" is accepted as an alias for "--help".
@click.group(context_settings={"help_option_names": ["-h", "--help"]})
def cli():
    """Crawl4AI CLI - Web content extraction and browser profile management tool"""
|
||
|
||
|
||
# Sub-group holding the builtin-browser commands defined below.
@cli.group("browser")
def browser_cmd():
    """Manage browser instances for Crawl4AI

    Commands to manage browser instances for Crawl4AI, including:
    - status - Check status of the builtin browser
    - start - Start a new builtin browser
    - stop - Stop the running builtin browser
    - view - Open a visible window of the builtin browser
    - restart - Restart the builtin browser
    """
|
||
|
||
@browser_cmd.command("status")
def browser_status_cmd():
    """Show status of the builtin browser"""
    profiler = BrowserProfiler()
    panel_title = "Builtin Browser Status"

    try:
        status = anyio.run(profiler.get_builtin_browser_status)

        if not status["running"]:
            console.print(Panel(
                "[yellow]Builtin browser is not running[/yellow]\n\n"
                "Use 'crwl browser start' to start a builtin browser",
                title=panel_title,
                border_style="yellow",
            ))
            return

        info = status["info"]
        # One line per reported property; the start time is rendered in local time.
        detail_lines = [
            f"CDP URL: [cyan]{info['cdp_url']}[/cyan]",
            f"Process ID: [yellow]{info['pid']}[/yellow]",
            f"Browser type: [blue]{info['browser_type']}[/blue]",
            f"User data directory: [magenta]{info['user_data_dir']}[/magenta]",
            f"Started: [cyan]{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(info['start_time']))}[/cyan]",
        ]
        console.print(Panel(
            "[green]Builtin browser is running[/green]\n\n" + "\n".join(detail_lines),
            title=panel_title,
            border_style="green",
        ))
    except Exception as exc:
        console.print(f"[red]Error checking browser status: {exc}[/red]")
        sys.exit(1)
|
||
|
||
@browser_cmd.command("start")
@click.option("--browser-type", "-b", type=click.Choice(["chromium", "firefox"]), default="chromium",
              help="Browser type (default: chromium)")
@click.option("--port", "-p", type=int, default=9222, help="Debugging port (default: 9222)")
@click.option("--headless/--no-headless", default=True, help="Run browser in headless mode")
def browser_start_cmd(browser_type: str, port: int, headless: bool):
    """Start a builtin browser instance

    This will start a persistent browser instance that can be used by Crawl4AI
    by setting browser_mode="builtin" in BrowserConfig.
    """
    profiler = BrowserProfiler()

    def _panel(message: str, color: str) -> None:
        # Every message of this command shares the same panel title.
        console.print(Panel(message, title="Builtin Browser Start", border_style=color))

    # Refuse to start a second instance. This check runs outside the try block,
    # so errors from it are not converted into the friendly message below.
    status = anyio.run(profiler.get_builtin_browser_status)
    if status["running"]:
        _panel(
            "[yellow]Builtin browser is already running[/yellow]\n\n"
            f"CDP URL: [cyan]{status['cdp_url']}[/cyan]\n\n"
            "Use 'crwl browser restart' to restart the browser",
            "yellow",
        )
        return

    try:
        headless_label = "Yes" if headless else "No"
        _panel(
            f"[cyan]Starting builtin browser[/cyan]\n\n"
            f"Browser type: [green]{browser_type}[/green]\n"
            f"Debugging port: [yellow]{port}[/yellow]\n"
            f"Headless: [cyan]{headless_label}[/cyan]",
            "cyan",
        )

        cdp_url = anyio.run(profiler.launch_builtin_browser, browser_type, port, headless)

        if not cdp_url:
            _panel("[red]Failed to start builtin browser[/red]", "red")
            sys.exit(1)

        _panel(
            f"[green]Builtin browser started successfully[/green]\n\n"
            f"CDP URL: [cyan]{cdp_url}[/cyan]\n\n"
            "This browser will be used automatically when setting browser_mode='builtin'",
            "green",
        )
    except Exception as exc:
        console.print(f"[red]Error starting builtin browser: {exc}[/red]")
        sys.exit(1)
|
||
|
||
@browser_cmd.command("stop")
def browser_stop_cmd():
    """Stop the running builtin browser"""
    profiler = BrowserProfiler()

    def _panel(message: str, color: str) -> None:
        # Every message of this command shares the same panel title.
        console.print(Panel(message, title="Builtin Browser Stop", border_style=color))

    try:
        status = anyio.run(profiler.get_builtin_browser_status)
        if not status["running"]:
            # Nothing to stop; this is not treated as an error.
            _panel("[yellow]No builtin browser is currently running[/yellow]", "yellow")
            return

        _panel("[cyan]Stopping builtin browser...[/cyan]", "cyan")

        stopped = anyio.run(profiler.kill_builtin_browser)
        if not stopped:
            _panel("[red]Failed to stop builtin browser[/red]", "red")
            sys.exit(1)

        _panel("[green]Builtin browser stopped successfully[/green]", "green")
    except Exception as exc:
        console.print(f"[red]Error stopping builtin browser: {exc}[/red]")
        sys.exit(1)
|
||
|
||
@browser_cmd.command("view")
@click.option("--url", "-u", help="URL to navigate to (defaults to about:blank)")
def browser_view_cmd(url: Optional[str]):
    """
    Open a visible window of the builtin browser

    This command connects to the running builtin browser and opens a visible window,
    allowing you to see what the browser is currently viewing or navigate to a URL.
    """
    profiler = BrowserProfiler()

    try:
        # A running builtin browser is required; otherwise point the user at `start`.
        status = anyio.run(profiler.get_builtin_browser_status)
        if not status["running"]:
            console.print(Panel(
                "[yellow]No builtin browser is currently running[/yellow]\n\n"
                "Use 'crwl browser start' to start a builtin browser first",
                title="Builtin Browser View",
                border_style="yellow"
            ))
            return

        info = status["info"]
        cdp_url = info["cdp_url"]

        console.print(Panel(
            f"[cyan]Opening visible window connected to builtin browser[/cyan]\n\n"
            f"CDP URL: [green]{cdp_url}[/green]\n"
            f"URL to load: [yellow]{url or 'about:blank'}[/yellow]",
            title="Builtin Browser View",
            border_style="cyan"
        ))

        # Imported locally, as in the original command.
        import subprocess

        # Default Google Chrome location for each platform. Named `chrome_cmd`
        # so it does not shadow the module-level `browser_cmd` click group.
        if sys.platform == "darwin":  # macOS
            chrome_cmd = ["/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"]
        elif sys.platform == "win32":  # Windows
            chrome_cmd = ["C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe"]
        else:  # Linux
            chrome_cmd = ["google-chrome"]

        # NOTE(review): this starts a separate Chrome process on the builtin
        # browser's debugging port instead of attaching to it - confirm this
        # gives the intended "view" behaviour.
        browser_args = [
            f"--remote-debugging-port={info['debugging_port']}",
            "--remote-debugging-address=localhost",
            "--no-first-run",
            "--no-default-browser-check"
        ]

        # Only pass a page to open when the user supplied one.
        if url:
            browser_args.append(url)

        try:
            # Argument list (no shell) keeps the user-supplied URL out of shell parsing.
            subprocess.Popen(chrome_cmd + browser_args)
            console.print("[green]Browser window opened. Close it when finished viewing.[/green]")
        except Exception as e:
            # Launch failures (e.g. Chrome not at the default path) are reported, not fatal.
            console.print(f"[red]Error launching browser: {str(e)}[/red]")
            console.print(f"[yellow]Try connecting manually to {cdp_url} in Chrome or using the '--remote-debugging-port' flag.[/yellow]")

    except Exception as e:
        console.print(f"[red]Error viewing builtin browser: {str(e)}[/red]")
        sys.exit(1)
|
||
|
||
@browser_cmd.command("restart")
@click.option("--browser-type", "-b", type=click.Choice(["chromium", "firefox"]), default=None,
              help="Browser type (defaults to same as current)")
@click.option("--port", "-p", type=int, default=None, help="Debugging port (defaults to same as current)")
@click.option("--headless/--no-headless", default=None, help="Run browser in headless mode")
def browser_restart_cmd(browser_type: Optional[str], port: Optional[int], headless: Optional[bool]):
    """Restart the builtin browser

    Stops the current builtin browser if running and starts a new one.
    By default, uses the same configuration as the current browser.
    """
    profiler = BrowserProfiler()

    def _panel(message: str, color: str) -> None:
        # Every message of this command shares the same panel title.
        console.print(Panel(message, title="Builtin Browser Restart", border_style=color))

    try:
        status = anyio.run(profiler.get_builtin_browser_status)
        previous = {}

        if status["running"]:
            info = status["info"]
            # Remember the running configuration for use as defaults below.
            # The headless state is not read from the status; True is assumed.
            previous = {
                "browser_type": info["browser_type"],
                "port": info["debugging_port"],
                "headless": True,
            }

            _panel("[cyan]Stopping current builtin browser...[/cyan]", "cyan")

            stopped = anyio.run(profiler.kill_builtin_browser)
            if not stopped:
                _panel("[red]Failed to stop current browser[/red]", "red")
                sys.exit(1)

        # Explicit CLI options win; otherwise fall back to the previous
        # configuration, then to the hard-coded defaults.
        browser_type = browser_type or previous.get("browser_type", "chromium")
        port = port or previous.get("port", 9222)
        if headless is None:
            headless = previous.get("headless", True)

        headless_label = "Yes" if headless else "No"
        _panel(
            f"[cyan]Starting new builtin browser[/cyan]\n\n"
            f"Browser type: [green]{browser_type}[/green]\n"
            f"Debugging port: [yellow]{port}[/yellow]\n"
            f"Headless: [cyan]{headless_label}[/cyan]",
            "cyan",
        )

        cdp_url = anyio.run(profiler.launch_builtin_browser, browser_type, port, headless)

        if not cdp_url:
            _panel("[red]Failed to restart builtin browser[/red]", "red")
            sys.exit(1)

        _panel(
            f"[green]Builtin browser restarted successfully[/green]\n\n"
            f"CDP URL: [cyan]{cdp_url}[/cyan]",
            "green",
        )
    except Exception as exc:
        console.print(f"[red]Error restarting builtin browser: {exc}[/red]")
        sys.exit(1)
|
||
|
||
@cli.command("cdp")
@click.option("--user-data-dir", "-d", help="Directory to use for browser data (will be created if it doesn't exist)")
@click.option("--port", "-P", type=int, default=9222, help="Debugging port (default: 9222)")
@click.option("--browser-type", "-b", type=click.Choice(["chromium", "firefox"]), default="chromium",
              help="Browser type (default: chromium)")
@click.option("--headless", is_flag=True, help="Run browser in headless mode")
@click.option("--incognito", is_flag=True, help="Run in incognito/private mode (ignores user-data-dir)")
def cdp_cmd(user_data_dir: Optional[str], port: int, browser_type: str, headless: bool, incognito: bool):
    """Launch a standalone browser with CDP debugging enabled

    This command launches a browser with Chrome DevTools Protocol (CDP) debugging enabled,
    prints the CDP URL, and keeps the browser running until you press 'q'.

    The CDP URL can be used for various automation and debugging tasks.

    Examples:
        # Launch Chromium with CDP on default port 9222
        crwl cdp

        # Use a specific directory for browser data and custom port
        crwl cdp --user-data-dir ~/browser-data --port 9223

        # Launch in headless mode
        crwl cdp --headless

        # Launch in incognito mode (ignores user-data-dir)
        crwl cdp --incognito
    """
    profiler = BrowserProfiler()

    try:
        # A data directory is only used outside incognito mode; None is
        # displayed as "Temporary directory" below.
        data_dir = None
        wants_data_dir = bool(user_data_dir) and not incognito
        if wants_data_dir:
            data_dir = os.path.expanduser(user_data_dir)  # resolve "~/..."
            if not os.path.exists(data_dir):
                console.print(f"[yellow]Directory '{data_dir}' doesn't exist. Creating it.[/yellow]")
                os.makedirs(data_dir, exist_ok=True)

        headless_label = "Yes" if headless else "No"
        incognito_label = "Yes" if incognito else "No"
        summary = (
            f"[cyan]Launching browser with CDP debugging[/cyan]\n\n"
            f"Browser type: [green]{browser_type}[/green]\n"
            f"Debugging port: [yellow]{port}[/yellow]\n"
            f"User data directory: [cyan]{data_dir or 'Temporary directory'}[/cyan]\n"
            f"Headless: [cyan]{headless_label}[/cyan]\n"
            f"Incognito: [cyan]{incognito_label}[/cyan]\n\n"
            f"[yellow]Press 'q' to quit when done[/yellow]"
        )
        console.print(Panel(summary, title="CDP Browser", border_style="cyan"))

        cdp_url = anyio.run(
            profiler.launch_standalone_browser,
            browser_type,
            data_dir,
            port,
            headless,
        )

        if not cdp_url:
            console.print("[red]Failed to launch browser or get CDP URL[/red]")
            sys.exit(1)
    except Exception as exc:
        console.print(f"[red]Error launching CDP browser: {exc}[/red]")
        sys.exit(1)
|
||
|
||
|
||
@cli.command("crawl")
@click.argument("url", required=True)
@click.option("--browser-config", "-B", type=click.Path(exists=True), help="Browser config file (YAML/JSON)")
@click.option("--crawler-config", "-C", type=click.Path(exists=True), help="Crawler config file (YAML/JSON)")
@click.option("--filter-config", "-f", type=click.Path(exists=True), help="Content filter config file")
@click.option("--extraction-config", "-e", type=click.Path(exists=True), help="Extraction strategy config file")
@click.option("--json-extract", "-j", is_flag=False, flag_value="", default=None, help="Extract structured data using LLM with optional description")
@click.option("--schema", "-s", type=click.Path(exists=True), help="JSON schema for extraction")
@click.option("--browser", "-b", type=str, callback=parse_key_values, help="Browser parameters as key1=value1,key2=value2")
@click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2")
@click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "md", "markdown-fit", "md-fit"]), default="all")
@click.option("--output-file", "-O", type=click.Path(), help="Output file path (default: stdout)")
# No short flag here: "-b" already belongs to --browser above, and declaring it
# twice makes one of the two options unreachable through the short form.
@click.option("--bypass-cache", is_flag=True, default=True, help="Bypass cache when crawling")
@click.option("--question", "-q", help="Ask a question about the crawled content")
@click.option("--verbose", "-v", is_flag=True)
@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config: str,
              extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict,
              output: str, output_file: str, bypass_cache: bool, question: str, verbose: bool, profile: str):
    """Crawl a website and extract content

    Simple Usage:
        crwl crawl https://example.com
    """

    # Resolve --profile to a user data directory and force a managed browser.
    if profile:
        profiler = BrowserProfiler()
        profile_path = profiler.get_profile_path(profile)

        if not profile_path:
            profiles = profiler.list_profiles()

            if profiles:
                console.print(f"[red]Profile '{profile}' not found. Available profiles:[/red]")
                display_profiles_table(profiles)
            else:
                console.print("[red]No profiles found. Create one with 'crwl profiles'[/red]")

            return

        # Inject the profile into the browser overrides applied below.
        if not browser:
            browser = {}
        browser["user_data_dir"] = profile_path
        browser["use_managed_browser"] = True

        if verbose:
            console.print(f"[green]Using browser profile:[/green] {profile}")

    try:
        # Base configurations come from the optional config files...
        browser_cfg = BrowserConfig.load(load_config_file(browser_config))
        crawler_cfg = CrawlerRunConfig.load(load_config_file(crawler_config))

        # ...and are then overridden by key=value pairs given on the command line.
        if browser:
            browser_cfg = browser_cfg.clone(**browser)
        if crawler:
            crawler_cfg = crawler_cfg.clone(**crawler)

        # Content filter: taken from --filter-config, or a default pruning
        # filter when a "fit" markdown output was requested without one.
        fit_outputs = ["markdown-fit", "md-fit"]
        if filter_config or output in fit_outputs:
            if filter_config:
                filter_conf = load_config_file(filter_config)
            else:
                filter_conf = {
                    "type": "pruning",
                    "query": "",
                    "threshold": 0.48
                }
            if filter_conf["type"] == "bm25":
                crawler_cfg.markdown_generator = DefaultMarkdownGenerator(
                    content_filter=BM25ContentFilter(
                        user_query=filter_conf.get("query"),
                        bm25_threshold=filter_conf.get("threshold", 1.0)
                    )
                )
            elif filter_conf["type"] == "pruning":
                crawler_cfg.markdown_generator = DefaultMarkdownGenerator(
                    content_filter=PruningContentFilter(
                        user_query=filter_conf.get("query"),
                        threshold=filter_conf.get("threshold", 0.48)
                    )
                )

        # --json-extract takes precedence over --extraction-config.
        if json_extract is not None:
            # Get LLM provider and token
            provider, token = setup_llm_config()

            # Instruction used when --json-extract is given without a description.
            default_instruction = """Analyze the web page content and extract structured data as JSON.
If the page contains a list of items with repeated patterns, extract all items in an array.
If the page is an article or contains unique content, extract a comprehensive JSON object with all relevant information.
Look at the content, intention of content, what it offers and find the data item(s) in the page.
Always return valid, properly formatted JSON."""

            default_instruction_with_user_query = """Analyze the web page content and extract structured data as JSON, following the below instruction and explanation of schema and always return valid, properly formatted JSON. \n\nInstruction:\n\n""" + json_extract

            # An empty string means the flag was passed without a description.
            instruction = default_instruction_with_user_query if json_extract else default_instruction

            crawler_cfg.extraction_strategy = LLMExtractionStrategy(
                llm_config=LLMConfig(provider=provider, api_token=token),
                instruction=instruction,
                schema=load_schema_file(schema),  # Will be None if no schema is provided
                extraction_type="schema",  # if schema else "block",
                apply_chunking=False,
                force_json_response=True,
                verbose=verbose,
            )

            # Extraction implies JSON output unless another format was chosen.
            if output == "all":
                output = "json"

        # Extraction strategy from a config file (only if --json-extract wasn't used)
        elif extraction_config:
            extract_conf = load_config_file(extraction_config)
            schema_data = load_schema_file(schema)

            # Validate the extraction type before building a strategy.
            if not extract_conf.get("type"):
                raise click.ClickException("Extraction type not specified")
            if extract_conf["type"] not in ["llm", "json-css", "json-xpath"]:
                raise click.ClickException(f"Invalid extraction type: {extract_conf['type']}")

            if extract_conf["type"] == "llm":
                # LLM extraction needs both a provider and an API token.
                if not extract_conf.get("provider") or not extract_conf.get("api_token"):
                    raise click.ClickException("LLM provider and API token are required for LLM extraction")

                crawler_cfg.extraction_strategy = LLMExtractionStrategy(
                    llm_config=LLMConfig(provider=extract_conf["provider"], api_token=extract_conf["api_token"]),
                    instruction=extract_conf["instruction"],
                    schema=schema_data,
                    **extract_conf.get("params", {})
                )
            elif extract_conf["type"] == "json-css":
                crawler_cfg.extraction_strategy = JsonCssExtractionStrategy(
                    schema=schema_data
                )
            elif extract_conf["type"] == "json-xpath":
                crawler_cfg.extraction_strategy = JsonXPathExtractionStrategy(
                    schema=schema_data
                )

        # No cache
        if bypass_cache:
            crawler_cfg.cache_mode = CacheMode.BYPASS

        crawler_cfg.scraping_strategy = LXMLWebScrapingStrategy()

        # Config verbosity comes from the global VERBOSE setting; the --verbose
        # flag is passed to run_crawler separately.
        config = get_global_config()

        browser_cfg.verbose = config.get("VERBOSE", False)
        crawler_cfg.verbose = config.get("VERBOSE", False)

        # Run crawler
        result: CrawlResult = anyio.run(
            run_crawler,
            url,
            browser_cfg,
            crawler_cfg,
            verbose
        )

        # A question replaces the normal output: stream the LLM answer and stop.
        if question:
            provider, token = setup_llm_config()
            markdown = result.markdown.raw_markdown
            anyio.run(stream_llm_response, url, markdown, question, provider, token)
            return

        # Handle output
        if not output_file:
            if output == "all":
                click.echo(json.dumps(result.model_dump(), indent=2))
            elif output == "json":
                # Emit the extracted content once, pretty-printed.
                extracted_items = json.loads(result.extracted_content)
                click.echo(json.dumps(extracted_items, indent=2))
            elif output in ["markdown", "md"]:
                click.echo(result.markdown.raw_markdown)
            elif output in fit_outputs:
                click.echo(result.markdown.fit_markdown)
        else:
            # Crawled pages can contain any Unicode text, so write UTF-8
            # explicitly instead of relying on the platform default encoding.
            if output == "all":
                with open(output_file, "w", encoding="utf-8") as f:
                    f.write(json.dumps(result.model_dump(), indent=2))
            elif output == "json":
                with open(output_file, "w", encoding="utf-8") as f:
                    f.write(result.extracted_content)
            elif output in ["markdown", "md"]:
                with open(output_file, "w", encoding="utf-8") as f:
                    f.write(result.markdown.raw_markdown)
            elif output in fit_outputs:
                with open(output_file, "w", encoding="utf-8") as f:
                    f.write(result.markdown.fit_markdown)

    except Exception as e:
        # Surface any failure as a click error (message plus non-zero exit).
        raise click.ClickException(str(e))
|
||
|
||
# Thin command wrapper around show_examples().
@cli.command("examples")
def examples_cmd():
    """Show usage examples"""
    show_examples()
|
||
|
||
# Sub-group holding the list/get/set configuration commands defined below.
@cli.group("config")
def config_cmd():
    """Manage global configuration settings

    Commands to view and update global configuration settings:
    - list: Display all current configuration settings
    - get: Get the value of a specific setting
    - set: Set the value of a specific setting
    """
|
||
|
||
@config_cmd.command("list")
def config_list_cmd():
    """List all configuration settings"""
    config = get_global_config()

    table = Table(
        title="Crawl4AI Configuration",
        show_header=True,
        header_style="bold cyan",
        border_style="blue",
    )
    for heading, style in (
        ("Setting", "cyan"),
        ("Value", "green"),
        ("Default", "yellow"),
        ("Description", "white"),
    ):
        table.add_column(heading, style=style)

    for key, setting in USER_SETTINGS.items():
        current = config.get(key, setting["default"])

        # Non-empty secret values are masked...
        shown = "********" if setting.get("secret", False) and current else current

        # ...but booleans are always rendered as lowercase true/false.
        default_text = str(setting["default"])
        if setting["type"] == "boolean":
            shown = str(current).lower()
            default_text = default_text.lower()

        table.add_row(key, str(shown), default_text, setting["description"])

    console.print(table)
|
||
|
||
@config_cmd.command("get")
@click.argument("key", required=True)
def config_get_cmd(key: str):
    """Get a specific configuration setting"""
    config = get_global_config()

    # Setting names are looked up in uppercase, so input is case-insensitive.
    key = key.upper()

    setting = USER_SETTINGS.get(key) if key in USER_SETTINGS else None
    if setting is None:
        console.print(f"[red]Error: Unknown setting '{key}'[/red]")
        return

    value = config.get(key, setting["default"])

    # Mask non-empty secret values instead of echoing them.
    is_masked = setting.get("secret", False) and value
    display_value = "********" if is_masked else value

    console.print(f"[cyan]{key}[/cyan] = [green]{display_value}[/green]")
    console.print(f"[dim]Description: {setting['description']}[/dim]")
|
||
|
||
@config_cmd.command("set")
@click.argument("key", required=True)
@click.argument("value", required=True)
def config_set_cmd(key: str, value: str):
    """Set a configuration setting"""
    config = get_global_config()

    # Setting names are looked up in uppercase, so input is case-insensitive.
    key = key.upper()

    if key not in USER_SETTINGS:
        console.print(f"[red]Error: Unknown setting '{key}'[/red]")
        console.print(f"[yellow]Available settings: {', '.join(USER_SETTINGS.keys())}[/yellow]")
        return

    setting = USER_SETTINGS[key]

    # Convert the raw string to the setting's declared type and validate it.
    if setting["type"] == "boolean":
        if value.lower() in ["true", "yes", "1", "y"]:
            typed_value = True
        elif value.lower() in ["false", "no", "0", "n"]:
            typed_value = False
        else:
            console.print("[red]Error: Invalid boolean value. Use 'true' or 'false'.[/red]")
            return
    elif setting["type"] == "string":
        typed_value = value

        # Restrict the value to the allowed options when the setting defines any.
        if "options" in setting and value not in setting["options"]:
            console.print(f"[red]Error: Value must be one of: {', '.join(setting['options'])}[/red]")
            return
    else:
        # Without this branch `typed_value` would be unbound below and the
        # command would crash with an UnboundLocalError.
        console.print(f"[red]Error: Unsupported setting type '{setting['type']}' for '{key}'[/red]")
        return

    # Persist the new value.
    config[key] = typed_value
    save_global_config(config)

    # Mask non-empty secret values in the confirmation message.
    display_value = typed_value
    if setting.get("secret", False) and typed_value:
        display_value = "********"

    console.print(f"[green]Successfully set[/green] [cyan]{key}[/cyan] = [green]{display_value}[/green]")
|
||
|
||
@cli.command("profiles")
def profiles_cmd():
    """Manage browser profiles interactively

    Launch an interactive browser profile manager where you can:
    - List all existing profiles
    - Create new profiles for authenticated browsing
    - Delete unused profiles
    - Use a profile to crawl a website
    """
    # manage_profiles is a coroutine function; anyio.run drives it to
    # completion from this synchronous click entry point.
    anyio.run(manage_profiles)
|
||
|
||
@cli.command(name="")
@click.argument("url", required=False)
@click.option("--example", is_flag=True, help="Show usage examples")
@click.option("--browser-config", "-B", type=click.Path(exists=True), help="Browser config file (YAML/JSON)")
@click.option("--crawler-config", "-C", type=click.Path(exists=True), help="Crawler config file (YAML/JSON)")
@click.option("--filter-config", "-f", type=click.Path(exists=True), help="Content filter config file")
@click.option("--extraction-config", "-e", type=click.Path(exists=True), help="Extraction strategy config file")
@click.option("--json-extract", "-j", is_flag=False, flag_value="", default=None, help="Extract structured data using LLM with optional description")
@click.option("--schema", "-s", type=click.Path(exists=True), help="JSON schema for extraction")
@click.option("--browser", "-b", type=str, callback=parse_key_values, help="Browser parameters as key1=value1,key2=value2")
@click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2")
@click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "md", "markdown-fit", "md-fit"]), default="all")
@click.option("--bypass-cache", is_flag=True, default=True, help="Bypass cache when crawling")
@click.option("--question", "-q", help="Ask a question about the crawled content")
@click.option("--verbose", "-v", is_flag=True)
@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
def default(url: str, example: bool, browser_config: str, crawler_config: str, filter_config: str,
            extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict,
            output: str, bypass_cache: bool, question: str, verbose: bool, profile: str):
    """Crawl4AI CLI - Web content extraction tool

    Simple Usage:
        crwl https://example.com

    Run with --example to see detailed usage examples.

    Other commands:
        crwl profiles - Manage browser profiles for identity-based crawling
        crwl crawl - Crawl a website with advanced options
        crwl cdp - Launch browser with CDP debugging enabled
        crwl browser - Manage builtin browser (start, stop, status, restart)
        crwl config - Manage global configuration settings
        crwl examples - Show more usage examples

    Configuration Examples:
        crwl config list - List all configuration settings
        crwl config get DEFAULT_LLM_PROVIDER - Show current LLM provider
        crwl config set VERBOSE true - Enable verbose mode globally
        crwl config set BROWSER_HEADLESS false - Default to visible browser
    """
    # --example short-circuits everything else, including the URL check.
    if example:
        show_examples()
        return

    ctx = click.get_current_context()

    # No URL: print the help text and exit quietly (not treated as an error).
    if not url:
        click.echo(ctx.get_help())
        return

    # Hand every received option over to the `crawl` command unchanged.
    forwarded = {
        "url": url,
        "browser_config": browser_config,
        "crawler_config": crawler_config,
        "filter_config": filter_config,
        "extraction_config": extraction_config,
        "json_extract": json_extract,
        "schema": schema,
        "browser": browser,
        "crawler": crawler,
        "output": output,
        "bypass_cache": bypass_cache,
        "question": question,
        "verbose": verbose,
        "profile": profile,
    }
    ctx.invoke(crawl_cmd, **forwarded)
|
||
|
||
def main():
    """Console entry point: default to the ``crawl`` sub-command.

    When the first command-line token is missing or is not one of the
    commands registered on ``cli``, ``"crawl"`` is inserted in front of it so
    that ``crwl <url>`` behaves like ``crwl crawl <url>``.
    """
    import sys

    argv = sys.argv
    names_known_command = len(argv) >= 2 and argv[1] in cli.commands
    if not names_known_command:
        argv.insert(1, "crawl")
    cli()


if __name__ == "__main__":
    main()
|
||
```
|
||
|
||
|
||
## File: crawl4ai/extraction_strategy.py
|
||
|
||
```py
|
||
from abc import ABC, abstractmethod
|
||
import inspect
|
||
from typing import Any, List, Dict, Optional
|
||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||
import json
|
||
import time
|
||
|
||
from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION, JSON_SCHEMA_BUILDER_XPATH, PROMPT_EXTRACT_INFERRED_SCHEMA
|
||
from .config import (
|
||
DEFAULT_PROVIDER,
|
||
DEFAULT_PROVIDER_API_KEY,
|
||
CHUNK_TOKEN_THRESHOLD,
|
||
OVERLAP_RATE,
|
||
WORD_TOKEN_RATE,
|
||
)
|
||
from .utils import * # noqa: F403
|
||
|
||
from .utils import (
|
||
sanitize_html,
|
||
escape_json_string,
|
||
perform_completion_with_backoff,
|
||
extract_xml_data,
|
||
split_and_parse_json_objects,
|
||
sanitize_input_encode,
|
||
merge_chunks,
|
||
)
|
||
from .models import * # noqa: F403
|
||
|
||
from .models import TokenUsage
|
||
|
||
from .model_loader import * # noqa: F403
|
||
from .model_loader import (
|
||
get_device,
|
||
load_HF_embedding_model,
|
||
load_text_multilabel_classifier,
|
||
calculate_batch_size
|
||
)
|
||
|
||
from .types import LLMConfig, create_llm_config
|
||
|
||
from functools import partial
|
||
import numpy as np
|
||
import re
|
||
from bs4 import BeautifulSoup
|
||
from lxml import html, etree
|
||
|
||
|
||
class ExtractionStrategy(ABC):
    """
    Common base for every extraction strategy.

    Subclasses implement `extract`; `run` fans a list of sections out to it.
    """

    def __init__(self, input_format: str = "markdown", **kwargs):
        """
        Set up the attributes shared by all strategies.

        Args:
            input_format: Content format to use for extraction.
                         Options: "markdown" (default), "html", "fit_markdown"
            **kwargs: Additional keyword arguments; only ``verbose`` is read here.
        """
        self.input_format = input_format
        # Delimiter token stored for strategies that join/split sections.
        self.DEL = "<|DEL|>"
        self.name = type(self).__name__
        self.verbose = kwargs.get("verbose", False)

    @abstractmethod
    def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Produce a list of extracted blocks for one piece of content.

        :param url: The URL of the webpage.
        :param html: The content of the webpage (or one section of it).
        :return: A list of extracted blocks or chunks.
        """
        pass

    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Call `extract` on every section using a thread pool.

        Results are gathered in completion order, which is not necessarily
        the order of ``sections``.

        :param url: The URL of the webpage.
        :param sections: List of sections (strings) to process.
        :return: The concatenated blocks returned by each `extract` call.
        """
        collected = []
        with ThreadPoolExecutor() as pool:
            pending = []
            for section in sections:
                pending.append(pool.submit(self.extract, url, section, **kwargs))
            for finished in as_completed(pending):
                collected.extend(finished.result())
        return collected
|
||
|
||
|
||
class NoExtractionStrategy(ExtractionStrategy):
    """
    Pass-through strategy: performs no extraction and returns the content it is given as-is.
    """

    def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Wrap the whole input in a single block with index 0.
        """
        whole_document = {"index": 0, "content": html}
        return [whole_document]

    def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
        """
        Return one block per section, in order, each with an empty tag list.
        """
        blocks = []
        for position, section in enumerate(sections):
            blocks.append({"index": position, "tags": [], "content": section})
        return blocks
|
||
|
||
|
||
#######################################################
|
||
# Strategies using clustering for text data extraction #
|
||
#######################################################
|
||
|
||
|
||
class CosineStrategy(ExtractionStrategy):
|
||
"""
|
||
Extract meaningful blocks or chunks from the given HTML using cosine similarity.
|
||
|
||
How it works:
|
||
1. Pre-filter documents using embeddings and semantic_filter.
|
||
2. Perform clustering using cosine similarity.
|
||
3. Organize texts by their cluster labels, retaining order.
|
||
4. Filter clusters by word count.
|
||
5. Extract meaningful blocks or chunks from the filtered clusters.
|
||
|
||
Attributes:
|
||
semantic_filter (str): A keyword filter for document filtering.
|
||
word_count_threshold (int): Minimum number of words per cluster.
|
||
max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters.
|
||
linkage_method (str): The linkage method for hierarchical clustering.
|
||
top_k (int): Number of top categories to extract.
|
||
model_name (str): The name of the sentence-transformers model.
|
||
sim_threshold (float): The similarity threshold for clustering.
|
||
"""
|
||
|
||
def __init__(
    self,
    semantic_filter=None,
    word_count_threshold=10,
    max_dist=0.2,
    linkage_method="ward",
    top_k=3,
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    sim_threshold=0.3,
    **kwargs,
):
    """
    Store the clustering parameters and load the embedding and tagging models.

    Args:
        semantic_filter (str): A keyword filter for document filtering.
        word_count_threshold (int): Minimum number of words per cluster.
        max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters.
        linkage_method (str): The linkage method for hierarchical clustering.
        top_k (int): Number of top categories to extract (stored on the instance).
        model_name (str): Name passed to `load_HF_embedding_model`.
        sim_threshold (float): Similarity threshold used when pre-filtering documents.
        **kwargs: Forwarded to the base class; ``verbose`` is also read here.
    """
    super().__init__(**kwargs)

    # Clustering / filtering parameters.
    self.semantic_filter = semantic_filter
    self.word_count_threshold = word_count_threshold
    self.max_dist = max_dist
    self.linkage_method = linkage_method
    self.top_k = top_k
    self.sim_threshold = sim_threshold

    # Started before model loading so the final log line reports load time.
    self.timer = time.time()
    self.verbose = kwargs.get("verbose", False)

    self.buffer_embeddings = np.array([])

    self.device = get_device()
    self.default_batch_size = calculate_batch_size(self.device)

    if self.verbose:
        print(f"[LOG] Loading Extraction Model for {self.device.type} device.")

    # Embedding model: moved to the selected device and switched to eval mode.
    self.tokenizer, self.model = load_HF_embedding_model(model_name)
    self.model.to(self.device)
    self.model.eval()
    self.get_embedding_method = "batch"

    if self.verbose:
        print(f"[LOG] Loading Multilabel Classifier for {self.device.type} device.")

    # Classifier used by `extract` to tag clusters; the second return value is unused.
    self.nlp, _ = load_text_multilabel_classifier()

    if self.verbose:
        elapsed = str(time.time() - self.timer)
        print(
            f"[LOG] Model loaded {model_name}, models/reuters, took "
            + elapsed
            + " seconds"
        )
|
||
|
||
def filter_documents_embeddings(
    self, documents: List[str], semantic_filter: str, at_least_k: int = 20
) -> List[str]:
    """
    Keep the documents whose embedding is similar enough to the semantic_filter embedding.

    Documents at or above ``self.sim_threshold`` are kept in their original
    order. If fewer than ``at_least_k`` qualify, the best-scoring remaining
    documents are appended until that count is reached.

    Args:
        documents (List[str]): A list of document texts.
        semantic_filter (str): A keyword filter for document filtering. When
            falsy, ``documents`` is returned untouched.
        at_least_k (int): Target number of documents. Replaced by
            ``len(documents) // 2`` when there are fewer documents than this.

    Returns:
        List[str]: The selected document texts.
    """
    if not semantic_filter:
        return documents

    if len(documents) < at_least_k:
        at_least_k = len(documents) // 2

    from sklearn.metrics.pairwise import cosine_similarity

    # Embed the filter phrase first, then the documents.
    query_embedding = self.get_embeddings([semantic_filter])[0]
    document_embeddings = self.get_embeddings(documents)

    similarities = cosine_similarity(
        [query_embedding], document_embeddings
    ).flatten()
    scored = list(zip(documents, similarities))

    selected = [pair for pair in scored if pair[1] >= self.sim_threshold]

    # Top up with the closest below-threshold documents when too few passed.
    shortfall = at_least_k - len(selected)
    if shortfall > 0:
        below = [pair for pair in scored if pair[1] < self.sim_threshold]
        below.sort(key=lambda pair: pair[1], reverse=True)
        selected.extend(below[:shortfall])

    texts = [text for text, _ in selected]

    # NOTE(review): this slice also caps the result at `at_least_k` even when
    # more documents passed the threshold - confirm that is intended.
    return texts[:at_least_k]
|
||
|
||
def get_embeddings(
    self, sentences: List[str], batch_size=None, bypass_buffer=False
):
    """
    Compute mean-pooled embeddings for a list of sentences.

    Sentences are tokenized and run through ``self.model`` in batches; the
    last hidden state is averaged over the token dimension and the batches
    are stacked into one array, which is also stored in
    ``self.buffer_embeddings``.

    Args:
        sentences (List[str]): A list of text chunks (sentences).
        batch_size (int, optional): Sentences per forward pass. Defaults to
            ``self.default_batch_size``.
        bypass_buffer (bool): Accepted for backward compatibility; not used.

    Returns:
        NumPy array of embeddings, one row per sentence.
    """
    # The original had a second `elif device.type == "cpu"` branch that could
    # never run because "cpu" is already handled here; it has been removed.
    if self.device.type not in ("cpu", "gpu", "cuda", "mps"):
        # NOTE(review): for any other device type nothing is computed and the
        # previously buffered embeddings are returned - confirm this is intended.
        return self.buffer_embeddings

    import torch

    if batch_size is None:
        batch_size = self.default_batch_size

    all_embeddings = []
    for i in range(0, len(sentences), batch_size):
        batch_sentences = sentences[i : i + batch_size]
        encoded_input = self.tokenizer(
            batch_sentences, padding=True, truncation=True, return_tensors="pt"
        )
        # Move every tokenizer output onto the model's device.
        encoded_input = {
            key: tensor.to(self.device) for key, tensor in encoded_input.items()
        }

        # Inference only: no gradients needed.
        with torch.no_grad():
            model_output = self.model(**encoded_input)

        # Mean pooling over the token dimension of the last hidden state.
        embeddings = model_output.last_hidden_state.mean(dim=1).cpu().numpy()
        all_embeddings.append(embeddings)

    self.buffer_embeddings = np.vstack(all_embeddings)
    return self.buffer_embeddings
|
||
|
||
def hierarchical_clustering(self, sentences: List[str], embeddings=None):
    """
    Perform hierarchical clustering on sentences and return cluster labels.

    Pairwise cosine distances between the embeddings are linked with
    ``self.linkage_method`` and cut into flat clusters at ``self.max_dist``.

    Args:
        sentences (List[str]): A list of text chunks (sentences).
        embeddings: Optional precomputed embeddings, one row per sentence.
            When omitted they are computed with `get_embeddings`.

    Returns:
        NumPy array of cluster labels (labels start at 1).
    """
    from scipy.cluster.hierarchy import linkage, fcluster
    from scipy.spatial.distance import pdist

    self.timer = time.time()

    # Honour caller-supplied embeddings; previously the argument was
    # unconditionally overwritten.
    if embeddings is None:
        embeddings = self.get_embeddings(sentences, bypass_buffer=True)

    # With fewer than two observations the condensed distance matrix is empty
    # and `linkage` raises; every item trivially forms its own cluster.
    observation_count = len(embeddings)
    if observation_count < 2:
        return np.ones(observation_count, dtype=np.int32)

    # Condensed pairwise cosine distances.
    distance_matrix = pdist(embeddings, "cosine")
    # Agglomerative linkage over those distances.
    linked = linkage(distance_matrix, method=self.linkage_method)
    # Cut the dendrogram into flat clusters.
    labels = fcluster(linked, self.max_dist, criterion="distance")
    return labels
|
||
|
||
def filter_clusters_by_word_count(
    self, clusters: Dict[int, List[str]]
) -> Dict[int, List[str]]:
    """
    Drop clusters whose combined text has fewer words than ``self.word_count_threshold``.

    Args:
        clusters (Dict[int, List[str]]): Cluster id -> texts in that cluster.

    Returns:
        Dict[int, List[str]]: The clusters that meet the threshold, unchanged.
    """
    # Words are counted on the space-joined texts using whitespace splitting.
    return {
        cluster_id: texts
        for cluster_id, texts in clusters.items()
        if len(" ".join(texts).split()) >= self.word_count_threshold
    }
|
||
|
||
def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
    """
    Cluster the chunks of ``html`` and return one tagged block per cluster.

    Args:
        url (str): The URL of the webpage.
        html (str): Text chunks joined with ``self.DEL``.

    Returns:
        List[Dict[str, Any]]: One dict per surviving cluster with keys
        "index", "tags" and "content", ordered by cluster label.
    """
    # The input is a DEL-joined list of chunks; pre-filter it semantically.
    text_chunks = self.filter_documents_embeddings(
        html.split(self.DEL), self.semantic_filter
    )
    if not text_chunks:
        return []

    cluster_labels = self.hierarchical_clustering(text_chunks)

    started = time.time()

    # Group chunks by label, keeping their original relative order.
    grouped = {}
    for position, label in enumerate(cluster_labels):
        grouped.setdefault(label, []).append(text_chunks[position])

    surviving = self.filter_clusters_by_word_count(grouped)

    cluster_list = []
    for idx in sorted(surviving):
        cluster_list.append(
            {"index": int(idx), "tags": [], "content": " ".join(surviving[idx])}
        )

    if self.verbose:
        print(f"[LOG] 🚀 Assign tags using {self.device}")

    if self.device.type in ["gpu", "cuda", "mps", "cpu"]:
        # One classifier call for all clusters; results pair up positionally.
        tag_sets = self.nlp([entry["content"] for entry in cluster_list])
        for entry, tags in zip(cluster_list, tag_sets):
            entry["tags"] = tags

    if self.verbose:
        print(f"[LOG] 🚀 Categorization done in {time.time() - started:.2f} seconds")

    return cluster_list
|
||
|
||
def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
    """
    Cluster all sections together in a single `extract` call.

    Args:
        url (str): The URL of the webpage.
        sections (List[str]): List of sections (strings) to process.

    Returns:
        List[Dict[str, Any]]: Whatever `extract` returns for the joined sections.
    """
    # Unlike the base class, sections are not processed independently:
    # they are joined with the delimiter that `extract` splits on.
    joined_sections = self.DEL.join(sections)
    return self.extract(url, joined_sections, **kwargs)
|
||
|
||
|
||
#######################################################
|
||
# Strategies using LLM-based extraction for text data #
|
||
#######################################################
|
||
class LLMExtractionStrategy(ExtractionStrategy):
|
||
"""
|
||
A strategy that uses an LLM to extract meaningful content from the HTML.
|
||
|
||
Attributes:
|
||
llm_config: The LLM configuration object.
|
||
instruction: The instruction to use for the LLM model.
|
||
schema: Pydantic model schema for structured data.
|
||
extraction_type: "block" or "schema".
|
||
chunk_token_threshold: Maximum tokens per chunk.
|
||
overlap_rate: Overlap between chunks.
|
||
word_token_rate: Word to token conversion rate.
|
||
apply_chunking: Whether to apply chunking.
|
||
verbose: Whether to print verbose output.
|
||
usages: List of individual token usages.
|
||
total_usage: Accumulated token usage.
|
||
"""
|
||
# Deprecated constructor arguments mapped to the hint shown in the
# AttributeError raised when one of them is set to a non-default value.
_UNWANTED_PROPS = {
    'provider' : 'Instead, use llm_config=LLMConfig(provider="...")',
    'api_token' : 'Instead, use llm_config=LLMConfig(api_token="...")',
    'base_url' : 'Instead, use llm_config=LLMConfig(base_url="...")',
    'api_base' : 'Instead, use llm_config=LLMConfig(base_url="...")',
}
|
||
def __init__(
    self,
    llm_config: 'LLMConfig' = None,
    instruction: str = None,
    schema: Dict = None,
    extraction_type="block",
    chunk_token_threshold=CHUNK_TOKEN_THRESHOLD,
    overlap_rate=OVERLAP_RATE,
    word_token_rate=WORD_TOKEN_RATE,
    apply_chunking=True,
    input_format: str = "markdown",
    force_json_response=False,
    verbose=False,
    # Deprecated arguments
    provider: str = DEFAULT_PROVIDER,
    api_token: Optional[str] = None,
    base_url: str = None,
    api_base: str = None,
    **kwargs,
):
    """
    Configure LLM-based extraction.

    Args:
        llm_config: The LLM configuration object. When falsy, one is created
            from DEFAULT_PROVIDER and the DEFAULT_PROVIDER_API_KEY environment variable.
        instruction: The instruction to use for the LLM model.
        schema: Pydantic model schema for structured data. When given,
            the extraction type is forced to "schema".
        extraction_type: "block" or "schema".
        chunk_token_threshold: Maximum tokens per chunk.
        overlap_rate: Overlap between chunks.
        word_token_rate: Word to token conversion rate.
        apply_chunking: Whether to apply chunking. When False the chunk
            threshold is set to 1e9.
        input_format: Content format to use for extraction.
                    Options: "markdown" (default), "html", "fit_markdown"
        force_json_response: Whether to force a JSON response from the LLM.
        verbose: Whether to print verbose output.
        **kwargs: Forwarded to the base class. ``extra_args`` (additional
            arguments for the API request, such as temperature, max_tokens,
            etc.) is read from here.

        # Deprecated arguments, will be removed very soon
        provider: The provider to use for extraction. It follows the format <provider_name>/<model_name>, e.g., "ollama/llama3.3".
        api_token: The API token for the provider.
        base_url: The base URL for the API request.
        api_base: The base URL for the API request.
    """
    super().__init__(input_format=input_format, **kwargs)

    # Fall back to the default provider when no configuration was supplied.
    self.llm_config = llm_config or create_llm_config(
        provider=DEFAULT_PROVIDER,
        api_token=os.environ.get(DEFAULT_PROVIDER_API_KEY),
    )

    self.instruction = instruction
    self.schema = schema
    # A schema always implies schema extraction, whatever was requested.
    self.extract_type = "schema" if schema else extraction_type
    self.force_json_response = force_json_response

    self.overlap_rate = overlap_rate
    self.word_token_rate = word_token_rate
    self.apply_chunking = apply_chunking
    if apply_chunking:
        self.chunk_token_threshold = chunk_token_threshold or CHUNK_TOKEN_THRESHOLD
    else:
        # Effectively "one chunk": a threshold no document will reach.
        self.chunk_token_threshold = 1e9

    self.extra_args = kwargs.get("extra_args", {})
    self.verbose = verbose

    self.usages = []  # One entry per LLM call.
    self.total_usage = TokenUsage()  # Running totals across calls.

    # Deprecated arguments: assigning a non-default value here raises
    # AttributeError from __setattr__.
    self.provider = provider
    self.api_token = api_token
    self.base_url = base_url
    self.api_base = api_base
|
||
|
||
|
||
def __setattr__(self, name, value):
    """
    Set an attribute, rejecting deprecated ones given a non-default value.

    Raises:
        AttributeError: If ``name`` is listed in ``_UNWANTED_PROPS`` and
            ``value`` is not the default of the matching ``__init__`` parameter.
    """
    # TODO: Planning to set properties dynamically based on the __init__ signature
    # Only introspect the signature for deprecated names; doing it on every
    # assignment was needless work and never affected the outcome.
    if name in self._UNWANTED_PROPS:
        default = inspect.signature(self.__init__).parameters[name].default
        if value is not default:
            raise AttributeError(f"Setting '{name}' is deprecated. {self._UNWANTED_PROPS[name]}")

    super().__setattr__(name, value)
|
||
|
||
def extract(self, url: str, ix: int, html: str) -> List[Dict[str, Any]]:
    """
    Extract meaningful blocks or chunks from the given HTML using an LLM.

    How it works:
    1. Construct a prompt with variables.
    2. Make a request to the LLM using the prompt.
    3. Parse the response and extract blocks or chunks.

    Args:
        url: The URL of the webpage.
        ix: Index of the block.
        html: The HTML content of the webpage.

    Returns:
        A list of extracted blocks or chunks. On failure, a single block
        with ``"error": True`` and the exception text as its content.
    """
    if self.verbose:
        print(f"[LOG] Call LLM for {url} - block index: {ix}")

    variable_values = {
        "URL": url,
        "HTML": escape_json_string(sanitize_html(html)),
    }

    # Pick the prompt template; later conditions override earlier ones.
    prompt_with_variables = PROMPT_EXTRACT_BLOCKS
    if self.instruction:
        variable_values["REQUEST"] = self.instruction
        prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION

    if self.extract_type == "schema" and self.schema:
        variable_values["SCHEMA"] = json.dumps(self.schema, indent=2)
        prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION

    if self.extract_type == "schema" and not self.schema:
        prompt_with_variables = PROMPT_EXTRACT_INFERRED_SCHEMA

    for variable, value in variable_values.items():
        prompt_with_variables = prompt_with_variables.replace(
            "{" + variable + "}", value
        )

    try:
        response = perform_completion_with_backoff(
            self.llm_config.provider,
            prompt_with_variables,
            self.llm_config.api_token,
            base_url=self.llm_config.base_url,
            json_response=self.force_json_response,
            extra_args=self.extra_args,
        )

        # Track usage
        usage = TokenUsage(
            completion_tokens=response.usage.completion_tokens,
            prompt_tokens=response.usage.prompt_tokens,
            total_tokens=response.usage.total_tokens,
            completion_tokens_details=response.usage.completion_tokens_details.__dict__
            if response.usage.completion_tokens_details
            else {},
            prompt_tokens_details=response.usage.prompt_tokens_details.__dict__
            if response.usage.prompt_tokens_details
            else {},
        )
        self.usages.append(usage)

        # Update totals
        self.total_usage.completion_tokens += usage.completion_tokens
        self.total_usage.prompt_tokens += usage.prompt_tokens
        self.total_usage.total_tokens += usage.total_tokens

        # Keep the message text in its own variable. The original rebound
        # `response` to this string, so the fallback below failed with
        # AttributeError on `response.choices` and never ran.
        content = response.choices[0].message.content

        try:
            if self.force_json_response:
                blocks = json.loads(content)
                if isinstance(blocks, dict):
                    # A single key holding a list, e.g. {"news": [..]}: unwrap the list.
                    if len(blocks) == 1 and isinstance(list(blocks.values())[0], list):
                        blocks = list(blocks.values())[0]
                    else:
                        # Any other object, e.g. {"article_id": "1234", ...}: wrap it.
                        blocks = [blocks]
                # A JSON list is used as-is.
            else:
                blocks = extract_xml_data(["blocks"], content)["blocks"]
                blocks = json.loads(blocks)

            for block in blocks:
                block["error"] = False
        except Exception:
            # Deliberate best-effort: salvage whatever JSON objects can be
            # parsed and report the remainder as an error block.
            parsed, unparsed = split_and_parse_json_objects(content)
            blocks = parsed
            if unparsed:
                blocks.append(
                    {"index": 0, "error": True, "tags": ["error"], "content": unparsed}
                )

        if self.verbose:
            print(
                "[LOG] Extracted",
                len(blocks),
                "blocks from URL:",
                url,
                "block index:",
                ix,
            )
        return blocks
    except Exception as e:
        if self.verbose:
            print(f"[LOG] Error in LLM extraction: {e}")
        # Report the failure as a block instead of raising.
        return [
            {
                "index": ix,
                "error": True,
                "tags": ["error"],
                "content": str(e),
            }
        ]
|
||
|
||
def _merge(self, documents, chunk_token_threshold, overlap) -> List[str]:
    """
    Combine documents into sections via `merge_chunks`.

    ``chunk_token_threshold`` is passed as the target size, ``overlap`` as
    the overlap, and ``self.word_token_rate`` as the word/token ratio.
    """
    return merge_chunks(
        docs=documents,
        target_size=chunk_token_threshold,
        overlap=overlap,
        word_token_ratio=self.word_token_rate,
    )
|
||
|
||
def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]:
    """
    Merge the sections into chunks and run `extract` on each chunk.

    Providers whose name starts with "groq/" are processed one chunk at a
    time with a 0.5 s pause after each call; all others are processed on a
    pool of four threads, with results collected in completion order.

    Args:
        url: The URL of the webpage.
        sections: List of sections (strings) to process.

    Returns:
        A list of extracted blocks or chunks.
    """
    overlap_tokens = int(self.chunk_token_threshold * self.overlap_rate)
    merged_sections = self._merge(
        sections,
        self.chunk_token_threshold,
        overlap=overlap_tokens,
    )

    extract_for_url = partial(self.extract, url)
    results = []

    if self.llm_config.provider.startswith("groq/"):
        # Sequential processing with a delay between calls.
        for ix, section in enumerate(merged_sections):
            results.extend(extract_for_url(ix, sanitize_input_encode(section)))
            time.sleep(0.5)  # 500 ms delay between each processing
        return results

    # Parallel processing using ThreadPoolExecutor
    with ThreadPoolExecutor(max_workers=4) as pool:
        pending = [
            pool.submit(extract_for_url, ix, sanitize_input_encode(section))
            for ix, section in enumerate(merged_sections)
        ]

        for finished in as_completed(pending):
            try:
                results.extend(finished.result())
            except Exception as e:
                if self.verbose:
                    print(f"Error in thread execution: {e}")
                # Record the failure as a block rather than aborting the run.
                results.append(
                    {
                        "index": 0,
                        "error": True,
                        "tags": ["error"],
                        "content": str(e),
                    }
                )

    return results
|
||
|
||
def show_usage(self) -> None:
    """Print the accumulated token totals followed by one row per recorded request."""
    totals = self.total_usage

    print("\n=== Token Usage Summary ===")
    print(f"{'Type':<15} {'Count':>12}")
    print("-" * 30)
    summary_rows = (
        ("Completion", totals.completion_tokens),
        ("Prompt", totals.prompt_tokens),
        ("Total", totals.total_tokens),
    )
    for label, count in summary_rows:
        print(f"{label:<15} {count:>12,}")

    print("\n=== Usage History ===")
    print(f"{'Request #':<10} {'Completion':>12} {'Prompt':>12} {'Total':>12}")
    print("-" * 48)
    # Requests are numbered from 1 in the order they were recorded.
    for request_no, record in enumerate(self.usages, 1):
        print(
            f"{request_no:<10} {record.completion_tokens:>12,} {record.prompt_tokens:>12,} {record.total_tokens:>12,}"
        )
|
||
|
||
|
||
#######################################################
|
||
# New extraction strategies for JSON-based extraction #
|
||
#######################################################
|
||
class JsonElementExtractionStrategy(ExtractionStrategy):
|
||
"""
|
||
Abstract base class for extracting structured JSON from HTML content.
|
||
|
||
How it works:
|
||
1. Parses HTML content using the `_parse_html` method.
|
||
2. Uses a schema to define base selectors, fields, and transformations.
|
||
3. Extracts data hierarchically, supporting nested fields and lists.
|
||
4. Handles computed fields with expressions or functions.
|
||
|
||
Attributes:
|
||
DEL (str): Delimiter used to combine HTML sections. Defaults to '\n'.
|
||
schema (Dict[str, Any]): The schema defining the extraction rules.
|
||
verbose (bool): Enables verbose logging for debugging purposes.
|
||
|
||
Methods:
|
||
extract(url, html_content, *q, **kwargs): Extracts structured data from HTML content.
|
||
_extract_item(element, fields): Extracts fields from a single element.
|
||
_extract_single_field(element, field): Extracts a single field based on its type.
|
||
_apply_transform(value, transform): Applies a transformation to a value.
|
||
_compute_field(item, field): Computes a field value using an expression or function.
|
||
run(url, sections, *q, **kwargs): Combines HTML sections and runs the extraction strategy.
|
||
|
||
Abstract Methods:
|
||
_parse_html(html_content): Parses raw HTML into a structured format (e.g., BeautifulSoup or lxml).
|
||
_get_base_elements(parsed_html, selector): Retrieves base elements using a selector.
|
||
_get_elements(element, selector): Retrieves child elements using a selector.
|
||
_get_element_text(element): Extracts text content from an element.
|
||
_get_element_html(element): Extracts raw HTML from an element.
|
||
_get_element_attribute(element, attribute): Extracts an attribute's value from an element.
|
||
"""
|
||
|
||
DEL = "\n"
|
||
|
||
def __init__(self, schema: Dict[str, Any], **kwargs):
    """
    Initialize the JSON element extraction strategy with a schema.

    Args:
        schema (Dict[str, Any]): The schema defining the extraction rules.
        **kwargs: Forwarded unchanged to the parent initializer. The
            optional ``verbose`` flag is also read from here.
    """
    super().__init__(**kwargs)
    self.schema = schema
    # Logging stays off unless the caller passes ``verbose=True``.
    self.verbose = kwargs.get("verbose", False)
|
||
|
||
def extract(
    self, url: str, html_content: str, *q, **kwargs
) -> List[Dict[str, Any]]:
    """
    Extract structured data from HTML content.

    How it works:
    1. Parses the HTML content using the `_parse_html` method.
    2. Identifies base elements using the schema's base selector.
    3. Extracts the optional `baseFields` from each base element itself.
    4. Extracts the optional `fields` from each base element using `_extract_item`.

    Args:
        url (str): The URL of the page being processed (not used here).
        html_content (str): The raw HTML content to parse and extract.
        *q: Additional positional arguments (ignored).
        **kwargs: Additional keyword arguments (ignored).

    Returns:
        List[Dict[str, Any]]: One dictionary per base element that yielded
        at least one value; elements that yield nothing are skipped.

    Raises:
        KeyError: If the schema has no ``baseSelector`` entry.
    """
    parsed_html = self._parse_html(html_content)
    base_elements = self._get_base_elements(
        parsed_html, self.schema["baseSelector"]
    )

    # Both field lists are optional: a schema may describe only attributes of
    # the base element, or only its children.
    base_fields = self.schema.get("baseFields", [])
    child_fields = self.schema.get("fields", [])

    results = []
    for element in base_elements or []:
        item = {}

        # Values read from the base element itself.
        for field in base_fields:
            value = self._extract_single_field(element, field)
            if value is not None:
                item[field["name"]] = value

        # Values read from the element's descendants; these override base
        # fields that share a name.
        item.update(self._extract_item(element, child_fields))

        if item:
            results.append(item)

    return results
|
||
|
||
@abstractmethod
|
||
def _parse_html(self, html_content: str):
|
||
"""Parse HTML content into appropriate format"""
|
||
pass
|
||
|
||
@abstractmethod
|
||
def _get_base_elements(self, parsed_html, selector: str):
|
||
"""Get all base elements using the selector"""
|
||
pass
|
||
|
||
@abstractmethod
|
||
def _get_elements(self, element, selector: str):
|
||
"""Get child elements using the selector"""
|
||
pass
|
||
|
||
def _extract_field(self, element, field):
|
||
try:
|
||
if field["type"] == "nested":
|
||
nested_elements = self._get_elements(element, field["selector"])
|
||
nested_element = nested_elements[0] if nested_elements else None
|
||
return (
|
||
self._extract_item(nested_element, field["fields"])
|
||
if nested_element
|
||
else {}
|
||
)
|
||
|
||
if field["type"] == "list":
|
||
elements = self._get_elements(element, field["selector"])
|
||
return [self._extract_list_item(el, field["fields"]) for el in elements]
|
||
|
||
if field["type"] == "nested_list":
|
||
elements = self._get_elements(element, field["selector"])
|
||
return [self._extract_item(el, field["fields"]) for el in elements]
|
||
|
||
return self._extract_single_field(element, field)
|
||
except Exception as e:
|
||
if self.verbose:
|
||
print(f"Error extracting field {field['name']}: {str(e)}")
|
||
return field.get("default")
|
||
|
||
def _extract_single_field(self, element, field):
|
||
"""
|
||
Extract a single field based on its type.
|
||
|
||
How it works:
|
||
1. Selects the target element using the field's selector.
|
||
2. Extracts the field value based on its type (e.g., text, attribute, regex).
|
||
3. Applies transformations if defined in the schema.
|
||
|
||
Args:
|
||
element: The base element to extract the field from.
|
||
field (Dict[str, Any]): The field definition in the schema.
|
||
|
||
Returns:
|
||
Any: The extracted field value.
|
||
"""
|
||
|
||
if "selector" in field:
|
||
selected = self._get_elements(element, field["selector"])
|
||
if not selected:
|
||
return field.get("default")
|
||
selected = selected[0]
|
||
else:
|
||
selected = element
|
||
|
||
value = None
|
||
if field["type"] == "text":
|
||
value = self._get_element_text(selected)
|
||
elif field["type"] == "attribute":
|
||
value = self._get_element_attribute(selected, field["attribute"])
|
||
elif field["type"] == "html":
|
||
value = self._get_element_html(selected)
|
||
elif field["type"] == "regex":
|
||
text = self._get_element_text(selected)
|
||
match = re.search(field["pattern"], text)
|
||
value = match.group(1) if match else None
|
||
|
||
if "transform" in field:
|
||
value = self._apply_transform(value, field["transform"])
|
||
|
||
return value if value is not None else field.get("default")
|
||
|
||
def _extract_list_item(self, element, fields):
|
||
item = {}
|
||
for field in fields:
|
||
value = self._extract_single_field(element, field)
|
||
if value is not None:
|
||
item[field["name"]] = value
|
||
return item
|
||
|
||
def _extract_item(self, element, fields):
|
||
"""
|
||
Extracts fields from a given element.
|
||
|
||
How it works:
|
||
1. Iterates through the fields defined in the schema.
|
||
2. Handles computed, single, and nested field types.
|
||
3. Updates the item dictionary with extracted field values.
|
||
|
||
Args:
|
||
element: The base element to extract fields from.
|
||
fields (List[Dict[str, Any]]): The list of fields to extract.
|
||
|
||
Returns:
|
||
Dict[str, Any]: A dictionary representing the extracted item.
|
||
"""
|
||
|
||
item = {}
|
||
for field in fields:
|
||
if field["type"] == "computed":
|
||
value = self._compute_field(item, field)
|
||
else:
|
||
value = self._extract_field(element, field)
|
||
if value is not None:
|
||
item[field["name"]] = value
|
||
return item
|
||
|
||
def _apply_transform(self, value, transform):
|
||
"""
|
||
Apply a transformation to a value.
|
||
|
||
How it works:
|
||
1. Checks the transformation type (e.g., `lowercase`, `strip`).
|
||
2. Applies the transformation to the value.
|
||
3. Returns the transformed value.
|
||
|
||
Args:
|
||
value (str): The value to transform.
|
||
transform (str): The type of transformation to apply.
|
||
|
||
Returns:
|
||
str: The transformed value.
|
||
"""
|
||
|
||
if transform == "lowercase":
|
||
return value.lower()
|
||
elif transform == "uppercase":
|
||
return value.upper()
|
||
elif transform == "strip":
|
||
return value.strip()
|
||
return value
|
||
|
||
def _compute_field(self, item, field):
|
||
try:
|
||
if "expression" in field:
|
||
return eval(field["expression"], {}, item)
|
||
elif "function" in field:
|
||
return field["function"](item)
|
||
except Exception as e:
|
||
if self.verbose:
|
||
print(f"Error computing field {field['name']}: {str(e)}")
|
||
return field.get("default")
|
||
|
||
def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
    """
    Run the extraction strategy on a combined HTML content.

    How it works:
    1. Combines multiple HTML sections using the `DEL` delimiter.
    2. Calls the `extract` method with the combined HTML.

    Args:
        url (str): The URL of the page being processed.
        sections (List[str]): A list of HTML sections.
        *q: Additional positional arguments (not forwarded).
        **kwargs: Additional keyword arguments, forwarded to `extract`.

    Returns:
        List[Dict[str, Any]]: A list of extracted items.
    """
    combined_html = self.DEL.join(sections)
    return self.extract(url, combined_html, **kwargs)
|
||
|
||
@abstractmethod
|
||
def _get_element_text(self, element) -> str:
|
||
"""Get text content from element"""
|
||
pass
|
||
|
||
@abstractmethod
|
||
def _get_element_html(self, element) -> str:
|
||
"""Get HTML content from element"""
|
||
pass
|
||
|
||
@abstractmethod
|
||
def _get_element_attribute(self, element, attribute: str):
|
||
"""Get attribute value from element"""
|
||
pass
|
||
|
||
_GENERATE_SCHEMA_UNWANTED_PROPS = {
|
||
'provider': 'Instead, use llm_config=LLMConfig(provider="...")',
|
||
'api_token': 'Instead, use llm_config=LlMConfig(api_token="...")',
|
||
}
|
||
|
||
@staticmethod
def generate_schema(
    html: str,
    schema_type: str = "CSS",  # or XPATH
    query: str = None,
    target_json_example: str = None,
    # NOTE(review): this default is evaluated once, when the class body is
    # executed, and is then shared by every call that omits llm_config.
    llm_config: 'LLMConfig' = create_llm_config(),
    provider: str = None,
    api_token: str = None,
    **kwargs
) -> dict:
    """
    Generate extraction schema from HTML content and optional query.

    Args:
        html (str): The HTML content to analyze
        schema_type (str): "CSS" selects the CSS prompt template; any other
            value selects the XPath template
        query (str, optional): Natural language description of what data to extract
        target_json_example (str, optional): Example of the JSON object the
            schema should produce
        llm_config (LLMConfig): LLM configuration object
        provider (str): Legacy Parameter. Must be left as None
        api_token (str): Legacy Parameter. Must be left as None
        **kwargs: Additional args passed to the LLM call as ``extra_args``

    Returns:
        dict: Generated schema following the JsonElementExtractionStrategy format

    Raises:
        AttributeError: If a legacy parameter (`provider`, `api_token`) is set.
        Exception: If the LLM call fails or its answer is not valid JSON;
            the original error is attached as the cause.
    """
    from .prompts import JSON_SCHEMA_BUILDER
    from .utils import perform_completion_with_backoff

    # Reject the legacy arguments explicitly rather than looking them up
    # through locals(), which breaks silently if a parameter is renamed.
    legacy_values = {"provider": provider, "api_token": api_token}
    for name, message in JsonElementExtractionStrategy._GENERATE_SCHEMA_UNWANTED_PROPS.items():
        if legacy_values.get(name) is not None:
            raise AttributeError(f"Setting '{name}' is deprecated. {message}")

    # Use default or custom prompt
    # NOTE(review): JSON_SCHEMA_BUILDER_XPATH is not imported in this
    # function; it is expected to be available at module level - confirm.
    prompt_template = JSON_SCHEMA_BUILDER if schema_type == "CSS" else JSON_SCHEMA_BUILDER_XPATH

    # Build the prompt
    system_message = {
        "role": "system",
        "content": f"""You specialize in generating special JSON schemas for web scraping. This schema uses CSS or XPATH selectors to present a repetitive pattern in crawled HTML, such as a product in a product list or a search result item in a list of search results. We use this JSON schema to pass to a language model along with the HTML content to extract structured data from the HTML. The language model uses the JSON schema to extract data from the HTML and retrieve values for fields in the JSON schema, following the schema.

Generating this HTML manually is not feasible, so you need to generate the JSON schema using the HTML content. The HTML copied from the crawled website is provided below, which we believe contains the repetitive pattern.

# Schema main keys:
- name: This is the name of the schema.
- baseSelector: This is the CSS or XPATH selector that identifies the base element that contains all the repetitive patterns.
- baseFields: This is a list of fields that you extract from the base element itself.
- fields: This is a list of fields that you extract from the children of the base element. {{name, selector, type}} based on the type, you may have extra keys such as "attribute" when the type is "attribute".

# Extra Context:
In this context, the following items may or may not be present:
- Example of target JSON object: This is a sample of the final JSON object that we hope to extract from the HTML using the schema you are generating.
- Extra Instructions: This is optional instructions to consider when generating the schema provided by the user.
- Query or explanation of target/goal data item: This is a description of what data we are trying to extract from the HTML. This explanation means we're not sure about the rigid schema of the structures we want, so we leave it to you to use your expertise to create the best and most comprehensive structures aimed at maximizing data extraction from this page. You must ensure that you do not pick up nuances that may exist on a particular page. The focus should be on the data we are extracting, and it must be valid, safe, and robust based on the given HTML.

# What if there is no example of target JSON object and also no extra instructions or even no explanation of target/goal data item?
In this scenario, use your best judgment to generate the schema. You need to examine the content of the page and understand the data it provides. If the page contains repetitive data, such as lists of items, products, jobs, places, books, or movies, focus on one single item that repeats. If the page is a detailed page about one product or item, create a schema to extract the entire structured data. At this stage, you must think and decide for yourself. Try to maximize the number of fields that you can extract from the HTML.

# What are the instructions and details for this schema generation?
{prompt_template}"""
    }

    user_message = {
        "role": "user",
        "content": f"""
HTML to analyze:
```html
{html}
```
"""
    }

    if query:
        user_message["content"] += f"\n\n## Query or explanation of target/goal data item:\n{query}"
    if target_json_example:
        user_message["content"] += f"\n\n## Example of target JSON object:\n```json\n{target_json_example}\n```"

    # Exactly one reminder is added unless both a query and an example were given.
    if query and not target_json_example:
        user_message["content"] += """IMPORTANT: To remind you, in this process, we are not providing a rigid example of the adjacent objects we seek. We rely on your understanding of the explanation provided in the above section. Make sure to grasp what we are looking for and, based on that, create the best schema.."""
    elif not query and target_json_example:
        user_message["content"] += """IMPORTANT: Please remember that in this process, we provided a proper example of a target JSON object. Make sure to adhere to the structure and create a schema that exactly fits this example. If you find that some elements on the page do not match completely, vote for the majority."""
    elif not query and not target_json_example:
        user_message["content"] += """IMPORTANT: Since we neither have a query nor an example, it is crucial to rely solely on the HTML content provided. Leverage your expertise to determine the schema based on the repetitive patterns observed in the content."""

    user_message["content"] += """IMPORTANT: Ensure your schema remains reliable by avoiding selectors that appear to generate dynamically and are not dependable. You want a reliable schema, as it consistently returns the same data even after many page reloads.

Analyze the HTML and generate a JSON schema that follows the specified format. Only output valid JSON schema, nothing else.
"""

    try:
        # Call LLM with backoff handling; both messages are sent as a
        # single prompt string.
        response = perform_completion_with_backoff(
            provider=llm_config.provider,
            prompt_with_variables="\n\n".join([system_message["content"], user_message["content"]]),
            json_response=True,
            api_token=llm_config.api_token,
            base_url=llm_config.base_url,
            extra_args=kwargs
        )

        # Extract and return schema
        return json.loads(response.choices[0].message.content)

    except Exception as e:
        # Chain the cause so the original traceback is not lost.
        raise Exception(f"Failed to generate schema: {str(e)}") from e
|
||
|
||
class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
    """
    Concrete implementation of `JsonElementExtractionStrategy` using CSS selectors.

    How it works:
    1. Parses HTML content with BeautifulSoup.
    2. Selects elements using CSS selectors defined in the schema.
    3. Extracts field data and applies transformations as defined.

    Attributes:
        schema (Dict[str, Any]): The schema defining the extraction rules.
        verbose (bool): Enables verbose logging for debugging purposes.

    Methods:
        _parse_html(html_content): Parses HTML content into a BeautifulSoup object.
        _get_base_elements(parsed_html, selector): Selects base elements using a CSS selector.
        _get_elements(element, selector): Selects child elements using a CSS selector.
        _get_element_text(element): Extracts text content from a BeautifulSoup element.
        _get_element_html(element): Extracts the raw HTML content of a BeautifulSoup element.
        _get_element_attribute(element, attribute): Retrieves an attribute value from a BeautifulSoup element.
    """

    def __init__(self, schema: Dict[str, Any], **kwargs):
        """Create the strategy; ``input_format`` is always forced to "html"."""
        kwargs["input_format"] = "html"  # Force HTML input
        super().__init__(schema, **kwargs)

    def _parse_html(self, html_content: str):
        """Parse the document with BeautifulSoup's "lxml" parser backend."""
        return BeautifulSoup(html_content, "lxml")

    def _get_base_elements(self, parsed_html, selector: str):
        """Return every element of the document matching the CSS selector."""
        return parsed_html.select(selector)

    def _get_elements(self, element, selector: str):
        """Return all descendants of `element` matching the CSS selector."""
        # select() rather than select_one(): callers decide whether they need
        # only the first match or every match.
        return element.select(selector)

    def _get_element_text(self, element) -> str:
        """Return the element's text with surrounding whitespace stripped."""
        return element.get_text(strip=True)

    def _get_element_html(self, element) -> str:
        """Return the element serialized as markup."""
        return str(element)

    def _get_element_attribute(self, element, attribute: str):
        """Return the attribute value, or None when the attribute is absent."""
        return element.get(attribute)
|
||
|
||
class JsonLxmlExtractionStrategy(JsonElementExtractionStrategy):
|
||
def __init__(self, schema: Dict[str, Any], **kwargs):
    """
    Create an lxml-backed extraction strategy.

    Args:
        schema (Dict[str, Any]): The schema defining the extraction rules.
        **kwargs: Forwarded to the parent initializer (``input_format`` is
            forced to "html"). Two options are also read here:
            ``use_caching`` (default True) enables caching of selection
            results, and ``optimize_common_patterns`` (default True) enables
            selector simplification.
    """
    kwargs["input_format"] = "html"
    super().__init__(schema, **kwargs)

    self._selector_cache = {}  # selector string -> selector function
    self._xpath_cache = {}     # selector string -> translated XPath
    self._result_cache = {}    # selection results, see use_caching

    # Control selector optimization strategy
    self.use_caching = kwargs.get("use_caching", True)
    self.optimize_common_patterns = kwargs.get("optimize_common_patterns", True)

    # Load lxml dependencies once and keep them on the instance so the
    # other methods do not have to import them again.
    from lxml import etree, html
    from lxml.cssselect import CSSSelector

    self.etree = etree
    self.html_parser = html
    self.CSSSelector = CSSSelector
|
||
|
||
def _parse_html(self, html_content: str):
|
||
"""Parse HTML content with error recovery"""
|
||
try:
|
||
parser = self.etree.HTMLParser(recover=True, remove_blank_text=True)
|
||
return self.etree.fromstring(html_content, parser)
|
||
except Exception as e:
|
||
if self.verbose:
|
||
print(f"Error parsing HTML, falling back to alternative method: {e}")
|
||
try:
|
||
return self.html_parser.fromstring(html_content)
|
||
except Exception as e2:
|
||
if self.verbose:
|
||
print(f"Critical error parsing HTML: {e2}")
|
||
# Create minimal document as fallback
|
||
return self.etree.Element("html")
|
||
|
||
def _optimize_selector(self, selector_str):
|
||
"""Optimize common selector patterns for better performance"""
|
||
if not self.optimize_common_patterns:
|
||
return selector_str
|
||
|
||
# Handle td:nth-child(N) pattern which is very common in table scraping
|
||
import re
|
||
if re.search(r'td:nth-child\(\d+\)', selector_str):
|
||
return selector_str # Already handled specially in _apply_selector
|
||
|
||
# Split complex selectors into parts for optimization
|
||
parts = selector_str.split()
|
||
if len(parts) <= 1:
|
||
return selector_str
|
||
|
||
# For very long selectors, consider using just the last specific part
|
||
if len(parts) > 3 and any(p.startswith('.') or p.startswith('#') for p in parts):
|
||
specific_parts = [p for p in parts if p.startswith('.') or p.startswith('#')]
|
||
if specific_parts:
|
||
return specific_parts[-1] # Use most specific class/id selector
|
||
|
||
return selector_str
|
||
|
||
def _create_selector_function(self, selector_str):
|
||
"""Create a selector function that handles all edge cases"""
|
||
original_selector = selector_str
|
||
|
||
# Try to optimize the selector if appropriate
|
||
if self.optimize_common_patterns:
|
||
selector_str = self._optimize_selector(selector_str)
|
||
|
||
try:
|
||
# Attempt to compile the CSS selector
|
||
compiled = self.CSSSelector(selector_str)
|
||
xpath = compiled.path
|
||
|
||
# Store XPath for later use
|
||
self._xpath_cache[selector_str] = xpath
|
||
|
||
# Create the wrapper function that implements the selection strategy
|
||
def selector_func(element, context_sensitive=True):
|
||
cache_key = None
|
||
|
||
# Use result caching if enabled
|
||
if self.use_caching:
|
||
# Create a cache key based on element and selector
|
||
element_id = element.get('id', '') or str(hash(element))
|
||
cache_key = f"{element_id}::{selector_str}"
|
||
|
||
if cache_key in self._result_cache:
|
||
return self._result_cache[cache_key]
|
||
|
||
results = []
|
||
try:
|
||
# Strategy 1: Direct CSS selector application (fastest)
|
||
results = compiled(element)
|
||
|
||
# If that fails and we need context sensitivity
|
||
if not results and context_sensitive:
|
||
# Strategy 2: Try XPath with context adjustment
|
||
context_xpath = self._make_context_sensitive_xpath(xpath, element)
|
||
if context_xpath:
|
||
results = element.xpath(context_xpath)
|
||
|
||
# Strategy 3: Handle special case - nth-child
|
||
if not results and 'nth-child' in original_selector:
|
||
results = self._handle_nth_child_selector(element, original_selector)
|
||
|
||
# Strategy 4: Direct descendant search for class/ID selectors
|
||
if not results:
|
||
results = self._fallback_class_id_search(element, original_selector)
|
||
|
||
# Strategy 5: Last resort - tag name search for the final part
|
||
if not results:
|
||
parts = original_selector.split()
|
||
if parts:
|
||
last_part = parts[-1]
|
||
# Extract tag name from the selector
|
||
tag_match = re.match(r'^(\w+)', last_part)
|
||
if tag_match:
|
||
tag_name = tag_match.group(1)
|
||
results = element.xpath(f".//{tag_name}")
|
||
|
||
# Cache results if caching is enabled
|
||
if self.use_caching and cache_key:
|
||
self._result_cache[cache_key] = results
|
||
|
||
except Exception as e:
|
||
if self.verbose:
|
||
print(f"Error applying selector '{selector_str}': {e}")
|
||
|
||
return results
|
||
|
||
return selector_func
|
||
|
||
except Exception as e:
|
||
if self.verbose:
|
||
print(f"Error compiling selector '{selector_str}': {e}")
|
||
|
||
# Fallback function for invalid selectors
|
||
return lambda element, context_sensitive=True: []
|
||
|
||
def _make_context_sensitive_xpath(self, xpath, element):
|
||
"""Convert absolute XPath to context-sensitive XPath"""
|
||
try:
|
||
# If starts with descendant-or-self, it's already context-sensitive
|
||
if xpath.startswith('descendant-or-self::'):
|
||
return xpath
|
||
|
||
# Remove leading slash if present
|
||
if xpath.startswith('/'):
|
||
context_xpath = f".{xpath}"
|
||
else:
|
||
context_xpath = f".//{xpath}"
|
||
|
||
# Validate the XPath by trying it
|
||
try:
|
||
element.xpath(context_xpath)
|
||
return context_xpath
|
||
except:
|
||
# If that fails, try a simpler descendant search
|
||
return f".//{xpath.split('/')[-1]}"
|
||
except:
|
||
return None
|
||
|
||
def _handle_nth_child_selector(self, element, selector_str):
|
||
"""Special handling for nth-child selectors in tables"""
|
||
import re
|
||
results = []
|
||
|
||
try:
|
||
# Extract the column number from td:nth-child(N)
|
||
match = re.search(r'td:nth-child\((\d+)\)', selector_str)
|
||
if match:
|
||
col_num = match.group(1)
|
||
|
||
# Check if there's content after the nth-child part
|
||
remaining_selector = selector_str.split(f"td:nth-child({col_num})", 1)[-1].strip()
|
||
|
||
if remaining_selector:
|
||
# If there's a specific element we're looking for after the column
|
||
# Extract any tag names from the remaining selector
|
||
tag_match = re.search(r'(\w+)', remaining_selector)
|
||
tag_name = tag_match.group(1) if tag_match else '*'
|
||
results = element.xpath(f".//td[{col_num}]//{tag_name}")
|
||
else:
|
||
# Just get the column cell
|
||
results = element.xpath(f".//td[{col_num}]")
|
||
except Exception as e:
|
||
if self.verbose:
|
||
print(f"Error handling nth-child selector: {e}")
|
||
|
||
return results
|
||
|
||
def _fallback_class_id_search(self, element, selector_str):
|
||
"""Fallback to search by class or ID"""
|
||
results = []
|
||
|
||
try:
|
||
# Extract class selectors (.classname)
|
||
import re
|
||
class_matches = re.findall(r'\.([a-zA-Z0-9_-]+)', selector_str)
|
||
|
||
# Extract ID selectors (#idname)
|
||
id_matches = re.findall(r'#([a-zA-Z0-9_-]+)', selector_str)
|
||
|
||
# Try each class
|
||
for class_name in class_matches:
|
||
class_results = element.xpath(f".//*[contains(@class, '{class_name}')]")
|
||
results.extend(class_results)
|
||
|
||
# Try each ID (usually more specific)
|
||
for id_name in id_matches:
|
||
id_results = element.xpath(f".//*[@id='{id_name}']")
|
||
results.extend(id_results)
|
||
except Exception as e:
|
||
if self.verbose:
|
||
print(f"Error in fallback class/id search: {e}")
|
||
|
||
return results
|
||
|
||
def _get_selector(self, selector_str):
|
||
"""Get or create a selector function with caching"""
|
||
if selector_str not in self._selector_cache:
|
||
self._selector_cache[selector_str] = self._create_selector_function(selector_str)
|
||
return self._selector_cache[selector_str]
|
||
|
||
def _get_base_elements(self, parsed_html, selector: str):
|
||
"""Get all base elements using the selector"""
|
||
selector_func = self._get_selector(selector)
|
||
# For base elements, we don't need context sensitivity
|
||
return selector_func(parsed_html, context_sensitive=False)
|
||
|
||
def _get_elements(self, element, selector: str):
|
||
"""Get child elements using the selector with context sensitivity"""
|
||
selector_func = self._get_selector(selector)
|
||
return selector_func(element, context_sensitive=True)
|
||
|
||
def _get_element_text(self, element) -> str:
|
||
"""Extract normalized text from element"""
|
||
try:
|
||
# Get all text nodes and normalize
|
||
text = " ".join(t.strip() for t in element.xpath(".//text()") if t.strip())
|
||
return text
|
||
except Exception as e:
|
||
if self.verbose:
|
||
print(f"Error extracting text: {e}")
|
||
# Fallback
|
||
try:
|
||
return element.text_content().strip()
|
||
except:
|
||
return ""
|
||
|
||
def _get_element_html(self, element) -> str:
|
||
"""Get HTML string representation of element"""
|
||
try:
|
||
return self.etree.tostring(element, encoding='unicode', method='html')
|
||
except Exception as e:
|
||
if self.verbose:
|
||
print(f"Error serializing HTML: {e}")
|
||
return ""
|
||
|
||
def _get_element_attribute(self, element, attribute: str):
|
||
"""Get attribute value safely"""
|
||
try:
|
||
return element.get(attribute)
|
||
except Exception as e:
|
||
if self.verbose:
|
||
print(f"Error getting attribute '{attribute}': {e}")
|
||
return None
|
||
|
||
def _clear_caches(self):
|
||
"""Clear caches to free memory"""
|
||
if self.use_caching:
|
||
self._result_cache.clear()
|
||
|
||
class JsonLxmlExtractionStrategy_naive(JsonElementExtractionStrategy):
    """
    Straightforward lxml-based implementation of `JsonElementExtractionStrategy`.

    CSS selectors are compiled with `lxml.cssselect.CSSSelector` and memoized
    per selector string. When a selector matches nothing, progressively
    looser XPath fallbacks are tried.
    """

    def __init__(self, schema: Dict[str, Any], **kwargs):
        """Create the strategy; ``input_format`` is always forced to "html"."""
        kwargs["input_format"] = "html"  # Force HTML input
        super().__init__(schema, **kwargs)
        self._selector_cache = {}  # selector string -> selection function

    def _parse_html(self, html_content: str):
        """Parse the document with lxml's recovering HTML parser."""
        from lxml import etree

        parser = etree.HTMLParser(recover=True)
        return etree.fromstring(html_content, parser)

    def _get_selector(self, selector_str):
        """
        Get a selector function that works within the context of an element.

        Returns:
            Callable[[element], list]: Memoized per selector string. A
            selector that cannot be compiled yields a function that always
            returns an empty list.
        """
        if selector_str in self._selector_cache:
            return self._selector_cache[selector_str]

        from lxml.cssselect import CSSSelector

        try:
            compiled = CSSSelector(selector_str)
        except Exception as e:
            if self.verbose:
                print(f"Error compiling selector '{selector_str}': {e}")

            # Fallback function for invalid selectors
            def fallback_func(element):
                return []

            self._selector_cache[selector_str] = fallback_func
            return fallback_func

        def select_func(element):
            try:
                # First attempt: direct CSS selector application
                results = compiled(element)
                if results:
                    return results

                # Second attempt: contextual XPath selection. Expressions
                # that already start with descendant-or-self are used as
                # they are; others are made relative to the current element.
                xpath = compiled.path
                if xpath.startswith('descendant-or-self::'):
                    context_xpath = xpath
                else:
                    context_xpath = f"./{xpath.lstrip('/')}"

                results = element.xpath(context_xpath)
                if results:
                    return results

                # Final fallback: simple descendant search for common patterns
                if 'nth-child' in selector_str:
                    # Handle td:nth-child(N) pattern
                    import re

                    match = re.search(r'td:nth-child\((\d+)\)', selector_str)
                    if match:
                        col_num = match.group(1)
                        # NOTE(review): whatever follows the first ")" is
                        # placed into the XPath verbatim, so it only works
                        # when it is a plain tag name.
                        sub_selector = selector_str.split(')', 1)[-1].strip()
                        if sub_selector:
                            return element.xpath(f".//td[{col_num}]//{sub_selector}")
                        else:
                            return element.xpath(f".//td[{col_num}]")

                # Last resort: search for the last part of the selector
                parts = selector_str.split()
                if len(parts) > 1 and parts[-1]:
                    return element.xpath(f".//{parts[-1]}")

                return []
            except Exception as e:
                if self.verbose:
                    print(f"Error applying selector '{selector_str}': {e}")
                return []

        self._selector_cache[selector_str] = select_func
        return select_func

    def _get_base_elements(self, parsed_html, selector: str):
        """Return the elements of the document matched by `selector`."""
        selector_func = self._get_selector(selector)
        return selector_func(parsed_html)

    def _get_elements(self, element, selector: str):
        """Return the elements below `element` matched by `selector`."""
        selector_func = self._get_selector(selector)
        return selector_func(element)

    def _get_element_text(self, element) -> str:
        """Return all text below the element, concatenated and stripped at both ends."""
        return "".join(element.xpath(".//text()")).strip()

    def _get_element_html(self, element) -> str:
        """Return the element serialized as text."""
        from lxml import etree

        return etree.tostring(element, encoding='unicode')

    def _get_element_attribute(self, element, attribute: str):
        """Return the attribute value, or None when the attribute is absent."""
        return element.get(attribute)
|
||
|
||
class JsonXPathExtractionStrategy(JsonElementExtractionStrategy):
|
||
"""
|
||
Concrete implementation of `JsonElementExtractionStrategy` using XPath selectors.
|
||
|
||
How it works:
|
||
1. Parses HTML content into an lxml tree.
|
||
2. Selects elements using XPath expressions.
|
||
3. Converts CSS selectors to XPath when needed.
|
||
|
||
Attributes:
|
||
schema (Dict[str, Any]): The schema defining the extraction rules.
|
||
verbose (bool): Enables verbose logging for debugging purposes.
|
||
|
||
Methods:
|
||
_parse_html(html_content): Parses HTML content into an lxml tree.
|
||
_get_base_elements(parsed_html, selector): Selects base elements using an XPath selector.
|
||
_css_to_xpath(css_selector): Converts a CSS selector to an XPath expression.
|
||
_get_elements(element, selector): Selects child elements using an XPath selector.
|
||
_get_element_text(element): Extracts text content from an lxml element.
|
||
_get_element_html(element): Extracts the raw HTML content of an lxml element.
|
||
_get_element_attribute(element, attribute): Retrieves an attribute value from an lxml element.
|
||
"""
|
||
|
||
def __init__(self, schema: Dict[str, Any], **kwargs):
|
||
kwargs["input_format"] = "html" # Force HTML input
|
||
super().__init__(schema, **kwargs)
|
||
|
||
def _parse_html(self, html_content: str):
|
||
return html.fromstring(html_content)
|
||
|
||
def _get_base_elements(self, parsed_html, selector: str):
|
||
return parsed_html.xpath(selector)
|
||
|
||
def _css_to_xpath(self, css_selector: str) -> str:
|
||
"""Convert CSS selector to XPath if needed"""
|
||
if "/" in css_selector: # Already an XPath
|
||
return css_selector
|
||
return self._basic_css_to_xpath(css_selector)
|
||
|
||
def _basic_css_to_xpath(self, css_selector: str) -> str:
|
||
"""Basic CSS to XPath conversion for common cases"""
|
||
if " > " in css_selector:
|
||
parts = css_selector.split(" > ")
|
||
return "//" + "/".join(parts)
|
||
if " " in css_selector:
|
||
parts = css_selector.split(" ")
|
||
return "//" + "//".join(parts)
|
||
return "//" + css_selector
|
||
|
||
def _get_elements(self, element, selector: str):
|
||
xpath = self._css_to_xpath(selector)
|
||
if not xpath.startswith("."):
|
||
xpath = "." + xpath
|
||
return element.xpath(xpath)
|
||
|
||
def _get_element_text(self, element) -> str:
|
||
return "".join(element.xpath(".//text()")).strip()
|
||
|
||
def _get_element_html(self, element) -> str:
    """Serialize the element with ``etree.tostring`` as a unicode string."""
    serialized = etree.tostring(element, encoding="unicode")
    return serialized
|
||
|
||
def _get_element_attribute(self, element, attribute: str):
|
||
return element.get(attribute)
|
||
|
||
|
||
```
|
||
|
||
|
||
## File: crawl4ai/models.py
|
||
|
||
```py
|
||
from dataclasses import dataclass, field
from datetime import datetime
from datetime import timedelta
from enum import Enum
from typing import List, Dict, Optional, Callable, Awaitable, Union, Any
from typing import AsyncGenerator
from typing import Generic, TypeVar

from pydantic import BaseModel, HttpUrl, PrivateAttr

from .ssl_certificate import SSLCertificate
|
||
|
||
|
||
###############################
|
||
# Dispatcher Models
|
||
###############################
|
||
@dataclass
class DomainState:
    """Mutable record of request-pacing counters (presumably kept per domain)."""

    # Timestamp of the most recent request; 0 until one is recorded.
    last_request_time: float = 0
    # Delay currently in effect; 0 by default.
    current_delay: float = 0
    # Number of failures counted so far.
    fail_count: int = 0
|
||
|
||
|
||
@dataclass
class CrawlerTaskResult:
    """
    Outcome of one dispatched crawl task.

    Bundles the crawl result with the resource and timing figures recorded
    for the task.
    """

    task_id: str
    url: str
    result: "CrawlResult"
    memory_usage: float
    peak_memory: float
    # Either a datetime or a numeric timestamp.
    start_time: Union[datetime, float]
    end_time: Union[datetime, float]
    error_message: str = ""
    retry_count: int = 0
    wait_time: float = 0.0

    @property
    def success(self) -> bool:
        """Mirror the ``success`` flag of the wrapped result."""
        wrapped = self.result
        return wrapped.success
|
||
|
||
class CrawlStatus(Enum):
    """States a crawl task can be reported in."""

    QUEUED = "QUEUED"
    IN_PROGRESS = "IN_PROGRESS"
    COMPLETED = "COMPLETED"
    FAILED = "FAILED"


@dataclass
class CrawlStats:
    """
    Progress and resource statistics for a single crawl task.

    ``start_time`` / ``end_time`` may each be a ``datetime`` or a numeric
    timestamp (seconds since the epoch, int or float).
    """

    task_id: str
    url: str
    status: CrawlStatus
    start_time: Optional[Union[datetime, float]] = None
    end_time: Optional[Union[datetime, float]] = None
    memory_usage: float = 0.0
    peak_memory: float = 0.0
    error_message: str = ""
    wait_time: float = 0.0
    retry_count: int = 0
    counted_requeue: bool = False

    @staticmethod
    def _as_datetime(moment):
        """Convert a numeric timestamp to a local ``datetime``; pass anything else through."""
        if isinstance(moment, (int, float)):
            return datetime.fromtimestamp(moment)
        return moment

    @property
    def duration(self) -> str:
        """
        Elapsed time formatted by ``str(timedelta)`` with whole-second precision.

        Returns:
            str: ``"0:00"`` when no start time is set; otherwise the span
            from ``start_time`` to ``end_time``, or to the current time when
            ``end_time`` is unset.
        """
        if not self.start_time:
            return "0:00"

        # Integer timestamps are accepted as well as floats; previously an
        # int fell through unconverted and the subtraction below failed.
        start = self._as_datetime(self.start_time)
        end = self._as_datetime(self.end_time or datetime.now())

        elapsed = end - start
        return str(timedelta(seconds=int(elapsed.total_seconds())))
|
||
|
||
class DisplayMode(Enum):
    """Available display modes; each member's value equals its name."""

    DETAILED = "DETAILED"
    AGGREGATED = "AGGREGATED"
|
||
|
||
|
||
###############################
|
||
# Crawler Models
|
||
###############################
|
||
@dataclass
class TokenUsage:
    """Token counters, all defaulting to zero, plus optional detail dictionaries."""

    completion_tokens: int = 0
    prompt_tokens: int = 0
    total_tokens: int = 0
    # Optional breakdowns; left as None when no details are supplied.
    completion_tokens_details: Optional[dict] = None
    prompt_tokens_details: Optional[dict] = None
|
||
|
||
class UrlModel(BaseModel):
    """Model holding an ``HttpUrl`` together with a ``forced`` flag."""

    url: HttpUrl
    # Defaults to False when not supplied.
    forced: bool = False
|
||
|
||
|
||
|
||
@dataclass
class TraversalStats:
    """Statistics for the traversal process"""

    # Captured when each instance is created. A plain ``datetime.now()``
    # default would be evaluated once, at class definition, and then shared
    # by every instance.
    start_time: datetime = field(default_factory=datetime.now)
    urls_processed: int = 0
    urls_failed: int = 0
    urls_skipped: int = 0
    total_depth_reached: int = 0
    current_depth: int = 0
|
||
|
||
class DispatchResult(BaseModel):
    """Dispatch bookkeeping for a task: its id, memory figures, timing and error text."""

    task_id: str
    memory_usage: float
    peak_memory: float
    # Either a datetime or a numeric timestamp.
    start_time: Union[datetime, float]
    end_time: Union[datetime, float]
    error_message: str = ""
|
||
|
||
class MarkdownGenerationResult(BaseModel):
    """
    Markdown renderings produced for a page.

    ``str()`` of an instance yields ``raw_markdown``.
    """

    raw_markdown: str
    markdown_with_citations: str
    references_markdown: str
    # Optional "fit" variants; None when not supplied.
    fit_markdown: Optional[str] = None
    fit_html: Optional[str] = None

    def __str__(self):
        """Return the raw markdown text."""
        raw = self.raw_markdown
        return raw
|
||
|
||
class CrawlResult(BaseModel):
    """
    Result model for a single crawled URL.

    The markdown output is kept in the private ``_markdown`` attribute and
    exposed through the ``markdown`` property (see the note further down);
    ``model_dump`` adds it back under the ``"markdown"`` key.
    """

    url: str
    html: str
    success: bool
    cleaned_html: Optional[str] = None
    media: Dict[str, List[Dict]] = {}
    links: Dict[str, List[Dict]] = {}
    downloaded_files: Optional[List[str]] = None
    js_execution_result: Optional[Dict[str, Any]] = None
    screenshot: Optional[str] = None
    pdf: Optional[bytes] = None
    mhtml: Optional[str] = None
    _markdown: Optional[MarkdownGenerationResult] = PrivateAttr(default=None)
    extracted_content: Optional[str] = None
    metadata: Optional[dict] = None
    error_message: Optional[str] = None
    session_id: Optional[str] = None
    response_headers: Optional[dict] = None
    status_code: Optional[int] = None
    ssl_certificate: Optional[SSLCertificate] = None
    dispatch_result: Optional[DispatchResult] = None
    redirected_url: Optional[str] = None
    network_requests: Optional[List[Dict[str, Any]]] = None
    console_messages: Optional[List[Dict[str, Any]]] = None

    class Config:
        arbitrary_types_allowed = True

    # NOTE: StringCompatibleMarkdown, the custom __init__, the property
    # getters/setters and the model_dump override all exist to ease the
    # transition from markdown-as-a-string to markdown-as-a-
    # MarkdownGenerationResult while staying backward compatible.
    #
    # Code that expects markdown to be a string keeps working, and the full
    # MarkdownGenerationResult attributes remain reachable.
    #
    # The markdown_v2 property is deprecated and raises an error directing
    # users to markdown.
    #
    # Once backward compatibility is no longer required, this mechanism can
    # be collapsed into a standard field with no custom accessors or
    # serialization logic.

    def __init__(self, **data):
        """
        Build the model, accepting ``markdown`` as an extra keyword.

        ``markdown`` is removed from ``data`` before validation. A dict is
        converted to ``MarkdownGenerationResult``; any other non-None value
        is stored as given.
        """
        markdown_result = data.pop("markdown", None)
        super().__init__(**data)
        if markdown_result is None:
            return
        if isinstance(markdown_result, dict):
            markdown_result = MarkdownGenerationResult(**markdown_result)
        self._markdown = markdown_result

    @property
    def markdown(self):
        """
        The stored markdown wrapped in ``StringCompatibleMarkdown``.

        The wrapper behaves like a string while still exposing the
        ``MarkdownGenerationResult`` attributes, so callers that treat
        ``markdown`` as plain text keep working. Returns None when no
        markdown has been stored.
        """
        stored = self._markdown
        if stored is None:
            return None
        return StringCompatibleMarkdown(stored)

    @markdown.setter
    def markdown(self, value):
        """Store ``value`` as the markdown result."""
        self._markdown = value

    @property
    def markdown_v2(self):
        """
        Deprecated accessor; always raises ``AttributeError``.

        Exists only to tell users that ``markdown_v2`` is gone and that
        ``markdown`` should be used instead.
        """
        raise AttributeError(
            "The 'markdown_v2' attribute is deprecated and has been removed. "
            "Please use 'markdown' instead, which now returns a MarkdownGenerationResult, with\n"
            "following properties:\n"
            "- raw_markdown: The raw markdown string\n"
            "- markdown_with_citations: The markdown string with citations\n"
            "- references_markdown: The markdown string with references\n"
            "- fit_markdown: The markdown string with fit text\n"
        )

    @property
    def fit_markdown(self):
        """Deprecated accessor; always raises ``AttributeError``."""
        raise AttributeError(
            "The 'fit_markdown' attribute is deprecated and has been removed. "
            "Please use 'markdown.fit_markdown' instead."
        )

    @property
    def fit_html(self):
        """Deprecated accessor; always raises ``AttributeError``."""
        raise AttributeError(
            "The 'fit_html' attribute is deprecated and has been removed. "
            "Please use 'markdown.fit_html' instead."
        )

    def model_dump(self, *args, **kwargs):
        """
        Serialize the model, including the privately stored markdown.

        Private attributes are left out of the standard dump, so the
        markdown result (when present) is dumped separately and inserted
        under the ``"markdown"`` key to keep that key in the output for
        backward compatibility. If the serialization requirements change,
        this is the place to update.
        """
        dumped = super().model_dump(*args, **kwargs)
        stored = self._markdown
        if stored is not None:
            dumped["markdown"] = stored.model_dump()
        return dumped
|
||
|
||
class StringCompatibleMarkdown(str):
    """
    A string subclass that also provides access to MarkdownGenerationResult attributes.

    The string value is the wrapped object's ``raw_markdown``. Attributes
    not found on ``str`` are looked up on the wrapped object.
    """

    def __new__(cls, markdown_result):
        """Create the string from ``markdown_result.raw_markdown``."""
        return super().__new__(cls, markdown_result.raw_markdown)

    def __init__(self, markdown_result):
        """Keep a reference to the wrapped result for attribute delegation."""
        self._markdown_result = markdown_result

    def __getattr__(self, name):
        """
        Delegate unknown attributes to the wrapped result.

        Raises:
            AttributeError: If the wrapped result is not set (instance made
                without ``__init__``) or does not have ``name``.
        """
        # Read the instance dict directly: going through
        # ``self._markdown_result`` would re-enter __getattr__ and recurse
        # without bound when the attribute has not been set.
        try:
            wrapped = self.__dict__["_markdown_result"]
        except KeyError:
            raise AttributeError(
                f"{type(self).__name__} object has no attribute '{name}'"
            ) from None
        return getattr(wrapped, name)
|
||
|
||
# Type variable for result types; bound to CrawlResult.
CrawlResultT = TypeVar("CrawlResultT", bound=CrawlResult)
|
||
|
||
class CrawlResultContainer(Generic[CrawlResultT]):
    """
    List-like wrapper around one or more crawl results.

    Supports iteration, indexing and ``len``. Any other attribute is
    forwarded to the first contained result, so a container can be used
    where a single result is expected.
    """

    def __init__(self, results: Union[CrawlResultT, List[CrawlResultT]]):
        """
        Args:
            results: A single result or a list of results. A list is stored
                as-is (not copied); a single result is wrapped in a list.
        """
        self._results = results if isinstance(results, list) else [results]

    def __iter__(self):
        return iter(self._results)

    def __getitem__(self, index):
        return self._results[index]

    def __len__(self):
        return len(self._results)

    def __getattr__(self, attr):
        """
        Delegate attribute access to the first element.

        Raises:
            AttributeError: For special (dunder) names, when the container
                is empty or not yet initialised, or when the first element
                lacks ``attr``.
        """
        # Special-method probes made by copy/pickle (``__setstate__``,
        # ``__getstate__`` ...) must not be answered by the first result.
        if attr.startswith("__") and attr.endswith("__"):
            raise AttributeError(
                f"{self.__class__.__name__} object has no attribute '{attr}'"
            )
        # Read the instance dict directly: ``self._results`` would re-enter
        # __getattr__ and recurse without bound on an instance created
        # without __init__ (e.g. while being copied or unpickled).
        results = self.__dict__.get("_results")
        if results:
            return getattr(results[0], attr)
        raise AttributeError(f"{self.__class__.__name__} object has no attribute '{attr}'")

    def __repr__(self):
        return f"{self.__class__.__name__}({self._results!r})"
|
||
|
||
# Alias covering both shapes: a container of results, or an async
# generator that yields results.
RunManyReturn = Union[CrawlResultContainer[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
|
||
|
||
|
||
# END of backward compatibility code for markdown/markdown_v2.
|
||
# When removing this code in the future, make sure to:
|
||
# 1. Replace the private attribute and property with a standard field
|
||
# 2. Update any serialization logic that might depend on the current behavior
|
||
|
||
class AsyncCrawlResponse(BaseModel):
    """
    Response model for a crawl: the page HTML, response headers and status
    code, plus optional captured artifacts.
    """

    html: str
    response_headers: Dict[str, str]
    js_execution_result: Optional[Dict[str, Any]] = None
    status_code: int
    screenshot: Optional[str] = None
    pdf_data: Optional[bytes] = None
    mhtml_data: Optional[str] = None
    # Optional awaitable callback: takes an optional float, resolves to a str.
    get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None
    downloaded_files: Optional[List[str]] = None
    ssl_certificate: Optional[SSLCertificate] = None
    redirected_url: Optional[str] = None
    network_requests: Optional[List[Dict[str, Any]]] = None
    console_messages: Optional[List[Dict[str, Any]]] = None

    class Config:
        arbitrary_types_allowed = True
|
||
|
||
###############################
|
||
# Scraping Models
|
||
###############################
|
||
class MediaItem(BaseModel):
    """One media entry; ``type`` defaults to ``"image"``."""

    src: Optional[str] = ""
    data: Optional[str] = ""
    alt: Optional[str] = ""
    desc: Optional[str] = ""
    score: Optional[int] = 0
    type: str = "image"
    group_id: Optional[int] = 0
    format: Optional[str] = None
    width: Optional[int] = None
|
||
|
||
|
||
class Link(BaseModel):
    """One link entry; every field is optional and defaults to an empty string."""

    href: Optional[str] = ""
    text: Optional[str] = ""
    title: Optional[str] = ""
    base_domain: Optional[str] = ""
|
||
|
||
|
||
class Media(BaseModel):
    """Media collections grouped by kind; every list defaults to empty."""

    images: List[MediaItem] = []
    # Using MediaItem model for now, can be extended with Video model if needed
    videos: List[MediaItem] = []
    # Using MediaItem model for now, can be extended with Audio model if needed
    audios: List[MediaItem] = []
    # Table data extracted from HTML tables
    tables: List[Dict] = []
|
||
|
||
|
||
class Links(BaseModel):
    """Links split into ``internal`` and ``external`` lists, both empty by default."""

    internal: List[Link] = []
    external: List[Link] = []
|
||
|
||
|
||
class ScrapingResult(BaseModel):
    """Scraping output: cleaned HTML, a success flag, and the media, links and metadata gathered."""

    cleaned_html: str
    success: bool
    media: Media = Media()
    links: Links = Links()
    metadata: Dict[str, Any] = {}
|
||
|
||
```
|
||
|
||
|
||
## File: crawl4ai/content_filter_strategy.py
|
||
|
||
```py
|
||
import inspect
|
||
import re
|
||
import time
|
||
from bs4 import BeautifulSoup, Tag
|
||
from typing import List, Tuple, Dict, Optional
|
||
from rank_bm25 import BM25Okapi
|
||
from collections import deque
|
||
from bs4 import NavigableString, Comment
|
||
|
||
from .utils import (
|
||
clean_tokens,
|
||
perform_completion_with_backoff,
|
||
escape_json_string,
|
||
sanitize_html,
|
||
get_home_folder,
|
||
extract_xml_data,
|
||
merge_chunks,
|
||
)
|
||
from .types import LLMConfig
|
||
from .config import DEFAULT_PROVIDER, OVERLAP_RATE, WORD_TOKEN_RATE
|
||
from abc import ABC, abstractmethod
|
||
import math
|
||
from snowballstemmer import stemmer
|
||
from .models import TokenUsage
|
||
from .prompts import PROMPT_FILTER_CONTENT
|
||
import json
|
||
import hashlib
|
||
from pathlib import Path
|
||
from concurrent.futures import ThreadPoolExecutor
|
||
from .async_logger import AsyncLogger, LogLevel
|
||
from colorama import Fore, Style
|
||
|
||
|
||
class RelevantContentFilter(ABC):
    """
    Abstract base class for content filtering strategies.

    Holds the shared tag/pattern configuration and provides helpers for
    deriving a query from page metadata, splitting a document into text
    chunks, and re-rendering elements as cleaned HTML.
    """

    def __init__(
        self,
        user_query: str = None,
        verbose: bool = False,
        logger: Optional[AsyncLogger] = None,
    ):
        """
        Initializes the RelevantContentFilter class with optional user query.

        Args:
            user_query (str): User query for filtering (optional).
            verbose (bool): Enable verbose logging (default: False).
            logger (Optional[AsyncLogger]): Logger stored on the instance (optional).
        """
        self.user_query = user_query
        self.included_tags = {
            # Primary structure
            "article",
            "main",
            "section",
            "div",
            # List structures
            "ul",
            "ol",
            "li",
            "dl",
            "dt",
            "dd",
            # Text content
            "p",
            "span",
            "blockquote",
            "pre",
            "code",
            # Headers
            "h1",
            "h2",
            "h3",
            "h4",
            "h5",
            "h6",
            # Tables
            "table",
            "thead",
            "tbody",
            "tr",
            "td",
            "th",
            # Other semantic elements
            "figure",
            "figcaption",
            "details",
            "summary",
            # Text formatting
            "em",
            "strong",
            "b",
            "i",
            "mark",
            "small",
            # Rich content
            "time",
            "address",
            "cite",
            "q",
        }
        self.excluded_tags = {
            "nav",
            "footer",
            "header",
            "aside",
            "script",
            "style",
            "form",
            "iframe",
            "noscript",
        }
        self.header_tags = {"h1", "h2", "h3", "h4", "h5", "h6"}
        self.negative_patterns = re.compile(
            r"nav|footer|header|sidebar|ads|comment|promo|advert|social|share", re.I
        )
        self.min_word_count = 2
        # Honour the caller's choice; this used to be hard-coded to False,
        # silently discarding the ``verbose`` argument.
        self.verbose = verbose
        self.logger = logger

    @abstractmethod
    def filter_content(self, html: str) -> List[str]:
        """Abstract method to be implemented by specific filtering strategies"""
        pass

    def extract_page_query(self, soup: BeautifulSoup, body: Tag) -> str:
        """
        Common method to extract page metadata with fallbacks.

        Returns ``user_query`` when one is set. Otherwise the query is built,
        space-joined, from the <title> text, the first <h1>, and the
        ``keywords`` / ``description`` meta tags. When neither meta tag has
        content, the first 150 characters of the first paragraph longer than
        150 characters are appended.

        Args:
            soup (BeautifulSoup): Parsed document.
            body (Tag): Body element, searched for the fallback paragraph.

        Returns:
            str: The query text (may be empty).
        """
        if self.user_query:
            return self.user_query

        query_parts = []

        # Title; a document without a <title> element yields no title part.
        try:
            title = soup.title.string
        except AttributeError:
            title = None
        if title:
            query_parts.append(title)

        first_h1 = soup.find("h1")
        if first_h1:
            query_parts.append(first_h1.get_text())

        # Meta tags
        meta_text = ""
        for meta_name in ("keywords", "description"):
            meta = soup.find("meta", attrs={"name": meta_name})
            content = meta.get("content") if meta else None
            if content:
                query_parts.append(content)
                meta_text += content

        # If the meta tags gave nothing, grab the first significant paragraph
        if not meta_text:
            # Find the first <p> whose text is longer than 150 characters
            for paragraph in body.find_all("p"):
                paragraph_text = paragraph.get_text()
                if len(paragraph_text) > 150:
                    query_parts.append(paragraph_text[:150])
                    break

        return " ".join(filter(None, query_parts))

    def extract_text_chunks(
        self, body: Tag, min_word_threshold: int = None
    ) -> List[Tuple[int, str, str, Tag]]:
        """
        Extracts text chunks from a BeautifulSoup body element while preserving order.

        Text is accumulated across inline elements and flushed into a chunk
        whenever a non-inline element closes. Whitespace inside a chunk is
        collapsed to single spaces.

        Args:
            body: BeautifulSoup Tag object representing the body element
            min_word_threshold: When truthy, chunks with fewer words are dropped.

        Returns:
            List of ``(index, text, tag_type, element)`` tuples, where
            ``tag_type`` is ``"header"`` or ``"content"`` and ``element`` is
            the element whose end flushed the chunk (``body`` for trailing text).
        """
        # Tags to ignore - inline elements that shouldn't break text flow
        INLINE_TAGS = {
            "a",
            "abbr",
            "acronym",
            "b",
            "bdo",
            "big",
            "br",
            "button",
            "cite",
            "code",
            "dfn",
            "em",
            "i",
            "img",
            "input",
            "kbd",
            "label",
            "map",
            "object",
            "q",
            "samp",
            "script",
            "select",
            "small",
            "span",
            "strong",
            "sub",
            "sup",
            "textarea",
            "time",
            "tt",
            "var",
        }

        # Tags that typically contain meaningful headers
        HEADER_TAGS = {"h1", "h2", "h3", "h4", "h5", "h6", "header"}

        chunks = []
        current_text = []
        chunk_index = 0

        # Use deque for efficient push/pop operations
        stack = deque([(body, False)])

        while stack:
            element, visited = stack.pop()

            if visited:
                # End of an element: flush accumulated text unless the
                # element is inline. (The flush only ever happens with
                # pending text, so no separate empty-<p> case is needed.)
                if current_text and element.name not in INLINE_TAGS:
                    text = " ".join("".join(current_text).split())
                    if text:
                        tag_type = (
                            "header" if element.name in HEADER_TAGS else "content"
                        )
                        chunks.append((chunk_index, text, tag_type, element))
                        chunk_index += 1
                    current_text = []
                continue

            if isinstance(element, NavigableString):
                raw = str(element)
                if raw.strip():
                    # Keep the surrounding whitespace: stripping each
                    # fragment glued words together across inline tags
                    # ("Hello <b>world</b>" became "Helloworld").
                    current_text.append(raw)
                elif current_text:
                    # Whitespace-only node between inline siblings still
                    # marks a word boundary.
                    current_text.append(" ")
                continue

            # Pre-allocate children to avoid multiple list operations
            children = list(element.children)
            if not children:
                continue

            # Mark block for revisit after processing children
            stack.append((element, True))

            # Add children in reverse order for correct processing
            for child in reversed(children):
                if isinstance(child, (Tag, NavigableString)):
                    stack.append((child, False))

        # Handle any remaining text
        if current_text:
            text = " ".join("".join(current_text).split())
            if text:
                chunks.append((chunk_index, text, "content", body))

        if min_word_threshold:
            chunks = [
                chunk for chunk in chunks if len(chunk[1].split()) >= min_word_threshold
            ]

        return chunks

    def _deprecated_extract_text_chunks(
        self, soup: BeautifulSoup
    ) -> List[Tuple[int, str, Tag]]:
        """
        Common method for extracting text chunks (older implementation).

        Walks the tree depth-first and, for every included and non-excluded
        tag, collects the text of its direct string children. Headers need at
        least 3 words; other tags need ``min_word_count`` words.

        Returns:
            List of ``(index, text, element)`` tuples in document order.
        """

        def direct_text(element: Tag) -> str:
            # Only the element's own string children, not nested tags.
            pieces = []
            for content in element.contents:
                if isinstance(content, str):
                    piece = content.strip()
                    if piece:
                        pieces.append(piece)
            return " ".join(pieces)

        candidates = []
        index = 0

        def dfs(element):
            nonlocal index
            if not isinstance(element, Tag):
                return

            if element.name in self.included_tags and not self.is_excluded(element):
                text = direct_text(element)
                word_count = len(text.split())

                if element.name in self.header_tags:
                    # Headers pass through with adjusted minimum
                    accepted = word_count >= 3
                else:
                    # Regular content uses standard minimum
                    accepted = word_count >= self.min_word_count

                if accepted:
                    candidates.append((index, text, element))
                    index += 1

            for child in element.children:
                dfs(child)

        dfs(soup.body if soup.body else soup)
        return candidates

    def is_excluded(self, tag: Tag) -> bool:
        """
        Common method for exclusion logic.

        A tag is excluded when its name is in ``excluded_tags`` or when its
        class/id text matches ``negative_patterns``.
        """
        if tag.name in self.excluded_tags:
            return True
        class_id = " ".join(
            filter(None, [" ".join(tag.get("class", [])), tag.get("id", "")])
        )
        return bool(self.negative_patterns.search(class_id))

    def clean_element(self, tag: Tag) -> str:
        """
        Common method for cleaning HTML elements with minimal overhead.

        Re-renders ``tag`` while dropping unwanted tags (with their content)
        and unwanted attributes. Text nodes are stripped of surrounding
        whitespace.

        Args:
            tag (Tag): Element to render.

        Returns:
            str: The cleaned markup, ``""`` when ``tag`` is not a Tag, or
            ``str(tag)`` if rendering fails.
        """
        if not tag or not isinstance(tag, Tag):
            return ""

        unwanted_tags = {"script", "style", "aside", "form", "iframe", "noscript"}
        unwanted_attrs = {
            "style",
            "onclick",
            "onmouseover",
            "align",
            "bgcolor",
            "class",
            "id",
        }

        # Use string builder pattern for better performance
        builder = []

        def render_tag(elem):
            if not isinstance(elem, Tag):
                if isinstance(elem, str):
                    builder.append(elem.strip())
                return

            if elem.name in unwanted_tags:
                return

            # Start tag
            builder.append(f"<{elem.name}")

            # Add cleaned attributes
            for key, value in elem.attrs.items():
                if key in unwanted_attrs:
                    continue
                if isinstance(value, (list, tuple)):
                    # Multi-valued attributes (e.g. rel) arrive as lists;
                    # emit them space-separated instead of as a Python repr.
                    value = " ".join(value)
                builder.append(f' {key}="{value}"')

            builder.append(">")

            # Process children
            for child in elem.children:
                render_tag(child)

            # Close tag
            builder.append(f"</{elem.name}>")

        try:
            render_tag(tag)
            return "".join(builder)
        except Exception:
            return str(tag)  # Fallback to original if anything fails
|
||
|
||
|
||
class BM25ContentFilter(RelevantContentFilter):
    """
    Content filtering using BM25 algorithm with priority tag handling.

    How it works:
    1. Extracts page metadata with fallbacks.
    2. Extracts text chunks from the body element.
    3. Tokenizes and stems the corpus and query, then cleans the tokens.
    4. Applies BM25 algorithm to calculate scores for each chunk.
    5. Multiplies each score by the weight of the chunk's tag.
    6. Filters out chunks below the threshold.
    7. Returns the surviving chunks, as cleaned HTML, in document order.

    Attributes:
        user_query (str): User query for filtering (optional).
        bm25_threshold (float): BM25 threshold for filtering (default: 1.0).
        priority_tags (dict): Score multipliers keyed by tag name.
        stemmer: Stemmer built for the configured language.

    Methods:
        filter_content(self, html: str, min_word_threshold: int = None)
    """

    def __init__(
        self,
        user_query: str = None,
        bm25_threshold: float = 1.0,
        language: str = "english",
    ):
        """
        Initializes the BM25ContentFilter class, if not provided, falls back to page metadata.

        Note:
            If no query is given and no page metadata is available, then it tries to pick up the first significant paragraph.

        Args:
            user_query (str): User query for filtering (optional).
            bm25_threshold (float): BM25 threshold for filtering (default: 1.0).
            language (str): Language for stemming (default: 'english').
        """
        super().__init__(user_query=user_query)
        self.bm25_threshold = bm25_threshold
        self.priority_tags = {
            "h1": 5.0,
            "h2": 4.0,
            "h3": 3.0,
            "title": 4.0,
            "strong": 2.0,
            "b": 1.5,
            "em": 1.5,
            "blockquote": 2.0,
            "code": 2.0,
            "pre": 1.5,
            "th": 1.5,  # Table headers
        }
        self.stemmer = stemmer(language)

    def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]:
        """
        Implements content filtering using BM25 algorithm with priority tag handling.

        Args:
            html (str): HTML content to be filtered.
            min_word_threshold (int): Minimum word threshold for filtering (optional).

        Returns:
            List[str]: Cleaned HTML of the chunks whose weighted score reaches
            ``bm25_threshold``, in document order. Empty when the input is
            not a non-empty string, or when no query, no chunks or no
            qualifying chunks are found.
        """
        if not html or not isinstance(html, str):
            return []

        soup = BeautifulSoup(html, "lxml")

        if not soup.body:
            # Wrap in body tag if missing
            soup = BeautifulSoup(f"<body>{html}</body>", "lxml")
        body = soup.find("body")

        query = self.extract_page_query(soup, body)
        if not query:
            return []

        candidates = self.extract_text_chunks(body, min_word_threshold)
        if not candidates:
            return []

        def stem_words(text):
            # Lower-case, split on whitespace, stem each word.
            return [self.stemmer.stemWord(word) for word in text.lower().split()]

        stemmed_corpus = [stem_words(chunk) for _, chunk, _, _ in candidates]
        stemmed_query = stem_words(query)

        # Clean from stop words and noise
        tokenized_corpus = [clean_tokens(tokens) for tokens in stemmed_corpus]
        tokenized_query = clean_tokens(stemmed_query)

        bm25 = BM25Okapi(tokenized_corpus)
        scores = bm25.get_scores(tokenized_query)

        # Weight each score by its tag and keep the ones reaching the threshold
        selected_candidates = []
        for score, (index, chunk, _tag_type, tag) in zip(scores, candidates):
            weighted_score = score * self.priority_tags.get(tag.name, 1.0)
            if weighted_score >= self.bm25_threshold:
                selected_candidates.append((index, chunk, tag))

        if not selected_candidates:
            return []

        # Sort selected candidates by original document order
        selected_candidates.sort(key=lambda candidate: candidate[0])

        return [self.clean_element(tag) for _, _, tag in selected_candidates]
|
||
|
||
|
||
class PruningContentFilter(RelevantContentFilter):
|
||
"""
|
||
Content filtering using pruning algorithm with dynamic threshold.
|
||
|
||
How it works:
|
||
1. Extracts page metadata with fallbacks.
|
||
2. Extracts text chunks from the body element.
|
||
3. Applies pruning algorithm to calculate scores for each chunk.
|
||
4. Filters out chunks below the threshold.
|
||
5. Sorts chunks by score in descending order.
|
||
6. Returns the top N chunks.
|
||
|
||
Attributes:
|
||
user_query (str): User query for filtering (optional), if not provided, falls back to page metadata.
|
||
min_word_threshold (int): Minimum word threshold for filtering (optional).
|
||
threshold_type (str): Threshold type for dynamic threshold (default: 'fixed').
|
||
threshold (float): Fixed threshold value (default: 0.48).
|
||
|
||
Methods:
|
||
filter_content(self, html: str, min_word_threshold: int = None):
|
||
"""
|
||
|
||
def __init__(
    self,
    user_query: str = None,
    min_word_threshold: int = None,
    threshold_type: str = "fixed",
    threshold: float = 0.48,
):
    """
    Initializes the PruningContentFilter class.

    Args:
        user_query (str): User query (optional). Stored on the instance
            through the base initializer.
        min_word_threshold (int): Minimum word threshold for filtering (optional).
        threshold_type (str): ``"fixed"`` compares scores against
            ``threshold`` directly; any other value enables the per-node
            dynamic adjustment (default: 'fixed').
        threshold (float): Base threshold value (default: 0.48).
    """
    # Forward the query instead of discarding it; the base class previously
    # always received None, so ``self.user_query`` ignored the argument.
    super().__init__(user_query=user_query)
    self.min_word_threshold = min_word_threshold
    self.threshold_type = threshold_type
    self.threshold = threshold

    # Add tag importance for dynamic threshold
    self.tag_importance = {
        "article": 1.5,
        "main": 1.4,
        "section": 1.3,
        "p": 1.2,
        "h1": 1.4,
        "h2": 1.3,
        "h3": 1.2,
        "div": 0.7,
        "span": 0.6,
    }

    # Metric configuration
    self.metric_config = {
        "text_density": True,
        "link_density": True,
        "tag_weight": True,
        "class_id_weight": True,
        "text_length": True,
    }

    self.metric_weights = {
        "text_density": 0.4,
        "link_density": 0.2,
        "tag_weight": 0.2,
        "class_id_weight": 0.1,
        "text_length": 0.1,
    }

    self.tag_weights = {
        "div": 0.5,
        "p": 1.0,
        "article": 1.5,
        "section": 1.0,
        "span": 0.3,
        "li": 0.5,
        "ul": 0.5,
        "ol": 0.5,
        "h1": 1.2,
        "h2": 1.1,
        "h3": 1.0,
        "h4": 0.9,
        "h5": 0.8,
        "h6": 0.7,
    }
|
||
|
||
def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]:
    """
    Implements content filtering using pruning algorithm with dynamic threshold.

    The document is parsed, comments and excluded tags are removed, the tree
    is pruned starting at <body>, and the surviving direct children of
    <body> that still contain text are returned as HTML strings.

    Args:
        html (str): HTML content to be filtered.
        min_word_threshold (int): Accepted for interface compatibility; it
            is not read in this method body.

    Returns:
        List[str]: HTML strings of the remaining top-level blocks; empty
        when ``html`` is not a non-empty string.
    """
    if not isinstance(html, str) or not html:
        return []

    soup = BeautifulSoup(html, "lxml")
    if not soup.body:
        soup = BeautifulSoup(f"<body>{html}</body>", "lxml")

    # Remove comments and unwanted tags
    self._remove_comments(soup)
    self._remove_unwanted_tags(soup)

    # Prune tree starting from body
    body = soup.find("body")
    self._prune_tree(body)

    # Extract remaining content as list of HTML strings, skipping bare
    # strings and anything left without text.
    return [
        str(child)
        for child in body.children
        if not isinstance(child, str)
        and hasattr(child, "name")
        and len(child.get_text(strip=True)) > 0
    ]
|
||
|
||
def _remove_comments(self, soup):
    """
    Removes HTML comments.

    Every string node that is a ``Comment`` is extracted from the tree in
    place.

    Args:
        soup: Parsed document to strip comments from.
    """
    # ``string=`` is the current name of the filter that used to be spelled
    # ``text=``; the old spelling is deprecated in Beautiful Soup.
    for comment in soup.find_all(string=lambda node: isinstance(node, Comment)):
        comment.extract()
|
||
|
||
def _remove_unwanted_tags(self, soup):
    """Removes unwanted tags: every element named in ``excluded_tags`` is decomposed."""
    for tag_name in self.excluded_tags:
        unwanted = soup.find_all(tag_name)
        for node in unwanted:
            node.decompose()
|
||
|
||
def _prune_tree(self, node):
|
||
"""
|
||
Prunes the tree starting from the given node.
|
||
|
||
Args:
|
||
node (Tag): The node from which the pruning starts.
|
||
"""
|
||
if not node or not hasattr(node, "name") or node.name is None:
|
||
return
|
||
|
||
text_len = len(node.get_text(strip=True))
|
||
tag_len = len(node.encode_contents().decode("utf-8"))
|
||
link_text_len = sum(
|
||
len(s.strip())
|
||
for s in (a.string for a in node.find_all("a", recursive=False))
|
||
if s
|
||
)
|
||
|
||
metrics = {
|
||
"node": node,
|
||
"tag_name": node.name,
|
||
"text_len": text_len,
|
||
"tag_len": tag_len,
|
||
"link_text_len": link_text_len,
|
||
}
|
||
|
||
score = self._compute_composite_score(metrics, text_len, tag_len, link_text_len)
|
||
|
||
if self.threshold_type == "fixed":
|
||
should_remove = score < self.threshold
|
||
else: # dynamic
|
||
tag_importance = self.tag_importance.get(node.name, 0.7)
|
||
text_ratio = text_len / tag_len if tag_len > 0 else 0
|
||
link_ratio = link_text_len / text_len if text_len > 0 else 1
|
||
|
||
threshold = self.threshold # base threshold
|
||
if tag_importance > 1:
|
||
threshold *= 0.8
|
||
if text_ratio > 0.4:
|
||
threshold *= 0.9
|
||
if link_ratio > 0.6:
|
||
threshold *= 1.2
|
||
|
||
should_remove = score < threshold
|
||
|
||
if should_remove:
|
||
node.decompose()
|
||
else:
|
||
children = [child for child in node.children if hasattr(child, "name")]
|
||
for child in children:
|
||
self._prune_tree(child)
|
||
|
||
def _compute_composite_score(self, metrics, text_len, tag_len, link_text_len):
|
||
"""Computes the composite score"""
|
||
if self.min_word_threshold:
|
||
# Get raw text from metrics node - avoid extra processing
|
||
text = metrics["node"].get_text(strip=True)
|
||
word_count = text.count(" ") + 1
|
||
if word_count < self.min_word_threshold:
|
||
return -1.0 # Guaranteed removal
|
||
score = 0.0
|
||
total_weight = 0.0
|
||
|
||
if self.metric_config["text_density"]:
|
||
density = text_len / tag_len if tag_len > 0 else 0
|
||
score += self.metric_weights["text_density"] * density
|
||
total_weight += self.metric_weights["text_density"]
|
||
|
||
if self.metric_config["link_density"]:
|
||
density = 1 - (link_text_len / text_len if text_len > 0 else 0)
|
||
score += self.metric_weights["link_density"] * density
|
||
total_weight += self.metric_weights["link_density"]
|
||
|
||
if self.metric_config["tag_weight"]:
|
||
tag_score = self.tag_weights.get(metrics["tag_name"], 0.5)
|
||
score += self.metric_weights["tag_weight"] * tag_score
|
||
total_weight += self.metric_weights["tag_weight"]
|
||
|
||
if self.metric_config["class_id_weight"]:
|
||
class_score = self._compute_class_id_weight(metrics["node"])
|
||
score += self.metric_weights["class_id_weight"] * max(0, class_score)
|
||
total_weight += self.metric_weights["class_id_weight"]
|
||
|
||
if self.metric_config["text_length"]:
|
||
score += self.metric_weights["text_length"] * math.log(text_len + 1)
|
||
total_weight += self.metric_weights["text_length"]
|
||
|
||
return score / total_weight if total_weight > 0 else 0
|
||
|
||
def _compute_class_id_weight(self, node):
|
||
"""Computes the class ID weight"""
|
||
class_id_score = 0
|
||
if "class" in node.attrs:
|
||
classes = " ".join(node["class"])
|
||
if self.negative_patterns.match(classes):
|
||
class_id_score -= 0.5
|
||
if "id" in node.attrs:
|
||
element_id = node["id"]
|
||
if self.negative_patterns.match(element_id):
|
||
class_id_score -= 0.5
|
||
return class_id_score
|
||
|
||
|
||
class LLMContentFilter(RelevantContentFilter):
    """Content filtering using LLMs to generate relevant markdown.

    How it works:
    1. Splits the incoming HTML into chunks (with overlap) via ``merge_chunks``.
    2. Builds one prompt per chunk from ``PROMPT_FILTER_CONTENT`` and the instruction.
    3. Sends the prompts to the configured LLM on a pool of 4 worker threads.
    4. Collects the ``content`` field of every response, in chunk order.
    5. Records token usage and caches the blocks on disk, keyed by HTML + instruction.

    Attributes:
        llm_config (LLMConfig): LLM configuration object.
        instruction (str): Instruction for LLM markdown generation
        chunk_token_threshold (int): Chunk token threshold for splitting (default: 1e9).
        overlap_rate (float): Overlap rate for chunking (default: OVERLAP_RATE).
        word_token_rate (float): Word token rate for chunking (default: WORD_TOKEN_RATE).
        verbose (bool): Enable verbose logging (default: False).
        logger (AsyncLogger): Custom logger for LLM operations (optional).
        ignore_cache (bool): Skip reading the on-disk cache (default: True).
        usages (list): One TokenUsage entry per LLM response or cache hit.
        total_usage (TokenUsage): Running total over ``usages``.
    """

    # Deprecated constructor arguments and the hint shown when they are set.
    _UNWANTED_PROPS = {
        'provider' : 'Instead, use llm_config=LLMConfig(provider="...")',
        'api_token' : 'Instead, use llm_config=LLMConfig(api_token="...")',
        'base_url' : 'Instead, use llm_config=LLMConfig(base_url="...")',
        'api_base' : 'Instead, use llm_config=LLMConfig(base_url="...")',
    }

    def __init__(
        self,
        llm_config: "LLMConfig" = None,
        instruction: str = None,
        chunk_token_threshold: int = int(1e9),
        overlap_rate: float = OVERLAP_RATE,
        word_token_rate: float = WORD_TOKEN_RATE,
        # char_token_rate: float = WORD_TOKEN_RATE * 5,
        # chunk_mode: str = "char",
        verbose: bool = False,
        logger: Optional[AsyncLogger] = None,
        ignore_cache: bool = True,
        # Deprecated properties
        provider: str = DEFAULT_PROVIDER,
        api_token: Optional[str] = None,
        base_url: Optional[str] = None,
        api_base: Optional[str] = None,
        extra_args: Dict = None,
    ):
        super().__init__(None)
        # Assigning a non-default value to a deprecated property raises
        # AttributeError through __setattr__ below.
        self.provider = provider
        self.api_token = api_token
        self.base_url = base_url or api_base
        self.llm_config = llm_config
        self.instruction = instruction
        self.chunk_token_threshold = chunk_token_threshold
        self.overlap_rate = overlap_rate
        self.word_token_rate = word_token_rate or WORD_TOKEN_RATE
        # self.chunk_mode: str = chunk_mode
        # self.char_token_rate = char_token_rate or word_token_rate / 5
        # self.token_rate = word_token_rate if chunk_mode == "word" else self.char_token_rate
        self.token_rate = word_token_rate or WORD_TOKEN_RATE
        self.extra_args = extra_args or {}
        self.ignore_cache = ignore_cache
        self.verbose = verbose

        # Setup logger with custom styling for LLM operations
        if logger:
            self.logger = logger
        elif verbose:
            self.logger = AsyncLogger(
                verbose=verbose,
                icons={
                    **AsyncLogger.DEFAULT_ICONS,
                    "LLM": "★",  # Star for LLM operations
                    "CHUNK": "◈",  # Diamond for chunks
                    "CACHE": "⚡",  # Lightning for cache operations
                },
                colors={
                    **AsyncLogger.DEFAULT_COLORS,
                    LogLevel.INFO: Fore.MAGENTA
                    + Style.DIM,  # Dimmed purple for LLM ops
                },
            )
        else:
            self.logger = None

        self.usages = []
        self.total_usage = TokenUsage()

    def __setattr__(self, name, value):
        """Reject non-default values for deprecated properties; set everything else.

        Raises:
            AttributeError: If ``name`` is a deprecated property and ``value``
                is not the default declared in ``__init__``.
        """
        # TODO: Planning to set properties dynamically based on the __init__ signature
        if name in self._UNWANTED_PROPS:
            # Inspect the signature only for the few deprecated names instead
            # of on every attribute assignment.
            default = inspect.signature(self.__init__).parameters[name].default
            if value is not default:
                raise AttributeError(f"Setting '{name}' is deprecated. {self._UNWANTED_PROPS[name]}")

        super().__setattr__(name, value)

    def _get_cache_key(self, html: str, instruction: str) -> str:
        """Generate a unique cache key based on HTML and instruction"""
        content = f"{html}{instruction}"
        # MD5 is used as a cache file name here, not for security.
        return hashlib.md5(content.encode()).hexdigest()

    def _merge_chunks(self, text: str) -> List[str]:
        """Split text into overlapping chunks sized by ``chunk_token_threshold``."""
        ov = int(self.chunk_token_threshold * self.overlap_rate)
        sections = merge_chunks(
            docs=[text],
            target_size=self.chunk_token_threshold,
            overlap=ov,
            word_token_ratio=self.word_token_rate,
        )
        return sections

    @staticmethod
    def _accumulate_usage(target, usage) -> None:
        """Add the token counters of ``usage`` onto ``target`` in place."""
        target.completion_tokens += usage.completion_tokens
        target.prompt_tokens += usage.prompt_tokens
        target.total_tokens += usage.total_tokens

    def filter_content(self, html: str, ignore_cache: bool = True) -> List[str]:
        """Convert ``html`` into markdown blocks with the configured LLM.

        Args:
            html (str): HTML content to be filtered.
            ignore_cache (bool): Currently overridden by ``self.ignore_cache``;
                kept for interface compatibility.

        Returns:
            List[str]: One entry per chunk that produced content, in chunk
            order. Empty when ``html`` is empty or not a string, or when no
            chunk produced content.
        """
        if not html or not isinstance(html, str):
            return []

        if self.logger:
            self.logger.info(
                "Starting LLM markdown content filtering process",
                tag="LLM",
                params={"provider": self.llm_config.provider},
                colors={"provider": Fore.CYAN},
            )

        # Cache handling
        cache_dir = Path(get_home_folder()) / "llm_cache" / "content_filter"
        cache_dir.mkdir(parents=True, exist_ok=True)
        cache_key = self._get_cache_key(html, self.instruction or "")
        cache_file = cache_dir / f"{cache_key}.json"

        # if ignore_cache == None:
        # NOTE(review): the argument is always replaced by the instance
        # setting; preserved as-is because callers may rely on it.
        ignore_cache = self.ignore_cache

        if not ignore_cache and cache_file.exists():
            if self.logger:
                self.logger.info("Found cached markdown result", tag="CACHE")
            try:
                with cache_file.open("r") as f:
                    cached_data = json.load(f)
                    usage = TokenUsage(**cached_data["usage"])
                    self.usages.append(usage)
                    self._accumulate_usage(self.total_usage, usage)
                    return cached_data["blocks"]
            except Exception as e:
                # A broken cache entry is not fatal: fall through and recompute.
                if self.logger:
                    self.logger.error(
                        f"LLM markdown: Cache read error: {str(e)}", tag="CACHE"
                    )

        # Split into chunks
        html_chunks = self._merge_chunks(html)
        if self.logger:
            self.logger.info(
                "LLM markdown: Split content into {chunk_count} chunks",
                tag="CHUNK",
                params={"chunk_count": len(html_chunks)},
                colors={"chunk_count": Fore.YELLOW},
            )

        start_time = time.time()

        def _proceed_with_chunk(
            chunk_num: int,
            provider: str,
            prompt: str,
            api_token: str,
            base_url: Optional[str] = None,
            extra_args: Optional[Dict] = None,
        ):
            # Defined once and given its chunk number explicitly, so the
            # worker thread never reads the enclosing loop variable late.
            if self.logger:
                self.logger.info(
                    "LLM Markdown: Processing chunk {chunk_num}",
                    tag="CHUNK",
                    params={"chunk_num": chunk_num},
                )
            return perform_completion_with_backoff(
                provider,
                prompt,
                api_token,
                base_url=base_url,
                extra_args=extra_args,
            )

        request = (
            self.instruction
            or "Convert this HTML into clean, relevant markdown, removing any noise or irrelevant content."
        )

        ordered_results = []
        had_errors = False
        call_usage = TokenUsage()  # usage of this call only; this is what gets cached

        # Process chunks in parallel
        with ThreadPoolExecutor(max_workers=4) as executor:
            futures = []
            for i, chunk in enumerate(html_chunks):
                if self.logger:
                    self.logger.debug(
                        "LLM markdown: Processing chunk {chunk_num}/{total_chunks}",
                        tag="CHUNK",
                        params={"chunk_num": i + 1, "total_chunks": len(html_chunks)},
                    )

                prompt_variables = {
                    "HTML": escape_json_string(sanitize_html(chunk)),
                    "REQUEST": request,
                }

                prompt = PROMPT_FILTER_CONTENT
                for var, value in prompt_variables.items():
                    prompt = prompt.replace("{" + var + "}", value)

                future = executor.submit(
                    _proceed_with_chunk,
                    i + 1,
                    self.llm_config.provider,
                    prompt,
                    self.llm_config.api_token,
                    self.llm_config.base_url,
                    self.extra_args,
                )
                futures.append((i, future))

            # Collect results in order (futures were submitted in chunk order)
            for i, future in futures:
                try:
                    response = future.result()

                    # Track usage
                    usage = TokenUsage(
                        completion_tokens=response.usage.completion_tokens,
                        prompt_tokens=response.usage.prompt_tokens,
                        total_tokens=response.usage.total_tokens,
                        completion_tokens_details=(
                            response.usage.completion_tokens_details.__dict__
                            if response.usage.completion_tokens_details
                            else {}
                        ),
                        prompt_tokens_details=(
                            response.usage.prompt_tokens_details.__dict__
                            if response.usage.prompt_tokens_details
                            else {}
                        ),
                    )
                    self.usages.append(usage)
                    self._accumulate_usage(self.total_usage, usage)
                    self._accumulate_usage(call_usage, usage)

                    blocks = extract_xml_data(
                        ["content"], response.choices[0].message.content
                    )["content"]
                    if blocks:
                        ordered_results.append(blocks)
                        if self.logger:
                            self.logger.success(
                                "LLM markdown: Successfully processed chunk {chunk_num}",
                                tag="CHUNK",
                                params={"chunk_num": i + 1},
                            )
                except Exception as e:
                    # Best effort: a failed chunk is logged and skipped.
                    had_errors = True
                    if self.logger:
                        self.logger.error(
                            "LLM markdown: Error processing chunk {chunk_num}: {error}",
                            tag="CHUNK",
                            params={"chunk_num": i + 1, "error": str(e)},
                        )

        end_time = time.time()
        if self.logger:
            self.logger.success(
                "LLM markdown: Completed processing in {time:.2f}s",
                tag="LLM",
                params={"time": end_time - start_time},
                colors={"time": Fore.YELLOW},
            )

        result = ordered_results if ordered_results else []

        # Cache the final result. Incomplete results are not persisted, so a
        # transient LLM failure cannot be served from cache later.
        if had_errors:
            if self.logger:
                self.logger.info(
                    "LLM markdown: Skipping cache write because some chunks failed",
                    tag="CACHE",
                )
            return result

        cache_data = {"blocks": result, "usage": call_usage.__dict__}
        try:
            with cache_file.open("w") as f:
                json.dump(cache_data, f)
        except (OSError, TypeError, ValueError) as e:
            # The result is already computed; a cache failure must not lose it.
            if self.logger:
                self.logger.error(
                    f"LLM markdown: Cache write error: {str(e)}", tag="CACHE"
                )
        else:
            if self.logger:
                self.logger.info("Cached results for future use", tag="CACHE")

        return result

    def show_usage(self) -> None:
        """Print usage statistics"""
        print("\n=== Token Usage Summary ===")
        print(f"{'Type':<15} {'Count':>12}")
        print("-" * 30)
        print(f"{'Completion':<15} {self.total_usage.completion_tokens:>12,}")
        print(f"{'Prompt':<15} {self.total_usage.prompt_tokens:>12,}")
        print(f"{'Total':<15} {self.total_usage.total_tokens:>12,}")

        if self.usages:
            print("\n=== Usage History ===")
            print(f"{'Request #':<10} {'Completion':>12} {'Prompt':>12} {'Total':>12}")
            print("-" * 48)
            for i, usage in enumerate(self.usages, 1):
                print(
                    f"{i:<10} {usage.completion_tokens:>12,} "
                    f"{usage.prompt_tokens:>12,} {usage.total_tokens:>12,}"
                )
|
||
|
||
```
|
||
|
||
|
||
## File: crawl4ai/markdown_generation_strategy.py
|
||
|
||
```py
|
||
from abc import ABC, abstractmethod
|
||
from typing import Optional, Dict, Any, Tuple
|
||
from .models import MarkdownGenerationResult
|
||
from .html2text import CustomHTML2Text
|
||
# from .types import RelevantContentFilter
|
||
from .content_filter_strategy import RelevantContentFilter
|
||
import re
|
||
from urllib.parse import urljoin
|
||
|
||
# Pre-compile the regex pattern
|
||
# Matches markdown links and images: an optional leading "!" (image marker),
# then group 1 = the bracketed text, group 2 = the URL, and group 3 = an
# optional double-quoted title inside the parentheses.
LINK_PATTERN = re.compile(r'!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)')
|
||
|
||
|
||
def fast_urljoin(base: str, url: str) -> str:
    """Fast URL joining for common cases.

    Args:
        base (str): URL the link was found on.
        url (str): Link target as written in the document.

    Returns:
        str: ``url`` unchanged when it is already absolute (``http://``,
        ``https://``, ``mailto:``) or protocol-relative (``//``); a
        root-relative ``url`` resolved against the origin of ``base``;
        anything else joined with ``urllib.parse.urljoin``.
    """
    # Local import: the module only imports ``urljoin`` at the top level.
    from urllib.parse import urlsplit

    if url.startswith(("http://", "https://", "mailto:", "//")):
        return url
    if url.startswith("/"):
        # A root-relative path replaces the whole path of the base, so it
        # must be attached to scheme://host, not appended to the base path.
        try:
            origin = urlsplit(base)
        except ValueError:
            origin = None
        if origin is not None and origin.scheme and origin.netloc:
            return f"{origin.scheme}://{origin.netloc}{url}"
        # Base without scheme/host: keep the plain concatenation.
        if base.endswith("/"):
            return base[:-1] + url
        return base + url
    return urljoin(base, url)
|
||
|
||
|
||
class MarkdownGenerationStrategy(ABC):
    """Abstract base class for markdown generation strategies.

    Subclasses implement ``generate_markdown``; this base only stores the
    shared configuration.

    Attributes:
        content_filter: Optional filter stored for use by subclasses.
        options: Generation options; an empty dict when none are given.
        verbose: Verbosity flag.
        content_source: Name of the HTML variant to convert
            (default ``"cleaned_html"``).
    """

    def __init__(
        self,
        content_filter: Optional[RelevantContentFilter] = None,
        options: Optional[Dict[str, Any]] = None,
        verbose: bool = False,
        content_source: str = "cleaned_html",
    ):
        self.content_filter = content_filter
        self.options = options or {}
        self.verbose = verbose
        self.content_source = content_source

    @abstractmethod
    def generate_markdown(
        self,
        input_html: str,
        base_url: str = "",
        html2text_options: Optional[Dict[str, Any]] = None,
        content_filter: Optional[RelevantContentFilter] = None,
        citations: bool = True,
        **kwargs,
    ) -> MarkdownGenerationResult:
        """Generate markdown from the selected input HTML."""
|
||
|
||
|
||
class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
    """
    Default implementation of markdown generation strategy.

    How it works:
    1. Generate raw markdown from the input HTML.
    2. Convert links to citations.
    3. Generate fit markdown if content filter is provided.
    4. Return MarkdownGenerationResult.

    Args:
        content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown.
        options (Optional[Dict[str, Any]]): Additional options for markdown generation. Defaults to None.
        content_source (str): Source of content to generate markdown from. Options: "cleaned_html", "raw_html", "fit_html". Defaults to "cleaned_html".

    Returns:
        MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown.
    """

    def __init__(
        self,
        content_filter: Optional[RelevantContentFilter] = None,
        options: Optional[Dict[str, Any]] = None,
        content_source: str = "cleaned_html",
    ):
        super().__init__(content_filter, options, verbose=False, content_source=content_source)

    def convert_links_to_citations(
        self, markdown: str, base_url: str = ""
    ) -> Tuple[str, str]:
        """
        Replace inline markdown links with numbered citation markers.

        Every distinct URL gets one number, assigned in order of first
        appearance. ``[text](url)`` becomes ``text⟨n⟩`` and ``![text](url)``
        becomes ``![text⟨n⟩]``.

        Args:
            markdown (str): Markdown text.
            base_url (str): Base URL used to resolve non-absolute links.

        Returns:
            Tuple[str, str]: Converted markdown and references markdown. The
            references text always begins with the "## References" heading.
        """
        citation_index = {}  # url -> (number, description suffix)
        resolved_urls = {}  # raw url -> joined url

        def _to_citation(match):
            text, url, title = match.groups()

            # Resolve relative targets once per distinct raw URL.
            if base_url and not url.startswith(("http://", "https://", "mailto:")):
                if url not in resolved_urls:
                    resolved_urls[url] = fast_urljoin(base_url, url)
                url = resolved_urls[url]

            if url not in citation_index:
                labels = []
                if title:
                    labels.append(title)
                if text and text != title:
                    labels.append(text)
                suffix = ": " + " - ".join(labels) if labels else ""
                citation_index[url] = (len(citation_index) + 1, suffix)

            number = citation_index[url][0]
            if match.group(0).startswith("!"):
                return f"![{text}⟨{number}⟩]"
            return f"{text}⟨{number}⟩"

        converted_text = LINK_PATTERN.sub(_to_citation, markdown)

        # Numbers are assigned on insertion, so dict order is citation order.
        reference_lines = ["\n\n## References\n\n"]
        for url, (number, suffix) in citation_index.items():
            reference_lines.append(f"⟨{number}⟩ {url}{suffix}\n")

        return converted_text, "".join(reference_lines)

    def generate_markdown(
        self,
        input_html: str,
        base_url: str = "",
        html2text_options: Optional[Dict[str, Any]] = None,
        options: Optional[Dict[str, Any]] = None,
        content_filter: Optional[RelevantContentFilter] = None,
        citations: bool = True,
        **kwargs,
    ) -> MarkdownGenerationResult:
        """
        Generate markdown with citations from the provided input HTML.

        Failures in any stage are reported as text in the corresponding result
        field instead of being raised.

        Args:
            input_html (str): The HTML content to process (selected based on content_source).
            base_url (str): Base URL for URL joins.
            html2text_options (Optional[Dict[str, Any]]): HTML2Text options.
            options (Optional[Dict[str, Any]]): Additional options for markdown generation.
                Used only when ``html2text_options`` is empty.
            content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown.
            citations (bool): Whether to generate citations.

        Returns:
            MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown.
        """
        try:
            converter = CustomHTML2Text(baseurl=base_url)
            settings = {
                "body_width": 0,  # Disable text wrapping
                "ignore_emphasis": False,
                "ignore_links": False,
                "ignore_images": False,
                "protect_links": False,
                "single_line_break": True,
                "mark_code": True,
                "escape_snob": False,
            }

            # Exactly one override source applies: the first non-empty of
            # html2text_options, options, self.options.
            overrides = html2text_options or options or self.options
            if overrides:
                settings.update(overrides)
            converter.update_params(**settings)

            # Normalise the input to a string.
            if not input_html:
                input_html = ""
            elif not isinstance(input_html, str):
                input_html = str(input_html)

            # Raw markdown
            try:
                raw_markdown = converter.handle(input_html)
            except Exception as e:
                raw_markdown = f"Error converting HTML to markdown: {str(e)}"
            raw_markdown = raw_markdown.replace(" ```", "```")

            # Citations
            markdown_with_citations: str = raw_markdown
            references_markdown: str = ""
            if citations:
                try:
                    converted = self.convert_links_to_citations(raw_markdown, base_url)
                    markdown_with_citations, references_markdown = converted
                except Exception as e:
                    markdown_with_citations = raw_markdown
                    references_markdown = f"Error generating citations: {str(e)}"

            # Fit markdown, only when a filter is available
            fit_markdown: Optional[str] = ""
            filtered_html: Optional[str] = ""
            active_filter = content_filter or self.content_filter
            if active_filter:
                try:
                    kept_blocks = active_filter.filter_content(input_html)
                    filtered_html = "\n".join(
                        f"<div>{block}</div>" for block in kept_blocks
                    )
                    fit_markdown = converter.handle(filtered_html)
                except Exception as e:
                    fit_markdown = f"Error generating fit markdown: {str(e)}"
                    filtered_html = ""

            return MarkdownGenerationResult(
                raw_markdown=raw_markdown or "",
                markdown_with_citations=markdown_with_citations or "",
                references_markdown=references_markdown or "",
                fit_markdown=fit_markdown or "",
                fit_html=filtered_html or "",
            )
        except Exception as e:
            # If anything fails, return empty strings with error message
            error_msg = f"Error in markdown generation: {str(e)}"
            return MarkdownGenerationResult(
                raw_markdown=error_msg,
                markdown_with_citations=error_msg,
                references_markdown="",
                fit_markdown="",
                fit_html="",
            )
|
||
|
||
```
|
||
|
||
|
||
## File: crawl4ai/browser_manager.py
|
||
|
||
```py
|
||
import asyncio
|
||
import time
|
||
from typing import List, Optional
|
||
import os
|
||
import sys
|
||
import shutil
|
||
import tempfile
|
||
import subprocess
|
||
from playwright.async_api import BrowserContext
|
||
import hashlib
|
||
from .js_snippet import load_js_script
|
||
from .config import DOWNLOAD_PAGE_TIMEOUT
|
||
from .async_configs import BrowserConfig, CrawlerRunConfig
|
||
from playwright_stealth import StealthConfig
|
||
from .utils import get_chromium_path
|
||
|
||
# Shared stealth configuration: every option listed below is enabled.
stealth_config = StealthConfig(
    **{
        option: True
        for option in (
            "webdriver",
            "chrome_app",
            "chrome_csi",
            "chrome_load_times",
            "chrome_runtime",
            "navigator_languages",
            "navigator_plugins",
            "navigator_permissions",
            "webgl_vendor",
            "outerdimensions",
            "navigator_hardware_concurrency",
            "media_codecs",
        )
    }
)
|
||
|
||
# Command-line switches collected for launching the browser with optional
# features turned off. Order is preserved from the original list.
BROWSER_DISABLE_OPTIONS = [
    # --disable-* switches
    "--disable-background-networking",
    "--disable-background-timer-throttling",
    "--disable-backgrounding-occluded-windows",
    "--disable-breakpad",
    "--disable-client-side-phishing-detection",
    "--disable-component-extensions-with-background-pages",
    "--disable-default-apps",
    "--disable-extensions",
    "--disable-features=TranslateUI",
    "--disable-hang-monitor",
    "--disable-ipc-flooding-protection",
    "--disable-popup-blocking",
    "--disable-prompt-on-repost",
    "--disable-sync",
    # remaining switches
    "--force-color-profile=srgb",
    "--metrics-recording-only",
    "--no-first-run",
    "--password-store=basic",
    "--use-mock-keychain",
]
|
||
|
||
|
||
class ManagedBrowser:
|
||
"""
|
||
Manages the browser process and context. This class allows to connect to the browser using CDP protocol.
|
||
|
||
Attributes:
|
||
browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit".
|
||
Default: "chromium".
|
||
user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a
|
||
temporary directory may be used. Default: None.
|
||
headless (bool): Whether to run the browser in headless mode (no visible GUI).
|
||
Default: True.
|
||
browser_process (subprocess.Popen): The process object for the browser.
|
||
temp_dir (str): Temporary directory for user data if not provided.
|
||
debugging_port (int): Port for debugging the browser.
|
||
host (str): Host for debugging the browser.
|
||
|
||
Methods:
|
||
start(): Starts the browser process and returns the CDP endpoint URL.
|
||
_get_browser_path(): Returns the browser executable path based on OS and browser type.
|
||
_get_browser_args(): Returns browser-specific command line arguments.
|
||
_get_user_data_dir(): Returns the user data directory path.
|
||
_cleanup(): Terminates the browser process and removes the temporary directory.
|
||
create_profile(): Static method to create a user profile by launching a browser for user interaction.
|
||
"""
|
||
|
||
browser_type: str
|
||
user_data_dir: str
|
||
headless: bool
|
||
browser_process: subprocess.Popen
|
||
temp_dir: str
|
||
debugging_port: int
|
||
host: str
|
||
|
||
def __init__(
    self,
    browser_type: str = "chromium",
    user_data_dir: Optional[str] = None,
    headless: bool = False,
    logger=None,
    host: str = "localhost",
    debugging_port: int = 9222,
    cdp_url: Optional[str] = None,
    browser_config: Optional[BrowserConfig] = None,
):
    """
    Initialize the ManagedBrowser instance.

    When ``browser_config`` is given, its values take precedence over the
    individual keyword arguments. Without it, the keyword arguments are
    used, so the instance can also be built without a config object.

    Args:
        browser_type (str): The type of browser to launch. Default: "chromium".
        user_data_dir (str or None): Path to a user data directory for persistent sessions.
            If None, ``start()`` creates a temporary directory. Default: None.
        headless (bool): Whether to run the browser in headless mode (no visible GUI).
            Default: False.
        logger: Logger instance for logging messages. Default: None.
        host (str): Host for debugging the browser. Default: "localhost".
        debugging_port (int): Port for debugging the browser. Default: 9222.
        cdp_url (str or None): CDP URL to connect to the browser. Default: None.
        browser_config (BrowserConfig): Configuration object containing all browser settings. Default: None.
    """
    if browser_config is not None:
        # The config object is authoritative when present.
        browser_type = browser_config.browser_type
        user_data_dir = browser_config.user_data_dir
        headless = browser_config.headless
        debugging_port = browser_config.debugging_port
        host = browser_config.host
        cdp_url = browser_config.cdp_url

    self.browser_type = browser_type
    self.user_data_dir = user_data_dir
    self.headless = headless
    self.browser_process = None
    self.temp_dir = None
    self.debugging_port = debugging_port
    self.host = host
    self.logger = logger
    self.shutting_down = False
    self.cdp_url = cdp_url
    self.browser_config = browser_config
|
||
|
||
async def start(self) -> str:
    """
    Starts the browser process or returns CDP endpoint URL.
    If cdp_url is provided, returns it directly.
    If user_data_dir is not provided for local browser, creates a temporary directory.

    Returns:
        str: CDP endpoint URL

    Raises:
        Exception: If the browser process cannot be launched; the original
            error is attached as ``__cause__``.
    """
    # If CDP URL provided, just return it
    if self.cdp_url:
        return self.cdp_url

    # Create temp dir if needed
    if not self.user_data_dir:
        self.temp_dir = tempfile.mkdtemp(prefix="browser-profile-")
        self.user_data_dir = self.temp_dir

    # Get browser path and args based on OS and browser type
    args = await self._get_browser_args()

    # Tolerate a missing config object instead of failing on attribute access.
    extra_args = getattr(self.browser_config, "extra_args", None)
    if extra_args:
        args.extend(extra_args)

    # Start browser process
    try:
        # Use DETACHED_PROCESS flag on Windows to fully detach the process
        # On Unix, we'll use preexec_fn=os.setpgrp to start the process in a new process group
        if sys.platform == "win32":
            self.browser_process = subprocess.Popen(
                args,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                creationflags=subprocess.DETACHED_PROCESS | subprocess.CREATE_NEW_PROCESS_GROUP
            )
        else:
            self.browser_process = subprocess.Popen(
                args,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                preexec_fn=os.setpgrp  # Start in a new process group
            )

        # We'll monitor for a short time to make sure it starts properly, but won't keep monitoring
        await asyncio.sleep(0.5)  # Give browser time to start
        await self._initial_startup_check()
        await asyncio.sleep(2)  # Give browser time to start
        return f"http://{self.host}:{self.debugging_port}"
    except Exception as e:
        await self.cleanup()
        # Chain the cause so the original traceback is not lost.
        raise Exception(f"Failed to start browser: {e}") from e
|
||
|
||
async def _initial_startup_check(self):
|
||
"""
|
||
Perform a quick check to make sure the browser started successfully.
|
||
This only runs once at startup rather than continuously monitoring.
|
||
"""
|
||
if not self.browser_process:
|
||
return
|
||
|
||
# Check that process started without immediate termination
|
||
await asyncio.sleep(0.5)
|
||
if self.browser_process.poll() is not None:
|
||
# Process already terminated
|
||
stdout, stderr = b"", b""
|
||
try:
|
||
stdout, stderr = self.browser_process.communicate(timeout=0.5)
|
||
except subprocess.TimeoutExpired:
|
||
pass
|
||
|
||
self.logger.error(
|
||
message="Browser process terminated during startup | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}",
|
||
tag="ERROR",
|
||
params={
|
||
"code": self.browser_process.returncode,
|
||
"stdout": stdout.decode() if stdout else "",
|
||
"stderr": stderr.decode() if stderr else "",
|
||
},
|
||
)
|
||
|
||
async def _monitor_browser_process(self):
|
||
"""
|
||
Monitor the browser process for unexpected termination.
|
||
|
||
How it works:
|
||
1. Read stdout and stderr from the browser process.
|
||
2. If the process has terminated, log the error message and terminate the browser.
|
||
3. If the shutting_down flag is set, log the normal termination message.
|
||
4. If any other error occurs, log the error message.
|
||
|
||
Note: This method should be called in a separate task to avoid blocking the main event loop.
|
||
This is DEPRECATED and should not be used for builtin browsers that need to outlive the Python process.
|
||
"""
|
||
if self.browser_process:
|
||
try:
|
||
stdout, stderr = await asyncio.gather(
|
||
asyncio.to_thread(self.browser_process.stdout.read),
|
||
asyncio.to_thread(self.browser_process.stderr.read),
|
||
)
|
||
|
||
# Check shutting_down flag BEFORE logging anything
|
||
if self.browser_process.poll() is not None:
|
||
if not self.shutting_down:
|
||
self.logger.error(
|
||
message="Browser process terminated unexpectedly | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}",
|
||
tag="ERROR",
|
||
params={
|
||
"code": self.browser_process.returncode,
|
||
"stdout": stdout.decode(),
|
||
"stderr": stderr.decode(),
|
||
},
|
||
)
|
||
await self.cleanup()
|
||
else:
|
||
self.logger.info(
|
||
message="Browser process terminated normally | Code: {code}",
|
||
tag="INFO",
|
||
params={"code": self.browser_process.returncode},
|
||
)
|
||
except Exception as e:
|
||
if not self.shutting_down:
|
||
self.logger.error(
|
||
message="Error monitoring browser process: {error}",
|
||
tag="ERROR",
|
||
params={"error": str(e)},
|
||
)
|
||
|
||
def _get_browser_path_WIP(self) -> str:
|
||
"""Returns the browser executable path based on OS and browser type"""
|
||
if sys.platform == "darwin": # macOS
|
||
paths = {
|
||
"chromium": "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
|
||
"firefox": "/Applications/Firefox.app/Contents/MacOS/firefox",
|
||
"webkit": "/Applications/Safari.app/Contents/MacOS/Safari",
|
||
}
|
||
elif sys.platform == "win32": # Windows
|
||
paths = {
|
||
"chromium": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
|
||
"firefox": "C:\\Program Files\\Mozilla Firefox\\firefox.exe",
|
||
"webkit": None, # WebKit not supported on Windows
|
||
}
|
||
else: # Linux
|
||
paths = {
|
||
"chromium": "google-chrome",
|
||
"firefox": "firefox",
|
||
"webkit": None, # WebKit not supported on Linux
|
||
}
|
||
|
||
return paths.get(self.browser_type)
|
||
|
||
async def _get_browser_path(self) -> str:
    """Return the executable path that ``get_chromium_path`` resolves for ``self.browser_type``."""
    return await get_chromium_path(self.browser_type)
|
||
|
||
async def _get_browser_args(self) -> List[str]:
|
||
"""Returns browser-specific command line arguments"""
|
||
base_args = [await self._get_browser_path()]
|
||
|
||
if self.browser_type == "chromium":
|
||
args = [
|
||
f"--remote-debugging-port={self.debugging_port}",
|
||
f"--user-data-dir={self.user_data_dir}",
|
||
]
|
||
if self.headless:
|
||
args.append("--headless=new")
|
||
elif self.browser_type == "firefox":
|
||
args = [
|
||
"--remote-debugging-port",
|
||
str(self.debugging_port),
|
||
"--profile",
|
||
self.user_data_dir,
|
||
]
|
||
if self.headless:
|
||
args.append("--headless")
|
||
else:
|
||
raise NotImplementedError(f"Browser type {self.browser_type} not supported")
|
||
|
||
return base_args + args
|
||
|
||
async def cleanup(self):
    """
    Stop the browser process (if any) and delete the temporary directory.

    Steps:
    1. Set ``shutting_down`` so monitoring code treats the exit as expected.
    2. If the process is still running, call ``terminate()`` and poll for up
       to ten 100 ms intervals; if it is still alive, force-kill it
       (``taskkill`` on Windows with ``kill()`` as fallback, ``kill()``
       elsewhere).
    3. Remove ``self.temp_dir`` when it is set and exists.

    Errors during termination or directory removal are logged through
    ``self.logger`` and not re-raised.
    """
    # Set shutting_down flag BEFORE any termination actions
    self.shutting_down = True

    if self.browser_process:
        try:
            # poll() returns None only while the process is alive. A clean
            # exit has return code 0, which is falsy, so the liveness check
            # must compare against None rather than rely on truthiness.
            if self.browser_process.poll() is None:
                self.browser_process.terminate()

                # Wait for process to end gracefully: 10 attempts, 100ms each
                for _ in range(10):
                    if self.browser_process.poll() is not None:
                        break
                    await asyncio.sleep(0.1)

                # Force kill if still running
                if self.browser_process.poll() is None:
                    if sys.platform == "win32":
                        # On Windows we might need taskkill for detached processes
                        try:
                            subprocess.run(
                                ["taskkill", "/F", "/PID", str(self.browser_process.pid)]
                            )
                        except Exception:
                            self.browser_process.kill()
                    else:
                        self.browser_process.kill()
                    await asyncio.sleep(0.1)  # Brief wait for kill to take effect

        except Exception as e:
            self.logger.error(
                message="Error terminating browser: {error}",
                tag="ERROR",
                params={"error": str(e)},
            )

    if self.temp_dir and os.path.exists(self.temp_dir):
        try:
            shutil.rmtree(self.temp_dir)
        except Exception as e:
            self.logger.error(
                message="Error removing temporary directory: {error}",
                tag="ERROR",
                params={"error": str(e)},
            )
|
||
|
||
# These methods have been moved to BrowserProfiler class
|
||
@staticmethod
async def create_profile(browser_config=None, profile_name=None, logger=None):
    """
    Backward-compatible shim; the implementation lives in BrowserProfiler.

    Builds a ``BrowserProfiler`` with the given logger and forwards
    ``profile_name`` and ``browser_config`` to its ``create_profile()``,
    returning whatever that call returns.

    Please use BrowserProfiler.create_profile() instead.

    Example:
    ```python
    from crawl4ai.browser_profiler import BrowserProfiler

    profiler = BrowserProfiler()
    profile_path = await profiler.create_profile(profile_name="my-login-profile")
    ```
    """
    from .browser_profiler import BrowserProfiler

    delegate = BrowserProfiler(logger=logger)
    result = await delegate.create_profile(
        profile_name=profile_name,
        browser_config=browser_config,
    )
    return result
|
||
|
||
@staticmethod
def list_profiles():
    """
    Backward-compatible shim; the implementation lives in BrowserProfiler.

    Returns the result of ``BrowserProfiler().list_profiles()``.

    Please use BrowserProfiler.list_profiles() instead.

    Example:
    ```python
    from crawl4ai.browser_profiler import BrowserProfiler

    profiler = BrowserProfiler()
    profiles = profiler.list_profiles()
    ```
    """
    from .browser_profiler import BrowserProfiler

    delegate = BrowserProfiler()
    profiles = delegate.list_profiles()
    return profiles
|
||
|
||
@staticmethod
def delete_profile(profile_name_or_path):
    """
    Backward-compatible shim; the implementation lives in BrowserProfiler.

    Forwards ``profile_name_or_path`` to
    ``BrowserProfiler().delete_profile()`` and returns its result.

    Please use BrowserProfiler.delete_profile() instead.

    Example:
    ```python
    from crawl4ai.browser_profiler import BrowserProfiler

    profiler = BrowserProfiler()
    success = profiler.delete_profile("my-profile")
    ```
    """
    from .browser_profiler import BrowserProfiler

    delegate = BrowserProfiler()
    outcome = delegate.delete_profile(profile_name_or_path)
    return outcome
|
||
|
||
|
||
|
||
|
||
class BrowserManager:
|
||
"""
|
||
Manages the browser instance and context.
|
||
|
||
Attributes:
|
||
config (BrowserConfig): Configuration object containing all browser settings
|
||
logger: Logger instance for recording events and errors
|
||
browser (Browser): The browser instance
|
||
default_context (BrowserContext): The default browser context
|
||
managed_browser (ManagedBrowser): The managed browser instance
|
||
playwright (Playwright): The Playwright instance
|
||
sessions (dict): Dictionary to store session information
|
||
session_ttl (int): Session timeout in seconds
|
||
"""
|
||
|
||
_playwright_instance = None
|
||
|
||
@classmethod
async def get_playwright(cls):
    """Start a Playwright driver, store it on ``cls._playwright_instance`` and return it.

    Every call starts a new driver and overwrites the stored one.
    """
    from playwright.async_api import async_playwright

    driver = await async_playwright().start()
    cls._playwright_instance = driver
    return driver
|
||
|
||
def __init__(self, browser_config: BrowserConfig, logger=None):
|
||
"""
|
||
Initialize the BrowserManager with a browser configuration.
|
||
|
||
Args:
|
||
browser_config (BrowserConfig): Configuration object containing all browser settings
|
||
logger: Logger instance for recording events and errors
|
||
"""
|
||
self.config: BrowserConfig = browser_config
|
||
self.logger = logger
|
||
|
||
# Browser state
|
||
self.browser = None
|
||
self.default_context = None
|
||
self.managed_browser = None
|
||
self.playwright = None
|
||
|
||
# Session management
|
||
self.sessions = {}
|
||
self.session_ttl = 1800 # 30 minutes
|
||
|
||
# Keep track of contexts by a "config signature," so each unique config reuses a single context
|
||
self.contexts_by_config = {}
|
||
self._contexts_lock = asyncio.Lock()
|
||
|
||
# Initialize ManagedBrowser if needed
|
||
if self.config.use_managed_browser:
|
||
self.managed_browser = ManagedBrowser(
|
||
browser_type=self.config.browser_type,
|
||
user_data_dir=self.config.user_data_dir,
|
||
headless=self.config.headless,
|
||
logger=self.logger,
|
||
debugging_port=self.config.debugging_port,
|
||
cdp_url=self.config.cdp_url,
|
||
browser_config=self.config,
|
||
)
|
||
|
||
async def start(self):
    """
    Start Playwright and obtain a browser plus a default context.

    How it works:
    1. If Playwright was already started by this manager, ``close()`` first.
    2. Start a fresh Playwright driver.
    3. With a CDP URL or a managed browser: connect over CDP (starting the
       managed browser when no URL is configured), reuse the first existing
       context or create one, then run ``setup_context`` on it.
    4. Otherwise launch firefox, webkit or chromium with the arguments from
       ``_build_browser_args()`` and use the browser itself as the default
       context.
    """
    if self.playwright is not None:
        await self.close()

    from playwright.async_api import async_playwright

    self.playwright = await async_playwright().start()

    attach_over_cdp = self.config.cdp_url or self.config.use_managed_browser
    if not attach_over_cdp:
        launch_options = self._build_browser_args()

        # Anything other than firefox/webkit is launched as chromium.
        requested = self.config.browser_type
        engine_name = requested if requested in ("firefox", "webkit") else "chromium"
        engine = getattr(self.playwright, engine_name)
        self.browser = await engine.launch(**launch_options)

        self.default_context = self.browser
        return

    # From here on the manager behaves as if a managed browser was requested.
    self.config.use_managed_browser = True
    if self.config.cdp_url:
        endpoint = self.config.cdp_url
    else:
        endpoint = await self.managed_browser.start()
    self.browser = await self.playwright.chromium.connect_over_cdp(endpoint)

    existing = self.browser.contexts
    if existing:
        self.default_context = existing[0]
    else:
        self.default_context = await self.create_browser_context()
    await self.setup_context(self.default_context)
|
||
|
||
|
||
def _build_browser_args(self) -> dict:
|
||
"""Build browser launch arguments from config."""
|
||
args = [
|
||
"--disable-gpu",
|
||
"--disable-gpu-compositing",
|
||
"--disable-software-rasterizer",
|
||
"--no-sandbox",
|
||
"--disable-dev-shm-usage",
|
||
"--no-first-run",
|
||
"--no-default-browser-check",
|
||
"--disable-infobars",
|
||
"--window-position=0,0",
|
||
"--ignore-certificate-errors",
|
||
"--ignore-certificate-errors-spki-list",
|
||
"--disable-blink-features=AutomationControlled",
|
||
"--window-position=400,0",
|
||
"--disable-renderer-backgrounding",
|
||
"--disable-ipc-flooding-protection",
|
||
"--force-color-profile=srgb",
|
||
"--mute-audio",
|
||
"--disable-background-timer-throttling",
|
||
# "--single-process",
|
||
f"--window-size={self.config.viewport_width},{self.config.viewport_height}",
|
||
]
|
||
|
||
if self.config.light_mode:
|
||
args.extend(BROWSER_DISABLE_OPTIONS)
|
||
|
||
if self.config.text_mode:
|
||
args.extend(
|
||
[
|
||
"--blink-settings=imagesEnabled=false",
|
||
"--disable-remote-fonts",
|
||
"--disable-images",
|
||
"--disable-javascript",
|
||
"--disable-software-rasterizer",
|
||
"--disable-dev-shm-usage",
|
||
]
|
||
)
|
||
|
||
if self.config.extra_args:
|
||
args.extend(self.config.extra_args)
|
||
|
||
# Deduplicate args
|
||
args = list(dict.fromkeys(args))
|
||
|
||
browser_args = {"headless": self.config.headless, "args": args}
|
||
|
||
if self.config.chrome_channel:
|
||
browser_args["channel"] = self.config.chrome_channel
|
||
|
||
if self.config.accept_downloads:
|
||
browser_args["downloads_path"] = self.config.downloads_path or os.path.join(
|
||
os.getcwd(), "downloads"
|
||
)
|
||
os.makedirs(browser_args["downloads_path"], exist_ok=True)
|
||
|
||
if self.config.proxy:
|
||
warnings.warn(
|
||
"BrowserConfig.proxy is deprecated and ignored. Use proxy_config instead.",
|
||
DeprecationWarning,
|
||
)
|
||
if self.config.proxy_config:
|
||
from playwright.async_api import ProxySettings
|
||
|
||
proxy_settings = ProxySettings(
|
||
server=self.config.proxy_config.server,
|
||
username=self.config.proxy_config.username,
|
||
password=self.config.proxy_config.password,
|
||
)
|
||
browser_args["proxy"] = proxy_settings
|
||
|
||
return browser_args
|
||
|
||
async def setup_context(
    self,
    context: BrowserContext,
    crawlerRunConfig: CrawlerRunConfig = None,
    is_default=False,
):
    """
    Apply the manager's browser configuration to an existing context.

    How it works:
    1. Send ``config.headers`` as extra HTTP headers, if any.
    2. Add ``config.cookies``, if any.
    3. If ``config.storage_state`` is set, call ``context.storage_state(path=None)``.
    4. If downloads are accepted, set the default and navigation timeouts to
       ``DOWNLOAD_PAGE_TIMEOUT`` and, when a downloads path is configured,
       record it in the context's internal options.
    5. If a user agent is configured, send ``User-Agent`` and ``sec-ch-ua``
       headers merged with ``config.headers`` (the latter win on conflict).
    6. Add a ``cookiesEnabled=true`` cookie for the run URL, or for
       https://crawl4ai.com/ when no run URL is available.
    7. If the run config enables ``override_navigator``, ``simulate_user``
       or ``magic``, register the navigator-override init script.

    Args:
        context (BrowserContext): The browser context to set up
        crawlerRunConfig (CrawlerRunConfig): Optional per-run settings
        is_default (bool): Flag indicating if this is the default context;
            not read by this method.
    Returns:
        None
    """
    cfg = self.config

    if cfg.headers:
        await context.set_extra_http_headers(cfg.headers)

    if cfg.cookies:
        await context.add_cookies(cfg.cookies)

    if cfg.storage_state:
        await context.storage_state(path=None)

    if cfg.accept_downloads:
        context.set_default_timeout(DOWNLOAD_PAGE_TIMEOUT)
        context.set_default_navigation_timeout(DOWNLOAD_PAGE_TIMEOUT)
        if cfg.downloads_path:
            # Playwright keeps these on a private options dict.
            internal_options = context._impl_obj._options
            internal_options["accept_downloads"] = True
            internal_options["downloads_path"] = cfg.downloads_path

    # Handle user agent and browser hints
    if cfg.user_agent:
        identity_headers = {
            "User-Agent": cfg.user_agent,
            "sec-ch-ua": cfg.browser_hint,
        }
        identity_headers.update(cfg.headers)
        await context.set_extra_http_headers(identity_headers)

    # Add default cookie
    if crawlerRunConfig and crawlerRunConfig.url:
        cookie_url = crawlerRunConfig.url
    else:
        cookie_url = "https://crawl4ai.com/"
    await context.add_cookies(
        [{"name": "cookiesEnabled", "value": "true", "url": cookie_url}]
    )

    # Handle navigator overrides
    if not crawlerRunConfig:
        return
    wants_override = (
        crawlerRunConfig.override_navigator
        or crawlerRunConfig.simulate_user
        or crawlerRunConfig.magic
    )
    if wants_override:
        await context.add_init_script(load_js_script("navigator_overrider"))
|
||
|
||
async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None):
    """
    Create a new browser context from the manager's configuration.

    The context gets the configured user agent (a ``User-Agent`` entry in
    ``config.headers`` takes precedence), viewport, proxy, download, storage
    state, HTTPS-error and JavaScript settings. A proxy on the run config
    replaces the browser-level one. In text mode, touch and mobile emulation
    are disabled and requests for common image, font, media, document,
    archive and binary file extensions are aborted.

    Args:
        crawlerRunConfig (CrawlerRunConfig): Optional per-run settings; only
            its ``proxy_config`` is read here.

    Returns:
        Context: Browser context object with the specified configurations
    """
    cfg = self.config

    context_settings = {
        "user_agent": cfg.headers.get("User-Agent", cfg.user_agent),
        "viewport": {
            "width": cfg.viewport_width,
            "height": cfg.viewport_height,
        },
        "proxy": {"server": cfg.proxy} if cfg.proxy else None,
        "accept_downloads": cfg.accept_downloads,
        "storage_state": cfg.storage_state,
        "ignore_https_errors": cfg.ignore_https_errors,
        "device_scale_factor": 1.0,
        "java_script_enabled": cfg.java_script_enabled,
    }

    # A proxy on the run config overrides the browser-level proxy.
    run_proxy = crawlerRunConfig.proxy_config if crawlerRunConfig else None
    if run_proxy:
        override = {"server": run_proxy.server}
        if run_proxy.username:
            override["username"] = run_proxy.username
            override["password"] = run_proxy.password
        context_settings["proxy"] = override

    if cfg.text_mode:
        context_settings["has_touch"] = False
        context_settings["is_mobile"] = False

    context = await self.browser.new_context(**context_settings)

    if not cfg.text_mode:
        return context

    # Text mode: abort requests for resources that carry no text content.
    images = ("jpg", "jpeg", "png", "gif", "webp", "svg", "ico", "bmp", "tiff", "psd")
    fonts = ("woff", "woff2", "ttf", "otf", "eot")
    # Styles ('css', 'less', 'scss', 'sass') are deliberately not blocked.
    media = (
        "mp4", "webm", "ogg", "avi", "mov", "wmv", "flv", "m4v",
        "mp3", "wav", "aac", "m4a", "opus", "flac",
    )
    documents = ("pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx")
    archives = ("zip", "rar", "7z", "tar", "gz")
    scripts_and_data = ("xml", "swf", "wasm")

    def _abort(route):
        return route.abort()

    for group in (images, fonts, media, documents, archives, scripts_and_data):
        for extension in group:
            await context.route(f"**/*.{extension}", _abort)
    return context
|
||
|
||
def _make_config_signature(self, crawlerRunConfig: CrawlerRunConfig) -> str:
|
||
"""
|
||
Converts the crawlerRunConfig into a dict, excludes ephemeral fields,
|
||
then returns a hash of the sorted JSON. This yields a stable signature
|
||
that identifies configurations requiring a unique browser context.
|
||
"""
|
||
import json
|
||
|
||
config_dict = crawlerRunConfig.__dict__.copy()
|
||
# Exclude items that do not affect browser-level setup.
|
||
# Expand or adjust as needed, e.g. chunking_strategy is purely for data extraction, not for browser config.
|
||
ephemeral_keys = [
|
||
"session_id",
|
||
"js_code",
|
||
"scraping_strategy",
|
||
"extraction_strategy",
|
||
"chunking_strategy",
|
||
"cache_mode",
|
||
"content_filter",
|
||
"semaphore_count",
|
||
"url"
|
||
]
|
||
for key in ephemeral_keys:
|
||
if key in config_dict:
|
||
del config_dict[key]
|
||
# Convert to canonical JSON string
|
||
signature_json = json.dumps(config_dict, sort_keys=True, default=str)
|
||
|
||
# Hash the JSON so we get a compact, unique string
|
||
signature_hash = hashlib.sha256(signature_json.encode("utf-8")).hexdigest()
|
||
return signature_hash
|
||
|
||
async def get_page(self, crawlerRunConfig: CrawlerRunConfig):
    """
    Return a page (and its context) suitable for the given run config.

    Expired sessions are scheduled for cleanup first. A known ``session_id``
    returns its stored page and refreshes its last-used time. Otherwise a
    managed browser reuses an open page whose URL equals the run URL or
    opens a new page in the default context, while a regular browser opens a
    new page in the context cached for the config signature (creating and
    setting up that context on first use). New pages are recorded under
    ``session_id`` when one is given.

    Args:
        crawlerRunConfig (CrawlerRunConfig): Configuration object containing all browser settings

    Returns:
        (page, context): The Page and its BrowserContext
    """
    self._cleanup_expired_sessions()

    session_id = crawlerRunConfig.session_id

    # Known session: hand back the stored page and refresh its timestamp.
    if session_id and session_id in self.sessions:
        context, page, _ = self.sessions[session_id]
        self.sessions[session_id] = (context, page, time.time())
        return page, context

    if self.config.use_managed_browser:
        # Managed browsers share the single default context.
        context = self.default_context
        page = None
        for candidate in context.pages:
            if candidate.url == crawlerRunConfig.url:
                page = candidate
                break
        if not page:
            page = await context.new_page()
    else:
        signature = self._make_config_signature(crawlerRunConfig)

        # The lock prevents two crawls from creating the same context twice.
        async with self._contexts_lock:
            if signature not in self.contexts_by_config:
                created = await self.create_browser_context(crawlerRunConfig)
                await self.setup_context(created, crawlerRunConfig)
                self.contexts_by_config[signature] = created
            context = self.contexts_by_config[signature]

        page = await context.new_page()

    # Remember the page so later calls with this session_id reuse it.
    if session_id:
        self.sessions[session_id] = (context, page, time.time())

    return page, context
|
||
|
||
async def kill_session(self, session_id: str):
    """
    Close the page of a stored session and forget the session.

    The session's context is closed as well, except when a managed browser
    is in use. Unknown session IDs are ignored.

    Args:
        session_id (str): The session ID to kill.
    """
    if session_id not in self.sessions:
        return

    context, page, _ = self.sessions[session_id]
    await page.close()
    if not self.config.use_managed_browser:
        await context.close()
    del self.sessions[session_id]
|
||
|
||
def _cleanup_expired_sessions(self):
|
||
"""Clean up expired sessions based on TTL."""
|
||
current_time = time.time()
|
||
expired_sessions = [
|
||
sid
|
||
for sid, (_, _, last_used) in self.sessions.items()
|
||
if current_time - last_used > self.session_ttl
|
||
]
|
||
for sid in expired_sessions:
|
||
asyncio.create_task(self.kill_session(sid))
|
||
|
||
async def close(self):
    """
    Release every resource this manager owns.

    Does nothing when the manager is attached to an external browser through
    ``config.cdp_url``. Otherwise, in order: optionally pauses half a second
    (``config.sleep_on_close``), kills all sessions, closes all cached
    contexts (logging failures instead of raising), closes the browser,
    cleans up the managed browser after a half-second pause, and stops
    Playwright.
    """
    if self.config.cdp_url:
        return

    if self.config.sleep_on_close:
        await asyncio.sleep(0.5)

    # Snapshot the ids: kill_session removes entries while we iterate.
    for session_id in list(self.sessions):
        await self.kill_session(session_id)

    # Now close all contexts we created. This reclaims memory from ephemeral contexts.
    for cached_context in self.contexts_by_config.values():
        try:
            await cached_context.close()
        except Exception as e:
            self.logger.error(
                message="Error closing context: {error}",
                tag="ERROR",
                params={"error": str(e)}
            )
    self.contexts_by_config.clear()

    if self.browser:
        await self.browser.close()
        self.browser = None

    if self.managed_browser:
        await asyncio.sleep(0.5)
        await self.managed_browser.cleanup()
        self.managed_browser = None

    if self.playwright:
        await self.playwright.stop()
        self.playwright = None
|
||
|
||
```
|
||
|
||
|
||
|
||
|
||
## File: docs/examples/quickstart.py
|
||
|
||
```py
|
||
import os, sys
|
||
|
||
from crawl4ai import LLMConfig
|
||
|
||
sys.path.append(
|
||
os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||
)
|
||
|
||
import asyncio
|
||
import time
|
||
import json
|
||
import re
|
||
from typing import Dict
|
||
from bs4 import BeautifulSoup
|
||
from pydantic import BaseModel, Field
|
||
from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig
|
||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||
from crawl4ai import (
|
||
JsonCssExtractionStrategy,
|
||
LLMExtractionStrategy,
|
||
)
|
||
|
||
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||
|
||
print("Crawl4AI: Advanced Web Crawling and Data Extraction")
|
||
print("GitHub Repository: https://github.com/unclecode/crawl4ai")
|
||
print("Twitter: @unclecode")
|
||
print("Website: https://crawl4ai.com")
|
||
|
||
|
||
# Basic Example - Simple Crawl
|
||
async def simple_crawl():
    """Crawl one page headlessly with the cache bypassed and print the first 500 characters of its markdown."""
    print("\n--- Basic Usage ---")
    headless_browser = BrowserConfig(headless=True)
    run_settings = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
    target_url = "https://www.nbcnews.com/business"

    async with AsyncWebCrawler(config=headless_browser) as crawler:
        result = await crawler.arun(url=target_url, config=run_settings)
        preview = result.markdown[:500]
        print(preview)
|
||
|
||
|
||
async def clean_content():
    """Crawl a Wikipedia article with a pruning content filter and print the raw and filtered markdown lengths."""
    pruning_filter = PruningContentFilter(
        threshold=0.48, threshold_type="fixed", min_word_threshold=0
    )
    markdown_generator = DefaultMarkdownGenerator(
        content_filter=pruning_filter,
        options={"ignore_links": True},
    )
    run_settings = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        excluded_tags=["nav", "footer", "aside"],
        remove_overlay_elements=True,
        markdown_generator=markdown_generator,
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://en.wikipedia.org/wiki/Apple",
            config=run_settings,
        )
        markdown = result.markdown
        full_markdown_length = len(markdown.raw_markdown)
        fit_markdown_length = len(markdown.fit_markdown)
        print(f"Full Markdown Length: {full_markdown_length}")
        print(f"Fit Markdown Length: {fit_markdown_length}")
|
||
|
||
|
||
async def link_analysis():
    """Crawl a page with external and social-media links excluded, then print link counts and the first five internal links."""
    run_settings = CrawlerRunConfig(
        cache_mode=CacheMode.ENABLED,
        exclude_external_links=True,
        exclude_social_media_links=True,
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            config=run_settings,
        )
        print(f"Found {len(result.links['internal'])} internal links")
        print(f"Found {len(result.links['external'])} external links")

        first_internal = result.links["internal"][:5]
        for link in first_internal:
            print(f"Href: {link['href']}\nText: {link['text']}\n")
|
||
|
||
|
||
# JavaScript Execution Example
|
||
async def simple_example_with_running_js_code():
    """Crawl a page after running a script that clicks its "Load More" button, then print the first 500 characters of markdown."""
    print("\n--- Executing JavaScript and Using CSS Selectors ---")

    # Clicks the first button whose text contains "Load More", if there is one.
    click_load_more = "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"

    js_browser = BrowserConfig(headless=True, java_script_enabled=True)
    run_settings = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        js_code=click_load_more,
        # wait_for="() => { return Array.from(document.querySelectorAll('article.tease-card')).length > 10; }"
    )

    async with AsyncWebCrawler(config=js_browser) as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business", config=run_settings
        )
        preview = result.markdown[:500]
        print(preview)
|
||
|
||
|
||
# CSS Selector Example
|
||
async def simple_example_with_css_selector():
    """Crawl a page restricted to one CSS selector and print the first 500 characters of markdown."""
    print("\n--- Using CSS Selectors ---")
    headless_browser = BrowserConfig(headless=True)
    run_settings = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        css_selector=".wide-tease-item__description",
    )
    target_url = "https://www.nbcnews.com/business"

    async with AsyncWebCrawler(config=headless_browser) as crawler:
        result = await crawler.arun(url=target_url, config=run_settings)
        preview = result.markdown[:500]
        print(preview)
|
||
|
||
|
||
async def media_handling():
    """Crawl a page with external images excluded and a screenshot requested, then print the first five images."""
    run_settings = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        exclude_external_images=True,
        screenshot=True,
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business", config=run_settings
        )
        top_images = result.media["images"][:5]
        for img in top_images:
            print(f"Image URL: {img['src']}, Alt: {img['alt']}, Score: {img['score']}")
|
||
|
||
|
||
async def custom_hook_workflow(verbose=True):
    """Register a ``before_goto`` hook, crawl one page, and print its first 500 characters of raw markdown.

    Newlines in the printed text are replaced with " -- ". ``verbose`` is
    accepted but not used.
    """

    def _announce_navigation(page, context):
        print("[Hook] Preparing to navigate...")

    async with AsyncWebCrawler() as crawler:
        # Run the hook just before every navigation.
        crawler.crawler_strategy.set_hook("before_goto", _announce_navigation)

        # Perform the crawl operation
        result = await crawler.arun(url="https://crawl4ai.com")
        excerpt = result.markdown.raw_markdown[:500]
        print(excerpt.replace("\n", " -- "))
|
||
|
||
|
||
# Proxy Example
|
||
async def use_proxy():
    """Crawl a page through an authenticated proxy and print the first 500 characters of markdown on success."""
    print("\n--- Using a Proxy ---")
    proxy_settings = {
        "server": "http://proxy.example.com:8080",
        "username": "username",
        "password": "password",
    }
    proxied_browser = BrowserConfig(headless=True, proxy_config=proxy_settings)
    run_settings = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

    async with AsyncWebCrawler(config=proxied_browser) as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business", config=run_settings
        )
        if not result.success:
            return
        print(result.markdown[:500])
|
||
|
||
|
||
# Screenshot Example
|
||
async def capture_and_save_screenshot(url: str, output_path: str):
    """Crawl ``url`` with a screenshot requested and write the decoded image to ``output_path``.

    The screenshot is base64-decoded before writing. A failure message is
    printed when the crawl fails or returns no screenshot.
    """
    headless_browser = BrowserConfig(headless=True)
    run_settings = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, screenshot=True)

    async with AsyncWebCrawler(config=headless_browser) as crawler:
        result = await crawler.arun(url=url, config=run_settings)

        if not (result.success and result.screenshot):
            print("Failed to capture screenshot")
            return

        import base64

        image_bytes = base64.b64decode(result.screenshot)
        with open(output_path, "wb") as image_file:
            image_file.write(image_bytes)
        print(f"Screenshot saved successfully to {output_path}")
|
||
|
||
|
||
# LLM Extraction Example
|
||
class OpenAIModelFee(BaseModel):
    # Extraction schema: one record per model with its input and output fee.
    # Documented with comments rather than a class docstring because a
    # docstring would presumably be emitted as the schema description by
    # model_json_schema() (verify against pydantic).
    model_name: str = Field(
        ...,
        description="Name of the OpenAI model.",
    )
    input_fee: str = Field(
        ...,
        description="Fee for input token for the OpenAI model.",
    )
    output_fee: str = Field(
        ...,
        description="Fee for output token for the OpenAI model.",
    )
|
||
|
||
|
||
async def extract_structured_data_using_llm(
    provider: str, api_token: str = None, extra_headers: Dict[str, str] = None
):
    """Extract model pricing from the OpenAI pricing page with an LLM and print the extracted content.

    Args:
        provider: LLM provider identifier passed to ``LLMConfig``.
        api_token: API token for the provider; required unless the provider
            is "ollama", otherwise the example is skipped.
        extra_headers: Optional headers added to the LLM call arguments.
    """
    print(f"\n--- Extracting Structured Data with {provider} ---")

    token_missing = api_token is None
    if token_missing and provider != "ollama":
        print(f"API token is required for {provider}. Skipping this example.")
        return

    headless_browser = BrowserConfig(headless=True)

    llm_call_args = {"temperature": 0, "top_p": 0.9, "max_tokens": 2000}
    if extra_headers:
        llm_call_args["extra_headers"] = extra_headers

    instruction = (
        "From the crawled content, extract all mentioned model names along with their fees for input and output tokens.\n"
        "Do not miss any models in the entire content."
    )
    strategy = LLMExtractionStrategy(
        llm_config=LLMConfig(provider=provider,api_token=api_token),
        schema=OpenAIModelFee.model_json_schema(),
        extraction_type="schema",
        instruction=instruction,
        extra_args=llm_call_args,
    )
    run_settings = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        word_count_threshold=1,
        page_timeout=80000,
        extraction_strategy=strategy,
    )

    async with AsyncWebCrawler(config=headless_browser) as crawler:
        result = await crawler.arun(
            url="https://openai.com/api/pricing/", config=run_settings
        )
        print(result.extracted_content)
|
||
|
||
|
||
# CSS Extraction Example
|
||
async def extract_structured_data_using_css_extractor():
    """Extract course entries from a page with a CSS-based schema and print the item count and the first item.

    A script clicks every tab of the target section before the HTML is
    captured.
    """
    print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")

    def _text_field(name, selector):
        # Schema entry that captures the text of the element at `selector`.
        return {"name": name, "selector": selector, "type": "text"}

    schema = {
        "name": "KidoCode Courses",
        "baseSelector": "section.charge-methodology .framework-collection-item.w-dyn-item",
        "fields": [
            _text_field("section_title", "h3.heading-50"),
            _text_field("section_description", ".charge-content"),
            _text_field("course_name", ".text-block-93"),
            _text_field("course_description", ".course-content-text"),
            {
                "name": "course_icon",
                "selector": ".image-92",
                "type": "attribute",
                "attribute": "src",
            },
        ],
    }

    js_browser = BrowserConfig(headless=True, java_script_enabled=True)

    # Scrolls to and clicks each tab in turn, pausing 500 ms between clicks.
    js_click_tabs = (
        "\n"
        "(async () => {\n"
        "const tabs = document.querySelectorAll(\"section.charge-methodology .tabs-menu-3 > div\");\n"
        "for(let tab of tabs) {\n"
        "tab.scrollIntoView();\n"
        "tab.click();\n"
        "await new Promise(r => setTimeout(r, 500));\n"
        "}\n"
        "})();\n"
    )

    run_settings = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        extraction_strategy=JsonCssExtractionStrategy(schema),
        js_code=[js_click_tabs],
        delay_before_return_html=1
    )

    async with AsyncWebCrawler(config=js_browser) as crawler:
        result = await crawler.arun(
            url="https://www.kidocode.com/degrees/technology", config=run_settings
        )

        extracted_items = json.loads(result.extracted_content)
        print(f"Successfully extracted {len(extracted_items)} companies")
        print(json.dumps(extracted_items[0], indent=2))
|
||
|
||
|
||
# Dynamic Content Examples - Method 1
|
||
async def crawl_dynamic_content_pages_method_1():
    """Crawl three pages of TypeScript commits, waiting for new content in a hook.

    Pages after the first are reached by clicking the "next" pagination button
    inside the same session; the ``on_execution_started`` hook polls until the
    first commit heading differs from the one seen on the previous page.
    """
    print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
    first_commit = ""

    async def on_execution_started(page, **kwargs):
        """Poll the page until its first commit heading changes."""
        nonlocal first_commit
        heading_selector = "li.Box-sc-g0xbh4-0 h4"
        try:
            while True:
                await page.wait_for_selector(heading_selector)
                heading = await page.query_selector(heading_selector)
                heading_text = await heading.evaluate("(element) => element.textContent")
                # Strip all whitespace so layout changes don't count as new content.
                heading_text = re.sub(r"\s+", "", heading_text)
                if heading_text and heading_text != first_commit:
                    first_commit = heading_text
                    break
                await asyncio.sleep(0.5)
        except Exception as e:
            print(f"Warning: New content didn't appear after JavaScript execution: {e}")

    browser_config = BrowserConfig(headless=False, java_script_enabled=True)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)

        url = "https://github.com/microsoft/TypeScript/commits/main"
        session_id = "typescript_commits_session"
        all_commits = []

        js_next_page = """
        const button = document.querySelector('a[data-testid="pagination-next-button"]');
        if (button) button.click();
        """

        for page_index in range(3):
            # The first iteration loads the URL; later ones only run JS in the session.
            is_followup = page_index > 0
            crawler_config = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                css_selector="li.Box-sc-g0xbh4-0",
                js_code=js_next_page if is_followup else None,
                js_only=is_followup,
                session_id=session_id,
            )

            result = await crawler.arun(url=url, config=crawler_config)
            assert result.success, f"Failed to crawl page {page_index + 1}"

            parsed = BeautifulSoup(result.cleaned_html, "html.parser")
            commits = parsed.select("li")
            all_commits.extend(commits)

            print(f"Page {page_index + 1}: Found {len(commits)} commits")

        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
|
||
|
||
|
||
# Dynamic Content Examples - Method 2
|
||
async def crawl_dynamic_content_pages_method_2():
    """Crawl three pages of TypeScript commits, waiting for new content in JS.

    The injected script clicks the "next" button and then polls in the browser
    until the first commit heading changes. Commit titles are extracted with a
    ``JsonCssExtractionStrategy`` schema.
    """
    print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")

    browser_config = BrowserConfig(headless=False, java_script_enabled=True)

    js_next_page_and_wait = """
    (async () => {
        const getCurrentCommit = () => {
            const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
            return commits.length > 0 ? commits[0].textContent.trim() : null;
        };

        const initialCommit = getCurrentCommit();
        const button = document.querySelector('a[data-testid="pagination-next-button"]');
        if (button) button.click();

        while (true) {
            await new Promise(resolve => setTimeout(resolve, 100));
            const newCommit = getCurrentCommit();
            if (newCommit && newCommit !== initialCommit) {
                break;
            }
        }
    })();
    """

    title_field = {
        "name": "title",
        "selector": "h4.markdown-title",
        "type": "text",
        "transform": "strip",
    }
    schema = {
        "name": "Commit Extractor",
        "baseSelector": "li.Box-sc-g0xbh4-0",
        "fields": [title_field],
    }

    async with AsyncWebCrawler(config=browser_config) as crawler:
        url = "https://github.com/microsoft/TypeScript/commits/main"
        session_id = "typescript_commits_session"
        all_commits = []

        extraction_strategy = JsonCssExtractionStrategy(schema)

        for page_index in range(3):
            # Only follow-up pages run the click-and-wait script in the session.
            is_followup = page_index > 0
            crawler_config = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                css_selector="li.Box-sc-g0xbh4-0",
                extraction_strategy=extraction_strategy,
                js_code=js_next_page_and_wait if is_followup else None,
                js_only=is_followup,
                session_id=session_id,
            )

            result = await crawler.arun(url=url, config=crawler_config)
            assert result.success, f"Failed to crawl page {page_index + 1}"

            commits = json.loads(result.extracted_content)
            all_commits.extend(commits)
            print(f"Page {page_index + 1}: Found {len(commits)} commits")

        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
|
||
|
||
|
||
async def cosine_similarity_extraction():
    """Run ``CosineStrategy`` over an NBC News article and print the first five results."""
    from crawl4ai import CosineStrategy

    strategy = CosineStrategy(
        word_count_threshold=10,
        max_dist=0.2,  # Maximum distance between two words
        linkage_method="ward",  # Linkage method for hierarchical clustering (ward, complete, average, single)
        top_k=3,  # Number of top keywords to extract
        sim_threshold=0.3,  # Similarity threshold for clustering
        semantic_filter="McDonald's economic impact, American consumer trends",  # Keywords to filter the content semantically using embeddings
        verbose=True,
    )
    crawl_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        extraction_strategy=strategy,
    )
    article_url = "https://www.nbcnews.com/business/consumer/how-mcdonalds-e-coli-crisis-inflation-politics-reflect-american-story-rcna177156"

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url=article_url, config=crawl_config)
        extracted = json.loads(result.extracted_content)
        print(extracted[:5])
|
||
|
||
|
||
# Browser Comparison
|
||
async def crawl_custom_browser_type():
    """Crawl example.com with Firefox, WebKit and Chromium, timing each run.

    For every engine, prints the elapsed wall-clock time since just before the
    crawler was started, then the first 500 characters of the markdown.
    """
    print("\n--- Browser Comparison ---")

    # (label printed in the report, value passed as browser_type)
    engines = (
        ("Firefox", "firefox"),
        ("WebKit", "webkit"),
        ("Chromium", "chromium"),  # Chromium (default)
    )

    for label, browser_type in engines:
        engine_config = BrowserConfig(browser_type=browser_type, headless=True)
        start = time.time()
        async with AsyncWebCrawler(config=engine_config) as crawler:
            result = await crawler.arun(
                url="https://www.example.com",
                config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
            )
            print(f"{label}:", time.time() - start)
            print(result.markdown[:500])
|
||
|
||
|
||
# Anti-Bot and User Simulation
|
||
async def crawl_with_user_simulation():
    """Crawl a placeholder URL with a random mobile user agent and user-simulation flags on."""
    user_agent_options = {"device_type": "mobile", "os_type": "android"}
    browser_config = BrowserConfig(
        headless=True,
        user_agent_mode="random",
        user_agent_generator_config=user_agent_options,
    )

    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        magic=True,
        simulate_user=True,
        override_navigator=True,
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Replace the placeholder with a real URL before running.
        result = await crawler.arun(url="YOUR-URL-HERE", config=crawler_config)
        print(result.markdown)
|
||
|
||
|
||
async def ssl_certification():
    """Fetch example.com's SSL certificate, print key fields, and export it.

    On success the certificate is written as JSON, PEM and DER files into
    ``<__location__>/tmp`` (created if missing). Nothing is printed or written
    when the crawl fails or returns no certificate.
    """
    # Configure crawler to fetch SSL certificate
    config = CrawlerRunConfig(
        fetch_ssl_certificate=True,
        cache_mode=CacheMode.BYPASS,  # Bypass cache to always get fresh certificates
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com", config=config)

        if not (result.success and result.ssl_certificate):
            return

        cert = result.ssl_certificate

        tmp_dir = os.path.join(__location__, "tmp")
        os.makedirs(tmp_dir, exist_ok=True)

        # Build each export path once; it is used for both writing and reporting.
        json_path = os.path.join(tmp_dir, "certificate.json")
        pem_path = os.path.join(tmp_dir, "certificate.pem")
        der_path = os.path.join(tmp_dir, "certificate.der")

        # 1. Access certificate properties directly
        print("\nCertificate Information:")
        print(f"Issuer: {cert.issuer.get('CN', '')}")
        print(f"Valid until: {cert.valid_until}")
        print(f"Fingerprint: {cert.fingerprint}")

        # 2. Export certificate in different formats
        cert.to_json(json_path)  # For analysis
        print("\nCertificate exported to:")
        print(f"- JSON: {json_path}")

        # The exporters' return values are not needed here, only the files.
        cert.to_pem(pem_path)  # For web servers
        print(f"- PEM: {pem_path}")

        cert.to_der(der_path)  # For Java apps
        print(f"- DER: {der_path}")
|
||
|
||
|
||
# Main execution
|
||
async def main():
    """Run the quickstart examples one after another."""
    # Basic examples
    await simple_crawl()
    await simple_example_with_running_js_code()
    await simple_example_with_css_selector()

    # Advanced examples
    await extract_structured_data_using_css_extractor()
    llm_provider = "openai/gpt-4o"
    await extract_structured_data_using_llm(llm_provider, os.getenv("OPENAI_API_KEY"))
    await crawl_dynamic_content_pages_method_1()
    await crawl_dynamic_content_pages_method_2()

    # Browser comparisons
    await crawl_custom_browser_type()

    # Screenshot example
    screenshot_path = os.path.join(__location__, "tmp/example_screenshot.jpg")
    await capture_and_save_screenshot("https://www.example.com", screenshot_path)
|
||
|
||
|
||
if __name__ == "__main__":
|
||
asyncio.run(main())
|
||
|
||
```
|
||
|
||
|
||
## File: docs/examples/quickstart_examples_set_1.py
|
||
|
||
```py
|
||
import asyncio
|
||
import os
|
||
import json
|
||
import base64
|
||
from pathlib import Path
|
||
from typing import List
|
||
from crawl4ai import ProxyConfig
|
||
|
||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, CrawlResult
|
||
from crawl4ai import RoundRobinProxyStrategy
|
||
from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy
|
||
from crawl4ai import LLMConfig
|
||
from crawl4ai import PruningContentFilter, BM25ContentFilter
|
||
from crawl4ai import DefaultMarkdownGenerator
|
||
from crawl4ai import BFSDeepCrawlStrategy, DomainFilter, FilterChain
|
||
from crawl4ai import BrowserConfig
|
||
|
||
__cur_dir__ = Path(__file__).parent
|
||
|
||
async def demo_basic_crawl():
    """Crawl Hacker News once and report the size and start of its markdown."""
    print("\n=== 1. Basic Web Crawling ===")
    browser_config = BrowserConfig(
        viewport_height=800,
        viewport_width=1200,
        headless=True,
        verbose=True,
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        results: List[CrawlResult] = await crawler.arun(
            url="https://news.ycombinator.com/"
        )

        for position, result in enumerate(results, start=1):
            print(f"Result {position}:")
            print(f"Success: {result.success}")
            if not result.success:
                print("Failed to crawl the URL")
                continue
            print(f"Markdown length: {len(result.markdown.raw_markdown)} chars")
            print(f"First 100 chars: {result.markdown.raw_markdown[:100]}...")
|
||
|
||
async def demo_parallel_crawl():
    """Crawl several URLs with ``arun_many`` and print a per-URL status line."""
    print("\n=== 2. Parallel Crawling ===")

    urls = [
        "https://news.ycombinator.com/",
        "https://example.com/",
        "https://httpbin.org/html",
    ]

    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun_many(urls=urls)

        print(f"Crawled {len(results)} URLs in parallel:")
        for position, result in enumerate(results, start=1):
            status = "Success" if result.success else "Failed"
            print(f" {position}. {result.url} - {status}")
|
||
|
||
async def demo_fit_markdown():
    """Generate focused ("fit") markdown using a ``PruningContentFilter``.

    Prints the character counts of the raw and the filtered markdown for the
    Wikipedia article on Python.
    """
    print("\n=== 3. Fit Markdown with LLM Content Filter ===")

    markdown_generator = DefaultMarkdownGenerator(
        content_filter=PruningContentFilter()
    )
    run_config = CrawlerRunConfig(markdown_generator=markdown_generator)

    async with AsyncWebCrawler() as crawler:
        result: CrawlResult = await crawler.arun(
            url="https://en.wikipedia.org/wiki/Python_(programming_language)",
            config=run_config,
        )

        # Print stats for the raw and the fit markdown
        markdown = result.markdown
        print(f"Raw: {len(markdown.raw_markdown)} chars")
        print(f"Fit: {len(markdown.fit_markdown)} chars")
|
||
|
||
async def demo_llm_structured_extraction_no_schema():
    """Extract Hacker News stories with an LLM guided by a loose, inline schema.

    The schema is a free-form string rather than a formal JSON schema. Requires
    the ``GROQ_API_KEY`` environment variable (referenced via ``env:``).
    """
    print("\n=== 4. LLM Structured Extraction ===")

    # Create a simple LLM extraction strategy (no formal schema required)
    extraction_strategy = LLMExtractionStrategy(
        llm_config=LLMConfig(
            provider="groq/qwen-2.5-32b",
            api_token="env:GROQ_API_KEY",
        ),
        instruction="This is news.ycombinator.com, extract all news, and for each, I want title, source url, number of comments.",
        # The keyword is ``extraction_type``; the previous ``extract_type``
        # spelling did not match the strategy's parameter name.
        extraction_type="schema",
        schema="{title: string, url: string, comments: int}",
        extra_args={
            "temperature": 0.0,
            "max_tokens": 4096,
        },
        verbose=True,
    )

    config = CrawlerRunConfig(extraction_strategy=extraction_strategy)

    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun(
            "https://news.ycombinator.com/", config=config
        )

        for result in results:
            print(f"URL: {result.url}")
            print(f"Success: {result.success}")
            if result.success:
                data = json.loads(result.extracted_content)
                print(json.dumps(data, indent=2))
            else:
                print("Failed to extract structured data")
|
||
|
||
async def demo_css_structured_extraction_no_schema():
    """Extract structured data using CSS selectors.

    A CSS extraction schema is loaded from ``tmp/schema.json`` next to this
    script when present; otherwise it is generated once by an LLM from a sample
    HTML snippet. The schema is then saved and used for LLM-free extraction.
    """
    print("\n=== 5. CSS-Based Structured Extraction ===")
    # Sample HTML for schema generation (one-time cost)
    sample_html = """
<div class="body-post clear">
    <a class="story-link" href="https://thehackernews.com/2025/04/malicious-python-packages-on-pypi.html">
        <div class="clear home-post-box cf">
            <div class="home-img clear">
                <div class="img-ratio">
                    <img alt="..." src="...">
                </div>
            </div>
            <div class="clear home-right">
                <h2 class="home-title">Malicious Python Packages on PyPI Downloaded 39,000+ Times, Steal Sensitive Data</h2>
                <div class="item-label">
                    <span class="h-datetime"><i class="icon-font icon-calendar"></i>Apr 05, 2025</span>
                    <span class="h-tags">Malware / Supply Chain Attack</span>
                </div>
                <div class="home-desc"> Cybersecurity researchers have...</div>
            </div>
        </div>
    </a>
</div>
    """

    # Check if schema file exists
    schema_file_path = f"{__cur_dir__}/tmp/schema.json"
    if os.path.exists(schema_file_path):
        with open(schema_file_path, "r") as f:
            schema = json.load(f)
    else:
        # Generate schema using LLM (one-time setup)
        schema = JsonCssExtractionStrategy.generate_schema(
            html=sample_html,
            llm_config=LLMConfig(
                provider="groq/qwen-2.5-32b",
                api_token="env:GROQ_API_KEY",
            ),
            query="From https://thehackernews.com/, I have shared a sample of one news div with a title, date, and description. Please generate a schema for this news div.",
        )

    print(f"Generated schema: {json.dumps(schema, indent=2)}")
    # Save the schema so future runs reuse it and the LLM is called only once.
    # The tmp directory may not exist on a fresh checkout, so create it first.
    os.makedirs(os.path.dirname(schema_file_path), exist_ok=True)
    with open(schema_file_path, "w") as f:
        json.dump(schema, f, indent=2)

    # Create no-LLM extraction strategy with the generated schema
    extraction_strategy = JsonCssExtractionStrategy(schema)
    config = CrawlerRunConfig(extraction_strategy=extraction_strategy)

    # Use the fast CSS extraction (no LLM calls during extraction)
    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun(
            "https://thehackernews.com", config=config
        )

        for result in results:
            print(f"URL: {result.url}")
            print(f"Success: {result.success}")
            if result.success:
                data = json.loads(result.extracted_content)
                print(json.dumps(data, indent=2))
            else:
                print("Failed to extract structured data")
|
||
|
||
async def demo_deep_crawl():
    """Deep-crawl docs.crawl4ai.com breadth-first and list each page with its depth."""
    print("\n=== 6. Deep Crawling ===")

    # Restrict the crawl to the crawl4ai.com domain.
    domain_filter = DomainFilter(allowed_domains=["crawl4ai.com"])
    filter_chain = FilterChain([domain_filter])

    deep_crawl_strategy = BFSDeepCrawlStrategy(
        max_depth=1, max_pages=5, filter_chain=filter_chain
    )
    run_config = CrawlerRunConfig(deep_crawl_strategy=deep_crawl_strategy)

    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun(
            url="https://docs.crawl4ai.com",
            config=run_config,
        )

        print(f"Deep crawl returned {len(results)} pages:")
        for position, result in enumerate(results, start=1):
            depth = result.metadata.get("depth", "unknown")
            print(f" {position}. {result.url} (Depth: {depth})")
|
||
|
||
async def demo_js_interaction():
    """Execute JavaScript to load more content.

    Loads the Hacker News front page, extracts story titles, then clicks the
    "More" link inside the same session and extracts the titles of the next
    page, printing the running item count after each step.
    """
    print("\n=== 7. JavaScript Interaction ===")

    news = []

    def collect(crawl_results):
        """Append the extracted items of every successful result to ``news``."""
        for crawl_result in crawl_results:
            if crawl_result.success:
                data = json.loads(crawl_result.extracted_content)
                news.extend(data)
                print(json.dumps(data, indent=2))
            else:
                print("Failed to extract structured data")

    # A simple page that needs JS to reveal content
    async with AsyncWebCrawler(config=BrowserConfig(headless=False)) as crawler:
        news_schema = {
            "name": "news",
            "baseSelector": "tr.athing",
            "fields": [
                {
                    "name": "title",
                    "selector": "span.titleline",
                    "type": "text",
                }
            ],
        }

        # Initial load
        results: List[CrawlResult] = await crawler.arun(
            url="https://news.ycombinator.com",
            config=CrawlerRunConfig(
                session_id="hn_session",  # Keep session
                extraction_strategy=JsonCssExtractionStrategy(schema=news_schema),
            ),
        )
        collect(results)
        print(f"Initial items: {len(news)}")

        # Click "More" link
        more_config = CrawlerRunConfig(
            js_code="document.querySelector('a.morelink').click();",
            js_only=True,  # Continue in same page
            session_id="hn_session",  # Keep session
            extraction_strategy=JsonCssExtractionStrategy(
                schema=news_schema,
            ),
        )

        more_results: List[CrawlResult] = await crawler.arun(
            url="https://news.ycombinator.com", config=more_config
        )

        # Extract new items. Iterate the results of the *second* crawl; the
        # earlier code looped over the first page's results again, which
        # double-counted them and ignored the newly loaded page.
        collect(more_results)
        print(f"Total items: {len(news)}")
|
||
|
||
async def demo_media_and_links():
    """Extract media and links from a page.

    Prints how many images and internal/external links were found on the
    Wikipedia main page, shows the first three of each, and saves them to
    ``tmp/images.json`` and ``tmp/links.json`` next to this script.
    """
    print("\n=== 8. Media and Links Extraction ===")

    output_dir = f"{__cur_dir__}/tmp"

    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun("https://en.wikipedia.org/wiki/Main_Page")

        # Distinct names for the container and its items avoid shadowing the
        # collection while it is being iterated.
        for result in results:
            # Extract all images
            images = result.media.get("images", [])
            print(f"Found {len(images)} images")

            # Extract all links (internal and external)
            internal_links = result.links.get("internal", [])
            external_links = result.links.get("external", [])
            print(f"Found {len(internal_links)} internal links")
            print(f"Found {len(external_links)} external links")

            # Print some of the images and links
            for image in images[:3]:
                print(f"Image: {image['src']}")
            for link in internal_links[:3]:
                print(f"Internal link: {link['href']}")
            for link in external_links[:3]:
                print(f"External link: {link['href']}")

            # Save everything to files; make sure the target directory exists.
            os.makedirs(output_dir, exist_ok=True)
            with open(f"{output_dir}/images.json", "w") as f:
                json.dump(images, f, indent=2)

            with open(f"{output_dir}/links.json", "w") as f:
                json.dump(
                    {"internal": internal_links, "external": external_links},
                    f,
                    indent=2,
                )
|
||
|
||
async def demo_screenshot_and_pdf():
    """Capture screenshot and PDF of a page.

    The screenshot (base64-decoded) and the PDF bytes are written to
    ``tmp/example_screenshot.png`` and ``tmp/example.pdf`` next to this script
    when the crawl result provides them.
    """
    print("\n=== 9. Screenshot and PDF Capture ===")

    output_dir = f"{__cur_dir__}/tmp"

    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun(
            # url="https://example.com",
            url="https://en.wikipedia.org/wiki/Giant_anteater",
            config=CrawlerRunConfig(screenshot=True, pdf=True),
        )

        # Distinct names for the container and its items avoid shadowing the
        # collection while it is being iterated.
        for result in results:
            if result.screenshot:
                # Save screenshot; the directory may not exist on a fresh checkout.
                os.makedirs(output_dir, exist_ok=True)
                screenshot_path = f"{output_dir}/example_screenshot.png"
                with open(screenshot_path, "wb") as f:
                    f.write(base64.b64decode(result.screenshot))
                print(f"Screenshot saved to {screenshot_path}")

            if result.pdf:
                # Save PDF
                os.makedirs(output_dir, exist_ok=True)
                pdf_path = f"{output_dir}/example.pdf"
                with open(pdf_path, "wb") as f:
                    f.write(result.pdf)
                print(f"PDF saved to {pdf_path}")
|
||
|
||
async def demo_proxy_rotation():
    """Show how a round-robin proxy strategy is wired into a run config.

    Uses placeholder proxies and performs no crawl; the config is built only
    for illustration.
    """
    print("\n=== 10. Proxy Rotation ===")

    # Example proxies (replace with real ones)
    proxy_servers = ("http://proxy1.example.com:8080", "http://proxy2.example.com:8080")
    proxies = [ProxyConfig(server=server) for server in proxy_servers]

    proxy_strategy = RoundRobinProxyStrategy(proxies)

    print(f"Using {len(proxies)} proxies in rotation")
    print(
        "Note: This example uses placeholder proxies - replace with real ones to test"
    )

    async with AsyncWebCrawler() as crawler:
        # Built for illustration only; no request is issued with it here.
        config = CrawlerRunConfig(
            proxy_rotation_strategy=proxy_strategy
        )

        # In a real scenario, these would be run and the proxies would rotate
        print("In a real scenario, requests would rotate through the available proxies")
|
||
|
||
async def demo_raw_html_and_file():
    """Process raw HTML and local files.

    Crawls the same sample document twice: once passed inline with the
    ``raw:`` prefix and once through a temporary ``file://`` URL. The temporary
    file lives in ``tmp`` next to this script and is removed afterwards, even
    if a crawl raises.
    """
    print("\n=== 11. Raw HTML and Local Files ===")

    raw_html = """
    <html><body>
        <h1>Sample Article</h1>
        <p>This is sample content for testing Crawl4AI's raw HTML processing.</p>
    </body></html>
    """

    # Save to file. Anchor the path to this script's directory (like the other
    # demos) instead of the current working directory, and create the folder.
    file_path = (Path(__cur_dir__) / "tmp" / "sample.html").absolute()
    file_path.parent.mkdir(parents=True, exist_ok=True)
    with open(file_path, "w") as f:
        f.write(raw_html)

    try:
        async with AsyncWebCrawler() as crawler:
            # Crawl raw HTML
            raw_result = await crawler.arun(
                url="raw:" + raw_html, config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
            )
            print("Raw HTML processing:")
            print(f" Markdown: {raw_result.markdown.raw_markdown[:50]}...")

            # Crawl local file
            file_result = await crawler.arun(
                url=f"file://{file_path}",
                config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
            )
            print("\nLocal file processing:")
            print(f" Markdown: {file_result.markdown.raw_markdown[:50]}...")
    finally:
        # Clean up the temporary file regardless of crawl outcome
        os.remove(file_path)

    print(f"Processed both raw HTML and local file ({file_path})")
|
||
|
||
async def main():
    """Run all demo functions sequentially"""
    print("=== Comprehensive Crawl4AI Demo ===")
    print("Note: Some examples require API keys or other configurations")

    # Demos in execution order; demo_proxy_rotation is intentionally left out.
    demos = (
        demo_basic_crawl,
        demo_parallel_crawl,
        demo_fit_markdown,
        demo_llm_structured_extraction_no_schema,
        demo_css_structured_extraction_no_schema,
        demo_deep_crawl,
        demo_js_interaction,
        demo_media_and_links,
        demo_screenshot_and_pdf,
        demo_raw_html_and_file,
    )
    for demo in demos:
        await demo()

    print("\n=== Demo Complete ===")
    print("Check for any generated files (screenshots, PDFs) in the current directory")
|
||
|
||
if __name__ == "__main__":
|
||
asyncio.run(main())
|
||
|
||
```
|
||
|
||
|
||
|
||
|
||
## File: docs/examples/dispatcher_example.py
|
||
|
||
```py
|
||
import asyncio
|
||
import time
|
||
from rich import print
|
||
from rich.table import Table
|
||
from crawl4ai import (
|
||
AsyncWebCrawler,
|
||
BrowserConfig,
|
||
CrawlerRunConfig,
|
||
MemoryAdaptiveDispatcher,
|
||
SemaphoreDispatcher,
|
||
RateLimiter,
|
||
CrawlerMonitor,
|
||
DisplayMode,
|
||
CacheMode,
|
||
LXMLWebScrapingStrategy,
|
||
)
|
||
|
||
|
||
async def memory_adaptive(urls, browser_config, run_config):
    """Crawl ``urls`` with a monitored ``MemoryAdaptiveDispatcher``.

    Returns:
        A ``(result_count, elapsed_seconds)`` tuple, timed with
        ``time.perf_counter``.
    """
    started_at = time.perf_counter()
    async with AsyncWebCrawler(config=browser_config) as crawler:
        monitor = CrawlerMonitor(
            max_visible_rows=15, display_mode=DisplayMode.DETAILED
        )
        dispatcher = MemoryAdaptiveDispatcher(
            memory_threshold_percent=70.0,
            max_session_permit=10,
            monitor=monitor,
        )
        results = await crawler.arun_many(
            urls, config=run_config, dispatcher=dispatcher
        )
    elapsed = time.perf_counter() - started_at
    return len(results), elapsed
|
||
|
||
|
||
async def memory_adaptive_with_rate_limit(urls, browser_config, run_config):
    """Crawl ``urls`` with a rate-limited, monitored ``MemoryAdaptiveDispatcher``.

    Returns:
        A ``(result_count, elapsed_seconds)`` tuple, timed with
        ``time.perf_counter``.
    """
    started_at = time.perf_counter()
    async with AsyncWebCrawler(config=browser_config) as crawler:
        rate_limiter = RateLimiter(
            base_delay=(1.0, 2.0), max_delay=30.0, max_retries=2
        )
        monitor = CrawlerMonitor(
            max_visible_rows=15, display_mode=DisplayMode.DETAILED
        )
        dispatcher = MemoryAdaptiveDispatcher(
            memory_threshold_percent=95.0,
            max_session_permit=10,
            rate_limiter=rate_limiter,
            monitor=monitor,
        )
        results = await crawler.arun_many(
            urls, config=run_config, dispatcher=dispatcher
        )
    elapsed = time.perf_counter() - started_at
    return len(results), elapsed
|
||
|
||
|
||
async def semaphore(urls, browser_config, run_config):
    """Crawl ``urls`` with a monitored ``SemaphoreDispatcher``.

    Returns:
        A ``(result_count, elapsed_seconds)`` tuple, timed with
        ``time.perf_counter``.
    """
    started_at = time.perf_counter()
    async with AsyncWebCrawler(config=browser_config) as crawler:
        monitor = CrawlerMonitor(
            max_visible_rows=15, display_mode=DisplayMode.DETAILED
        )
        dispatcher = SemaphoreDispatcher(
            semaphore_count=5,
            monitor=monitor,
        )
        results = await crawler.arun_many(
            urls, config=run_config, dispatcher=dispatcher
        )
    elapsed = time.perf_counter() - started_at
    return len(results), elapsed
|
||
|
||
|
||
async def semaphore_with_rate_limit(urls, browser_config, run_config):
    """Crawl ``urls`` with a rate-limited, monitored ``SemaphoreDispatcher``.

    Returns:
        A ``(result_count, elapsed_seconds)`` tuple, timed with
        ``time.perf_counter``.
    """
    started_at = time.perf_counter()
    async with AsyncWebCrawler(config=browser_config) as crawler:
        rate_limiter = RateLimiter(
            base_delay=(1.0, 2.0), max_delay=30.0, max_retries=2
        )
        monitor = CrawlerMonitor(
            max_visible_rows=15, display_mode=DisplayMode.DETAILED
        )
        dispatcher = SemaphoreDispatcher(
            semaphore_count=5,
            rate_limiter=rate_limiter,
            monitor=monitor,
        )
        results = await crawler.arun_many(
            urls, config=run_config, dispatcher=dispatcher
        )
    elapsed = time.perf_counter() - started_at
    return len(results), elapsed
|
||
|
||
|
||
def create_performance_table(results):
    """Create a rich table comparing crawl strategies.

    Args:
        results: Mapping of strategy name to a ``(urls_crawled, duration)``
            pair, with ``duration`` in seconds.

    Returns:
        A ``rich.table.Table`` with one row per strategy, sorted by ascending
        duration, including the derived URLs-per-second rate.
    """
    table = Table(title="Crawler Strategy Performance Comparison")
    table.add_column("Strategy", style="cyan")
    table.add_column("URLs Crawled", justify="right", style="green")
    table.add_column("Time (seconds)", justify="right", style="yellow")
    table.add_column("URLs/second", justify="right", style="magenta")

    # Fastest strategy first.
    sorted_results = sorted(results.items(), key=lambda item: item[1][1])

    for strategy, (urls_crawled, duration) in sorted_results:
        # Guard against a zero duration so a degenerate run can't crash the report.
        urls_per_second = urls_crawled / duration if duration else 0.0
        table.add_row(
            strategy, str(urls_crawled), f"{duration:.2f}", f"{urls_per_second:.2f}"
        )

    return table
|
||
|
||
|
||
async def main():
    """Benchmark the enabled dispatcher strategies and print a summary table."""
    urls = [f"https://example.com/page{i}" for i in range(1, 40)]
    browser_config = BrowserConfig(headless=True, verbose=False)
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        scraping_strategy=LXMLWebScrapingStrategy(),
    )

    # Only the memory-adaptive strategy is enabled; the other variants
    # (memory_adaptive_with_rate_limit, semaphore, semaphore_with_rate_limit)
    # can be added to this mapping in the same way.
    results = {}
    results["Memory Adaptive"] = await memory_adaptive(urls, browser_config, run_config)

    table = create_performance_table(results)
    print("\nPerformance Summary:")
    print(table)
|
||
|
||
|
||
if __name__ == "__main__":
|
||
asyncio.run(main())
|
||
|
||
```
|
||
|
||
|
||
## File: docs/examples/hello_world.py
|
||
|
||
```py
|
||
import asyncio
|
||
from crawl4ai import (
|
||
AsyncWebCrawler,
|
||
BrowserConfig,
|
||
CrawlerRunConfig,
|
||
CacheMode,
|
||
DefaultMarkdownGenerator,
|
||
PruningContentFilter,
|
||
CrawlResult
|
||
)
|
||
|
||
async def example_cdp():
    """Run a JS snippet through a browser reached via a CDP URL and print its result."""
    browser_conf = BrowserConfig(
        headless=False,
        cdp_url="http://localhost:9223"
    )
    hello_script = """(() => { return {"result": "Hello World!"} })()"""
    crawler_config = CrawlerRunConfig(
        session_id="test",
        js_code=hello_script,
        js_only=True
    )
    async with AsyncWebCrawler(config=browser_conf, verbose=True) as crawler:
        result : CrawlResult = await crawler.arun(
            url="https://www.helloworld.org",
            config=crawler_config,
        )
        print(result.js_execution_result)
|
||
|
||
|
||
async def main():
    """Crawl helloworld.org with a pruning content filter and print 500 chars of markdown."""
    browser_config = BrowserConfig(headless=True, verbose=True)
    async with AsyncWebCrawler(config=browser_config) as crawler:
        pruning_filter = PruningContentFilter(
            threshold=0.48, threshold_type="fixed", min_word_threshold=0
        )
        crawler_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(content_filter=pruning_filter),
        )
        result : CrawlResult = await crawler.arun(
            url="https://www.helloworld.org", config=crawler_config
        )
        preview = result.markdown.raw_markdown[:500]
        print(preview)
|
||
|
||
if __name__ == "__main__":
|
||
asyncio.run(main())
|
||
|
||
```
|
||
|
||
|
||
## File: docs/examples/hooks_example.py
|
||
|
||
```py
|
||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||
from playwright.async_api import Page, BrowserContext
|
||
|
||
|
||
async def main():
    """Crawl https://example.com once with every crawler hook registered.

    Each hook only prints a marker (and, for a few, performs a small page or
    context tweak) so the order in which the crawler strategy invokes them can
    be observed on stdout.
    """
    print("🔗 Hooks Example: Demonstrating different hook use cases")

    # Configure browser settings
    browser_config = BrowserConfig(headless=True)

    # Configure crawler settings
    crawler_run_config = CrawlerRunConfig(
        js_code="window.scrollTo(0, document.body.scrollHeight);",
        wait_for="body",
        cache_mode=CacheMode.BYPASS,
    )

    # Create crawler instance
    crawler = AsyncWebCrawler(config=browser_config)

    # Define and set hook functions
    async def on_browser_created(browser, context: BrowserContext, **kwargs):
        """Hook called after the browser is created"""
        print("[HOOK] on_browser_created - Browser is ready!")
        # Nothing is customized here; the browser is returned unchanged.
        return browser

    async def on_page_context_created(page: Page, context: BrowserContext, **kwargs):
        """Hook called after a new page and context are created"""
        print("[HOOK] on_page_context_created - New page created!")
        # Example: Set a cookie on the browser context
        await context.add_cookies(
            [
                {
                    "name": "session_id",
                    "value": "example_session",
                    "domain": ".example.com",
                    "path": "/",
                }
            ]
        )
        # Example: Set default viewport size
        await page.set_viewport_size({"width": 1080, "height": 800})
        return page

    async def on_user_agent_updated(
        page: Page, context: BrowserContext, user_agent: str, **kwargs
    ):
        """Hook called when the user agent is updated"""
        print(f"[HOOK] on_user_agent_updated - New user agent: {user_agent}")
        return page

    async def on_execution_started(page: Page, context: BrowserContext, **kwargs):
        """Hook called after custom JavaScript execution"""
        print("[HOOK] on_execution_started - Custom JS executed!")
        return page

    async def before_goto(page: Page, context: BrowserContext, url: str, **kwargs):
        """Hook called before navigating to each URL"""
        print(f"[HOOK] before_goto - About to visit: {url}")
        # Example: Add custom headers for the request
        await page.set_extra_http_headers({"Custom-Header": "my-value"})
        return page

    async def after_goto(
        page: Page, context: BrowserContext, url: str, response: dict, **kwargs
    ):
        """Hook called after navigating to each URL"""
        print(f"[HOOK] after_goto - Successfully loaded: {url}")
        # Example: Wait for a specific element to be loaded
        try:
            await page.wait_for_selector(".content", timeout=1000)
            print("Content element found!")
        except Exception:
            # Best effort: a missing element must not abort the crawl, but
            # KeyboardInterrupt / task cancellation must still propagate.
            print("Content element not found, continuing anyway")
        return page

    async def before_retrieve_html(page: Page, context: BrowserContext, **kwargs):
        """Hook called before retrieving the HTML content"""
        print("[HOOK] before_retrieve_html - About to get HTML content")
        # Example: Scroll to bottom to trigger lazy loading
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
        return page

    async def before_return_html(
        page: Page, context: BrowserContext, html: str, **kwargs
    ):
        """Hook called before returning the HTML content"""
        print(f"[HOOK] before_return_html - Got HTML content (length: {len(html)})")
        # Example: You could modify the HTML content here if needed
        return page

    # Set all the hooks
    hooks = {
        "on_browser_created": on_browser_created,
        "on_page_context_created": on_page_context_created,
        "on_user_agent_updated": on_user_agent_updated,
        "on_execution_started": on_execution_started,
        "before_goto": before_goto,
        "after_goto": after_goto,
        "before_retrieve_html": before_retrieve_html,
        "before_return_html": before_return_html,
    }
    for hook_name, hook in hooks.items():
        crawler.crawler_strategy.set_hook(hook_name, hook)

    await crawler.start()
    try:
        # Example usage: crawl a simple website
        url = "https://example.com"
        result = await crawler.arun(url, config=crawler_run_config)
        print(f"\nCrawled URL: {result.url}")
        print(f"HTML length: {len(result.html)}")
    finally:
        # Always release the browser, even when the crawl raises.
        await crawler.close()


if __name__ == "__main__":
    import asyncio

    asyncio.run(main())
|
||
|
||
```
|
||
|
||
|
||
|
||
## File: crawl4ai/deep_crawling/__init__.py
|
||
|
||
```py
|
||
# deep_crawling/__init__.py
|
||
from .base_strategy import DeepCrawlDecorator, DeepCrawlStrategy
|
||
from .bfs_strategy import BFSDeepCrawlStrategy
|
||
from .bff_strategy import BestFirstCrawlingStrategy
|
||
from .dfs_strategy import DFSDeepCrawlStrategy
|
||
from .filters import (
|
||
FilterChain,
|
||
ContentTypeFilter,
|
||
DomainFilter,
|
||
URLFilter,
|
||
URLPatternFilter,
|
||
FilterStats,
|
||
ContentRelevanceFilter,
|
||
SEOFilter
|
||
)
|
||
from .scorers import (
|
||
KeywordRelevanceScorer,
|
||
URLScorer,
|
||
CompositeScorer,
|
||
DomainAuthorityScorer,
|
||
FreshnessScorer,
|
||
PathDepthScorer,
|
||
ContentTypeScorer
|
||
)
|
||
|
||
__all__ = [
|
||
"DeepCrawlDecorator",
|
||
"DeepCrawlStrategy",
|
||
"BFSDeepCrawlStrategy",
|
||
"BestFirstCrawlingStrategy",
|
||
"DFSDeepCrawlStrategy",
|
||
"FilterChain",
|
||
"ContentTypeFilter",
|
||
"DomainFilter",
|
||
"URLFilter",
|
||
"URLPatternFilter",
|
||
"FilterStats",
|
||
"ContentRelevanceFilter",
|
||
"SEOFilter",
|
||
"KeywordRelevanceScorer",
|
||
"URLScorer",
|
||
"CompositeScorer",
|
||
"DomainAuthorityScorer",
|
||
"FreshnessScorer",
|
||
"PathDepthScorer",
|
||
"ContentTypeScorer",
|
||
]
|
||
|
||
```
|
||
|
||
|
||
## File: crawl4ai/deep_crawling/base_strategy.py
|
||
|
||
```py
|
||
from __future__ import annotations
|
||
|
||
from abc import ABC, abstractmethod
|
||
from typing import AsyncGenerator, Optional, Set, List, Dict
|
||
from functools import wraps
|
||
from contextvars import ContextVar
|
||
from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult, RunManyReturn
|
||
|
||
|
||
class DeepCrawlDecorator:
    """Decorator that adds deep crawling capability to arun method."""

    # True while a deep crawl started by this decorator is in progress in the
    # current context; used to stop the strategy's own crawl calls from
    # re-entering the deep-crawl path.
    deep_crawl_active = ContextVar("deep_crawl_active", default=False)

    def __init__(self, crawler: AsyncWebCrawler):
        self.crawler = crawler

    def __call__(self, original_arun):
        """Wrap ``original_arun`` so a configured deep-crawl strategy takes over.

        The wrapper delegates to ``config.deep_crawl_strategy.arun`` when a
        strategy is configured and no deep crawl is already active; otherwise
        it awaits ``original_arun`` unchanged.
        """

        @wraps(original_arun)
        async def wrapped_arun(url: str, config: CrawlerRunConfig = None, **kwargs):
            wants_deep_crawl = bool(config and config.deep_crawl_strategy)
            # If deep crawling is already active, call the original method to avoid recursion.
            if not wants_deep_crawl or self.deep_crawl_active.get():
                return await original_arun(url, config=config, **kwargs)

            token = self.deep_crawl_active.set(True)
            try:
                # Await the arun call to get the actual result object.
                result_obj = await config.deep_crawl_strategy.arun(
                    crawler=self.crawler,
                    start_url=url,
                    config=config,
                )
            except BaseException:
                # Without this reset a failing strategy would leave the flag
                # set, silently disabling deep crawling for later calls.
                self.deep_crawl_active.reset(token)
                raise

            if not config.stream:
                self.deep_crawl_active.reset(token)
                return result_obj

            async def result_wrapper():
                # In stream mode the flag stays set until the consumer has
                # finished (or closed) the generator.
                try:
                    async for result in result_obj:
                        yield result
                finally:
                    self.deep_crawl_active.reset(token)

            return result_wrapper()

        return wrapped_arun
|
||
|
||
class DeepCrawlStrategy(ABC):
    """
    Abstract base class for deep crawling strategies.

    Subclasses implement:
        - _arun_batch / _arun_stream: the traversal in batch and stream mode.
        - shutdown: clean up resources.
        - can_process_url: validate a URL and decide whether to process it.
        - link_discovery: extract and process links from a CrawlResult.

    ``arun`` is the concrete entry point; it picks the batch or stream
    implementation from ``config.stream``.
    """

    @abstractmethod
    async def _arun_batch(
        self,
        start_url: str,
        crawler: AsyncWebCrawler,
        config: CrawlerRunConfig,
    ) -> List[CrawlResult]:
        """
        Batch (non-streaming) mode: run the traversal and return all results
        as a list.
        """
        pass

    @abstractmethod
    async def _arun_stream(
        self,
        start_url: str,
        crawler: AsyncWebCrawler,
        config: CrawlerRunConfig,
    ) -> AsyncGenerator[CrawlResult, None]:
        """
        Streaming mode: run the traversal and yield results as they arrive.
        """
        pass

    async def arun(
        self,
        start_url: str,
        crawler: AsyncWebCrawler,
        config: Optional[CrawlerRunConfig] = None,
    ) -> RunManyReturn:
        """
        Traverse the given URL using the specified crawler.

        Args:
            start_url (str): The URL from which to start crawling.
            crawler (AsyncWebCrawler): The crawler instance to use.
            config (Optional[CrawlerRunConfig]): Crawler configuration; required
                despite the ``None`` default.

        Returns:
            The value awaited from ``_arun_batch`` when ``config.stream`` is
            falsy, otherwise the object returned by ``_arun_stream`` (not
            awaited or iterated here).

        Raises:
            ValueError: If ``config`` is None.
        """
        if config is None:
            raise ValueError("CrawlerRunConfig must be provided")

        if not config.stream:
            return await self._arun_batch(start_url, crawler, config)
        return self._arun_stream(start_url, crawler, config)

    def __call__(self, start_url: str, crawler: AsyncWebCrawler, config: CrawlerRunConfig):
        """Shorthand for ``arun``; returns the un-awaited coroutine."""
        return self.arun(start_url, crawler, config)

    @abstractmethod
    async def shutdown(self) -> None:
        """
        Clean up resources used by the deep crawl strategy.
        """
        pass

    @abstractmethod
    async def can_process_url(self, url: str, depth: int) -> bool:
        """
        Validate the URL format and apply custom filtering logic.

        Args:
            url (str): The URL to validate.
            depth (int): The current depth in the crawl.

        Returns:
            bool: True if the URL should be processed, False otherwise.
        """
        pass

    @abstractmethod
    async def link_discovery(
        self,
        result: CrawlResult,
        source_url: str,
        current_depth: int,
        visited: Set[str],
        next_level: List[tuple],
        depths: Dict[str, int],
    ) -> None:
        """
        Extract and process links from the given crawl result.

        Implementations should:
            - Validate each extracted URL using can_process_url.
            - Optionally score URLs.
            - Append valid URLs (and their parent references) to next_level.
            - Update the depths dictionary with the new depth for each URL.

        Args:
            result (CrawlResult): The result from a crawl operation.
            source_url (str): The URL from which this result was obtained.
            current_depth (int): The depth at which the source URL was processed.
            visited (Set[str]): Set of already visited URLs.
            next_level (List[tuple]): List of tuples (url, parent_url) to crawl next.
            depths (Dict[str, int]): Mapping of URLs to their current depth.
        """
        pass
|
||
|
||
|
||
```
|
||
|
||
|
||
## File: crawl4ai/deep_crawling/bff_strategy.py
|
||
|
||
```py
|
||
# best_first_crawling_strategy.py
|
||
import asyncio
|
||
import logging
|
||
from datetime import datetime
|
||
from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple
|
||
from urllib.parse import urlparse
|
||
|
||
from ..models import TraversalStats
|
||
from .filters import FilterChain
|
||
from .scorers import URLScorer
|
||
from . import DeepCrawlStrategy
|
||
|
||
from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult, RunManyReturn
|
||
|
||
from math import inf as infinity
|
||
|
||
# Configurable batch size for processing items from the priority queue
|
||
BATCH_SIZE = 10
|
||
|
||
|
||
class BestFirstCrawlingStrategy(DeepCrawlStrategy):
|
||
"""
|
||
Best-First Crawling Strategy using a priority queue.
|
||
|
||
This strategy prioritizes URLs based on their score, ensuring that higher-value
|
||
pages are crawled first. It reimplements the core traversal loop to use a priority
|
||
queue while keeping URL validation and link discovery consistent with our design.
|
||
|
||
Core methods:
|
||
- arun: Returns either a list (batch mode) or an async generator (stream mode).
|
||
- _arun_best_first: Core generator that uses a priority queue to yield CrawlResults.
|
||
- can_process_url: Validates URLs and applies filtering (inherited behavior).
|
||
- link_discovery: Extracts and validates links from a CrawlResult.
|
||
"""
|
||
def __init__(
    self,
    max_depth: int,
    filter_chain: Optional[FilterChain] = None,
    url_scorer: Optional[URLScorer] = None,
    include_external: bool = False,
    max_pages: int = infinity,
    logger: Optional[logging.Logger] = None,
):
    """Configure a best-first deep crawl.

    Args:
        max_depth: Links deeper than this are not followed.
        filter_chain: Filters applied to discovered URLs. When omitted, a
            new empty ``FilterChain`` is created for this instance (a
            default built in the signature would be shared, together with
            its statistics, by every strategy object).
        url_scorer: Optional scorer used to prioritize discovered URLs.
        include_external: Also follow links listed as external.
        max_pages: Upper bound on successfully crawled pages; unlimited by
            default.
        logger: Logger to use; defaults to this module's logger.
    """
    self.max_depth = max_depth
    self.filter_chain = FilterChain() if filter_chain is None else filter_chain
    self.url_scorer = url_scorer
    self.include_external = include_external
    self.max_pages = max_pages
    self.logger = logger or logging.getLogger(__name__)
    self.stats = TraversalStats(start_time=datetime.now())
    self._cancel_event = asyncio.Event()
    self._pages_crawled = 0
|
||
|
||
async def can_process_url(self, url: str, depth: int) -> bool:
    """
    Check that the URL is a well-formed http(s) URL and passes the filters.

    A URL is rejected (with a logged warning) when it cannot be parsed, has
    no scheme or netloc, uses a scheme other than http/https, or has a
    netloc without a dot. The filter chain is skipped for depth 0.
    """
    problem = None
    try:
        parts = urlparse(url)
    except Exception as exc:
        problem = exc
    else:
        if not (parts.scheme and parts.netloc):
            problem = "Missing scheme or netloc"
        elif parts.scheme not in ("http", "https"):
            problem = "Invalid scheme"
        elif "." not in parts.netloc:
            problem = "Invalid domain"

    if problem is not None:
        self.logger.warning(f"Invalid URL: {url}, error: {problem}")
        return False

    # The starting URL is never filtered.
    if depth == 0:
        return True
    return bool(await self.filter_chain.apply(url))
|
||
|
||
async def link_discovery(
    self,
    result: CrawlResult,
    source_url: str,
    current_depth: int,
    visited: Set[str],
    next_links: List[Tuple[str, Optional[str]]],
    depths: Dict[str, int],
) -> None:
    """
    Extract links from the crawl result, validate them, and append new URLs
    (with their parent references) to next_links.
    Also updates the depths dictionary.

    Nothing is discovered when the next depth would exceed ``max_depth`` or
    when the ``max_pages`` budget is already used up. URLs that are already
    visited, already recorded in ``depths``, or repeated within this page
    are skipped so they neither get queued twice nor consume capacity.
    """
    new_depth = current_depth + 1
    if new_depth > self.max_depth:
        return

    # If we've reached the max pages limit, don't discover new links
    remaining_capacity = self.max_pages - self._pages_crawled
    if remaining_capacity <= 0:
        self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping link discovery")
        return

    # Work on a copy: extending the list stored in result.links would
    # permanently merge the external links into the caller's "internal" list.
    links = list(result.links.get("internal", []))
    if self.include_external:
        links.extend(result.links.get("external", []))

    valid_links = []
    seen_here = set()
    for link in links:
        url = link.get("href")
        if url in visited or url in depths or url in seen_here:
            continue
        if not await self.can_process_url(url, new_depth):
            self.stats.urls_skipped += 1
            continue
        seen_here.add(url)
        valid_links.append(url)

    # If we have more valid links than capacity, limit them
    if len(valid_links) > remaining_capacity:
        valid_links = valid_links[:remaining_capacity]
        self.logger.info(f"Limiting to {remaining_capacity} URLs due to max_pages limit")

    # Record the new depths and add to next_links
    for url in valid_links:
        depths[url] = new_depth
        next_links.append((url, source_url))
|
||
|
||
async def _arun_best_first(
    self,
    start_url: str,
    crawler: AsyncWebCrawler,
    config: CrawlerRunConfig,
) -> AsyncGenerator[CrawlResult, None]:
    """
    Core best-first crawl method using a priority queue.

    The queue items are tuples of (score, depth, url, parent_url). Lower scores
    are treated as higher priority. URLs are processed in batches of at most
    BATCH_SIZE, and never more than the number of pages still allowed by
    ``max_pages``, so a batch cannot overshoot the page budget.

    Each yielded result carries ``depth``, ``parent_url`` and ``score`` in
    its metadata. Only successful results count toward ``max_pages`` and
    only they are searched for further links.
    """
    queue: asyncio.PriorityQueue = asyncio.PriorityQueue()
    # Push the initial URL with score 0 and depth 0.
    await queue.put((0, 0, start_url, None))
    visited: Set[str] = set()
    depths: Dict[str, int] = {start_url: 0}

    while not queue.empty() and not self._cancel_event.is_set():
        # Stop if we've reached the max pages limit
        remaining = self.max_pages - self._pages_crawled
        if remaining <= 0:
            self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping crawl")
            break

        # Never dispatch more URLs than the page budget still allows
        # (remaining may be infinite, hence the comparison before int()).
        batch_limit = BATCH_SIZE if remaining >= BATCH_SIZE else max(1, int(remaining))

        batch: List[Tuple[float, int, str, Optional[str]]] = []
        # Retrieve up to batch_limit items from the priority queue.
        for _ in range(batch_limit):
            if queue.empty():
                break
            item = await queue.get()
            queued_url = item[2]
            if queued_url in visited:
                continue
            visited.add(queued_url)
            batch.append(item)

        if not batch:
            continue

        # URLs within a batch are unique (guarded by `visited`), so a dict
        # gives constant-time lookup of the queue entry for each result.
        pending = {item[2]: item for item in batch}

        # Process the current batch of URLs.
        batch_config = config.clone(deep_crawl_strategy=None, stream=True)
        stream_gen = await crawler.arun_many(urls=list(pending), config=batch_config)
        async for result in stream_gen:
            result_url = result.url
            # Find the corresponding tuple from the batch; results whose URL
            # does not match a dispatched URL are dropped.
            corresponding = pending.get(result_url)
            if not corresponding:
                continue
            score, depth, url, parent_url = corresponding
            result.metadata = result.metadata or {}
            result.metadata["depth"] = depth
            result.metadata["parent_url"] = parent_url
            result.metadata["score"] = score

            # Count only successful crawls toward max_pages limit
            if result.success:
                self._pages_crawled += 1

            yield result

            # Only discover links from successful crawls
            if result.success:
                # Discover new links from this result
                new_links: List[Tuple[str, Optional[str]]] = []
                await self.link_discovery(result, result_url, depth, visited, new_links, depths)

                for new_url, new_parent in new_links:
                    new_depth = depths.get(new_url, depth + 1)
                    new_score = self.url_scorer.score(new_url) if self.url_scorer else 0
                    await queue.put((new_score, new_depth, new_url, new_parent))

    # End of crawl.
|
||
|
||
async def _arun_batch(
|
||
self,
|
||
start_url: str,
|
||
crawler: AsyncWebCrawler,
|
||
config: CrawlerRunConfig,
|
||
) -> List[CrawlResult]:
|
||
"""
|
||
Best-first crawl in batch mode.
|
||
|
||
Aggregates all CrawlResults into a list.
|
||
"""
|
||
results: List[CrawlResult] = []
|
||
async for result in self._arun_best_first(start_url, crawler, config):
|
||
results.append(result)
|
||
return results
|
||
|
||
async def _arun_stream(
|
||
self,
|
||
start_url: str,
|
||
crawler: AsyncWebCrawler,
|
||
config: CrawlerRunConfig,
|
||
) -> AsyncGenerator[CrawlResult, None]:
|
||
"""
|
||
Best-first crawl in streaming mode.
|
||
|
||
Yields CrawlResults as they become available.
|
||
"""
|
||
async for result in self._arun_best_first(start_url, crawler, config):
|
||
yield result
|
||
|
||
async def arun(
    self,
    start_url: str,
    crawler: AsyncWebCrawler,
    config: Optional[CrawlerRunConfig] = None,
) -> "RunManyReturn":
    """
    Main entry point for best-first crawling.

    Returns the awaited list from ``_arun_batch`` when ``config.stream`` is
    falsy, otherwise the async generator created by ``_arun_stream``.

    Raises:
        ValueError: If ``config`` is None.
    """
    if config is None:
        raise ValueError("CrawlerRunConfig must be provided")

    if not config.stream:
        return await self._arun_batch(start_url, crawler, config)
    return self._arun_stream(start_url, crawler, config)
|
||
|
||
async def shutdown(self) -> None:
    """
    Request cancellation of the crawl loop and stamp the end time.

    Sets the cancel event that the traversal loop checks, then records the
    current time as ``stats.end_time``.
    """
    cancel_event = self._cancel_event
    cancel_event.set()
    finished_at = datetime.now()
    self.stats.end_time = finished_at
|
||
|
||
```
|
||
|
||
|
||
## File: crawl4ai/deep_crawling/bfs_strategy.py
|
||
|
||
```py
|
||
# bfs_deep_crawl_strategy.py
|
||
import asyncio
|
||
import logging
|
||
from datetime import datetime
|
||
from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple
|
||
from urllib.parse import urlparse
|
||
|
||
from ..models import TraversalStats
|
||
from .filters import FilterChain
|
||
from .scorers import URLScorer
|
||
from . import DeepCrawlStrategy
|
||
from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult
|
||
from ..utils import normalize_url_for_deep_crawl, efficient_normalize_url_for_deep_crawl
|
||
from math import inf as infinity
|
||
|
||
class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
||
"""
|
||
Breadth-First Search deep crawling strategy.
|
||
|
||
Core functions:
|
||
- arun: Main entry point; splits execution into batch or stream modes.
|
||
- link_discovery: Extracts, filters, and (if needed) scores the outgoing URLs.
|
||
- can_process_url: Validates URL format and applies the filter chain.
|
||
"""
|
||
def __init__(
    self,
    max_depth: int,
    filter_chain: Optional[FilterChain] = None,
    url_scorer: Optional[URLScorer] = None,
    include_external: bool = False,
    score_threshold: float = -infinity,
    max_pages: int = infinity,
    logger: Optional[logging.Logger] = None,
):
    """Configure a breadth-first deep crawl.

    Args:
        max_depth: Links deeper than this are not followed.
        filter_chain: Filters applied to discovered URLs. When omitted, a
            new empty ``FilterChain`` is created for this instance (a
            default built in the signature would be shared, together with
            its statistics, by every strategy object).
        url_scorer: Optional scorer for discovered URLs.
        include_external: Also follow links listed as external.
        score_threshold: URLs scoring below this value are skipped; by
            default nothing is skipped on score.
        max_pages: Upper bound on successfully crawled pages; unlimited by
            default.
        logger: Logger to use; defaults to this module's logger.
    """
    self.max_depth = max_depth
    self.filter_chain = FilterChain() if filter_chain is None else filter_chain
    self.url_scorer = url_scorer
    self.include_external = include_external
    self.score_threshold = score_threshold
    self.max_pages = max_pages
    self.logger = logger or logging.getLogger(__name__)
    self.stats = TraversalStats(start_time=datetime.now())
    self._cancel_event = asyncio.Event()
    self._pages_crawled = 0
|
||
|
||
async def can_process_url(self, url: str, depth: int) -> bool:
    """
    Decide whether a URL is crawlable: well-formed http(s) and not filtered.

    Parsing failures, a missing scheme or netloc, a non-http(s) scheme, and
    a netloc without a dot all log a warning and return False. The filter
    chain is only consulted for depths other than 0 (the start URL).
    """
    reason = None
    try:
        pieces = urlparse(url)
    except Exception as exc:
        reason = exc
    else:
        if not (pieces.scheme and pieces.netloc):
            reason = "Missing scheme or netloc"
        elif pieces.scheme not in ("http", "https"):
            reason = "Invalid scheme"
        elif "." not in pieces.netloc:
            reason = "Invalid domain"

    if reason is not None:
        self.logger.warning(f"Invalid URL: {url}, error: {reason}")
        return False

    if depth == 0:
        return True
    return bool(await self.filter_chain.apply(url))
|
||
|
||
async def link_discovery(
    self,
    result: CrawlResult,
    source_url: str,
    current_depth: int,
    visited: Set[str],
    next_level: List[Tuple[str, Optional[str]]],
    depths: Dict[str, int],
) -> None:
    """
    Extracts links from the crawl result, validates and scores them, and
    prepares the next level of URLs.
    Each valid URL is appended to next_level as a tuple (url, parent_url)
    and its depth is tracked.

    Nothing is discovered when the next depth would exceed ``max_depth`` or
    when the ``max_pages`` budget is already used up. A normalized URL that
    is already visited, already recorded in ``depths`` (i.e. scheduled), or
    repeated within this page is skipped so it is not crawled twice.
    """
    next_depth = current_depth + 1
    if next_depth > self.max_depth:
        return

    # If we've reached the max pages limit, don't discover new links
    remaining_capacity = self.max_pages - self._pages_crawled
    if remaining_capacity <= 0:
        self.logger.info(f"Max pages limit ({self.max_pages}) reached, stopping link discovery")
        return

    # Work on a copy: extending the list stored in result.links would
    # permanently merge the external links into the caller's "internal" list.
    links = list(result.links.get("internal", []))
    if self.include_external:
        links.extend(result.links.get("external", []))

    valid_links = []
    seen_here = set()

    # First collect all valid links
    for link in links:
        url = link.get("href")
        # Normalize (e.g. to avoid crawling the same page under several spellings)
        base_url = normalize_url_for_deep_crawl(url, source_url)
        if base_url in visited or base_url in depths or base_url in seen_here:
            continue
        # NOTE(review): validation and filtering use the raw href while
        # scoring and scheduling use the normalized URL — confirm intended.
        if not await self.can_process_url(url, next_depth):
            self.stats.urls_skipped += 1
            continue

        # Score the URL if a scorer is provided
        score = self.url_scorer.score(base_url) if self.url_scorer else 0

        # Skip URLs with scores below the threshold
        if score < self.score_threshold:
            self.logger.debug(f"URL {url} skipped: score {score} below threshold {self.score_threshold}")
            self.stats.urls_skipped += 1
            continue

        seen_here.add(base_url)
        valid_links.append((base_url, score))

    # If we have more valid links than capacity, sort by score and take the top ones
    if len(valid_links) > remaining_capacity:
        if self.url_scorer:
            # Sort by score in descending order
            valid_links.sort(key=lambda x: x[1], reverse=True)
        # Take only as many as we have capacity for
        valid_links = valid_links[:remaining_capacity]
        self.logger.info(f"Limiting to {remaining_capacity} URLs due to max_pages limit")

    # Process the final selected links
    for url, score in valid_links:
        # NOTE(review): a non-zero score is written onto the *source* page's
        # metadata (overwritten per link), not onto the discovered URL —
        # kept as in the original; confirm this is the intended target.
        if score:
            result.metadata = result.metadata or {}
            result.metadata["score"] = score
        next_level.append((url, source_url))
        depths[url] = next_depth
|
||
|
||
async def _arun_batch(
|
||
self,
|
||
start_url: str,
|
||
crawler: AsyncWebCrawler,
|
||
config: CrawlerRunConfig,
|
||
) -> List[CrawlResult]:
|
||
"""
|
||
Batch (non-streaming) mode:
|
||
Processes one BFS level at a time, then yields all the results.
|
||
"""
|
||
visited: Set[str] = set()
|
||
# current_level holds tuples: (url, parent_url)
|
||
current_level: List[Tuple[str, Optional[str]]] = [(start_url, None)]
|
||
depths: Dict[str, int] = {start_url: 0}
|
||
|
||
results: List[CrawlResult] = []
|
||
|
||
while current_level and not self._cancel_event.is_set():
|
||
next_level: List[Tuple[str, Optional[str]]] = []
|
||
urls = [url for url, _ in current_level]
|
||
visited.update(urls)
|
||
|
||
# Clone the config to disable deep crawling recursion and enforce batch mode.
|
||
batch_config = config.clone(deep_crawl_strategy=None, stream=False)
|
||
batch_results = await crawler.arun_many(urls=urls, config=batch_config)
|
||
|
||
# Update pages crawled counter - count only successful crawls
|
||
successful_results = [r for r in batch_results if r.success]
|
||
self._pages_crawled += len(successful_results)
|
||
|
||
for result in batch_results:
|
||
url = result.url
|
||
depth = depths.get(url, 0)
|
||
result.metadata = result.metadata or {}
|
||
result.metadata["depth"] = depth
|
||
parent_url = next((parent for (u, parent) in current_level if u == url), None)
|
||
result.metadata["parent_url"] = parent_url
|
||
results.append(result)
|
||
|
||
# Only discover links from successful crawls
|
||
if result.success:
|
||
# Link discovery will handle the max pages limit internally
|
||
await self.link_discovery(result, url, depth, visited, next_level, depths)
|
||
|
||
current_level = next_level
|
||
|
||
return results
|
||
|
||
async def _arun_stream(
|
||
self,
|
||
start_url: str,
|
||
crawler: AsyncWebCrawler,
|
||
config: CrawlerRunConfig,
|
||
) -> AsyncGenerator[CrawlResult, None]:
|
||
"""
|
||
Streaming mode:
|
||
Processes one BFS level at a time and yields results immediately as they arrive.
|
||
"""
|
||
visited: Set[str] = set()
|
||
current_level: List[Tuple[str, Optional[str]]] = [(start_url, None)]
|
||
depths: Dict[str, int] = {start_url: 0}
|
||
|
||
while current_level and not self._cancel_event.is_set():
|
||
next_level: List[Tuple[str, Optional[str]]] = []
|
||
urls = [url for url, _ in current_level]
|
||
visited.update(urls)
|
||
|
||
stream_config = config.clone(deep_crawl_strategy=None, stream=True)
|
||
stream_gen = await crawler.arun_many(urls=urls, config=stream_config)
|
||
|
||
# Keep track of processed results for this batch
|
||
results_count = 0
|
||
async for result in stream_gen:
|
||
url = result.url
|
||
depth = depths.get(url, 0)
|
||
result.metadata = result.metadata or {}
|
||
result.metadata["depth"] = depth
|
||
parent_url = next((parent for (u, parent) in current_level if u == url), None)
|
||
result.metadata["parent_url"] = parent_url
|
||
|
||
# Count only successful crawls
|
||
if result.success:
|
||
self._pages_crawled += 1
|
||
|
||
results_count += 1
|
||
yield result
|
||
|
||
# Only discover links from successful crawls
|
||
if result.success:
|
||
# Link discovery will handle the max pages limit internally
|
||
await self.link_discovery(result, url, depth, visited, next_level, depths)
|
||
|
||
# If we didn't get results back (e.g. due to errors), avoid getting stuck in an infinite loop
|
||
# by considering these URLs as visited but not counting them toward the max_pages limit
|
||
if results_count == 0 and urls:
|
||
self.logger.warning(f"No results returned for {len(urls)} URLs, marking as visited")
|
||
|
||
current_level = next_level
|
||
|
||
async def shutdown(self) -> None:
    """
    Signal cancellation of the crawl and record when it ended.

    Sets the cancel event checked by the level loops, then stores the
    current time in ``stats.end_time``.
    """
    stop_signal = self._cancel_event
    stop_signal.set()
    ended = datetime.now()
    self.stats.end_time = ended
|
||
|
||
```
|
||
|
||
|
||
## File: crawl4ai/deep_crawling/filters.py
|
||
|
||
```py
|
||
from abc import ABC, abstractmethod
|
||
from typing import List, Pattern, Set, Union
|
||
from urllib.parse import urlparse
|
||
from array import array
|
||
import re
|
||
import logging
|
||
from functools import lru_cache
|
||
import fnmatch
|
||
from dataclasses import dataclass
|
||
import weakref
|
||
import math
|
||
from collections import defaultdict
|
||
from typing import Dict
|
||
from ..utils import HeadPeekr
|
||
import asyncio
|
||
import inspect
|
||
|
||
|
||
@dataclass
class FilterStats:
    # Three counters held in one compact array of unsigned ints:
    # index 0 = total, 1 = passed, 2 = rejected.
    __slots__ = ("_counters",)

    def __init__(self):
        self._counters = array("I", (0, 0, 0))

    @property
    def total_urls(self):
        """Number of URLs counted so far (slot 0)."""
        counters = self._counters
        return counters[0]

    @property
    def passed_urls(self):
        """Number of URLs counted as passed (slot 1)."""
        counters = self._counters
        return counters[1]

    @property
    def rejected_urls(self):
        """Number of URLs counted as rejected (slot 2)."""
        counters = self._counters
        return counters[2]
|
||
|
||
|
||
class URLFilter(ABC):
    """Base class for URL filters: a name, pass/reject counters, lazy logger."""

    __slots__ = ("name", "stats", "_logger_ref")

    def __init__(self, name: str = None):
        # Fall back to the concrete class name when no name is given.
        self.name = name if name else self.__class__.__name__
        self.stats = FilterStats()
        # The logger is created on first access and held via a weak reference.
        self._logger_ref = None

    @property
    def logger(self):
        """Logger named ``urlfilter.<name>``, created on first use."""
        ref = self._logger_ref
        log = ref() if ref is not None else None
        if log is None:
            log = logging.getLogger(f"urlfilter.{self.name}")
            self._logger_ref = weakref.ref(log)
        return log

    @abstractmethod
    def apply(self, url: str) -> bool:
        """Return whether ``url`` passes this filter."""
        pass

    def _update_stats(self, passed: bool):
        """Count one processed URL as passed or rejected."""
        counters = self.stats._counters
        counters[0] += 1  # total
        counters[1] += passed  # passed (bool adds as 0/1)
        counters[2] += not passed  # rejected
|
||
|
||
|
||
class FilterChain:
    """Ordered collection of URL filters that must all accept a URL."""

    __slots__ = ("filters", "stats", "_logger_ref")

    def __init__(self, filters: List[URLFilter] = None):
        # Stored as a tuple; add_filter replaces it with a new tuple.
        self.filters = tuple(filters or [])
        self.stats = FilterStats()
        self._logger_ref = None

    @property
    def logger(self):
        """Logger named ``urlfilter.chain``, created on first use."""
        if self._logger_ref is None or self._logger_ref() is None:
            logger = logging.getLogger("urlfilter.chain")
            self._logger_ref = weakref.ref(logger)
        return self._logger_ref()

    def add_filter(self, filter_: URLFilter) -> "FilterChain":
        """Add a filter to the chain and return the chain for method chaining."""
        # self.filters is a tuple, which has no append(); build a new tuple.
        self.filters = self.filters + (filter_,)
        return self  # Enable method chaining

    async def apply(self, url: str) -> bool:
        """Return True only if every filter accepts ``url``.

        Each filter's ``apply`` is called in order. A falsy synchronous
        result rejects immediately; awaitable results are collected and
        awaited together afterwards. Statistics: slot 0 counts every URL,
        slot 1 accepted URLs, slot 2 rejections (one per synchronous
        rejection, one per awaited result equal to False).
        """
        self.stats._counters[0] += 1  # Total processed URLs

        pending = []
        for f in self.filters:
            outcome = f.apply(url)

            if inspect.isawaitable(outcome):
                pending.append(outcome)  # Collect async results
            elif not outcome:  # Sync rejection
                # Coroutines collected so far will never be awaited; close
                # them so they do not trigger "never awaited" warnings.
                for awaitable in pending:
                    if inspect.iscoroutine(awaitable):
                        awaitable.close()
                self.stats._counters[2] += 1  # Sync rejected
                return False

        if pending:
            results = await asyncio.gather(*pending)

            # Count how many filters rejected
            rejections = results.count(False)
            self.stats._counters[2] += rejections

            if not all(results):
                return False  # At least one async filter rejected

        self.stats._counters[1] += 1  # Passed
        return True
|
||
|
||
|
||
class URLPatternFilter(URLFilter):
|
||
"""Pattern filter balancing speed and completeness"""
|
||
|
||
__slots__ = (
|
||
"_simple_suffixes",
|
||
"_simple_prefixes",
|
||
"_domain_patterns",
|
||
"_path_patterns",
|
||
"_reverse",
|
||
)
|
||
|
||
PATTERN_TYPES = {
|
||
"SUFFIX": 1, # *.html
|
||
"PREFIX": 2, # /foo/*
|
||
"DOMAIN": 3, # *.example.com
|
||
"PATH": 4, # Everything else
|
||
"REGEX": 5,
|
||
}
|
||
|
||
def __init__(
    self,
    patterns: Union[str, Pattern, List[Union[str, Pattern]]],
    use_glob: bool = True,
    reverse: bool = False,
):
    """Sort the given patterns into the specialized matcher collections.

    Args:
        patterns: A single pattern (string or compiled regex) or a list of
            them.
        use_glob: Accepted but not read by this constructor.
        reverse: Stored as ``_reverse`` for use when matching.
    """
    super().__init__()
    self._reverse = reverse

    # Accept a bare pattern as a one-element list.
    if isinstance(patterns, (str, Pattern)):
        patterns = [patterns]

    self._simple_suffixes = set()
    self._simple_prefixes = set()
    self._domain_patterns = []
    self._path_patterns = []

    for entry in patterns:
        self._add_pattern(entry, self._categorize_pattern(entry))
|
||
|
||
def _categorize_pattern(self, pattern: str) -> int:
|
||
"""Categorize pattern for specialized handling"""
|
||
if not isinstance(pattern, str):
|
||
return self.PATTERN_TYPES["PATH"]
|
||
|
||
# Check if it's a regex pattern
|
||
if pattern.startswith("^") or pattern.endswith("$") or "\\d" in pattern:
|
||
return self.PATTERN_TYPES["REGEX"]
|
||
|
||
if pattern.count("*") == 1:
|
||
if pattern.startswith("*."):
|
||
return self.PATTERN_TYPES["SUFFIX"]
|
||
if pattern.endswith("/*"):
|
||
return self.PATTERN_TYPES["PREFIX"]
|
||
|
||
if "://" in pattern and pattern.startswith("*."):
|
||
return self.PATTERN_TYPES["DOMAIN"]
|
||
|
||
return self.PATTERN_TYPES["PATH"]
|
||
|
||
def _add_pattern(self, pattern: str, pattern_type: int):
|
||
"""Add pattern to appropriate matcher"""
|
||
if pattern_type == self.PATTERN_TYPES["REGEX"]:
|
||
# For regex patterns, compile directly without glob translation
|
||
if isinstance(pattern, str) and (
|
||
pattern.startswith("^") or pattern.endswith("$") or "\\d" in pattern
|
||
):
|
||
self._path_patterns.append(re.compile(pattern))
|
||
return
|
||
elif pattern_type == self.PATTERN_TYPES["SUFFIX"]:
|
||
self._simple_suffixes.add(pattern[2:])
|
||
elif pattern_type == self.PATTERN_TYPES["PREFIX"]:
|
||
self._simple_prefixes.add(pattern[:-2])
|
||
elif pattern_type == self.PATTERN_TYPES["DOMAIN"]:
|
||
self._domain_patterns.append(re.compile(pattern.replace("*.", r"[^/]+\.")))
|
||
else:
|
||
if isinstance(pattern, str):
|
||
# Handle complex glob patterns
|
||
if "**" in pattern:
|
||
pattern = pattern.replace("**", ".*")
|
||
if "{" in pattern:
|
||
# Convert {a,b} to (a|b)
|
||
pattern = re.sub(
|
||
r"\{([^}]+)\}",
|
||
lambda m: f'({"|".join(m.group(1).split(","))})',
|
||
pattern,
|
||
)
|
||
pattern = fnmatch.translate(pattern)
|
||
self._path_patterns.append(
|
||
pattern if isinstance(pattern, Pattern) else re.compile(pattern)
|
||
)
|
||
|
||
@lru_cache(maxsize=10000)
|
||
def apply(self, url: str) -> bool:
|
||
# Quick suffix check (*.html)
|
||
if self._simple_suffixes:
|
||
path = url.split("?")[0]
|
||
if path.split("/")[-1].split(".")[-1] in self._simple_suffixes:
|
||
result = True
|
||
self._update_stats(result)
|
||
return not result if self._reverse else result
|
||
|
||
# Domain check
|
||
if self._domain_patterns:
|
||
for pattern in self._domain_patterns:
|
||
if pattern.match(url):
|
||
result = True
|
||
self._update_stats(result)
|
||
return not result if self._reverse else result
|
||
|
||
# Prefix check (/foo/*)
|
||
if self._simple_prefixes:
|
||
path = url.split("?")[0]
|
||
if any(path.startswith(p) for p in self._simple_prefixes):
|
||
result = True
|
||
self._update_stats(result)
|
||
return not result if self._reverse else result
|
||
|
||
# Complex patterns
|
||
if self._path_patterns:
|
||
if any(p.search(url) for p in self._path_patterns):
|
||
result = True
|
||
self._update_stats(result)
|
||
return not result if self._reverse else result
|
||
|
||
result = False
|
||
self._update_stats(result)
|
||
return not result if self._reverse else result
|
||
|
||
|
||
class ContentTypeFilter(URLFilter):
    """Accept URLs whose file extension maps to an allowed MIME type.

    URLs without a file extension are always accepted. Matching between
    allowed types and MIME types is by substring, so ``"text"`` allows
    every ``text/*`` entry of the extension map.

    Args:
        allowed_types: One MIME type (or fragment) or a collection of them.
        check_extension: When false, every URL is accepted.
        ext_map: Mapping of extension -> MIME type used to derive the set
            of accepted extensions. Defaults to ``_MIME_MAP``.
    """

    __slots__ = ("allowed_types", "_ext_map", "_check_extension")

    # Fast extension to mime type mapping
    _MIME_MAP = {
        # Text Formats
        "txt": "text/plain",
        "html": "text/html",
        "htm": "text/html",
        "xhtml": "application/xhtml+xml",
        "css": "text/css",
        "csv": "text/csv",
        "ics": "text/calendar",
        "js": "application/javascript",
        # Images
        "bmp": "image/bmp",
        "gif": "image/gif",
        "jpeg": "image/jpeg",
        "jpg": "image/jpeg",
        "png": "image/png",
        "svg": "image/svg+xml",
        "tiff": "image/tiff",
        "ico": "image/x-icon",
        "webp": "image/webp",
        # Audio
        "mp3": "audio/mpeg",
        "wav": "audio/wav",
        "ogg": "audio/ogg",
        "m4a": "audio/mp4",
        "aac": "audio/aac",
        # Video
        "mp4": "video/mp4",
        "mpeg": "video/mpeg",
        "webm": "video/webm",
        "avi": "video/x-msvideo",
        "mov": "video/quicktime",
        "flv": "video/x-flv",
        "wmv": "video/x-ms-wmv",
        "mkv": "video/x-matroska",
        # Applications
        "json": "application/json",
        "xml": "application/xml",
        "pdf": "application/pdf",
        "zip": "application/zip",
        "gz": "application/gzip",
        "tar": "application/x-tar",
        "rar": "application/vnd.rar",
        "7z": "application/x-7z-compressed",
        "exe": "application/vnd.microsoft.portable-executable",
        "msi": "application/x-msdownload",
        # Fonts
        "woff": "font/woff",
        "woff2": "font/woff2",
        "ttf": "font/ttf",
        "otf": "font/otf",
        # Microsoft Office
        "doc": "application/msword",
        "dot": "application/msword",
        "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "xls": "application/vnd.ms-excel",
        "ppt": "application/vnd.ms-powerpoint",
        "pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        # OpenDocument Formats
        "odt": "application/vnd.oasis.opendocument.text",
        "ods": "application/vnd.oasis.opendocument.spreadsheet",
        "odp": "application/vnd.oasis.opendocument.presentation",
        # Archives
        "tar.gz": "application/gzip",
        "tgz": "application/gzip",
        "bz2": "application/x-bzip2",
        # Others
        "rtf": "application/rtf",
        "apk": "application/vnd.android.package-archive",
        "epub": "application/epub+zip",
        "jar": "application/java-archive",
        "swf": "application/x-shockwave-flash",
        "midi": "audio/midi",
        "mid": "audio/midi",
        "ps": "application/postscript",
        "ai": "application/postscript",
        "eps": "application/postscript",
        # Custom or less common
        "bin": "application/octet-stream",
        "dmg": "application/x-apple-diskimage",
        "iso": "application/x-iso9660-image",
        "deb": "application/x-debian-package",
        "rpm": "application/x-rpm",
        "sqlite": "application/vnd.sqlite3",
        # Placeholder
        "unknown": "application/octet-stream",  # Fallback for unknown file types
    }

    @staticmethod
    @lru_cache(maxsize=1000)
    def _extract_extension(url: str) -> str:
        """Return the lower-cased extension of the last path segment.

        Returns an empty string when the URL has no path or the last
        segment contains no dot. Only the text after the final dot is
        returned, so ``archive.tar.gz`` yields ``gz``.
        """
        # Drop fragment and query first so "page.html?x=1" yields "html"
        # rather than "html?x=1".
        url = url.split("#", 1)[0].split("?", 1)[0]

        # Remove scheme (http://, https://) if present
        if "://" in url:
            url = url.split("://", 1)[-1]

        # Remove domain (everything up to the first '/')
        path_start = url.find("/")
        if path_start == -1:
            return ""

        # Last filename in the path
        filename = url[path_start:].rsplit("/", 1)[-1]
        if "." not in filename:
            return ""

        return filename.rpartition(".")[-1].lower()

    def __init__(
        self,
        allowed_types: Union[str, List[str]],
        check_extension: bool = True,
        ext_map: Dict[str, str] = _MIME_MAP,
    ):
        super().__init__()
        # A bare string is one type; any other iterable is a collection.
        if isinstance(allowed_types, str):
            allowed_types = [allowed_types]
        # Normalize and store as frozenset for fast lookup
        self.allowed_types = frozenset(t.lower() for t in allowed_types)
        self._check_extension = check_extension

        # Pre-compute the accepted extensions from the supplied map
        # (previously the argument was ignored in favour of _MIME_MAP).
        mime_by_ext = self._MIME_MAP if ext_map is None else ext_map
        self._ext_map = frozenset(
            ext.lower()
            for ext, mime in mime_by_ext.items()
            if any(allowed in mime for allowed in self.allowed_types)
        )

    def _check_url_cached(self, url: str) -> bool:
        """Return True when *url* passes the extension check.

        The extension lookup itself is memoised by ``_extract_extension``;
        the remaining work is a single set membership test.
        """
        if not self._check_extension:
            return True
        ext = self._extract_extension(url)
        if not ext:
            return True  # no extension: nothing to reject on
        return ext in self._ext_map

    def apply(self, url: str) -> bool:
        """Check *url*, record the outcome in the stats and return it."""
        result = self._check_url_cached(url)
        self._update_stats(result)
        return result
|
||
|
||
|
||
class DomainFilter(URLFilter):
    """Allow/block URLs by host name, including subdomains.

    Blocked domains take precedence over allowed domains. When no allowed
    domains are configured, every non-blocked URL is accepted.

    Args:
        allowed_domains: One domain or a list of domains to accept.
        blocked_domains: One domain or a list of domains to reject.
    """

    __slots__ = ("_allowed_domains", "_blocked_domains", "_domain_cache")

    # Captures the host after "://", skipping optional "user:pass@" and
    # stopping before a port, path, query or fragment. Bracketed IPv6
    # literals are captured whole.
    _DOMAIN_REGEX = re.compile(r"://(?:[^/?#@]*@)?(\[[^\]]*\]|[^/?#:]+)")

    def __init__(
        self,
        allowed_domains: Union[str, List[str]] = None,
        blocked_domains: Union[str, List[str]] = None,
    ):
        super().__init__()

        # Convert inputs to frozensets for immutable, fast lookups.
        # None (not an empty set) means "no allow-list configured".
        self._allowed_domains = (
            frozenset(self._normalize_domains(allowed_domains))
            if allowed_domains
            else None
        )
        self._blocked_domains = (
            frozenset(self._normalize_domains(blocked_domains))
            if blocked_domains
            else frozenset()
        )

    @staticmethod
    def _normalize_domains(domains: Union[str, List[str]]) -> Set[str]:
        """Return the given domain(s) as a set of lower-cased strings."""
        if isinstance(domains, str):
            return {domains.lower()}
        return {d.lower() for d in domains}

    @staticmethod
    def _is_subdomain(domain: str, parent_domain: str) -> bool:
        """Return True if *domain* equals or is a subdomain of *parent_domain*."""
        return domain == parent_domain or domain.endswith(f".{parent_domain}")

    @staticmethod
    @lru_cache(maxsize=10000)
    def _extract_domain(url: str) -> str:
        """Return the lower-cased host of *url*, or "" when none is found.

        Userinfo, port, query and fragment are not part of the result, so
        ``https://example.com:8080/x`` and ``https://example.com?x=1`` both
        yield ``example.com``.
        """
        match = DomainFilter._DOMAIN_REGEX.search(url)
        return match.group(1).lower() if match else ""

    def apply(self, url: str) -> bool:
        """Return True when *url* passes the block-list and allow-list."""
        blocked = self._blocked_domains
        allowed = self._allowed_domains

        # Skip processing if no filters are configured
        if not blocked and allowed is None:
            self._update_stats(True)
            return True

        domain = self._extract_domain(url)
        is_sub = self._is_subdomain

        if any(is_sub(domain, entry) for entry in blocked):
            # Block-list wins, including for subdomains of a blocked entry
            decision = False
        elif allowed is None:
            # No allow-list: accept everything that is not blocked
            decision = True
        else:
            decision = any(is_sub(domain, entry) for entry in allowed)

        self._update_stats(decision)
        return decision
|
||
|
||
|
||
class ContentRelevanceFilter(URLFilter):
    """BM25-style relevance filter over a page's head section.

    ``apply`` fetches the head via ``HeadPeekr.peek_html``, builds a
    weighted text document from the title and meta tags, scores it against
    the query terms and accepts the URL when the score reaches the
    threshold.

    Args:
        query: Whitespace-separated query terms.
        threshold: Minimum score required to accept a URL.
        k1: Term-frequency saturation parameter.
        b: Length normalization parameter.
        avgdl: Average document length used for length normalization.
    """

    __slots__ = ("query_terms", "threshold", "k1", "b", "avgdl")

    def __init__(
        self,
        query: str,
        threshold: float,
        k1: float = 1.2,
        b: float = 0.75,
        avgdl: int = 1000,
    ):
        super().__init__(name="BM25RelevanceFilter")
        self.query_terms = self._tokenize(query)
        self.threshold = threshold
        self.k1 = k1  # TF saturation parameter
        self.b = b  # Length normalization parameter
        self.avgdl = avgdl  # Average document length (empirical value)

    async def apply(self, url: str) -> bool:
        """Return True when the head content of *url* scores >= threshold.

        URLs whose head content cannot be retrieved are rejected.
        """
        head_content = await HeadPeekr.peek_html(url)
        if not head_content:
            self._update_stats(False)
            return False

        # Field extraction with weighting
        fields = {
            "title": HeadPeekr.get_title(head_content) or "",
            "meta": HeadPeekr.extract_meta_tags(head_content),
        }
        score = self._bm25(self._build_document(fields))

        decision = score >= self.threshold
        self._update_stats(decision)
        return decision

    def _build_document(self, fields: Dict) -> str:
        """Build the weighted document text from title and meta fields.

        The title is repeated three times and the description twice to
        weight them. Repetitions are joined with spaces: plain string
        multiplication would fuse the last and first words of adjacent
        copies into a single token that no query term can match.
        """
        meta = fields["meta"]
        parts = [fields["title"]] * 3  # Title weight
        parts += [meta.get("description", "")] * 2
        parts.append(meta.get("keywords", ""))
        parts.extend(meta.values())
        return " ".join(parts)

    def _tokenize(self, text: str) -> List[str]:
        """Lower-case *text* and split it on whitespace."""
        return text.lower().split()

    def _bm25(self, document: str) -> float:
        """Score *document* against the query terms with a BM25-style sum."""
        doc_terms = self._tokenize(document)
        doc_len = len(doc_terms)

        term_counts = {}
        for term in doc_terms:
            term_counts[term] = term_counts.get(term, 0) + 1

        # Length normalization is the same for every term; compute it once.
        length_norm = self.k1 * (1 - self.b + self.b * (doc_len / self.avgdl))

        score = 0.0
        for term in set(self.query_terms):
            term_freq = term_counts.get(term, 0)
            if not term_freq:
                continue  # absent terms contribute nothing to the sum
            idf = math.log((1 + 1) / (term_freq + 0.5) + 1)  # Simplified IDF
            score += idf * (term_freq * (self.k1 + 1)) / (term_freq + length_norm)

        return score
|
||
|
||
|
||
class SEOFilter(URLFilter):
    """Quantitative SEO quality filter based on head section analysis.

    Each factor in the weights mapping is scored between 0.0 and 1.0; the
    weighted sum is compared against ``threshold``.

    Args:
        threshold: Minimum weighted score required to accept a URL.
        keywords: Keywords looked for (case-insensitively, as whole words)
            in the page title.
        weights: Factor name -> weight. Defaults to ``DEFAULT_WEIGHTS``.
            Every key must be one of the factor names scored in ``apply``.
    """

    __slots__ = ("threshold", "_weights", "_kw_patterns")

    # Based on SEMrush/Google ranking factors research
    DEFAULT_WEIGHTS = {
        "title_length": 0.15,
        "title_kw": 0.18,
        "meta_description": 0.12,
        "canonical": 0.10,
        "robot_ok": 0.20,  # Most critical factor
        "schema_org": 0.10,
        "url_quality": 0.15,
    }

    def __init__(
        self,
        threshold: float = 0.65,
        keywords: List[str] = None,
        weights: Dict[str, float] = None,
    ):
        super().__init__(name="SEOFilter")
        self.threshold = threshold
        self._weights = weights or self.DEFAULT_WEIGHTS
        self._kw_patterns = None
        if keywords:
            alternation = "|".join(map(re.escape, keywords))
            self._kw_patterns = re.compile(r"\b({})\b".format(alternation), re.I)

    async def apply(self, url: str) -> bool:
        """Return True when the weighted SEO score of *url* >= threshold.

        URLs whose head content cannot be retrieved are rejected.
        """
        head_content = await HeadPeekr.peek_html(url)
        if not head_content:
            self._update_stats(False)
            return False

        meta = HeadPeekr.extract_meta_tags(head_content)
        title = HeadPeekr.get_title(head_content) or ""
        parsed_url = urlparse(url)
        # Lower-cased so "NOINDEX" / "NoIndex" are recognised too.
        robots = (meta.get("robots") or "").lower()

        scores = {
            "title_length": self._score_title_length(title),
            "title_kw": self._score_keyword_presence(title),
            "meta_description": self._score_meta_description(
                meta.get("description", "")
            ),
            "canonical": self._score_canonical(meta.get("canonical"), url),
            "robot_ok": 0.0 if "noindex" in robots else 1.0,
            "schema_org": self._score_schema_org(head_content),
            "url_quality": self._score_url_quality(parsed_url),
        }

        total_score = sum(
            weight * scores[factor] for factor, weight in self._weights.items()
        )

        decision = total_score >= self.threshold
        self._update_stats(decision)
        return decision

    def _score_title_length(self, title: str) -> float:
        """Score the title by character length (50-60 characters is best)."""
        length = len(title)
        if 50 <= length <= 60:
            return 1.0
        if 40 <= length < 50 or 60 < length <= 70:
            return 0.7
        return 0.3  # Poor length

    def _score_keyword_presence(self, text: str) -> float:
        """Return 0.3 per keyword match in *text*, capped at 1.0."""
        if not self._kw_patterns:
            return 0.0
        matches = len(self._kw_patterns.findall(text))
        return min(matches * 0.3, 1.0)  # reaches the cap at 4 matches

    def _score_meta_description(self, desc: str) -> float:
        """Score the meta description by length (140-160 characters is best)."""
        length = len(desc)
        if 140 <= length <= 160:
            return 1.0
        return 0.5 if 120 <= length <= 200 else 0.2

    def _score_canonical(self, canonical: str, original: str) -> float:
        """Score the canonical link: 1.0 if equal to the URL, 0.5 if absent."""
        if not canonical:
            return 0.5  # Neutral score
        return 1.0 if canonical == original else 0.2

    def _score_schema_org(self, html: str) -> float:
        """Return 1.0 when a JSON-LD script tag is present in *html*."""
        has_json_ld = re.search(
            r'<script[^>]+type=["\']application/ld\+json', html, re.I
        )
        return 1.0 if has_json_ld else 0.0

    def _score_url_quality(self, parsed_url) -> float:
        """Start at 1.0 and multiply in a penalty per URL quality issue."""
        score = 1.0
        path = parsed_url.path.lower()

        # Penalty factors
        if len(path) > 80:
            score *= 0.7
        if re.search(r"\d{4}", path):
            score *= 0.8  # Four or more consecutive digits in path
        if parsed_url.query:
            score *= 0.6  # URL parameters
        if "_" in path:
            score *= 0.9  # Underscores vs hyphens

        return score
|
||
|
||
```
|
||
|
||
|
||
## File: crawl4ai/deep_crawling/scorers.py
|
||
|
||
```py
|
||
from abc import ABC, abstractmethod
|
||
from typing import List, Dict, Optional
|
||
from dataclasses import dataclass
|
||
from urllib.parse import urlparse, unquote
|
||
import re
|
||
import logging
|
||
from functools import lru_cache
|
||
from array import array
|
||
import ctypes
|
||
import platform
|
||
PLATFORM = platform.system()
|
||
|
||
# Pre-computed 1 / (1 + distance) scores for path-depth distances 0-3
|
||
_SCORE_LOOKUP = [1.0, 0.5, 0.3333333333333333, 0.25]
|
||
|
||
# Pre-computed scores for common year differences
|
||
_FRESHNESS_SCORES = [
|
||
1.0, # Current year
|
||
0.9, # Last year
|
||
0.8, # 2 years ago
|
||
0.7, # 3 years ago
|
||
0.6, # 4 years ago
|
||
0.5, # 5 years ago
|
||
]
|
||
|
||
class ScoringStats:
    """Running count, sum, minimum and maximum of the scores recorded."""

    __slots__ = ('_urls_scored', '_total_score', '_min_score', '_max_score')

    def __init__(self):
        self._urls_scored = 0
        self._total_score = 0.0
        # None until the first score is recorded.
        self._min_score = None
        self._max_score = None

    def update(self, score: float) -> None:
        """Record one score.

        Min and max are tracked from the first score onwards. Previously
        they stayed unset until a getter was called, which then seeded
        them with the running average instead of a real extreme.
        """
        self._urls_scored += 1
        self._total_score += score

        if self._min_score is None or score < self._min_score:
            self._min_score = score
        if self._max_score is None or score > self._max_score:
            self._max_score = score

    def get_average(self) -> float:
        """Return the mean of all recorded scores, or 0.0 if there are none."""
        return self._total_score / self._urls_scored if self._urls_scored else 0.0

    def get_min(self) -> float:
        """Return the smallest recorded score, or 0.0 if there are none."""
        return 0.0 if self._min_score is None else self._min_score

    def get_max(self) -> float:
        """Return the largest recorded score, or 0.0 if there are none."""
        return 0.0 if self._max_score is None else self._max_score
|
||
class URLScorer(ABC):
    """Base class for URL scorers: weighted score plus running statistics.

    Subclasses implement ``_calculate_score``; callers use ``score``.

    Args:
        weight: Multiplier applied to the raw score.
    """

    __slots__ = ('_weight', '_stats')

    def __init__(self, weight: float = 1.0):
        # The weight is rounded to single precision here; the stored value
        # is still an ordinary Python float.
        self._weight = ctypes.c_float(weight).value
        self._stats = ScoringStats()

    @abstractmethod
    def _calculate_score(self, url: str) -> float:
        """Calculate raw (unweighted) score for URL."""
        pass

    def score(self, url: str) -> float:
        """Return the weighted score for *url* and record it in the stats."""
        weighted = self._calculate_score(url) * self._weight
        self._stats.update(weighted)
        return weighted

    @property
    def stats(self):
        """Access to scoring statistics."""
        return self._stats

    @property
    def weight(self):
        """Weight multiplier applied by ``score``."""
        return self._weight
|
||
|
||
class CompositeScorer(URLScorer):
    """Combine several scorers into one summed (optionally averaged) score."""

    __slots__ = ('_scorers', '_normalize', '_weights_array', '_score_array')

    def __init__(self, scorers: List[URLScorer], normalize: bool = True):
        """Initialize composite scorer combining multiple scoring strategies.

        Args:
            scorers: List of scoring strategies to combine
            normalize: Whether to normalize final score by scorer count
        """
        super().__init__(weight=1.0)
        self._scorers = scorers
        self._normalize = normalize

        # Pre-allocated single-precision arrays, sized to the scorer list
        # as it is at construction time.
        self._weights_array = array('f', [s.weight for s in scorers])
        self._score_array = array('f', [0.0] * len(scorers))

    def _calculate_score(self, url: str) -> float:
        """Calculate combined score from all scoring strategies.

        Every child scorer is invoked on every call (no memoisation at
        this level), so each child's statistics see every URL.

        Args:
            url: URL to score

        Returns:
            Combined and optionally normalized score
        """
        scores = self._score_array
        total_score = 0.0

        for i, scorer in enumerate(self._scorers):
            # Public score() applies the child's weight. Storing into the
            # 'f' array rounds each component to single precision before
            # it is added to the total.
            scores[i] = scorer.score(url)
            total_score += scores[i]

        # Normalize if requested
        if self._normalize and self._scorers:
            return total_score / len(self._scorers)

        return total_score

    def score(self, url: str) -> float:
        """Public scoring interface with stats tracking.

        Args:
            url: URL to score

        Returns:
            Final combined score
        """
        score = self._calculate_score(url)
        self.stats.update(score)
        return score
|
||
|
||
class KeywordRelevanceScorer(URLScorer):
    """Score a URL by the fraction of keywords that occur in it.

    Args:
        keywords: Keywords searched for as substrings of the URL.
        weight: Score multiplier.
        case_sensitive: When false, keywords and URLs are lower-cased
            before comparison.
    """

    # '_weight' and '_stats' are already slots of URLScorer.
    __slots__ = ('_keywords', '_case_sensitive')

    def __init__(self, keywords: List[str], weight: float = 1.0, case_sensitive: bool = False):
        super().__init__(weight=weight)
        self._case_sensitive = case_sensitive
        # Pre-process keywords once
        self._keywords = [k if case_sensitive else k.lower() for k in keywords]

    def _url_bytes(self, url: str) -> bytes:
        """Return *url* as UTF-8 bytes, lower-cased unless case-sensitive.

        Not called by this class; kept so existing callers keep working.
        """
        return url.encode('utf-8') if self._case_sensitive else url.lower().encode('utf-8')

    def _calculate_score(self, url: str) -> float:
        """Return matched keywords / total keywords (0.0 if no keywords)."""
        keywords = self._keywords
        if not keywords:
            return 0.0

        haystack = url if self._case_sensitive else url.lower()
        hits = sum(1 for keyword in keywords if keyword in haystack)
        return hits / len(keywords)
|
||
|
||
class PathDepthScorer(URLScorer):
    """Score a URL by how close its path depth is to an optimal depth.

    The raw score is ``1 / (1 + |depth - optimal_depth|)``.

    Args:
        optimal_depth: Path depth that receives the maximum score.
        weight: Score multiplier.
    """

    # '_weight' and '_stats' are already slots of URLScorer.
    __slots__ = ('_optimal_depth',)

    def __init__(self, optimal_depth: int = 3, weight: float = 1.0):
        super().__init__(weight=weight)
        self._optimal_depth = optimal_depth

    @staticmethod
    @lru_cache(maxsize=10000)
    def _quick_depth(path: str) -> int:
        """Count the non-empty ``/``-separated segments of a URL path.

        Examples:
            - ""      -> 0
            - "/"     -> 0
            - "/a"    -> 1
            - "/a/b/" -> 2
            - "/a//b" -> 2  (empty segments are ignored)

        A string containing no "/" at all yields 0.
        """
        if '/' not in path:
            return 0
        return sum(1 for segment in path.split('/') if segment)

    def _calculate_score(self, url: str) -> float:
        """Return the depth-distance score for *url*."""
        # Drop fragment and query so slashes inside them are not counted
        # as path segments.
        for separator in ('#', '?'):
            cut = url.find(separator)
            if cut != -1:
                url = url[:cut]

        # First "/" after the "scheme://" prefix starts the path.
        pos = url.find('/', url.find('://') + 3)
        depth = 0 if pos == -1 else self._quick_depth(url[pos:])

        distance = abs(depth - self._optimal_depth)

        # Use lookup table for common distances
        if distance < 4:
            return _SCORE_LOOKUP[distance]

        return 1.0 / (1.0 + distance)
|
||
|
||
class ContentTypeScorer(URLScorer):
    """Score a URL by its file extension or by regex patterns."""

    # '_weight' and '_stats' are already slots of URLScorer.
    __slots__ = ('_exact_types', '_regex_types')

    def __init__(self, type_weights: Dict[str, float], weight: float = 1.0):
        """Initialize scorer with type weights map.

        Args:
            type_weights: Dict mapping file extensions/patterns to scores (e.g. {'.html$': 1.0})
            weight: Overall weight multiplier for this scorer
        """
        super().__init__(weight=weight)
        self._exact_types = {}  # Fast lookup for simple extensions
        self._regex_types = []  # Fallback for complex patterns

        for pattern, score in type_weights.items():
            extension = pattern[1:-1]
            # Only a plain ".ext$" pattern can use the exact lookup. Anything
            # else of that shape (e.g. ".(jpg|png)$") must be compiled, or it
            # would be stored under a key no extracted extension can equal.
            if (
                pattern.startswith('.')
                and pattern.endswith('$')
                and extension.isalnum()
            ):
                self._exact_types[extension] = score
            else:
                self._regex_types.append((re.compile(pattern), score))

        # Highest score first, so the first regex hit is the best one
        self._regex_types.sort(key=lambda item: -item[1])

    @staticmethod
    @lru_cache(maxsize=10000)
    def _quick_extension(url: str) -> str:
        """Extract the file extension of the last path segment.

        Handles:
        - Basic extensions: "example.html" -> "html"
        - Query strings: "page.php?id=1.5" -> "php"
        - Fragments: "doc.pdf#page=1" -> "pdf"
        - Path params: "file.jpg;width=100" -> "jpg"
        - No path: "https://example.com" -> ""

        Args:
            url: URL to extract extension from

        Returns:
            Lower-cased extension without dot, or empty string if none found
        """
        # Drop fragment, then query: dots inside them are not extensions.
        url = url.split('#', 1)[0].split('?', 1)[0]

        # Skip "scheme://host" so a dot in the host name is never mistaken
        # for an extension separator.
        scheme_end = url.find('://')
        if scheme_end != -1:
            path_start = url.find('/', scheme_end + 3)
            if path_start == -1:
                return ''
            url = url[path_start:]

        # Last path segment, without ";param" path parameters
        filename = url[url.rfind('/') + 1:].split(';', 1)[0]
        dot = filename.rfind('.')
        if dot == -1:
            return ''

        # Keep the leading alphanumeric run after the dot
        tail = filename[dot + 1:]
        end = 0
        while end < len(tail) and tail[end].isalnum():
            end += 1
        return tail[:end].lower()

    def _calculate_score(self, url: str) -> float:
        """Calculate content type score for URL.

        Uses staged approach:
        1. Try exact extension match (fast path)
        2. Fall back to regex patterns if needed

        Args:
            url: URL to score

        Returns:
            The configured score of the first match, or 0.0 if none matches
        """
        # Fast path: direct extension lookup
        ext = self._quick_extension(url)
        if ext:
            exact_score = self._exact_types.get(ext)
            if exact_score is not None:
                return exact_score

        # Slow path: regex patterns, highest score first
        for pattern, score in self._regex_types:
            if pattern.search(url):
                return score

        return 0.0
|
||
|
||
class FreshnessScorer(URLScorer):
    """Score a URL by the most recent year that appears in it."""

    # '_weight' and '_stats' are already slots of URLScorer.
    __slots__ = ('_date_pattern', '_current_year')

    def __init__(self, weight: float = 1.0, current_year: int = 2024):
        """Initialize freshness scorer.

        Extracts and scores dates from URLs using format:
        - YYYY/MM/DD
        - YYYY-MM-DD
        - YYYY_MM_DD
        - YYYY (year only)

        Args:
            weight: Score multiplier
            current_year: Year to calculate freshness against (default 2024).
                Years later than this are ignored, so callers should pass
                the actual current year rather than rely on the default.
        """
        super().__init__(weight=weight)
        self._current_year = current_year

        # Combined pattern for all date formats
        # Uses non-capturing groups (?:) and alternation
        self._date_pattern = re.compile(
            r'(?:/'  # Path separator
            r'|[-_])'  # or date separators
            r'((?:19|20)\d{2})'  # Year group (1900-2099)
            r'(?:'  # Optional month/day group
            r'(?:/|[-_])'  # Date separator
            r'(?:\d{2})'  # Month
            r'(?:'  # Optional day
            r'(?:/|[-_])'  # Date separator
            r'(?:\d{2})'  # Day
            r')?'  # Day is optional
            r')?'  # Month/day group is optional
        )

    def _extract_year(self, url: str) -> Optional[int]:
        """Extract the most recent year from URL.

        Args:
            url: URL to extract year from

        Returns:
            Latest year not after ``current_year``, or None if none is found
        """
        latest_year = None
        for match in self._date_pattern.finditer(url):
            year = int(match.group(1))
            if year > self._current_year:
                continue  # Sanity check: ignore years in the future
            if latest_year is None or year > latest_year:
                latest_year = year
        return latest_year

    def _calculate_score(self, url: str) -> float:
        """Calculate freshness score based on URL date.

        More recent years score higher. Uses pre-computed scoring
        table for common year differences.

        Args:
            url: URL to score

        Returns:
            0.5 when the URL has no usable year, otherwise a score that
            decreases with the year's age and never drops below 0.1
        """
        year = self._extract_year(url)
        if year is None:
            return 0.5  # Default score

        # Use lookup table for common year differences
        year_diff = self._current_year - year
        if year_diff < len(_FRESHNESS_SCORES):
            return _FRESHNESS_SCORES[year_diff]

        # Fallback calculation for older content
        return max(0.1, 1.0 - year_diff * 0.1)
|
||
|
||
class DomainAuthorityScorer(URLScorer):
    """Score a URL by a configured authority weight for its exact host."""

    # '_weight' and '_stats' are already slots of URLScorer.
    __slots__ = ('_domain_weights', '_default_weight', '_top_domains')

    def __init__(
        self,
        domain_weights: Dict[str, float],
        default_weight: float = 0.5,
        weight: float = 1.0,
    ):
        """Initialize domain authority scorer.

        Args:
            domain_weights: Dict mapping domains to authority scores
            default_weight: Score for unknown domains
            weight: Overall scorer weight multiplier

        Example:
            {
                'python.org': 1.0,
                'github.com': 0.9,
                'medium.com': 0.7
            }
        """
        super().__init__(weight=weight)

        # Lower-cased keys, matching the lower-cased host from the URL
        self._domain_weights = {
            domain.lower(): score
            for domain, score in domain_weights.items()
        }
        self._default_weight = default_weight

        # Five highest-scoring domains, taken from the normalised mapping so
        # the keys agree with extracted hosts. Kept for existing callers;
        # scoring reads _domain_weights directly.
        self._top_domains = dict(
            sorted(self._domain_weights.items(), key=lambda item: -item[1])[:5]
        )

    @staticmethod
    @lru_cache(maxsize=10000)
    def _extract_domain(url: str) -> str:
        """Extract the host from a URL.

        Handles:
        - Basic domains: "example.com"
        - Subdomains: "sub.example.com"
        - Ports: "example.com:8080"
        - IPv4: "192.168.1.1"
        - Userinfo: "user:pass@example.com"
        - Query/fragment directly after the host: "example.com?x=/y"

        Args:
            url: Full URL to extract domain from

        Returns:
            Lowercase domain without userinfo or port
        """
        # Find domain start
        scheme_end = url.find('://')
        start = 0 if scheme_end == -1 else scheme_end + 3

        # The host ends at the earliest of "/", "?" or "#". Checking them
        # in sequence would let a "/" inside the query string win.
        end = len(url)
        for separator in '/?#':
            index = url.find(separator, start)
            if index != -1 and index < end:
                end = index

        host = url[start:end]
        host = host[host.rfind('@') + 1:]  # drop "user:pass@" if present

        if host.startswith('['):
            # Bracketed IPv6 literal: keep through the closing bracket
            closing = host.find(']')
            if closing != -1:
                host = host[:closing + 1]
        else:
            port_idx = host.rfind(':')
            if port_idx != -1:
                host = host[:port_idx]

        return host.lower()

    def _calculate_score(self, url: str) -> float:
        """Calculate domain authority score.

        Args:
            url: URL to score

        Returns:
            Configured score for the URL's host, or the default weight
        """
        domain = self._extract_domain(url)
        return self._domain_weights.get(domain, self._default_weight)
|
||
```
|
||
|
||
|
||
## File: docs/examples/deepcrawl_example.py
|
||
|
||
```py
|
||
import asyncio
|
||
import time
|
||
|
||
from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
|
||
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
|
||
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
|
||
from crawl4ai.deep_crawling.filters import (
|
||
FilterChain,
|
||
URLPatternFilter,
|
||
DomainFilter,
|
||
ContentTypeFilter,
|
||
ContentRelevanceFilter,
|
||
SEOFilter,
|
||
)
|
||
from crawl4ai.deep_crawling.scorers import (
|
||
KeywordRelevanceScorer,
|
||
)
|
||
|
||
|
||
# 1️⃣ Basic Deep Crawl Setup
|
||
async def basic_deep_crawl():
    """
    PART 1: Basic Deep Crawl setup - a simple two-level breadth-first crawl.

    Shows how to:
    - configure BFSDeepCrawlStrategy with a depth limit and domain scope
    - run the crawl through AsyncWebCrawler
    - group and print the crawled URLs by depth
    """
    print("\n===== BASIC DEEP CRAWL SETUP =====")

    # max_depth=2: the start page (depth 0) plus two further levels.
    # include_external=False: only links on the same domain are followed.
    bfs_strategy = BFSDeepCrawlStrategy(max_depth=2, include_external=False)
    config = CrawlerRunConfig(
        deep_crawl_strategy=bfs_strategy,
        scraping_strategy=LXMLWebScrapingStrategy(),
        verbose=True,  # Show progress during crawling
    )

    async with AsyncWebCrawler() as crawler:
        start_time = time.perf_counter()
        results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)

        # Bucket the crawled URLs by depth to visualize the crawl tree
        pages_by_depth = {}
        for page in results:
            page_depth = page.metadata.get("depth", 0)
            pages_by_depth.setdefault(page_depth, []).append(page.url)

        print(f"✅ Crawled {len(results)} pages total")

        # One section per depth, listing at most three example URLs
        for depth, urls in sorted(pages_by_depth.items()):
            print(f"\nDepth {depth}: {len(urls)} pages")
            for url in urls[:3]:
                print(f" → {url}")
            if len(urls) > 3:
                print(f" ... and {len(urls) - 3} more")

        print(
            f"\n✅ Performance: {len(results)} pages in {time.perf_counter() - start_time:.2f} seconds"
        )
|
||
|
||
# 2️⃣ Stream vs. Non-Stream Execution
|
||
async def stream_vs_nonstream():
    """
    PART 2: Demonstrates the difference between stream and non-stream execution.

    Non-stream: waits for all results before processing.
    Stream: processes results as they become available.

    Both runs crawl the same start URL with the same base configuration; only
    the ``stream`` flag differs, and timings are printed for comparison.
    """
    print("\n===== STREAM VS. NON-STREAM EXECUTION =====")

    # Common configuration for both examples
    base_config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1, include_external=False),
        scraping_strategy=LXMLWebScrapingStrategy(),
        verbose=False,
    )

    async with AsyncWebCrawler() as crawler:
        # NON-STREAMING MODE
        print("\n📊 NON-STREAMING MODE:")
        print(" In this mode, all results are collected before being returned.")

        non_stream_config = base_config.clone()
        non_stream_config.stream = False

        start_time = time.perf_counter()
        results = await crawler.arun(
            url="https://docs.crawl4ai.com", config=non_stream_config
        )

        print(f" ✅ Received all {len(results)} results at once")
        print(f" ✅ Total duration: {time.perf_counter() - start_time:.2f} seconds")

        # STREAMING MODE
        print("\n📊 STREAMING MODE:")
        print(" In this mode, results are processed as they become available.")

        stream_config = base_config.clone()
        stream_config.stream = True

        start_time = time.perf_counter()
        result_count = 0
        first_result_time = None  # stays None if the stream yields nothing

        async for result in await crawler.arun(
            url="https://docs.crawl4ai.com", config=stream_config
        ):
            result_count += 1
            if result_count == 1:
                first_result_time = time.perf_counter() - start_time
                print(
                    f" ✅ First result received after {first_result_time:.2f} seconds: {result.url}"
                )
            elif result_count % 5 == 0:  # Show every 5th result for brevity
                print(f" → Result #{result_count}: {result.url}")

        print(f" ✅ Total: {result_count} results")
        # Formatting None with ':.2f' raises TypeError, so only report the
        # first-result latency when at least one result actually arrived.
        if first_result_time is not None:
            print(f" ✅ First result: {first_result_time:.2f} seconds")
        else:
            print(" ✅ First result: n/a (no results received)")
        print(f" ✅ All results: {time.perf_counter() - start_time:.2f} seconds")
        print("\n🔍 Key Takeaway: Streaming allows processing results immediately")
|
||
|
||
# 3️⃣ Introduce Filters & Scorers
|
||
async def filters_and_scorers():
    """
    PART 3: Demonstrates the use of filters and scorers for more targeted crawling.

    This function progressively adds:
    1. A single URL pattern filter
    2. Multiple filters in a chain
    3. Scorers for prioritizing pages
    """
    print("\n===== FILTERS AND SCORERS =====")

    async with AsyncWebCrawler() as crawler:
        # SINGLE FILTER EXAMPLE
        print("\n📊 EXAMPLE 1: SINGLE URL PATTERN FILTER")
        print(" Only crawl pages containing 'core' in the URL")

        # Create a filter that only allows URLs with 'core' in them
        url_filter = URLPatternFilter(patterns=["*core*"])

        config = CrawlerRunConfig(
            deep_crawl_strategy=BFSDeepCrawlStrategy(
                max_depth=1,
                include_external=False,
                filter_chain=FilterChain([url_filter]),  # Single filter
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            cache_mode=CacheMode.BYPASS,
            verbose=True,
        )

        results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)

        print(f" ✅ Crawled {len(results)} pages matching '*core*'")
        for result in results[:3]:  # Show first 3 results
            print(f" → {result.url}")
        if len(results) > 3:
            print(f" ... and {len(results) - 3} more")

        # MULTIPLE FILTERS EXAMPLE
        print("\n📊 EXAMPLE 2: MULTIPLE FILTERS IN A CHAIN")
        print(" Only crawl pages that:")
        print(" 1. Contain '2024' in the URL")
        print(" 2. Are from 'techcrunch.com'")
        print(" 3. Are of text/html or application/javascript content type")

        # Create a chain of filters
        filter_chain = FilterChain(
            [
                URLPatternFilter(patterns=["*2024*"]),
                DomainFilter(
                    allowed_domains=["techcrunch.com"],
                    blocked_domains=["guce.techcrunch.com", "oidc.techcrunch.com"],
                ),
                ContentTypeFilter(
                    allowed_types=["text/html", "application/javascript"]
                ),
            ]
        )

        config = CrawlerRunConfig(
            deep_crawl_strategy=BFSDeepCrawlStrategy(
                max_depth=1, include_external=False, filter_chain=filter_chain
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
        )

        results = await crawler.arun(url="https://techcrunch.com", config=config)

        print(f" ✅ Crawled {len(results)} pages after applying all filters")
        for result in results[:3]:
            print(f" → {result.url}")
        if len(results) > 3:
            print(f" ... and {len(results) - 3} more")

        # SCORERS EXAMPLE
        print("\n📊 EXAMPLE 3: USING A KEYWORD RELEVANCE SCORER")
        print(
            "Score pages based on relevance to keywords: 'crawl', 'example', 'async', 'configuration','javascript','css'"
        )

        # Create a keyword relevance scorer
        keyword_scorer = KeywordRelevanceScorer(
            keywords=["crawl", "example", "async", "configuration", "javascript", "css"],
            weight=1,
        )

        config = CrawlerRunConfig(
            deep_crawl_strategy=BestFirstCrawlingStrategy(
                max_depth=1, include_external=False, url_scorer=keyword_scorer
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            cache_mode=CacheMode.BYPASS,
            verbose=True,
            stream=True,
        )

        results = []
        async for result in await crawler.arun(
            url="https://docs.crawl4ai.com", config=config
        ):
            results.append(result)
            # Default to 0 so a result without a "score" entry does not break
            # the ':.2f' formatting (None cannot be formatted as a float).
            score = result.metadata.get("score", 0)
            print(f" → Score: {score:.2f} | {result.url}")

        print(f" ✅ Crawler prioritized {len(results)} pages by relevance score")
        print(" 🔍 Note: BestFirstCrawlingStrategy visits highest-scoring pages first")
|
||
|
||
# 4️⃣ Advanced Filters
|
||
async def advanced_filters():
    """
    PART 4: Advanced filtering techniques for specialized crawling.

    Two independent crawls are run, each with a single-filter chain:
    - an SEOFilter configured with a threshold and a keyword list
    - a ContentRelevanceFilter configured with a free-text query and a threshold
    """
    print("\n===== ADVANCED FILTERS =====")

    async with AsyncWebCrawler() as crawler:
        # --- Example 1: SEO filter -------------------------------------------
        print("\n📊 EXAMPLE 1: SEO FILTERS")
        print(
            "Quantitative SEO quality assessment filter based searching keywords in the head section"
        )

        seo_filter = SEOFilter(
            threshold=0.5, keywords=["dynamic", "interaction", "javascript"]
        )
        seo_strategy = BFSDeepCrawlStrategy(
            max_depth=1, filter_chain=FilterChain([seo_filter])
        )
        config = CrawlerRunConfig(
            deep_crawl_strategy=seo_strategy,
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
            cache_mode=CacheMode.BYPASS,
        )

        results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)

        print(f" ✅ Found {len(results)} pages with relevant keywords")
        for page in results:
            print(f" → {page.url}")

        # --- Example 2: text relevancy filter --------------------------------
        print("\n📊 EXAMPLE 2: ADVANCED TEXT RELEVANCY FILTER")

        # Filter configured with a natural-language query and a threshold
        relevance_filter = ContentRelevanceFilter(
            query="Interact with the web using your authentic digital identity",
            threshold=0.7,
        )
        relevance_strategy = BFSDeepCrawlStrategy(
            max_depth=1, filter_chain=FilterChain([relevance_filter])
        )
        config = CrawlerRunConfig(
            deep_crawl_strategy=relevance_strategy,
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
            cache_mode=CacheMode.BYPASS,
        )

        results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)

        print(f" ✅ Found {len(results)} pages")
        for page in results:
            # Falls back to 0 when the metadata carries no "relevance_score"
            relevance_score = page.metadata.get("relevance_score", 0)
            print(f" → Score: {relevance_score:.2f} | {page.url}")
|
||
|
||
# 5️⃣ Max Pages and Score Thresholds
|
||
async def max_pages_and_thresholds():
    """
    PART 5: Demonstrates using max_pages and score_threshold parameters with different strategies.

    This function shows:
    - How to limit the number of pages crawled (BFS with ``max_pages``)
    - How to set a score threshold for more targeted crawling (DFS with
      ``score_threshold`` and ``max_pages``)
    - A Best-First crawl limited by ``max_pages``

    The limits are held in local variables so that the printed descriptions
    always match the values actually passed to the strategies.
    """
    print("\n===== MAX PAGES AND SCORE THRESHOLDS =====")

    from crawl4ai.deep_crawling import DFSDeepCrawlStrategy

    # Single source of truth for the limits used (and reported) below
    bfs_max_pages = 5
    dfs_score_threshold = 0.7
    dfs_max_pages = 10
    best_first_max_pages = 7

    async with AsyncWebCrawler() as crawler:
        # Define a common keyword scorer for all examples
        keyword_scorer = KeywordRelevanceScorer(
            keywords=["browser", "crawler", "web", "automation"],
            weight=1.0,
        )

        # EXAMPLE 1: BFS WITH MAX PAGES
        print("\n📊 EXAMPLE 1: BFS STRATEGY WITH MAX PAGES LIMIT")
        print(f" Limit the crawler to a maximum of {bfs_max_pages} pages")

        bfs_config = CrawlerRunConfig(
            deep_crawl_strategy=BFSDeepCrawlStrategy(
                max_depth=2,
                include_external=False,
                url_scorer=keyword_scorer,
                max_pages=bfs_max_pages,  # Cap on the number of pages crawled
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
            cache_mode=CacheMode.BYPASS,
        )

        results = await crawler.arun(url="https://docs.crawl4ai.com", config=bfs_config)

        print(f" ✅ Crawled exactly {len(results)} pages as specified by max_pages")
        for result in results:
            depth = result.metadata.get("depth", 0)
            print(f" → Depth: {depth} | {result.url}")

        # EXAMPLE 2: DFS WITH SCORE THRESHOLD
        print("\n📊 EXAMPLE 2: DFS STRATEGY WITH SCORE THRESHOLD")
        print(f" Only crawl pages with a relevance score above {dfs_score_threshold}")

        dfs_config = CrawlerRunConfig(
            deep_crawl_strategy=DFSDeepCrawlStrategy(
                max_depth=2,
                include_external=False,
                url_scorer=keyword_scorer,
                score_threshold=dfs_score_threshold,  # Score cut-off for URLs
                max_pages=dfs_max_pages,
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
            cache_mode=CacheMode.BYPASS,
        )

        results = await crawler.arun(url="https://docs.crawl4ai.com", config=dfs_config)

        print(f" ✅ Crawled {len(results)} pages with scores above threshold")
        for result in results:
            score = result.metadata.get("score", 0)
            depth = result.metadata.get("depth", 0)
            print(f" → Depth: {depth} | Score: {score:.2f} | {result.url}")

        # EXAMPLE 3: BEST-FIRST WITH MAX PAGES
        # Only max_pages is passed to the strategy here; no score threshold is
        # configured, so the output must not claim one.
        print("\n📊 EXAMPLE 3: BEST-FIRST STRATEGY WITH MAX PAGES LIMIT")
        print(f" Limit to {best_first_max_pages} pages, prioritizing highest scores")

        bf_config = CrawlerRunConfig(
            deep_crawl_strategy=BestFirstCrawlingStrategy(
                max_depth=2,
                include_external=False,
                url_scorer=keyword_scorer,
                max_pages=best_first_max_pages,  # Cap on total pages
            ),
            scraping_strategy=LXMLWebScrapingStrategy(),
            verbose=True,
            cache_mode=CacheMode.BYPASS,
            stream=True,
        )

        results = []
        async for result in await crawler.arun(url="https://docs.crawl4ai.com", config=bf_config):
            results.append(result)
            score = result.metadata.get("score", 0)
            depth = result.metadata.get("depth", 0)
            print(f" → Depth: {depth} | Score: {score:.2f} | {result.url}")

        print(f" ✅ Crawled {len(results)} high-value pages")
        if results:  # avoid dividing by zero when nothing was crawled
            avg_score = sum(r.metadata.get("score", 0) for r in results) / len(results)
            print(f" ✅ Average score: {avg_score:.2f}")
        print(" 🔍 Note: BestFirstCrawlingStrategy visited highest-scoring pages first")
|
||
|
||
# 6️⃣ Wrap-Up and Key Takeaways
|
||
async def wrap_up():
    """
    PART 6: Wrap-Up and Key Takeaways

    Runs one streaming Best-First crawl that combines a filter chain with a
    keyword scorer, then prints a summary: page count, duration, average
    score (when any page was crawled) and a per-depth page count.
    """
    print("\n===== COMPLETE CRAWLER EXAMPLE =====")
    print("Combining filters, scorers, and streaming for an optimized crawl")

    # Create a sophisticated filter chain
    filter_chain = FilterChain(
        [
            DomainFilter(
                allowed_domains=["docs.crawl4ai.com"],
                blocked_domains=["old.docs.crawl4ai.com"],
            ),
            URLPatternFilter(patterns=["*core*", "*advanced*", "*blog*"]),
            ContentTypeFilter(allowed_types=["text/html"]),
        ]
    )

    # Create a keyword relevance scorer used to prioritize URLs
    keyword_scorer = KeywordRelevanceScorer(
        keywords=["crawl", "example", "async", "configuration"], weight=0.7
    )

    # Set up the configuration
    config = CrawlerRunConfig(
        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=1,
            include_external=False,
            filter_chain=filter_chain,
            url_scorer=keyword_scorer,
        ),
        scraping_strategy=LXMLWebScrapingStrategy(),
        stream=True,
        verbose=True,
    )

    # Execute the crawl
    results = []
    start_time = time.perf_counter()

    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun(
            url="https://docs.crawl4ai.com", config=config
        ):
            results.append(result)
            score = result.metadata.get("score", 0)
            depth = result.metadata.get("depth", 0)
            print(f"→ Depth: {depth} | Score: {score:.2f} | {result.url}")

    duration = time.perf_counter() - start_time

    # Summarize the results
    print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")
    # An empty crawl (e.g. every URL filtered out) would otherwise divide by
    # zero here, so the average is only reported when there is data.
    if results:
        avg_score = sum(r.metadata.get("score", 0) for r in results) / len(results)
        print(f"✅ Average score: {avg_score:.2f}")

    # Group by depth
    depth_counts = {}
    for result in results:
        depth = result.metadata.get("depth", 0)
        depth_counts[depth] = depth_counts.get(depth, 0) + 1

    print("\n📊 Pages crawled by depth:")
    for depth, count in sorted(depth_counts.items()):
        print(f" Depth {depth}: {count} pages")
|
||
|
||
async def run_tutorial():
    """
    Run every tutorial section, one after another, framed by an intro and
    an outro banner.
    """
    intro_lines = (
        "\n🚀 CRAWL4AI DEEP CRAWLING TUTORIAL 🚀",
        "======================================",
        "This tutorial will walk you through deep crawling techniques,",
        "from basic to advanced, using the Crawl4AI library.",
    )
    for line in intro_lines:
        print(line)

    # Sections in execution order - comment entries out to run only
    # specific parts during development
    tutorial_sections = (
        basic_deep_crawl,
        stream_vs_nonstream,
        filters_and_scorers,
        max_pages_and_thresholds,
        advanced_filters,
        wrap_up,
    )

    # Each section is awaited to completion before the next one starts
    for run_section in tutorial_sections:
        await run_section()

    outro_lines = (
        "\n🎉 TUTORIAL COMPLETE! 🎉",
        "You now have a comprehensive understanding of deep crawling with Crawl4AI.",
        "For more information, check out https://docs.crawl4ai.com",
    )
    for line in outro_lines:
        print(line)
|
||
|
||
# Script entry point: run the whole tutorial only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    asyncio.run(run_tutorial())
|
||
```
|