Merge branch 'next' into 2025-MAR-ALPHA-1

This commit is contained in:
Aravind Karnam
2025-04-17 10:50:02 +05:30
38 changed files with 5574 additions and 878 deletions

View File

@@ -24,7 +24,7 @@ ARG TARGETARCH
LABEL maintainer="unclecode" LABEL maintainer="unclecode"
LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper" LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
LABEL version="1.0" LABEL version="1.0"
RUN apt-get update && apt-get install -y --no-install-recommends \ RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \ build-essential \
@@ -38,6 +38,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
libjpeg-dev \ libjpeg-dev \
redis-server \ redis-server \
supervisor \ supervisor \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
RUN apt-get update && apt-get install -y --no-install-recommends \ RUN apt-get update && apt-get install -y --no-install-recommends \
@@ -62,11 +63,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
libcairo2 \ libcairo2 \
libasound2 \ libasound2 \
libatspi2.0-0 \ libatspi2.0-0 \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETARCH" = "amd64" ] ; then \ RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETARCH" = "amd64" ] ; then \
apt-get update && apt-get install -y --no-install-recommends \ apt-get update && apt-get install -y --no-install-recommends \
nvidia-cuda-toolkit \ nvidia-cuda-toolkit \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* ; \ && rm -rf /var/lib/apt/lists/* ; \
else \ else \
echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \ echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \
@@ -76,16 +79,24 @@ RUN if [ "$TARGETARCH" = "arm64" ]; then \
echo "🦾 Installing ARM-specific optimizations"; \ echo "🦾 Installing ARM-specific optimizations"; \
apt-get update && apt-get install -y --no-install-recommends \ apt-get update && apt-get install -y --no-install-recommends \
libopenblas-dev \ libopenblas-dev \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*; \ && rm -rf /var/lib/apt/lists/*; \
elif [ "$TARGETARCH" = "amd64" ]; then \ elif [ "$TARGETARCH" = "amd64" ]; then \
echo "🖥️ Installing AMD64-specific optimizations"; \ echo "🖥️ Installing AMD64-specific optimizations"; \
apt-get update && apt-get install -y --no-install-recommends \ apt-get update && apt-get install -y --no-install-recommends \
libomp-dev \ libomp-dev \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*; \ && rm -rf /var/lib/apt/lists/*; \
else \ else \
echo "Skipping platform-specific optimizations (unsupported platform)"; \ echo "Skipping platform-specific optimizations (unsupported platform)"; \
fi fi
# Create a non-root user and group
RUN groupadd -r appuser && useradd --no-log-init -r -g appuser appuser
# Create and set permissions for appuser home directory
RUN mkdir -p /home/appuser && chown -R appuser:appuser /home/appuser
WORKDIR ${APP_HOME} WORKDIR ${APP_HOME}
RUN echo '#!/bin/bash\n\ RUN echo '#!/bin/bash\n\
@@ -103,6 +114,7 @@ fi' > /tmp/install.sh && chmod +x /tmp/install.sh
COPY . /tmp/project/ COPY . /tmp/project/
# Copy supervisor config first (might need root later, but okay for now)
COPY deploy/docker/supervisord.conf . COPY deploy/docker/supervisord.conf .
COPY deploy/docker/requirements.txt . COPY deploy/docker/requirements.txt .
@@ -131,16 +143,31 @@ RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
else \ else \
pip install "/tmp/project" ; \ pip install "/tmp/project" ; \
fi fi
RUN pip install --no-cache-dir --upgrade pip && \ RUN pip install --no-cache-dir --upgrade pip && \
/tmp/install.sh && \ /tmp/install.sh && \
python -c "import crawl4ai; print('✅ crawl4ai is ready to rock!')" && \ python -c "import crawl4ai; print('✅ crawl4ai is ready to rock!')" && \
python -c "from playwright.sync_api import sync_playwright; print('✅ Playwright is feeling dramatic!')" python -c "from playwright.sync_api import sync_playwright; print('✅ Playwright is feeling dramatic!')"
RUN playwright install --with-deps chromium
RUN crawl4ai-setup
RUN playwright install --with-deps
RUN mkdir -p /home/appuser/.cache/ms-playwright \
&& cp -r /root/.cache/ms-playwright/chromium-* /home/appuser/.cache/ms-playwright/ \
&& chown -R appuser:appuser /home/appuser/.cache/ms-playwright
RUN crawl4ai-doctor
# Copy application code
COPY deploy/docker/* ${APP_HOME}/ COPY deploy/docker/* ${APP_HOME}/
# Change ownership of the application directory to the non-root user
RUN chown -R appuser:appuser ${APP_HOME}
# give permissions to redis persistence dirs if used
RUN mkdir -p /var/lib/redis /var/log/redis && chown -R appuser:appuser /var/lib/redis /var/log/redis
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD bash -c '\ CMD bash -c '\
MEM=$(free -m | awk "/^Mem:/{print \$2}"); \ MEM=$(free -m | awk "/^Mem:/{print \$2}"); \
@@ -149,8 +176,14 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
exit 1; \ exit 1; \
fi && \ fi && \
redis-cli ping > /dev/null && \ redis-cli ping > /dev/null && \
curl -f http://localhost:8000/health || exit 1' curl -f http://localhost:11235/health || exit 1'
EXPOSE 6379 EXPOSE 6379
CMD ["supervisord", "-c", "supervisord.conf"] # Switch to the non-root user before starting the application
USER appuser
# Set environment variables to production
ENV PYTHON_ENV=production
# Start the application using supervisord
CMD ["supervisord", "-c", "supervisord.conf"]

View File

@@ -2,7 +2,7 @@
import warnings import warnings
from .async_webcrawler import AsyncWebCrawler, CacheMode from .async_webcrawler import AsyncWebCrawler, CacheMode
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig
from .content_scraping_strategy import ( from .content_scraping_strategy import (
ContentScrapingStrategy, ContentScrapingStrategy,
@@ -121,6 +121,7 @@ __all__ = [
"Crawl4aiDockerClient", "Crawl4aiDockerClient",
"ProxyRotationStrategy", "ProxyRotationStrategy",
"RoundRobinProxyStrategy", "RoundRobinProxyStrategy",
"ProxyConfig"
] ]

View File

@@ -5,6 +5,7 @@ from .config import (
MIN_WORD_THRESHOLD, MIN_WORD_THRESHOLD,
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
PROVIDER_MODELS, PROVIDER_MODELS,
PROVIDER_MODELS_PREFIXES,
SCREENSHOT_HEIGHT_TRESHOLD, SCREENSHOT_HEIGHT_TRESHOLD,
PAGE_TIMEOUT, PAGE_TIMEOUT,
IMAGE_SCORE_THRESHOLD, IMAGE_SCORE_THRESHOLD,
@@ -27,11 +28,8 @@ import inspect
from typing import Any, Dict, Optional from typing import Any, Dict, Optional
from enum import Enum from enum import Enum
from .proxy_strategy import ProxyConfig # from .proxy_strategy import ProxyConfig
try:
from .browser.models import DockerConfig
except ImportError:
DockerConfig = None
def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict: def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
@@ -122,23 +120,25 @@ def from_serializable_dict(data: Any) -> Any:
# Handle typed data # Handle typed data
if isinstance(data, dict) and "type" in data: if isinstance(data, dict) and "type" in data:
# Handle plain dictionaries # Handle plain dictionaries
if data["type"] == "dict": if data["type"] == "dict" and "value" in data:
return {k: from_serializable_dict(v) for k, v in data["value"].items()} return {k: from_serializable_dict(v) for k, v in data["value"].items()}
# Import from crawl4ai for class instances # Import from crawl4ai for class instances
import crawl4ai import crawl4ai
cls = getattr(crawl4ai, data["type"]) if hasattr(crawl4ai, data["type"]):
cls = getattr(crawl4ai, data["type"])
# Handle Enum # Handle Enum
if issubclass(cls, Enum): if issubclass(cls, Enum):
return cls(data["params"]) return cls(data["params"])
# Handle class instances if "params" in data:
constructor_args = { # Handle class instances
k: from_serializable_dict(v) for k, v in data["params"].items() constructor_args = {
} k: from_serializable_dict(v) for k, v in data["params"].items()
return cls(**constructor_args) }
return cls(**constructor_args)
# Handle lists # Handle lists
if isinstance(data, list): if isinstance(data, list):
@@ -159,6 +159,117 @@ def is_empty_value(value: Any) -> bool:
return True return True
return False return False
class ProxyConfig:
def __init__(
self,
server: str,
username: Optional[str] = None,
password: Optional[str] = None,
ip: Optional[str] = None,
):
"""Configuration class for a single proxy.
Args:
server: Proxy server URL (e.g., "http://127.0.0.1:8080")
username: Optional username for proxy authentication
password: Optional password for proxy authentication
ip: Optional IP address for verification purposes
"""
self.server = server
self.username = username
self.password = password
# Extract IP from server if not explicitly provided
self.ip = ip or self._extract_ip_from_server()
def _extract_ip_from_server(self) -> Optional[str]:
"""Extract IP address from server URL."""
try:
# Simple extraction assuming http://ip:port format
if "://" in self.server:
parts = self.server.split("://")[1].split(":")
return parts[0]
else:
parts = self.server.split(":")
return parts[0]
except Exception:
return None
@staticmethod
def from_string(proxy_str: str) -> "ProxyConfig":
"""Create a ProxyConfig from a string in the format 'ip:port:username:password'."""
parts = proxy_str.split(":")
if len(parts) == 4: # ip:port:username:password
ip, port, username, password = parts
return ProxyConfig(
server=f"http://{ip}:{port}",
username=username,
password=password,
ip=ip
)
elif len(parts) == 2: # ip:port only
ip, port = parts
return ProxyConfig(
server=f"http://{ip}:{port}",
ip=ip
)
else:
raise ValueError(f"Invalid proxy string format: {proxy_str}")
@staticmethod
def from_dict(proxy_dict: Dict) -> "ProxyConfig":
"""Create a ProxyConfig from a dictionary."""
return ProxyConfig(
server=proxy_dict.get("server"),
username=proxy_dict.get("username"),
password=proxy_dict.get("password"),
ip=proxy_dict.get("ip")
)
@staticmethod
def from_env(env_var: str = "PROXIES") -> List["ProxyConfig"]:
"""Load proxies from environment variable.
Args:
env_var: Name of environment variable containing comma-separated proxy strings
Returns:
List of ProxyConfig objects
"""
proxies = []
try:
proxy_list = os.getenv(env_var, "").split(",")
for proxy in proxy_list:
if not proxy:
continue
proxies.append(ProxyConfig.from_string(proxy))
except Exception as e:
print(f"Error loading proxies from environment: {e}")
return proxies
def to_dict(self) -> Dict:
"""Convert to dictionary representation."""
return {
"server": self.server,
"username": self.username,
"password": self.password,
"ip": self.ip
}
def clone(self, **kwargs) -> "ProxyConfig":
"""Create a copy of this configuration with updated values.
Args:
**kwargs: Key-value pairs of configuration options to update
Returns:
ProxyConfig: A new instance with the specified updates
"""
config_dict = self.to_dict()
config_dict.update(kwargs)
return ProxyConfig.from_dict(config_dict)
class BrowserConfig: class BrowserConfig:
""" """
@@ -195,8 +306,6 @@ class BrowserConfig:
Default: None. Default: None.
proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}. proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
If None, no additional proxy config. Default: None. If None, no additional proxy config. Default: None.
docker_config (DockerConfig or dict or None): Configuration for Docker-based browser automation.
Contains settings for Docker container operation. Default: None.
viewport_width (int): Default viewport width for pages. Default: 1080. viewport_width (int): Default viewport width for pages. Default: 1080.
viewport_height (int): Default viewport height for pages. Default: 600. viewport_height (int): Default viewport height for pages. Default: 600.
viewport (dict): Default viewport dimensions for pages. If set, overrides viewport_width and viewport_height. viewport (dict): Default viewport dimensions for pages. If set, overrides viewport_width and viewport_height.
@@ -242,7 +351,6 @@ class BrowserConfig:
channel: str = "chromium", channel: str = "chromium",
proxy: str = None, proxy: str = None,
proxy_config: Union[ProxyConfig, dict, None] = None, proxy_config: Union[ProxyConfig, dict, None] = None,
docker_config: Union[DockerConfig, dict, None] = None,
viewport_width: int = 1080, viewport_width: int = 1080,
viewport_height: int = 600, viewport_height: int = 600,
viewport: dict = None, viewport: dict = None,
@@ -283,15 +391,7 @@ class BrowserConfig:
self.chrome_channel = "" self.chrome_channel = ""
self.proxy = proxy self.proxy = proxy
self.proxy_config = proxy_config self.proxy_config = proxy_config
# Handle docker configuration
if isinstance(docker_config, dict) and DockerConfig is not None:
self.docker_config = DockerConfig.from_kwargs(docker_config)
else:
self.docker_config = docker_config
if self.docker_config:
self.user_data_dir = self.docker_config.user_data_dir
self.viewport_width = viewport_width self.viewport_width = viewport_width
self.viewport_height = viewport_height self.viewport_height = viewport_height
@@ -362,7 +462,6 @@ class BrowserConfig:
channel=kwargs.get("channel", "chromium"), channel=kwargs.get("channel", "chromium"),
proxy=kwargs.get("proxy"), proxy=kwargs.get("proxy"),
proxy_config=kwargs.get("proxy_config", None), proxy_config=kwargs.get("proxy_config", None),
docker_config=kwargs.get("docker_config", None),
viewport_width=kwargs.get("viewport_width", 1080), viewport_width=kwargs.get("viewport_width", 1080),
viewport_height=kwargs.get("viewport_height", 600), viewport_height=kwargs.get("viewport_height", 600),
accept_downloads=kwargs.get("accept_downloads", False), accept_downloads=kwargs.get("accept_downloads", False),
@@ -419,13 +518,7 @@ class BrowserConfig:
"debugging_port": self.debugging_port, "debugging_port": self.debugging_port,
"host": self.host, "host": self.host,
} }
# Include docker_config if it exists
if hasattr(self, "docker_config") and self.docker_config is not None:
if hasattr(self.docker_config, "to_dict"):
result["docker_config"] = self.docker_config.to_dict()
else:
result["docker_config"] = self.docker_config
return result return result
@@ -1178,9 +1271,18 @@ class LLMConfig:
elif api_token and api_token.startswith("env:"): elif api_token and api_token.startswith("env:"):
self.api_token = os.getenv(api_token[4:]) self.api_token = os.getenv(api_token[4:])
else: else:
self.api_token = PROVIDER_MODELS.get(provider, "no-token") or os.getenv( # Check if given provider starts with any of key in PROVIDER_MODELS_PREFIXES
DEFAULT_PROVIDER_API_KEY # If not, check if it is in PROVIDER_MODELS
) prefixes = PROVIDER_MODELS_PREFIXES.keys()
if any(provider.startswith(prefix) for prefix in prefixes):
selected_prefix = next(
(prefix for prefix in prefixes if provider.startswith(prefix)),
None,
)
self.api_token = PROVIDER_MODELS_PREFIXES.get(selected_prefix)
else:
self.provider = DEFAULT_PROVIDER
self.api_token = os.getenv(DEFAULT_PROVIDER_API_KEY)
self.base_url = base_url self.base_url = base_url
self.temprature = temprature self.temprature = temprature
self.max_tokens = max_tokens self.max_tokens = max_tokens

View File

@@ -36,7 +36,7 @@ from .markdown_generation_strategy import (
) )
from .deep_crawling import DeepCrawlDecorator from .deep_crawling import DeepCrawlDecorator
from .async_logger import AsyncLogger, AsyncLoggerBase from .async_logger import AsyncLogger, AsyncLoggerBase
from .async_configs import BrowserConfig, CrawlerRunConfig from .async_configs import BrowserConfig, CrawlerRunConfig, ProxyConfig
from .async_dispatcher import * # noqa: F403 from .async_dispatcher import * # noqa: F403
from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter
@@ -291,12 +291,12 @@ class AsyncWebCrawler:
# Update proxy configuration from rotation strategy if available # Update proxy configuration from rotation strategy if available
if config and config.proxy_rotation_strategy: if config and config.proxy_rotation_strategy:
next_proxy = await config.proxy_rotation_strategy.get_next_proxy() next_proxy : ProxyConfig = await config.proxy_rotation_strategy.get_next_proxy()
if next_proxy: if next_proxy:
self.logger.info( self.logger.info(
message="Switch proxy: {proxy}", message="Switch proxy: {proxy}",
tag="PROXY", tag="PROXY",
params={"proxy": next_proxy.server}, params={"proxy": next_proxy.server}
) )
config.proxy_config = next_proxy config.proxy_config = next_proxy
# config = config.clone(proxy_config=next_proxy) # config = config.clone(proxy_config=next_proxy)

View File

@@ -94,6 +94,7 @@ class ManagedBrowser:
host: str = "localhost", host: str = "localhost",
debugging_port: int = 9222, debugging_port: int = 9222,
cdp_url: Optional[str] = None, cdp_url: Optional[str] = None,
browser_config: Optional[BrowserConfig] = None,
): ):
""" """
Initialize the ManagedBrowser instance. Initialize the ManagedBrowser instance.
@@ -109,17 +110,19 @@ class ManagedBrowser:
host (str): Host for debugging the browser. Default: "localhost". host (str): Host for debugging the browser. Default: "localhost".
debugging_port (int): Port for debugging the browser. Default: 9222. debugging_port (int): Port for debugging the browser. Default: 9222.
cdp_url (str or None): CDP URL to connect to the browser. Default: None. cdp_url (str or None): CDP URL to connect to the browser. Default: None.
browser_config (BrowserConfig): Configuration object containing all browser settings. Default: None.
""" """
self.browser_type = browser_type self.browser_type = browser_config.browser_type
self.user_data_dir = user_data_dir self.user_data_dir = browser_config.user_data_dir
self.headless = headless self.headless = browser_config.headless
self.browser_process = None self.browser_process = None
self.temp_dir = None self.temp_dir = None
self.debugging_port = debugging_port self.debugging_port = browser_config.debugging_port
self.host = host self.host = browser_config.host
self.logger = logger self.logger = logger
self.shutting_down = False self.shutting_down = False
self.cdp_url = cdp_url self.cdp_url = browser_config.cdp_url
self.browser_config = browser_config
async def start(self) -> str: async def start(self) -> str:
""" """
@@ -142,6 +145,9 @@ class ManagedBrowser:
# Get browser path and args based on OS and browser type # Get browser path and args based on OS and browser type
# browser_path = self._get_browser_path() # browser_path = self._get_browser_path()
args = await self._get_browser_args() args = await self._get_browser_args()
if self.browser_config.extra_args:
args.extend(self.browser_config.extra_args)
# Start browser process # Start browser process
try: try:
@@ -477,6 +483,7 @@ class BrowserManager:
logger=self.logger, logger=self.logger,
debugging_port=self.config.debugging_port, debugging_port=self.config.debugging_port,
cdp_url=self.config.cdp_url, cdp_url=self.config.cdp_url,
browser_config=self.config,
) )
async def start(self): async def start(self):
@@ -491,10 +498,12 @@ class BrowserManager:
Note: This method should be called in a separate task to avoid blocking the main event loop. Note: This method should be called in a separate task to avoid blocking the main event loop.
""" """
if self.playwright is None: if self.playwright is not None:
from playwright.async_api import async_playwright await self.close()
from playwright.async_api import async_playwright
self.playwright = await async_playwright().start() self.playwright = await async_playwright().start()
if self.config.cdp_url or self.config.use_managed_browser: if self.config.cdp_url or self.config.use_managed_browser:
self.config.use_managed_browser = True self.config.use_managed_browser = True

View File

@@ -29,6 +29,14 @@ PROVIDER_MODELS = {
'gemini/gemini-2.0-flash-lite-preview-02-05': os.getenv("GEMINI_API_KEY"), 'gemini/gemini-2.0-flash-lite-preview-02-05': os.getenv("GEMINI_API_KEY"),
"deepseek/deepseek-chat": os.getenv("DEEPSEEK_API_KEY"), "deepseek/deepseek-chat": os.getenv("DEEPSEEK_API_KEY"),
} }
PROVIDER_MODELS_PREFIXES = {
"ollama": "no-token-needed", # Any model from Ollama no need for API token
"groq": os.getenv("GROQ_API_KEY"),
"openai": os.getenv("OPENAI_API_KEY"),
"anthropic": os.getenv("ANTHROPIC_API_KEY"),
"gemini": os.getenv("GEMINI_API_KEY"),
"deepseek": os.getenv("DEEPSEEK_API_KEY"),
}
# Chunk token threshold # Chunk token threshold
CHUNK_TOKEN_THRESHOLD = 2**11 # 2048 tokens CHUNK_TOKEN_THRESHOLD = 2**11 # 2048 tokens

View File

@@ -7,7 +7,9 @@ import time
from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION, JSON_SCHEMA_BUILDER_XPATH, PROMPT_EXTRACT_INFERRED_SCHEMA from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION, JSON_SCHEMA_BUILDER_XPATH, PROMPT_EXTRACT_INFERRED_SCHEMA
from .config import ( from .config import (
DEFAULT_PROVIDER, CHUNK_TOKEN_THRESHOLD, DEFAULT_PROVIDER,
DEFAULT_PROVIDER_API_KEY,
CHUNK_TOKEN_THRESHOLD,
OVERLAP_RATE, OVERLAP_RATE,
WORD_TOKEN_RATE, WORD_TOKEN_RATE,
) )
@@ -542,6 +544,11 @@ class LLMExtractionStrategy(ExtractionStrategy):
""" """
super().__init__( input_format=input_format, **kwargs) super().__init__( input_format=input_format, **kwargs)
self.llm_config = llm_config self.llm_config = llm_config
if not self.llm_config:
self.llm_config = create_llm_config(
provider=DEFAULT_PROVIDER,
api_token=os.environ.get(DEFAULT_PROVIDER_API_KEY),
)
self.instruction = instruction self.instruction = instruction
self.extract_type = extraction_type self.extract_type = extraction_type
self.schema = schema self.schema = schema

View File

@@ -40,10 +40,25 @@ def setup_home_directory():
f.write("") f.write("")
def post_install(): def post_install():
"""Run all post-installation tasks""" """
Run all post-installation tasks.
Checks CRAWL4AI_MODE environment variable. If set to 'api',
skips Playwright browser installation.
"""
logger.info("Running post-installation setup...", tag="INIT") logger.info("Running post-installation setup...", tag="INIT")
setup_home_directory() setup_home_directory()
install_playwright()
# Check environment variable to conditionally skip Playwright install
run_mode = os.getenv('CRAWL4AI_MODE')
if run_mode == 'api':
logger.warning(
"CRAWL4AI_MODE=api detected. Skipping Playwright browser installation.",
tag="SETUP"
)
else:
# Proceed with installation only if mode is not 'api'
install_playwright()
run_migration() run_migration()
# TODO: Will be added in the future # TODO: Will be added in the future
# setup_builtin_browser() # setup_builtin_browser()

View File

@@ -4,6 +4,9 @@ from itertools import cycle
import os import os
########### ATTENTION PEOPLE OF EARTH ###########
# I have moved this config to async_configs.py, kept it here, in case someone still importing it, however
# be a dear and follow `from crawl4ai import ProxyConfig` instead :)
class ProxyConfig: class ProxyConfig:
def __init__( def __init__(
self, self,
@@ -119,12 +122,12 @@ class ProxyRotationStrategy(ABC):
"""Base abstract class for proxy rotation strategies""" """Base abstract class for proxy rotation strategies"""
@abstractmethod @abstractmethod
async def get_next_proxy(self) -> Optional[Dict]: async def get_next_proxy(self) -> Optional[ProxyConfig]:
"""Get next proxy configuration from the strategy""" """Get next proxy configuration from the strategy"""
pass pass
@abstractmethod @abstractmethod
def add_proxies(self, proxies: List[Dict]): def add_proxies(self, proxies: List[ProxyConfig]):
"""Add proxy configurations to the strategy""" """Add proxy configurations to the strategy"""
pass pass

View File

@@ -9,83 +9,44 @@ from urllib.parse import urlparse
import OpenSSL.crypto import OpenSSL.crypto
from pathlib import Path from pathlib import Path
# === Inherit from dict ===
class SSLCertificate: class SSLCertificate(dict):
""" """
A class representing an SSL certificate with methods to export in various formats. A class representing an SSL certificate, behaving like a dictionary
for direct JSON serialization. It stores the certificate information internally
and provides methods for export and property access.
Attributes: Inherits from dict, so instances are directly JSON serializable.
cert_info (Dict[str, Any]): The certificate information.
Methods:
from_url(url: str, timeout: int = 10) -> Optional['SSLCertificate']: Create SSLCertificate instance from a URL.
from_file(file_path: str) -> Optional['SSLCertificate']: Create SSLCertificate instance from a file.
from_binary(binary_data: bytes) -> Optional['SSLCertificate']: Create SSLCertificate instance from binary data.
export_as_pem() -> str: Export the certificate as PEM format.
export_as_der() -> bytes: Export the certificate as DER format.
export_as_json() -> Dict[str, Any]: Export the certificate as JSON format.
export_as_text() -> str: Export the certificate as text format.
""" """
# Use __slots__ for potential memory optimization if desired, though less common when inheriting dict
# __slots__ = ("_cert_info",) # If using slots, be careful with dict inheritance interaction
def __init__(self, cert_info: Dict[str, Any]): def __init__(self, cert_info: Dict[str, Any]):
self._cert_info = self._decode_cert_data(cert_info)
@staticmethod
def from_url(url: str, timeout: int = 10) -> Optional["SSLCertificate"]:
""" """
Create SSLCertificate instance from a URL. Initializes the SSLCertificate object.
Args: Args:
url (str): URL of the website. cert_info (Dict[str, Any]): The raw certificate dictionary.
timeout (int): Timeout for the connection (default: 10).
Returns:
Optional[SSLCertificate]: SSLCertificate instance if successful, None otherwise.
""" """
try: # 1. Decode the data (handle bytes -> str)
hostname = urlparse(url).netloc decoded_info = self._decode_cert_data(cert_info)
if ":" in hostname:
hostname = hostname.split(":")[0]
context = ssl.create_default_context() # 2. Store the decoded info internally (optional but good practice)
with socket.create_connection((hostname, 443), timeout=timeout) as sock: # self._cert_info = decoded_info # You can keep this if methods rely on it
with context.wrap_socket(sock, server_hostname=hostname) as ssock:
cert_binary = ssock.getpeercert(binary_form=True)
x509 = OpenSSL.crypto.load_certificate(
OpenSSL.crypto.FILETYPE_ASN1, cert_binary
)
cert_info = { # 3. Initialize the dictionary part of the object with the decoded data
"subject": dict(x509.get_subject().get_components()), super().__init__(decoded_info)
"issuer": dict(x509.get_issuer().get_components()),
"version": x509.get_version(),
"serial_number": hex(x509.get_serial_number()),
"not_before": x509.get_notBefore(),
"not_after": x509.get_notAfter(),
"fingerprint": x509.digest("sha256").hex(),
"signature_algorithm": x509.get_signature_algorithm(),
"raw_cert": base64.b64encode(cert_binary),
}
# Add extensions
extensions = []
for i in range(x509.get_extension_count()):
ext = x509.get_extension(i)
extensions.append(
{"name": ext.get_short_name(), "value": str(ext)}
)
cert_info["extensions"] = extensions
return SSLCertificate(cert_info)
except Exception:
return None
@staticmethod @staticmethod
def _decode_cert_data(data: Any) -> Any: def _decode_cert_data(data: Any) -> Any:
"""Helper method to decode bytes in certificate data.""" """Helper method to decode bytes in certificate data."""
if isinstance(data, bytes): if isinstance(data, bytes):
return data.decode("utf-8") try:
# Try UTF-8 first, fallback to latin-1 for arbitrary bytes
return data.decode("utf-8")
except UnicodeDecodeError:
return data.decode("latin-1") # Or handle as needed, maybe hex representation
elif isinstance(data, dict): elif isinstance(data, dict):
return { return {
( (
@@ -97,36 +58,119 @@ class SSLCertificate:
return [SSLCertificate._decode_cert_data(item) for item in data] return [SSLCertificate._decode_cert_data(item) for item in data]
return data return data
@staticmethod
def from_url(url: str, timeout: int = 10) -> Optional["SSLCertificate"]:
"""
Create SSLCertificate instance from a URL. Fetches cert info and initializes.
(Fetching logic remains the same)
"""
cert_info_raw = None # Variable to hold the fetched dict
try:
hostname = urlparse(url).netloc
if ":" in hostname:
hostname = hostname.split(":")[0]
context = ssl.create_default_context()
# Set check_hostname to False and verify_mode to CERT_NONE temporarily
# for potentially problematic certificates during fetch, but parse the result regardless.
# context.check_hostname = False
# context.verify_mode = ssl.CERT_NONE
with socket.create_connection((hostname, 443), timeout=timeout) as sock:
with context.wrap_socket(sock, server_hostname=hostname) as ssock:
cert_binary = ssock.getpeercert(binary_form=True)
if not cert_binary:
print(f"Warning: No certificate returned for {hostname}")
return None
x509 = OpenSSL.crypto.load_certificate(
OpenSSL.crypto.FILETYPE_ASN1, cert_binary
)
# Create the dictionary directly
cert_info_raw = {
"subject": dict(x509.get_subject().get_components()),
"issuer": dict(x509.get_issuer().get_components()),
"version": x509.get_version(),
"serial_number": hex(x509.get_serial_number()),
"not_before": x509.get_notBefore(), # Keep as bytes initially, _decode handles it
"not_after": x509.get_notAfter(), # Keep as bytes initially
"fingerprint": x509.digest("sha256").hex(), # hex() is already string
"signature_algorithm": x509.get_signature_algorithm(), # Keep as bytes
"raw_cert": base64.b64encode(cert_binary), # Base64 is bytes, _decode handles it
}
# Add extensions
extensions = []
for i in range(x509.get_extension_count()):
ext = x509.get_extension(i)
# get_short_name() returns bytes, str(ext) handles value conversion
extensions.append(
{"name": ext.get_short_name(), "value": str(ext)}
)
cert_info_raw["extensions"] = extensions
except ssl.SSLCertVerificationError as e:
print(f"SSL Verification Error for {url}: {e}")
# Decide if you want to proceed or return None based on your needs
# You might try fetching without verification here if needed, but be cautious.
return None
except socket.gaierror:
print(f"Could not resolve hostname: {hostname}")
return None
except socket.timeout:
print(f"Connection timed out for {url}")
return None
except Exception as e:
print(f"Error fetching/processing certificate for {url}: {e}")
# Log the full error details if needed: logging.exception("Cert fetch error")
return None
# If successful, create the SSLCertificate instance from the dictionary
if cert_info_raw:
return SSLCertificate(cert_info_raw)
else:
return None
# --- Properties now access the dictionary items directly via self[] ---
@property
def issuer(self) -> Dict[str, str]:
return self.get("issuer", {}) # Use self.get for safety
@property
def subject(self) -> Dict[str, str]:
return self.get("subject", {})
@property
def valid_from(self) -> str:
return self.get("not_before", "")
@property
def valid_until(self) -> str:
return self.get("not_after", "")
@property
def fingerprint(self) -> str:
return self.get("fingerprint", "")
# --- Export methods can use `self` directly as it is the dict ---
def to_json(self, filepath: Optional[str] = None) -> Optional[str]: def to_json(self, filepath: Optional[str] = None) -> Optional[str]:
""" """Export certificate as JSON."""
Export certificate as JSON. # `self` is already the dictionary we want to serialize
json_str = json.dumps(self, indent=2, ensure_ascii=False)
Args:
filepath (Optional[str]): Path to save the JSON file (default: None).
Returns:
Optional[str]: JSON string if successful, None otherwise.
"""
json_str = json.dumps(self._cert_info, indent=2, ensure_ascii=False)
if filepath: if filepath:
Path(filepath).write_text(json_str, encoding="utf-8") Path(filepath).write_text(json_str, encoding="utf-8")
return None return None
return json_str return json_str
def to_pem(self, filepath: Optional[str] = None) -> Optional[str]: def to_pem(self, filepath: Optional[str] = None) -> Optional[str]:
""" """Export certificate as PEM."""
Export certificate as PEM.
Args:
filepath (Optional[str]): Path to save the PEM file (default: None).
Returns:
Optional[str]: PEM string if successful, None otherwise.
"""
try: try:
# Decode the raw_cert (which should be string due to _decode)
raw_cert_bytes = base64.b64decode(self.get("raw_cert", ""))
x509 = OpenSSL.crypto.load_certificate( x509 = OpenSSL.crypto.load_certificate(
OpenSSL.crypto.FILETYPE_ASN1, OpenSSL.crypto.FILETYPE_ASN1, raw_cert_bytes
base64.b64decode(self._cert_info["raw_cert"]),
) )
pem_data = OpenSSL.crypto.dump_certificate( pem_data = OpenSSL.crypto.dump_certificate(
OpenSSL.crypto.FILETYPE_PEM, x509 OpenSSL.crypto.FILETYPE_PEM, x509
@@ -136,49 +180,25 @@ class SSLCertificate:
Path(filepath).write_text(pem_data, encoding="utf-8") Path(filepath).write_text(pem_data, encoding="utf-8")
return None return None
return pem_data return pem_data
except Exception: except Exception as e:
return None print(f"Error converting to PEM: {e}")
return None
def to_der(self, filepath: Optional[str] = None) -> Optional[bytes]: def to_der(self, filepath: Optional[str] = None) -> Optional[bytes]:
""" """Export certificate as DER."""
Export certificate as DER.
Args:
filepath (Optional[str]): Path to save the DER file (default: None).
Returns:
Optional[bytes]: DER bytes if successful, None otherwise.
"""
try: try:
der_data = base64.b64decode(self._cert_info["raw_cert"]) # Decode the raw_cert (which should be string due to _decode)
der_data = base64.b64decode(self.get("raw_cert", ""))
if filepath: if filepath:
Path(filepath).write_bytes(der_data) Path(filepath).write_bytes(der_data)
return None return None
return der_data return der_data
except Exception: except Exception as e:
return None print(f"Error converting to DER: {e}")
return None
@property # Optional: Add __repr__ for better debugging
def issuer(self) -> Dict[str, str]: def __repr__(self) -> str:
"""Get certificate issuer information.""" subject_cn = self.subject.get('CN', 'N/A')
return self._cert_info.get("issuer", {}) issuer_cn = self.issuer.get('CN', 'N/A')
return f"<SSLCertificate Subject='{subject_cn}' Issuer='{issuer_cn}'>"
@property
def subject(self) -> Dict[str, str]:
"""Get certificate subject information."""
return self._cert_info.get("subject", {})
@property
def valid_from(self) -> str:
"""Get certificate validity start date."""
return self._cert_info.get("not_before", "")
@property
def valid_until(self) -> str:
"""Get certificate validity end date."""
return self._cert_info.get("not_after", "")
@property
def fingerprint(self) -> str:
"""Get certificate fingerprint."""
return self._cert_info.get("fingerprint", "")

644
deploy/docker/README-new.md Normal file
View File

@@ -0,0 +1,644 @@
# Crawl4AI Docker Guide 🐳
## Table of Contents
- [Prerequisites](#prerequisites)
- [Installation](#installation)
- [Option 1: Using Docker Compose (Recommended)](#option-1-using-docker-compose-recommended)
- [Option 2: Manual Local Build & Run](#option-2-manual-local-build--run)
- [Option 3: Using Pre-built Docker Hub Images](#option-3-using-pre-built-docker-hub-images)
- [Dockerfile Parameters](#dockerfile-parameters)
- [Using the API](#using-the-api)
- [Understanding Request Schema](#understanding-request-schema)
- [REST API Examples](#rest-api-examples)
- [Python SDK](#python-sdk)
- [Metrics & Monitoring](#metrics--monitoring)
- [Deployment Scenarios](#deployment-scenarios)
- [Complete Examples](#complete-examples)
- [Server Configuration](#server-configuration)
- [Understanding config.yml](#understanding-configyml)
- [JWT Authentication](#jwt-authentication)
- [Configuration Tips and Best Practices](#configuration-tips-and-best-practices)
- [Customizing Your Configuration](#customizing-your-configuration)
- [Configuration Recommendations](#configuration-recommendations)
- [Getting Help](#getting-help)
## Prerequisites
Before we dive in, make sure you have:
- Docker installed and running (version 20.10.0 or higher), including `docker compose` (usually bundled with Docker Desktop).
- `git` for cloning the repository.
- At least 4GB of RAM available for the container (more recommended for heavy use).
- Python 3.10+ (if using the Python SDK).
- Node.js 16+ (if using the Node.js examples).
> 💡 **Pro tip**: Run `docker info` to check your Docker installation and available resources.
## Installation
We offer several ways to get the Crawl4AI server running. Docker Compose is the easiest way to manage local builds and runs.
### Option 1: Using Docker Compose (Recommended)
Docker Compose simplifies building and running the service, especially for local development and testing across different platforms.
#### 1. Clone Repository
```bash
git clone https://github.com/unclecode/crawl4ai.git
cd crawl4ai
```
#### 2. Environment Setup (API Keys)
If you plan to use LLMs, copy the example environment file and add your API keys. This file should be in the **project root directory**.
```bash
# Make sure you are in the 'crawl4ai' root directory
cp deploy/docker/.llm.env.example .llm.env
# Now edit .llm.env and add your API keys
# Example content:
# OPENAI_API_KEY=sk-your-key
# ANTHROPIC_API_KEY=your-anthropic-key
# ...
```
> 🔑 **Note**: Keep your API keys secure! Never commit `.llm.env` to version control.
#### 3. Build and Run with Compose
The `docker-compose.yml` file in the project root defines services for different scenarios using **profiles**.
* **Build and Run Locally (AMD64):**
```bash
# Builds the image locally using Dockerfile and runs it
docker compose --profile local-amd64 up --build -d
```
* **Build and Run Locally (ARM64):**
```bash
# Builds the image locally using Dockerfile and runs it
docker compose --profile local-arm64 up --build -d
```
* **Run Pre-built Image from Docker Hub (AMD64):**
```bash
# Pulls and runs the specified AMD64 image from Docker Hub
# (Set VERSION env var for specific tags, e.g., VERSION=0.5.1-d1)
docker compose --profile hub-amd64 up -d
```
* **Run Pre-built Image from Docker Hub (ARM64):**
```bash
# Pulls and runs the specified ARM64 image from Docker Hub
docker compose --profile hub-arm64 up -d
```
> The server will be available at `http://localhost:11235`.
#### 4. Stopping Compose Services
```bash
# Stop the service(s) associated with a profile (e.g., local-amd64)
docker compose --profile local-amd64 down
```
### Option 2: Manual Local Build & Run
If you prefer not to use Docker Compose for local builds.
#### 1. Clone Repository & Setup Environment
Follow steps 1 and 2 from the Docker Compose section above (clone repo, `cd crawl4ai`, create `.llm.env` in the root).
#### 2. Build the Image (Multi-Arch)
Use `docker buildx` to build the image. This example builds for multiple platforms and loads the image matching your host architecture into the local Docker daemon.
```bash
# Make sure you are in the 'crawl4ai' root directory
docker buildx build --platform linux/amd64,linux/arm64 -t crawl4ai-local:latest --load .
```
#### 3. Run the Container
* **Basic run (no LLM support):**
```bash
# Replace --platform if your host is ARM64
docker run -d \
-p 11235:11235 \
--name crawl4ai-standalone \
--shm-size=1g \
--platform linux/amd64 \
crawl4ai-local:latest
```
* **With LLM support:**
```bash
# Make sure .llm.env is in the current directory (project root)
# Replace --platform if your host is ARM64
docker run -d \
-p 11235:11235 \
--name crawl4ai-standalone \
--env-file .llm.env \
--shm-size=1g \
--platform linux/amd64 \
crawl4ai-local:latest
```
> The server will be available at `http://localhost:11235`.
#### 4. Stopping the Manual Container
```bash
docker stop crawl4ai-standalone && docker rm crawl4ai-standalone
```
### Option 3: Using Pre-built Docker Hub Images
Pull and run images directly from Docker Hub without building locally.
#### 1. Pull the Image
We use a versioning scheme like `LIBRARY_VERSION-dREVISION` (e.g., `0.5.1-d1`). The `latest` tag points to the most recent stable release. Images are built with multi-arch manifests, so Docker usually pulls the correct version for your system automatically.
```bash
# Pull a specific version (recommended for stability)
docker pull unclecode/crawl4ai:0.5.1-d1
# Or pull the latest stable version
docker pull unclecode/crawl4ai:latest
```
#### 2. Setup Environment (API Keys)
If using LLMs, create the `.llm.env` file in a directory of your choice, similar to Step 2 in the Compose section.
#### 3. Run the Container
* **Basic run:**
```bash
docker run -d \
-p 11235:11235 \
--name crawl4ai-hub \
--shm-size=1g \
unclecode/crawl4ai:0.5.1-d1 # Or use :latest
```
* **With LLM support:**
```bash
# Make sure .llm.env is in the current directory you are running docker from
docker run -d \
-p 11235:11235 \
--name crawl4ai-hub \
--env-file .llm.env \
--shm-size=1g \
unclecode/crawl4ai:0.5.1-d1 # Or use :latest
```
> The server will be available at `http://localhost:11235`.
#### 4. Stopping the Hub Container
```bash
docker stop crawl4ai-hub && docker rm crawl4ai-hub
```
#### Docker Hub Versioning Explained
* **Image Name:** `unclecode/crawl4ai`
* **Tag Format:** `LIBRARY_VERSION-dREVISION`
* `LIBRARY_VERSION`: The Semantic Version of the core `crawl4ai` Python library included (e.g., `0.5.1`).
* `dREVISION`: An incrementing number (starting at `d1`) for Docker build changes made *without* changing the library version (e.g., base image updates, dependency fixes). Resets to `d1` for each new `LIBRARY_VERSION`.
* **Example:** `unclecode/crawl4ai:0.5.1-d1`
* **`latest` Tag:** Points to the most recent stable `LIBRARY_VERSION-dREVISION`.
* **Multi-Arch:** Images support `linux/amd64` and `linux/arm64`. Docker automatically selects the correct architecture.
---
*(Rest of the document remains largely the same, but with key updates below)*
---
## Dockerfile Parameters
You can customize the image build process using build arguments (`--build-arg`). These are typically used via `docker buildx build` or within the `docker-compose.yml` file.
```bash
# Example: Build with 'all' features using buildx
docker buildx build \
--platform linux/amd64,linux/arm64 \
--build-arg INSTALL_TYPE=all \
-t yourname/crawl4ai-all:latest \
--load \
. # Build from root context
```
### Build Arguments Explained
| Argument | Description | Default | Options |
| :----------- | :--------------------------------------- | :-------- | :--------------------------------- |
| INSTALL_TYPE | Feature set | `default` | `default`, `all`, `torch`, `transformer` |
| ENABLE_GPU | GPU support (CUDA for AMD64) | `false` | `true`, `false` |
| APP_HOME | Install path inside container (advanced) | `/app` | any valid path |
| USE_LOCAL | Install library from local source | `true` | `true`, `false` |
| GITHUB_REPO | Git repo to clone if USE_LOCAL=false | *(see Dockerfile)* | any git URL |
| GITHUB_BRANCH| Git branch to clone if USE_LOCAL=false | `main` | any branch name |
*(Note: PYTHON_VERSION is fixed by the `FROM` instruction in the Dockerfile)*
### Build Best Practices
1. **Choose the Right Install Type**
* `default`: Basic installation, smallest image size. Suitable for most standard web scraping and markdown generation.
* `all`: Full features including `torch` and `transformers` for advanced extraction strategies (e.g., CosineStrategy, certain LLM filters). Significantly larger image. Ensure you need these extras.
2. **Platform Considerations**
* Use `buildx` for building multi-architecture images, especially for pushing to registries.
* Use `docker compose` profiles (`local-amd64`, `local-arm64`) for easy platform-specific local builds.
3. **Performance Optimization**
* The image automatically includes platform-specific optimizations (OpenMP for AMD64, OpenBLAS for ARM64).
---
## Using the API
Communicate with the running Docker server via its REST API (defaulting to `http://localhost:11235`). You can use the Python SDK or make direct HTTP requests.
### Python SDK
Install the SDK: `pip install crawl4ai`
```python
import asyncio
from crawl4ai.docker_client import Crawl4aiDockerClient
from crawl4ai import BrowserConfig, CrawlerRunConfig, CacheMode # Assuming you have crawl4ai installed
async def main():
# Point to the correct server port
async with Crawl4aiDockerClient(base_url="http://localhost:11235", verbose=True) as client:
# If JWT is enabled on the server, authenticate first:
# await client.authenticate("user@example.com") # See Server Configuration section
# Example Non-streaming crawl
print("--- Running Non-Streaming Crawl ---")
results = await client.crawl(
["https://httpbin.org/html"],
browser_config=BrowserConfig(headless=True), # Use library classes for config aid
crawler_config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
)
if results: # client.crawl returns None on failure
print(f"Non-streaming results success: {results.success}")
if results.success:
for result in results: # Iterate through the CrawlResultContainer
print(f"URL: {result.url}, Success: {result.success}")
else:
print("Non-streaming crawl failed.")
# Example Streaming crawl
print("\n--- Running Streaming Crawl ---")
stream_config = CrawlerRunConfig(stream=True, cache_mode=CacheMode.BYPASS)
try:
async for result in await client.crawl( # client.crawl returns an async generator for streaming
["https://httpbin.org/html", "https://httpbin.org/links/5/0"],
browser_config=BrowserConfig(headless=True),
crawler_config=stream_config
):
print(f"Streamed result: URL: {result.url}, Success: {result.success}")
except Exception as e:
print(f"Streaming crawl failed: {e}")
# Example Get schema
print("\n--- Getting Schema ---")
schema = await client.get_schema()
print(f"Schema received: {bool(schema)}") # Print whether schema was received
if __name__ == "__main__":
asyncio.run(main())
```
*(SDK parameters like timeout, verify_ssl etc. remain the same)*
### Direct API Calls (REST)
Crucially, when sending configurations directly via JSON, they **must** follow the `{"type": "ClassName", "params": {...}}` structure for any non-primitive value (like config objects or strategies). Dictionaries must be wrapped as `{"type": "dict", "value": {...}}`.
*(Keep the detailed explanation of Configuration Structure, Basic Pattern, Simple vs Complex, Strategy Pattern, Complex Nested Example, Quick Grammar Overview, Important Rules, Pro Tip)*
#### More Examples *(Ensure Schema example uses type/value wrapper)*
**Advanced Crawler Configuration**
*(Keep example, ensure cache_mode uses valid enum value like "bypass")*
**Extraction Strategy**
```json
{
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"extraction_strategy": {
"type": "JsonCssExtractionStrategy",
"params": {
"schema": {
"type": "dict",
"value": {
"baseSelector": "article.post",
"fields": [
{"name": "title", "selector": "h1", "type": "text"},
{"name": "content", "selector": ".content", "type": "html"}
]
}
}
}
}
}
}
}
```
**LLM Extraction Strategy** *(Keep example, ensure schema uses type/value wrapper)*
*(Keep Deep Crawler Example)*
### REST API Examples
Update URLs to use port `11235`.
#### Simple Crawl
```python
import requests
# Configuration objects converted to the required JSON structure
browser_config_payload = {
"type": "BrowserConfig",
"params": {"headless": True}
}
crawler_config_payload = {
"type": "CrawlerRunConfig",
"params": {"stream": False, "cache_mode": "bypass"} # Use string value of enum
}
crawl_payload = {
"urls": ["https://httpbin.org/html"],
"browser_config": browser_config_payload,
"crawler_config": crawler_config_payload
}
response = requests.post(
"http://localhost:11235/crawl", # Updated port
# headers={"Authorization": f"Bearer {token}"}, # If JWT is enabled
json=crawl_payload
)
print(f"Status Code: {response.status_code}")
if response.ok:
print(response.json())
else:
print(f"Error: {response.text}")
```
#### Streaming Results
```python
import json
import httpx # Use httpx for async streaming example
async def test_stream_crawl(token: str = None): # Made token optional
"""Test the /crawl/stream endpoint with multiple URLs."""
url = "http://localhost:11235/crawl/stream" # Updated port
payload = {
"urls": [
"https://httpbin.org/html",
"https://httpbin.org/links/5/0",
],
"browser_config": {
"type": "BrowserConfig",
"params": {"headless": True, "viewport": {"type": "dict", "value": {"width": 1200, "height": 800}}} # Viewport needs type:dict
},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {"stream": True, "cache_mode": "bypass"}
}
}
headers = {}
# if token:
# headers = {"Authorization": f"Bearer {token}"} # If JWT is enabled
try:
async with httpx.AsyncClient() as client:
async with client.stream("POST", url, json=payload, headers=headers, timeout=120.0) as response:
print(f"Status: {response.status_code} (Expected: 200)")
response.raise_for_status() # Raise exception for bad status codes
# Read streaming response line-by-line (NDJSON)
async for line in response.aiter_lines():
if line:
try:
data = json.loads(line)
# Check for completion marker
if data.get("status") == "completed":
print("Stream completed.")
break
print(f"Streamed Result: {json.dumps(data, indent=2)}")
except json.JSONDecodeError:
print(f"Warning: Could not decode JSON line: {line}")
except httpx.HTTPStatusError as e:
print(f"HTTP error occurred: {e.response.status_code} - {e.response.text}")
except Exception as e:
print(f"Error in streaming crawl test: {str(e)}")
# To run this example:
# import asyncio
# asyncio.run(test_stream_crawl())
```
---
## Metrics & Monitoring
Keep an eye on your crawler with these endpoints:
- `/health` - Quick health check
- `/metrics` - Detailed Prometheus metrics
- `/schema` - Full API schema
Example health check:
```bash
curl http://localhost:11235/health
```
---
*(Deployment Scenarios and Complete Examples sections remain the same, maybe update links if examples moved)*
---
## Server Configuration
The server's behavior can be customized through the `config.yml` file.
### Understanding config.yml
The configuration file is loaded from `/app/config.yml` inside the container. By default, the file from `deploy/docker/config.yml` in the repository is copied there during the build.
Here's a detailed breakdown of the configuration options (using defaults from `deploy/docker/config.yml`):
```yaml
# Application Configuration
app:
title: "Crawl4AI API"
version: "1.0.0" # Consider setting this to match library version, e.g., "0.5.1"
host: "0.0.0.0"
port: 8020 # NOTE: This port is used ONLY when running server.py directly. Gunicorn overrides this (see supervisord.conf).
reload: False # Default set to False - suitable for production
timeout_keep_alive: 300
# Default LLM Configuration
llm:
provider: "openai/gpt-4o-mini"
api_key_env: "OPENAI_API_KEY"
# api_key: sk-... # If you pass the API key directly then api_key_env will be ignored
# Redis Configuration (Used by internal Redis server managed by supervisord)
redis:
host: "localhost"
port: 6379
db: 0
password: ""
# ... other redis options ...
# Rate Limiting Configuration
rate_limiting:
enabled: True
default_limit: "1000/minute"
trusted_proxies: []
storage_uri: "memory://" # Use "redis://localhost:6379" if you need persistent/shared limits
# Security Configuration
security:
enabled: false # Master toggle for security features
jwt_enabled: false # Enable JWT authentication (requires security.enabled=true)
https_redirect: false # Force HTTPS (requires security.enabled=true)
trusted_hosts: ["*"] # Allowed hosts (use specific domains in production)
headers: # Security headers (applied if security.enabled=true)
x_content_type_options: "nosniff"
x_frame_options: "DENY"
content_security_policy: "default-src 'self'"
strict_transport_security: "max-age=63072000; includeSubDomains"
# Crawler Configuration
crawler:
memory_threshold_percent: 95.0
rate_limiter:
base_delay: [1.0, 2.0] # Min/max delay between requests in seconds for dispatcher
timeouts:
stream_init: 30.0 # Timeout for stream initialization
batch_process: 300.0 # Timeout for non-streaming /crawl processing
# Logging Configuration
logging:
level: "INFO"
format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
# Observability Configuration
observability:
prometheus:
enabled: True
endpoint: "/metrics"
health_check:
endpoint: "/health"
```
*(JWT Authentication section remains the same, just note the default port is now 11235 for requests)*
*(Configuration Tips and Best Practices remain the same)*
### Customizing Your Configuration
You can override the default `config.yml`.
#### Method 1: Modify Before Build
1. Edit the `deploy/docker/config.yml` file in your local repository clone.
2. Build the image using `docker buildx` or `docker compose --profile local-... up --build`. The modified file will be copied into the image.
#### Method 2: Runtime Mount (Recommended for Custom Deploys)
1. Create your custom configuration file, e.g., `my-custom-config.yml` locally. Ensure it contains all necessary sections.
2. Mount it when running the container:
* **Using `docker run`:**
```bash
# Assumes my-custom-config.yml is in the current directory
docker run -d -p 11235:11235 \
--name crawl4ai-custom-config \
--env-file .llm.env \
--shm-size=1g \
-v $(pwd)/my-custom-config.yml:/app/config.yml \
unclecode/crawl4ai:latest # Or your specific tag
```
* **Using `docker-compose.yml`:** Add a `volumes` section to the service definition:
```yaml
services:
crawl4ai-hub-amd64: # Or your chosen service
image: unclecode/crawl4ai:latest
profiles: ["hub-amd64"]
<<: *base-config
volumes:
# Mount local custom config over the default one in the container
- ./my-custom-config.yml:/app/config.yml
# Keep the shared memory volume from base-config
- /dev/shm:/dev/shm
```
*(Note: Ensure `my-custom-config.yml` is in the same directory as `docker-compose.yml`)*
> 💡 When mounting, your custom file *completely replaces* the default one. Ensure it's a valid and complete configuration.
### Configuration Recommendations
1. **Security First** 🔒
- Always enable security in production
- Use specific trusted_hosts instead of wildcards
- Set up proper rate limiting to protect your server
- Consider your environment before enabling HTTPS redirect
2. **Resource Management** 💻
- Adjust memory_threshold_percent based on available RAM
- Set timeouts according to your content size and network conditions
- Use Redis for rate limiting in multi-container setups
3. **Monitoring** 📊
- Enable Prometheus if you need metrics
- Set DEBUG logging in development, INFO in production
- Regular health check monitoring is crucial
4. **Performance Tuning** ⚡
- Start with conservative rate limiter delays
- Increase batch_process timeout for large content
- Adjust stream_init timeout based on initial response times
## Getting Help
We're here to help you succeed with Crawl4AI! Here's how to get support:
- 📖 Check our [full documentation](https://docs.crawl4ai.com)
- 🐛 Found a bug? [Open an issue](https://github.com/unclecode/crawl4ai/issues)
- 💬 Join our [Discord community](https://discord.gg/crawl4ai)
- ⭐ Star us on GitHub to show support!
## Summary
In this guide, we've covered everything you need to get started with Crawl4AI's Docker deployment:
- Building and running the Docker container
- Configuring the environment
- Making API requests with proper typing
- Using the Python SDK
- Monitoring your deployment
Remember, the examples in the `examples` folder are your friends - they show real-world usage patterns that you can adapt for your needs.
Keep exploring, and don't hesitate to reach out if you need help! We're building something amazing together. 🚀
Happy crawling! 🕷️

View File

@@ -391,21 +391,25 @@ async def handle_crawl_request(
) )
) )
async with AsyncWebCrawler(config=browser_config) as crawler: crawler: AsyncWebCrawler = AsyncWebCrawler(config=browser_config)
results = [] await crawler.start()
func = getattr(crawler, "arun" if len(urls) == 1 else "arun_many") results = []
partial_func = partial(func, func = getattr(crawler, "arun" if len(urls) == 1 else "arun_many")
urls[0] if len(urls) == 1 else urls, partial_func = partial(func,
config=crawler_config, urls[0] if len(urls) == 1 else urls,
dispatcher=dispatcher) config=crawler_config,
results = await partial_func() dispatcher=dispatcher)
return { results = await partial_func()
"success": True, await crawler.close()
"results": [result.model_dump() for result in results] return {
} "success": True,
"results": [result.model_dump() for result in results]
}
except Exception as e: except Exception as e:
logger.error(f"Crawl error: {str(e)}", exc_info=True) logger.error(f"Crawl error: {str(e)}", exc_info=True)
if 'crawler' in locals():
await crawler.close()
raise HTTPException( raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=str(e) detail=str(e)

View File

@@ -4,7 +4,7 @@ app:
version: "1.0.0" version: "1.0.0"
host: "0.0.0.0" host: "0.0.0.0"
port: 8020 port: 8020
reload: True reload: False
timeout_keep_alive: 300 timeout_keep_alive: 300
# Default LLM Configuration # Default LLM Configuration

View File

@@ -1,4 +1,3 @@
crawl4ai
fastapi fastapi
uvicorn uvicorn
gunicorn>=23.0.0 gunicorn>=23.0.0

View File

@@ -1,12 +1,28 @@
[supervisord] [supervisord]
nodaemon=true nodaemon=true ; Run supervisord in the foreground
logfile=/dev/null ; Log supervisord output to stdout/stderr
logfile_maxbytes=0
[program:redis] [program:redis]
command=redis-server command=/usr/bin/redis-server --loglevel notice ; Path to redis-server on Alpine
user=appuser ; Run redis as our non-root user
autorestart=true autorestart=true
priority=10 priority=10
stdout_logfile=/dev/stdout ; Redirect redis stdout to container stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr ; Redirect redis stderr to container stderr
stderr_logfile_maxbytes=0
[program:gunicorn] [program:gunicorn]
command=gunicorn --bind 0.0.0.0:8000 --workers 4 --threads 2 --timeout 300 --graceful-timeout 60 --keep-alive 65 --log-level debug --worker-class uvicorn.workers.UvicornWorker --max-requests 1000 --max-requests-jitter 50 server:app command=/usr/local/bin/gunicorn --bind 0.0.0.0:11235 --workers 2 --threads 2 --timeout 120 --graceful-timeout 30 --keep-alive 60 --log-level info --worker-class uvicorn.workers.UvicornWorker server:app
directory=/app ; Working directory for the app
user=appuser ; Run gunicorn as our non-root user
autorestart=true autorestart=true
priority=20 priority=20
environment=PYTHONUNBUFFERED=1 ; Ensure Python output is sent straight to logs
stdout_logfile=/dev/stdout ; Redirect gunicorn stdout to container stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr ; Redirect gunicorn stderr to container stderr
stderr_logfile_maxbytes=0
# Optional: Add filebeat or other logging agents here if needed

View File

@@ -1,15 +1,30 @@
# Base configuration (not a service, just a reusable config block) # docker-compose.yml
# Base configuration anchor for reusability
x-base-config: &base-config x-base-config: &base-config
ports: ports:
# Map host port 11235 to container port 11235 (where Gunicorn will listen)
- "11235:11235" - "11235:11235"
- "8000:8000" # - "8080:8080" # Uncomment if needed
- "9222:9222"
- "8080:8080" # Load API keys primarily from .llm.env file
# Create .llm.env in the root directory .llm.env.example
env_file:
- .llm.env
# Define environment variables, allowing overrides from host environment
# Syntax ${VAR:-} uses host env var 'VAR' if set, otherwise uses value from .llm.env
environment: environment:
- CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-}
- OPENAI_API_KEY=${OPENAI_API_KEY:-} - OPENAI_API_KEY=${OPENAI_API_KEY:-}
- CLAUDE_API_KEY=${CLAUDE_API_KEY:-} - DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-}
- ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
- GROQ_API_KEY=${GROQ_API_KEY:-}
- TOGETHER_API_KEY=${TOGETHER_API_KEY:-}
- MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
- GEMINI_API_TOKEN=${GEMINI_API_TOKEN:-}
volumes: volumes:
# Mount /dev/shm for Chromium/Playwright performance
- /dev/shm:/dev/shm - /dev/shm:/dev/shm
deploy: deploy:
resources: resources:
@@ -19,47 +34,47 @@ x-base-config: &base-config
memory: 1G memory: 1G
restart: unless-stopped restart: unless-stopped
healthcheck: healthcheck:
# IMPORTANT: Ensure Gunicorn binds to 11235 in supervisord.conf
test: ["CMD", "curl", "-f", "http://localhost:11235/health"] test: ["CMD", "curl", "-f", "http://localhost:11235/health"]
interval: 30s interval: 30s
timeout: 10s timeout: 10s
retries: 3 retries: 3
start_period: 40s start_period: 40s # Give the server time to start
# Run the container as the non-root user defined in the Dockerfile
user: "appuser"
services: services:
# Local build services for different platforms # --- Local Build Services ---
crawl4ai-amd64: crawl4ai-local-amd64:
build: build:
context: . context: . # Build context is the root directory
dockerfile: Dockerfile dockerfile: Dockerfile # Dockerfile is in the root directory
args: args:
PYTHON_VERSION: "3.10" INSTALL_TYPE: ${INSTALL_TYPE:-default}
INSTALL_TYPE: ${INSTALL_TYPE:-basic} ENABLE_GPU: ${ENABLE_GPU:-false}
ENABLE_GPU: false # PYTHON_VERSION arg is omitted as it's fixed by 'FROM python:3.10-slim' in Dockerfile
platforms: platform: linux/amd64
- linux/amd64
profiles: ["local-amd64"] profiles: ["local-amd64"]
<<: *base-config # extends yerine doğrudan yapılandırmayı dahil ettik <<: *base-config # Inherit base configuration
crawl4ai-arm64: crawl4ai-local-arm64:
build: build:
context: . context: . # Build context is the root directory
dockerfile: Dockerfile dockerfile: Dockerfile # Dockerfile is in the root directory
args: args:
PYTHON_VERSION: "3.10" INSTALL_TYPE: ${INSTALL_TYPE:-default}
INSTALL_TYPE: ${INSTALL_TYPE:-basic} ENABLE_GPU: ${ENABLE_GPU:-false}
ENABLE_GPU: false platform: linux/arm64
platforms:
- linux/arm64
profiles: ["local-arm64"] profiles: ["local-arm64"]
<<: *base-config <<: *base-config
# Hub services for different platforms and versions # --- Docker Hub Image Services ---
crawl4ai-hub-amd64: crawl4ai-hub-amd64:
image: unclecode/crawl4ai:${VERSION:-basic}-amd64 image: unclecode/crawl4ai:${VERSION:-latest}-amd64
profiles: ["hub-amd64"] profiles: ["hub-amd64"]
<<: *base-config <<: *base-config
crawl4ai-hub-arm64: crawl4ai-hub-arm64:
image: unclecode/crawl4ai:${VERSION:-basic}-arm64 image: unclecode/crawl4ai:${VERSION:-latest}-arm64
profiles: ["hub-arm64"] profiles: ["hub-arm64"]
<<: *base-config <<: *base-config

View File

@@ -357,8 +357,7 @@ async def demo_performance_analysis():
async with AsyncWebCrawler() as crawler: async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig( config = CrawlerRunConfig(
capture_network_requests=True, capture_network_requests=True,
wait_until="networkidle", page_timeout=60 * 2 * 1000 # 120 seconds
page_timeout=60000 # 60 seconds
) )
result = await crawler.arun( result = await crawler.arun(
@@ -406,6 +405,13 @@ async def demo_performance_analysis():
"url": url, "url": url,
"duration_ms": duration "duration_ms": duration
}) })
if isinstance(timing, dict) and "requestStart" in timing and "responseStart" in timing and "startTime" in timing:
# Convert to milliseconds
duration = (timing["responseStart"] - timing["requestStart"]) * 1000
resource_timings[resource_type].append({
"url": url,
"duration_ms": duration
})
# Calculate statistics for each resource type # Calculate statistics for each resource type
print("\nPerformance by resource type:") print("\nPerformance by resource type:")
@@ -455,14 +461,14 @@ async def main():
os.makedirs(os.path.join(__cur_dir__, "tmp"), exist_ok=True) os.makedirs(os.path.join(__cur_dir__, "tmp"), exist_ok=True)
# Run basic examples # Run basic examples
await demo_basic_network_capture() # await demo_basic_network_capture()
await demo_basic_console_capture() await demo_basic_console_capture()
await demo_combined_capture() # await demo_combined_capture()
# Run advanced examples # Run advanced examples
await analyze_spa_network_traffic() # await analyze_spa_network_traffic()
await demo_security_analysis() # await demo_security_analysis()
await demo_performance_analysis() # await demo_performance_analysis()
print("\n=== Examples Complete ===") print("\n=== Examples Complete ===")
print(f"Check the tmp directory for output files: {os.path.join(__cur_dir__, 'tmp')}") print(f"Check the tmp directory for output files: {os.path.join(__cur_dir__, 'tmp')}")

View File

@@ -4,7 +4,7 @@ import json
import base64 import base64
from pathlib import Path from pathlib import Path
from typing import List from typing import List
from crawl4ai.proxy_strategy import ProxyConfig from crawl4ai import ProxyConfig
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, CrawlResult from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, CrawlResult
from crawl4ai import RoundRobinProxyStrategy from crawl4ai import RoundRobinProxyStrategy

View File

@@ -13,7 +13,7 @@ from crawl4ai.deep_crawling import (
) )
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
from crawl4ai.proxy_strategy import ProxyConfig from crawl4ai import ProxyConfig
from crawl4ai import RoundRobinProxyStrategy from crawl4ai import RoundRobinProxyStrategy
from crawl4ai.content_filter_strategy import LLMContentFilter from crawl4ai.content_filter_strategy import LLMContentFilter
from crawl4ai import DefaultMarkdownGenerator from crawl4ai import DefaultMarkdownGenerator

View File

@@ -0,0 +1,444 @@
/* ==== File: docs/ask_ai/ask_ai.css ==== */
/* NOTE: this sheet runs inside an iframe and relies on CSS custom properties
   inherited from the parent document; every var() below carries a
   --fallback-* default for when inheritance does not happen. */
/* --- Basic Reset & Font --- */
body {
  /* Attempt to inherit variables from parent window (iframe context) */
  /* Fallback values if variables are not inherited */
  --fallback-bg: #070708;
  --fallback-font: #e8e9ed;
  --fallback-secondary: #a3abba;
  --fallback-primary: #50ffff;
  --fallback-primary-dimmed: #09b5a5;
  --fallback-border: #1d1d20;
  --fallback-code-bg: #1e1e1e;
  --fallback-invert-font: #222225;
  --font-stack: dm, Monaco, Courier New, monospace, serif;
  font-family: var(--font-stack, "Courier New", monospace); /* Use theme font stack */
  background-color: var(--background-color, var(--fallback-bg));
  color: var(--font-color, var(--fallback-font));
  margin: 0;
  padding: 0;
  font-size: 14px; /* Match global font size */
  line-height: 1.5em; /* Match global line height */
  height: 100vh; /* Ensure body takes full height */
  overflow: hidden; /* Prevent body scrollbars, panels handle scroll */
  display: flex; /* Use flex for the main container */
}
a {
  color: var(--secondary-color, var(--fallback-secondary));
  text-decoration: none;
  transition: color 0.2s;
}
a:hover {
  color: var(--primary-color, var(--fallback-primary));
}
/* --- Main Container Layout --- */
.ai-assistant-container {
  display: flex;
  width: 100%;
  height: 100%;
  background-color: var(--background-color, var(--fallback-bg));
}
/* --- Sidebar Styling --- */
.sidebar {
  flex-shrink: 0; /* Prevent sidebars from shrinking */
  height: 100%;
  display: flex;
  flex-direction: column;
  /* background-color: var(--code-bg-color, var(--fallback-code-bg)); */
  overflow-y: hidden; /* Header fixed, list scrolls */
}
.left-sidebar {
  flex-basis: 240px; /* Width of history panel */
  border-right: 1px solid var(--progress-bar-background, var(--fallback-border));
}
.right-sidebar {
  flex-basis: 280px; /* Width of citations panel */
  border-left: 1px solid var(--progress-bar-background, var(--fallback-border));
}
.sidebar header {
  padding: 0.6em 1em;
  border-bottom: 1px solid var(--progress-bar-background, var(--fallback-border));
  flex-shrink: 0;
  display: flex;
  justify-content: space-between;
  align-items: center;
}
.sidebar header h3 {
  margin: 0;
  font-size: 1.1em;
  color: var(--font-color, var(--fallback-font));
}
.sidebar ul {
  list-style: none;
  padding: 0;
  margin: 0;
  overflow-y: auto; /* Enable scrolling for the list */
  flex-grow: 1; /* Allow list to take remaining space */
  padding: 0.5em 0; /* NOTE: overrides the `padding: 0` above (source order) */
}
.sidebar ul li {
  padding: 0.3em 1em;
}
.sidebar ul li.no-citations,
.sidebar ul li.no-history {
  color: var(--secondary-color, var(--fallback-secondary));
  font-style: italic;
  font-size: 0.9em;
  padding-left: 1em;
}
.sidebar ul li a {
  color: var(--secondary-color, var(--fallback-secondary));
  text-decoration: none;
  display: block;
  padding: 0.2em 0.5em;
  border-radius: 3px;
  transition: background-color 0.2s, color 0.2s;
}
.sidebar ul li a:hover {
  color: var(--primary-color, var(--fallback-primary));
  background-color: rgba(80, 255, 255, 0.08); /* Use primary color with alpha */
}
/* Style for active history item */
#history-list li.active a {
  color: var(--primary-dimmed-color, var(--fallback-primary-dimmed));
  font-weight: bold;
  background-color: rgba(80, 255, 255, 0.12);
}
/* --- Chat Panel Styling --- */
#chat-panel {
  flex-grow: 1; /* Take remaining space */
  display: flex;
  flex-direction: column;
  height: 100%;
  overflow: hidden; /* Prevent overflow, internal elements handle scroll */
}
#chat-messages {
  flex-grow: 1;
  overflow-y: auto; /* Scrollable chat history */
  padding: 1em 1.5em;
  border-bottom: 1px solid var(--progress-bar-background, var(--fallback-border));
}
.message {
  margin-bottom: 1em;
  padding: 0.8em 1.2em;
  border-radius: 8px;
  max-width: 90%; /* Slightly wider */
  line-height: 1.6;
  /* Apply pre-wrap for better handling of spaces/newlines AND wrapping */
  white-space: pre-wrap;
  word-wrap: break-word; /* Ensure long words break */
}
.user-message {
  background-color: var(--progress-bar-background, var(--fallback-border)); /* User message background */
  color: var(--font-color, var(--fallback-font));
  margin-left: auto; /* Align user messages to the right */
  text-align: left;
}
.ai-message {
  background-color: var(--code-bg-color, var(--fallback-code-bg)); /* AI message background */
  color: var(--font-color, var(--fallback-font));
  margin-right: auto; /* Align AI messages to the left */
  border: 1px solid var(--progress-bar-background, var(--fallback-border));
}
.ai-message.welcome-message {
  border: none;
  background-color: transparent;
  max-width: 100%;
  text-align: center;
  color: var(--secondary-color, var(--fallback-secondary));
  white-space: normal;
}
/* Styles for code within messages */
.ai-message code {
  background-color: var(--invert-font-color, var(--fallback-invert-font)) !important; /* Use light bg for code */
  /* color: var(--background-color, var(--fallback-bg)) !important; Dark text */
  padding: 0.1em 0.4em;
  border-radius: 4px;
  font-size: 0.9em;
}
.ai-message pre {
  background-color: var(--invert-font-color, var(--fallback-invert-font)) !important;
  color: var(--background-color, var(--fallback-bg)) !important;
  padding: 1em;
  border-radius: 5px;
  overflow-x: auto;
  margin: 0.8em 0;
  white-space: pre;
}
.ai-message pre code {
  background-color: transparent !important;
  padding: 0;
  font-size: inherit;
}
/* Override white-space for specific elements generated by Markdown */
.ai-message p,
.ai-message ul,
.ai-message ol,
.ai-message blockquote {
  white-space: normal; /* Allow standard wrapping for block elements */
}
/* --- Markdown Element Styling within Messages --- */
.message p {
  margin-top: 0;
  margin-bottom: 0.5em;
}
.message p:last-child {
  margin-bottom: 0;
}
.message ul,
.message ol {
  margin: 0.5em 0 0.5em 1.5em;
  padding: 0;
}
.message li {
  margin-bottom: 0.2em;
}
/* Code block styling (adjusts previous rules slightly) */
.message code {
  /* Inline code */
  background-color: var(--invert-font-color, var(--fallback-invert-font)) !important;
  color: var(--font-color);
  padding: 0.1em 0.4em;
  border-radius: 4px;
  font-size: 0.9em;
  /* Ensure inline code breaks nicely */
  word-break: break-all;
  white-space: normal; /* Allow inline code to wrap if needed */
}
.message pre {
  /* Code block container */
  background-color: var(--invert-font-color, var(--fallback-invert-font)) !important;
  color: var(--background-color, var(--fallback-bg)) !important;
  padding: 1em;
  border-radius: 5px;
  overflow-x: auto;
  margin: 0.8em 0;
  font-size: 0.9em; /* Slightly smaller code blocks */
}
.message pre code {
  /* Code within code block */
  background-color: transparent !important;
  padding: 0;
  font-size: inherit;
  word-break: normal; /* Don't break words in code blocks */
  white-space: pre; /* Preserve whitespace strictly in code blocks */
}
/* Thinking indicator */
.message-thinking {
  display: inline-block;
  width: 5px;
  height: 5px;
  background-color: var(--primary-color, var(--fallback-primary));
  border-radius: 50%;
  margin-left: 8px;
  vertical-align: middle;
  animation: thinking 1s infinite ease-in-out;
}
@keyframes thinking {
  0%,
  100% {
    opacity: 0.5;
    transform: scale(0.8);
  }
  50% {
    opacity: 1;
    transform: scale(1.2);
  }
}
/* --- Thinking Indicator (Blinking Cursor Style) --- */
.thinking-indicator-cursor {
  display: inline-block;
  width: 10px; /* Width of the cursor */
  height: 1.1em; /* Match line height */
  background-color: var(--primary-color, var(--fallback-primary));
  margin-left: 5px;
  vertical-align: text-bottom; /* Align with text baseline */
  animation: blink-cursor 1s step-end infinite;
}
@keyframes blink-cursor {
  from,
  to {
    background-color: transparent;
  }
  50% {
    background-color: var(--primary-color, var(--fallback-primary));
  }
}
#chat-input-area {
  flex-shrink: 0; /* Prevent input area from shrinking */
  padding: 1em 1.5em;
  display: flex;
  align-items: flex-end; /* Align items to bottom */
  gap: 10px;
  background-color: var(--code-bg-color, var(--fallback-code-bg)); /* Match sidebars */
}
#chat-input-area textarea {
  flex-grow: 1;
  padding: 0.8em 1em;
  border: 1px solid var(--progress-bar-background, var(--fallback-border));
  background-color: var(--background-color, var(--fallback-bg));
  color: var(--font-color, var(--fallback-font));
  border-radius: 5px;
  resize: none; /* Disable manual resize */
  font-family: inherit;
  font-size: 1em;
  line-height: 1.4;
  max-height: 150px; /* Limit excessive height */
  overflow-y: auto;
  /* rows: 2; */
}
#chat-input-area button {
  /* Basic button styling - maybe inherit from main theme? */
  padding: 0.6em 1.2em;
  border: 1px solid var(--primary-dimmed-color, var(--fallback-primary-dimmed));
  background-color: var(--primary-dimmed-color, var(--fallback-primary-dimmed));
  color: var(--background-color, var(--fallback-bg));
  border-radius: 5px;
  cursor: pointer;
  font-size: 0.9em;
  transition: background-color 0.2s, border-color 0.2s;
  height: min-content; /* Align with bottom of textarea */
}
#chat-input-area button:hover {
  background-color: var(--primary-color, var(--fallback-primary));
  border-color: var(--primary-color, var(--fallback-primary));
}
#chat-input-area button:disabled {
  opacity: 0.6;
  cursor: not-allowed;
}
.loading-indicator {
  font-size: 0.9em;
  color: var(--secondary-color, var(--fallback-secondary));
  margin-right: 10px;
  align-self: center;
}
/* --- Buttons --- */
/* Inherit some button styles if possible */
.btn.btn-sm {
  color: var(--font-color, var(--fallback-font));
  padding: 0.2em 0.5em;
  font-size: 0.8em;
  border: 1px solid var(--secondary-color, var(--fallback-secondary));
  background: none;
  border-radius: 3px;
  cursor: pointer;
}
.btn.btn-sm:hover {
  border-color: var(--font-color, var(--fallback-font));
  background-color: var(--progress-bar-background, var(--fallback-border));
}
/* --- Basic Responsiveness --- */
@media screen and (max-width: 900px) {
  .left-sidebar {
    flex-basis: 200px; /* Shrink history */
  }
  .right-sidebar {
    flex-basis: 240px; /* Shrink citations */
  }
}
@media screen and (max-width: 768px) {
  /* Stack layout on mobile? Or hide sidebars? Hiding for now */
  .sidebar {
    display: none; /* Hide sidebars on small screens */
  }
  /* Could add toggle buttons later */
}
/* ==== File: docs/ask_ai/ask-ai.css (Updates V4 - Delete Button) ==== */
/* NOTE: the selectors below intentionally duplicate rules defined earlier in
   this file; being later in source order, they win the cascade. */
.sidebar ul li {
  /* Use flexbox to align link and delete button */
  display: flex;
  justify-content: space-between;
  align-items: center;
  padding: 0; /* Remove padding from li, add to link/button */
  margin: 0.1em 0; /* Small vertical margin */
}
.sidebar ul li a {
  /* Link takes most space */
  flex-grow: 1;
  padding: 0.3em 0.5em 0.3em 1em; /* Adjust padding */
  /* Make ellipsis work for long titles */
  white-space: nowrap;
  overflow: hidden;
  text-overflow: ellipsis;
  /* Keep existing link styles */
  color: var(--secondary-color, var(--fallback-secondary));
  text-decoration: none;
  display: block;
  border-radius: 3px;
  transition: background-color 0.2s, color 0.2s;
}
.sidebar ul li a:hover {
  color: var(--primary-color, var(--fallback-primary));
  background-color: rgba(80, 255, 255, 0.08);
}
/* Style for active history item's link */
#history-list li.active a {
  color: var(--primary-dimmed-color, var(--fallback-primary-dimmed));
  font-weight: bold;
  background-color: rgba(80, 255, 255, 0.12);
}
/* --- Delete Chat Button --- */
.delete-chat-btn {
  flex-shrink: 0; /* Don't shrink */
  background: none;
  border: none;
  color: var(--secondary-color, var(--fallback-secondary));
  cursor: pointer;
  padding: 0.4em 0.8em; /* Padding around icon */
  font-size: 0.9em;
  opacity: 0.5; /* Dimmed by default */
  transition: opacity 0.2s, color 0.2s;
  margin-left: 5px; /* Space between link and button */
  border-radius: 3px;
}
.sidebar ul li:hover .delete-chat-btn,
.delete-chat-btn:hover {
  opacity: 1; /* Show fully on hover */
  color: var(--error-color, #ff3c74); /* Use error color on hover */
}
.delete-chat-btn:focus {
  outline: 1px dashed var(--error-color, #ff3c74); /* Accessibility */
  opacity: 1;
}

603
docs/md_v2/ask_ai/ask-ai.js Normal file
View File

@@ -0,0 +1,603 @@
// ==== File: docs/ask_ai/ask-ai.js (Marked, Streaming, History) ====
document.addEventListener("DOMContentLoaded", () => {
console.log("AI Assistant JS V2 Loaded");

// --- DOM Element Selectors ---
const historyList = document.getElementById("history-list");
const newChatButton = document.getElementById("new-chat-button");
const chatMessages = document.getElementById("chat-messages");
const chatInput = document.getElementById("chat-input");
const sendButton = document.getElementById("send-button");
const citationsList = document.getElementById("citations-list");

// --- Constants ---
const CHAT_INDEX_KEY = "aiAssistantChatIndex_v1";
const CHAT_PREFIX = "aiAssistantChat_v1_";

// --- State ---
let currentChatId = null;
let conversationHistory = []; // Holds message objects { sender: 'user'/'ai', text: '...' }
let isThinking = false;
let streamInterval = null; // To control the streaming interval

// --- Event Listeners ---
sendButton.addEventListener("click", handleSendMessage);
chatInput.addEventListener("keydown", handleInputKeydown);
// BUG FIX: wiring `handleNewChat` directly as the listener made the browser
// pass the click MouseEvent as its `isFromQuery` parameter. The event object
// is truthy, so a manual "New Chat" wrongly took the from-query path
// (skipping saveCurrentChat() and the welcome message). Wrap it so no
// argument leaks through.
newChatButton.addEventListener("click", () => handleNewChat());
chatInput.addEventListener("input", autoGrowTextarea);

// --- Initialization ---
loadChatHistoryIndex(); // Load history list on startup
// Reading window.parent.location throws a SecurityError when this page is
// embedded cross-origin; guard it so initialization still completes.
// checkForInitialQuery(null) logs a warning and returns false.
let parentLocation = null;
try {
  parentLocation = window.parent.location;
} catch (e) {
  console.warn("Ask AI: cannot access parent window location.", e);
}
const initialQuery = checkForInitialQuery(parentLocation); // Check for query param
if (!initialQuery) {
  loadInitialChat(); // Load normally if no query
}
// --- Core Functions ---
// Handle a send action: record the user's message, clear the input, create an
// empty AI bubble, and stream a (currently simulated) response into it.
// No-op while a previous response is still streaming.
function handleSendMessage() {
  const userMessageText = chatInput.value.trim();
  // Ignore empty input and re-entrant sends while a response is in flight.
  if (!userMessageText || isThinking) return;
  setThinking(true); // Start thinking state
  // Add user message to state and UI
  const userMessage = { sender: "user", text: userMessageText };
  conversationHistory.push(userMessage);
  addMessageToChat(userMessage, false); // Add user message without parsing markdown
  chatInput.value = "";
  autoGrowTextarea(); // Reset textarea height
  // Prepare for AI response (create empty div)
  const aiMessageDiv = addMessageToChat({ sender: "ai", text: "" }, true); // Add empty div with thinking indicator
  // TODO: Generate fingerprint/JWT here
  // TODO: Send `conversationHistory` + JWT to backend API
  // Replace placeholder below with actual API call
  // The backend should ideally return a stream of text tokens
  // --- Placeholder Streaming Simulation ---
  const simulatedFullResponse = `Okay, Heres a minimal Python script that creates an AsyncWebCrawler, fetches a webpage, and prints the first 300 characters of its Markdown output:
\`\`\`python
import asyncio
from crawl4ai import AsyncWebCrawler
async def main():
async with AsyncWebCrawler() as crawler:
result = await crawler.arun("https://example.com")
print(result.markdown[:300]) # Print first 300 chars
if __name__ == "__main__":
asyncio.run(main())
\`\`\`
A code snippet: \`crawler.run()\`. Check the [quickstart](/core/quickstart).`;
  // Simulate receiving the response stream
  streamSimulatedResponse(aiMessageDiv, simulatedFullResponse);
  // // Simulate receiving citations *after* stream starts (or with first chunk)
  // setTimeout(() => {
  //   addCitations([
  //     { title: "Simulated Doc 1", url: "#sim1" },
  //     { title: "Another Concept", url: "#sim2" },
  //   ]);
  // }, 500); // Citations appear shortly after thinking starts
}

// Send on plain Enter; Shift+Enter falls through and inserts a newline.
function handleInputKeydown(event) {
  if (event.key === "Enter" && !event.shiftKey) {
    event.preventDefault(); // Stop the newline from being inserted
    handleSendMessage();
  }
}
// Build a message bubble, render its markdown via marked.parse, highlight AI
// code blocks with hljs, optionally attach a "thinking" cursor, wrap each
// <pre> in a .terminal container, append it to the chat, and scroll down.
// Returns the created element so callers can stream content into it.
function addMessageToChat(message, addThinkingIndicator = false) {
  const messageDiv = document.createElement("div");
  messageDiv.classList.add("message", `${message.sender}-message`);
  // Parse markdown and set HTML
  // NOTE(review): user-authored text also goes through marked.parse into
  // innerHTML here — confirm markdown/HTML sanitization is acceptable.
  messageDiv.innerHTML = message.text ? marked.parse(message.text) : "";
  if (message.sender === "ai") {
    // Apply Syntax Highlighting AFTER setting innerHTML
    messageDiv.querySelectorAll("pre code:not(.hljs)").forEach((block) => {
      if (typeof hljs !== "undefined") {
        // Check if already highlighted to prevent double-highlighting issues
        if (!block.classList.contains("hljs")) {
          hljs.highlightElement(block);
        }
      } else {
        console.warn("highlight.js (hljs) not found for syntax highlighting.");
      }
    });
    // Add thinking indicator if needed (and not already present)
    if (addThinkingIndicator && !message.text && !messageDiv.querySelector(".thinking-indicator-cursor")) {
      const thinkingDiv = document.createElement("div");
      thinkingDiv.className = "thinking-indicator-cursor";
      messageDiv.appendChild(thinkingDiv);
    }
  } else {
    // User messages remain plain text
    // messageDiv.textContent = message.text;
  }
  // wrap each pre in a div.terminal
  messageDiv.querySelectorAll("pre").forEach((block) => {
    const wrapper = document.createElement("div");
    wrapper.className = "terminal";
    block.parentNode.insertBefore(wrapper, block);
    wrapper.appendChild(block);
  });
  chatMessages.appendChild(messageDiv);
  // Scroll only if user is near the bottom? (More advanced)
  // Simple scroll for now:
  scrollToBottom();
  return messageDiv; // Return the created element
}
// Simulate a token-by-token streaming response into `messageDiv`.
// Every 50ms tick re-renders the accumulated markdown plus a blinking cursor;
// when all tokens are emitted it does the final highlight pass, extracts
// citation links, records the AI message in history, and persists the chat.
// Uses the shared module-level `streamInterval` so setThinking() can cancel it.
function streamSimulatedResponse(messageDiv, fullText) {
  const thinkingIndicator = messageDiv.querySelector(".thinking-indicator-cursor");
  if (thinkingIndicator) thinkingIndicator.remove();
  // Split on whitespace but keep the separators (capture group) so spacing
  // is reproduced exactly in the re-assembled text.
  const tokens = fullText.split(/(\s+)/);
  let currentText = "";
  let tokenIndex = 0;
  // Clear previous interval just in case
  if (streamInterval) clearInterval(streamInterval);
  streamInterval = setInterval(() => {
    const cursorSpan = '<span class="thinking-indicator-cursor"></span>'; // Cursor for streaming
    if (tokenIndex < tokens.length) {
      currentText += tokens[tokenIndex];
      // Render intermediate markdown + cursor
      messageDiv.innerHTML = marked.parse(currentText + cursorSpan);
      // Re-highlight code blocks on each stream update - might be slightly inefficient
      // but ensures partial code blocks look okay. Highlight only final on completion.
      // messageDiv.querySelectorAll('pre code:not(.hljs)').forEach((block) => {
      //   hljs.highlightElement(block);
      // });
      scrollToBottom(); // Keep scrolling as content streams
      tokenIndex++;
    } else {
      // Streaming finished
      clearInterval(streamInterval);
      streamInterval = null;
      // Final render without cursor
      messageDiv.innerHTML = marked.parse(currentText);
      // === Final Syntax Highlighting ===
      messageDiv.querySelectorAll("pre code:not(.hljs)").forEach((block) => {
        if (typeof hljs !== "undefined" && !block.classList.contains("hljs")) {
          hljs.highlightElement(block);
        }
      });
      // === Extract Citations ===
      const citations = extractMarkdownLinks(currentText);
      // Wrap each pre in a div.terminal
      messageDiv.querySelectorAll("pre").forEach((block) => {
        const wrapper = document.createElement("div");
        wrapper.className = "terminal";
        block.parentNode.insertBefore(wrapper, block);
        wrapper.appendChild(block);
      });
      const aiMessage = { sender: "ai", text: currentText, citations: citations };
      conversationHistory.push(aiMessage);
      updateCitationsDisplay();
      saveCurrentChat();
      setThinking(false);
    }
  }, 50); // Adjust speed
}
// Pull [title](url) links out of markdown text, skipping internal
// "#citation-" anchors, and de-duplicate by URL (first occurrence wins).
// Returns an array of { title, url } objects with trimmed fields.
function extractMarkdownLinks(markdownText) {
  const linkPattern = /\[([^\]]+)\]\(([^)]+)\)/g; // [text](url)
  const seenUrls = new Set();
  const links = [];
  for (const m of markdownText.matchAll(linkPattern)) {
    // Avoid adding self-links from within the citations list if AI includes them
    if (m[2].startsWith("#citation-")) continue;
    const url = m[2].trim();
    if (seenUrls.has(url)) continue; // de-duplicate by URL
    seenUrls.add(url);
    links.push({ title: m[1].trim(), url: url });
  }
  return links;
}
// === REVISED Function to Display Citations ===
// Show the citations attached to the most recent AI message that has any;
// falls back to a "no citations" placeholder when none exist.
function updateCitationsDisplay() {
  let lastCitations = null;
  // Find the most recent AI message with citations
  for (let i = conversationHistory.length - 1; i >= 0; i--) {
    if (
      conversationHistory[i].sender === "ai" &&
      conversationHistory[i].citations &&
      conversationHistory[i].citations.length > 0
    ) {
      lastCitations = conversationHistory[i].citations;
      break; // Found the latest citations
    }
  }
  citationsList.innerHTML = ""; // Clear previous
  if (!lastCitations) {
    citationsList.innerHTML = '<li class="no-citations">No citations available.</li>';
    return;
  }
  lastCitations.forEach((citation, index) => {
    const li = document.createElement("li");
    const a = document.createElement("a");
    // Generate a unique ID for potential internal linking if needed
    // a.id = `citation-${index}`;
    a.href = citation.url || "#";
    a.textContent = citation.title;
    a.target = "_top"; // Open in main window
    li.appendChild(a);
    citationsList.appendChild(li);
  });
}

// Render an explicit citation list into the sidebar (also used with [] to
// clear the panel when starting a new chat).
function addCitations(citations) {
  citationsList.innerHTML = ""; // Clear
  if (!citations || citations.length === 0) {
    citationsList.innerHTML = '<li class="no-citations">No citations available.</li>';
    return;
  }
  citations.forEach((citation) => {
    const li = document.createElement("li");
    const a = document.createElement("a");
    a.href = citation.url || "#";
    a.textContent = citation.title;
    a.target = "_top"; // Open in main window
    li.appendChild(a);
    citationsList.appendChild(li);
  });
}
// Toggle the "AI is responding" UI state: disables input/send while a
// response is streaming and cancels any in-flight stream timer on re-entry.
function setThinking(thinking) {
  isThinking = thinking;
  sendButton.disabled = thinking;
  chatInput.disabled = thinking;
  if (thinking) {
    chatInput.placeholder = "AI is responding...";
    // A rapid resend must not leave an orphaned streaming interval running.
    if (streamInterval) {
      clearInterval(streamInterval);
      streamInterval = null;
    }
  } else {
    chatInput.placeholder = "Ask about Crawl4AI...";
  }
}

// Resize the input textarea to fit its content (collapse first so it can shrink).
function autoGrowTextarea() {
  chatInput.style.height = "auto";
  const fitted = `${chatInput.scrollHeight}px`;
  chatInput.style.height = fitted;
}

// Pin the chat view to the newest message.
function scrollToBottom() {
  const pane = chatMessages;
  pane.scrollTop = pane.scrollHeight;
}
// --- Query Parameter Handling ---
// Look for a base64-encoded `qq` query parameter on the passed-in (parent
// window's) location. When present: start a fresh chat, submit the decoded
// text after a short delay, and scrub the parameter from the parent URL via
// replaceState. Returns true iff a query was found and processed.
function checkForInitialQuery(locationToCheck) {
  // <-- Receive location object
  if (!locationToCheck) {
    console.warn("Ask AI: Could not access parent window location.");
    return false;
  }
  const urlParams = new URLSearchParams(locationToCheck.search); // <-- Use passed location's search string
  const encodedQuery = urlParams.get("qq"); // <-- Use 'qq'
  if (encodedQuery) {
    console.log("Initial query found (qq):", encodedQuery);
    try {
      // atob -> escape -> decodeURIComponent is the legacy idiom for decoding
      // base64-encoded UTF-8 text in the browser.
      const decodedText = decodeURIComponent(escape(atob(encodedQuery)));
      console.log("Decoded query:", decodedText);
      // Start new chat immediately
      handleNewChat(true);
      // Delay setting input and sending message slightly
      setTimeout(() => {
        chatInput.value = decodedText;
        autoGrowTextarea();
        handleSendMessage();
        // Clean the PARENT window's URL
        try {
          const cleanUrl = locationToCheck.pathname;
          // Use parent's history object
          window.parent.history.replaceState({}, window.parent.document.title, cleanUrl);
        } catch (e) {
          console.warn("Ask AI: Could not clean parent URL using replaceState.", e);
          // This might fail due to cross-origin restrictions if served differently,
          // but should work fine with mkdocs serve on the same origin.
        }
      }, 100);
      return true; // Query processed
    } catch (e) {
      console.error("Error decoding initial query (qq):", e);
      // Clean the PARENT window's URL even on error
      try {
        const cleanUrl = locationToCheck.pathname;
        window.parent.history.replaceState({}, window.parent.document.title, cleanUrl);
      } catch (cleanError) {
        console.warn("Ask AI: Could not clean parent URL after decode error.", cleanError);
      }
      return false;
    }
  }
  return false; // No 'qq' query found
}
// --- History Management ---
// Start a fresh chat session: optionally save the current one, reset state,
// register the new chat in the index, and refresh the sidebar.
// `isFromQuery` is true when triggered by an incoming `qq` query parameter
// (skips the save and the welcome message).
function handleNewChat(isFromQuery = false) {
  // BUG FIX: coerce to a strict boolean. When this function is wired directly
  // as a DOM event listener, the browser passes the click Event object as the
  // first argument; it is truthy and would wrongly take the from-query path
  // (skipping saveCurrentChat() and the welcome message).
  isFromQuery = isFromQuery === true;
  if (isThinking) return; // Don't allow new chat while responding
  // Only save if NOT triggered immediately by a query parameter load
  if (!isFromQuery) {
    saveCurrentChat();
  }
  currentChatId = `chat_${Date.now()}`;
  conversationHistory = []; // Clear message history state
  chatMessages.innerHTML = ""; // Start with clean slate for query
  if (!isFromQuery) {
    // Show welcome only if manually started
    chatMessages.innerHTML =
      '<div class="message ai-message welcome-message">Started a new chat! Ask me anything about Crawl4AI.</div>';
  }
  addCitations([]); // Clear citations
  updateCitationsDisplay(); // Clear UI
  // Add to index and save
  let index = loadChatIndex();
  // Generate a generic title initially; saveCurrentChat() refines it later
  // from the first user message.
  const newTitle = isFromQuery ? "Chat from Selection" : `Chat ${new Date().toLocaleString()}`;
  index.unshift({ id: currentChatId, title: newTitle }); // Newest first
  saveChatIndex(index);
  renderHistoryList(index); // Update UI
  setActiveHistoryItem(currentChatId);
  saveCurrentChat(); // Save the empty new chat state
}
// Switch to a previously saved chat: save the current one first, then restore
// messages and citations for `chatId`. A missing localStorage entry is pruned
// from the index and the next available chat is loaded instead.
function loadChat(chatId) {
  if (isThinking || chatId === currentChatId) return;
  // Check if chat data actually exists before proceeding
  const storedChat = localStorage.getItem(CHAT_PREFIX + chatId);
  if (storedChat === null) {
    console.warn(`Attempted to load non-existent chat: ${chatId}. Removing from index.`);
    deleteChatData(chatId); // Clean up index
    loadChatHistoryIndex(); // Reload history list
    loadInitialChat(); // Load next available chat
    return;
  }
  console.log(`Loading chat: ${chatId}`);
  saveCurrentChat(); // Save current before switching
  try {
    conversationHistory = JSON.parse(storedChat);
    currentChatId = chatId;
    renderChatMessages(conversationHistory);
    updateCitationsDisplay();
    setActiveHistoryItem(chatId);
  } catch (e) {
    console.error("Error loading chat:", chatId, e);
    alert("Failed to load chat data.");
    // Corrupt stored data: fall back to an empty conversation.
    conversationHistory = [];
    renderChatMessages(conversationHistory);
    updateCitationsDisplay();
  }
}

// Persist the active conversation to localStorage. Also derives a readable
// index title ("Chat about: ...") from the first user message, exactly once.
function saveCurrentChat() {
  if (currentChatId && conversationHistory.length > 0) {
    try {
      localStorage.setItem(CHAT_PREFIX + currentChatId, JSON.stringify(conversationHistory));
      console.log(`Chat ${currentChatId} saved.`);
      // Update title in index (e.g., use first user message)
      let index = loadChatIndex();
      const currentItem = index.find((item) => item.id === currentChatId);
      if (
        currentItem &&
        conversationHistory[0]?.sender === "user" &&
        !currentItem.title.startsWith("Chat about:")
      ) {
        currentItem.title = `Chat about: ${conversationHistory[0].text.substring(0, 30)}...`;
        saveChatIndex(index);
        // Re-render history list if title changed - small optimization needed here maybe
        renderHistoryList(index);
        setActiveHistoryItem(currentChatId); // Re-set active after re-render
      }
    } catch (e) {
      console.error("Error saving chat:", currentChatId, e);
      // Handle potential storage full errors
      if (e.name === "QuotaExceededError") {
        alert("Local storage is full. Cannot save chat history.");
        // Consider implementing history pruning logic here
      }
    }
  } else if (currentChatId) {
    // Save empty state for newly created chats if needed, or remove?
    localStorage.setItem(CHAT_PREFIX + currentChatId, JSON.stringify([]));
  }
}
// Fetch and parse the persisted chat index. Any storage or parse failure
// degrades gracefully to an empty index.
function loadChatIndex() {
  let parsed = [];
  try {
    const raw = localStorage.getItem(CHAT_INDEX_KEY);
    if (raw) {
      parsed = JSON.parse(raw);
    }
  } catch (e) {
    console.error("Error loading chat index:", e);
    parsed = []; // Return empty array on error
  }
  return parsed;
}

// Serialize and store the chat index; failures are logged, never thrown.
function saveChatIndex(entries) {
  try {
    const serialized = JSON.stringify(entries);
    localStorage.setItem(CHAT_INDEX_KEY, serialized);
  } catch (e) {
    console.error("Error saving chat index:", e);
  }
}
// Rebuild the sidebar history list from the index array: one <li> per chat
// holding a load link and a per-chat delete button.
function renderHistoryList(indexArray) {
  historyList.innerHTML = ""; // Clear existing
  if (!indexArray || indexArray.length === 0) {
    historyList.innerHTML = '<li class="no-history">No past chats found.</li>';
    return;
  }
  indexArray.forEach((item) => {
    const li = document.createElement("li");
    li.dataset.chatId = item.id; // Add ID to li for easier selection
    const a = document.createElement("a");
    a.href = "#";
    a.dataset.chatId = item.id;
    a.textContent = item.title || `Chat ${item.id.split("_")[1] || item.id}`;
    a.title = a.textContent; // Tooltip for potentially long titles
    a.addEventListener("click", (e) => {
      e.preventDefault();
      loadChat(item.id);
    });
    // === Add Delete Button ===
    const deleteBtn = document.createElement("button");
    deleteBtn.className = "delete-chat-btn";
    deleteBtn.innerHTML = "✕"; // Cross icon (or use text/SVG/FontAwesome)
    deleteBtn.title = "Delete Chat";
    deleteBtn.dataset.chatId = item.id; // Store ID on button too
    deleteBtn.addEventListener("click", handleDeleteChat);
    li.appendChild(a);
    li.appendChild(deleteBtn); // Append button to the list item
    historyList.appendChild(li);
  });
}
// Replace the chat pane contents with the given message history.
function renderChatMessages(messages) {
  chatMessages.innerHTML = ""; // Clear existing messages
  messages.forEach((message) => {
    // Ensure highlighting is applied when loading from history
    addMessageToChat(message, false);
  });
  if (messages.length === 0) {
    chatMessages.innerHTML =
      '<div class="message ai-message welcome-message">Chat history loaded. Ask a question!</div>';
  }
  // Scroll to bottom after loading messages
  scrollToBottom();
}

// Highlight the history entry for `chatId`, clearing any previous highlight.
function setActiveHistoryItem(chatId) {
  document.querySelectorAll("#history-list li").forEach((li) => li.classList.remove("active"));
  // Select the LI element directly now
  const activeLi = document.querySelector(`#history-list li[data-chat-id="${chatId}"]`);
  if (activeLi) {
    activeLi.classList.add("active");
  }
}

// On startup: open the most recent chat, or start a new one if none exist
// (and the query-parameter handler has not already created one).
function loadInitialChat() {
  const index = loadChatIndex();
  if (index.length > 0) {
    loadChat(index[0].id);
  } else {
    // Check if handleNewChat wasn't already called by query handler
    if (!currentChatId) {
      handleNewChat();
    }
  }
}

// Render the saved history index and re-apply the active highlight.
function loadChatHistoryIndex() {
  const index = loadChatIndex();
  renderHistoryList(index);
  if (currentChatId) setActiveHistoryItem(currentChatId);
}
// === NEW Function to Handle Delete Click ===
// Confirm and delete a chat from the history sidebar. If the deleted chat was
// the active one, switch to the most recent remaining chat or start fresh.
function handleDeleteChat(event) {
  event.stopPropagation(); // Prevent triggering loadChat on the link behind it
  const button = event.currentTarget;
  const chatIdToDelete = button.dataset.chatId;
  if (!chatIdToDelete) return;
  // Confirmation dialog
  if (
    window.confirm(
      `Are you sure you want to delete this chat session?\n"${
        button.previousElementSibling?.textContent || "Chat " + chatIdToDelete
      }"`
    )
  ) {
    console.log(`Deleting chat: ${chatIdToDelete}`);
    // Perform deletion
    const updatedIndex = deleteChatData(chatIdToDelete);
    // If the deleted chat was the currently active one, load another chat
    if (currentChatId === chatIdToDelete) {
      currentChatId = null; // Reset current ID
      conversationHistory = []; // Clear state
      if (updatedIndex.length > 0) {
        // Load the new top chat (most recent remaining)
        loadChat(updatedIndex[0].id);
      } else {
        // No chats left, start a new one
        handleNewChat();
      }
    } else {
      // If a different chat was deleted, just re-render the list
      renderHistoryList(updatedIndex);
      // Re-apply active state in case IDs shifted (though they shouldn't)
      setActiveHistoryItem(currentChatId);
    }
  }
}

// === NEW Function to Delete Chat Data ===
// Remove a chat's stored messages and its index entry; returns the updated index.
function deleteChatData(chatId) {
  // Remove chat data
  localStorage.removeItem(CHAT_PREFIX + chatId);
  // Update index
  let index = loadChatIndex();
  index = index.filter((item) => item.id !== chatId);
  saveChatIndex(index);
  console.log(`Chat ${chatId} data and index entry removed.`);
  return index; // Return the updated index
}
// --- Virtual Scrolling Placeholder ---
// NOTE: Virtual scrolling is complex. For now, we do direct rendering.
// If performance becomes an issue with very long chats/history,
// investigate libraries like 'simple-virtual-scroll' or 'virtual-scroller'.
// You would replace parts of `renderChatMessages` and `renderHistoryList`
// to work with the chosen library's API (providing data and item renderers).
console.warn("Virtual scrolling not implemented. Performance may degrade with very long chat histories.");
});

View File

@@ -0,0 +1,64 @@
<!DOCTYPE html>
<!-- Standalone page for the Crawl4AI Assistant chat UI.
     Three-pane layout: history sidebar | chat panel | citations sidebar.
     All dynamic behavior lives in ask-ai.js (loaded at the bottom). -->
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Crawl4AI Assistant</title>
<!-- Link main styles first for variable access -->
<link rel="stylesheet" href="../assets/layout.css">
<link rel="stylesheet" href="../assets/styles.css">
<!-- Link specific AI styles -->
<link rel="stylesheet" href="../assets/highlight.css">
<link rel="stylesheet" href="ask-ai.css">
</head>
<body>
<div class="ai-assistant-container">
<!-- Left Sidebar: Conversation History -->
<aside id="history-panel" class="sidebar left-sidebar">
<header>
<h3>History</h3>
<button id="new-chat-button" class="btn btn-sm">New Chat</button>
</header>
<ul id="history-list">
<!-- History items populated by JS -->
</ul>
</aside>
<!-- Main Area: Chat Interface -->
<main id="chat-panel">
<div id="chat-messages">
<!-- Chat messages populated by JS -->
<div class="message ai-message welcome-message">
Welcome to the Crawl4AI Assistant! How can I help you today?
</div>
</div>
<div id="chat-input-area">
<!-- Loading indicator for general waiting (optional) -->
<!-- <div class="loading-indicator" style="display: none;">Thinking...</div> -->
<textarea id="chat-input" placeholder="Ask about Crawl4AI..." rows="2"></textarea>
<button id="send-button">Send</button>
</div>
</main>
<!-- Right Sidebar: Citations / Context -->
<aside id="citations-panel" class="sidebar right-sidebar">
<header>
<h3>Citations</h3>
</header>
<ul id="citations-list">
<!-- Citations populated by JS -->
<li class="no-citations">No citations for this response yet.</li>
</ul>
</aside>
</div>
<!-- Include Marked.js library -->
<!-- Marked renders Markdown in responses; highlight.min.js colors code blocks -->
<script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
<script src="../assets/highlight.min.js"></script>
<!-- Your AI Assistant Logic -->
<script src="ask-ai.js"></script>
</body>
</html>

View File

@@ -0,0 +1,62 @@
// ==== File: docs/assets/copy_code.js ====
// Adds a hover "Copy" button to every code block in the main docs content area.
document.addEventListener('DOMContentLoaded', () => {
  // Write the code text to the clipboard and flash feedback on the button.
  async function copyCodeToClipboard(codeElement, button) {
    // innerText preserves the rendered line breaks the reader actually sees.
    const textToCopy = codeElement.innerText;
    try {
      await navigator.clipboard.writeText(textToCopy);
      // Success feedback; disable briefly to avoid double-clicks.
      button.innerHTML = 'Copied!';
      button.classList.add('copied');
      button.disabled = true;
      setTimeout(() => {
        button.innerHTML = 'Copy';
        button.classList.remove('copied');
        button.disabled = false;
      }, 2000); // show "Copied!" for 2 seconds
    } catch (err) {
      console.error('Failed to copy code: ', err);
      // Error feedback on the button, then revert.
      button.innerHTML = 'Error';
      setTimeout(() => {
        button.innerHTML = 'Copy';
      }, 2000);
    }
  }

  // Only decorate code blocks inside the main content area.
  const codeBlocks = document.querySelectorAll('#terminal-mkdocs-main-content pre > code');
  codeBlocks.forEach((codeElement) => {
    const preElement = codeElement.parentElement; // the wrapping <pre>
    // The button is absolutely positioned, so <pre> must be a positioning context.
    if (window.getComputedStyle(preElement).position === 'static') {
      preElement.style.position = 'relative';
    }
    const copyButton = document.createElement('button');
    copyButton.className = 'copy-code-button';
    copyButton.type = 'button';
    copyButton.setAttribute('aria-label', 'Copy code to clipboard');
    copyButton.title = 'Copy code to clipboard';
    copyButton.innerHTML = 'Copy'; // swap for an SVG/icon class if desired
    preElement.appendChild(copyButton);
    copyButton.addEventListener('click', () => copyCodeToClipboard(codeElement, copyButton));
  });

  console.log("Copy Code Button script loaded.");
});

View File

@@ -0,0 +1,39 @@
// ==== File: docs/assets/floating_ask_ai_button.js ====
// Injects a fixed "Ask AI" floating action button that links to the assistant
// page — skipped when the reader is already on that page.
// Fix: removed the unused `baseUrl` computation (it was never referenced;
// the link uses the root-relative `askAiPagePath` directly).
document.addEventListener('DOMContentLoaded', () => {
const askAiPagePath = '/core/ask-ai/'; // IMPORTANT: Adjust this path if needed!
const currentPath = window.location.pathname;
// Check if the current page IS the Ask AI page.
// includes() (with the trailing slash stripped) tolerates both
// "/core/ask-ai/" and "/core/ask-ai.html" style paths.
if (currentPath.includes(askAiPagePath.replace(/\/$/, ''))) {
console.log("Floating Ask AI Button: Not adding button on the Ask AI page itself.");
return; // Don't add the button on the target page
}
// --- Create the button ---
const fabLink = document.createElement('a');
fabLink.className = 'floating-ask-ai-button';
fabLink.href = askAiPagePath; // root-relative; adjust if the site lives in a sub-directory
fabLink.title = 'Ask Crawl4AI Assistant';
fabLink.setAttribute('aria-label', 'Ask Crawl4AI Assistant');
// Add content (using SVG chat-bubble icon for better visuals)
fabLink.innerHTML = `
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" width="24" height="24" fill="currentColor">
<path d="M20 2H4c-1.1 0-2 .9-2 2v12c0 1.1.9 2 2 2h14l4 4V4c0-1.1-.9-2-2-2zm-2 12H6v-2h12v2zm0-3H6V9h12v2zm0-3H6V6h12v2z"/>
</svg>
<span>Ask AI</span>
`;
// Append to body
document.body.appendChild(fabLink);
console.log("Floating Ask AI Button added.");
});

View File

@@ -0,0 +1,119 @@
// ==== File: assets/github_stats.js ====
// Fetches star/fork counts from the GitHub REST API and injects a small stats
// badge into the fixed site header.
// Fixes: removed the unused `repoLinkElement` lookup and the dead commented-out
// fallback block; the injection step now reuses the element it already queried
// instead of querying the same selector twice.
document.addEventListener('DOMContentLoaded', async () => {
// --- Configuration ---
const targetHeaderSelector = '.terminal .container:first-child'; // Selector for your header container
// The badge is appended INSIDE this element when found (see injection below);
// set to null to append at the end of the header instead.
const insertBeforeSelector = '.terminal-nav';
// Hardcoded repo URL; could be read from the header's GitHub link instead.
const repoUrl = 'https://github.com/unclecode/crawl4ai';
// --- Find elements ---
const headerContainer = document.querySelector(targetHeaderSelector);
if (!headerContainer) {
console.warn('GitHub Stats: Header container not found with selector:', targetHeaderSelector);
return;
}
// --- Extract Repo Owner/Name ---
let owner = '';
let repo = '';
try {
const url = new URL(repoUrl);
const pathParts = url.pathname.split('/').filter(part => part.length > 0);
if (pathParts.length >= 2) {
owner = pathParts[0];
repo = pathParts[1];
}
} catch (e) {
console.error('GitHub Stats: Could not parse repository URL:', repoUrl, e);
return;
}
if (!owner || !repo) {
console.warn('GitHub Stats: Could not extract owner/repo from URL:', repoUrl);
return;
}
// --- Get Version (Attempt to extract from site title) ---
// Example title: "Crawl4AI Documentation (v0.5.x)"
let version = '';
const siteTitleElement = headerContainer.querySelector('.terminal-title, .site-title'); // Adjust selector based on theme's title element
if (siteTitleElement) {
const match = siteTitleElement.textContent.match(/\((v?[^)]+)\)/); // Look for text in parentheses starting with 'v' (optional)
if (match && match[1]) {
version = match[1].trim();
}
}
if (!version) {
console.info('GitHub Stats: Could not extract version from title. You might need to adjust the selector or regex.');
}
// --- Fetch GitHub API Data ---
let stars = '...';
let forks = '...';
try {
const apiUrl = `https://api.github.com/repos/${owner}/${repo}`;
const response = await fetch(apiUrl);
if (response.ok) {
const data = await response.json();
// Abbreviate large counts, e.g. 12345 -> "12.3k"
stars = data.stargazers_count > 1000 ? `${(data.stargazers_count / 1000).toFixed(1)}k` : data.stargazers_count;
forks = data.forks_count > 1000 ? `${(data.forks_count / 1000).toFixed(1)}k` : data.forks_count;
} else {
// Unauthenticated API calls are rate-limited per IP.
console.warn(`GitHub Stats: API request failed with status ${response.status}. Rate limit exceeded?`);
stars = 'N/A';
forks = 'N/A';
}
} catch (error) {
console.error('GitHub Stats: Error fetching repository data:', error);
stars = 'N/A';
forks = 'N/A';
}
// --- Create Badge HTML ---
const badgeContainer = document.createElement('div');
badgeContainer.className = 'github-stats-badge';
// Icons assume FontAwesome is loaded by the theme; the <i> tags render as
// empty elements otherwise.
badgeContainer.innerHTML = `
<a href="${repoUrl}" target="_blank" rel="noopener">
<!-- Optional Icon (FontAwesome example) -->
<!-- <i class="fab fa-github"></i> -->
<span class="repo-name">${owner}/${repo}</span>
${version ? `<span class="stat version"><i class="fas fa-tag"></i> ${version}</span>` : ''}
<span class="stat stars"><i class="fas fa-star"></i> ${stars}</span>
<span class="stat forks"><i class="fas fa-code-branch"></i> ${forks}</span>
</a>
`;
// --- Inject Badge into Header ---
// When the target element exists the badge is appended INSIDE it (the
// variable name is historical — it is not inserted before it).
const insertTarget = insertBeforeSelector ? headerContainer.querySelector(insertBeforeSelector) : null;
if (insertTarget) {
insertTarget.appendChild(badgeContainer);
} else {
headerContainer.appendChild(badgeContainer);
}
console.info('GitHub Stats: Badge added to header.');
});

View File

@@ -0,0 +1,441 @@
/* ==== File: assets/layout.css (Non-Fluid Centered Layout) ==== */
/* Page skeleton: fixed full-width header, fixed left sidebar, and a centered
   flex grid that holds the main content plus a sticky right-hand ToC. */
:root {
/* NOTE(review): styles.css also declares --header-height (65px),
   --sidebar-width and --toc-width — the later-loaded sheet wins;
   confirm which values are intended. */
--header-height: 55px; /* Adjust if needed */
--sidebar-width: 280px; /* Adjust if needed */
--toc-width: 340px; /* As specified */
--content-max-width: 90em; /* Max width for the centered content */
--layout-transition-speed: 0.2s;
--global-space: 10px;
}
/* --- Basic Setup --- */
html {
scroll-behavior: smooth;
/* Keep anchor targets from hiding under the fixed header */
scroll-padding-top: calc(var(--header-height) + 15px);
box-sizing: border-box;
}
*, *:before, *:after {
box-sizing: inherit;
}
body {
padding-top: 0;
padding-bottom: 0;
background-color: var(--background-color);
color: var(--font-color);
/* Prevents horizontal scrollbars during transitions */
overflow-x: hidden;
}
/* --- Fixed Header --- */
/* Full width, fixed header */
.terminal .container:first-child { /* Assuming this targets the header container */
position: fixed;
top: 0;
left: 0;
right: 0;
height: var(--header-height);
background-color: var(--background-color);
z-index: 1000;
border-bottom: 1px solid var(--progress-bar-background);
max-width: none; /* Override any container max-width */
padding: 0 calc(var(--global-space) * 2);
}
/* --- Main Layout Container (Below Header) --- */
/* This container just provides space for the fixed header */
/* NOTE(review): :has() needs a fairly recent browser — confirm the
   supported-browser targets for the docs site. */
.container:has(.terminal-mkdocs-main-grid) {
margin: 0 auto;
padding: 0;
padding-top: var(--header-height); /* Space for fixed header */
}
/* --- Flex Container: Grid holding content and toc (CENTERED) --- */
/* THIS is the main centered block */
.terminal-mkdocs-main-grid {
display: flex;
align-items: flex-start;
/* Enforce max-width and center */
max-width: var(--content-max-width);
margin-left: auto;
margin-right: auto;
position: relative;
/* Apply side padding within the centered block */
padding-left: calc(var(--global-space) * 2);
padding-right: calc(var(--global-space) * 2);
/* Add margin-left to clear the fixed sidebar */
/* (this later declaration overrides the margin-left: auto above, so the
   grid clears the sidebar rather than centering symmetrically) */
margin-left: var(--sidebar-width);
}
/* --- 1. Fixed Left Sidebar (Viewport Relative) --- */
#terminal-mkdocs-side-panel {
position: fixed;
top: var(--header-height);
/* Align the sidebar's left edge with the centered grid on wide screens
   (presumably 90vw approximates the grid's width — TODO confirm) */
left: max(0px, calc((90vw - var(--content-max-width)) / 2));
bottom: 0;
width: var(--sidebar-width);
background-color: var(--background-color);
border-right: 1px solid var(--progress-bar-background);
overflow-y: auto;
z-index: 900;
padding: 1em calc(var(--global-space) * 2);
padding-bottom: 2em;
/* transition: left var(--layout-transition-speed) ease-in-out; */
}
/* --- 2. Main Content Area (Within Centered Grid) --- */
#terminal-mkdocs-main-content {
flex-grow: 1;
flex-shrink: 1;
min-width: 0; /* Flexbox shrink fix */
/* No left/right margins needed here - handled by parent grid */
margin-left: 0;
margin-right: 0;
/* Internal Padding */
padding: 1.5em 2em;
position: relative;
z-index: 1;
}
/* --- 3. Right Table of Contents (Sticky, Within Centered Grid) --- */
#toc-sidebar {
flex-basis: var(--toc-width);
flex-shrink: 0;
width: var(--toc-width);
position: sticky; /* Sticks within the centered grid */
top: var(--header-height);
align-self: stretch;
height: calc(100vh - var(--header-height));
overflow-y: auto;
padding: 1.5em 1em;
font-size: 0.85em;
border-left: 1px solid var(--progress-bar-background);
z-index: 800;
/* display: none; /* JS handles */
}
/* (ToC link styles remain the same) */
#toc-sidebar h4 { margin-top: 0; margin-bottom: 1em; font-size: 1.1em; color: var(--secondary-color); padding-left: 0.8em; }
#toc-sidebar ul { list-style: none; padding: 0; margin: 0; }
#toc-sidebar ul li a { display: block; padding: 0.3em 0; color: var(--secondary-color); text-decoration: none; border-left: 3px solid transparent; padding-left: 0.8em; transition: all 0.1s ease-in-out; line-height: 1.4; word-break: break-word; }
#toc-sidebar ul li.toc-level-3 a { padding-left: 1.8em; }
#toc-sidebar ul li.toc-level-4 a { padding-left: 2.8em; }
#toc-sidebar ul li a:hover { color: var(--font-color); background-color: rgba(255, 255, 255, 0.05); }
#toc-sidebar ul li a.active { color: var(--primary-color); border-left-color: var(--primary-color); background-color: rgba(80, 255, 255, 0.08); }
/* --- Footer Styling (Respects Centered Layout) --- */
footer {
background-color: var(--code-bg-color);
color: var(--secondary-color);
position: relative;
z-index: 10;
margin-top: 2em;
/* Apply margin-left to clear the fixed sidebar */
margin-left: var(--sidebar-width);
/* Constrain width relative to the centered grid it follows */
max-width: calc(var(--content-max-width) - var(--sidebar-width));
margin-right: auto; /* Keep it left-aligned within the space next to sidebar */
/* Use padding consistent with the grid */
padding: 2em calc(var(--global-space) * 2);
}
/* Adjust footer grid if needed */
.terminal-mkdocs-footer-grid {
display: grid;
grid-template-columns: 1fr auto;
gap: 1em;
align-items: center;
}
/* ==========================================================================
RESPONSIVENESS (Adapting the Non-Fluid Layout)
========================================================================== */
/* --- Medium screens: Hide ToC --- */
@media screen and (max-width: 1200px) {
#toc-sidebar {
display: none;
}
.terminal-mkdocs-main-grid {
/* Grid adjusts automatically as ToC is removed */
/* Ensure grid padding remains */
padding-left: calc(var(--global-space) * 2);
padding-right: calc(var(--global-space) * 2);
}
#terminal-mkdocs-main-content {
/* Content area naturally expands */
}
footer {
/* Footer still respects the left sidebar and overall max width */
margin-left: var(--sidebar-width);
max-width: calc(var(--content-max-width) - var(--sidebar-width));
/* Padding remains consistent */
padding-left: calc(var(--global-space) * 2);
padding-right: calc(var(--global-space) * 2);
}
}
/* --- Small screens: Hide left sidebar, full width content & footer --- */
@media screen and (max-width: 768px) {
#terminal-mkdocs-side-panel {
/* Parked one full width off-screen; a JS toggle is expected to add
   .sidebar-visible to slide it in (see note at end of this block) */
left: calc(-1 * var(--sidebar-width));
z-index: 1100;
box-shadow: 2px 0 10px rgba(0,0,0,0.3);
}
#terminal-mkdocs-side-panel.sidebar-visible {
left: 0;
}
.terminal-mkdocs-main-grid {
/* Grid now takes full width (minus body padding) */
margin-left: 0; /* Override sidebar margin */
margin-right: 0; /* Override auto margin */
max-width: 100%; /* Allow full width */
padding-left: var(--global-space); /* Reduce padding */
padding-right: var(--global-space);
}
#terminal-mkdocs-main-content {
padding: 1.5em 1em; /* Adjust internal padding */
}
footer {
margin-left: 0; /* Full width footer */
max-width: 100%; /* Allow full width */
padding: 2em 1em; /* Adjust internal padding */
}
.terminal-mkdocs-footer-grid {
grid-template-columns: 1fr; /* Stack footer items */
text-align: center;
gap: 0.5em;
}
/* Remember JS for toggle button & overlay */
}
/* ==== GitHub Stats Badge Styling ==== */
/* Badge element injected into the header by github_stats.js */
.github-stats-badge {
display: inline-block; /* Or flex if needed */
margin-left: 2em; /* Adjust spacing */
vertical-align: middle; /* Align with other header items */
font-size: 0.9em; /* Slightly smaller font */
}
.github-stats-badge a {
color: var(--secondary-color); /* Use secondary color */
text-decoration: none;
display: flex; /* Use flex for alignment */
align-items: center;
gap: 0.8em; /* Space between items */
padding: 0.2em 0.5em;
border: 1px solid var(--progress-bar-background); /* Subtle border */
border-radius: 4px;
transition: color 0.2s, background-color 0.2s;
}
.github-stats-badge a:hover {
color: var(--font-color); /* Brighter color on hover */
background-color: var(--progress-bar-background); /* Subtle background on hover */
}
.github-stats-badge .repo-name {
color: var(--font-color); /* Make repo name stand out slightly */
font-weight: 500; /* Optional bolder weight */
}
.github-stats-badge .stat {
/* Styles for individual stats (version, stars, forks) */
white-space: nowrap; /* Prevent wrapping */
}
.github-stats-badge .stat i {
/* Optional: Style for FontAwesome icons */
margin-right: 0.3em;
color: var(--secondary-dimmed-color); /* Dimmer color for icons */
}
/* Adjust positioning relative to search/nav if needed */
/* Example: If search is floated right */
/* .terminal-nav { float: left; } */
/* .github-stats-badge { float: left; } */
/* #mkdocs-search-query { float: right; } */
/* --- Responsive adjustments --- */
@media screen and (max-width: 900px) { /* Example breakpoint */
.github-stats-badge .repo-name {
display: none; /* Hide full repo name on smaller screens */
}
.github-stats-badge {
margin-left: 1em;
}
.github-stats-badge a {
gap: 0.5em;
}
}
@media screen and (max-width: 768px) {
/* Further hide or simplify on mobile if needed */
.github-stats-badge {
display: none; /* Example: Hide completely on smallest screens */
}
}
/* --- Ask AI Selection Button --- */
/* Popup button placed next to a text selection by selection_ask_ai.js */
.ask-ai-selection-button {
background-color: var(--primary-dimmed-color, #09b5a5);
color: var(--background-color, #070708);
border: none;
padding: 4px 8px;
font-size: 0.8em;
border-radius: 4px;
cursor: pointer;
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.3);
transition: background-color 0.2s ease;
white-space: nowrap;
}
.ask-ai-selection-button:hover {
background-color: var(--primary-color, #50ffff);
}
/* ==== File: docs/assets/layout.css (Additions) ==== */
/* ... (keep all existing layout CSS) ... */
/* --- Copy Code Button Styling --- */
/* Ensure the parent <pre> can contain the absolutely positioned button */
#terminal-mkdocs-main-content pre {
position: relative; /* Needed for absolute positioning of child */
/* Extra top padding makes room for the button row */
padding-top: 2.5em;
padding-right: 1em; /* Ensure padding is sufficient */
}
.copy-code-button {
position: absolute;
top: 0.5em; /* Adjust spacing from top */
/* NOTE(review): positioned top-LEFT, while copy buttons conventionally sit
   top-right (and the <pre> rule above pads the right) — confirm intended. */
left: 0.5em; /* Adjust spacing from left */
z-index: 1; /* Sit on top of code */
background-color: var(--progress-bar-background, #444); /* Use a background */
color: var(--font-color, #eaeaea);
border: 1px solid var(--secondary-color, #727578);
padding: 3px 8px;
font-size: 0.8em;
font-family: var(--font-stack, monospace);
border-radius: 4px;
cursor: pointer;
opacity: 0; /* Hidden by default */
transition: opacity 0.2s ease-in-out, background-color 0.2s ease, color 0.2s ease;
white-space: nowrap;
}
/* Show button on hover of the <pre> container */
#terminal-mkdocs-main-content pre:hover .copy-code-button {
opacity: 0.8; /* Show partially */
}
.copy-code-button:hover {
opacity: 1; /* Fully visible on button hover */
background-color: var(--secondary-color, #727578);
}
.copy-code-button:focus {
opacity: 1; /* Ensure visible when focused */
outline: 1px dashed var(--primary-color);
}
/* Style for "Copied!" state (class toggled by copy_code.js) */
.copy-code-button.copied {
background-color: var(--primary-dimmed-color, #09b5a5);
color: var(--background-color, #070708);
border-color: var(--primary-dimmed-color, #09b5a5);
opacity: 1; /* Ensure visible */
}
.copy-code-button.copied:hover {
background-color: var(--primary-dimmed-color, #09b5a5); /* Prevent hover change */
}
/* ==== File: docs/assets/layout.css (Additions) ==== */
/* ... (keep all existing layout CSS) ... */
/* --- Floating Ask AI Button --- */
/* Fixed bottom-right FAB injected by floating_ask_ai_button.js */
.floating-ask-ai-button {
position: fixed;
bottom: 25px;
right: 25px;
z-index: 1050; /* Below modals, above most content */
background-color: var(--primary-dimmed-color, #09b5a5);
color: var(--background-color, #070708);
border: none;
border-radius: 50%; /* Make it circular */
width: 60px; /* Adjust size */
height: 60px; /* Adjust size */
padding: 10px; /* Adjust padding */
box-shadow: 0 4px 10px rgba(0, 0, 0, 0.4);
cursor: pointer;
transition: background-color 0.2s ease, transform 0.2s ease;
display: flex;
flex-direction: column; /* Stack icon and text */
align-items: center;
justify-content: center;
text-decoration: none;
text-align: center;
}
.floating-ask-ai-button svg {
width: 24px; /* Control icon size */
height: 24px;
}
.floating-ask-ai-button span {
font-size: 0.7em;
margin-top: 2px; /* Space between icon and text */
display: block; /* Ensure it takes space */
line-height: 1;
}
.floating-ask-ai-button:hover {
background-color: var(--primary-color, #50ffff);
transform: scale(1.05); /* Slight grow effect */
}
.floating-ask-ai-button:focus {
outline: 2px solid var(--primary-color);
outline-offset: 2px;
}
/* Optional: Hide text on smaller screens if needed */
@media screen and (max-width: 768px) {
.floating-ask-ai-button span {
/* display: none; */ /* Uncomment to hide text */
}
.floating-ask-ai-button {
width: 55px;
height: 55px;
bottom: 20px;
right: 20px;
}
}

View File

@@ -0,0 +1,109 @@
// ==== File: docs/assets/selection_ask_ai.js ====
// Shows a small "Ask AI" button next to any text the reader selects; clicking
// it forwards the selection (base64-encoded in the ?qq= query param) to the
// Ask AI page.
document.addEventListener('DOMContentLoaded', () => {
let askAiButton = null; // lazily created singleton button element
const askAiPageUrl = '/core/ask-ai/'; // Adjust if your Ask AI page path is different
// Create the floating button once and wire up its click handler.
function createAskAiButton() {
const button = document.createElement('button');
button.id = 'ask-ai-selection-btn';
button.className = 'ask-ai-selection-button';
button.textContent = 'Ask AI'; // Or use an icon
button.style.display = 'none'; // Initially hidden
button.style.position = 'absolute';
button.style.zIndex = '1500'; // Ensure it's on top
document.body.appendChild(button);
button.addEventListener('click', handleAskAiClick);
return button;
}
// Return the trimmed selection text, or null when nothing useful is selected.
function getSafeSelectedText() {
const selection = window.getSelection();
if (!selection || selection.rangeCount === 0) {
return null;
}
// Avoid selecting text within the button itself if it was somehow selected
const container = selection.getRangeAt(0).commonAncestorContainer;
if (askAiButton && askAiButton.contains(container)) {
return null;
}
const text = selection.toString().trim();
return text.length > 0 ? text : null;
}
// Place the button just above and to the right of the selection rectangle.
function positionButton(event) {
const selection = window.getSelection();
if (!selection || selection.rangeCount === 0 || selection.isCollapsed) {
hideButton();
return;
}
const range = selection.getRangeAt(0);
const rect = range.getBoundingClientRect();
// Calculate position: top-right of the selection
const scrollX = window.scrollX;
const scrollY = window.scrollY;
const buttonTop = rect.top + scrollY - askAiButton.offsetHeight - 5; // 5px above
const buttonLeft = rect.right + scrollX + 5; // 5px to the right
askAiButton.style.top = `${buttonTop}px`;
askAiButton.style.left = `${buttonLeft}px`;
askAiButton.style.display = 'block'; // Show the button
}
// Hide the button if it exists.
function hideButton() {
if (askAiButton) {
askAiButton.style.display = 'none';
}
}
// Click handler: encode the selection and navigate to the Ask AI page.
function handleAskAiClick(event) {
event.stopPropagation(); // Prevent mousedown from hiding button immediately
const selectedText = getSafeSelectedText();
if (selectedText) {
console.log("Selected Text:", selectedText);
// Base64 encode for URL safety (handles special chars, line breaks)
// Use encodeURIComponent first for proper Unicode handling before btoa
// (unescape is deprecated, but this is the classic UTF-8 -> base64 shim;
// the receiving page presumably reverses it via atob — TODO confirm)
const encodedText = btoa(unescape(encodeURIComponent(selectedText)));
const targetUrl = `${askAiPageUrl}?qq=${encodedText}`;
console.log("Navigating to:", targetUrl);
window.location.href = targetUrl; // Navigate to Ask AI page
}
hideButton(); // Hide after click
}
// --- Event Listeners ---
// Show button on mouse up after selection
document.addEventListener('mouseup', (event) => {
// Slight delay to ensure selection is registered
setTimeout(() => {
const selectedText = getSafeSelectedText();
if (selectedText) {
if (!askAiButton) {
askAiButton = createAskAiButton();
}
// Don't position if the click was ON the button itself
if (event.target !== askAiButton) {
positionButton(event);
}
} else {
hideButton();
}
}, 10); // Small delay
});
// Hide button on scroll or click elsewhere.
// mousedown fires before the button's click event; excluding the button here
// is what lets handleAskAiClick run before the button is hidden.
document.addEventListener('mousedown', (event) => {
// Hide if clicking anywhere EXCEPT the button itself
if (askAiButton && event.target !== askAiButton) {
hideButton();
}
});
document.addEventListener('scroll', hideButton, true); // Capture scroll events
console.log("Selection Ask AI script loaded.");
});

View File

@@ -6,8 +6,8 @@
} }
:root { :root {
--global-font-size: 16px; --global-font-size: 14px;
--global-code-font-size: 16px; --global-code-font-size: 13px;
--global-line-height: 1.5em; --global-line-height: 1.5em;
--global-space: 10px; --global-space: 10px;
--font-stack: Menlo, Monaco, Lucida Console, Liberation Mono, DejaVu Sans Mono, Bitstream Vera Sans Mono, --font-stack: Menlo, Monaco, Lucida Console, Liberation Mono, DejaVu Sans Mono, Bitstream Vera Sans Mono,
@@ -50,8 +50,17 @@
--display-h1-decoration: none; --display-h1-decoration: none;
--display-h1-decoration: none; --display-h1-decoration: none;
--header-height: 65px; /* Adjust based on your actual header height */
--sidebar-width: 280px; /* Adjust based on your desired sidebar width */
--toc-width: 240px; /* Adjust based on your desired ToC width */
--layout-transition-speed: 0.2s; /* For potential future animations */
--page-width : 100em; /* Adjust based on your design */
} }
/* body { /* body {
background-color: var(--background-color); background-color: var(--background-color);
color: var(--font-color); color: var(--font-color);
@@ -256,4 +265,6 @@ div.badges a {
} }
div.badges a > img { div.badges a > img {
width: auto; width: auto;
} }

144
docs/md_v2/assets/toc.js Normal file
View File

@@ -0,0 +1,144 @@
// ==== File: assets/toc.js ====
// Builds the right-hand "On this page" ToC from the page's h2-h4 headings,
// runs an IntersectionObserver scroll-spy to highlight the current section,
// and moves the footer (plus its preceding <hr>) into the main content column.
// Fixes: guard against pages with no <footer> (previously threw a TypeError
// on footer.previousElementSibling); removed the unused `tocLinks` local.
document.addEventListener('DOMContentLoaded', () => {
const mainContent = document.getElementById('terminal-mkdocs-main-content');
const tocContainer = document.getElementById('toc-sidebar');
const mainGrid = document.querySelector('.terminal-mkdocs-main-grid'); // flex container for content + ToC
if (!mainContent) {
console.warn("TOC Generator: Main content area '#terminal-mkdocs-main-content' not found.");
return;
}
// --- Create ToC container if it doesn't exist ---
let tocElement = tocContainer;
if (!tocElement) {
if (!mainGrid) {
console.warn("TOC Generator: Flex container '.terminal-mkdocs-main-grid' not found to append ToC.");
return;
}
tocElement = document.createElement('aside');
tocElement.id = 'toc-sidebar';
tocElement.style.display = 'none'; // keep hidden until populated
// Append it as the last child of the flex grid
mainGrid.appendChild(tocElement);
console.info("TOC Generator: Created '#toc-sidebar' element.");
}
// --- Find Headings (h2, h3, h4 feed the ToC) ---
const headings = mainContent.querySelectorAll('h2, h3, h4');
if (headings.length === 0) {
console.info("TOC Generator: No headings found on this page. ToC not generated.");
tocElement.style.display = 'none'; // Ensure it's hidden
return;
}
// --- Generate ToC List ---
const tocList = document.createElement('ul');
const observerTargets = []; // headings watched by the scroll-spy observer
headings.forEach((heading, index) => {
// Headings need an id to be linkable; synthesize a slug-like one when missing.
if (!heading.id) {
heading.id = `toc-heading-${index}-${heading.textContent.toLowerCase().replace(/\s+/g, '-').replace(/[^a-z0-9-]/g, '')}`;
}
const listItem = document.createElement('li');
const link = document.createElement('a');
link.href = `#${heading.id}`;
link.textContent = heading.textContent;
// toc-level-N class drives per-level indentation in the CSS
const level = parseInt(heading.tagName.substring(1), 10); // 2, 3, or 4
listItem.classList.add(`toc-level-${level}`);
listItem.appendChild(link);
tocList.appendChild(listItem);
observerTargets.push(heading);
});
// --- Populate and Show ToC ---
const tocTitle = document.createElement('h4');
tocTitle.textContent = 'On this page'; // Customize title if needed
tocElement.innerHTML = ''; // Clear previous content if any
tocElement.appendChild(tocTitle);
tocElement.appendChild(tocList);
tocElement.style.display = ''; // Show the ToC container
console.info(`TOC Generator: Generated ToC with ${headings.length} items.`);
// --- Scroll Spy using Intersection Observer ---
let activeLink = null; // currently highlighted ToC link
const observerOptions = {
// Shift the observation box down by the header height and pull its bottom
// up 60% so a heading activates while its section occupies the viewport.
rootMargin: `-${getComputedStyle(document.documentElement).getPropertyValue('--header-height').trim()} 0px -60% 0px`,
threshold: 0 // Trigger as soon as any part enters/exits the boundary
};
const observerCallback = (entries) => {
// Among the headings currently inside the margin box, pick the topmost.
let topmostVisibleHeading = null;
entries.forEach(entry => {
const link = tocElement.querySelector(`a[href="#${entry.target.id}"]`);
if (!link) return; // heading without a matching ToC entry
if (entry.isIntersecting) {
if (!topmostVisibleHeading || entry.boundingClientRect.top < topmostVisibleHeading.boundingClientRect.top) {
topmostVisibleHeading = entry.target;
}
}
});
if (topmostVisibleHeading) {
const newActiveLink = tocElement.querySelector(`a[href="#${topmostVisibleHeading.id}"]`);
if (newActiveLink && newActiveLink !== activeLink) {
if (activeLink) {
activeLink.classList.remove('active');
activeLink.parentElement.classList.remove('active-parent'); // Optional parent styling
}
newActiveLink.classList.add('active');
newActiveLink.parentElement.classList.add('active-parent'); // Optional parent styling
activeLink = newActiveLink;
// Optional: newActiveLink.scrollIntoView({ behavior: 'smooth', block: 'nearest' });
}
}
// When nothing intersects (e.g. scrolled past the last section) the
// previously active link deliberately stays highlighted.
};
const observer = new IntersectionObserver(observerCallback, observerOptions);
observerTargets.forEach(heading => observer.observe(heading));
// Initial highlight once layout has settled.
setTimeout(() => {
observerCallback(observer.takeRecords()); // Process initial state
}, 100);
// --- Move footer (and the hr before it) to the end of the main content ---
// Guarded: querySelector returns null on pages without a footer.
const footer = document.querySelector('footer');
if (footer) {
const hr = footer.previousElementSibling;
if (hr && hr.tagName === 'HR') {
mainContent.appendChild(hr);
}
mainContent.appendChild(footer);
console.info("TOC Generator: Footer moved to the end of the main content.");
}
});

View File

@@ -251,7 +251,7 @@ from crawl4ai import (
RoundRobinProxyStrategy, RoundRobinProxyStrategy,
) )
import asyncio import asyncio
from crawl4ai.proxy_strategy import ProxyConfig from crawl4ai import ProxyConfig
async def main(): async def main():
# Load proxies and create rotation strategy # Load proxies and create rotation strategy
proxies = ProxyConfig.from_env() proxies = ProxyConfig.from_env()

74
docs/md_v2/core/ask-ai.md Normal file
View File

@@ -0,0 +1,74 @@
<div class="ask-ai-container">
<iframe id="ask-ai-frame" src="../../ask_ai/index.html" width="100%" style="border:none; display: block;" title="Crawl4AI Assistant"></iframe>
</div>
<script>
// Iframe height adjustment
// Size the Ask-AI iframe to fill the viewport below the site header,
// never shrinking below 600px.
function resizeAskAiIframe() {
    const iframe = document.getElementById('ask-ai-frame');
    if (iframe) {
        // --header-height may be unset; fall back to 55 (treated as px by parseFloat).
        const headerHeight = parseFloat(getComputedStyle(document.documentElement).getPropertyValue('--header-height') || '55');
        // Footer is removed by JS below, so calculate height based on header + small buffer
        const topOffset = headerHeight + 20; // Header + buffer/margin
        const availableHeight = window.innerHeight - topOffset;
        iframe.style.height = Math.max(600, availableHeight) + 'px'; // Min height 600px
    }
}
// Run immediately and on resize/load
resizeAskAiIframe(); // Initial call
let resizeTimer;
window.addEventListener('load', resizeAskAiIframe);
// Debounce resize events (150 ms) so the iframe is not re-measured on every pixel of a drag.
window.addEventListener('resize', () => {
    clearTimeout(resizeTimer);
    resizeTimer = setTimeout(resizeAskAiIframe, 150);
});
// Remove Footer & HR from parent page (DOM Ready might be safer)
// Remove the parent page's footer (and the <hr> directly before it) so the
// embedded assistant can use the full viewport height.
document.addEventListener('DOMContentLoaded', () => {
    setTimeout(() => { // Add slight delay just in case elements render slowly
        const footer = window.parent.document.querySelector('footer'); // Target parent document
        if (footer) {
            const hrBeforeFooter = footer.previousElementSibling;
            if (hrBeforeFooter && hrBeforeFooter.tagName === 'HR') {
                hrBeforeFooter.remove();
            }
            footer.remove();
            // Trigger resize again after removing footer
            resizeAskAiIframe();
        } else {
            console.warn("Ask AI Page: Could not find footer in parent document to remove.");
        }
    }, 100); // Shorter delay
});
</script>
<style>
#terminal-mkdocs-main-content {
padding: 0 !important;
margin: 0;
width: 100%;
height: 100%;
overflow: hidden; /* Prevent body scrollbars, panels handle scroll */
}
/* Ensure iframe container takes full space */
#terminal-mkdocs-main-content .ask-ai-container {
/* Remove negative margins if footer removal handles space */
margin: 0;
padding: 0;
max-width: none;
/* Let the JS set the height */
/* height: 600px; Initial fallback height */
overflow: hidden; /* Hide potential overflow before JS resize */
}
/* Hide title/paragraph if they were part of the markdown */
/* Alternatively, just remove them from the .md file directly */
/* #terminal-mkdocs-main-content > h1,
#terminal-mkdocs-main-content > p:first-of-type {
display: none;
} */
</style>

File diff suppressed because it is too large Load Diff

View File

View File

@@ -7,10 +7,11 @@ docs_dir: docs/md_v2
nav: nav:
- Home: 'index.md' - Home: 'index.md'
- "Ask AI": "core/ask-ai.md"
- "Quick Start": "core/quickstart.md"
- Setup & Installation: - Setup & Installation:
- "Installation": "core/installation.md" - "Installation": "core/installation.md"
- "Docker Deployment": "core/docker-deployment.md" - "Docker Deployment": "core/docker-deployment.md"
- "Quick Start": "core/quickstart.md"
- "Blog & Changelog": - "Blog & Changelog":
- "Blog Home": "blog/index.md" - "Blog Home": "blog/index.md"
- "Changelog": "https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md" - "Changelog": "https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md"
@@ -76,6 +77,7 @@ extra:
version: !ENV [CRAWL4AI_VERSION, 'development'] version: !ENV [CRAWL4AI_VERSION, 'development']
extra_css: extra_css:
- assets/layout.css
- assets/styles.css - assets/styles.css
- assets/highlight.css - assets/highlight.css
- assets/dmvendor.css - assets/dmvendor.css
@@ -83,4 +85,9 @@ extra_css:
extra_javascript: extra_javascript:
- assets/highlight.min.js - assets/highlight.min.js
- assets/highlight_init.js - assets/highlight_init.js
- https://buttons.github.io/buttons.js - https://buttons.github.io/buttons.js
- assets/toc.js
- assets/github_stats.js
- assets/selection_ask_ai.js
- assets/copy_code.js
- assets/floating_ask_ai_button.js

View File

@@ -1,20 +0,0 @@
The file /docs/md_v2/api/parameters.md should be updated to include the new network and console capturing parameters.
Here's what needs to be updated:
1. Change section title from:
```
### G) **Debug & Logging**
```
to:
```
### G) **Debug, Logging & Capturing**
```
2. Add new parameters to the table:
```
| **`capture_network_requests`** | `bool` (False) | Captures all network requests, responses, and failures during the crawl. Available in `result.network_requests`. |
| **`capture_console_messages`** | `bool` (False) | Captures all browser console messages (logs, warnings, errors) during the crawl. Available in `result.console_messages`. |
```
These changes demonstrate how to use the new network and console capturing features in the CrawlerRunConfig.

View File

@@ -0,0 +1,596 @@
# ==== File: test_rest_api_deep_crawl.py ====
import pytest
import pytest_asyncio
import httpx
import json
import asyncio
import os
from typing import List, Dict, Any, AsyncGenerator
from dotenv import load_dotenv
load_dotenv() # Load environment variables from .env file if present
# --- Test Configuration ---
# Fix: BASE_URL was assigned twice from the same env var, leaving the first
# assignment (Docker default :11235) dead. Keep a single assignment; override
# the target server via CRAWL4AI_TEST_URL (e.g. http://localhost:11235 when
# the server runs in Docker; :8020 is the dev-debug default).
BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020")
DEEP_CRAWL_BASE_URL = "https://docs.crawl4ai.com/samples/deepcrawl/"  # Start URL for deep crawls
DEEP_CRAWL_DOMAIN = "docs.crawl4ai.com"  # Used for domain filter
# --- Helper Functions ---
def load_proxies_from_env() -> List[Dict]:
    """Parse proxy definitions from the PROXIES environment variable.

    PROXIES is a comma-separated list where each entry is either
    "ip:port:username:password" or "ip:port". Entries in any other shape
    are skipped with a warning. Returns a list of dicts with at least a
    "server" URL and the original "ip".
    """
    parsed: List[Dict] = []
    raw = os.getenv("PROXIES", "")
    if not raw:
        print("PROXIES environment variable not set or empty.")
        return parsed
    try:
        for entry in raw.split(","):
            entry = entry.strip()
            if not entry:
                continue  # Tolerate trailing/doubled commas.
            fields = entry.split(":")
            if len(fields) == 4:
                host, port, user, pwd = fields
                parsed.append({
                    "server": f"http://{host}:{port}",  # Assuming http, adjust if needed
                    "username": user,
                    "password": pwd,
                    "ip": host,  # Store original IP if available
                })
            elif len(fields) == 2:  # ip:port only
                host, port = fields
                parsed.append({
                    "server": f"http://{host}:{port}",
                    "ip": host,
                })
            else:
                print(f"Skipping invalid proxy string format: {entry}")
    except Exception as e:
        print(f"Error loading proxies from environment: {e}")
    return parsed
async def check_server_health(client: httpx.AsyncClient):
    """Check if the server is healthy before running tests.

    Hits GET /health on the configured BASE_URL; any transport error or
    non-2xx status aborts the test immediately via pytest.fail.
    """
    try:
        response = await client.get("/health")
        response.raise_for_status()
        print(f"\nServer healthy: {response.json()}")
        return True
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        # pytrace=False keeps the failure message short instead of dumping this helper's traceback.
        pytest.fail(f"Server health check failed: {e}. Is the server running at {BASE_URL}?", pytrace=False)
async def assert_crawl_result_structure(result: Dict[str, Any], check_ssl=False):
    """Verify a single crawl result dict has the shape deep-crawl tests expect.

    Requires url/success/html plus a metadata dict carrying "depth"; when
    check_ssl is True, also requires an "ssl_certificate" field that is a
    dict or None.
    """
    assert isinstance(result, dict)
    for key in ("url", "success", "html", "metadata"):
        assert key in result
    metadata = result["metadata"]
    assert isinstance(metadata, dict)
    assert "depth" in metadata  # Deep crawls annotate every result with its depth.
    if check_ssl:
        assert "ssl_certificate" in result
        cert = result["ssl_certificate"]
        assert cert is None or isinstance(cert, dict)
async def process_streaming_response(response: httpx.Response) -> List[Dict[str, Any]]:
    """Processes an NDJSON streaming response.

    Collects every result object (a JSON line carrying a "url" key) until the
    server emits its {"status": "completed"} marker. Fails the test if a line
    is not valid JSON or the completion marker never arrives.
    """
    results = []
    completed = False
    async for line in response.aiter_lines():
        if line:  # Skip blank keep-alive lines.
            try:
                data = json.loads(line)
                if data.get("status") == "completed":
                    completed = True
                    break # Stop processing after completion marker
                elif data.get("url"): # Ensure it looks like a result object
                    results.append(data)
                else:
                    print(f"Received non-result JSON line: {data}") # Log other status messages if needed
            except json.JSONDecodeError:
                pytest.fail(f"Failed to decode JSON line: {line}")
    assert completed, "Streaming response did not end with a completion marker."
    return results
# --- Pytest Fixtures ---
@pytest_asyncio.fixture(scope="function")
async def async_client() -> AsyncGenerator[httpx.AsyncClient, None]:
    """Provides an async HTTP client bound to BASE_URL.

    Function-scoped so each test gets a fresh client; the generous 300 s
    timeout accommodates long-running deep crawls.
    """
    # Increased timeout for potentially longer deep crawls
    async with httpx.AsyncClient(base_url=BASE_URL, timeout=300.0) as client:
        yield client
    # No explicit close needed with 'async with'
# --- Test Class ---
@pytest.mark.asyncio
class TestDeepCrawlEndpoints:
    @pytest_asyncio.fixture(autouse=True)
    async def check_health_before_tests(self, async_client: httpx.AsyncClient):
        """Fixture to ensure server is healthy before each test in the class.

        autouse=True runs this before every test method; check_server_health
        fails fast (pytest.fail) when the /health endpoint is unreachable.
        """
        await check_server_health(async_client)
# 1. Basic Deep Crawl
async def test_deep_crawl_basic_bfs(self, async_client: httpx.AsyncClient):
"""Test BFS deep crawl with limited depth and pages."""
max_depth = 1
max_pages = 3 # start_url + 2 more
payload = {
"urls": [DEEP_CRAWL_BASE_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": False,
"cache_mode": "BYPASS", # Use string value for CacheMode
"deep_crawl_strategy": {
"type": "BFSDeepCrawlStrategy",
"params": {
"max_depth": max_depth,
"max_pages": max_pages,
# Minimal filters for basic test
"filter_chain": {
"type": "FilterChain",
"params": {
"filters": [
{
"type": "DomainFilter",
"params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}
}
]
}
}
}
}
}
}
}
response = await async_client.post("/crawl", json=payload)
response.raise_for_status()
data = response.json()
assert data["success"] is True
assert isinstance(data["results"], list)
assert len(data["results"]) > 1 # Should be more than just the start URL
assert len(data["results"]) <= max_pages # Respect max_pages
found_depth_0 = False
found_depth_1 = False
for result in data["results"]:
await assert_crawl_result_structure(result)
assert result["success"] is True
assert DEEP_CRAWL_DOMAIN in result["url"]
depth = result["metadata"]["depth"]
assert depth <= max_depth
if depth == 0: found_depth_0 = True
if depth == 1: found_depth_1 = True
assert found_depth_0
assert found_depth_1
# 2. Deep Crawl with Filtering
async def test_deep_crawl_with_filters(self, async_client: httpx.AsyncClient):
"""Test BFS deep crawl with content type and domain filters."""
max_depth = 1
max_pages = 5
payload = {
"urls": [DEEP_CRAWL_BASE_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": False,
"cache_mode": "BYPASS",
"deep_crawl_strategy": {
"type": "BFSDeepCrawlStrategy",
"params": {
"max_depth": max_depth,
"max_pages": max_pages,
"filter_chain": {
"type": "FilterChain",
"params": {
"filters": [
{
"type": "DomainFilter",
"params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}
},
{
"type": "ContentTypeFilter",
"params": {"allowed_types": ["text/html"]}
},
# Example: Exclude specific paths using regex
{
"type": "URLPatternFilter",
"params": {
"patterns": ["*/category-3/*"], # Block category 3
"reverse": True # Block if match
}
}
]
}
}
}
}
}
}
}
response = await async_client.post("/crawl", json=payload)
response.raise_for_status()
data = response.json()
assert data["success"] is True
assert len(data["results"]) > 0
assert len(data["results"]) <= max_pages
for result in data["results"]:
await assert_crawl_result_structure(result)
assert result["success"] is True
assert DEEP_CRAWL_DOMAIN in result["url"]
assert "category-3" not in result["url"] # Check if filter worked
assert result["metadata"]["depth"] <= max_depth
# 3. Deep Crawl with Scoring
async def test_deep_crawl_with_scoring(self, async_client: httpx.AsyncClient):
"""Test BFS deep crawl with URL scoring."""
max_depth = 1
max_pages = 4
payload = {
"urls": [DEEP_CRAWL_BASE_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": False,
"cache_mode": "BYPASS",
"deep_crawl_strategy": {
"type": "BFSDeepCrawlStrategy",
"params": {
"max_depth": max_depth,
"max_pages": max_pages,
"filter_chain": { # Keep basic domain filter
"type": "FilterChain",
"params": { "filters": [{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}]}
},
"url_scorer": { # Add scorer
"type": "CompositeScorer",
"params": {
"scorers": [
{ # Favor pages with 'product' in the URL
"type": "KeywordRelevanceScorer",
"params": {"keywords": ["product"], "weight": 1.0}
},
{ # Penalize deep paths slightly
"type": "PathDepthScorer",
"params": {"optimal_depth": 2, "weight": -0.2}
}
]
}
},
# Set a threshold if needed: "score_threshold": 0.1
}
}
}
}
}
response = await async_client.post("/crawl", json=payload)
response.raise_for_status()
data = response.json()
assert data["success"] is True
assert len(data["results"]) > 0
assert len(data["results"]) <= max_pages
# Check if results seem biased towards products (harder to assert strictly without knowing exact scores)
product_urls_found = any("product_" in result["url"] for result in data["results"] if result["metadata"]["depth"] > 0)
print(f"Product URLs found among depth > 0 results: {product_urls_found}")
# We expect scoring to prioritize product pages if available within limits
# assert product_urls_found # This might be too strict depending on site structure and limits
for result in data["results"]:
await assert_crawl_result_structure(result)
assert result["success"] is True
assert result["metadata"]["depth"] <= max_depth
# 4. Deep Crawl with CSS Extraction
async def test_deep_crawl_with_css_extraction(self, async_client: httpx.AsyncClient):
"""Test BFS deep crawl combined with JsonCssExtractionStrategy."""
max_depth = 6 # Go deep enough to reach product pages
max_pages = 20
# Schema to extract product details
product_schema = {
"name": "ProductDetails",
"baseSelector": "div.container", # Base for product page
"fields": [
{"name": "product_title", "selector": "h1", "type": "text"},
{"name": "price", "selector": ".product-price", "type": "text"},
{"name": "description", "selector": ".product-description p", "type": "text"},
{"name": "specs", "selector": ".product-specs li", "type": "list", "fields":[
{"name": "spec_name", "selector": ".spec-name", "type": "text"},
{"name": "spec_value", "selector": ".spec-value", "type": "text"}
]}
]
}
payload = {
"urls": [DEEP_CRAWL_BASE_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": False,
"cache_mode": "BYPASS",
"extraction_strategy": { # Apply extraction to ALL crawled pages
"type": "JsonCssExtractionStrategy",
"params": {"schema": {"type": "dict", "value": product_schema}}
},
"deep_crawl_strategy": {
"type": "BFSDeepCrawlStrategy",
"params": {
"max_depth": max_depth,
"max_pages": max_pages,
"filter_chain": { # Only crawl HTML on our domain
"type": "FilterChain",
"params": {
"filters": [
{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}},
{"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}}
]
}
}
# Optional: Add scoring to prioritize product pages for extraction
}
}
}
}
}
response = await async_client.post("/crawl", json=payload)
response.raise_for_status()
data = response.json()
assert data["success"] is True
assert len(data["results"]) > 0
# assert len(data["results"]) <= max_pages
found_extracted_product = False
for result in data["results"]:
await assert_crawl_result_structure(result)
assert result["success"] is True
assert "extracted_content" in result
if "product_" in result["url"]: # Check product pages specifically
assert result["extracted_content"] is not None
try:
extracted = json.loads(result["extracted_content"])
# Schema returns list even if one base match
assert isinstance(extracted, list)
if extracted:
item = extracted[0]
assert "product_title" in item and item["product_title"]
assert "price" in item and item["price"]
# Specs might be empty list if not found
assert "specs" in item and isinstance(item["specs"], list)
found_extracted_product = True
print(f"Extracted product: {item.get('product_title')}")
except (json.JSONDecodeError, AssertionError, IndexError) as e:
pytest.fail(f"Extraction validation failed for {result['url']}: {e}\nContent: {result['extracted_content']}")
# else:
# # Non-product pages might have None or empty list depending on schema match
# assert result["extracted_content"] is None or json.loads(result["extracted_content"]) == []
assert found_extracted_product, "Did not find any pages where product data was successfully extracted."
# 5. Deep Crawl with LLM Extraction (Requires Server LLM Setup)
async def test_deep_crawl_with_llm_extraction(self, async_client: httpx.AsyncClient):
"""Test BFS deep crawl combined with LLMExtractionStrategy."""
max_depth = 1 # Limit depth to keep LLM calls manageable
max_pages = 3
payload = {
"urls": [DEEP_CRAWL_BASE_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": False,
"cache_mode": "BYPASS",
"extraction_strategy": { # Apply LLM extraction to crawled pages
"type": "LLMExtractionStrategy",
"params": {
"instruction": "Extract the main H1 title and the text content of the first paragraph.",
"llm_config": { # Example override, rely on server default if possible
"type": "LLMConfig",
"params": {"provider": "openai/gpt-4.1-mini"} # Use a cheaper model for testing
},
"schema": { # Expected JSON output
"type": "dict",
"value": {
"title": "PageContent", "type": "object",
"properties": {
"h1_title": {"type": "string"},
"first_paragraph": {"type": "string"}
}
}
}
}
},
"deep_crawl_strategy": {
"type": "BFSDeepCrawlStrategy",
"params": {
"max_depth": max_depth,
"max_pages": max_pages,
"filter_chain": {
"type": "FilterChain",
"params": {
"filters": [
{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}},
{"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}}
]
}
}
}
}
}
}
}
try:
response = await async_client.post("/crawl", json=payload)
response.raise_for_status()
data = response.json()
except httpx.HTTPStatusError as e:
pytest.fail(f"Deep Crawl + LLM extraction request failed: {e}. Response: {e.response.text}. Check server logs and LLM API key setup.")
except httpx.RequestError as e:
pytest.fail(f"Deep Crawl + LLM extraction request failed: {e}.")
assert data["success"] is True
assert len(data["results"]) > 0
assert len(data["results"]) <= max_pages
found_llm_extraction = False
for result in data["results"]:
await assert_crawl_result_structure(result)
assert result["success"] is True
assert "extracted_content" in result
assert result["extracted_content"] is not None
try:
extracted = json.loads(result["extracted_content"])
if isinstance(extracted, list): extracted = extracted[0] # Handle list output
assert isinstance(extracted, dict)
assert "h1_title" in extracted # Check keys based on schema
assert "first_paragraph" in extracted
found_llm_extraction = True
print(f"LLM extracted from {result['url']}: Title='{extracted.get('h1_title')}'")
except (json.JSONDecodeError, AssertionError, IndexError, TypeError) as e:
pytest.fail(f"LLM extraction validation failed for {result['url']}: {e}\nContent: {result['extracted_content']}")
assert found_llm_extraction, "LLM extraction did not yield expected data on any crawled page."
    # 6. Deep Crawl with SSL Certificate Fetching
    async def test_deep_crawl_with_ssl(self, async_client: httpx.AsyncClient):
        """Test BFS deep crawl with fetch_ssl_certificate enabled.

        Crawls only the start URL (depth 0, one page) and, when the server
        returns certificate data, checks the serialized certificate dict
        carries the expected keys.
        """
        max_depth = 0 # Only fetch for start URL to keep test fast
        max_pages = 1
        payload = {
            "urls": [DEEP_CRAWL_BASE_URL],
            "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
            "crawler_config": {
                "type": "CrawlerRunConfig",
                "params": {
                    "stream": False,
                    "cache_mode": "BYPASS",
                    "fetch_ssl_certificate": True, # <-- Enable SSL fetching
                    "deep_crawl_strategy": {
                        "type": "BFSDeepCrawlStrategy",
                        "params": {
                            "max_depth": max_depth,
                            "max_pages": max_pages,
                        }
                    }
                }
            }
        }
        response = await async_client.post("/crawl", json=payload)
        response.raise_for_status()
        data = response.json()
        assert data["success"] is True
        assert len(data["results"]) == 1
        result = data["results"][0]
        await assert_crawl_result_structure(result, check_ssl=True) # <-- Tell helper to check SSL field
        assert result["success"] is True
        # Check if SSL info was actually retrieved
        if result["ssl_certificate"]:
            # Assert directly using dictionary keys
            assert isinstance(result["ssl_certificate"], dict) # Verify it's a dict
            assert "issuer" in result["ssl_certificate"]
            assert "subject" in result["ssl_certificate"]
            # The server serializes validity under not_before/not_after keys.
            assert "not_before" in result["ssl_certificate"] # Check for the actual key
            assert "not_after" in result["ssl_certificate"] # Check for the actual key
            assert "fingerprint" in result["ssl_certificate"] # Check another key
            # .get() chains tolerate missing optional sub-keys in the cert dict.
            print(f"SSL Issuer Org: {result['ssl_certificate'].get('issuer', {}).get('O', 'N/A')}")
            print(f"SSL Valid From: {result['ssl_certificate'].get('not_before', 'N/A')}")
        else:
            # A null certificate is not a failure: the server may legitimately omit it.
            print("SSL Certificate was null in the result.")
# 7. Deep Crawl with Proxy Rotation (Requires PROXIES env var)
async def test_deep_crawl_with_proxies(self, async_client: httpx.AsyncClient):
"""Test BFS deep crawl using proxy rotation."""
proxies = load_proxies_from_env()
if not proxies:
pytest.skip("Skipping proxy test: PROXIES environment variable not set or empty.")
print(f"\nTesting with {len(proxies)} proxies loaded from environment.")
max_depth = 1
max_pages = 3
payload = {
"urls": [DEEP_CRAWL_BASE_URL], # Use the dummy site
# Use a BrowserConfig that *might* pick up proxy if set, but rely on CrawlerRunConfig
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": False,
"cache_mode": "BYPASS",
"proxy_rotation_strategy": { # <-- Define the strategy
"type": "RoundRobinProxyStrategy",
"params": {
# Convert ProxyConfig dicts back to the serialized format expected by server
"proxies": [{"type": "ProxyConfig", "params": p} for p in proxies]
}
},
"deep_crawl_strategy": {
"type": "BFSDeepCrawlStrategy",
"params": {
"max_depth": max_depth,
"max_pages": max_pages,
"filter_chain": {
"type": "FilterChain",
"params": { "filters": [{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}]}
}
}
}
}
}
}
try:
response = await async_client.post("/crawl", json=payload)
response.raise_for_status()
data = response.json()
except httpx.HTTPStatusError as e:
# Proxies often cause connection errors, catch them
pytest.fail(f"Proxy deep crawl failed: {e}. Response: {e.response.text}. Are proxies valid and accessible by the server?")
except httpx.RequestError as e:
pytest.fail(f"Proxy deep crawl request failed: {e}. Are proxies valid and accessible?")
assert data["success"] is True
assert len(data["results"]) > 0
assert len(data["results"]) <= max_pages
# Primary assertion is that the crawl succeeded *with* proxy config
print(f"Proxy deep crawl completed successfully for {len(data['results'])} pages.")
# Verifying specific proxy usage requires server logs or custom headers/responses
# --- Main Execution Block (for running script directly) ---
if __name__ == "__main__":
    # Verbose output (-v) with print statements shown (-s), limited to this file.
    args = ["-v", "-s", __file__]
    # Example: Run only proxy test
    # args.append("-k test_deep_crawl_with_proxies")
    print(f"Running pytest with args: {args}")
    rc = pytest.main(args)
    print(f"Pytest finished with exit code: {rc}")

View File

@@ -0,0 +1,655 @@
import pytest
import pytest_asyncio
import httpx
import json
import asyncio
import os
from typing import List, Dict, Any, AsyncGenerator
from dotenv import load_dotenv
load_dotenv()
# Optional: Import crawl4ai classes directly for reference/easier payload creation aid
# You don't strictly NEED these imports for the tests to run against the server,
# but they help in understanding the structure you are mimicking in JSON.
from crawl4ai import (
BrowserConfig,
CrawlerRunConfig,
CacheMode,
DefaultMarkdownGenerator,
PruningContentFilter,
BM25ContentFilter,
BFSDeepCrawlStrategy,
FilterChain,
ContentTypeFilter,
DomainFilter,
CompositeScorer,
KeywordRelevanceScorer,
PathDepthScorer,
JsonCssExtractionStrategy,
LLMExtractionStrategy,
LLMConfig
)
# --- Test Configuration ---
# BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020") # Make base URL configurable
BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235") # Make base URL configurable
# Use a known simple HTML page for basic tests
SIMPLE_HTML_URL = "https://httpbin.org/html"
# Use a site suitable for scraping tests
SCRAPE_TARGET_URL = "http://books.toscrape.com/"
# Use a site with internal links for deep crawl tests
DEEP_CRAWL_URL = "https://python.org"
# --- Pytest Fixtures ---
# Use the built-in event_loop fixture from pytest_asyncio
# The custom implementation was causing issues with closing the loop
@pytest_asyncio.fixture(scope="function") # Changed to function scope to avoid event loop issues
async def async_client() -> AsyncGenerator[httpx.AsyncClient, None]:
    """Provides an async HTTP client bound to BASE_URL.

    Function scope gives each test its own client/event loop pairing; the
    client is closed explicitly after the test yields back.
    """
    client = httpx.AsyncClient(base_url=BASE_URL, timeout=120.0)
    yield client
    await client.aclose()
# --- Helper Functions ---
async def check_server_health(client: httpx.AsyncClient):
    """Check if the server is healthy before running tests.

    GETs /health and aborts the whole test via pytest.fail on any transport
    error or non-2xx status.
    """
    try:
        response = await client.get("/health")
        response.raise_for_status()
        print(f"\nServer healthy: {response.json()}")
        return True
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        # pytrace=False suppresses this helper's traceback in the failure report.
        pytest.fail(f"Server health check failed: {e}. Is the server running at {BASE_URL}?", pytrace=False)
async def assert_crawl_result_structure(result: Dict[str, Any]):
    """Sanity-check that a crawl result dict carries the core fields."""
    assert isinstance(result, dict)
    for field in ("url", "success", "html"):
        assert field in result
    # Extend the tuple above if more common fields become mandatory.
async def process_streaming_response(response: httpx.Response) -> List[Dict[str, Any]]:
    """Processes an NDJSON streaming response.

    Unlike the deep-crawl variant, every JSON line before the
    {"status": "completed"} marker is collected as a result. Fails the test
    if a line is not valid JSON or the marker never arrives.
    """
    results = []
    completed = False
    async for line in response.aiter_lines():
        if line:  # Skip blank keep-alive lines.
            try:
                data = json.loads(line)
                if data.get("status") == "completed":
                    completed = True
                    break # Stop processing after completion marker
                else:
                    results.append(data)
            except json.JSONDecodeError:
                pytest.fail(f"Failed to decode JSON line: {line}")
    assert completed, "Streaming response did not end with a completion marker."
    return results
# --- Test Class ---
@pytest.mark.asyncio
class TestCrawlEndpoints:
    @pytest_asyncio.fixture(autouse=True)
    async def check_health_before_tests(self, async_client: httpx.AsyncClient):
        """Fixture to ensure server is healthy before each test in the class.

        autouse=True makes this run before every test method without being
        requested explicitly; check_server_health aborts on failure.
        """
        await check_server_health(async_client)
# 1. Simple Requests (Primitives)
async def test_simple_crawl_single_url(self, async_client: httpx.AsyncClient):
"""Test /crawl with a single URL and simple config values."""
payload = {
"urls": [SIMPLE_HTML_URL],
"browser_config": {
"type": "BrowserConfig",
"params": {
"headless": True,
}
},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": False, # Explicitly false for /crawl
"screenshot": False,
"cache_mode": CacheMode.BYPASS.value # Use enum value
}
}
}
try:
response = await async_client.post("/crawl", json=payload)
print(f"Response status: {response.status_code}")
response.raise_for_status()
data = response.json()
except httpx.HTTPStatusError as e:
print(f"Server error: {e}")
print(f"Response content: {e.response.text}")
raise
assert data["success"] is True
assert isinstance(data["results"], list)
assert len(data["results"]) == 1
result = data["results"][0]
await assert_crawl_result_structure(result)
assert result["success"] is True
assert result["url"] == SIMPLE_HTML_URL
assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"]
# We don't specify a markdown generator in this test, so don't make assumptions about markdown field
# It might be null, missing, or populated depending on the server's default behavior
async def test_simple_crawl_single_url_streaming(self, async_client: httpx.AsyncClient):
"""Test /crawl/stream with a single URL and simple config values."""
payload = {
"urls": [SIMPLE_HTML_URL],
"browser_config": {
"type": "BrowserConfig",
"params": {
"headless": True,
}
},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"stream": True, # Must be true for /crawl/stream
"screenshot": False,
"cache_mode": CacheMode.BYPASS.value
}
}
}
async with async_client.stream("POST", "/crawl/stream", json=payload) as response:
response.raise_for_status()
results = await process_streaming_response(response)
assert len(results) == 1
result = results[0]
await assert_crawl_result_structure(result)
assert result["success"] is True
assert result["url"] == SIMPLE_HTML_URL
assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"]
# 2. Multi-URL and Dispatcher
async def test_multi_url_crawl(self, async_client: httpx.AsyncClient):
"""Test /crawl with multiple URLs, implicitly testing dispatcher."""
urls = [SIMPLE_HTML_URL, "https://httpbin.org/links/10/0"]
payload = {
"urls": urls,
"browser_config": {
"type": "BrowserConfig",
"params": {"headless": True}
},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {"stream": False, "cache_mode": CacheMode.BYPASS.value}
}
}
try:
print(f"Sending deep crawl request to server...")
response = await async_client.post("/crawl", json=payload)
print(f"Response status: {response.status_code}")
if response.status_code >= 400:
error_detail = response.json().get('detail', 'No detail provided')
print(f"Error detail: {error_detail}")
print(f"Full response: {response.text}")
response.raise_for_status()
data = response.json()
except httpx.HTTPStatusError as e:
print(f"Server error status: {e.response.status_code}")
print(f"Server error response: {e.response.text}")
try:
error_json = e.response.json()
print(f"Parsed error: {error_json}")
except:
print("Could not parse error response as JSON")
raise
assert data["success"] is True
assert isinstance(data["results"], list)
assert len(data["results"]) == len(urls)
for result in data["results"]:
await assert_crawl_result_structure(result)
assert result["success"] is True
assert result["url"] in urls
async def test_multi_url_crawl_streaming(self, async_client: httpx.AsyncClient):
"""Test /crawl/stream with multiple URLs."""
urls = [SIMPLE_HTML_URL, "https://httpbin.org/links/10/0"]
payload = {
"urls": urls,
"browser_config": {
"type": "BrowserConfig",
"params": {"headless": True}
},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {"stream": True, "cache_mode": CacheMode.BYPASS.value}
}
}
async with async_client.stream("POST", "/crawl/stream", json=payload) as response:
response.raise_for_status()
results = await process_streaming_response(response)
assert len(results) == len(urls)
processed_urls = set()
for result in results:
await assert_crawl_result_structure(result)
assert result["success"] is True
assert result["url"] in urls
processed_urls.add(result["url"])
assert processed_urls == set(urls) # Ensure all URLs were processed
# 3. Class Values and Nested Classes (Markdown Generator)
async def test_crawl_with_markdown_pruning_filter(self, async_client: httpx.AsyncClient):
    """Test /crawl with MarkdownGenerator using PruningContentFilter.

    Sends a crawler config whose markdown generator wraps a
    PruningContentFilter and checks that both ``raw_markdown`` and the
    pruned ``fit_markdown`` come back, with the fit version no longer
    than the raw version.
    """
    payload = {
        "urls": [SIMPLE_HTML_URL],
        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {
                "cache_mode": CacheMode.ENABLED.value,  # Test different cache mode
                "markdown_generator": {
                    "type": "DefaultMarkdownGenerator",
                    "params": {
                        "content_filter": {
                            "type": "PruningContentFilter",
                            "params": {
                                "threshold": 0.5,  # Example param
                                "threshold_type": "relative"
                            }
                        }
                    }
                }
            }
        }
    }
    try:
        # Fixed: message previously said "deep crawl" (copy/paste from test_deep_crawl).
        print("Sending markdown pruning filter request to server...")
        response = await async_client.post("/crawl", json=payload)
        print(f"Response status: {response.status_code}")
        if response.status_code >= 400:
            error_detail = response.json().get('detail', 'No detail provided')
            print(f"Error detail: {error_detail}")
            print(f"Full response: {response.text}")
        response.raise_for_status()
        data = response.json()
    except httpx.HTTPStatusError as e:
        print(f"Server error status: {e.response.status_code}")
        print(f"Server error response: {e.response.text}")
        try:
            error_json = e.response.json()
            print(f"Parsed error: {error_json}")
        except ValueError:  # was a bare except; only JSON decoding can fail here
            print("Could not parse error response as JSON")
        raise
    assert data["success"] is True
    assert len(data["results"]) == 1
    result = data["results"][0]
    await assert_crawl_result_structure(result)
    assert result["success"] is True
    assert "markdown" in result
    assert isinstance(result["markdown"], dict)
    assert "raw_markdown" in result["markdown"]
    assert "fit_markdown" in result["markdown"]  # Pruning creates fit_markdown
    assert "Moby-Dick" in result["markdown"]["raw_markdown"]
    # Fit markdown content might be different/shorter due to pruning
    assert len(result["markdown"]["fit_markdown"]) <= len(result["markdown"]["raw_markdown"])
async def test_crawl_with_markdown_bm25_filter(self, async_client: httpx.AsyncClient):
    """Test /crawl with MarkdownGenerator using BM25ContentFilter.

    Queries for "Herman Melville" with a low BM25 threshold and verifies
    the response contains both raw and fit markdown. No assertion is made
    on fit content itself since BM25 matches are environment-dependent.
    """
    payload = {
        "urls": [SIMPLE_HTML_URL],
        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {
                "markdown_generator": {
                    "type": "DefaultMarkdownGenerator",
                    "params": {
                        "content_filter": {
                            "type": "BM25ContentFilter",
                            "params": {
                                "user_query": "Herman Melville",  # Query for BM25
                                "bm25_threshold": 0.1,  # Lower threshold to increase matches
                                "language": "english"  # Valid parameters
                            }
                        }
                    }
                }
            }
        }
    }
    try:
        print(f"Payload for BM25 test: {json.dumps(payload)}")
        response = await async_client.post("/crawl", json=payload)
        print(f"Response status: {response.status_code}")
        if response.status_code >= 400:
            error_detail = response.json().get('detail', 'No detail provided')
            print(f"Error detail: {error_detail}")
            print(f"Full response: {response.text}")
        response.raise_for_status()
        data = response.json()
    except httpx.HTTPStatusError as e:
        print(f"Server error status: {e.response.status_code}")
        print(f"Server error response: {e.response.text}")
        try:
            error_json = e.response.json()
            print(f"Parsed error: {error_json}")
        except ValueError:  # was a bare except; only JSON decoding can fail here
            print("Could not parse error response as JSON")
        raise
    assert data["success"] is True
    assert len(data["results"]) == 1
    result = data["results"][0]
    await assert_crawl_result_structure(result)
    assert result["success"] is True
    assert "markdown" in result
    assert isinstance(result["markdown"], dict)
    assert "raw_markdown" in result["markdown"]
    assert "fit_markdown" in result["markdown"]  # BM25 creates fit_markdown
    # Print values for debug
    print(f"Raw markdown length: {len(result['markdown']['raw_markdown'])}")
    print(f"Fit markdown length: {len(result['markdown']['fit_markdown'])}")
    # Either fit_markdown has content (possibly including our query terms)
    # or it might be empty if no good BM25 matches were found.
    # Don't assert specific content since it can be environment-dependent.
# 4. Deep Crawling
async def test_deep_crawl(self, async_client: httpx.AsyncClient):
    """Test /crawl with a deep crawl strategy.

    Uses a BFS deep crawl (depth 1, max 5 pages) over python.org with a
    content-type + domain filter chain and a composite URL scorer, then
    verifies the start URL and at least one discovered URL were crawled.
    """
    payload = {
        "urls": [DEEP_CRAWL_URL],  # Start URL
        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {
                "stream": False,
                "cache_mode": CacheMode.BYPASS.value,
                "deep_crawl_strategy": {
                    "type": "BFSDeepCrawlStrategy",
                    "params": {
                        "max_depth": 1,  # Limit depth for testing speed
                        "max_pages": 5,  # Limit pages to crawl
                        "filter_chain": {
                            "type": "FilterChain",
                            "params": {
                                "filters": [
                                    {
                                        "type": "ContentTypeFilter",
                                        "params": {"allowed_types": ["text/html"]}
                                    },
                                    {
                                        "type": "DomainFilter",
                                        "params": {"allowed_domains": ["python.org", "docs.python.org"]}  # Include important subdomains
                                    }
                                ]
                            }
                        },
                        "url_scorer": {
                            "type": "CompositeScorer",
                            "params": {
                                "scorers": [
                                    {
                                        "type": "KeywordRelevanceScorer",
                                        "params": {"keywords": ["documentation", "tutorial"]}
                                    },
                                    {
                                        "type": "PathDepthScorer",
                                        "params": {"weight": 0.5, "optimal_depth": 2}
                                    }
                                ]
                            }
                        }
                    }
                }
            }
        }
    }
    try:
        print("Sending deep crawl request to server...")
        response = await async_client.post("/crawl", json=payload)
        print(f"Response status: {response.status_code}")
        if response.status_code >= 400:
            error_detail = response.json().get('detail', 'No detail provided')
            print(f"Error detail: {error_detail}")
            print(f"Full response: {response.text}")
        response.raise_for_status()
        data = response.json()
    except httpx.HTTPStatusError as e:
        print(f"Server error status: {e.response.status_code}")
        print(f"Server error response: {e.response.text}")
        try:
            error_json = e.response.json()
            print(f"Parsed error: {error_json}")
        except ValueError:  # was a bare except; only JSON decoding can fail here
            print("Could not parse error response as JSON")
        raise
    assert data["success"] is True
    assert isinstance(data["results"], list)
    # Expect more than 1 result due to deep crawl (start URL + crawled links)
    assert len(data["results"]) > 1
    assert len(data["results"]) <= 6  # Start URL + max_pages=5
    start_url_found = False
    crawled_urls_found = False
    for result in data["results"]:
        await assert_crawl_result_structure(result)
        assert result["success"] is True
        # Print URL for debugging
        print(f"Crawled URL: {result['url']}")
        # Allow URLs that contain python.org (including subdomains like docs.python.org)
        assert "python.org" in result["url"]
        if result["url"] == DEEP_CRAWL_URL:
            start_url_found = True
        else:
            crawled_urls_found = True
    assert start_url_found
    assert crawled_urls_found
# 5. Extraction without LLM (JSON/CSS)
async def test_json_css_extraction(self, async_client: httpx.AsyncClient):
    """Test /crawl with JsonCssExtractionStrategy.

    Scrapes a book-list page with a CSS-selector schema and verifies the
    extracted JSON is a non-empty list of {title, price, rating} records.
    """
    payload = {
        "urls": [SCRAPE_TARGET_URL],
        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {
                "cache_mode": CacheMode.BYPASS.value,
                "extraction_strategy": {
                    "type": "JsonCssExtractionStrategy",
                    "params": {
                        "schema": {
                            "type": "dict",  # IMPORTANT: Wrap schema dict with type/value structure
                            "value": {
                                "name": "BookList",
                                "baseSelector": "ol.row li.col-xs-6",  # Select each book item
                                "fields": [
                                    {"name": "title", "selector": "article.product_pod h3 a", "type": "attribute", "attribute": "title"},
                                    {"name": "price", "selector": "article.product_pod .price_color", "type": "text"},
                                    {"name": "rating", "selector": "article.product_pod p.star-rating", "type": "attribute", "attribute": "class"}
                                ]
                            }
                        }
                    }
                }
            }
        }
    }
    try:
        # Fixed: message previously said "deep crawl" (copy/paste from test_deep_crawl).
        print("Sending JSON/CSS extraction request to server...")
        response = await async_client.post("/crawl", json=payload)
        print(f"Response status: {response.status_code}")
        if response.status_code >= 400:
            error_detail = response.json().get('detail', 'No detail provided')
            print(f"Error detail: {error_detail}")
            print(f"Full response: {response.text}")
        response.raise_for_status()
        data = response.json()
    except httpx.HTTPStatusError as e:
        print(f"Server error status: {e.response.status_code}")
        print(f"Server error response: {e.response.text}")
        try:
            error_json = e.response.json()
            print(f"Parsed error: {error_json}")
        except ValueError:  # was a bare except; only JSON decoding can fail here
            print("Could not parse error response as JSON")
        raise
    assert data["success"] is True
    assert len(data["results"]) == 1
    result = data["results"][0]
    await assert_crawl_result_structure(result)
    assert result["success"] is True
    assert "extracted_content" in result
    assert result["extracted_content"] is not None
    # Extracted content should be a JSON string representing a list of dicts
    try:
        extracted_data = json.loads(result["extracted_content"])
        assert isinstance(extracted_data, list)
        assert len(extracted_data) > 0  # Should find some books
        # Check structure of the first extracted item
        first_item = extracted_data[0]
        assert "title" in first_item
        assert "price" in first_item
        assert "rating" in first_item
        assert "star-rating" in first_item["rating"]  # e.g., "star-rating Three"
    except (json.JSONDecodeError, AssertionError) as e:
        pytest.fail(f"Extracted content parsing or validation failed: {e}\nContent: {result['extracted_content']}")
# 6. Extraction with LLM
async def test_llm_extraction(self, async_client: httpx.AsyncClient):
    """
    Test /crawl with LLMExtractionStrategy.
    NOTE: Requires the server to have appropriate LLM API keys (e.g., OPENAI_API_KEY)
    configured via .llm.env or environment variables.
    This test uses the default provider configured in the server's config.yml.
    """
    payload = {
        "urls": [SIMPLE_HTML_URL],
        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {
                "cache_mode": CacheMode.BYPASS.value,
                "extraction_strategy": {
                    "type": "LLMExtractionStrategy",
                    "params": {
                        "instruction": "Extract the main title and the author mentioned in the text into JSON.",
                        # LLMConfig is implicitly defined by server's config.yml and .llm.env
                        # If you needed to override provider/token PER REQUEST:
                        "llm_config": {
                            "type": "LLMConfig",
                            "params": {
                                "provider": "openai/gpt-4o",  # Example override
                                "api_token": os.getenv("OPENAI_API_KEY")  # Example override
                            }
                        },
                        "schema": {  # Optional: Provide a schema for structured output
                            "type": "dict",  # IMPORTANT: Wrap schema dict
                            "value": {
                                "title": "Book Info",
                                "type": "object",
                                "properties": {
                                    "title": {"type": "string", "description": "The main title of the work"},
                                    "author": {"type": "string", "description": "The author of the work"}
                                },
                                "required": ["title", "author"]
                            }
                        }
                    }
                }
            }
        }
    }
    try:
        response = await async_client.post("/crawl", json=payload)
        response.raise_for_status()  # Will raise if server returns 500 (e.g., bad API key)
        data = response.json()
    except httpx.HTTPStatusError as e:
        # Catch potential server errors (like 500 due to missing/invalid API keys)
        pytest.fail(f"LLM extraction request failed: {e}. Response: {e.response.text}. Check server logs and ensure API keys are correctly configured for the server.")
    except httpx.RequestError as e:
        pytest.fail(f"LLM extraction request failed: {e}.")
    assert data["success"] is True
    assert len(data["results"]) == 1
    result = data["results"][0]
    await assert_crawl_result_structure(result)
    assert result["success"] is True
    assert "extracted_content" in result
    assert result["extracted_content"] is not None
    # Extracted content should be JSON (because we provided a schema)
    try:
        extracted_data = json.loads(result["extracted_content"])
        print(f"\nLLM Extracted Data: {extracted_data}")  # Print for verification
        # The server may return either a single object or a list of them;
        # normalize to a single record before validating the fields.
        if isinstance(extracted_data, list):
            assert len(extracted_data) > 0
            record = extracted_data[0]  # Take first item
        else:
            record = extracted_data
        assert isinstance(record, dict)
        assert "title" in record
        assert "author" in record
        assert "Moby-Dick" in record.get("title", "")
        assert "Herman Melville" in record.get("author", "")
    except (json.JSONDecodeError, AssertionError) as e:
        pytest.fail(f"LLM extracted content parsing or validation failed: {e}\nContent: {result['extracted_content']}")
    except Exception as e:  # Catch any other unexpected error
        pytest.fail(f"An unexpected error occurred during LLM result processing: {e}\nContent: {result['extracted_content']}")
if __name__ == "__main__":
    # Run the tests in this file directly:
    #   -v  verbose output
    #   -s  show print statements immediately (useful for debugging)
    pytest_args = ["-v", "-s", __file__]
    # Additional pytest arguments can be appended here if needed, e.g.
    # pytest_args.append("-k test_llm_extraction") to run a single test.
    print(f"Running pytest with args: {pytest_args}")
    exit_code = pytest.main(pytest_args)
    print(f"Pytest finished with exit code: {exit_code}")

View File

@@ -0,0 +1,335 @@
# ==== File: build_dummy_site.py ====
import os
import random
import argparse
from pathlib import Path
from urllib.parse import quote
# --- Configuration ---
NUM_CATEGORIES = 3  # Top-level (L1) category pages linked from the homepage
NUM_SUBCATEGORIES_PER_CAT = 2 # Results in NUM_CATEGORIES * NUM_SUBCATEGORIES_PER_CAT total L2 categories
NUM_PRODUCTS_PER_SUBCAT = 5 # Products listed on L3 pages
MAX_DEPTH_TARGET = 5 # Explicitly set target depth  # NOTE(review): not referenced by the generator code visible here — confirm it is used elsewhere
# --- Helper Functions ---
def generate_lorem(words=20):
    """Return *words* randomly chosen lorem-ipsum words as one capitalized, period-terminated string."""
    vocab = ["lorem", "ipsum", "dolor", "sit", "amet", "consectetur",
             "adipiscing", "elit", "sed", "do", "eiusmod", "tempor",
             "incididunt", "ut", "labore", "et", "dolore", "magna", "aliqua"]
    picked = [random.choice(vocab) for _ in range(words)]
    return " ".join(picked).capitalize() + "."
def create_html_page(filepath: Path, title: str, body_content: str, breadcrumbs: list = None, head_extras: str = ""):
    """Create an HTML file with basic structure and inline CSS.

    Args:
        filepath: Destination path; missing parent directories are created.
        title: Page heading, also used in the ``<title>`` tag ("<title> - FakeShop").
        body_content: Raw HTML injected below the ``<h1>``.
        breadcrumbs: Optional list of ``{"name": ..., "link": ...}`` dicts
            rendered as a breadcrumb trail above the title; the current page
            title is appended as the final (unlinked) crumb.
        head_extras: Extra raw HTML inserted into ``<head>``.
    """
    # Default to a fresh list per call (was a mutable default argument).
    if breadcrumbs is None:
        breadcrumbs = []
    os.makedirs(filepath.parent, exist_ok=True)

    # Generate breadcrumb HTML using the 'link' provided in the breadcrumbs list
    breadcrumb_html = ""
    if breadcrumbs:
        links_html = " » ".join(f'<a href="{bc["link"]}">{bc["name"]}</a>' for bc in breadcrumbs)
        breadcrumb_html = f"<nav class='breadcrumbs'>{links_html} » {title}</nav>"

    # Basic CSS for structure identification (kept the same)
    css = """
    <style>
        body {
            font-family: sans-serif;
            padding: 20px;
            background-color: #1e1e1e;
            color: #d1d1d1;
        }
        .container {
            max-width: 960px;
            margin: auto;
            background: #2c2c2c;
            padding: 20px;
            border-radius: 5px;
            box-shadow: 0 2px 5px rgba(0, 0, 0, 0.5);
        }
        h1, h2 {
            color: #ccc;
        }
        a {
            color: #9bcdff;
            text-decoration: none;
        }
        a:hover {
            text-decoration: underline;
        }
        ul {
            list-style: none;
            padding-left: 0;
        }
        li {
            margin-bottom: 10px;
        }
        .category-link,
        .subcategory-link,
        .product-link,
        .details-link,
        .reviews-link {
            display: block;
            padding: 8px;
            background-color: #3a3a3a;
            border-radius: 3px;
        }
        .product-preview {
            border: 1px solid #444;
            padding: 10px;
            margin-bottom: 10px;
            border-radius: 4px;
            background-color: #2a2a2a;
        }
        .product-title {
            color: #d1d1d1;
        }
        .product-price {
            font-weight: bold;
            color: #85e085;
        }
        .product-description,
        .product-specs,
        .product-reviews {
            margin-top: 15px;
            line-height: 1.6;
        }
        .product-specs li {
            margin-bottom: 5px;
            font-size: 0.9em;
        }
        .spec-name {
            font-weight: bold;
        }
        .breadcrumbs {
            margin-bottom: 20px;
            font-size: 0.9em;
            color: #888;
        }
        .breadcrumbs a {
            color: #9bcdff;
        }
    </style>
    """
    html_content = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{title} - FakeShop</title>
    {head_extras}
    {css}
</head>
<body>
    <div class="container">
        {breadcrumb_html}
        <h1>{title}</h1>
        {body_content}
    </div>
</body>
</html>"""
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(html_content)
    # Keep print statement concise for clarity
    # print(f"Created: {filepath}")
def generate_site(base_dir: Path, site_name: str = "FakeShop", base_path: str = ""):
    """Generate the dummy website structure under *base_dir*.

    Pages are nested five levels deep: homepage → category → sub-category
    (product list) → product → details/reviews. In-page links are relative;
    breadcrumb links are absolute, prefixed with *base_path*.

    Args:
        base_dir: Directory that receives the generated HTML tree (created if missing).
        site_name: Shop name. NOTE(review): currently unused inside this function —
            page titles hard-code "FakeShop"; confirm whether it should be threaded through.
        base_path: URL prefix for hosting under a sub-path (e.g. "samples/deepcrawl");
            empty means the site is served from the web root.
    """
    base_dir.mkdir(parents=True, exist_ok=True)

    # --- Clean and prepare the base path for URL construction ---
    # Ensure it starts with '/' if not empty, and remove any trailing '/'
    if base_path:
        full_base_path = "/" + base_path.strip('/')
    else:
        full_base_path = ""  # Represents the root
    print(f"Using base path for links: '{full_base_path}'")

    # --- Level 0: Homepage ---
    home_body = "<h2>Welcome to FakeShop!</h2><p>Your one-stop shop for imaginary items.</p><h3>Categories:</h3>\n<ul>"
    # Define the *actual* link path for the homepage breadcrumb
    home_link_path = f"{full_base_path}/index.html"
    breadcrumbs_home = [{"name": "Home", "link": home_link_path}]  # Base breadcrumb
    # Links *within* the page content should remain relative
    for i in range(NUM_CATEGORIES):
        cat_name = f"Category-{i+1}"
        cat_folder_name = quote(cat_name.lower().replace(" ", "-"))
        # This path is relative to the current directory (index.html)
        cat_relative_page_path = f"{cat_folder_name}/index.html"
        home_body += f'<li><a class="category-link" href="{cat_relative_page_path}">{cat_name}</a> - {generate_lorem(10)}</li>'
    home_body += "</ul>"
    create_html_page(base_dir / "index.html", "Homepage", home_body, [])  # No breadcrumbs *on* the homepage itself

    # --- Levels 1-5 ---
    for i in range(NUM_CATEGORIES):
        cat_name = f"Category-{i+1}"
        cat_folder_name = quote(cat_name.lower().replace(" ", "-"))
        cat_dir = base_dir / cat_folder_name
        # This is the *absolute* path for the breadcrumb link
        cat_link_path = f"{full_base_path}/{cat_folder_name}/index.html"
        # Update breadcrumbs list for this level
        breadcrumbs_cat = breadcrumbs_home + [{"name": cat_name, "link": cat_link_path}]

        # --- Level 1: Category Page ---
        cat_body = f"<p>{generate_lorem(15)} for {cat_name}.</p><h3>Sub-Categories:</h3>\n<ul>"
        for j in range(NUM_SUBCATEGORIES_PER_CAT):
            subcat_name = f"{cat_name}-Sub-{j+1}"
            subcat_folder_name = quote(subcat_name.lower().replace(" ", "-"))
            # Path relative to the category page
            subcat_relative_page_path = f"{subcat_folder_name}/index.html"
            cat_body += f'<li><a class="subcategory-link" href="{subcat_relative_page_path}">{subcat_name}</a> - {generate_lorem(8)}</li>'
        cat_body += "</ul>"
        create_html_page(cat_dir / "index.html", cat_name, cat_body, breadcrumbs_home)  # Parent breadcrumb needed here

        for j in range(NUM_SUBCATEGORIES_PER_CAT):
            subcat_name = f"{cat_name}-Sub-{j+1}"
            subcat_folder_name = quote(subcat_name.lower().replace(" ", "-"))
            subcat_dir = cat_dir / subcat_folder_name
            # Absolute path for the breadcrumb link
            subcat_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/index.html"
            # Update breadcrumbs list for this level
            breadcrumbs_subcat = breadcrumbs_cat + [{"name": subcat_name, "link": subcat_link_path}]

            # --- Level 2: Sub-Category Page (Product List) ---
            subcat_body = f"<p>Explore products in {subcat_name}. {generate_lorem(12)}</p><h3>Products:</h3>\n<ul class='product-list'>"
            for k in range(NUM_PRODUCTS_PER_SUBCAT):
                prod_id = f"P{i+1}{j+1}{k+1:03d}"  # e.g., P11001
                prod_name = f"{subcat_name} Product {k+1} ({prod_id})"
                # Filename relative to the subcategory page
                prod_filename = f"product_{prod_id}.html"
                # Absolute path for the breadcrumb link
                prod_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/{prod_filename}"
                # Preview on list page (link remains relative).
                # BUGFIX: the price span's opening tag was missing its closing '>'
                # (was `<span class="product-price"{...}`), producing malformed HTML;
                # also prefix '£' to match the product page's price formatting.
                # NOTE(review): the preview price is an independent random draw and
                # will not match prod_price below — confirm whether that is intended.
                subcat_body += f"""
                <li>
                    <div class="product-preview">
                        <a class="product-link" href="{prod_filename}"><strong>{prod_name}</strong></a>
                        <p>{generate_lorem(10)}</p>
                        <span class="product-price">£{random.uniform(10, 500):.2f}</span>
                    </div>
                </li>"""

                # --- Level 3: Product Page ---
                prod_price = random.uniform(10, 500)
                prod_desc = generate_lorem(40)
                prod_specs = {f"Spec {s+1}": generate_lorem(3) for s in range(random.randint(3,6))}
                prod_reviews_count = random.randint(0, 150)
                # Relative filenames for links on this page
                details_filename_relative = f"product_{prod_id}_details.html"
                reviews_filename_relative = f"product_{prod_id}_reviews.html"
                prod_body = f"""
                <p class="product-price">Price: £{prod_price:.2f}</p>
                <div class="product-description">
                    <h2>Description</h2>
                    <p>{prod_desc}</p>
                </div>
                <div class="product-specs">
                    <h2>Specifications</h2>
                    <ul>
                        {''.join(f'<li><span class="spec-name">{name}</span>: <span class="spec-value">{value}</span></li>' for name, value in prod_specs.items())}
                    </ul>
                </div>
                <div class="product-reviews">
                    <h2>Reviews</h2>
                    <p>Total Reviews: <span class="review-count">{prod_reviews_count}</span></p>
                </div>
                <hr>
                <p>
                    <a class="details-link" href="{details_filename_relative}">View More Details</a> |
                    <a class="reviews-link" href="{reviews_filename_relative}">See All Reviews</a>
                </p>
                """
                # Update breadcrumbs list for this level
                breadcrumbs_prod = breadcrumbs_subcat + [{"name": prod_name, "link": prod_link_path}]
                create_html_page(subcat_dir / prod_filename, prod_name, prod_body, breadcrumbs_subcat)  # Parent breadcrumb needed here

                # --- Level 4: Product Details Page ---
                details_filename = f"product_{prod_id}_details.html"  # Actual filename
                # Absolute path for the breadcrumb link
                details_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/{details_filename}"
                details_body = f"<p>This page contains extremely detailed information about {prod_name}.</p>{generate_lorem(100)}"
                # Update breadcrumbs list for this level
                breadcrumbs_details = breadcrumbs_prod + [{"name": "Details", "link": details_link_path}]
                create_html_page(subcat_dir / details_filename, f"{prod_name} - Details", details_body, breadcrumbs_prod)  # Parent breadcrumb needed here

                # --- Level 5: Product Reviews Page ---
                reviews_filename = f"product_{prod_id}_reviews.html"  # Actual filename
                # Absolute path for the breadcrumb link
                reviews_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/{reviews_filename}"
                reviews_body = f"<p>All {prod_reviews_count} reviews for {prod_name} are listed here.</p><ul>"
                for r in range(prod_reviews_count):
                    reviews_body += f"<li>Review {r+1}: {generate_lorem(random.randint(15, 50))}</li>"
                reviews_body += "</ul>"
                # Update breadcrumbs list for this level
                breadcrumbs_reviews = breadcrumbs_prod + [{"name": "Reviews", "link": reviews_link_path}]
                create_html_page(subcat_dir / reviews_filename, f"{prod_name} - Reviews", reviews_body, breadcrumbs_prod)  # Parent breadcrumb needed here

            subcat_body += "</ul>"  # Close product-list ul
            # Pass the correct breadcrumbs list for the subcategory index page
            create_html_page(subcat_dir / "index.html", subcat_name, subcat_body, breadcrumbs_cat)  # Parent breadcrumb needed here
# --- Main Execution ---
if __name__ == "__main__":
    # Command-line entry point: configure output location, shop name and
    # optional hosting sub-path, then build the site and report the result.
    parser = argparse.ArgumentParser(description="Generate a dummy multi-level retail website.")
    parser.add_argument(
        "-o", "--output-dir",
        type=str,
        default="dummy_retail_site",
        help="Directory to generate the website in."
    )
    parser.add_argument(
        "-n", "--site-name",
        type=str,
        default="FakeShop",
        help="Name of the fake shop."
    )
    parser.add_argument(
        "-b", "--base-path",
        type=str,
        default="",
        help="Base path for hosting the site (e.g., 'samples/deepcrawl'). Leave empty if hosted at the root."
    )
    cli = parser.parse_args()

    out_dir = Path(cli.output_dir)
    print(f"Generating dummy site '{cli.site_name}' in '{out_dir}'...")
    # Pass the base_path to the generation function
    generate_site(out_dir, cli.site_name, cli.base_path)

    page_count = sum(1 for _ in out_dir.rglob('*.html'))
    print(f"\nCreated {page_count} HTML pages.")
    print("Dummy site generation complete.")
    print(f"To serve locally (example): python -m http.server --directory {out_dir} 8000")
    if cli.base_path:
        print(f"Access the site at: http://localhost:8000/{cli.base_path.strip('/')}/index.html")
    else:
        print("Access the site at: http://localhost:8000/index.html")