Merge branch 'next' into 2025-MAR-ALPHA-1
This commit is contained in:
47
Dockerfile
47
Dockerfile
@@ -24,7 +24,7 @@ ARG TARGETARCH
|
||||
|
||||
LABEL maintainer="unclecode"
|
||||
LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
|
||||
LABEL version="1.0"
|
||||
LABEL version="1.0"
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
@@ -38,6 +38,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
libjpeg-dev \
|
||||
redis-server \
|
||||
supervisor \
|
||||
&& apt-get clean \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
@@ -62,11 +63,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
libcairo2 \
|
||||
libasound2 \
|
||||
libatspi2.0-0 \
|
||||
&& apt-get clean \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETARCH" = "amd64" ] ; then \
|
||||
apt-get update && apt-get install -y --no-install-recommends \
|
||||
nvidia-cuda-toolkit \
|
||||
&& apt-get clean \
|
||||
&& rm -rf /var/lib/apt/lists/* ; \
|
||||
else \
|
||||
echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \
|
||||
@@ -76,16 +79,24 @@ RUN if [ "$TARGETARCH" = "arm64" ]; then \
|
||||
echo "🦾 Installing ARM-specific optimizations"; \
|
||||
apt-get update && apt-get install -y --no-install-recommends \
|
||||
libopenblas-dev \
|
||||
&& apt-get clean \
|
||||
&& rm -rf /var/lib/apt/lists/*; \
|
||||
elif [ "$TARGETARCH" = "amd64" ]; then \
|
||||
echo "🖥️ Installing AMD64-specific optimizations"; \
|
||||
apt-get update && apt-get install -y --no-install-recommends \
|
||||
libomp-dev \
|
||||
&& apt-get clean \
|
||||
&& rm -rf /var/lib/apt/lists/*; \
|
||||
else \
|
||||
echo "Skipping platform-specific optimizations (unsupported platform)"; \
|
||||
fi
|
||||
|
||||
# Create a non-root user and group
|
||||
RUN groupadd -r appuser && useradd --no-log-init -r -g appuser appuser
|
||||
|
||||
# Create and set permissions for appuser home directory
|
||||
RUN mkdir -p /home/appuser && chown -R appuser:appuser /home/appuser
|
||||
|
||||
WORKDIR ${APP_HOME}
|
||||
|
||||
RUN echo '#!/bin/bash\n\
|
||||
@@ -103,6 +114,7 @@ fi' > /tmp/install.sh && chmod +x /tmp/install.sh
|
||||
|
||||
COPY . /tmp/project/
|
||||
|
||||
# Copy supervisor config first (might need root later, but okay for now)
|
||||
COPY deploy/docker/supervisord.conf .
|
||||
|
||||
COPY deploy/docker/requirements.txt .
|
||||
@@ -131,16 +143,31 @@ RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
|
||||
else \
|
||||
pip install "/tmp/project" ; \
|
||||
fi
|
||||
|
||||
|
||||
RUN pip install --no-cache-dir --upgrade pip && \
|
||||
/tmp/install.sh && \
|
||||
python -c "import crawl4ai; print('✅ crawl4ai is ready to rock!')" && \
|
||||
python -c "from playwright.sync_api import sync_playwright; print('✅ Playwright is feeling dramatic!')"
|
||||
|
||||
RUN playwright install --with-deps chromium
|
||||
|
||||
RUN crawl4ai-setup
|
||||
|
||||
RUN playwright install --with-deps
|
||||
|
||||
RUN mkdir -p /home/appuser/.cache/ms-playwright \
|
||||
&& cp -r /root/.cache/ms-playwright/chromium-* /home/appuser/.cache/ms-playwright/ \
|
||||
&& chown -R appuser:appuser /home/appuser/.cache/ms-playwright
|
||||
|
||||
RUN crawl4ai-doctor
|
||||
|
||||
# Copy application code
|
||||
COPY deploy/docker/* ${APP_HOME}/
|
||||
|
||||
# Change ownership of the application directory to the non-root user
|
||||
RUN chown -R appuser:appuser ${APP_HOME}
|
||||
|
||||
# give permissions to redis persistence dirs if used
|
||||
RUN mkdir -p /var/lib/redis /var/log/redis && chown -R appuser:appuser /var/lib/redis /var/log/redis
|
||||
|
||||
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
||||
CMD bash -c '\
|
||||
MEM=$(free -m | awk "/^Mem:/{print \$2}"); \
|
||||
@@ -149,8 +176,14 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
||||
exit 1; \
|
||||
fi && \
|
||||
redis-cli ping > /dev/null && \
|
||||
curl -f http://localhost:8000/health || exit 1'
|
||||
curl -f http://localhost:11235/health || exit 1'
|
||||
|
||||
EXPOSE 6379
|
||||
CMD ["supervisord", "-c", "supervisord.conf"]
|
||||
|
||||
# Switch to the non-root user before starting the application
|
||||
USER appuser
|
||||
|
||||
# Set environment variables to ptoduction
|
||||
ENV PYTHON_ENV=production
|
||||
|
||||
# Start the application using supervisord
|
||||
CMD ["supervisord", "-c", "supervisord.conf"]
|
||||
@@ -2,7 +2,7 @@
|
||||
import warnings
|
||||
|
||||
from .async_webcrawler import AsyncWebCrawler, CacheMode
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig
|
||||
|
||||
from .content_scraping_strategy import (
|
||||
ContentScrapingStrategy,
|
||||
@@ -121,6 +121,7 @@ __all__ = [
|
||||
"Crawl4aiDockerClient",
|
||||
"ProxyRotationStrategy",
|
||||
"RoundRobinProxyStrategy",
|
||||
"ProxyConfig"
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@ from .config import (
|
||||
MIN_WORD_THRESHOLD,
|
||||
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
|
||||
PROVIDER_MODELS,
|
||||
PROVIDER_MODELS_PREFIXES,
|
||||
SCREENSHOT_HEIGHT_TRESHOLD,
|
||||
PAGE_TIMEOUT,
|
||||
IMAGE_SCORE_THRESHOLD,
|
||||
@@ -27,11 +28,8 @@ import inspect
|
||||
from typing import Any, Dict, Optional
|
||||
from enum import Enum
|
||||
|
||||
from .proxy_strategy import ProxyConfig
|
||||
try:
|
||||
from .browser.models import DockerConfig
|
||||
except ImportError:
|
||||
DockerConfig = None
|
||||
# from .proxy_strategy import ProxyConfig
|
||||
|
||||
|
||||
|
||||
def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
|
||||
@@ -122,23 +120,25 @@ def from_serializable_dict(data: Any) -> Any:
|
||||
# Handle typed data
|
||||
if isinstance(data, dict) and "type" in data:
|
||||
# Handle plain dictionaries
|
||||
if data["type"] == "dict":
|
||||
if data["type"] == "dict" and "value" in data:
|
||||
return {k: from_serializable_dict(v) for k, v in data["value"].items()}
|
||||
|
||||
# Import from crawl4ai for class instances
|
||||
import crawl4ai
|
||||
|
||||
cls = getattr(crawl4ai, data["type"])
|
||||
if hasattr(crawl4ai, data["type"]):
|
||||
cls = getattr(crawl4ai, data["type"])
|
||||
|
||||
# Handle Enum
|
||||
if issubclass(cls, Enum):
|
||||
return cls(data["params"])
|
||||
# Handle Enum
|
||||
if issubclass(cls, Enum):
|
||||
return cls(data["params"])
|
||||
|
||||
# Handle class instances
|
||||
constructor_args = {
|
||||
k: from_serializable_dict(v) for k, v in data["params"].items()
|
||||
}
|
||||
return cls(**constructor_args)
|
||||
if "params" in data:
|
||||
# Handle class instances
|
||||
constructor_args = {
|
||||
k: from_serializable_dict(v) for k, v in data["params"].items()
|
||||
}
|
||||
return cls(**constructor_args)
|
||||
|
||||
# Handle lists
|
||||
if isinstance(data, list):
|
||||
@@ -159,6 +159,117 @@ def is_empty_value(value: Any) -> bool:
|
||||
return True
|
||||
return False
|
||||
|
||||
class ProxyConfig:
|
||||
def __init__(
|
||||
self,
|
||||
server: str,
|
||||
username: Optional[str] = None,
|
||||
password: Optional[str] = None,
|
||||
ip: Optional[str] = None,
|
||||
):
|
||||
"""Configuration class for a single proxy.
|
||||
|
||||
Args:
|
||||
server: Proxy server URL (e.g., "http://127.0.0.1:8080")
|
||||
username: Optional username for proxy authentication
|
||||
password: Optional password for proxy authentication
|
||||
ip: Optional IP address for verification purposes
|
||||
"""
|
||||
self.server = server
|
||||
self.username = username
|
||||
self.password = password
|
||||
|
||||
# Extract IP from server if not explicitly provided
|
||||
self.ip = ip or self._extract_ip_from_server()
|
||||
|
||||
def _extract_ip_from_server(self) -> Optional[str]:
|
||||
"""Extract IP address from server URL."""
|
||||
try:
|
||||
# Simple extraction assuming http://ip:port format
|
||||
if "://" in self.server:
|
||||
parts = self.server.split("://")[1].split(":")
|
||||
return parts[0]
|
||||
else:
|
||||
parts = self.server.split(":")
|
||||
return parts[0]
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def from_string(proxy_str: str) -> "ProxyConfig":
|
||||
"""Create a ProxyConfig from a string in the format 'ip:port:username:password'."""
|
||||
parts = proxy_str.split(":")
|
||||
if len(parts) == 4: # ip:port:username:password
|
||||
ip, port, username, password = parts
|
||||
return ProxyConfig(
|
||||
server=f"http://{ip}:{port}",
|
||||
username=username,
|
||||
password=password,
|
||||
ip=ip
|
||||
)
|
||||
elif len(parts) == 2: # ip:port only
|
||||
ip, port = parts
|
||||
return ProxyConfig(
|
||||
server=f"http://{ip}:{port}",
|
||||
ip=ip
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Invalid proxy string format: {proxy_str}")
|
||||
|
||||
@staticmethod
|
||||
def from_dict(proxy_dict: Dict) -> "ProxyConfig":
|
||||
"""Create a ProxyConfig from a dictionary."""
|
||||
return ProxyConfig(
|
||||
server=proxy_dict.get("server"),
|
||||
username=proxy_dict.get("username"),
|
||||
password=proxy_dict.get("password"),
|
||||
ip=proxy_dict.get("ip")
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def from_env(env_var: str = "PROXIES") -> List["ProxyConfig"]:
|
||||
"""Load proxies from environment variable.
|
||||
|
||||
Args:
|
||||
env_var: Name of environment variable containing comma-separated proxy strings
|
||||
|
||||
Returns:
|
||||
List of ProxyConfig objects
|
||||
"""
|
||||
proxies = []
|
||||
try:
|
||||
proxy_list = os.getenv(env_var, "").split(",")
|
||||
for proxy in proxy_list:
|
||||
if not proxy:
|
||||
continue
|
||||
proxies.append(ProxyConfig.from_string(proxy))
|
||||
except Exception as e:
|
||||
print(f"Error loading proxies from environment: {e}")
|
||||
return proxies
|
||||
|
||||
def to_dict(self) -> Dict:
|
||||
"""Convert to dictionary representation."""
|
||||
return {
|
||||
"server": self.server,
|
||||
"username": self.username,
|
||||
"password": self.password,
|
||||
"ip": self.ip
|
||||
}
|
||||
|
||||
def clone(self, **kwargs) -> "ProxyConfig":
|
||||
"""Create a copy of this configuration with updated values.
|
||||
|
||||
Args:
|
||||
**kwargs: Key-value pairs of configuration options to update
|
||||
|
||||
Returns:
|
||||
ProxyConfig: A new instance with the specified updates
|
||||
"""
|
||||
config_dict = self.to_dict()
|
||||
config_dict.update(kwargs)
|
||||
return ProxyConfig.from_dict(config_dict)
|
||||
|
||||
|
||||
|
||||
class BrowserConfig:
|
||||
"""
|
||||
@@ -195,8 +306,6 @@ class BrowserConfig:
|
||||
Default: None.
|
||||
proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
|
||||
If None, no additional proxy config. Default: None.
|
||||
docker_config (DockerConfig or dict or None): Configuration for Docker-based browser automation.
|
||||
Contains settings for Docker container operation. Default: None.
|
||||
viewport_width (int): Default viewport width for pages. Default: 1080.
|
||||
viewport_height (int): Default viewport height for pages. Default: 600.
|
||||
viewport (dict): Default viewport dimensions for pages. If set, overrides viewport_width and viewport_height.
|
||||
@@ -242,7 +351,6 @@ class BrowserConfig:
|
||||
channel: str = "chromium",
|
||||
proxy: str = None,
|
||||
proxy_config: Union[ProxyConfig, dict, None] = None,
|
||||
docker_config: Union[DockerConfig, dict, None] = None,
|
||||
viewport_width: int = 1080,
|
||||
viewport_height: int = 600,
|
||||
viewport: dict = None,
|
||||
@@ -283,15 +391,7 @@ class BrowserConfig:
|
||||
self.chrome_channel = ""
|
||||
self.proxy = proxy
|
||||
self.proxy_config = proxy_config
|
||||
|
||||
# Handle docker configuration
|
||||
if isinstance(docker_config, dict) and DockerConfig is not None:
|
||||
self.docker_config = DockerConfig.from_kwargs(docker_config)
|
||||
else:
|
||||
self.docker_config = docker_config
|
||||
|
||||
if self.docker_config:
|
||||
self.user_data_dir = self.docker_config.user_data_dir
|
||||
|
||||
self.viewport_width = viewport_width
|
||||
self.viewport_height = viewport_height
|
||||
@@ -362,7 +462,6 @@ class BrowserConfig:
|
||||
channel=kwargs.get("channel", "chromium"),
|
||||
proxy=kwargs.get("proxy"),
|
||||
proxy_config=kwargs.get("proxy_config", None),
|
||||
docker_config=kwargs.get("docker_config", None),
|
||||
viewport_width=kwargs.get("viewport_width", 1080),
|
||||
viewport_height=kwargs.get("viewport_height", 600),
|
||||
accept_downloads=kwargs.get("accept_downloads", False),
|
||||
@@ -419,13 +518,7 @@ class BrowserConfig:
|
||||
"debugging_port": self.debugging_port,
|
||||
"host": self.host,
|
||||
}
|
||||
|
||||
# Include docker_config if it exists
|
||||
if hasattr(self, "docker_config") and self.docker_config is not None:
|
||||
if hasattr(self.docker_config, "to_dict"):
|
||||
result["docker_config"] = self.docker_config.to_dict()
|
||||
else:
|
||||
result["docker_config"] = self.docker_config
|
||||
|
||||
|
||||
return result
|
||||
|
||||
@@ -1178,9 +1271,18 @@ class LLMConfig:
|
||||
elif api_token and api_token.startswith("env:"):
|
||||
self.api_token = os.getenv(api_token[4:])
|
||||
else:
|
||||
self.api_token = PROVIDER_MODELS.get(provider, "no-token") or os.getenv(
|
||||
DEFAULT_PROVIDER_API_KEY
|
||||
)
|
||||
# Check if given provider starts with any of key in PROVIDER_MODELS_PREFIXES
|
||||
# If not, check if it is in PROVIDER_MODELS
|
||||
prefixes = PROVIDER_MODELS_PREFIXES.keys()
|
||||
if any(provider.startswith(prefix) for prefix in prefixes):
|
||||
selected_prefix = next(
|
||||
(prefix for prefix in prefixes if provider.startswith(prefix)),
|
||||
None,
|
||||
)
|
||||
self.api_token = PROVIDER_MODELS_PREFIXES.get(selected_prefix)
|
||||
else:
|
||||
self.provider = DEFAULT_PROVIDER
|
||||
self.api_token = os.getenv(DEFAULT_PROVIDER_API_KEY)
|
||||
self.base_url = base_url
|
||||
self.temprature = temprature
|
||||
self.max_tokens = max_tokens
|
||||
|
||||
@@ -36,7 +36,7 @@ from .markdown_generation_strategy import (
|
||||
)
|
||||
from .deep_crawling import DeepCrawlDecorator
|
||||
from .async_logger import AsyncLogger, AsyncLoggerBase
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig, ProxyConfig
|
||||
from .async_dispatcher import * # noqa: F403
|
||||
from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter
|
||||
|
||||
@@ -291,12 +291,12 @@ class AsyncWebCrawler:
|
||||
|
||||
# Update proxy configuration from rotation strategy if available
|
||||
if config and config.proxy_rotation_strategy:
|
||||
next_proxy = await config.proxy_rotation_strategy.get_next_proxy()
|
||||
next_proxy : ProxyConfig = await config.proxy_rotation_strategy.get_next_proxy()
|
||||
if next_proxy:
|
||||
self.logger.info(
|
||||
message="Switch proxy: {proxy}",
|
||||
tag="PROXY",
|
||||
params={"proxy": next_proxy.server},
|
||||
params={"proxy": next_proxy.server}
|
||||
)
|
||||
config.proxy_config = next_proxy
|
||||
# config = config.clone(proxy_config=next_proxy)
|
||||
|
||||
@@ -94,6 +94,7 @@ class ManagedBrowser:
|
||||
host: str = "localhost",
|
||||
debugging_port: int = 9222,
|
||||
cdp_url: Optional[str] = None,
|
||||
browser_config: Optional[BrowserConfig] = None,
|
||||
):
|
||||
"""
|
||||
Initialize the ManagedBrowser instance.
|
||||
@@ -109,17 +110,19 @@ class ManagedBrowser:
|
||||
host (str): Host for debugging the browser. Default: "localhost".
|
||||
debugging_port (int): Port for debugging the browser. Default: 9222.
|
||||
cdp_url (str or None): CDP URL to connect to the browser. Default: None.
|
||||
browser_config (BrowserConfig): Configuration object containing all browser settings. Default: None.
|
||||
"""
|
||||
self.browser_type = browser_type
|
||||
self.user_data_dir = user_data_dir
|
||||
self.headless = headless
|
||||
self.browser_type = browser_config.browser_type
|
||||
self.user_data_dir = browser_config.user_data_dir
|
||||
self.headless = browser_config.headless
|
||||
self.browser_process = None
|
||||
self.temp_dir = None
|
||||
self.debugging_port = debugging_port
|
||||
self.host = host
|
||||
self.debugging_port = browser_config.debugging_port
|
||||
self.host = browser_config.host
|
||||
self.logger = logger
|
||||
self.shutting_down = False
|
||||
self.cdp_url = cdp_url
|
||||
self.cdp_url = browser_config.cdp_url
|
||||
self.browser_config = browser_config
|
||||
|
||||
async def start(self) -> str:
|
||||
"""
|
||||
@@ -142,6 +145,9 @@ class ManagedBrowser:
|
||||
# Get browser path and args based on OS and browser type
|
||||
# browser_path = self._get_browser_path()
|
||||
args = await self._get_browser_args()
|
||||
|
||||
if self.browser_config.extra_args:
|
||||
args.extend(self.browser_config.extra_args)
|
||||
|
||||
# Start browser process
|
||||
try:
|
||||
@@ -477,6 +483,7 @@ class BrowserManager:
|
||||
logger=self.logger,
|
||||
debugging_port=self.config.debugging_port,
|
||||
cdp_url=self.config.cdp_url,
|
||||
browser_config=self.config,
|
||||
)
|
||||
|
||||
async def start(self):
|
||||
@@ -491,10 +498,12 @@ class BrowserManager:
|
||||
|
||||
Note: This method should be called in a separate task to avoid blocking the main event loop.
|
||||
"""
|
||||
if self.playwright is None:
|
||||
from playwright.async_api import async_playwright
|
||||
if self.playwright is not None:
|
||||
await self.close()
|
||||
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
self.playwright = await async_playwright().start()
|
||||
self.playwright = await async_playwright().start()
|
||||
|
||||
if self.config.cdp_url or self.config.use_managed_browser:
|
||||
self.config.use_managed_browser = True
|
||||
|
||||
@@ -29,6 +29,14 @@ PROVIDER_MODELS = {
|
||||
'gemini/gemini-2.0-flash-lite-preview-02-05': os.getenv("GEMINI_API_KEY"),
|
||||
"deepseek/deepseek-chat": os.getenv("DEEPSEEK_API_KEY"),
|
||||
}
|
||||
PROVIDER_MODELS_PREFIXES = {
|
||||
"ollama": "no-token-needed", # Any model from Ollama no need for API token
|
||||
"groq": os.getenv("GROQ_API_KEY"),
|
||||
"openai": os.getenv("OPENAI_API_KEY"),
|
||||
"anthropic": os.getenv("ANTHROPIC_API_KEY"),
|
||||
"gemini": os.getenv("GEMINI_API_KEY"),
|
||||
"deepseek": os.getenv("DEEPSEEK_API_KEY"),
|
||||
}
|
||||
|
||||
# Chunk token threshold
|
||||
CHUNK_TOKEN_THRESHOLD = 2**11 # 2048 tokens
|
||||
|
||||
@@ -7,7 +7,9 @@ import time
|
||||
|
||||
from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION, JSON_SCHEMA_BUILDER_XPATH, PROMPT_EXTRACT_INFERRED_SCHEMA
|
||||
from .config import (
|
||||
DEFAULT_PROVIDER, CHUNK_TOKEN_THRESHOLD,
|
||||
DEFAULT_PROVIDER,
|
||||
DEFAULT_PROVIDER_API_KEY,
|
||||
CHUNK_TOKEN_THRESHOLD,
|
||||
OVERLAP_RATE,
|
||||
WORD_TOKEN_RATE,
|
||||
)
|
||||
@@ -542,6 +544,11 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
"""
|
||||
super().__init__( input_format=input_format, **kwargs)
|
||||
self.llm_config = llm_config
|
||||
if not self.llm_config:
|
||||
self.llm_config = create_llm_config(
|
||||
provider=DEFAULT_PROVIDER,
|
||||
api_token=os.environ.get(DEFAULT_PROVIDER_API_KEY),
|
||||
)
|
||||
self.instruction = instruction
|
||||
self.extract_type = extraction_type
|
||||
self.schema = schema
|
||||
|
||||
@@ -40,10 +40,25 @@ def setup_home_directory():
|
||||
f.write("")
|
||||
|
||||
def post_install():
|
||||
"""Run all post-installation tasks"""
|
||||
"""
|
||||
Run all post-installation tasks.
|
||||
Checks CRAWL4AI_MODE environment variable. If set to 'api',
|
||||
skips Playwright browser installation.
|
||||
"""
|
||||
logger.info("Running post-installation setup...", tag="INIT")
|
||||
setup_home_directory()
|
||||
install_playwright()
|
||||
|
||||
# Check environment variable to conditionally skip Playwright install
|
||||
run_mode = os.getenv('CRAWL4AI_MODE')
|
||||
if run_mode == 'api':
|
||||
logger.warning(
|
||||
"CRAWL4AI_MODE=api detected. Skipping Playwright browser installation.",
|
||||
tag="SETUP"
|
||||
)
|
||||
else:
|
||||
# Proceed with installation only if mode is not 'api'
|
||||
install_playwright()
|
||||
|
||||
run_migration()
|
||||
# TODO: Will be added in the future
|
||||
# setup_builtin_browser()
|
||||
|
||||
@@ -4,6 +4,9 @@ from itertools import cycle
|
||||
import os
|
||||
|
||||
|
||||
########### ATTENTION PEOPLE OF EARTH ###########
|
||||
# I have moved this config to async_configs.py, kept it here, in case someone still importing it, however
|
||||
# be a dear and follow `from crawl4ai import ProxyConfig` instead :)
|
||||
class ProxyConfig:
|
||||
def __init__(
|
||||
self,
|
||||
@@ -119,12 +122,12 @@ class ProxyRotationStrategy(ABC):
|
||||
"""Base abstract class for proxy rotation strategies"""
|
||||
|
||||
@abstractmethod
|
||||
async def get_next_proxy(self) -> Optional[Dict]:
|
||||
async def get_next_proxy(self) -> Optional[ProxyConfig]:
|
||||
"""Get next proxy configuration from the strategy"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def add_proxies(self, proxies: List[Dict]):
|
||||
def add_proxies(self, proxies: List[ProxyConfig]):
|
||||
"""Add proxy configurations to the strategy"""
|
||||
pass
|
||||
|
||||
|
||||
@@ -9,83 +9,44 @@ from urllib.parse import urlparse
|
||||
import OpenSSL.crypto
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class SSLCertificate:
|
||||
# === Inherit from dict ===
|
||||
class SSLCertificate(dict):
|
||||
"""
|
||||
A class representing an SSL certificate with methods to export in various formats.
|
||||
A class representing an SSL certificate, behaving like a dictionary
|
||||
for direct JSON serialization. It stores the certificate information internally
|
||||
and provides methods for export and property access.
|
||||
|
||||
Attributes:
|
||||
cert_info (Dict[str, Any]): The certificate information.
|
||||
|
||||
Methods:
|
||||
from_url(url: str, timeout: int = 10) -> Optional['SSLCertificate']: Create SSLCertificate instance from a URL.
|
||||
from_file(file_path: str) -> Optional['SSLCertificate']: Create SSLCertificate instance from a file.
|
||||
from_binary(binary_data: bytes) -> Optional['SSLCertificate']: Create SSLCertificate instance from binary data.
|
||||
export_as_pem() -> str: Export the certificate as PEM format.
|
||||
export_as_der() -> bytes: Export the certificate as DER format.
|
||||
export_as_json() -> Dict[str, Any]: Export the certificate as JSON format.
|
||||
export_as_text() -> str: Export the certificate as text format.
|
||||
Inherits from dict, so instances are directly JSON serializable.
|
||||
"""
|
||||
|
||||
# Use __slots__ for potential memory optimization if desired, though less common when inheriting dict
|
||||
# __slots__ = ("_cert_info",) # If using slots, be careful with dict inheritance interaction
|
||||
|
||||
def __init__(self, cert_info: Dict[str, Any]):
|
||||
self._cert_info = self._decode_cert_data(cert_info)
|
||||
|
||||
@staticmethod
|
||||
def from_url(url: str, timeout: int = 10) -> Optional["SSLCertificate"]:
|
||||
"""
|
||||
Create SSLCertificate instance from a URL.
|
||||
Initializes the SSLCertificate object.
|
||||
|
||||
Args:
|
||||
url (str): URL of the website.
|
||||
timeout (int): Timeout for the connection (default: 10).
|
||||
|
||||
Returns:
|
||||
Optional[SSLCertificate]: SSLCertificate instance if successful, None otherwise.
|
||||
cert_info (Dict[str, Any]): The raw certificate dictionary.
|
||||
"""
|
||||
try:
|
||||
hostname = urlparse(url).netloc
|
||||
if ":" in hostname:
|
||||
hostname = hostname.split(":")[0]
|
||||
# 1. Decode the data (handle bytes -> str)
|
||||
decoded_info = self._decode_cert_data(cert_info)
|
||||
|
||||
context = ssl.create_default_context()
|
||||
with socket.create_connection((hostname, 443), timeout=timeout) as sock:
|
||||
with context.wrap_socket(sock, server_hostname=hostname) as ssock:
|
||||
cert_binary = ssock.getpeercert(binary_form=True)
|
||||
x509 = OpenSSL.crypto.load_certificate(
|
||||
OpenSSL.crypto.FILETYPE_ASN1, cert_binary
|
||||
)
|
||||
# 2. Store the decoded info internally (optional but good practice)
|
||||
# self._cert_info = decoded_info # You can keep this if methods rely on it
|
||||
|
||||
cert_info = {
|
||||
"subject": dict(x509.get_subject().get_components()),
|
||||
"issuer": dict(x509.get_issuer().get_components()),
|
||||
"version": x509.get_version(),
|
||||
"serial_number": hex(x509.get_serial_number()),
|
||||
"not_before": x509.get_notBefore(),
|
||||
"not_after": x509.get_notAfter(),
|
||||
"fingerprint": x509.digest("sha256").hex(),
|
||||
"signature_algorithm": x509.get_signature_algorithm(),
|
||||
"raw_cert": base64.b64encode(cert_binary),
|
||||
}
|
||||
|
||||
# Add extensions
|
||||
extensions = []
|
||||
for i in range(x509.get_extension_count()):
|
||||
ext = x509.get_extension(i)
|
||||
extensions.append(
|
||||
{"name": ext.get_short_name(), "value": str(ext)}
|
||||
)
|
||||
cert_info["extensions"] = extensions
|
||||
|
||||
return SSLCertificate(cert_info)
|
||||
|
||||
except Exception:
|
||||
return None
|
||||
# 3. Initialize the dictionary part of the object with the decoded data
|
||||
super().__init__(decoded_info)
|
||||
|
||||
@staticmethod
|
||||
def _decode_cert_data(data: Any) -> Any:
|
||||
"""Helper method to decode bytes in certificate data."""
|
||||
if isinstance(data, bytes):
|
||||
return data.decode("utf-8")
|
||||
try:
|
||||
# Try UTF-8 first, fallback to latin-1 for arbitrary bytes
|
||||
return data.decode("utf-8")
|
||||
except UnicodeDecodeError:
|
||||
return data.decode("latin-1") # Or handle as needed, maybe hex representation
|
||||
elif isinstance(data, dict):
|
||||
return {
|
||||
(
|
||||
@@ -97,36 +58,119 @@ class SSLCertificate:
|
||||
return [SSLCertificate._decode_cert_data(item) for item in data]
|
||||
return data
|
||||
|
||||
@staticmethod
|
||||
def from_url(url: str, timeout: int = 10) -> Optional["SSLCertificate"]:
|
||||
"""
|
||||
Create SSLCertificate instance from a URL. Fetches cert info and initializes.
|
||||
(Fetching logic remains the same)
|
||||
"""
|
||||
cert_info_raw = None # Variable to hold the fetched dict
|
||||
try:
|
||||
hostname = urlparse(url).netloc
|
||||
if ":" in hostname:
|
||||
hostname = hostname.split(":")[0]
|
||||
|
||||
context = ssl.create_default_context()
|
||||
# Set check_hostname to False and verify_mode to CERT_NONE temporarily
|
||||
# for potentially problematic certificates during fetch, but parse the result regardless.
|
||||
# context.check_hostname = False
|
||||
# context.verify_mode = ssl.CERT_NONE
|
||||
|
||||
with socket.create_connection((hostname, 443), timeout=timeout) as sock:
|
||||
with context.wrap_socket(sock, server_hostname=hostname) as ssock:
|
||||
cert_binary = ssock.getpeercert(binary_form=True)
|
||||
if not cert_binary:
|
||||
print(f"Warning: No certificate returned for {hostname}")
|
||||
return None
|
||||
|
||||
x509 = OpenSSL.crypto.load_certificate(
|
||||
OpenSSL.crypto.FILETYPE_ASN1, cert_binary
|
||||
)
|
||||
|
||||
# Create the dictionary directly
|
||||
cert_info_raw = {
|
||||
"subject": dict(x509.get_subject().get_components()),
|
||||
"issuer": dict(x509.get_issuer().get_components()),
|
||||
"version": x509.get_version(),
|
||||
"serial_number": hex(x509.get_serial_number()),
|
||||
"not_before": x509.get_notBefore(), # Keep as bytes initially, _decode handles it
|
||||
"not_after": x509.get_notAfter(), # Keep as bytes initially
|
||||
"fingerprint": x509.digest("sha256").hex(), # hex() is already string
|
||||
"signature_algorithm": x509.get_signature_algorithm(), # Keep as bytes
|
||||
"raw_cert": base64.b64encode(cert_binary), # Base64 is bytes, _decode handles it
|
||||
}
|
||||
|
||||
# Add extensions
|
||||
extensions = []
|
||||
for i in range(x509.get_extension_count()):
|
||||
ext = x509.get_extension(i)
|
||||
# get_short_name() returns bytes, str(ext) handles value conversion
|
||||
extensions.append(
|
||||
{"name": ext.get_short_name(), "value": str(ext)}
|
||||
)
|
||||
cert_info_raw["extensions"] = extensions
|
||||
|
||||
except ssl.SSLCertVerificationError as e:
|
||||
print(f"SSL Verification Error for {url}: {e}")
|
||||
# Decide if you want to proceed or return None based on your needs
|
||||
# You might try fetching without verification here if needed, but be cautious.
|
||||
return None
|
||||
except socket.gaierror:
|
||||
print(f"Could not resolve hostname: {hostname}")
|
||||
return None
|
||||
except socket.timeout:
|
||||
print(f"Connection timed out for {url}")
|
||||
return None
|
||||
except Exception as e:
|
||||
print(f"Error fetching/processing certificate for {url}: {e}")
|
||||
# Log the full error details if needed: logging.exception("Cert fetch error")
|
||||
return None
|
||||
|
||||
# If successful, create the SSLCertificate instance from the dictionary
|
||||
if cert_info_raw:
|
||||
return SSLCertificate(cert_info_raw)
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
# --- Properties now access the dictionary items directly via self[] ---
|
||||
@property
|
||||
def issuer(self) -> Dict[str, str]:
|
||||
return self.get("issuer", {}) # Use self.get for safety
|
||||
|
||||
@property
|
||||
def subject(self) -> Dict[str, str]:
|
||||
return self.get("subject", {})
|
||||
|
||||
@property
|
||||
def valid_from(self) -> str:
|
||||
return self.get("not_before", "")
|
||||
|
||||
@property
|
||||
def valid_until(self) -> str:
|
||||
return self.get("not_after", "")
|
||||
|
||||
@property
|
||||
def fingerprint(self) -> str:
|
||||
return self.get("fingerprint", "")
|
||||
|
||||
# --- Export methods can use `self` directly as it is the dict ---
|
||||
def to_json(self, filepath: Optional[str] = None) -> Optional[str]:
|
||||
"""
|
||||
Export certificate as JSON.
|
||||
|
||||
Args:
|
||||
filepath (Optional[str]): Path to save the JSON file (default: None).
|
||||
|
||||
Returns:
|
||||
Optional[str]: JSON string if successful, None otherwise.
|
||||
"""
|
||||
json_str = json.dumps(self._cert_info, indent=2, ensure_ascii=False)
|
||||
"""Export certificate as JSON."""
|
||||
# `self` is already the dictionary we want to serialize
|
||||
json_str = json.dumps(self, indent=2, ensure_ascii=False)
|
||||
if filepath:
|
||||
Path(filepath).write_text(json_str, encoding="utf-8")
|
||||
return None
|
||||
return json_str
|
||||
|
||||
def to_pem(self, filepath: Optional[str] = None) -> Optional[str]:
|
||||
"""
|
||||
Export certificate as PEM.
|
||||
|
||||
Args:
|
||||
filepath (Optional[str]): Path to save the PEM file (default: None).
|
||||
|
||||
Returns:
|
||||
Optional[str]: PEM string if successful, None otherwise.
|
||||
"""
|
||||
"""Export certificate as PEM."""
|
||||
try:
|
||||
# Decode the raw_cert (which should be string due to _decode)
|
||||
raw_cert_bytes = base64.b64decode(self.get("raw_cert", ""))
|
||||
x509 = OpenSSL.crypto.load_certificate(
|
||||
OpenSSL.crypto.FILETYPE_ASN1,
|
||||
base64.b64decode(self._cert_info["raw_cert"]),
|
||||
OpenSSL.crypto.FILETYPE_ASN1, raw_cert_bytes
|
||||
)
|
||||
pem_data = OpenSSL.crypto.dump_certificate(
|
||||
OpenSSL.crypto.FILETYPE_PEM, x509
|
||||
@@ -136,49 +180,25 @@ class SSLCertificate:
|
||||
Path(filepath).write_text(pem_data, encoding="utf-8")
|
||||
return None
|
||||
return pem_data
|
||||
except Exception:
|
||||
return None
|
||||
except Exception as e:
|
||||
print(f"Error converting to PEM: {e}")
|
||||
return None
|
||||
|
||||
def to_der(self, filepath: Optional[str] = None) -> Optional[bytes]:
|
||||
"""
|
||||
Export certificate as DER.
|
||||
|
||||
Args:
|
||||
filepath (Optional[str]): Path to save the DER file (default: None).
|
||||
|
||||
Returns:
|
||||
Optional[bytes]: DER bytes if successful, None otherwise.
|
||||
"""
|
||||
"""Export certificate as DER."""
|
||||
try:
|
||||
der_data = base64.b64decode(self._cert_info["raw_cert"])
|
||||
# Decode the raw_cert (which should be string due to _decode)
|
||||
der_data = base64.b64decode(self.get("raw_cert", ""))
|
||||
if filepath:
|
||||
Path(filepath).write_bytes(der_data)
|
||||
return None
|
||||
return der_data
|
||||
except Exception:
|
||||
return None
|
||||
except Exception as e:
|
||||
print(f"Error converting to DER: {e}")
|
||||
return None
|
||||
|
||||
@property
|
||||
def issuer(self) -> Dict[str, str]:
|
||||
"""Get certificate issuer information."""
|
||||
return self._cert_info.get("issuer", {})
|
||||
|
||||
@property
|
||||
def subject(self) -> Dict[str, str]:
|
||||
"""Get certificate subject information."""
|
||||
return self._cert_info.get("subject", {})
|
||||
|
||||
@property
|
||||
def valid_from(self) -> str:
|
||||
"""Get certificate validity start date."""
|
||||
return self._cert_info.get("not_before", "")
|
||||
|
||||
@property
|
||||
def valid_until(self) -> str:
|
||||
"""Get certificate validity end date."""
|
||||
return self._cert_info.get("not_after", "")
|
||||
|
||||
@property
|
||||
def fingerprint(self) -> str:
|
||||
"""Get certificate fingerprint."""
|
||||
return self._cert_info.get("fingerprint", "")
|
||||
# Optional: Add __repr__ for better debugging
|
||||
def __repr__(self) -> str:
|
||||
subject_cn = self.subject.get('CN', 'N/A')
|
||||
issuer_cn = self.issuer.get('CN', 'N/A')
|
||||
return f"<SSLCertificate Subject='{subject_cn}' Issuer='{issuer_cn}'>"
|
||||
644
deploy/docker/README-new.md
Normal file
644
deploy/docker/README-new.md
Normal file
@@ -0,0 +1,644 @@
|
||||
# Crawl4AI Docker Guide 🐳
|
||||
|
||||
## Table of Contents
|
||||
- [Prerequisites](#prerequisites)
|
||||
- [Installation](#installation)
|
||||
- [Option 1: Using Docker Compose (Recommended)](#option-1-using-docker-compose-recommended)
|
||||
- [Option 2: Manual Local Build & Run](#option-2-manual-local-build--run)
|
||||
- [Option 3: Using Pre-built Docker Hub Images](#option-3-using-pre-built-docker-hub-images)
|
||||
- [Dockerfile Parameters](#dockerfile-parameters)
|
||||
- [Using the API](#using-the-api)
|
||||
- [Understanding Request Schema](#understanding-request-schema)
|
||||
- [REST API Examples](#rest-api-examples)
|
||||
- [Python SDK](#python-sdk)
|
||||
- [Metrics & Monitoring](#metrics--monitoring)
|
||||
- [Deployment Scenarios](#deployment-scenarios)
|
||||
- [Complete Examples](#complete-examples)
|
||||
- [Server Configuration](#server-configuration)
|
||||
- [Understanding config.yml](#understanding-configyml)
|
||||
- [JWT Authentication](#jwt-authentication)
|
||||
- [Configuration Tips and Best Practices](#configuration-tips-and-best-practices)
|
||||
- [Customizing Your Configuration](#customizing-your-configuration)
|
||||
- [Configuration Recommendations](#configuration-recommendations)
|
||||
- [Getting Help](#getting-help)
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Before we dive in, make sure you have:
|
||||
- Docker installed and running (version 20.10.0 or higher), including `docker compose` (usually bundled with Docker Desktop).
|
||||
- `git` for cloning the repository.
|
||||
- At least 4GB of RAM available for the container (more recommended for heavy use).
|
||||
- Python 3.10+ (if using the Python SDK).
|
||||
- Node.js 16+ (if using the Node.js examples).
|
||||
|
||||
> 💡 **Pro tip**: Run `docker info` to check your Docker installation and available resources.
|
||||
|
||||
## Installation
|
||||
|
||||
We offer several ways to get the Crawl4AI server running. Docker Compose is the easiest way to manage local builds and runs.
|
||||
|
||||
### Option 1: Using Docker Compose (Recommended)
|
||||
|
||||
Docker Compose simplifies building and running the service, especially for local development and testing across different platforms.
|
||||
|
||||
#### 1. Clone Repository
|
||||
|
||||
```bash
|
||||
git clone https://github.com/unclecode/crawl4ai.git
|
||||
cd crawl4ai
|
||||
```
|
||||
|
||||
#### 2. Environment Setup (API Keys)
|
||||
|
||||
If you plan to use LLMs, copy the example environment file and add your API keys. This file should be in the **project root directory**.
|
||||
|
||||
```bash
|
||||
# Make sure you are in the 'crawl4ai' root directory
|
||||
cp deploy/docker/.llm.env.example .llm.env
|
||||
|
||||
# Now edit .llm.env and add your API keys
|
||||
# Example content:
|
||||
# OPENAI_API_KEY=sk-your-key
|
||||
# ANTHROPIC_API_KEY=your-anthropic-key
|
||||
# ...
|
||||
```
|
||||
> 🔑 **Note**: Keep your API keys secure! Never commit `.llm.env` to version control.
|
||||
|
||||
#### 3. Build and Run with Compose
|
||||
|
||||
The `docker-compose.yml` file in the project root defines services for different scenarios using **profiles**.
|
||||
|
||||
* **Build and Run Locally (AMD64):**
|
||||
```bash
|
||||
# Builds the image locally using Dockerfile and runs it
|
||||
docker compose --profile local-amd64 up --build -d
|
||||
```
|
||||
|
||||
* **Build and Run Locally (ARM64):**
|
||||
```bash
|
||||
# Builds the image locally using Dockerfile and runs it
|
||||
docker compose --profile local-arm64 up --build -d
|
||||
```
|
||||
|
||||
* **Run Pre-built Image from Docker Hub (AMD64):**
|
||||
```bash
|
||||
# Pulls and runs the specified AMD64 image from Docker Hub
|
||||
# (Set VERSION env var for specific tags, e.g., VERSION=0.5.1-d1)
|
||||
docker compose --profile hub-amd64 up -d
|
||||
```
|
||||
|
||||
* **Run Pre-built Image from Docker Hub (ARM64):**
|
||||
```bash
|
||||
# Pulls and runs the specified ARM64 image from Docker Hub
|
||||
docker compose --profile hub-arm64 up -d
|
||||
```
|
||||
|
||||
> The server will be available at `http://localhost:11235`.
|
||||
|
||||
#### 4. Stopping Compose Services
|
||||
|
||||
```bash
|
||||
# Stop the service(s) associated with a profile (e.g., local-amd64)
|
||||
docker compose --profile local-amd64 down
|
||||
```
|
||||
|
||||
### Option 2: Manual Local Build & Run
|
||||
|
||||
If you prefer not to use Docker Compose for local builds.
|
||||
|
||||
#### 1. Clone Repository & Setup Environment
|
||||
|
||||
Follow steps 1 and 2 from the Docker Compose section above (clone repo, `cd crawl4ai`, create `.llm.env` in the root).
|
||||
|
||||
#### 2. Build the Image (Multi-Arch)
|
||||
|
||||
Use `docker buildx` to build the image. This example builds for multiple platforms and loads the image matching your host architecture into the local Docker daemon.
|
||||
|
||||
```bash
|
||||
# Make sure you are in the 'crawl4ai' root directory
|
||||
docker buildx build --platform linux/amd64,linux/arm64 -t crawl4ai-local:latest --load .
|
||||
```
|
||||
|
||||
#### 3. Run the Container
|
||||
|
||||
* **Basic run (no LLM support):**
|
||||
```bash
|
||||
# Replace --platform if your host is ARM64
|
||||
docker run -d \
|
||||
-p 11235:11235 \
|
||||
--name crawl4ai-standalone \
|
||||
--shm-size=1g \
|
||||
--platform linux/amd64 \
|
||||
crawl4ai-local:latest
|
||||
```
|
||||
|
||||
* **With LLM support:**
|
||||
```bash
|
||||
# Make sure .llm.env is in the current directory (project root)
|
||||
# Replace --platform if your host is ARM64
|
||||
docker run -d \
|
||||
-p 11235:11235 \
|
||||
--name crawl4ai-standalone \
|
||||
--env-file .llm.env \
|
||||
--shm-size=1g \
|
||||
--platform linux/amd64 \
|
||||
crawl4ai-local:latest
|
||||
```
|
||||
|
||||
> The server will be available at `http://localhost:11235`.
|
||||
|
||||
#### 4. Stopping the Manual Container
|
||||
|
||||
```bash
|
||||
docker stop crawl4ai-standalone && docker rm crawl4ai-standalone
|
||||
```
|
||||
|
||||
### Option 3: Using Pre-built Docker Hub Images
|
||||
|
||||
Pull and run images directly from Docker Hub without building locally.
|
||||
|
||||
#### 1. Pull the Image
|
||||
|
||||
We use a versioning scheme like `LIBRARY_VERSION-dREVISION` (e.g., `0.5.1-d1`). The `latest` tag points to the most recent stable release. Images are built with multi-arch manifests, so Docker usually pulls the correct version for your system automatically.
|
||||
|
||||
```bash
|
||||
# Pull a specific version (recommended for stability)
|
||||
docker pull unclecode/crawl4ai:0.5.1-d1
|
||||
|
||||
# Or pull the latest stable version
|
||||
docker pull unclecode/crawl4ai:latest
|
||||
```
|
||||
|
||||
#### 2. Setup Environment (API Keys)
|
||||
|
||||
If using LLMs, create the `.llm.env` file in a directory of your choice, similar to Step 2 in the Compose section.
|
||||
|
||||
#### 3. Run the Container
|
||||
|
||||
* **Basic run:**
|
||||
```bash
|
||||
docker run -d \
|
||||
-p 11235:11235 \
|
||||
--name crawl4ai-hub \
|
||||
--shm-size=1g \
|
||||
unclecode/crawl4ai:0.5.1-d1 # Or use :latest
|
||||
```
|
||||
|
||||
* **With LLM support:**
|
||||
```bash
|
||||
# Make sure .llm.env is in the current directory you are running docker from
|
||||
docker run -d \
|
||||
-p 11235:11235 \
|
||||
--name crawl4ai-hub \
|
||||
--env-file .llm.env \
|
||||
--shm-size=1g \
|
||||
unclecode/crawl4ai:0.5.1-d1 # Or use :latest
|
||||
```
|
||||
|
||||
> The server will be available at `http://localhost:11235`.
|
||||
|
||||
#### 4. Stopping the Hub Container
|
||||
|
||||
```bash
|
||||
docker stop crawl4ai-hub && docker rm crawl4ai-hub
|
||||
```
|
||||
|
||||
#### Docker Hub Versioning Explained
|
||||
|
||||
* **Image Name:** `unclecode/crawl4ai`
|
||||
* **Tag Format:** `LIBRARY_VERSION-dREVISION`
|
||||
* `LIBRARY_VERSION`: The Semantic Version of the core `crawl4ai` Python library included (e.g., `0.5.1`).
|
||||
* `dREVISION`: An incrementing number (starting at `d1`) for Docker build changes made *without* changing the library version (e.g., base image updates, dependency fixes). Resets to `d1` for each new `LIBRARY_VERSION`.
|
||||
* **Example:** `unclecode/crawl4ai:0.5.1-d1`
|
||||
* **`latest` Tag:** Points to the most recent stable `LIBRARY_VERSION-dREVISION`.
|
||||
* **Multi-Arch:** Images support `linux/amd64` and `linux/arm64`. Docker automatically selects the correct architecture.
|
||||
|
||||
---
|
||||
|
||||
*(Rest of the document remains largely the same, but with key updates below)*
|
||||
|
||||
---
|
||||
|
||||
## Dockerfile Parameters
|
||||
|
||||
You can customize the image build process using build arguments (`--build-arg`). These are typically used via `docker buildx build` or within the `docker-compose.yml` file.
|
||||
|
||||
```bash
|
||||
# Example: Build with 'all' features using buildx
|
||||
docker buildx build \
|
||||
--platform linux/amd64,linux/arm64 \
|
||||
--build-arg INSTALL_TYPE=all \
|
||||
-t yourname/crawl4ai-all:latest \
|
||||
--load \
|
||||
. # Build from root context
|
||||
```
|
||||
|
||||
### Build Arguments Explained
|
||||
|
||||
| Argument | Description | Default | Options |
|
||||
| :----------- | :--------------------------------------- | :-------- | :--------------------------------- |
|
||||
| INSTALL_TYPE | Feature set | `default` | `default`, `all`, `torch`, `transformer` |
|
||||
| ENABLE_GPU | GPU support (CUDA for AMD64) | `false` | `true`, `false` |
|
||||
| APP_HOME | Install path inside container (advanced) | `/app` | any valid path |
|
||||
| USE_LOCAL | Install library from local source | `true` | `true`, `false` |
|
||||
| GITHUB_REPO | Git repo to clone if USE_LOCAL=false | *(see Dockerfile)* | any git URL |
|
||||
| GITHUB_BRANCH| Git branch to clone if USE_LOCAL=false | `main` | any branch name |
|
||||
|
||||
*(Note: PYTHON_VERSION is fixed by the `FROM` instruction in the Dockerfile)*
|
||||
|
||||
### Build Best Practices
|
||||
|
||||
1. **Choose the Right Install Type**
|
||||
* `default`: Basic installation, smallest image size. Suitable for most standard web scraping and markdown generation.
|
||||
* `all`: Full features including `torch` and `transformers` for advanced extraction strategies (e.g., CosineStrategy, certain LLM filters). Significantly larger image. Ensure you need these extras.
|
||||
2. **Platform Considerations**
|
||||
* Use `buildx` for building multi-architecture images, especially for pushing to registries.
|
||||
* Use `docker compose` profiles (`local-amd64`, `local-arm64`) for easy platform-specific local builds.
|
||||
3. **Performance Optimization**
|
||||
* The image automatically includes platform-specific optimizations (OpenMP for AMD64, OpenBLAS for ARM64).
|
||||
|
||||
---
|
||||
|
||||
## Using the API
|
||||
|
||||
Communicate with the running Docker server via its REST API (defaulting to `http://localhost:11235`). You can use the Python SDK or make direct HTTP requests.
|
||||
|
||||
### Python SDK
|
||||
|
||||
Install the SDK: `pip install crawl4ai`
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai.docker_client import Crawl4aiDockerClient
|
||||
from crawl4ai import BrowserConfig, CrawlerRunConfig, CacheMode # Assuming you have crawl4ai installed
|
||||
|
||||
async def main():
|
||||
# Point to the correct server port
|
||||
async with Crawl4aiDockerClient(base_url="http://localhost:11235", verbose=True) as client:
|
||||
# If JWT is enabled on the server, authenticate first:
|
||||
# await client.authenticate("user@example.com") # See Server Configuration section
|
||||
|
||||
# Example Non-streaming crawl
|
||||
print("--- Running Non-Streaming Crawl ---")
|
||||
results = await client.crawl(
|
||||
["https://httpbin.org/html"],
|
||||
browser_config=BrowserConfig(headless=True), # Use library classes for config aid
|
||||
crawler_config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
)
|
||||
if results: # client.crawl returns None on failure
|
||||
print(f"Non-streaming results success: {results.success}")
|
||||
if results.success:
|
||||
for result in results: # Iterate through the CrawlResultContainer
|
||||
print(f"URL: {result.url}, Success: {result.success}")
|
||||
else:
|
||||
print("Non-streaming crawl failed.")
|
||||
|
||||
|
||||
# Example Streaming crawl
|
||||
print("\n--- Running Streaming Crawl ---")
|
||||
stream_config = CrawlerRunConfig(stream=True, cache_mode=CacheMode.BYPASS)
|
||||
try:
|
||||
async for result in await client.crawl( # client.crawl returns an async generator for streaming
|
||||
["https://httpbin.org/html", "https://httpbin.org/links/5/0"],
|
||||
browser_config=BrowserConfig(headless=True),
|
||||
crawler_config=stream_config
|
||||
):
|
||||
print(f"Streamed result: URL: {result.url}, Success: {result.success}")
|
||||
except Exception as e:
|
||||
print(f"Streaming crawl failed: {e}")
|
||||
|
||||
|
||||
# Example Get schema
|
||||
print("\n--- Getting Schema ---")
|
||||
schema = await client.get_schema()
|
||||
print(f"Schema received: {bool(schema)}") # Print whether schema was received
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
*(SDK parameters like timeout, verify_ssl etc. remain the same)*
|
||||
|
||||
### Second Approach: Direct API Calls
|
||||
|
||||
Crucially, when sending configurations directly via JSON, they **must** follow the `{"type": "ClassName", "params": {...}}` structure for any non-primitive value (like config objects or strategies). Dictionaries must be wrapped as `{"type": "dict", "value": {...}}`.
|
||||
|
||||
*(Keep the detailed explanation of Configuration Structure, Basic Pattern, Simple vs Complex, Strategy Pattern, Complex Nested Example, Quick Grammar Overview, Important Rules, Pro Tip)*
|
||||
|
||||
#### More Examples *(Ensure Schema example uses type/value wrapper)*
|
||||
|
||||
**Advanced Crawler Configuration**
|
||||
*(Keep example, ensure cache_mode uses valid enum value like "bypass")*
|
||||
|
||||
**Extraction Strategy**
|
||||
```json
|
||||
{
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"extraction_strategy": {
|
||||
"type": "JsonCssExtractionStrategy",
|
||||
"params": {
|
||||
"schema": {
|
||||
"type": "dict",
|
||||
"value": {
|
||||
"baseSelector": "article.post",
|
||||
"fields": [
|
||||
{"name": "title", "selector": "h1", "type": "text"},
|
||||
{"name": "content", "selector": ".content", "type": "html"}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**LLM Extraction Strategy** *(Keep example, ensure schema uses type/value wrapper)*
|
||||
*(Keep Deep Crawler Example)*
|
||||
|
||||
### REST API Examples
|
||||
|
||||
Update URLs to use port `11235`.
|
||||
|
||||
#### Simple Crawl
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
# Configuration objects converted to the required JSON structure
|
||||
browser_config_payload = {
|
||||
"type": "BrowserConfig",
|
||||
"params": {"headless": True}
|
||||
}
|
||||
crawler_config_payload = {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {"stream": False, "cache_mode": "bypass"} # Use string value of enum
|
||||
}
|
||||
|
||||
crawl_payload = {
|
||||
"urls": ["https://httpbin.org/html"],
|
||||
"browser_config": browser_config_payload,
|
||||
"crawler_config": crawler_config_payload
|
||||
}
|
||||
response = requests.post(
|
||||
"http://localhost:11235/crawl", # Updated port
|
||||
# headers={"Authorization": f"Bearer {token}"}, # If JWT is enabled
|
||||
json=crawl_payload
|
||||
)
|
||||
print(f"Status Code: {response.status_code}")
|
||||
if response.ok:
|
||||
print(response.json())
|
||||
else:
|
||||
print(f"Error: {response.text}")
|
||||
|
||||
```
|
||||
|
||||
#### Streaming Results
|
||||
|
||||
```python
|
||||
import json
|
||||
import httpx # Use httpx for async streaming example
|
||||
|
||||
async def test_stream_crawl(token: str = None): # Made token optional
|
||||
"""Test the /crawl/stream endpoint with multiple URLs."""
|
||||
url = "http://localhost:11235/crawl/stream" # Updated port
|
||||
payload = {
|
||||
"urls": [
|
||||
"https://httpbin.org/html",
|
||||
"https://httpbin.org/links/5/0",
|
||||
],
|
||||
"browser_config": {
|
||||
"type": "BrowserConfig",
|
||||
"params": {"headless": True, "viewport": {"type": "dict", "value": {"width": 1200, "height": 800}}} # Viewport needs type:dict
|
||||
},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {"stream": True, "cache_mode": "bypass"}
|
||||
}
|
||||
}
|
||||
|
||||
headers = {}
|
||||
# if token:
|
||||
# headers = {"Authorization": f"Bearer {token}"} # If JWT is enabled
|
||||
|
||||
try:
|
||||
async with httpx.AsyncClient() as client:
|
||||
async with client.stream("POST", url, json=payload, headers=headers, timeout=120.0) as response:
|
||||
print(f"Status: {response.status_code} (Expected: 200)")
|
||||
response.raise_for_status() # Raise exception for bad status codes
|
||||
|
||||
# Read streaming response line-by-line (NDJSON)
|
||||
async for line in response.aiter_lines():
|
||||
if line:
|
||||
try:
|
||||
data = json.loads(line)
|
||||
# Check for completion marker
|
||||
if data.get("status") == "completed":
|
||||
print("Stream completed.")
|
||||
break
|
||||
print(f"Streamed Result: {json.dumps(data, indent=2)}")
|
||||
except json.JSONDecodeError:
|
||||
print(f"Warning: Could not decode JSON line: {line}")
|
||||
|
||||
except httpx.HTTPStatusError as e:
|
||||
print(f"HTTP error occurred: {e.response.status_code} - {e.response.text}")
|
||||
except Exception as e:
|
||||
print(f"Error in streaming crawl test: {str(e)}")
|
||||
|
||||
# To run this example:
|
||||
# import asyncio
|
||||
# asyncio.run(test_stream_crawl())
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Metrics & Monitoring
|
||||
|
||||
Keep an eye on your crawler with these endpoints:
|
||||
|
||||
- `/health` - Quick health check
|
||||
- `/metrics` - Detailed Prometheus metrics
|
||||
- `/schema` - Full API schema
|
||||
|
||||
Example health check:
|
||||
```bash
|
||||
curl http://localhost:11235/health
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
*(Deployment Scenarios and Complete Examples sections remain the same, maybe update links if examples moved)*
|
||||
|
||||
---
|
||||
|
||||
## Server Configuration
|
||||
|
||||
The server's behavior can be customized through the `config.yml` file.
|
||||
|
||||
### Understanding config.yml
|
||||
|
||||
The configuration file is loaded from `/app/config.yml` inside the container. By default, the file from `deploy/docker/config.yml` in the repository is copied there during the build.
|
||||
|
||||
Here's a detailed breakdown of the configuration options (using defaults from `deploy/docker/config.yml`):
|
||||
|
||||
```yaml
|
||||
# Application Configuration
|
||||
app:
|
||||
title: "Crawl4AI API"
|
||||
version: "1.0.0" # Consider setting this to match library version, e.g., "0.5.1"
|
||||
host: "0.0.0.0"
|
||||
port: 8020 # NOTE: This port is used ONLY when running server.py directly. Gunicorn overrides this (see supervisord.conf).
|
||||
reload: False # Default set to False - suitable for production
|
||||
timeout_keep_alive: 300
|
||||
|
||||
# Default LLM Configuration
|
||||
llm:
|
||||
provider: "openai/gpt-4o-mini"
|
||||
api_key_env: "OPENAI_API_KEY"
|
||||
# api_key: sk-... # If you pass the API key directly then api_key_env will be ignored
|
||||
|
||||
# Redis Configuration (Used by internal Redis server managed by supervisord)
|
||||
redis:
|
||||
host: "localhost"
|
||||
port: 6379
|
||||
db: 0
|
||||
password: ""
|
||||
# ... other redis options ...
|
||||
|
||||
# Rate Limiting Configuration
|
||||
rate_limiting:
|
||||
enabled: True
|
||||
default_limit: "1000/minute"
|
||||
trusted_proxies: []
|
||||
storage_uri: "memory://" # Use "redis://localhost:6379" if you need persistent/shared limits
|
||||
|
||||
# Security Configuration
|
||||
security:
|
||||
enabled: false # Master toggle for security features
|
||||
jwt_enabled: false # Enable JWT authentication (requires security.enabled=true)
|
||||
https_redirect: false # Force HTTPS (requires security.enabled=true)
|
||||
trusted_hosts: ["*"] # Allowed hosts (use specific domains in production)
|
||||
headers: # Security headers (applied if security.enabled=true)
|
||||
x_content_type_options: "nosniff"
|
||||
x_frame_options: "DENY"
|
||||
content_security_policy: "default-src 'self'"
|
||||
strict_transport_security: "max-age=63072000; includeSubDomains"
|
||||
|
||||
# Crawler Configuration
|
||||
crawler:
|
||||
memory_threshold_percent: 95.0
|
||||
rate_limiter:
|
||||
base_delay: [1.0, 2.0] # Min/max delay between requests in seconds for dispatcher
|
||||
timeouts:
|
||||
stream_init: 30.0 # Timeout for stream initialization
|
||||
batch_process: 300.0 # Timeout for non-streaming /crawl processing
|
||||
|
||||
# Logging Configuration
|
||||
logging:
|
||||
level: "INFO"
|
||||
format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
||||
|
||||
# Observability Configuration
|
||||
observability:
|
||||
prometheus:
|
||||
enabled: True
|
||||
endpoint: "/metrics"
|
||||
health_check:
|
||||
endpoint: "/health"
|
||||
```
|
||||
|
||||
*(JWT Authentication section remains the same, just note the default port is now 11235 for requests)*
|
||||
|
||||
*(Configuration Tips and Best Practices remain the same)*
|
||||
|
||||
### Customizing Your Configuration
|
||||
|
||||
You can override the default `config.yml`.
|
||||
|
||||
#### Method 1: Modify Before Build
|
||||
|
||||
1. Edit the `deploy/docker/config.yml` file in your local repository clone.
|
||||
2. Build the image using `docker buildx` or `docker compose --profile local-... up --build`. The modified file will be copied into the image.
|
||||
|
||||
#### Method 2: Runtime Mount (Recommended for Custom Deploys)
|
||||
|
||||
1. Create your custom configuration file, e.g., `my-custom-config.yml` locally. Ensure it contains all necessary sections.
|
||||
2. Mount it when running the container:
|
||||
|
||||
* **Using `docker run`:**
|
||||
```bash
|
||||
# Assumes my-custom-config.yml is in the current directory
|
||||
docker run -d -p 11235:11235 \
|
||||
--name crawl4ai-custom-config \
|
||||
--env-file .llm.env \
|
||||
--shm-size=1g \
|
||||
-v $(pwd)/my-custom-config.yml:/app/config.yml \
|
||||
unclecode/crawl4ai:latest # Or your specific tag
|
||||
```
|
||||
|
||||
* **Using `docker-compose.yml`:** Add a `volumes` section to the service definition:
|
||||
```yaml
|
||||
services:
|
||||
crawl4ai-hub-amd64: # Or your chosen service
|
||||
image: unclecode/crawl4ai:latest
|
||||
profiles: ["hub-amd64"]
|
||||
<<: *base-config
|
||||
volumes:
|
||||
# Mount local custom config over the default one in the container
|
||||
- ./my-custom-config.yml:/app/config.yml
|
||||
# Keep the shared memory volume from base-config
|
||||
- /dev/shm:/dev/shm
|
||||
```
|
||||
*(Note: Ensure `my-custom-config.yml` is in the same directory as `docker-compose.yml`)*
|
||||
|
||||
> 💡 When mounting, your custom file *completely replaces* the default one. Ensure it's a valid and complete configuration.
|
||||
|
||||
### Configuration Recommendations
|
||||
|
||||
1. **Security First** 🔒
|
||||
- Always enable security in production
|
||||
- Use specific trusted_hosts instead of wildcards
|
||||
- Set up proper rate limiting to protect your server
|
||||
- Consider your environment before enabling HTTPS redirect
|
||||
|
||||
2. **Resource Management** 💻
|
||||
- Adjust memory_threshold_percent based on available RAM
|
||||
- Set timeouts according to your content size and network conditions
|
||||
- Use Redis for rate limiting in multi-container setups
|
||||
|
||||
3. **Monitoring** 📊
|
||||
- Enable Prometheus if you need metrics
|
||||
- Set DEBUG logging in development, INFO in production
|
||||
- Regular health check monitoring is crucial
|
||||
|
||||
4. **Performance Tuning** ⚡
|
||||
- Start with conservative rate limiter delays
|
||||
- Increase batch_process timeout for large content
|
||||
- Adjust stream_init timeout based on initial response times
|
||||
|
||||
## Getting Help
|
||||
|
||||
We're here to help you succeed with Crawl4AI! Here's how to get support:
|
||||
|
||||
- 📖 Check our [full documentation](https://docs.crawl4ai.com)
|
||||
- 🐛 Found a bug? [Open an issue](https://github.com/unclecode/crawl4ai/issues)
|
||||
- 💬 Join our [Discord community](https://discord.gg/crawl4ai)
|
||||
- ⭐ Star us on GitHub to show support!
|
||||
|
||||
## Summary
|
||||
|
||||
In this guide, we've covered everything you need to get started with Crawl4AI's Docker deployment:
|
||||
- Building and running the Docker container
|
||||
- Configuring the environment
|
||||
- Making API requests with proper typing
|
||||
- Using the Python SDK
|
||||
- Monitoring your deployment
|
||||
|
||||
Remember, the examples in the `examples` folder are your friends - they show real-world usage patterns that you can adapt for your needs.
|
||||
|
||||
Keep exploring, and don't hesitate to reach out if you need help! We're building something amazing together. 🚀
|
||||
|
||||
Happy crawling! 🕷️
|
||||
@@ -391,21 +391,25 @@ async def handle_crawl_request(
|
||||
)
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
results = []
|
||||
func = getattr(crawler, "arun" if len(urls) == 1 else "arun_many")
|
||||
partial_func = partial(func,
|
||||
urls[0] if len(urls) == 1 else urls,
|
||||
config=crawler_config,
|
||||
dispatcher=dispatcher)
|
||||
results = await partial_func()
|
||||
return {
|
||||
"success": True,
|
||||
"results": [result.model_dump() for result in results]
|
||||
}
|
||||
crawler: AsyncWebCrawler = AsyncWebCrawler(config=browser_config)
|
||||
await crawler.start()
|
||||
results = []
|
||||
func = getattr(crawler, "arun" if len(urls) == 1 else "arun_many")
|
||||
partial_func = partial(func,
|
||||
urls[0] if len(urls) == 1 else urls,
|
||||
config=crawler_config,
|
||||
dispatcher=dispatcher)
|
||||
results = await partial_func()
|
||||
await crawler.close()
|
||||
return {
|
||||
"success": True,
|
||||
"results": [result.model_dump() for result in results]
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Crawl error: {str(e)}", exc_info=True)
|
||||
if 'crawler' in locals():
|
||||
await crawler.close()
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||
detail=str(e)
|
||||
|
||||
@@ -4,7 +4,7 @@ app:
|
||||
version: "1.0.0"
|
||||
host: "0.0.0.0"
|
||||
port: 8020
|
||||
reload: True
|
||||
reload: False
|
||||
timeout_keep_alive: 300
|
||||
|
||||
# Default LLM Configuration
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
crawl4ai
|
||||
fastapi
|
||||
uvicorn
|
||||
gunicorn>=23.0.0
|
||||
|
||||
@@ -1,12 +1,28 @@
|
||||
[supervisord]
|
||||
nodaemon=true
|
||||
nodaemon=true ; Run supervisord in the foreground
|
||||
logfile=/dev/null ; Log supervisord output to stdout/stderr
|
||||
logfile_maxbytes=0
|
||||
|
||||
[program:redis]
|
||||
command=redis-server
|
||||
command=/usr/bin/redis-server --loglevel notice ; Path to redis-server on Alpine
|
||||
user=appuser ; Run redis as our non-root user
|
||||
autorestart=true
|
||||
priority=10
|
||||
stdout_logfile=/dev/stdout ; Redirect redis stdout to container stdout
|
||||
stdout_logfile_maxbytes=0
|
||||
stderr_logfile=/dev/stderr ; Redirect redis stderr to container stderr
|
||||
stderr_logfile_maxbytes=0
|
||||
|
||||
[program:gunicorn]
|
||||
command=gunicorn --bind 0.0.0.0:8000 --workers 4 --threads 2 --timeout 300 --graceful-timeout 60 --keep-alive 65 --log-level debug --worker-class uvicorn.workers.UvicornWorker --max-requests 1000 --max-requests-jitter 50 server:app
|
||||
command=/usr/local/bin/gunicorn --bind 0.0.0.0:11235 --workers 2 --threads 2 --timeout 120 --graceful-timeout 30 --keep-alive 60 --log-level info --worker-class uvicorn.workers.UvicornWorker server:app
|
||||
directory=/app ; Working directory for the app
|
||||
user=appuser ; Run gunicorn as our non-root user
|
||||
autorestart=true
|
||||
priority=20
|
||||
priority=20
|
||||
environment=PYTHONUNBUFFERED=1 ; Ensure Python output is sent straight to logs
|
||||
stdout_logfile=/dev/stdout ; Redirect gunicorn stdout to container stdout
|
||||
stdout_logfile_maxbytes=0
|
||||
stderr_logfile=/dev/stderr ; Redirect gunicorn stderr to container stderr
|
||||
stderr_logfile_maxbytes=0
|
||||
|
||||
# Optional: Add filebeat or other logging agents here if needed
|
||||
@@ -1,15 +1,30 @@
|
||||
# Base configuration (not a service, just a reusable config block)
|
||||
# docker-compose.yml
|
||||
|
||||
# Base configuration anchor for reusability
|
||||
x-base-config: &base-config
|
||||
ports:
|
||||
# Map host port 11235 to container port 11235 (where Gunicorn will listen)
|
||||
- "11235:11235"
|
||||
- "8000:8000"
|
||||
- "9222:9222"
|
||||
- "8080:8080"
|
||||
# - "8080:8080" # Uncomment if needed
|
||||
|
||||
# Load API keys primarily from .llm.env file
|
||||
# Create .llm.env in the root directory .llm.env.example
|
||||
env_file:
|
||||
- .llm.env
|
||||
|
||||
# Define environment variables, allowing overrides from host environment
|
||||
# Syntax ${VAR:-} uses host env var 'VAR' if set, otherwise uses value from .llm.env
|
||||
environment:
|
||||
- CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-}
|
||||
- OPENAI_API_KEY=${OPENAI_API_KEY:-}
|
||||
- CLAUDE_API_KEY=${CLAUDE_API_KEY:-}
|
||||
- DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-}
|
||||
- ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
|
||||
- GROQ_API_KEY=${GROQ_API_KEY:-}
|
||||
- TOGETHER_API_KEY=${TOGETHER_API_KEY:-}
|
||||
- MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
|
||||
- GEMINI_API_TOKEN=${GEMINI_API_TOKEN:-}
|
||||
|
||||
volumes:
|
||||
# Mount /dev/shm for Chromium/Playwright performance
|
||||
- /dev/shm:/dev/shm
|
||||
deploy:
|
||||
resources:
|
||||
@@ -19,47 +34,47 @@ x-base-config: &base-config
|
||||
memory: 1G
|
||||
restart: unless-stopped
|
||||
healthcheck:
|
||||
# IMPORTANT: Ensure Gunicorn binds to 11235 in supervisord.conf
|
||||
test: ["CMD", "curl", "-f", "http://localhost:11235/health"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 40s
|
||||
start_period: 40s # Give the server time to start
|
||||
# Run the container as the non-root user defined in the Dockerfile
|
||||
user: "appuser"
|
||||
|
||||
services:
|
||||
# Local build services for different platforms
|
||||
crawl4ai-amd64:
|
||||
# --- Local Build Services ---
|
||||
crawl4ai-local-amd64:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: Dockerfile
|
||||
context: . # Build context is the root directory
|
||||
dockerfile: Dockerfile # Dockerfile is in the root directory
|
||||
args:
|
||||
PYTHON_VERSION: "3.10"
|
||||
INSTALL_TYPE: ${INSTALL_TYPE:-basic}
|
||||
ENABLE_GPU: false
|
||||
platforms:
|
||||
- linux/amd64
|
||||
INSTALL_TYPE: ${INSTALL_TYPE:-default}
|
||||
ENABLE_GPU: ${ENABLE_GPU:-false}
|
||||
# PYTHON_VERSION arg is omitted as it's fixed by 'FROM python:3.10-slim' in Dockerfile
|
||||
platform: linux/amd64
|
||||
profiles: ["local-amd64"]
|
||||
<<: *base-config # extends yerine doğrudan yapılandırmayı dahil ettik
|
||||
<<: *base-config # Inherit base configuration
|
||||
|
||||
crawl4ai-arm64:
|
||||
crawl4ai-local-arm64:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: Dockerfile
|
||||
context: . # Build context is the root directory
|
||||
dockerfile: Dockerfile # Dockerfile is in the root directory
|
||||
args:
|
||||
PYTHON_VERSION: "3.10"
|
||||
INSTALL_TYPE: ${INSTALL_TYPE:-basic}
|
||||
ENABLE_GPU: false
|
||||
platforms:
|
||||
- linux/arm64
|
||||
INSTALL_TYPE: ${INSTALL_TYPE:-default}
|
||||
ENABLE_GPU: ${ENABLE_GPU:-false}
|
||||
platform: linux/arm64
|
||||
profiles: ["local-arm64"]
|
||||
<<: *base-config
|
||||
|
||||
# Hub services for different platforms and versions
|
||||
# --- Docker Hub Image Services ---
|
||||
crawl4ai-hub-amd64:
|
||||
image: unclecode/crawl4ai:${VERSION:-basic}-amd64
|
||||
image: unclecode/crawl4ai:${VERSION:-latest}-amd64
|
||||
profiles: ["hub-amd64"]
|
||||
<<: *base-config
|
||||
|
||||
crawl4ai-hub-arm64:
|
||||
image: unclecode/crawl4ai:${VERSION:-basic}-arm64
|
||||
image: unclecode/crawl4ai:${VERSION:-latest}-arm64
|
||||
profiles: ["hub-arm64"]
|
||||
<<: *base-config
|
||||
@@ -357,8 +357,7 @@ async def demo_performance_analysis():
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
config = CrawlerRunConfig(
|
||||
capture_network_requests=True,
|
||||
wait_until="networkidle",
|
||||
page_timeout=60000 # 60 seconds
|
||||
page_timeout=60 * 2 * 1000 # 120 seconds
|
||||
)
|
||||
|
||||
result = await crawler.arun(
|
||||
@@ -406,6 +405,13 @@ async def demo_performance_analysis():
|
||||
"url": url,
|
||||
"duration_ms": duration
|
||||
})
|
||||
if isinstance(timing, dict) and "requestStart" in timing and "responseStart" in timing and "startTime" in timing:
|
||||
# Convert to milliseconds
|
||||
duration = (timing["responseStart"] - timing["requestStart"]) * 1000
|
||||
resource_timings[resource_type].append({
|
||||
"url": url,
|
||||
"duration_ms": duration
|
||||
})
|
||||
|
||||
# Calculate statistics for each resource type
|
||||
print("\nPerformance by resource type:")
|
||||
@@ -455,14 +461,14 @@ async def main():
|
||||
os.makedirs(os.path.join(__cur_dir__, "tmp"), exist_ok=True)
|
||||
|
||||
# Run basic examples
|
||||
await demo_basic_network_capture()
|
||||
# await demo_basic_network_capture()
|
||||
await demo_basic_console_capture()
|
||||
await demo_combined_capture()
|
||||
# await demo_combined_capture()
|
||||
|
||||
# Run advanced examples
|
||||
await analyze_spa_network_traffic()
|
||||
await demo_security_analysis()
|
||||
await demo_performance_analysis()
|
||||
# await analyze_spa_network_traffic()
|
||||
# await demo_security_analysis()
|
||||
# await demo_performance_analysis()
|
||||
|
||||
print("\n=== Examples Complete ===")
|
||||
print(f"Check the tmp directory for output files: {os.path.join(__cur_dir__, 'tmp')}")
|
||||
|
||||
@@ -4,7 +4,7 @@ import json
|
||||
import base64
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
from crawl4ai.proxy_strategy import ProxyConfig
|
||||
from crawl4ai import ProxyConfig
|
||||
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, CrawlResult
|
||||
from crawl4ai import RoundRobinProxyStrategy
|
||||
|
||||
@@ -13,7 +13,7 @@ from crawl4ai.deep_crawling import (
|
||||
)
|
||||
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
|
||||
from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
|
||||
from crawl4ai.proxy_strategy import ProxyConfig
|
||||
from crawl4ai import ProxyConfig
|
||||
from crawl4ai import RoundRobinProxyStrategy
|
||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||
from crawl4ai import DefaultMarkdownGenerator
|
||||
|
||||
444
docs/md_v2/ask_ai/ask-ai.css
Normal file
444
docs/md_v2/ask_ai/ask-ai.css
Normal file
@@ -0,0 +1,444 @@
|
||||
/* ==== File: docs/ask_ai/ask_ai.css ==== */
|
||||
|
||||
/* --- Basic Reset & Font --- */
|
||||
body {
|
||||
/* Attempt to inherit variables from parent window (iframe context) */
|
||||
/* Fallback values if variables are not inherited */
|
||||
--fallback-bg: #070708;
|
||||
--fallback-font: #e8e9ed;
|
||||
--fallback-secondary: #a3abba;
|
||||
--fallback-primary: #50ffff;
|
||||
--fallback-primary-dimmed: #09b5a5;
|
||||
--fallback-border: #1d1d20;
|
||||
--fallback-code-bg: #1e1e1e;
|
||||
--fallback-invert-font: #222225;
|
||||
--font-stack: dm, Monaco, Courier New, monospace, serif;
|
||||
|
||||
font-family: var(--font-stack, "Courier New", monospace); /* Use theme font stack */
|
||||
background-color: var(--background-color, var(--fallback-bg));
|
||||
color: var(--font-color, var(--fallback-font));
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
font-size: 14px; /* Match global font size */
|
||||
line-height: 1.5em; /* Match global line height */
|
||||
height: 100vh; /* Ensure body takes full height */
|
||||
overflow: hidden; /* Prevent body scrollbars, panels handle scroll */
|
||||
display: flex; /* Use flex for the main container */
|
||||
}
|
||||
|
||||
a {
|
||||
color: var(--secondary-color, var(--fallback-secondary));
|
||||
text-decoration: none;
|
||||
transition: color 0.2s;
|
||||
}
|
||||
a:hover {
|
||||
color: var(--primary-color, var(--fallback-primary));
|
||||
}
|
||||
|
||||
/* --- Main Container Layout --- */
|
||||
.ai-assistant-container {
|
||||
display: flex;
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
background-color: var(--background-color, var(--fallback-bg));
|
||||
}
|
||||
|
||||
/* --- Sidebar Styling --- */
|
||||
.sidebar {
|
||||
flex-shrink: 0; /* Prevent sidebars from shrinking */
|
||||
height: 100%;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
/* background-color: var(--code-bg-color, var(--fallback-code-bg)); */
|
||||
overflow-y: hidden; /* Header fixed, list scrolls */
|
||||
}
|
||||
|
||||
.left-sidebar {
|
||||
flex-basis: 240px; /* Width of history panel */
|
||||
border-right: 1px solid var(--progress-bar-background, var(--fallback-border));
|
||||
}
|
||||
|
||||
.right-sidebar {
|
||||
flex-basis: 280px; /* Width of citations panel */
|
||||
border-left: 1px solid var(--progress-bar-background, var(--fallback-border));
|
||||
}
|
||||
|
||||
.sidebar header {
|
||||
padding: 0.6em 1em;
|
||||
border-bottom: 1px solid var(--progress-bar-background, var(--fallback-border));
|
||||
flex-shrink: 0;
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
.sidebar header h3 {
|
||||
margin: 0;
|
||||
font-size: 1.1em;
|
||||
color: var(--font-color, var(--fallback-font));
|
||||
}
|
||||
|
||||
.sidebar ul {
|
||||
list-style: none;
|
||||
padding: 0;
|
||||
margin: 0;
|
||||
overflow-y: auto; /* Enable scrolling for the list */
|
||||
flex-grow: 1; /* Allow list to take remaining space */
|
||||
padding: 0.5em 0;
|
||||
}
|
||||
|
||||
.sidebar ul li {
|
||||
padding: 0.3em 1em;
|
||||
}
|
||||
.sidebar ul li.no-citations,
|
||||
.sidebar ul li.no-history {
|
||||
color: var(--secondary-color, var(--fallback-secondary));
|
||||
font-style: italic;
|
||||
font-size: 0.9em;
|
||||
padding-left: 1em;
|
||||
}
|
||||
|
||||
.sidebar ul li a {
|
||||
color: var(--secondary-color, var(--fallback-secondary));
|
||||
text-decoration: none;
|
||||
display: block;
|
||||
padding: 0.2em 0.5em;
|
||||
border-radius: 3px;
|
||||
transition: background-color 0.2s, color 0.2s;
|
||||
}
|
||||
|
||||
.sidebar ul li a:hover {
|
||||
color: var(--primary-color, var(--fallback-primary));
|
||||
background-color: rgba(80, 255, 255, 0.08); /* Use primary color with alpha */
|
||||
}
|
||||
/* Style for active history item */
|
||||
#history-list li.active a {
|
||||
color: var(--primary-dimmed-color, var(--fallback-primary-dimmed));
|
||||
font-weight: bold;
|
||||
background-color: rgba(80, 255, 255, 0.12);
|
||||
}
|
||||
|
||||
/* --- Chat Panel Styling --- */
|
||||
#chat-panel {
|
||||
flex-grow: 1; /* Take remaining space */
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
height: 100%;
|
||||
overflow: hidden; /* Prevent overflow, internal elements handle scroll */
|
||||
}
|
||||
|
||||
#chat-messages {
|
||||
flex-grow: 1;
|
||||
overflow-y: auto; /* Scrollable chat history */
|
||||
padding: 1em 1.5em;
|
||||
border-bottom: 1px solid var(--progress-bar-background, var(--fallback-border));
|
||||
}
|
||||
|
||||
.message {
|
||||
margin-bottom: 1em;
|
||||
padding: 0.8em 1.2em;
|
||||
border-radius: 8px;
|
||||
max-width: 90%; /* Slightly wider */
|
||||
line-height: 1.6;
|
||||
/* Apply pre-wrap for better handling of spaces/newlines AND wrapping */
|
||||
white-space: pre-wrap;
|
||||
word-wrap: break-word; /* Ensure long words break */
|
||||
}
|
||||
|
||||
.user-message {
|
||||
background-color: var(--progress-bar-background, var(--fallback-border)); /* User message background */
|
||||
color: var(--font-color, var(--fallback-font));
|
||||
margin-left: auto; /* Align user messages to the right */
|
||||
text-align: left;
|
||||
}
|
||||
|
||||
.ai-message {
|
||||
background-color: var(--code-bg-color, var(--fallback-code-bg)); /* AI message background */
|
||||
color: var(--font-color, var(--fallback-font));
|
||||
margin-right: auto; /* Align AI messages to the left */
|
||||
border: 1px solid var(--progress-bar-background, var(--fallback-border));
|
||||
}
|
||||
.ai-message.welcome-message {
|
||||
border: none;
|
||||
background-color: transparent;
|
||||
max-width: 100%;
|
||||
text-align: center;
|
||||
color: var(--secondary-color, var(--fallback-secondary));
|
||||
white-space: normal;
|
||||
}
|
||||
|
||||
/* Styles for code within messages */
|
||||
.ai-message code {
|
||||
background-color: var(--invert-font-color, var(--fallback-invert-font)) !important; /* Use light bg for code */
|
||||
/* color: var(--background-color, var(--fallback-bg)) !important; Dark text */
|
||||
padding: 0.1em 0.4em;
|
||||
border-radius: 4px;
|
||||
font-size: 0.9em;
|
||||
}
|
||||
.ai-message pre {
|
||||
background-color: var(--invert-font-color, var(--fallback-invert-font)) !important;
|
||||
color: var(--background-color, var(--fallback-bg)) !important;
|
||||
padding: 1em;
|
||||
border-radius: 5px;
|
||||
overflow-x: auto;
|
||||
margin: 0.8em 0;
|
||||
white-space: pre;
|
||||
}
|
||||
.ai-message pre code {
|
||||
background-color: transparent !important;
|
||||
padding: 0;
|
||||
font-size: inherit;
|
||||
}
|
||||
|
||||
/* Override white-space for specific elements generated by Markdown */
|
||||
.ai-message p,
|
||||
.ai-message ul,
|
||||
.ai-message ol,
|
||||
.ai-message blockquote {
|
||||
white-space: normal; /* Allow standard wrapping for block elements */
|
||||
}
|
||||
|
||||
/* --- Markdown Element Styling within Messages --- */
|
||||
.message p {
|
||||
margin-top: 0;
|
||||
margin-bottom: 0.5em;
|
||||
}
|
||||
.message p:last-child {
|
||||
margin-bottom: 0;
|
||||
}
|
||||
.message ul,
|
||||
.message ol {
|
||||
margin: 0.5em 0 0.5em 1.5em;
|
||||
padding: 0;
|
||||
}
|
||||
.message li {
|
||||
margin-bottom: 0.2em;
|
||||
}
|
||||
|
||||
/* Code block styling (adjusts previous rules slightly) */
|
||||
.message code {
|
||||
/* Inline code */
|
||||
background-color: var(--invert-font-color, var(--fallback-invert-font)) !important;
|
||||
color: var(--font-color);
|
||||
padding: 0.1em 0.4em;
|
||||
border-radius: 4px;
|
||||
font-size: 0.9em;
|
||||
/* Ensure inline code breaks nicely */
|
||||
word-break: break-all;
|
||||
white-space: normal; /* Allow inline code to wrap if needed */
|
||||
}
|
||||
.message pre {
|
||||
/* Code block container */
|
||||
background-color: var(--invert-font-color, var(--fallback-invert-font)) !important;
|
||||
color: var(--background-color, var(--fallback-bg)) !important;
|
||||
padding: 1em;
|
||||
border-radius: 5px;
|
||||
overflow-x: auto;
|
||||
margin: 0.8em 0;
|
||||
font-size: 0.9em; /* Slightly smaller code blocks */
|
||||
}
|
||||
.message pre code {
|
||||
/* Code within code block */
|
||||
background-color: transparent !important;
|
||||
padding: 0;
|
||||
font-size: inherit;
|
||||
word-break: normal; /* Don't break words in code blocks */
|
||||
white-space: pre; /* Preserve whitespace strictly in code blocks */
|
||||
}
|
||||
|
||||
/* Thinking indicator */
|
||||
.message-thinking {
|
||||
display: inline-block;
|
||||
width: 5px;
|
||||
height: 5px;
|
||||
background-color: var(--primary-color, var(--fallback-primary));
|
||||
border-radius: 50%;
|
||||
margin-left: 8px;
|
||||
vertical-align: middle;
|
||||
animation: thinking 1s infinite ease-in-out;
|
||||
}
|
||||
@keyframes thinking {
|
||||
0%,
|
||||
100% {
|
||||
opacity: 0.5;
|
||||
transform: scale(0.8);
|
||||
}
|
||||
50% {
|
||||
opacity: 1;
|
||||
transform: scale(1.2);
|
||||
}
|
||||
}
|
||||
|
||||
/* --- Thinking Indicator (Blinking Cursor Style) --- */
|
||||
.thinking-indicator-cursor {
|
||||
display: inline-block;
|
||||
width: 10px; /* Width of the cursor */
|
||||
height: 1.1em; /* Match line height */
|
||||
background-color: var(--primary-color, var(--fallback-primary));
|
||||
margin-left: 5px;
|
||||
vertical-align: text-bottom; /* Align with text baseline */
|
||||
animation: blink-cursor 1s step-end infinite;
|
||||
}
|
||||
|
||||
@keyframes blink-cursor {
|
||||
from,
|
||||
to {
|
||||
background-color: transparent;
|
||||
}
|
||||
50% {
|
||||
background-color: var(--primary-color, var(--fallback-primary));
|
||||
}
|
||||
}
|
||||
|
||||
#chat-input-area {
|
||||
flex-shrink: 0; /* Prevent input area from shrinking */
|
||||
padding: 1em 1.5em;
|
||||
display: flex;
|
||||
align-items: flex-end; /* Align items to bottom */
|
||||
gap: 10px;
|
||||
background-color: var(--code-bg-color, var(--fallback-code-bg)); /* Match sidebars */
|
||||
}
|
||||
|
||||
#chat-input-area textarea {
|
||||
flex-grow: 1;
|
||||
padding: 0.8em 1em;
|
||||
border: 1px solid var(--progress-bar-background, var(--fallback-border));
|
||||
background-color: var(--background-color, var(--fallback-bg));
|
||||
color: var(--font-color, var(--fallback-font));
|
||||
border-radius: 5px;
|
||||
resize: none; /* Disable manual resize */
|
||||
font-family: inherit;
|
||||
font-size: 1em;
|
||||
line-height: 1.4;
|
||||
max-height: 150px; /* Limit excessive height */
|
||||
overflow-y: auto;
|
||||
/* rows: 2; */
|
||||
}
|
||||
|
||||
#chat-input-area button {
|
||||
/* Basic button styling - maybe inherit from main theme? */
|
||||
padding: 0.6em 1.2em;
|
||||
border: 1px solid var(--primary-dimmed-color, var(--fallback-primary-dimmed));
|
||||
background-color: var(--primary-dimmed-color, var(--fallback-primary-dimmed));
|
||||
color: var(--background-color, var(--fallback-bg));
|
||||
border-radius: 5px;
|
||||
cursor: pointer;
|
||||
font-size: 0.9em;
|
||||
transition: background-color 0.2s, border-color 0.2s;
|
||||
height: min-content; /* Align with bottom of textarea */
|
||||
}
|
||||
|
||||
#chat-input-area button:hover {
|
||||
background-color: var(--primary-color, var(--fallback-primary));
|
||||
border-color: var(--primary-color, var(--fallback-primary));
|
||||
}
|
||||
#chat-input-area button:disabled {
|
||||
opacity: 0.6;
|
||||
cursor: not-allowed;
|
||||
}
|
||||
|
||||
.loading-indicator {
|
||||
font-size: 0.9em;
|
||||
color: var(--secondary-color, var(--fallback-secondary));
|
||||
margin-right: 10px;
|
||||
align-self: center;
|
||||
}
|
||||
|
||||
/* --- Buttons --- */
|
||||
/* Inherit some button styles if possible */
|
||||
.btn.btn-sm {
|
||||
color: var(--font-color, var(--fallback-font));
|
||||
padding: 0.2em 0.5em;
|
||||
font-size: 0.8em;
|
||||
border: 1px solid var(--secondary-color, var(--fallback-secondary));
|
||||
background: none;
|
||||
border-radius: 3px;
|
||||
cursor: pointer;
|
||||
}
|
||||
.btn.btn-sm:hover {
|
||||
border-color: var(--font-color, var(--fallback-font));
|
||||
background-color: var(--progress-bar-background, var(--fallback-border));
|
||||
}
|
||||
|
||||
/* --- Basic Responsiveness --- */
|
||||
@media screen and (max-width: 900px) {
|
||||
.left-sidebar {
|
||||
flex-basis: 200px; /* Shrink history */
|
||||
}
|
||||
.right-sidebar {
|
||||
flex-basis: 240px; /* Shrink citations */
|
||||
}
|
||||
}
|
||||
|
||||
@media screen and (max-width: 768px) {
|
||||
/* Stack layout on mobile? Or hide sidebars? Hiding for now */
|
||||
.sidebar {
|
||||
display: none; /* Hide sidebars on small screens */
|
||||
}
|
||||
/* Could add toggle buttons later */
|
||||
}
|
||||
|
||||
|
||||
/* ==== File: docs/ask_ai/ask-ai.css (Updates V4 - Delete Button) ==== */
|
||||
|
||||
|
||||
.sidebar ul li {
|
||||
/* Use flexbox to align link and delete button */
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
padding: 0; /* Remove padding from li, add to link/button */
|
||||
margin: 0.1em 0; /* Small vertical margin */
|
||||
}
|
||||
|
||||
.sidebar ul li a {
|
||||
/* Link takes most space */
|
||||
flex-grow: 1;
|
||||
padding: 0.3em 0.5em 0.3em 1em; /* Adjust padding */
|
||||
/* Make ellipsis work for long titles */
|
||||
white-space: nowrap;
|
||||
overflow: hidden;
|
||||
text-overflow: ellipsis;
|
||||
/* Keep existing link styles */
|
||||
color: var(--secondary-color, var(--fallback-secondary));
|
||||
text-decoration: none;
|
||||
display: block;
|
||||
border-radius: 3px;
|
||||
transition: background-color 0.2s, color 0.2s;
|
||||
}
|
||||
.sidebar ul li a:hover {
|
||||
color: var(--primary-color, var(--fallback-primary));
|
||||
background-color: rgba(80, 255, 255, 0.08);
|
||||
}
|
||||
|
||||
/* Style for active history item's link */
|
||||
#history-list li.active a {
|
||||
color: var(--primary-dimmed-color, var(--fallback-primary-dimmed));
|
||||
font-weight: bold;
|
||||
background-color: rgba(80, 255, 255, 0.12);
|
||||
}
|
||||
|
||||
/* --- Delete Chat Button --- */
|
||||
.delete-chat-btn {
|
||||
flex-shrink: 0; /* Don't shrink */
|
||||
background: none;
|
||||
border: none;
|
||||
color: var(--secondary-color, var(--fallback-secondary));
|
||||
cursor: pointer;
|
||||
padding: 0.4em 0.8em; /* Padding around icon */
|
||||
font-size: 0.9em;
|
||||
opacity: 0.5; /* Dimmed by default */
|
||||
transition: opacity 0.2s, color 0.2s;
|
||||
margin-left: 5px; /* Space between link and button */
|
||||
border-radius: 3px;
|
||||
}
|
||||
|
||||
.sidebar ul li:hover .delete-chat-btn,
|
||||
.delete-chat-btn:hover {
|
||||
opacity: 1; /* Show fully on hover */
|
||||
color: var(--error-color, #ff3c74); /* Use error color on hover */
|
||||
}
|
||||
.delete-chat-btn:focus {
|
||||
outline: 1px dashed var(--error-color, #ff3c74); /* Accessibility */
|
||||
opacity: 1;
|
||||
}
|
||||
603
docs/md_v2/ask_ai/ask-ai.js
Normal file
603
docs/md_v2/ask_ai/ask-ai.js
Normal file
@@ -0,0 +1,603 @@
|
||||
// ==== File: docs/ask_ai/ask-ai.js (Marked, Streaming, History) ====
|
||||
|
||||
document.addEventListener("DOMContentLoaded", () => {
|
||||
console.log("AI Assistant JS V2 Loaded");
|
||||
|
||||
// --- DOM Element Selectors ---
|
||||
const historyList = document.getElementById("history-list");
|
||||
const newChatButton = document.getElementById("new-chat-button");
|
||||
const chatMessages = document.getElementById("chat-messages");
|
||||
const chatInput = document.getElementById("chat-input");
|
||||
const sendButton = document.getElementById("send-button");
|
||||
const citationsList = document.getElementById("citations-list");
|
||||
|
||||
// --- Constants ---
|
||||
const CHAT_INDEX_KEY = "aiAssistantChatIndex_v1";
|
||||
const CHAT_PREFIX = "aiAssistantChat_v1_";
|
||||
|
||||
// --- State ---
|
||||
let currentChatId = null;
|
||||
let conversationHistory = []; // Holds message objects { sender: 'user'/'ai', text: '...' }
|
||||
let isThinking = false;
|
||||
let streamInterval = null; // To control the streaming interval
|
||||
|
||||
// --- Event Listeners ---
|
||||
sendButton.addEventListener("click", handleSendMessage);
|
||||
chatInput.addEventListener("keydown", handleInputKeydown);
|
||||
newChatButton.addEventListener("click", handleNewChat);
|
||||
chatInput.addEventListener("input", autoGrowTextarea);
|
||||
|
||||
// --- Initialization ---
|
||||
loadChatHistoryIndex(); // Load history list on startup
|
||||
const initialQuery = checkForInitialQuery(window.parent.location); // Check for query param
|
||||
if (!initialQuery) {
|
||||
loadInitialChat(); // Load normally if no query
|
||||
}
|
||||
|
||||
// --- Core Functions ---
|
||||
|
||||
function handleSendMessage() {
|
||||
const userMessageText = chatInput.value.trim();
|
||||
if (!userMessageText || isThinking) return;
|
||||
|
||||
setThinking(true); // Start thinking state
|
||||
|
||||
// Add user message to state and UI
|
||||
const userMessage = { sender: "user", text: userMessageText };
|
||||
conversationHistory.push(userMessage);
|
||||
addMessageToChat(userMessage, false); // Add user message without parsing markdown
|
||||
|
||||
chatInput.value = "";
|
||||
autoGrowTextarea(); // Reset textarea height
|
||||
|
||||
// Prepare for AI response (create empty div)
|
||||
const aiMessageDiv = addMessageToChat({ sender: "ai", text: "" }, true); // Add empty div with thinking indicator
|
||||
|
||||
// TODO: Generate fingerprint/JWT here
|
||||
|
||||
// TODO: Send `conversationHistory` + JWT to backend API
|
||||
// Replace placeholder below with actual API call
|
||||
// The backend should ideally return a stream of text tokens
|
||||
|
||||
// --- Placeholder Streaming Simulation ---
|
||||
const simulatedFullResponse = `Okay, Here’s a minimal Python script that creates an AsyncWebCrawler, fetches a webpage, and prints the first 300 characters of its Markdown output:
|
||||
|
||||
\`\`\`python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
|
||||
async def main():
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun("https://example.com")
|
||||
print(result.markdown[:300]) # Print first 300 chars
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
\`\`\`
|
||||
|
||||
A code snippet: \`crawler.run()\`. Check the [quickstart](/core/quickstart).`;
|
||||
|
||||
// Simulate receiving the response stream
|
||||
streamSimulatedResponse(aiMessageDiv, simulatedFullResponse);
|
||||
|
||||
// // Simulate receiving citations *after* stream starts (or with first chunk)
|
||||
// setTimeout(() => {
|
||||
// addCitations([
|
||||
// { title: "Simulated Doc 1", url: "#sim1" },
|
||||
// { title: "Another Concept", url: "#sim2" },
|
||||
// ]);
|
||||
// }, 500); // Citations appear shortly after thinking starts
|
||||
}
|
||||
|
||||
function handleInputKeydown(event) {
|
||||
if (event.key === "Enter" && !event.shiftKey) {
|
||||
event.preventDefault();
|
||||
handleSendMessage();
|
||||
}
|
||||
}
|
||||
|
||||
function addMessageToChat(message, addThinkingIndicator = false) {
|
||||
const messageDiv = document.createElement("div");
|
||||
messageDiv.classList.add("message", `${message.sender}-message`);
|
||||
|
||||
// Parse markdown and set HTML
|
||||
messageDiv.innerHTML = message.text ? marked.parse(message.text) : "";
|
||||
|
||||
if (message.sender === "ai") {
|
||||
// Apply Syntax Highlighting AFTER setting innerHTML
|
||||
messageDiv.querySelectorAll("pre code:not(.hljs)").forEach((block) => {
|
||||
if (typeof hljs !== "undefined") {
|
||||
// Check if already highlighted to prevent double-highlighting issues
|
||||
if (!block.classList.contains("hljs")) {
|
||||
hljs.highlightElement(block);
|
||||
}
|
||||
} else {
|
||||
console.warn("highlight.js (hljs) not found for syntax highlighting.");
|
||||
}
|
||||
});
|
||||
|
||||
// Add thinking indicator if needed (and not already present)
|
||||
if (addThinkingIndicator && !message.text && !messageDiv.querySelector(".thinking-indicator-cursor")) {
|
||||
const thinkingDiv = document.createElement("div");
|
||||
thinkingDiv.className = "thinking-indicator-cursor";
|
||||
messageDiv.appendChild(thinkingDiv);
|
||||
}
|
||||
} else {
|
||||
// User messages remain plain text
|
||||
// messageDiv.textContent = message.text;
|
||||
}
|
||||
|
||||
// wrap each pre in a div.terminal
|
||||
messageDiv.querySelectorAll("pre").forEach((block) => {
|
||||
const wrapper = document.createElement("div");
|
||||
wrapper.className = "terminal";
|
||||
block.parentNode.insertBefore(wrapper, block);
|
||||
wrapper.appendChild(block);
|
||||
});
|
||||
|
||||
chatMessages.appendChild(messageDiv);
|
||||
// Scroll only if user is near the bottom? (More advanced)
|
||||
// Simple scroll for now:
|
||||
scrollToBottom();
|
||||
return messageDiv; // Return the created element
|
||||
}
|
||||
|
||||
function streamSimulatedResponse(messageDiv, fullText) {
|
||||
const thinkingIndicator = messageDiv.querySelector(".thinking-indicator-cursor");
|
||||
if (thinkingIndicator) thinkingIndicator.remove();
|
||||
|
||||
const tokens = fullText.split(/(\s+)/);
|
||||
let currentText = "";
|
||||
let tokenIndex = 0;
|
||||
// Clear previous interval just in case
|
||||
if (streamInterval) clearInterval(streamInterval);
|
||||
|
||||
streamInterval = setInterval(() => {
|
||||
const cursorSpan = '<span class="thinking-indicator-cursor"></span>'; // Cursor for streaming
|
||||
if (tokenIndex < tokens.length) {
|
||||
currentText += tokens[tokenIndex];
|
||||
// Render intermediate markdown + cursor
|
||||
messageDiv.innerHTML = marked.parse(currentText + cursorSpan);
|
||||
// Re-highlight code blocks on each stream update - might be slightly inefficient
|
||||
// but ensures partial code blocks look okay. Highlight only final on completion.
|
||||
// messageDiv.querySelectorAll('pre code:not(.hljs)').forEach((block) => {
|
||||
// hljs.highlightElement(block);
|
||||
// });
|
||||
scrollToBottom(); // Keep scrolling as content streams
|
||||
tokenIndex++;
|
||||
} else {
|
||||
// Streaming finished
|
||||
clearInterval(streamInterval);
|
||||
streamInterval = null;
|
||||
|
||||
// Final render without cursor
|
||||
messageDiv.innerHTML = marked.parse(currentText);
|
||||
|
||||
// === Final Syntax Highlighting ===
|
||||
messageDiv.querySelectorAll("pre code:not(.hljs)").forEach((block) => {
|
||||
if (typeof hljs !== "undefined" && !block.classList.contains("hljs")) {
|
||||
hljs.highlightElement(block);
|
||||
}
|
||||
});
|
||||
|
||||
// === Extract Citations ===
|
||||
const citations = extractMarkdownLinks(currentText);
|
||||
|
||||
// Wrap each pre in a div.terminal
|
||||
messageDiv.querySelectorAll("pre").forEach((block) => {
|
||||
const wrapper = document.createElement("div");
|
||||
wrapper.className = "terminal";
|
||||
block.parentNode.insertBefore(wrapper, block);
|
||||
wrapper.appendChild(block);
|
||||
});
|
||||
|
||||
const aiMessage = { sender: "ai", text: currentText, citations: citations };
|
||||
conversationHistory.push(aiMessage);
|
||||
updateCitationsDisplay();
|
||||
saveCurrentChat();
|
||||
setThinking(false);
|
||||
}
|
||||
}, 50); // Adjust speed
|
||||
}
|
||||
|
||||
// === NEW Function to Extract Links ===
|
||||
function extractMarkdownLinks(markdownText) {
|
||||
const regex = /\[([^\]]+)\]\(([^)]+)\)/g; // [text](url)
|
||||
const citations = [];
|
||||
let match;
|
||||
while ((match = regex.exec(markdownText)) !== null) {
|
||||
// Avoid adding self-links from within the citations list if AI includes them
|
||||
if (!match[2].startsWith("#citation-")) {
|
||||
citations.push({
|
||||
title: match[1].trim(),
|
||||
url: match[2].trim(),
|
||||
});
|
||||
}
|
||||
}
|
||||
// Optional: Deduplicate links based on URL
|
||||
const uniqueCitations = citations.filter(
|
||||
(citation, index, self) => index === self.findIndex((c) => c.url === citation.url)
|
||||
);
|
||||
return uniqueCitations;
|
||||
}
|
||||
|
||||
// === REVISED Function to Display Citations ===
|
||||
function updateCitationsDisplay() {
|
||||
let lastCitations = null;
|
||||
// Find the most recent AI message with citations
|
||||
for (let i = conversationHistory.length - 1; i >= 0; i--) {
|
||||
if (
|
||||
conversationHistory[i].sender === "ai" &&
|
||||
conversationHistory[i].citations &&
|
||||
conversationHistory[i].citations.length > 0
|
||||
) {
|
||||
lastCitations = conversationHistory[i].citations;
|
||||
break; // Found the latest citations
|
||||
}
|
||||
}
|
||||
|
||||
citationsList.innerHTML = ""; // Clear previous
|
||||
if (!lastCitations) {
|
||||
citationsList.innerHTML = '<li class="no-citations">No citations available.</li>';
|
||||
return;
|
||||
}
|
||||
|
||||
lastCitations.forEach((citation, index) => {
|
||||
const li = document.createElement("li");
|
||||
const a = document.createElement("a");
|
||||
// Generate a unique ID for potential internal linking if needed
|
||||
// a.id = `citation-${index}`;
|
||||
a.href = citation.url || "#";
|
||||
a.textContent = citation.title;
|
||||
a.target = "_top"; // Open in main window
|
||||
li.appendChild(a);
|
||||
citationsList.appendChild(li);
|
||||
});
|
||||
}
|
||||
|
||||
function addCitations(citations) {
|
||||
citationsList.innerHTML = ""; // Clear
|
||||
if (!citations || citations.length === 0) {
|
||||
citationsList.innerHTML = '<li class="no-citations">No citations available.</li>';
|
||||
return;
|
||||
}
|
||||
citations.forEach((citation) => {
|
||||
const li = document.createElement("li");
|
||||
const a = document.createElement("a");
|
||||
a.href = citation.url || "#";
|
||||
a.textContent = citation.title;
|
||||
a.target = "_top"; // Open in main window
|
||||
li.appendChild(a);
|
||||
citationsList.appendChild(li);
|
||||
});
|
||||
}
|
||||
|
||||
function setThinking(thinking) {
|
||||
isThinking = thinking;
|
||||
sendButton.disabled = thinking;
|
||||
chatInput.disabled = thinking;
|
||||
chatInput.placeholder = thinking ? "AI is responding..." : "Ask about Crawl4AI...";
|
||||
// Stop any existing stream if we start thinking again (e.g., rapid resend)
|
||||
if (thinking && streamInterval) {
|
||||
clearInterval(streamInterval);
|
||||
streamInterval = null;
|
||||
}
|
||||
}
|
||||
|
||||
function autoGrowTextarea() {
|
||||
chatInput.style.height = "auto";
|
||||
chatInput.style.height = `${chatInput.scrollHeight}px`;
|
||||
}
|
||||
|
||||
function scrollToBottom() {
|
||||
chatMessages.scrollTop = chatMessages.scrollHeight;
|
||||
}
|
||||
|
||||
// --- Query Parameter Handling ---
|
||||
function checkForInitialQuery(locationToCheck) {
|
||||
// <-- Receive location object
|
||||
if (!locationToCheck) {
|
||||
console.warn("Ask AI: Could not access parent window location.");
|
||||
return false;
|
||||
}
|
||||
const urlParams = new URLSearchParams(locationToCheck.search); // <-- Use passed location's search string
|
||||
const encodedQuery = urlParams.get("qq"); // <-- Use 'qq'
|
||||
|
||||
if (encodedQuery) {
|
||||
console.log("Initial query found (qq):", encodedQuery);
|
||||
try {
|
||||
const decodedText = decodeURIComponent(escape(atob(encodedQuery)));
|
||||
console.log("Decoded query:", decodedText);
|
||||
|
||||
// Start new chat immediately
|
||||
handleNewChat(true);
|
||||
|
||||
// Delay setting input and sending message slightly
|
||||
setTimeout(() => {
|
||||
chatInput.value = decodedText;
|
||||
autoGrowTextarea();
|
||||
handleSendMessage();
|
||||
|
||||
// Clean the PARENT window's URL
|
||||
try {
|
||||
const cleanUrl = locationToCheck.pathname;
|
||||
// Use parent's history object
|
||||
window.parent.history.replaceState({}, window.parent.document.title, cleanUrl);
|
||||
} catch (e) {
|
||||
console.warn("Ask AI: Could not clean parent URL using replaceState.", e);
|
||||
// This might fail due to cross-origin restrictions if served differently,
|
||||
// but should work fine with mkdocs serve on the same origin.
|
||||
}
|
||||
}, 100);
|
||||
|
||||
return true; // Query processed
|
||||
} catch (e) {
|
||||
console.error("Error decoding initial query (qq):", e);
|
||||
// Clean the PARENT window's URL even on error
|
||||
try {
|
||||
const cleanUrl = locationToCheck.pathname;
|
||||
window.parent.history.replaceState({}, window.parent.document.title, cleanUrl);
|
||||
} catch (cleanError) {
|
||||
console.warn("Ask AI: Could not clean parent URL after decode error.", cleanError);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return false; // No 'qq' query found
|
||||
}
|
||||
|
||||
// --- History Management ---
|
||||
|
||||
function handleNewChat(isFromQuery = false) {
|
||||
if (isThinking) return; // Don't allow new chat while responding
|
||||
|
||||
// Only save if NOT triggered immediately by a query parameter load
|
||||
if (!isFromQuery) {
|
||||
saveCurrentChat();
|
||||
}
|
||||
|
||||
currentChatId = `chat_${Date.now()}`;
|
||||
conversationHistory = []; // Clear message history state
|
||||
chatMessages.innerHTML = ""; // Start with clean slate for query
|
||||
if (!isFromQuery) {
|
||||
// Show welcome only if manually started
|
||||
chatMessages.innerHTML =
|
||||
'<div class="message ai-message welcome-message">Started a new chat! Ask me anything about Crawl4AI.</div>';
|
||||
}
|
||||
addCitations([]); // Clear citations
|
||||
updateCitationsDisplay(); // Clear UI
|
||||
|
||||
// Add to index and save
|
||||
let index = loadChatIndex();
|
||||
// Generate a generic title initially, update later
|
||||
const newTitle = isFromQuery ? "Chat from Selection" : `Chat ${new Date().toLocaleString()}`;
|
||||
// index.unshift({ id: currentChatId, title: `Chat ${new Date().toLocaleString()}` }); // Add to start
|
||||
index.unshift({ id: currentChatId, title: newTitle });
|
||||
saveChatIndex(index);
|
||||
|
||||
renderHistoryList(index); // Update UI
|
||||
setActiveHistoryItem(currentChatId);
|
||||
saveCurrentChat(); // Save the empty new chat state
|
||||
}
|
||||
|
||||
function loadChat(chatId) {
|
||||
if (isThinking || chatId === currentChatId) return;
|
||||
|
||||
// Check if chat data actually exists before proceeding
|
||||
const storedChat = localStorage.getItem(CHAT_PREFIX + chatId);
|
||||
if (storedChat === null) {
|
||||
console.warn(`Attempted to load non-existent chat: ${chatId}. Removing from index.`);
|
||||
deleteChatData(chatId); // Clean up index
|
||||
loadChatHistoryIndex(); // Reload history list
|
||||
loadInitialChat(); // Load next available chat
|
||||
return;
|
||||
}
|
||||
|
||||
console.log(`Loading chat: ${chatId}`);
|
||||
saveCurrentChat(); // Save current before switching
|
||||
|
||||
try {
|
||||
conversationHistory = JSON.parse(storedChat);
|
||||
currentChatId = chatId;
|
||||
renderChatMessages(conversationHistory);
|
||||
updateCitationsDisplay();
|
||||
setActiveHistoryItem(chatId);
|
||||
} catch (e) {
|
||||
console.error("Error loading chat:", chatId, e);
|
||||
alert("Failed to load chat data.");
|
||||
conversationHistory = [];
|
||||
renderChatMessages(conversationHistory);
|
||||
updateCitationsDisplay();
|
||||
}
|
||||
}
|
||||
|
||||
function saveCurrentChat() {
|
||||
if (currentChatId && conversationHistory.length > 0) {
|
||||
try {
|
||||
localStorage.setItem(CHAT_PREFIX + currentChatId, JSON.stringify(conversationHistory));
|
||||
console.log(`Chat ${currentChatId} saved.`);
|
||||
|
||||
// Update title in index (e.g., use first user message)
|
||||
let index = loadChatIndex();
|
||||
const currentItem = index.find((item) => item.id === currentChatId);
|
||||
if (
|
||||
currentItem &&
|
||||
conversationHistory[0]?.sender === "user" &&
|
||||
!currentItem.title.startsWith("Chat about:")
|
||||
) {
|
||||
currentItem.title = `Chat about: ${conversationHistory[0].text.substring(0, 30)}...`;
|
||||
saveChatIndex(index);
|
||||
// Re-render history list if title changed - small optimization needed here maybe
|
||||
renderHistoryList(index);
|
||||
setActiveHistoryItem(currentChatId); // Re-set active after re-render
|
||||
}
|
||||
} catch (e) {
|
||||
console.error("Error saving chat:", currentChatId, e);
|
||||
// Handle potential storage full errors
|
||||
if (e.name === "QuotaExceededError") {
|
||||
alert("Local storage is full. Cannot save chat history.");
|
||||
// Consider implementing history pruning logic here
|
||||
}
|
||||
}
|
||||
} else if (currentChatId) {
|
||||
// Save empty state for newly created chats if needed, or remove?
|
||||
localStorage.setItem(CHAT_PREFIX + currentChatId, JSON.stringify([]));
|
||||
}
|
||||
}
|
||||
|
||||
function loadChatIndex() {
|
||||
try {
|
||||
const storedIndex = localStorage.getItem(CHAT_INDEX_KEY);
|
||||
return storedIndex ? JSON.parse(storedIndex) : [];
|
||||
} catch (e) {
|
||||
console.error("Error loading chat index:", e);
|
||||
return []; // Return empty array on error
|
||||
}
|
||||
}
|
||||
|
||||
function saveChatIndex(indexArray) {
|
||||
try {
|
||||
localStorage.setItem(CHAT_INDEX_KEY, JSON.stringify(indexArray));
|
||||
} catch (e) {
|
||||
console.error("Error saving chat index:", e);
|
||||
}
|
||||
}
|
||||
|
||||
function renderHistoryList(indexArray) {
|
||||
historyList.innerHTML = ""; // Clear existing
|
||||
if (!indexArray || indexArray.length === 0) {
|
||||
historyList.innerHTML = '<li class="no-history">No past chats found.</li>';
|
||||
return;
|
||||
}
|
||||
indexArray.forEach((item) => {
|
||||
const li = document.createElement("li");
|
||||
li.dataset.chatId = item.id; // Add ID to li for easier selection
|
||||
|
||||
const a = document.createElement("a");
|
||||
a.href = "#";
|
||||
a.dataset.chatId = item.id;
|
||||
a.textContent = item.title || `Chat ${item.id.split("_")[1] || item.id}`;
|
||||
a.title = a.textContent; // Tooltip for potentially long titles
|
||||
a.addEventListener("click", (e) => {
|
||||
e.preventDefault();
|
||||
loadChat(item.id);
|
||||
});
|
||||
|
||||
// === Add Delete Button ===
|
||||
const deleteBtn = document.createElement("button");
|
||||
deleteBtn.className = "delete-chat-btn";
|
||||
deleteBtn.innerHTML = "✕"; // Trash can emoji/icon (or use text/SVG/FontAwesome)
|
||||
deleteBtn.title = "Delete Chat";
|
||||
deleteBtn.dataset.chatId = item.id; // Store ID on button too
|
||||
deleteBtn.addEventListener("click", handleDeleteChat);
|
||||
|
||||
li.appendChild(a);
|
||||
li.appendChild(deleteBtn); // Append button to the list item
|
||||
historyList.appendChild(li);
|
||||
});
|
||||
}
|
||||
|
||||
function renderChatMessages(messages) {
|
||||
chatMessages.innerHTML = ""; // Clear existing messages
|
||||
messages.forEach((message) => {
|
||||
// Ensure highlighting is applied when loading from history
|
||||
addMessageToChat(message, false);
|
||||
});
|
||||
if (messages.length === 0) {
|
||||
chatMessages.innerHTML =
|
||||
'<div class="message ai-message welcome-message">Chat history loaded. Ask a question!</div>';
|
||||
}
|
||||
// Scroll to bottom after loading messages
|
||||
scrollToBottom();
|
||||
}
|
||||
|
||||
function setActiveHistoryItem(chatId) {
|
||||
document.querySelectorAll("#history-list li").forEach((li) => li.classList.remove("active"));
|
||||
// Select the LI element directly now
|
||||
const activeLi = document.querySelector(`#history-list li[data-chat-id="${chatId}"]`);
|
||||
if (activeLi) {
|
||||
activeLi.classList.add("active");
|
||||
}
|
||||
}
|
||||
|
||||
function loadInitialChat() {
|
||||
const index = loadChatIndex();
|
||||
if (index.length > 0) {
|
||||
loadChat(index[0].id);
|
||||
} else {
|
||||
// Check if handleNewChat wasn't already called by query handler
|
||||
if (!currentChatId) {
|
||||
handleNewChat();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function loadChatHistoryIndex() {
|
||||
const index = loadChatIndex();
|
||||
renderHistoryList(index);
|
||||
if (currentChatId) setActiveHistoryItem(currentChatId);
|
||||
}
|
||||
|
||||
// === NEW Function to Handle Delete Click ===
|
||||
function handleDeleteChat(event) {
|
||||
event.stopPropagation(); // Prevent triggering loadChat on the link behind it
|
||||
const button = event.currentTarget;
|
||||
const chatIdToDelete = button.dataset.chatId;
|
||||
|
||||
if (!chatIdToDelete) return;
|
||||
|
||||
// Confirmation dialog
|
||||
if (
|
||||
window.confirm(
|
||||
`Are you sure you want to delete this chat session?\n"${
|
||||
button.previousElementSibling?.textContent || "Chat " + chatIdToDelete
|
||||
}"`
|
||||
)
|
||||
) {
|
||||
console.log(`Deleting chat: ${chatIdToDelete}`);
|
||||
|
||||
// Perform deletion
|
||||
const updatedIndex = deleteChatData(chatIdToDelete);
|
||||
|
||||
// If the deleted chat was the currently active one, load another chat
|
||||
if (currentChatId === chatIdToDelete) {
|
||||
currentChatId = null; // Reset current ID
|
||||
conversationHistory = []; // Clear state
|
||||
if (updatedIndex.length > 0) {
|
||||
// Load the new top chat (most recent remaining)
|
||||
loadChat(updatedIndex[0].id);
|
||||
} else {
|
||||
// No chats left, start a new one
|
||||
handleNewChat();
|
||||
}
|
||||
} else {
|
||||
// If a different chat was deleted, just re-render the list
|
||||
renderHistoryList(updatedIndex);
|
||||
// Re-apply active state in case IDs shifted (though they shouldn't)
|
||||
setActiveHistoryItem(currentChatId);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// === NEW Function to Delete Chat Data ===
|
||||
function deleteChatData(chatId) {
|
||||
// Remove chat data
|
||||
localStorage.removeItem(CHAT_PREFIX + chatId);
|
||||
|
||||
// Update index
|
||||
let index = loadChatIndex();
|
||||
index = index.filter((item) => item.id !== chatId);
|
||||
saveChatIndex(index);
|
||||
|
||||
console.log(`Chat ${chatId} data and index entry removed.`);
|
||||
return index; // Return the updated index
|
||||
}
|
||||
|
||||
// --- Virtual Scrolling Placeholder ---
|
||||
// NOTE: Virtual scrolling is complex. For now, we do direct rendering.
|
||||
// If performance becomes an issue with very long chats/history,
|
||||
// investigate libraries like 'simple-virtual-scroll' or 'virtual-scroller'.
|
||||
// You would replace parts of `renderChatMessages` and `renderHistoryList`
|
||||
// to work with the chosen library's API (providing data and item renderers).
|
||||
console.warn("Virtual scrolling not implemented. Performance may degrade with very long chat histories.");
|
||||
});
|
||||
64
docs/md_v2/ask_ai/index.html
Normal file
64
docs/md_v2/ask_ai/index.html
Normal file
@@ -0,0 +1,64 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Crawl4AI Assistant</title>
|
||||
<!-- Link main styles first for variable access -->
|
||||
<link rel="stylesheet" href="../assets/layout.css">
|
||||
<link rel="stylesheet" href="../assets/styles.css">
|
||||
<!-- Link specific AI styles -->
|
||||
<link rel="stylesheet" href="../assets/highlight.css">
|
||||
<link rel="stylesheet" href="ask-ai.css">
|
||||
</head>
|
||||
<body>
|
||||
<div class="ai-assistant-container">
|
||||
|
||||
<!-- Left Sidebar: Conversation History -->
|
||||
<aside id="history-panel" class="sidebar left-sidebar">
|
||||
<header>
|
||||
<h3>History</h3>
|
||||
<button id="new-chat-button" class="btn btn-sm">New Chat</button>
|
||||
</header>
|
||||
<ul id="history-list">
|
||||
<!-- History items populated by JS -->
|
||||
</ul>
|
||||
</aside>
|
||||
|
||||
<!-- Main Area: Chat Interface -->
|
||||
<main id="chat-panel">
|
||||
<div id="chat-messages">
|
||||
<!-- Chat messages populated by JS -->
|
||||
<div class="message ai-message welcome-message">
|
||||
Welcome to the Crawl4AI Assistant! How can I help you today?
|
||||
</div>
|
||||
</div>
|
||||
<div id="chat-input-area">
|
||||
<!-- Loading indicator for general waiting (optional) -->
|
||||
<!-- <div class="loading-indicator" style="display: none;">Thinking...</div> -->
|
||||
<textarea id="chat-input" placeholder="Ask about Crawl4AI..." rows="2"></textarea>
|
||||
<button id="send-button">Send</button>
|
||||
</div>
|
||||
</main>
|
||||
|
||||
<!-- Right Sidebar: Citations / Context -->
|
||||
<aside id="citations-panel" class="sidebar right-sidebar">
|
||||
<header>
|
||||
<h3>Citations</h3>
|
||||
</header>
|
||||
<ul id="citations-list">
|
||||
<!-- Citations populated by JS -->
|
||||
<li class="no-citations">No citations for this response yet.</li>
|
||||
</ul>
|
||||
</aside>
|
||||
|
||||
</div>
|
||||
|
||||
<!-- Include Marked.js library -->
|
||||
<script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
|
||||
<script src="../assets/highlight.min.js"></script>
|
||||
|
||||
<!-- Your AI Assistant Logic -->
|
||||
<script src="ask-ai.js"></script>
|
||||
</body>
|
||||
</html>
|
||||
62
docs/md_v2/assets/copy_code.js
Normal file
62
docs/md_v2/assets/copy_code.js
Normal file
@@ -0,0 +1,62 @@
|
||||
// ==== File: docs/assets/copy_code.js ====
|
||||
|
||||
document.addEventListener('DOMContentLoaded', () => {
|
||||
// Target specifically code blocks within the main content area
|
||||
const codeBlocks = document.querySelectorAll('#terminal-mkdocs-main-content pre > code');
|
||||
|
||||
codeBlocks.forEach((codeElement) => {
|
||||
const preElement = codeElement.parentElement; // The <pre> tag
|
||||
|
||||
// Ensure the <pre> tag can contain a positioned button
|
||||
if (window.getComputedStyle(preElement).position === 'static') {
|
||||
preElement.style.position = 'relative';
|
||||
}
|
||||
|
||||
// Create the button
|
||||
const copyButton = document.createElement('button');
|
||||
copyButton.className = 'copy-code-button';
|
||||
copyButton.type = 'button';
|
||||
copyButton.setAttribute('aria-label', 'Copy code to clipboard');
|
||||
copyButton.title = 'Copy code to clipboard';
|
||||
copyButton.innerHTML = 'Copy'; // Or use an icon like an SVG or FontAwesome class
|
||||
|
||||
// Append the button to the <pre> element
|
||||
preElement.appendChild(copyButton);
|
||||
|
||||
// Add click event listener
|
||||
copyButton.addEventListener('click', () => {
|
||||
copyCodeToClipboard(codeElement, copyButton);
|
||||
});
|
||||
});
|
||||
|
||||
async function copyCodeToClipboard(codeElement, button) {
|
||||
// Use innerText to get the rendered text content, preserving line breaks
|
||||
const textToCopy = codeElement.innerText;
|
||||
|
||||
try {
|
||||
await navigator.clipboard.writeText(textToCopy);
|
||||
|
||||
// Visual feedback
|
||||
button.innerHTML = 'Copied!';
|
||||
button.classList.add('copied');
|
||||
button.disabled = true; // Temporarily disable
|
||||
|
||||
// Revert button state after a short delay
|
||||
setTimeout(() => {
|
||||
button.innerHTML = 'Copy';
|
||||
button.classList.remove('copied');
|
||||
button.disabled = false;
|
||||
}, 2000); // Show "Copied!" for 2 seconds
|
||||
|
||||
} catch (err) {
|
||||
console.error('Failed to copy code: ', err);
|
||||
// Optional: Provide error feedback on the button
|
||||
button.innerHTML = 'Error';
|
||||
setTimeout(() => {
|
||||
button.innerHTML = 'Copy';
|
||||
}, 2000);
|
||||
}
|
||||
}
|
||||
|
||||
console.log("Copy Code Button script loaded.");
|
||||
});
|
||||
39
docs/md_v2/assets/floating_ask_ai_button.js
Normal file
39
docs/md_v2/assets/floating_ask_ai_button.js
Normal file
@@ -0,0 +1,39 @@
|
||||
// ==== File: docs/assets/floating_ask_ai_button.js ====
|
||||
|
||||
document.addEventListener('DOMContentLoaded', () => {
|
||||
const askAiPagePath = '/core/ask-ai/'; // IMPORTANT: Adjust this path if needed!
|
||||
const currentPath = window.location.pathname;
|
||||
|
||||
// Determine the base URL for constructing the link correctly,
|
||||
// especially if deployed in a sub-directory.
|
||||
// This assumes a simple structure; adjust if needed.
|
||||
const baseUrl = window.location.origin + (currentPath.startsWith('/core/') ? '../..' : '');
|
||||
|
||||
|
||||
// Check if the current page IS the Ask AI page
|
||||
// Use includes() for flexibility (handles trailing slash or .html)
|
||||
if (currentPath.includes(askAiPagePath.replace(/\/$/, ''))) { // Remove trailing slash for includes check
|
||||
console.log("Floating Ask AI Button: Not adding button on the Ask AI page itself.");
|
||||
return; // Don't add the button on the target page
|
||||
}
|
||||
|
||||
// --- Create the button ---
|
||||
const fabLink = document.createElement('a');
|
||||
fabLink.className = 'floating-ask-ai-button';
|
||||
fabLink.href = askAiPagePath; // Construct the correct URL
|
||||
fabLink.title = 'Ask Crawl4AI Assistant';
|
||||
fabLink.setAttribute('aria-label', 'Ask Crawl4AI Assistant');
|
||||
|
||||
// Add content (using SVG icon for better visuals)
|
||||
fabLink.innerHTML = `
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" width="24" height="24" fill="currentColor">
|
||||
<path d="M20 2H4c-1.1 0-2 .9-2 2v12c0 1.1.9 2 2 2h14l4 4V4c0-1.1-.9-2-2-2zm-2 12H6v-2h12v2zm0-3H6V9h12v2zm0-3H6V6h12v2z"/>
|
||||
</svg>
|
||||
<span>Ask AI</span>
|
||||
`;
|
||||
|
||||
// Append to body
|
||||
document.body.appendChild(fabLink);
|
||||
|
||||
console.log("Floating Ask AI Button added.");
|
||||
});
|
||||
119
docs/md_v2/assets/github_stats.js
Normal file
119
docs/md_v2/assets/github_stats.js
Normal file
@@ -0,0 +1,119 @@
|
||||
// ==== File: assets/github_stats.js ====
|
||||
|
||||
document.addEventListener('DOMContentLoaded', async () => {
|
||||
// --- Configuration ---
|
||||
const targetHeaderSelector = '.terminal .container:first-child'; // Selector for your header container
|
||||
const insertBeforeSelector = '.terminal-nav'; // Selector for the element to insert the badge BEFORE (e.g., the main nav)
|
||||
// Or set to null to append at the end of the header.
|
||||
|
||||
// --- Find elements ---
|
||||
const headerContainer = document.querySelector(targetHeaderSelector);
|
||||
if (!headerContainer) {
|
||||
console.warn('GitHub Stats: Header container not found with selector:', targetHeaderSelector);
|
||||
return;
|
||||
}
|
||||
|
||||
const repoLinkElement = headerContainer.querySelector('a[href*="github.com/"]'); // Find the existing GitHub link
|
||||
let repoUrl = 'https://github.com/unclecode/crawl4ai';
|
||||
// if (repoLinkElement) {
|
||||
// repoUrl = repoLinkElement.href;
|
||||
// } else {
|
||||
// // Fallback: Try finding from config (requires template injection - harder)
|
||||
// // Or hardcode if necessary, but reading from the link is better.
|
||||
// console.warn('GitHub Stats: GitHub repo link not found in header.');
|
||||
// // Try to get repo_url from mkdocs config if available globally (less likely)
|
||||
// // repoUrl = window.mkdocs_config?.repo_url; // Requires setting this variable
|
||||
// // if (!repoUrl) return; // Exit if still no URL
|
||||
// return; // Exit for now if link isn't found
|
||||
// }
|
||||
|
||||
|
||||
// --- Extract Repo Owner/Name ---
|
||||
let owner = '';
|
||||
let repo = '';
|
||||
try {
|
||||
const url = new URL(repoUrl);
|
||||
const pathParts = url.pathname.split('/').filter(part => part.length > 0);
|
||||
if (pathParts.length >= 2) {
|
||||
owner = pathParts[0];
|
||||
repo = pathParts[1];
|
||||
}
|
||||
} catch (e) {
|
||||
console.error('GitHub Stats: Could not parse repository URL:', repoUrl, e);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!owner || !repo) {
|
||||
console.warn('GitHub Stats: Could not extract owner/repo from URL:', repoUrl);
|
||||
return;
|
||||
}
|
||||
|
||||
// --- Get Version (Attempt to extract from site title) ---
|
||||
let version = '';
|
||||
const siteTitleElement = headerContainer.querySelector('.terminal-title, .site-title'); // Adjust selector based on theme's title element
|
||||
// Example title: "Crawl4AI Documentation (v0.5.x)"
|
||||
if (siteTitleElement) {
|
||||
const match = siteTitleElement.textContent.match(/\((v?[^)]+)\)/); // Look for text in parentheses starting with 'v' (optional)
|
||||
if (match && match[1]) {
|
||||
version = match[1].trim();
|
||||
}
|
||||
}
|
||||
if (!version) {
|
||||
console.info('GitHub Stats: Could not extract version from title. You might need to adjust the selector or regex.');
|
||||
// You could fallback to config.extra.version if injected into JS
|
||||
// version = window.mkdocs_config?.extra?.version || 'N/A';
|
||||
}
|
||||
|
||||
|
||||
// --- Fetch GitHub API Data ---
|
||||
let stars = '...';
|
||||
let forks = '...';
|
||||
try {
|
||||
const apiUrl = `https://api.github.com/repos/${owner}/${repo}`;
|
||||
const response = await fetch(apiUrl);
|
||||
|
||||
if (response.ok) {
|
||||
const data = await response.json();
|
||||
// Format large numbers (optional)
|
||||
stars = data.stargazers_count > 1000 ? `${(data.stargazers_count / 1000).toFixed(1)}k` : data.stargazers_count;
|
||||
forks = data.forks_count > 1000 ? `${(data.forks_count / 1000).toFixed(1)}k` : data.forks_count;
|
||||
} else {
|
||||
console.warn(`GitHub Stats: API request failed with status ${response.status}. Rate limit exceeded?`);
|
||||
stars = 'N/A';
|
||||
forks = 'N/A';
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('GitHub Stats: Error fetching repository data:', error);
|
||||
stars = 'N/A';
|
||||
forks = 'N/A';
|
||||
}
|
||||
|
||||
// --- Create Badge HTML ---
|
||||
const badgeContainer = document.createElement('div');
|
||||
badgeContainer.className = 'github-stats-badge';
|
||||
|
||||
// Use innerHTML for simplicity, including potential icons (requires FontAwesome or similar)
|
||||
// Ensure your theme loads FontAwesome or add it yourself if you want icons.
|
||||
badgeContainer.innerHTML = `
|
||||
<a href="${repoUrl}" target="_blank" rel="noopener">
|
||||
<!-- Optional Icon (FontAwesome example) -->
|
||||
<!-- <i class="fab fa-github"></i> -->
|
||||
<span class="repo-name">${owner}/${repo}</span>
|
||||
${version ? `<span class="stat version"><i class="fas fa-tag"></i> ${version}</span>` : ''}
|
||||
<span class="stat stars"><i class="fas fa-star"></i> ${stars}</span>
|
||||
<span class="stat forks"><i class="fas fa-code-branch"></i> ${forks}</span>
|
||||
</a>
|
||||
`;
|
||||
|
||||
// --- Inject Badge into Header ---
|
||||
const insertBeforeElement = insertBeforeSelector ? headerContainer.querySelector(insertBeforeSelector) : null;
|
||||
if (insertBeforeElement) {
|
||||
// headerContainer.insertBefore(badgeContainer, insertBeforeElement);
|
||||
headerContainer.querySelector(insertBeforeSelector).appendChild(badgeContainer);
|
||||
} else {
|
||||
headerContainer.appendChild(badgeContainer);
|
||||
}
|
||||
|
||||
console.info('GitHub Stats: Badge added to header.');
|
||||
|
||||
});
|
||||
441
docs/md_v2/assets/layout.css
Normal file
441
docs/md_v2/assets/layout.css
Normal file
@@ -0,0 +1,441 @@
|
||||
/* ==== File: assets/layout.css (Non-Fluid Centered Layout) ==== */
|
||||
|
||||
:root {
|
||||
--header-height: 55px; /* Adjust if needed */
|
||||
--sidebar-width: 280px; /* Adjust if needed */
|
||||
--toc-width: 340px; /* As specified */
|
||||
--content-max-width: 90em; /* Max width for the centered content */
|
||||
--layout-transition-speed: 0.2s;
|
||||
--global-space: 10px;
|
||||
}
|
||||
|
||||
/* --- Basic Setup --- */
|
||||
html {
|
||||
scroll-behavior: smooth;
|
||||
scroll-padding-top: calc(var(--header-height) + 15px);
|
||||
box-sizing: border-box;
|
||||
}
|
||||
*, *:before, *:after {
|
||||
box-sizing: inherit;
|
||||
}
|
||||
|
||||
body {
|
||||
padding-top: 0;
|
||||
padding-bottom: 0;
|
||||
background-color: var(--background-color);
|
||||
color: var(--font-color);
|
||||
/* Prevents horizontal scrollbars during transitions */
|
||||
overflow-x: hidden;
|
||||
}
|
||||
|
||||
/* --- Fixed Header --- */
|
||||
/* Full width, fixed header */
|
||||
.terminal .container:first-child { /* Assuming this targets the header container */
|
||||
position: fixed;
|
||||
top: 0;
|
||||
left: 0;
|
||||
right: 0;
|
||||
height: var(--header-height);
|
||||
background-color: var(--background-color);
|
||||
z-index: 1000;
|
||||
border-bottom: 1px solid var(--progress-bar-background);
|
||||
max-width: none; /* Override any container max-width */
|
||||
padding: 0 calc(var(--global-space) * 2);
|
||||
}
|
||||
|
||||
/* --- Main Layout Container (Below Header) --- */
|
||||
/* This container just provides space for the fixed header */
|
||||
.container:has(.terminal-mkdocs-main-grid) {
|
||||
margin: 0 auto;
|
||||
padding: 0;
|
||||
padding-top: var(--header-height); /* Space for fixed header */
|
||||
}
|
||||
|
||||
/* --- Flex Container: Grid holding content and toc (CENTERED) --- */
|
||||
/* THIS is the main centered block */
|
||||
.terminal-mkdocs-main-grid {
|
||||
display: flex;
|
||||
align-items: flex-start;
|
||||
/* Enforce max-width and center */
|
||||
max-width: var(--content-max-width);
|
||||
margin-left: auto;
|
||||
margin-right: auto;
|
||||
position: relative;
|
||||
/* Apply side padding within the centered block */
|
||||
padding-left: calc(var(--global-space) * 2);
|
||||
padding-right: calc(var(--global-space) * 2);
|
||||
/* Add margin-left to clear the fixed sidebar */
|
||||
margin-left: var(--sidebar-width);
|
||||
}
|
||||
|
||||
/* --- 1. Fixed Left Sidebar (Viewport Relative) --- */
|
||||
#terminal-mkdocs-side-panel {
|
||||
position: fixed;
|
||||
top: var(--header-height);
|
||||
left: max(0px, calc((90vw - var(--content-max-width)) / 2));
|
||||
bottom: 0;
|
||||
width: var(--sidebar-width);
|
||||
background-color: var(--background-color);
|
||||
border-right: 1px solid var(--progress-bar-background);
|
||||
overflow-y: auto;
|
||||
z-index: 900;
|
||||
padding: 1em calc(var(--global-space) * 2);
|
||||
padding-bottom: 2em;
|
||||
/* transition: left var(--layout-transition-speed) ease-in-out; */
|
||||
}
|
||||
|
||||
/* --- 2. Main Content Area (Within Centered Grid) --- */
|
||||
#terminal-mkdocs-main-content {
|
||||
flex-grow: 1;
|
||||
flex-shrink: 1;
|
||||
min-width: 0; /* Flexbox shrink fix */
|
||||
|
||||
/* No left/right margins needed here - handled by parent grid */
|
||||
margin-left: 0;
|
||||
margin-right: 0;
|
||||
|
||||
/* Internal Padding */
|
||||
padding: 1.5em 2em;
|
||||
|
||||
position: relative;
|
||||
z-index: 1;
|
||||
}
|
||||
|
||||
/* --- 3. Right Table of Contents (Sticky, Within Centered Grid) --- */
|
||||
#toc-sidebar {
|
||||
flex-basis: var(--toc-width);
|
||||
flex-shrink: 0;
|
||||
width: var(--toc-width);
|
||||
|
||||
position: sticky; /* Sticks within the centered grid */
|
||||
top: var(--header-height);
|
||||
align-self: stretch;
|
||||
height: calc(100vh - var(--header-height));
|
||||
overflow-y: auto;
|
||||
|
||||
padding: 1.5em 1em;
|
||||
font-size: 0.85em;
|
||||
border-left: 1px solid var(--progress-bar-background);
|
||||
z-index: 800;
|
||||
/* display: none; /* JS handles */
|
||||
}
|
||||
|
||||
/* (ToC link styles remain the same) */
|
||||
#toc-sidebar h4 { margin-top: 0; margin-bottom: 1em; font-size: 1.1em; color: var(--secondary-color); padding-left: 0.8em; }
|
||||
#toc-sidebar ul { list-style: none; padding: 0; margin: 0; }
|
||||
#toc-sidebar ul li a { display: block; padding: 0.3em 0; color: var(--secondary-color); text-decoration: none; border-left: 3px solid transparent; padding-left: 0.8em; transition: all 0.1s ease-in-out; line-height: 1.4; word-break: break-word; }
|
||||
#toc-sidebar ul li.toc-level-3 a { padding-left: 1.8em; }
|
||||
#toc-sidebar ul li.toc-level-4 a { padding-left: 2.8em; }
|
||||
#toc-sidebar ul li a:hover { color: var(--font-color); background-color: rgba(255, 255, 255, 0.05); }
|
||||
#toc-sidebar ul li a.active { color: var(--primary-color); border-left-color: var(--primary-color); background-color: rgba(80, 255, 255, 0.08); }
|
||||
|
||||
|
||||
/* --- Footer Styling (Respects Centered Layout) --- */
|
||||
footer {
|
||||
background-color: var(--code-bg-color);
|
||||
color: var(--secondary-color);
|
||||
position: relative;
|
||||
z-index: 10;
|
||||
margin-top: 2em;
|
||||
|
||||
/* Apply margin-left to clear the fixed sidebar */
|
||||
margin-left: var(--sidebar-width);
|
||||
|
||||
/* Constrain width relative to the centered grid it follows */
|
||||
max-width: calc(var(--content-max-width) - var(--sidebar-width));
|
||||
margin-right: auto; /* Keep it left-aligned within the space next to sidebar */
|
||||
|
||||
/* Use padding consistent with the grid */
|
||||
padding: 2em calc(var(--global-space) * 2);
|
||||
}
|
||||
|
||||
/* Adjust footer grid if needed */
|
||||
.terminal-mkdocs-footer-grid {
|
||||
display: grid;
|
||||
grid-template-columns: 1fr auto;
|
||||
gap: 1em;
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
/* ==========================================================================
|
||||
RESPONSIVENESS (Adapting the Non-Fluid Layout)
|
||||
========================================================================== */
|
||||
|
||||
/* --- Medium screens: Hide ToC --- */
|
||||
@media screen and (max-width: 1200px) {
|
||||
#toc-sidebar {
|
||||
display: none;
|
||||
}
|
||||
|
||||
.terminal-mkdocs-main-grid {
|
||||
/* Grid adjusts automatically as ToC is removed */
|
||||
/* Ensure grid padding remains */
|
||||
padding-left: calc(var(--global-space) * 2);
|
||||
padding-right: calc(var(--global-space) * 2);
|
||||
}
|
||||
|
||||
#terminal-mkdocs-main-content {
|
||||
/* Content area naturally expands */
|
||||
}
|
||||
|
||||
footer {
|
||||
/* Footer still respects the left sidebar and overall max width */
|
||||
margin-left: var(--sidebar-width);
|
||||
max-width: calc(var(--content-max-width) - var(--sidebar-width));
|
||||
/* Padding remains consistent */
|
||||
padding-left: calc(var(--global-space) * 2);
|
||||
padding-right: calc(var(--global-space) * 2);
|
||||
}
|
||||
}
|
||||
|
||||
/* --- Small screens: Hide left sidebar, full width content & footer --- */
|
||||
@media screen and (max-width: 768px) {
|
||||
|
||||
#terminal-mkdocs-side-panel {
|
||||
left: calc(-1 * var(--sidebar-width));
|
||||
z-index: 1100;
|
||||
box-shadow: 2px 0 10px rgba(0,0,0,0.3);
|
||||
}
|
||||
#terminal-mkdocs-side-panel.sidebar-visible {
|
||||
left: 0;
|
||||
}
|
||||
|
||||
.terminal-mkdocs-main-grid {
|
||||
/* Grid now takes full width (minus body padding) */
|
||||
margin-left: 0; /* Override sidebar margin */
|
||||
margin-right: 0; /* Override auto margin */
|
||||
max-width: 100%; /* Allow full width */
|
||||
padding-left: var(--global-space); /* Reduce padding */
|
||||
padding-right: var(--global-space);
|
||||
}
|
||||
|
||||
#terminal-mkdocs-main-content {
|
||||
padding: 1.5em 1em; /* Adjust internal padding */
|
||||
}
|
||||
|
||||
footer {
|
||||
margin-left: 0; /* Full width footer */
|
||||
max-width: 100%; /* Allow full width */
|
||||
padding: 2em 1em; /* Adjust internal padding */
|
||||
}
|
||||
|
||||
.terminal-mkdocs-footer-grid {
|
||||
grid-template-columns: 1fr; /* Stack footer items */
|
||||
text-align: center;
|
||||
gap: 0.5em;
|
||||
}
|
||||
/* Remember JS for toggle button & overlay */
|
||||
}
|
||||
|
||||
|
||||
/* ==== GitHub Stats Badge Styling ==== */
|
||||
|
||||
.github-stats-badge {
|
||||
display: inline-block; /* Or flex if needed */
|
||||
margin-left: 2em; /* Adjust spacing */
|
||||
vertical-align: middle; /* Align with other header items */
|
||||
font-size: 0.9em; /* Slightly smaller font */
|
||||
}
|
||||
|
||||
.github-stats-badge a {
|
||||
color: var(--secondary-color); /* Use secondary color */
|
||||
text-decoration: none;
|
||||
display: flex; /* Use flex for alignment */
|
||||
align-items: center;
|
||||
gap: 0.8em; /* Space between items */
|
||||
padding: 0.2em 0.5em;
|
||||
border: 1px solid var(--progress-bar-background); /* Subtle border */
|
||||
border-radius: 4px;
|
||||
transition: color 0.2s, background-color 0.2s;
|
||||
}
|
||||
|
||||
.github-stats-badge a:hover {
|
||||
color: var(--font-color); /* Brighter color on hover */
|
||||
background-color: var(--progress-bar-background); /* Subtle background on hover */
|
||||
}
|
||||
|
||||
.github-stats-badge .repo-name {
|
||||
color: var(--font-color); /* Make repo name stand out slightly */
|
||||
font-weight: 500; /* Optional bolder weight */
|
||||
}
|
||||
|
||||
.github-stats-badge .stat {
|
||||
/* Styles for individual stats (version, stars, forks) */
|
||||
white-space: nowrap; /* Prevent wrapping */
|
||||
}
|
||||
|
||||
.github-stats-badge .stat i {
|
||||
/* Optional: Style for FontAwesome icons */
|
||||
margin-right: 0.3em;
|
||||
color: var(--secondary-dimmed-color); /* Dimmer color for icons */
|
||||
}
|
||||
|
||||
|
||||
/* Adjust positioning relative to search/nav if needed */
|
||||
/* Example: If search is floated right */
|
||||
/* .terminal-nav { float: left; } */
|
||||
/* .github-stats-badge { float: left; } */
|
||||
/* #mkdocs-search-query { float: right; } */
|
||||
|
||||
/* --- Responsive adjustments --- */
|
||||
@media screen and (max-width: 900px) { /* Example breakpoint */
|
||||
.github-stats-badge .repo-name {
|
||||
display: none; /* Hide full repo name on smaller screens */
|
||||
}
|
||||
.github-stats-badge {
|
||||
margin-left: 1em;
|
||||
}
|
||||
.github-stats-badge a {
|
||||
gap: 0.5em;
|
||||
}
|
||||
}
|
||||
@media screen and (max-width: 768px) {
|
||||
/* Further hide or simplify on mobile if needed */
|
||||
.github-stats-badge {
|
||||
display: none; /* Example: Hide completely on smallest screens */
|
||||
}
|
||||
}
|
||||
|
||||
/* --- Ask AI Selection Button --- */
|
||||
.ask-ai-selection-button {
|
||||
background-color: var(--primary-dimmed-color, #09b5a5);
|
||||
color: var(--background-color, #070708);
|
||||
border: none;
|
||||
padding: 4px 8px;
|
||||
font-size: 0.8em;
|
||||
border-radius: 4px;
|
||||
cursor: pointer;
|
||||
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.3);
|
||||
transition: background-color 0.2s ease;
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
.ask-ai-selection-button:hover {
|
||||
background-color: var(--primary-color, #50ffff);
|
||||
}
|
||||
|
||||
/* ==== File: docs/assets/layout.css (Additions) ==== */
|
||||
|
||||
/* ... (keep all existing layout CSS) ... */
|
||||
|
||||
/* --- Copy Code Button Styling --- */
|
||||
|
||||
/* Ensure the parent <pre> can contain the absolutely positioned button */
|
||||
#terminal-mkdocs-main-content pre {
|
||||
position: relative; /* Needed for absolute positioning of child */
|
||||
/* Add a little padding top/right to make space for the button */
|
||||
padding-top: 2.5em;
|
||||
padding-right: 1em; /* Ensure padding is sufficient */
|
||||
}
|
||||
|
||||
.copy-code-button {
|
||||
position: absolute;
|
||||
top: 0.5em; /* Adjust spacing from top */
|
||||
left: 0.5em; /* Adjust spacing from left */
|
||||
z-index: 1; /* Sit on top of code */
|
||||
|
||||
background-color: var(--progress-bar-background, #444); /* Use a background */
|
||||
color: var(--font-color, #eaeaea);
|
||||
border: 1px solid var(--secondary-color, #727578);
|
||||
padding: 3px 8px;
|
||||
font-size: 0.8em;
|
||||
font-family: var(--font-stack, monospace);
|
||||
border-radius: 4px;
|
||||
cursor: pointer;
|
||||
opacity: 0; /* Hidden by default */
|
||||
transition: opacity 0.2s ease-in-out, background-color 0.2s ease, color 0.2s ease;
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
/* Show button on hover of the <pre> container */
|
||||
#terminal-mkdocs-main-content pre:hover .copy-code-button {
|
||||
opacity: 0.8; /* Show partially */
|
||||
}
|
||||
|
||||
.copy-code-button:hover {
|
||||
opacity: 1; /* Fully visible on button hover */
|
||||
background-color: var(--secondary-color, #727578);
|
||||
}
|
||||
|
||||
.copy-code-button:focus {
|
||||
opacity: 1; /* Ensure visible when focused */
|
||||
outline: 1px dashed var(--primary-color);
|
||||
}
|
||||
|
||||
|
||||
/* Style for "Copied!" state */
|
||||
.copy-code-button.copied {
|
||||
background-color: var(--primary-dimmed-color, #09b5a5);
|
||||
color: var(--background-color, #070708);
|
||||
border-color: var(--primary-dimmed-color, #09b5a5);
|
||||
opacity: 1; /* Ensure visible */
|
||||
}
|
||||
.copy-code-button.copied:hover {
|
||||
background-color: var(--primary-dimmed-color, #09b5a5); /* Prevent hover change */
|
||||
}
|
||||
|
||||
/* ==== File: docs/assets/layout.css (Additions) ==== */
|
||||
|
||||
/* ... (keep all existing layout CSS) ... */
|
||||
|
||||
/* --- Floating Ask AI Button --- */
|
||||
.floating-ask-ai-button {
|
||||
position: fixed;
|
||||
bottom: 25px;
|
||||
right: 25px;
|
||||
z-index: 1050; /* Below modals, above most content */
|
||||
|
||||
background-color: var(--primary-dimmed-color, #09b5a5);
|
||||
color: var(--background-color, #070708);
|
||||
border: none;
|
||||
border-radius: 50%; /* Make it circular */
|
||||
width: 60px; /* Adjust size */
|
||||
height: 60px; /* Adjust size */
|
||||
padding: 10px; /* Adjust padding */
|
||||
box-shadow: 0 4px 10px rgba(0, 0, 0, 0.4);
|
||||
cursor: pointer;
|
||||
transition: background-color 0.2s ease, transform 0.2s ease;
|
||||
|
||||
display: flex;
|
||||
flex-direction: column; /* Stack icon and text */
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
text-decoration: none;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.floating-ask-ai-button svg {
|
||||
width: 24px; /* Control icon size */
|
||||
height: 24px;
|
||||
}
|
||||
|
||||
.floating-ask-ai-button span {
|
||||
font-size: 0.7em;
|
||||
margin-top: 2px; /* Space between icon and text */
|
||||
display: block; /* Ensure it takes space */
|
||||
line-height: 1;
|
||||
}
|
||||
|
||||
|
||||
.floating-ask-ai-button:hover {
|
||||
background-color: var(--primary-color, #50ffff);
|
||||
transform: scale(1.05); /* Slight grow effect */
|
||||
}
|
||||
|
||||
.floating-ask-ai-button:focus {
|
||||
outline: 2px solid var(--primary-color);
|
||||
outline-offset: 2px;
|
||||
}
|
||||
|
||||
/* Optional: Hide text on smaller screens if needed */
|
||||
@media screen and (max-width: 768px) {
|
||||
.floating-ask-ai-button span {
|
||||
/* display: none; */ /* Uncomment to hide text */
|
||||
}
|
||||
.floating-ask-ai-button {
|
||||
width: 55px;
|
||||
height: 55px;
|
||||
bottom: 20px;
|
||||
right: 20px;
|
||||
}
|
||||
}
|
||||
109
docs/md_v2/assets/selection_ask_ai.js
Normal file
109
docs/md_v2/assets/selection_ask_ai.js
Normal file
@@ -0,0 +1,109 @@
|
||||
// ==== File: docs/assets/selection_ask_ai.js ====
|
||||
|
||||
document.addEventListener('DOMContentLoaded', () => {
|
||||
let askAiButton = null;
|
||||
const askAiPageUrl = '/core/ask-ai/'; // Adjust if your Ask AI page path is different
|
||||
|
||||
function createAskAiButton() {
|
||||
const button = document.createElement('button');
|
||||
button.id = 'ask-ai-selection-btn';
|
||||
button.className = 'ask-ai-selection-button';
|
||||
button.textContent = 'Ask AI'; // Or use an icon
|
||||
button.style.display = 'none'; // Initially hidden
|
||||
button.style.position = 'absolute';
|
||||
button.style.zIndex = '1500'; // Ensure it's on top
|
||||
document.body.appendChild(button);
|
||||
|
||||
button.addEventListener('click', handleAskAiClick);
|
||||
return button;
|
||||
}
|
||||
|
||||
function getSafeSelectedText() {
|
||||
const selection = window.getSelection();
|
||||
if (!selection || selection.rangeCount === 0) {
|
||||
return null;
|
||||
}
|
||||
// Avoid selecting text within the button itself if it was somehow selected
|
||||
const container = selection.getRangeAt(0).commonAncestorContainer;
|
||||
if (askAiButton && askAiButton.contains(container)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const text = selection.toString().trim();
|
||||
return text.length > 0 ? text : null;
|
||||
}
|
||||
|
||||
function positionButton(event) {
|
||||
const selection = window.getSelection();
|
||||
if (!selection || selection.rangeCount === 0 || selection.isCollapsed) {
|
||||
hideButton();
|
||||
return;
|
||||
}
|
||||
|
||||
const range = selection.getRangeAt(0);
|
||||
const rect = range.getBoundingClientRect();
|
||||
|
||||
// Calculate position: top-right of the selection
|
||||
const scrollX = window.scrollX;
|
||||
const scrollY = window.scrollY;
|
||||
const buttonTop = rect.top + scrollY - askAiButton.offsetHeight - 5; // 5px above
|
||||
const buttonLeft = rect.right + scrollX + 5; // 5px to the right
|
||||
|
||||
askAiButton.style.top = `${buttonTop}px`;
|
||||
askAiButton.style.left = `${buttonLeft}px`;
|
||||
askAiButton.style.display = 'block'; // Show the button
|
||||
}
|
||||
|
||||
function hideButton() {
|
||||
if (askAiButton) {
|
||||
askAiButton.style.display = 'none';
|
||||
}
|
||||
}
|
||||
|
||||
function handleAskAiClick(event) {
|
||||
event.stopPropagation(); // Prevent mousedown from hiding button immediately
|
||||
const selectedText = getSafeSelectedText();
|
||||
if (selectedText) {
|
||||
console.log("Selected Text:", selectedText);
|
||||
// Base64 encode for URL safety (handles special chars, line breaks)
|
||||
// Use encodeURIComponent first for proper Unicode handling before btoa
|
||||
const encodedText = btoa(unescape(encodeURIComponent(selectedText)));
|
||||
const targetUrl = `${askAiPageUrl}?qq=${encodedText}`;
|
||||
console.log("Navigating to:", targetUrl);
|
||||
window.location.href = targetUrl; // Navigate to Ask AI page
|
||||
}
|
||||
hideButton(); // Hide after click
|
||||
}
|
||||
|
||||
// --- Event Listeners ---
|
||||
|
||||
// Show button on mouse up after selection
|
||||
document.addEventListener('mouseup', (event) => {
|
||||
// Slight delay to ensure selection is registered
|
||||
setTimeout(() => {
|
||||
const selectedText = getSafeSelectedText();
|
||||
if (selectedText) {
|
||||
if (!askAiButton) {
|
||||
askAiButton = createAskAiButton();
|
||||
}
|
||||
// Don't position if the click was ON the button itself
|
||||
if (event.target !== askAiButton) {
|
||||
positionButton(event);
|
||||
}
|
||||
} else {
|
||||
hideButton();
|
||||
}
|
||||
}, 10); // Small delay
|
||||
});
|
||||
|
||||
// Hide button on scroll or click elsewhere
|
||||
document.addEventListener('mousedown', (event) => {
|
||||
// Hide if clicking anywhere EXCEPT the button itself
|
||||
if (askAiButton && event.target !== askAiButton) {
|
||||
hideButton();
|
||||
}
|
||||
});
|
||||
document.addEventListener('scroll', hideButton, true); // Capture scroll events
|
||||
|
||||
console.log("Selection Ask AI script loaded.");
|
||||
});
|
||||
@@ -6,8 +6,8 @@
|
||||
}
|
||||
|
||||
:root {
|
||||
--global-font-size: 16px;
|
||||
--global-code-font-size: 16px;
|
||||
--global-font-size: 14px;
|
||||
--global-code-font-size: 13px;
|
||||
--global-line-height: 1.5em;
|
||||
--global-space: 10px;
|
||||
--font-stack: Menlo, Monaco, Lucida Console, Liberation Mono, DejaVu Sans Mono, Bitstream Vera Sans Mono,
|
||||
@@ -50,8 +50,17 @@
|
||||
--display-h1-decoration: none;
|
||||
|
||||
--display-h1-decoration: none;
|
||||
|
||||
--header-height: 65px; /* Adjust based on your actual header height */
|
||||
--sidebar-width: 280px; /* Adjust based on your desired sidebar width */
|
||||
--toc-width: 240px; /* Adjust based on your desired ToC width */
|
||||
--layout-transition-speed: 0.2s; /* For potential future animations */
|
||||
|
||||
--page-width : 100em; /* Adjust based on your design */
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* body {
|
||||
background-color: var(--background-color);
|
||||
color: var(--font-color);
|
||||
@@ -256,4 +265,6 @@ div.badges a {
|
||||
}
|
||||
div.badges a > img {
|
||||
width: auto;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
144
docs/md_v2/assets/toc.js
Normal file
144
docs/md_v2/assets/toc.js
Normal file
@@ -0,0 +1,144 @@
|
||||
// ==== File: assets/toc.js ====
|
||||
|
||||
document.addEventListener('DOMContentLoaded', () => {
|
||||
const mainContent = document.getElementById('terminal-mkdocs-main-content');
|
||||
const tocContainer = document.getElementById('toc-sidebar');
|
||||
const mainGrid = document.querySelector('.terminal-mkdocs-main-grid'); // Get the flex container
|
||||
|
||||
if (!mainContent) {
|
||||
console.warn("TOC Generator: Main content area '#terminal-mkdocs-main-content' not found.");
|
||||
return;
|
||||
}
|
||||
|
||||
// --- Create ToC container if it doesn't exist ---
|
||||
let tocElement = tocContainer;
|
||||
if (!tocElement) {
|
||||
if (!mainGrid) {
|
||||
console.warn("TOC Generator: Flex container '.terminal-mkdocs-main-grid' not found to append ToC.");
|
||||
return;
|
||||
}
|
||||
tocElement = document.createElement('aside');
|
||||
tocElement.id = 'toc-sidebar';
|
||||
tocElement.style.display = 'none'; // Keep hidden initially
|
||||
// Append it as the last child of the flex grid
|
||||
mainGrid.appendChild(tocElement);
|
||||
console.info("TOC Generator: Created '#toc-sidebar' element.");
|
||||
}
|
||||
|
||||
// --- Find Headings (h2, h3, h4 are common for ToC) ---
|
||||
const headings = mainContent.querySelectorAll('h2, h3, h4');
|
||||
if (headings.length === 0) {
|
||||
console.info("TOC Generator: No headings found on this page. ToC not generated.");
|
||||
tocElement.style.display = 'none'; // Ensure it's hidden
|
||||
return;
|
||||
}
|
||||
|
||||
// --- Generate ToC List ---
|
||||
const tocList = document.createElement('ul');
|
||||
const observerTargets = []; // Store headings for IntersectionObserver
|
||||
|
||||
headings.forEach((heading, index) => {
|
||||
// Ensure heading has an ID for linking
|
||||
if (!heading.id) {
|
||||
// Create a simple slug-like ID
|
||||
heading.id = `toc-heading-${index}-${heading.textContent.toLowerCase().replace(/\s+/g, '-').replace(/[^a-z0-9-]/g, '')}`;
|
||||
}
|
||||
|
||||
const listItem = document.createElement('li');
|
||||
const link = document.createElement('a');
|
||||
|
||||
link.href = `#${heading.id}`;
|
||||
link.textContent = heading.textContent;
|
||||
|
||||
// Add class for styling based on heading level
|
||||
const level = parseInt(heading.tagName.substring(1), 10); // Get 2, 3, or 4
|
||||
listItem.classList.add(`toc-level-${level}`);
|
||||
|
||||
listItem.appendChild(link);
|
||||
tocList.appendChild(listItem);
|
||||
observerTargets.push(heading); // Add to observer list
|
||||
});
|
||||
|
||||
// --- Populate and Show ToC ---
|
||||
// Optional: Add a title
|
||||
const tocTitle = document.createElement('h4');
|
||||
tocTitle.textContent = 'On this page'; // Customize title if needed
|
||||
|
||||
tocElement.innerHTML = ''; // Clear previous content if any
|
||||
tocElement.appendChild(tocTitle);
|
||||
tocElement.appendChild(tocList);
|
||||
tocElement.style.display = ''; // Show the ToC container
|
||||
|
||||
console.info(`TOC Generator: Generated ToC with ${headings.length} items.`);
|
||||
|
||||
// --- Scroll Spy using Intersection Observer ---
|
||||
const tocLinks = tocElement.querySelectorAll('a');
|
||||
let activeLink = null; // Keep track of the current active link
|
||||
|
||||
const observerOptions = {
|
||||
// Observe changes relative to the viewport, offset by the header height
|
||||
// Negative top margin pushes the intersection trigger point down
|
||||
// Negative bottom margin ensures elements low on the screen can trigger before they exit
|
||||
rootMargin: `-${getComputedStyle(document.documentElement).getPropertyValue('--header-height').trim()} 0px -60% 0px`,
|
||||
threshold: 0 // Trigger as soon as any part enters/exits the boundary
|
||||
};
|
||||
|
||||
const observerCallback = (entries) => {
|
||||
let topmostVisibleHeading = null;
|
||||
|
||||
entries.forEach(entry => {
|
||||
const link = tocElement.querySelector(`a[href="#${entry.target.id}"]`);
|
||||
if (!link) return;
|
||||
|
||||
// Check if the heading is intersecting (partially or fully visible within rootMargin)
|
||||
if (entry.isIntersecting) {
|
||||
// Among visible headings, find the one closest to the top edge (within the rootMargin)
|
||||
if (!topmostVisibleHeading || entry.boundingClientRect.top < topmostVisibleHeading.boundingClientRect.top) {
|
||||
topmostVisibleHeading = entry.target;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// If we found a topmost visible heading, activate its link
|
||||
if (topmostVisibleHeading) {
|
||||
const newActiveLink = tocElement.querySelector(`a[href="#${topmostVisibleHeading.id}"]`);
|
||||
if (newActiveLink && newActiveLink !== activeLink) {
|
||||
// Remove active class from previous link
|
||||
if (activeLink) {
|
||||
activeLink.classList.remove('active');
|
||||
activeLink.parentElement.classList.remove('active-parent'); // Optional parent styling
|
||||
}
|
||||
// Add active class to the new link
|
||||
newActiveLink.classList.add('active');
|
||||
newActiveLink.parentElement.classList.add('active-parent'); // Optional parent styling
|
||||
activeLink = newActiveLink;
|
||||
|
||||
// Optional: Scroll the ToC sidebar to keep the active link visible
|
||||
// newActiveLink.scrollIntoView({ behavior: 'smooth', block: 'nearest' });
|
||||
}
|
||||
}
|
||||
// If no headings are intersecting (scrolled past the last one?), maybe deactivate all
|
||||
// Or keep the last one active - depends on desired behavior. Current logic keeps last active.
|
||||
};
|
||||
|
||||
const observer = new IntersectionObserver(observerCallback, observerOptions);
|
||||
|
||||
// Observe all target headings
|
||||
observerTargets.forEach(heading => observer.observe(heading));
|
||||
|
||||
// Initial check in case a heading is already in view on load
|
||||
// (Requires slight delay for accurate layout calculation)
|
||||
setTimeout(() => {
|
||||
observerCallback(observer.takeRecords()); // Process initial state
|
||||
}, 100);
|
||||
|
||||
// move footer and the hr before footer to the end of the main content
|
||||
const footer = document.querySelector('footer');
|
||||
const hr = footer.previousElementSibling;
|
||||
if (hr && hr.tagName === 'HR') {
|
||||
mainContent.appendChild(hr);
|
||||
}
|
||||
mainContent.appendChild(footer);
|
||||
console.info("TOC Generator: Footer moved to the end of the main content.");
|
||||
|
||||
});
|
||||
@@ -251,7 +251,7 @@ from crawl4ai import (
|
||||
RoundRobinProxyStrategy,
|
||||
)
|
||||
import asyncio
|
||||
from crawl4ai.proxy_strategy import ProxyConfig
|
||||
from crawl4ai import ProxyConfig
|
||||
async def main():
|
||||
# Load proxies and create rotation strategy
|
||||
proxies = ProxyConfig.from_env()
|
||||
|
||||
74
docs/md_v2/core/ask-ai.md
Normal file
74
docs/md_v2/core/ask-ai.md
Normal file
@@ -0,0 +1,74 @@
|
||||
<div class="ask-ai-container">
|
||||
<iframe id="ask-ai-frame" src="../../ask_ai/index.html" width="100%" style="border:none; display: block;" title="Crawl4AI Assistant"></iframe>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
// Iframe height adjustment
|
||||
function resizeAskAiIframe() {
|
||||
const iframe = document.getElementById('ask-ai-frame');
|
||||
if (iframe) {
|
||||
const headerHeight = parseFloat(getComputedStyle(document.documentElement).getPropertyValue('--header-height') || '55');
|
||||
// Footer is removed by JS below, so calculate height based on header + small buffer
|
||||
const topOffset = headerHeight + 20; // Header + buffer/margin
|
||||
|
||||
const availableHeight = window.innerHeight - topOffset;
|
||||
iframe.style.height = Math.max(600, availableHeight) + 'px'; // Min height 600px
|
||||
}
|
||||
}
|
||||
|
||||
// Run immediately and on resize/load
|
||||
resizeAskAiIframe(); // Initial call
|
||||
let resizeTimer;
|
||||
window.addEventListener('load', resizeAskAiIframe);
|
||||
window.addEventListener('resize', () => {
|
||||
clearTimeout(resizeTimer);
|
||||
resizeTimer = setTimeout(resizeAskAiIframe, 150);
|
||||
});
|
||||
|
||||
// Remove Footer & HR from parent page (DOM Ready might be safer)
|
||||
document.addEventListener('DOMContentLoaded', () => {
|
||||
setTimeout(() => { // Add slight delay just in case elements render slowly
|
||||
const footer = window.parent.document.querySelector('footer'); // Target parent document
|
||||
if (footer) {
|
||||
const hrBeforeFooter = footer.previousElementSibling;
|
||||
if (hrBeforeFooter && hrBeforeFooter.tagName === 'HR') {
|
||||
hrBeforeFooter.remove();
|
||||
}
|
||||
footer.remove();
|
||||
// Trigger resize again after removing footer
|
||||
resizeAskAiIframe();
|
||||
} else {
|
||||
console.warn("Ask AI Page: Could not find footer in parent document to remove.");
|
||||
}
|
||||
}, 100); // Shorter delay
|
||||
});
|
||||
</script>
|
||||
|
||||
<style>
|
||||
#terminal-mkdocs-main-content {
|
||||
padding: 0 !important;
|
||||
margin: 0;
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
overflow: hidden; /* Prevent body scrollbars, panels handle scroll */
|
||||
}
|
||||
|
||||
/* Ensure iframe container takes full space */
|
||||
#terminal-mkdocs-main-content .ask-ai-container {
|
||||
/* Remove negative margins if footer removal handles space */
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
max-width: none;
|
||||
/* Let the JS set the height */
|
||||
/* height: 600px; Initial fallback height */
|
||||
overflow: hidden; /* Hide potential overflow before JS resize */
|
||||
}
|
||||
|
||||
/* Hide title/paragraph if they were part of the markdown */
|
||||
/* Alternatively, just remove them from the .md file directly */
|
||||
/* #terminal-mkdocs-main-content > h1,
|
||||
#terminal-mkdocs-main-content > p:first-of-type {
|
||||
display: none;
|
||||
} */
|
||||
|
||||
</style>
|
||||
File diff suppressed because it is too large
Load Diff
0
docs/tutorials/coming_soon.md
Normal file
0
docs/tutorials/coming_soon.md
Normal file
11
mkdocs.yml
11
mkdocs.yml
@@ -7,10 +7,11 @@ docs_dir: docs/md_v2
|
||||
|
||||
nav:
|
||||
- Home: 'index.md'
|
||||
- "Ask AI": "core/ask-ai.md"
|
||||
- "Quick Start": "core/quickstart.md"
|
||||
- Setup & Installation:
|
||||
- "Installation": "core/installation.md"
|
||||
- "Docker Deployment": "core/docker-deployment.md"
|
||||
- "Quick Start": "core/quickstart.md"
|
||||
- "Blog & Changelog":
|
||||
- "Blog Home": "blog/index.md"
|
||||
- "Changelog": "https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md"
|
||||
@@ -76,6 +77,7 @@ extra:
|
||||
version: !ENV [CRAWL4AI_VERSION, 'development']
|
||||
|
||||
extra_css:
|
||||
- assets/layout.css
|
||||
- assets/styles.css
|
||||
- assets/highlight.css
|
||||
- assets/dmvendor.css
|
||||
@@ -83,4 +85,9 @@ extra_css:
|
||||
extra_javascript:
|
||||
- assets/highlight.min.js
|
||||
- assets/highlight_init.js
|
||||
- https://buttons.github.io/buttons.js
|
||||
- https://buttons.github.io/buttons.js
|
||||
- assets/toc.js
|
||||
- assets/github_stats.js
|
||||
- assets/selection_ask_ai.js
|
||||
- assets/copy_code.js
|
||||
- assets/floating_ask_ai_button.js
|
||||
@@ -1,20 +0,0 @@
|
||||
The file /docs/md_v2/api/parameters.md should be updated to include the new network and console capturing parameters.
|
||||
|
||||
Here's what needs to be updated:
|
||||
|
||||
1. Change section title from:
|
||||
```
|
||||
### G) **Debug & Logging**
|
||||
```
|
||||
to:
|
||||
```
|
||||
### G) **Debug, Logging & Capturing**
|
||||
```
|
||||
|
||||
2. Add new parameters to the table:
|
||||
```
|
||||
| **`capture_network_requests`** | `bool` (False) | Captures all network requests, responses, and failures during the crawl. Available in `result.network_requests`. |
|
||||
| **`capture_console_messages`** | `bool` (False) | Captures all browser console messages (logs, warnings, errors) during the crawl. Available in `result.console_messages`. |
|
||||
```
|
||||
|
||||
These changes demonstrate how to use the new network and console capturing features in the CrawlerRunConfig.
|
||||
596
tests/docker/test_rest_api_deep_crawl.py
Normal file
596
tests/docker/test_rest_api_deep_crawl.py
Normal file
@@ -0,0 +1,596 @@
|
||||
# ==== File: test_rest_api_deep_crawl.py ====
|
||||
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
import httpx
|
||||
import json
|
||||
import asyncio
|
||||
import os
|
||||
from typing import List, Dict, Any, AsyncGenerator
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv() # Load environment variables from .env file if present
|
||||
|
||||
# --- Test Configuration ---
|
||||
BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235") # If server is running in Docker, use the host's IP
|
||||
BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020") # If server is running in dev debug mode
|
||||
DEEP_CRAWL_BASE_URL = "https://docs.crawl4ai.com/samples/deepcrawl/"
|
||||
DEEP_CRAWL_DOMAIN = "docs.crawl4ai.com" # Used for domain filter
|
||||
|
||||
# --- Helper Functions ---
|
||||
def load_proxies_from_env() -> List[Dict]:
|
||||
"""Load proxies from PROXIES environment variable"""
|
||||
proxies = []
|
||||
proxies_str = os.getenv("PROXIES", "")
|
||||
if not proxies_str:
|
||||
print("PROXIES environment variable not set or empty.")
|
||||
return proxies
|
||||
try:
|
||||
proxy_list = proxies_str.split(",")
|
||||
for proxy in proxy_list:
|
||||
proxy = proxy.strip()
|
||||
if not proxy:
|
||||
continue
|
||||
parts = proxy.split(":")
|
||||
if len(parts) == 4:
|
||||
ip, port, username, password = parts
|
||||
proxies.append({
|
||||
"server": f"http://{ip}:{port}", # Assuming http, adjust if needed
|
||||
"username": username,
|
||||
"password": password,
|
||||
"ip": ip # Store original IP if available
|
||||
})
|
||||
elif len(parts) == 2: # ip:port only
|
||||
ip, port = parts
|
||||
proxies.append({
|
||||
"server": f"http://{ip}:{port}",
|
||||
"ip": ip
|
||||
})
|
||||
else:
|
||||
print(f"Skipping invalid proxy string format: {proxy}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error loading proxies from environment: {e}")
|
||||
return proxies
|
||||
|
||||
|
||||
async def check_server_health(client: httpx.AsyncClient):
|
||||
"""Check if the server is healthy before running tests."""
|
||||
try:
|
||||
response = await client.get("/health")
|
||||
response.raise_for_status()
|
||||
print(f"\nServer healthy: {response.json()}")
|
||||
return True
|
||||
except (httpx.RequestError, httpx.HTTPStatusError) as e:
|
||||
pytest.fail(f"Server health check failed: {e}. Is the server running at {BASE_URL}?", pytrace=False)
|
||||
|
||||
async def assert_crawl_result_structure(result: Dict[str, Any], check_ssl=False):
|
||||
"""Asserts the basic structure of a single crawl result."""
|
||||
assert isinstance(result, dict)
|
||||
assert "url" in result
|
||||
assert "success" in result
|
||||
assert "html" in result # Basic crawls should return HTML
|
||||
assert "metadata" in result
|
||||
assert isinstance(result["metadata"], dict)
|
||||
assert "depth" in result["metadata"] # Deep crawls add depth
|
||||
|
||||
if check_ssl:
|
||||
assert "ssl_certificate" in result # Check if SSL info is present
|
||||
assert isinstance(result["ssl_certificate"], dict) or result["ssl_certificate"] is None
|
||||
|
||||
|
||||
async def process_streaming_response(response: httpx.Response) -> List[Dict[str, Any]]:
|
||||
"""Processes an NDJSON streaming response."""
|
||||
results = []
|
||||
completed = False
|
||||
async for line in response.aiter_lines():
|
||||
if line:
|
||||
try:
|
||||
data = json.loads(line)
|
||||
if data.get("status") == "completed":
|
||||
completed = True
|
||||
break # Stop processing after completion marker
|
||||
elif data.get("url"): # Ensure it looks like a result object
|
||||
results.append(data)
|
||||
else:
|
||||
print(f"Received non-result JSON line: {data}") # Log other status messages if needed
|
||||
except json.JSONDecodeError:
|
||||
pytest.fail(f"Failed to decode JSON line: {line}")
|
||||
assert completed, "Streaming response did not end with a completion marker."
|
||||
return results
|
||||
|
||||
|
||||
# --- Pytest Fixtures ---
|
||||
@pytest_asyncio.fixture(scope="function")
|
||||
async def async_client() -> AsyncGenerator[httpx.AsyncClient, None]:
|
||||
"""Provides an async HTTP client"""
|
||||
# Increased timeout for potentially longer deep crawls
|
||||
async with httpx.AsyncClient(base_url=BASE_URL, timeout=300.0) as client:
|
||||
yield client
|
||||
# No explicit close needed with 'async with'
|
||||
|
||||
# --- Test Class ---
|
||||
@pytest.mark.asyncio
|
||||
class TestDeepCrawlEndpoints:
|
||||
|
||||
@pytest_asyncio.fixture(autouse=True)
|
||||
async def check_health_before_tests(self, async_client: httpx.AsyncClient):
|
||||
"""Fixture to ensure server is healthy before each test in the class."""
|
||||
await check_server_health(async_client)
|
||||
|
||||
# 1. Basic Deep Crawl
|
||||
async def test_deep_crawl_basic_bfs(self, async_client: httpx.AsyncClient):
|
||||
"""Test BFS deep crawl with limited depth and pages."""
|
||||
max_depth = 1
|
||||
max_pages = 3 # start_url + 2 more
|
||||
payload = {
|
||||
"urls": [DEEP_CRAWL_BASE_URL],
|
||||
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"stream": False,
|
||||
"cache_mode": "BYPASS", # Use string value for CacheMode
|
||||
"deep_crawl_strategy": {
|
||||
"type": "BFSDeepCrawlStrategy",
|
||||
"params": {
|
||||
"max_depth": max_depth,
|
||||
"max_pages": max_pages,
|
||||
# Minimal filters for basic test
|
||||
"filter_chain": {
|
||||
"type": "FilterChain",
|
||||
"params": {
|
||||
"filters": [
|
||||
{
|
||||
"type": "DomainFilter",
|
||||
"params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
response = await async_client.post("/crawl", json=payload)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
|
||||
assert data["success"] is True
|
||||
assert isinstance(data["results"], list)
|
||||
assert len(data["results"]) > 1 # Should be more than just the start URL
|
||||
assert len(data["results"]) <= max_pages # Respect max_pages
|
||||
|
||||
found_depth_0 = False
|
||||
found_depth_1 = False
|
||||
for result in data["results"]:
|
||||
await assert_crawl_result_structure(result)
|
||||
assert result["success"] is True
|
||||
assert DEEP_CRAWL_DOMAIN in result["url"]
|
||||
depth = result["metadata"]["depth"]
|
||||
assert depth <= max_depth
|
||||
if depth == 0: found_depth_0 = True
|
||||
if depth == 1: found_depth_1 = True
|
||||
|
||||
assert found_depth_0
|
||||
assert found_depth_1
|
||||
|
||||
# 2. Deep Crawl with Filtering
|
||||
async def test_deep_crawl_with_filters(self, async_client: httpx.AsyncClient):
|
||||
"""Test BFS deep crawl with content type and domain filters."""
|
||||
max_depth = 1
|
||||
max_pages = 5
|
||||
payload = {
|
||||
"urls": [DEEP_CRAWL_BASE_URL],
|
||||
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"stream": False,
|
||||
"cache_mode": "BYPASS",
|
||||
"deep_crawl_strategy": {
|
||||
"type": "BFSDeepCrawlStrategy",
|
||||
"params": {
|
||||
"max_depth": max_depth,
|
||||
"max_pages": max_pages,
|
||||
"filter_chain": {
|
||||
"type": "FilterChain",
|
||||
"params": {
|
||||
"filters": [
|
||||
{
|
||||
"type": "DomainFilter",
|
||||
"params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}
|
||||
},
|
||||
{
|
||||
"type": "ContentTypeFilter",
|
||||
"params": {"allowed_types": ["text/html"]}
|
||||
},
|
||||
# Example: Exclude specific paths using regex
|
||||
{
|
||||
"type": "URLPatternFilter",
|
||||
"params": {
|
||||
"patterns": ["*/category-3/*"], # Block category 3
|
||||
"reverse": True # Block if match
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
response = await async_client.post("/crawl", json=payload)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
|
||||
assert data["success"] is True
|
||||
assert len(data["results"]) > 0
|
||||
assert len(data["results"]) <= max_pages
|
||||
|
||||
for result in data["results"]:
|
||||
await assert_crawl_result_structure(result)
|
||||
assert result["success"] is True
|
||||
assert DEEP_CRAWL_DOMAIN in result["url"]
|
||||
assert "category-3" not in result["url"] # Check if filter worked
|
||||
assert result["metadata"]["depth"] <= max_depth
|
||||
|
||||
# 3. Deep Crawl with Scoring
|
||||
async def test_deep_crawl_with_scoring(self, async_client: httpx.AsyncClient):
|
||||
"""Test BFS deep crawl with URL scoring."""
|
||||
max_depth = 1
|
||||
max_pages = 4
|
||||
payload = {
|
||||
"urls": [DEEP_CRAWL_BASE_URL],
|
||||
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"stream": False,
|
||||
"cache_mode": "BYPASS",
|
||||
"deep_crawl_strategy": {
|
||||
"type": "BFSDeepCrawlStrategy",
|
||||
"params": {
|
||||
"max_depth": max_depth,
|
||||
"max_pages": max_pages,
|
||||
"filter_chain": { # Keep basic domain filter
|
||||
"type": "FilterChain",
|
||||
"params": { "filters": [{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}]}
|
||||
},
|
||||
"url_scorer": { # Add scorer
|
||||
"type": "CompositeScorer",
|
||||
"params": {
|
||||
"scorers": [
|
||||
{ # Favor pages with 'product' in the URL
|
||||
"type": "KeywordRelevanceScorer",
|
||||
"params": {"keywords": ["product"], "weight": 1.0}
|
||||
},
|
||||
{ # Penalize deep paths slightly
|
||||
"type": "PathDepthScorer",
|
||||
"params": {"optimal_depth": 2, "weight": -0.2}
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
# Set a threshold if needed: "score_threshold": 0.1
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
response = await async_client.post("/crawl", json=payload)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
|
||||
assert data["success"] is True
|
||||
assert len(data["results"]) > 0
|
||||
assert len(data["results"]) <= max_pages
|
||||
|
||||
# Check if results seem biased towards products (harder to assert strictly without knowing exact scores)
|
||||
product_urls_found = any("product_" in result["url"] for result in data["results"] if result["metadata"]["depth"] > 0)
|
||||
print(f"Product URLs found among depth > 0 results: {product_urls_found}")
|
||||
# We expect scoring to prioritize product pages if available within limits
|
||||
# assert product_urls_found # This might be too strict depending on site structure and limits
|
||||
|
||||
for result in data["results"]:
|
||||
await assert_crawl_result_structure(result)
|
||||
assert result["success"] is True
|
||||
assert result["metadata"]["depth"] <= max_depth
|
||||
|
||||
# 4. Deep Crawl with CSS Extraction
|
||||
async def test_deep_crawl_with_css_extraction(self, async_client: httpx.AsyncClient):
|
||||
"""Test BFS deep crawl combined with JsonCssExtractionStrategy."""
|
||||
max_depth = 6 # Go deep enough to reach product pages
|
||||
max_pages = 20
|
||||
# Schema to extract product details
|
||||
product_schema = {
|
||||
"name": "ProductDetails",
|
||||
"baseSelector": "div.container", # Base for product page
|
||||
"fields": [
|
||||
{"name": "product_title", "selector": "h1", "type": "text"},
|
||||
{"name": "price", "selector": ".product-price", "type": "text"},
|
||||
{"name": "description", "selector": ".product-description p", "type": "text"},
|
||||
{"name": "specs", "selector": ".product-specs li", "type": "list", "fields":[
|
||||
{"name": "spec_name", "selector": ".spec-name", "type": "text"},
|
||||
{"name": "spec_value", "selector": ".spec-value", "type": "text"}
|
||||
]}
|
||||
]
|
||||
}
|
||||
payload = {
|
||||
"urls": [DEEP_CRAWL_BASE_URL],
|
||||
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"stream": False,
|
||||
"cache_mode": "BYPASS",
|
||||
"extraction_strategy": { # Apply extraction to ALL crawled pages
|
||||
"type": "JsonCssExtractionStrategy",
|
||||
"params": {"schema": {"type": "dict", "value": product_schema}}
|
||||
},
|
||||
"deep_crawl_strategy": {
|
||||
"type": "BFSDeepCrawlStrategy",
|
||||
"params": {
|
||||
"max_depth": max_depth,
|
||||
"max_pages": max_pages,
|
||||
"filter_chain": { # Only crawl HTML on our domain
|
||||
"type": "FilterChain",
|
||||
"params": {
|
||||
"filters": [
|
||||
{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}},
|
||||
{"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}}
|
||||
]
|
||||
}
|
||||
}
|
||||
# Optional: Add scoring to prioritize product pages for extraction
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
response = await async_client.post("/crawl", json=payload)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
|
||||
assert data["success"] is True
|
||||
assert len(data["results"]) > 0
|
||||
# assert len(data["results"]) <= max_pages
|
||||
|
||||
found_extracted_product = False
|
||||
for result in data["results"]:
|
||||
await assert_crawl_result_structure(result)
|
||||
assert result["success"] is True
|
||||
assert "extracted_content" in result
|
||||
if "product_" in result["url"]: # Check product pages specifically
|
||||
assert result["extracted_content"] is not None
|
||||
try:
|
||||
extracted = json.loads(result["extracted_content"])
|
||||
# Schema returns list even if one base match
|
||||
assert isinstance(extracted, list)
|
||||
if extracted:
|
||||
item = extracted[0]
|
||||
assert "product_title" in item and item["product_title"]
|
||||
assert "price" in item and item["price"]
|
||||
# Specs might be empty list if not found
|
||||
assert "specs" in item and isinstance(item["specs"], list)
|
||||
found_extracted_product = True
|
||||
print(f"Extracted product: {item.get('product_title')}")
|
||||
except (json.JSONDecodeError, AssertionError, IndexError) as e:
|
||||
pytest.fail(f"Extraction validation failed for {result['url']}: {e}\nContent: {result['extracted_content']}")
|
||||
# else:
|
||||
# # Non-product pages might have None or empty list depending on schema match
|
||||
# assert result["extracted_content"] is None or json.loads(result["extracted_content"]) == []
|
||||
|
||||
assert found_extracted_product, "Did not find any pages where product data was successfully extracted."
|
||||
|
||||
# 5. Deep Crawl with LLM Extraction (Requires Server LLM Setup)
|
||||
async def test_deep_crawl_with_llm_extraction(self, async_client: httpx.AsyncClient):
|
||||
"""Test BFS deep crawl combined with LLMExtractionStrategy."""
|
||||
max_depth = 1 # Limit depth to keep LLM calls manageable
|
||||
max_pages = 3
|
||||
payload = {
|
||||
"urls": [DEEP_CRAWL_BASE_URL],
|
||||
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"stream": False,
|
||||
"cache_mode": "BYPASS",
|
||||
"extraction_strategy": { # Apply LLM extraction to crawled pages
|
||||
"type": "LLMExtractionStrategy",
|
||||
"params": {
|
||||
"instruction": "Extract the main H1 title and the text content of the first paragraph.",
|
||||
"llm_config": { # Example override, rely on server default if possible
|
||||
"type": "LLMConfig",
|
||||
"params": {"provider": "openai/gpt-4.1-mini"} # Use a cheaper model for testing
|
||||
},
|
||||
"schema": { # Expected JSON output
|
||||
"type": "dict",
|
||||
"value": {
|
||||
"title": "PageContent", "type": "object",
|
||||
"properties": {
|
||||
"h1_title": {"type": "string"},
|
||||
"first_paragraph": {"type": "string"}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"deep_crawl_strategy": {
|
||||
"type": "BFSDeepCrawlStrategy",
|
||||
"params": {
|
||||
"max_depth": max_depth,
|
||||
"max_pages": max_pages,
|
||||
"filter_chain": {
|
||||
"type": "FilterChain",
|
||||
"params": {
|
||||
"filters": [
|
||||
{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}},
|
||||
{"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
try:
|
||||
response = await async_client.post("/crawl", json=payload)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
except httpx.HTTPStatusError as e:
|
||||
pytest.fail(f"Deep Crawl + LLM extraction request failed: {e}. Response: {e.response.text}. Check server logs and LLM API key setup.")
|
||||
except httpx.RequestError as e:
|
||||
pytest.fail(f"Deep Crawl + LLM extraction request failed: {e}.")
|
||||
|
||||
|
||||
assert data["success"] is True
|
||||
assert len(data["results"]) > 0
|
||||
assert len(data["results"]) <= max_pages
|
||||
|
||||
found_llm_extraction = False
|
||||
for result in data["results"]:
|
||||
await assert_crawl_result_structure(result)
|
||||
assert result["success"] is True
|
||||
assert "extracted_content" in result
|
||||
assert result["extracted_content"] is not None
|
||||
try:
|
||||
extracted = json.loads(result["extracted_content"])
|
||||
if isinstance(extracted, list): extracted = extracted[0] # Handle list output
|
||||
assert isinstance(extracted, dict)
|
||||
assert "h1_title" in extracted # Check keys based on schema
|
||||
assert "first_paragraph" in extracted
|
||||
found_llm_extraction = True
|
||||
print(f"LLM extracted from {result['url']}: Title='{extracted.get('h1_title')}'")
|
||||
except (json.JSONDecodeError, AssertionError, IndexError, TypeError) as e:
|
||||
pytest.fail(f"LLM extraction validation failed for {result['url']}: {e}\nContent: {result['extracted_content']}")
|
||||
|
||||
assert found_llm_extraction, "LLM extraction did not yield expected data on any crawled page."
|
||||
|
||||
|
||||
# 6. Deep Crawl with SSL Certificate Fetching
|
||||
async def test_deep_crawl_with_ssl(self, async_client: httpx.AsyncClient):
|
||||
"""Test BFS deep crawl with fetch_ssl_certificate enabled."""
|
||||
max_depth = 0 # Only fetch for start URL to keep test fast
|
||||
max_pages = 1
|
||||
payload = {
|
||||
"urls": [DEEP_CRAWL_BASE_URL],
|
||||
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"stream": False,
|
||||
"cache_mode": "BYPASS",
|
||||
"fetch_ssl_certificate": True, # <-- Enable SSL fetching
|
||||
"deep_crawl_strategy": {
|
||||
"type": "BFSDeepCrawlStrategy",
|
||||
"params": {
|
||||
"max_depth": max_depth,
|
||||
"max_pages": max_pages,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
response = await async_client.post("/crawl", json=payload)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
|
||||
assert data["success"] is True
|
||||
assert len(data["results"]) == 1
|
||||
result = data["results"][0]
|
||||
|
||||
await assert_crawl_result_structure(result, check_ssl=True) # <-- Tell helper to check SSL field
|
||||
assert result["success"] is True
|
||||
# Check if SSL info was actually retrieved
|
||||
if result["ssl_certificate"]:
|
||||
# Assert directly using dictionary keys
|
||||
assert isinstance(result["ssl_certificate"], dict) # Verify it's a dict
|
||||
assert "issuer" in result["ssl_certificate"]
|
||||
assert "subject" in result["ssl_certificate"]
|
||||
# --- MODIFIED ASSERTIONS ---
|
||||
assert "not_before" in result["ssl_certificate"] # Check for the actual key
|
||||
assert "not_after" in result["ssl_certificate"] # Check for the actual key
|
||||
# --- END MODIFICATIONS ---
|
||||
assert "fingerprint" in result["ssl_certificate"] # Check another key
|
||||
|
||||
# This print statement using .get() already works correctly with dictionaries
|
||||
print(f"SSL Issuer Org: {result['ssl_certificate'].get('issuer', {}).get('O', 'N/A')}")
|
||||
print(f"SSL Valid From: {result['ssl_certificate'].get('not_before', 'N/A')}")
|
||||
else:
|
||||
# This part remains the same
|
||||
print("SSL Certificate was null in the result.")
|
||||
|
||||
|
||||
# 7. Deep Crawl with Proxy Rotation (Requires PROXIES env var)
|
||||
async def test_deep_crawl_with_proxies(self, async_client: httpx.AsyncClient):
|
||||
"""Test BFS deep crawl using proxy rotation."""
|
||||
proxies = load_proxies_from_env()
|
||||
if not proxies:
|
||||
pytest.skip("Skipping proxy test: PROXIES environment variable not set or empty.")
|
||||
|
||||
print(f"\nTesting with {len(proxies)} proxies loaded from environment.")
|
||||
|
||||
max_depth = 1
|
||||
max_pages = 3
|
||||
payload = {
|
||||
"urls": [DEEP_CRAWL_BASE_URL], # Use the dummy site
|
||||
# Use a BrowserConfig that *might* pick up proxy if set, but rely on CrawlerRunConfig
|
||||
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"stream": False,
|
||||
"cache_mode": "BYPASS",
|
||||
"proxy_rotation_strategy": { # <-- Define the strategy
|
||||
"type": "RoundRobinProxyStrategy",
|
||||
"params": {
|
||||
# Convert ProxyConfig dicts back to the serialized format expected by server
|
||||
"proxies": [{"type": "ProxyConfig", "params": p} for p in proxies]
|
||||
}
|
||||
},
|
||||
"deep_crawl_strategy": {
|
||||
"type": "BFSDeepCrawlStrategy",
|
||||
"params": {
|
||||
"max_depth": max_depth,
|
||||
"max_pages": max_pages,
|
||||
"filter_chain": {
|
||||
"type": "FilterChain",
|
||||
"params": { "filters": [{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}]}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
try:
|
||||
response = await async_client.post("/crawl", json=payload)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
except httpx.HTTPStatusError as e:
|
||||
# Proxies often cause connection errors, catch them
|
||||
pytest.fail(f"Proxy deep crawl failed: {e}. Response: {e.response.text}. Are proxies valid and accessible by the server?")
|
||||
except httpx.RequestError as e:
|
||||
pytest.fail(f"Proxy deep crawl request failed: {e}. Are proxies valid and accessible?")
|
||||
|
||||
assert data["success"] is True
|
||||
assert len(data["results"]) > 0
|
||||
assert len(data["results"]) <= max_pages
|
||||
# Primary assertion is that the crawl succeeded *with* proxy config
|
||||
print(f"Proxy deep crawl completed successfully for {len(data['results'])} pages.")
|
||||
|
||||
# Verifying specific proxy usage requires server logs or custom headers/responses
|
||||
|
||||
|
||||
# --- Main Execution Block (for running script directly) ---
|
||||
if __name__ == "__main__":
|
||||
pytest_args = ["-v", "-s", __file__]
|
||||
# Example: Run only proxy test
|
||||
# pytest_args.append("-k test_deep_crawl_with_proxies")
|
||||
print(f"Running pytest with args: {pytest_args}")
|
||||
exit_code = pytest.main(pytest_args)
|
||||
print(f"Pytest finished with exit code: {exit_code}")
|
||||
655
tests/docker/test_server_requests.py
Normal file
655
tests/docker/test_server_requests.py
Normal file
@@ -0,0 +1,655 @@
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
import httpx
|
||||
import json
|
||||
import asyncio
|
||||
import os
|
||||
from typing import List, Dict, Any, AsyncGenerator
|
||||
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv()
|
||||
|
||||
|
||||
# Optional: Import crawl4ai classes directly for reference/easier payload creation aid
|
||||
# You don't strictly NEED these imports for the tests to run against the server,
|
||||
# but they help in understanding the structure you are mimicking in JSON.
|
||||
from crawl4ai import (
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
CacheMode,
|
||||
DefaultMarkdownGenerator,
|
||||
PruningContentFilter,
|
||||
BM25ContentFilter,
|
||||
BFSDeepCrawlStrategy,
|
||||
FilterChain,
|
||||
ContentTypeFilter,
|
||||
DomainFilter,
|
||||
CompositeScorer,
|
||||
KeywordRelevanceScorer,
|
||||
PathDepthScorer,
|
||||
JsonCssExtractionStrategy,
|
||||
LLMExtractionStrategy,
|
||||
LLMConfig
|
||||
)
|
||||
|
||||
# --- Test Configuration ---
|
||||
# BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020") # Make base URL configurable
|
||||
BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235") # Make base URL configurable
|
||||
# Use a known simple HTML page for basic tests
|
||||
SIMPLE_HTML_URL = "https://httpbin.org/html"
|
||||
# Use a site suitable for scraping tests
|
||||
SCRAPE_TARGET_URL = "http://books.toscrape.com/"
|
||||
# Use a site with internal links for deep crawl tests
|
||||
DEEP_CRAWL_URL = "https://python.org"
|
||||
|
||||
# --- Pytest Fixtures ---
|
||||
|
||||
# Use the built-in event_loop fixture from pytest_asyncio
|
||||
# The custom implementation was causing issues with closing the loop
|
||||
|
||||
@pytest_asyncio.fixture(scope="function") # Changed to function scope to avoid event loop issues
|
||||
async def async_client() -> AsyncGenerator[httpx.AsyncClient, None]:
|
||||
"""Provides an async HTTP client"""
|
||||
client = httpx.AsyncClient(base_url=BASE_URL, timeout=120.0)
|
||||
yield client
|
||||
await client.aclose()
|
||||
|
||||
# --- Helper Functions ---
|
||||
|
||||
async def check_server_health(client: httpx.AsyncClient):
|
||||
"""Check if the server is healthy before running tests."""
|
||||
try:
|
||||
response = await client.get("/health")
|
||||
response.raise_for_status()
|
||||
print(f"\nServer healthy: {response.json()}")
|
||||
return True
|
||||
except (httpx.RequestError, httpx.HTTPStatusError) as e:
|
||||
pytest.fail(f"Server health check failed: {e}. Is the server running at {BASE_URL}?", pytrace=False)
|
||||
|
||||
async def assert_crawl_result_structure(result: Dict[str, Any]):
|
||||
"""Asserts the basic structure of a single crawl result."""
|
||||
assert isinstance(result, dict)
|
||||
assert "url" in result
|
||||
assert "success" in result
|
||||
assert "html" in result
|
||||
# Add more common checks if needed
|
||||
|
||||
async def process_streaming_response(response: httpx.Response) -> List[Dict[str, Any]]:
|
||||
"""Processes an NDJSON streaming response."""
|
||||
results = []
|
||||
completed = False
|
||||
async for line in response.aiter_lines():
|
||||
if line:
|
||||
try:
|
||||
data = json.loads(line)
|
||||
if data.get("status") == "completed":
|
||||
completed = True
|
||||
break # Stop processing after completion marker
|
||||
else:
|
||||
results.append(data)
|
||||
except json.JSONDecodeError:
|
||||
pytest.fail(f"Failed to decode JSON line: {line}")
|
||||
assert completed, "Streaming response did not end with a completion marker."
|
||||
return results
|
||||
|
||||
|
||||
# --- Test Class ---
|
||||
|
||||
@pytest.mark.asyncio
|
||||
class TestCrawlEndpoints:
|
||||
|
||||
@pytest_asyncio.fixture(autouse=True)
|
||||
async def check_health_before_tests(self, async_client: httpx.AsyncClient):
|
||||
"""Fixture to ensure server is healthy before each test in the class."""
|
||||
await check_server_health(async_client)
|
||||
|
||||
# 1. Simple Requests (Primitives)
|
||||
async def test_simple_crawl_single_url(self, async_client: httpx.AsyncClient):
|
||||
"""Test /crawl with a single URL and simple config values."""
|
||||
payload = {
|
||||
"urls": [SIMPLE_HTML_URL],
|
||||
"browser_config": {
|
||||
"type": "BrowserConfig",
|
||||
"params": {
|
||||
"headless": True,
|
||||
}
|
||||
},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"stream": False, # Explicitly false for /crawl
|
||||
"screenshot": False,
|
||||
"cache_mode": CacheMode.BYPASS.value # Use enum value
|
||||
}
|
||||
}
|
||||
}
|
||||
try:
|
||||
response = await async_client.post("/crawl", json=payload)
|
||||
print(f"Response status: {response.status_code}")
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
except httpx.HTTPStatusError as e:
|
||||
print(f"Server error: {e}")
|
||||
print(f"Response content: {e.response.text}")
|
||||
raise
|
||||
|
||||
assert data["success"] is True
|
||||
assert isinstance(data["results"], list)
|
||||
assert len(data["results"]) == 1
|
||||
result = data["results"][0]
|
||||
await assert_crawl_result_structure(result)
|
||||
assert result["success"] is True
|
||||
assert result["url"] == SIMPLE_HTML_URL
|
||||
assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"]
|
||||
# We don't specify a markdown generator in this test, so don't make assumptions about markdown field
|
||||
# It might be null, missing, or populated depending on the server's default behavior
|
||||
|
||||
async def test_simple_crawl_single_url_streaming(self, async_client: httpx.AsyncClient):
|
||||
"""Test /crawl/stream with a single URL and simple config values."""
|
||||
payload = {
|
||||
"urls": [SIMPLE_HTML_URL],
|
||||
"browser_config": {
|
||||
"type": "BrowserConfig",
|
||||
"params": {
|
||||
"headless": True,
|
||||
}
|
||||
},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"stream": True, # Must be true for /crawl/stream
|
||||
"screenshot": False,
|
||||
"cache_mode": CacheMode.BYPASS.value
|
||||
}
|
||||
}
|
||||
}
|
||||
async with async_client.stream("POST", "/crawl/stream", json=payload) as response:
|
||||
response.raise_for_status()
|
||||
results = await process_streaming_response(response)
|
||||
|
||||
assert len(results) == 1
|
||||
result = results[0]
|
||||
await assert_crawl_result_structure(result)
|
||||
assert result["success"] is True
|
||||
assert result["url"] == SIMPLE_HTML_URL
|
||||
assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"]
|
||||
|
||||
|
||||
# 2. Multi-URL and Dispatcher
|
||||
async def test_multi_url_crawl(self, async_client: httpx.AsyncClient):
|
||||
"""Test /crawl with multiple URLs, implicitly testing dispatcher."""
|
||||
urls = [SIMPLE_HTML_URL, "https://httpbin.org/links/10/0"]
|
||||
payload = {
|
||||
"urls": urls,
|
||||
"browser_config": {
|
||||
"type": "BrowserConfig",
|
||||
"params": {"headless": True}
|
||||
},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {"stream": False, "cache_mode": CacheMode.BYPASS.value}
|
||||
}
|
||||
}
|
||||
try:
|
||||
print(f"Sending deep crawl request to server...")
|
||||
response = await async_client.post("/crawl", json=payload)
|
||||
print(f"Response status: {response.status_code}")
|
||||
|
||||
if response.status_code >= 400:
|
||||
error_detail = response.json().get('detail', 'No detail provided')
|
||||
print(f"Error detail: {error_detail}")
|
||||
print(f"Full response: {response.text}")
|
||||
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
except httpx.HTTPStatusError as e:
|
||||
print(f"Server error status: {e.response.status_code}")
|
||||
print(f"Server error response: {e.response.text}")
|
||||
try:
|
||||
error_json = e.response.json()
|
||||
print(f"Parsed error: {error_json}")
|
||||
except:
|
||||
print("Could not parse error response as JSON")
|
||||
raise
|
||||
|
||||
assert data["success"] is True
|
||||
assert isinstance(data["results"], list)
|
||||
assert len(data["results"]) == len(urls)
|
||||
for result in data["results"]:
|
||||
await assert_crawl_result_structure(result)
|
||||
assert result["success"] is True
|
||||
assert result["url"] in urls
|
||||
|
||||
async def test_multi_url_crawl_streaming(self, async_client: httpx.AsyncClient):
|
||||
"""Test /crawl/stream with multiple URLs."""
|
||||
urls = [SIMPLE_HTML_URL, "https://httpbin.org/links/10/0"]
|
||||
payload = {
|
||||
"urls": urls,
|
||||
"browser_config": {
|
||||
"type": "BrowserConfig",
|
||||
"params": {"headless": True}
|
||||
},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {"stream": True, "cache_mode": CacheMode.BYPASS.value}
|
||||
}
|
||||
}
|
||||
async with async_client.stream("POST", "/crawl/stream", json=payload) as response:
|
||||
response.raise_for_status()
|
||||
results = await process_streaming_response(response)
|
||||
|
||||
assert len(results) == len(urls)
|
||||
processed_urls = set()
|
||||
for result in results:
|
||||
await assert_crawl_result_structure(result)
|
||||
assert result["success"] is True
|
||||
assert result["url"] in urls
|
||||
processed_urls.add(result["url"])
|
||||
assert processed_urls == set(urls) # Ensure all URLs were processed
|
||||
|
||||
|
||||
# 3. Class Values and Nested Classes (Markdown Generator)
|
||||
async def test_crawl_with_markdown_pruning_filter(self, async_client: httpx.AsyncClient):
|
||||
"""Test /crawl with MarkdownGenerator using PruningContentFilter."""
|
||||
payload = {
|
||||
"urls": [SIMPLE_HTML_URL],
|
||||
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"cache_mode": CacheMode.ENABLED.value, # Test different cache mode
|
||||
"markdown_generator": {
|
||||
"type": "DefaultMarkdownGenerator",
|
||||
"params": {
|
||||
"content_filter": {
|
||||
"type": "PruningContentFilter",
|
||||
"params": {
|
||||
"threshold": 0.5, # Example param
|
||||
"threshold_type": "relative"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
try:
|
||||
print(f"Sending deep crawl request to server...")
|
||||
response = await async_client.post("/crawl", json=payload)
|
||||
print(f"Response status: {response.status_code}")
|
||||
|
||||
if response.status_code >= 400:
|
||||
error_detail = response.json().get('detail', 'No detail provided')
|
||||
print(f"Error detail: {error_detail}")
|
||||
print(f"Full response: {response.text}")
|
||||
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
except httpx.HTTPStatusError as e:
|
||||
print(f"Server error status: {e.response.status_code}")
|
||||
print(f"Server error response: {e.response.text}")
|
||||
try:
|
||||
error_json = e.response.json()
|
||||
print(f"Parsed error: {error_json}")
|
||||
except:
|
||||
print("Could not parse error response as JSON")
|
||||
raise
|
||||
|
||||
assert data["success"] is True
|
||||
assert len(data["results"]) == 1
|
||||
result = data["results"][0]
|
||||
await assert_crawl_result_structure(result)
|
||||
assert result["success"] is True
|
||||
assert "markdown" in result
|
||||
assert isinstance(result["markdown"], dict)
|
||||
assert "raw_markdown" in result["markdown"]
|
||||
assert "fit_markdown" in result["markdown"] # Pruning creates fit_markdown
|
||||
assert "Moby-Dick" in result["markdown"]["raw_markdown"]
|
||||
# Fit markdown content might be different/shorter due to pruning
|
||||
assert len(result["markdown"]["fit_markdown"]) <= len(result["markdown"]["raw_markdown"])
|
||||
|
||||
async def test_crawl_with_markdown_bm25_filter(self, async_client: httpx.AsyncClient):
|
||||
"""Test /crawl with MarkdownGenerator using BM25ContentFilter."""
|
||||
payload = {
|
||||
"urls": [SIMPLE_HTML_URL],
|
||||
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"markdown_generator": {
|
||||
"type": "DefaultMarkdownGenerator",
|
||||
"params": {
|
||||
"content_filter": {
|
||||
"type": "BM25ContentFilter",
|
||||
"params": {
|
||||
"user_query": "Herman Melville", # Query for BM25
|
||||
"bm25_threshold": 0.1, # Lower threshold to increase matches
|
||||
"language": "english" # Valid parameters
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
try:
|
||||
print(f"Payload for BM25 test: {json.dumps(payload)}")
|
||||
response = await async_client.post("/crawl", json=payload)
|
||||
print(f"Response status: {response.status_code}")
|
||||
|
||||
if response.status_code >= 400:
|
||||
error_detail = response.json().get('detail', 'No detail provided')
|
||||
print(f"Error detail: {error_detail}")
|
||||
print(f"Full response: {response.text}")
|
||||
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
except httpx.HTTPStatusError as e:
|
||||
print(f"Server error status: {e.response.status_code}")
|
||||
print(f"Server error response: {e.response.text}")
|
||||
try:
|
||||
error_json = e.response.json()
|
||||
print(f"Parsed error: {error_json}")
|
||||
except:
|
||||
print("Could not parse error response as JSON")
|
||||
raise
|
||||
|
||||
assert data["success"] is True
|
||||
assert len(data["results"]) == 1
|
||||
result = data["results"][0]
|
||||
await assert_crawl_result_structure(result)
|
||||
assert result["success"] is True
|
||||
assert "markdown" in result
|
||||
assert isinstance(result["markdown"], dict)
|
||||
assert "raw_markdown" in result["markdown"]
|
||||
assert "fit_markdown" in result["markdown"] # BM25 creates fit_markdown
|
||||
|
||||
# Print values for debug
|
||||
print(f"Raw markdown length: {len(result['markdown']['raw_markdown'])}")
|
||||
print(f"Fit markdown length: {len(result['markdown']['fit_markdown'])}")
|
||||
|
||||
# Either fit_markdown has content (possibly including our query terms)
|
||||
# or it might be empty if no good BM25 matches were found
|
||||
# Don't assert specific content since it can be environment-dependent
|
||||
|
||||
|
||||
# 4. Deep Crawling
|
||||
async def test_deep_crawl(self, async_client: httpx.AsyncClient):
|
||||
"""Test /crawl with a deep crawl strategy."""
|
||||
payload = {
|
||||
"urls": [DEEP_CRAWL_URL], # Start URL
|
||||
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"stream": False,
|
||||
"cache_mode": CacheMode.BYPASS.value,
|
||||
"deep_crawl_strategy": {
|
||||
"type": "BFSDeepCrawlStrategy",
|
||||
"params": {
|
||||
"max_depth": 1, # Limit depth for testing speed
|
||||
"max_pages": 5, # Limit pages to crawl
|
||||
"filter_chain": {
|
||||
"type": "FilterChain",
|
||||
"params": {
|
||||
"filters": [
|
||||
{
|
||||
"type": "ContentTypeFilter",
|
||||
"params": {"allowed_types": ["text/html"]}
|
||||
},
|
||||
{
|
||||
"type": "DomainFilter",
|
||||
"params": {"allowed_domains": ["python.org", "docs.python.org"]} # Include important subdomains
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"url_scorer": {
|
||||
"type": "CompositeScorer",
|
||||
"params": {
|
||||
"scorers": [
|
||||
{
|
||||
"type": "KeywordRelevanceScorer",
|
||||
"params": {"keywords": ["documentation", "tutorial"]}
|
||||
},
|
||||
{
|
||||
"type": "PathDepthScorer",
|
||||
"params": {"weight": 0.5, "optimal_depth": 2}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
try:
|
||||
print(f"Sending deep crawl request to server...")
|
||||
response = await async_client.post("/crawl", json=payload)
|
||||
print(f"Response status: {response.status_code}")
|
||||
|
||||
if response.status_code >= 400:
|
||||
error_detail = response.json().get('detail', 'No detail provided')
|
||||
print(f"Error detail: {error_detail}")
|
||||
print(f"Full response: {response.text}")
|
||||
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
except httpx.HTTPStatusError as e:
|
||||
print(f"Server error status: {e.response.status_code}")
|
||||
print(f"Server error response: {e.response.text}")
|
||||
try:
|
||||
error_json = e.response.json()
|
||||
print(f"Parsed error: {error_json}")
|
||||
except:
|
||||
print("Could not parse error response as JSON")
|
||||
raise
|
||||
|
||||
assert data["success"] is True
|
||||
assert isinstance(data["results"], list)
|
||||
# Expect more than 1 result due to deep crawl (start URL + crawled links)
|
||||
assert len(data["results"]) > 1
|
||||
assert len(data["results"]) <= 6 # Start URL + max_links=5
|
||||
|
||||
start_url_found = False
|
||||
crawled_urls_found = False
|
||||
for result in data["results"]:
|
||||
await assert_crawl_result_structure(result)
|
||||
assert result["success"] is True
|
||||
|
||||
# Print URL for debugging
|
||||
print(f"Crawled URL: {result['url']}")
|
||||
|
||||
# Allow URLs that contain python.org (including subdomains like docs.python.org)
|
||||
assert "python.org" in result["url"]
|
||||
if result["url"] == DEEP_CRAWL_URL:
|
||||
start_url_found = True
|
||||
else:
|
||||
crawled_urls_found = True
|
||||
|
||||
assert start_url_found
|
||||
assert crawled_urls_found
|
||||
|
||||
|
||||
# 5. Extraction without LLM (JSON/CSS)
|
||||
async def test_json_css_extraction(self, async_client: httpx.AsyncClient):
|
||||
"""Test /crawl with JsonCssExtractionStrategy."""
|
||||
payload = {
|
||||
"urls": [SCRAPE_TARGET_URL],
|
||||
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"cache_mode": CacheMode.BYPASS.value,
|
||||
"extraction_strategy": {
|
||||
"type": "JsonCssExtractionStrategy",
|
||||
"params": {
|
||||
"schema": {
|
||||
"type": "dict", # IMPORTANT: Wrap schema dict with type/value structure
|
||||
"value": {
|
||||
"name": "BookList",
|
||||
"baseSelector": "ol.row li.col-xs-6", # Select each book item
|
||||
"fields": [
|
||||
{"name": "title", "selector": "article.product_pod h3 a", "type": "attribute", "attribute": "title"},
|
||||
{"name": "price", "selector": "article.product_pod .price_color", "type": "text"},
|
||||
{"name": "rating", "selector": "article.product_pod p.star-rating", "type": "attribute", "attribute": "class"}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
try:
|
||||
print(f"Sending deep crawl request to server...")
|
||||
response = await async_client.post("/crawl", json=payload)
|
||||
print(f"Response status: {response.status_code}")
|
||||
|
||||
if response.status_code >= 400:
|
||||
error_detail = response.json().get('detail', 'No detail provided')
|
||||
print(f"Error detail: {error_detail}")
|
||||
print(f"Full response: {response.text}")
|
||||
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
except httpx.HTTPStatusError as e:
|
||||
print(f"Server error status: {e.response.status_code}")
|
||||
print(f"Server error response: {e.response.text}")
|
||||
try:
|
||||
error_json = e.response.json()
|
||||
print(f"Parsed error: {error_json}")
|
||||
except:
|
||||
print("Could not parse error response as JSON")
|
||||
raise
|
||||
|
||||
assert data["success"] is True
|
||||
assert len(data["results"]) == 1
|
||||
result = data["results"][0]
|
||||
await assert_crawl_result_structure(result)
|
||||
assert result["success"] is True
|
||||
assert "extracted_content" in result
|
||||
assert result["extracted_content"] is not None
|
||||
|
||||
# Extracted content should be a JSON string representing a list of dicts
|
||||
try:
|
||||
extracted_data = json.loads(result["extracted_content"])
|
||||
assert isinstance(extracted_data, list)
|
||||
assert len(extracted_data) > 0 # Should find some books
|
||||
# Check structure of the first extracted item
|
||||
first_item = extracted_data[0]
|
||||
assert "title" in first_item
|
||||
assert "price" in first_item
|
||||
assert "rating" in first_item
|
||||
assert "star-rating" in first_item["rating"] # e.g., "star-rating Three"
|
||||
except (json.JSONDecodeError, AssertionError) as e:
|
||||
pytest.fail(f"Extracted content parsing or validation failed: {e}\nContent: {result['extracted_content']}")
|
||||
|
||||
|
||||
# 6. Extraction with LLM
|
||||
async def test_llm_extraction(self, async_client: httpx.AsyncClient):
|
||||
"""
|
||||
Test /crawl with LLMExtractionStrategy.
|
||||
NOTE: Requires the server to have appropriate LLM API keys (e.g., OPENAI_API_KEY)
|
||||
configured via .llm.env or environment variables.
|
||||
This test uses the default provider configured in the server's config.yml.
|
||||
"""
|
||||
payload = {
|
||||
"urls": [SIMPLE_HTML_URL],
|
||||
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"cache_mode": CacheMode.BYPASS.value,
|
||||
"extraction_strategy": {
|
||||
"type": "LLMExtractionStrategy",
|
||||
"params": {
|
||||
"instruction": "Extract the main title and the author mentioned in the text into JSON.",
|
||||
# LLMConfig is implicitly defined by server's config.yml and .llm.env
|
||||
# If you needed to override provider/token PER REQUEST:
|
||||
"llm_config": {
|
||||
"type": "LLMConfig",
|
||||
"params": {
|
||||
"provider": "openai/gpt-4o", # Example override
|
||||
"api_token": os.getenv("OPENAI_API_KEY") # Example override
|
||||
}
|
||||
},
|
||||
"schema": { # Optional: Provide a schema for structured output
|
||||
"type": "dict", # IMPORTANT: Wrap schema dict
|
||||
"value": {
|
||||
"title": "Book Info",
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {"type": "string", "description": "The main title of the work"},
|
||||
"author": {"type": "string", "description": "The author of the work"}
|
||||
},
|
||||
"required": ["title", "author"]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
try:
|
||||
response = await async_client.post("/crawl", json=payload)
|
||||
response.raise_for_status() # Will raise if server returns 500 (e.g., bad API key)
|
||||
data = response.json()
|
||||
except httpx.HTTPStatusError as e:
|
||||
# Catch potential server errors (like 500 due to missing/invalid API keys)
|
||||
pytest.fail(f"LLM extraction request failed: {e}. Response: {e.response.text}. Check server logs and ensure API keys are correctly configured for the server.")
|
||||
except httpx.RequestError as e:
|
||||
pytest.fail(f"LLM extraction request failed: {e}.")
|
||||
|
||||
assert data["success"] is True
|
||||
assert len(data["results"]) == 1
|
||||
result = data["results"][0]
|
||||
await assert_crawl_result_structure(result)
|
||||
assert result["success"] is True
|
||||
assert "extracted_content" in result
|
||||
assert result["extracted_content"] is not None
|
||||
|
||||
# Extracted content should be JSON (because we provided a schema)
|
||||
try:
|
||||
extracted_data = json.loads(result["extracted_content"])
|
||||
print(f"\nLLM Extracted Data: {extracted_data}") # Print for verification
|
||||
|
||||
# Handle both dict and list formats (server returns a list)
|
||||
if isinstance(extracted_data, list):
|
||||
assert len(extracted_data) > 0
|
||||
extracted_item = extracted_data[0] # Take first item
|
||||
assert isinstance(extracted_item, dict)
|
||||
assert "title" in extracted_item
|
||||
assert "author" in extracted_item
|
||||
assert "Moby-Dick" in extracted_item.get("title", "")
|
||||
assert "Herman Melville" in extracted_item.get("author", "")
|
||||
else:
|
||||
assert isinstance(extracted_data, dict)
|
||||
assert "title" in extracted_data
|
||||
assert "author" in extracted_data
|
||||
assert "Moby-Dick" in extracted_data.get("title", "")
|
||||
assert "Herman Melville" in extracted_data.get("author", "")
|
||||
except (json.JSONDecodeError, AssertionError) as e:
|
||||
pytest.fail(f"LLM extracted content parsing or validation failed: {e}\nContent: {result['extracted_content']}")
|
||||
except Exception as e: # Catch any other unexpected error
|
||||
pytest.fail(f"An unexpected error occurred during LLM result processing: {e}\nContent: {result['extracted_content']}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Define arguments for pytest programmatically
|
||||
# -v: verbose output
|
||||
# -s: show print statements immediately (useful for debugging)
|
||||
# __file__: tells pytest to run tests in the current file
|
||||
pytest_args = ["-v", "-s", __file__]
|
||||
|
||||
# You can add more pytest arguments here if needed, for example:
|
||||
# '-k test_llm_extraction': Run only the LLM test function
|
||||
# pytest_args.append("-k test_llm_extraction")
|
||||
|
||||
print(f"Running pytest with args: {pytest_args}")
|
||||
|
||||
# Execute pytest
|
||||
exit_code = pytest.main(pytest_args)
|
||||
|
||||
print(f"Pytest finished with exit code: {exit_code}")
|
||||
335
tests/general/generate_dummy_site.py
Normal file
335
tests/general/generate_dummy_site.py
Normal file
@@ -0,0 +1,335 @@
|
||||
# ==== File: build_dummy_site.py ====
|
||||
|
||||
import os
|
||||
import random
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from urllib.parse import quote
|
||||
|
||||
# --- Configuration ---
|
||||
NUM_CATEGORIES = 3
|
||||
NUM_SUBCATEGORIES_PER_CAT = 2 # Results in NUM_CATEGORIES * NUM_SUBCATEGORIES_PER_CAT total L2 categories
|
||||
NUM_PRODUCTS_PER_SUBCAT = 5 # Products listed on L3 pages
|
||||
MAX_DEPTH_TARGET = 5 # Explicitly set target depth
|
||||
|
||||
# --- Helper Functions ---
|
||||
|
||||
def generate_lorem(words=20):
|
||||
"""Generates simple placeholder text."""
|
||||
lorem_words = ["lorem", "ipsum", "dolor", "sit", "amet", "consectetur",
|
||||
"adipiscing", "elit", "sed", "do", "eiusmod", "tempor",
|
||||
"incididunt", "ut", "labore", "et", "dolore", "magna", "aliqua"]
|
||||
return " ".join(random.choice(lorem_words) for _ in range(words)).capitalize() + "."
|
||||
|
||||
def create_html_page(filepath: Path, title: str, body_content: str, breadcrumbs: list = [], head_extras: str = ""):
|
||||
"""Creates an HTML file with basic structure and inline CSS."""
|
||||
os.makedirs(filepath.parent, exist_ok=True)
|
||||
|
||||
# Generate breadcrumb HTML using the 'link' provided in the breadcrumbs list
|
||||
breadcrumb_html = ""
|
||||
if breadcrumbs:
|
||||
links_html = " » ".join(f'<a href="{bc["link"]}">{bc["name"]}</a>' for bc in breadcrumbs)
|
||||
breadcrumb_html = f"<nav class='breadcrumbs'>{links_html} » {title}</nav>"
|
||||
|
||||
# Basic CSS for structure identification (kept the same)
|
||||
css = """
|
||||
<style>
|
||||
body {
|
||||
font-family: sans-serif;
|
||||
padding: 20px;
|
||||
background-color: #1e1e1e;
|
||||
color: #d1d1d1;
|
||||
}
|
||||
|
||||
.container {
|
||||
max-width: 960px;
|
||||
margin: auto;
|
||||
background: #2c2c2c;
|
||||
padding: 20px;
|
||||
border-radius: 5px;
|
||||
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.5);
|
||||
}
|
||||
|
||||
h1, h2 {
|
||||
color: #ccc;
|
||||
}
|
||||
|
||||
a {
|
||||
color: #9bcdff;
|
||||
text-decoration: none;
|
||||
}
|
||||
|
||||
a:hover {
|
||||
text-decoration: underline;
|
||||
}
|
||||
|
||||
ul {
|
||||
list-style: none;
|
||||
padding-left: 0;
|
||||
}
|
||||
|
||||
li {
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
|
||||
.category-link,
|
||||
.subcategory-link,
|
||||
.product-link,
|
||||
.details-link,
|
||||
.reviews-link {
|
||||
display: block;
|
||||
padding: 8px;
|
||||
background-color: #3a3a3a;
|
||||
border-radius: 3px;
|
||||
}
|
||||
|
||||
.product-preview {
|
||||
border: 1px solid #444;
|
||||
padding: 10px;
|
||||
margin-bottom: 10px;
|
||||
border-radius: 4px;
|
||||
background-color: #2a2a2a;
|
||||
}
|
||||
|
||||
.product-title {
|
||||
color: #d1d1d1;
|
||||
}
|
||||
|
||||
.product-price {
|
||||
font-weight: bold;
|
||||
color: #85e085;
|
||||
}
|
||||
|
||||
.product-description,
|
||||
.product-specs,
|
||||
.product-reviews {
|
||||
margin-top: 15px;
|
||||
line-height: 1.6;
|
||||
}
|
||||
|
||||
.product-specs li {
|
||||
margin-bottom: 5px;
|
||||
font-size: 0.9em;
|
||||
}
|
||||
|
||||
.spec-name {
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
.breadcrumbs {
|
||||
margin-bottom: 20px;
|
||||
font-size: 0.9em;
|
||||
color: #888;
|
||||
}
|
||||
|
||||
.breadcrumbs a {
|
||||
color: #9bcdff;
|
||||
}
|
||||
</style>
|
||||
"""
|
||||
html_content = f"""<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>{title} - FakeShop</title>
|
||||
{head_extras}
|
||||
{css}
|
||||
</head>
|
||||
<body>
|
||||
<div class="container">
|
||||
{breadcrumb_html}
|
||||
<h1>{title}</h1>
|
||||
{body_content}
|
||||
</div>
|
||||
</body>
|
||||
</html>"""
|
||||
with open(filepath, "w", encoding="utf-8") as f:
|
||||
f.write(html_content)
|
||||
# Keep print statement concise for clarity
|
||||
# print(f"Created: {filepath}")
|
||||
|
||||
def generate_site(base_dir: Path, site_name: str = "FakeShop", base_path: str = ""):
|
||||
"""Generates the dummy website structure."""
|
||||
base_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# --- Clean and prepare the base path for URL construction ---
|
||||
# Ensure it starts with '/' if not empty, and remove any trailing '/'
|
||||
if base_path:
|
||||
full_base_path = "/" + base_path.strip('/')
|
||||
else:
|
||||
full_base_path = "" # Represents the root
|
||||
|
||||
print(f"Using base path for links: '{full_base_path}'")
|
||||
|
||||
# --- Level 0: Homepage ---
|
||||
home_body = "<h2>Welcome to FakeShop!</h2><p>Your one-stop shop for imaginary items.</p><h3>Categories:</h3>\n<ul>"
|
||||
# Define the *actual* link path for the homepage breadcrumb
|
||||
home_link_path = f"{full_base_path}/index.html"
|
||||
breadcrumbs_home = [{"name": "Home", "link": home_link_path}] # Base breadcrumb
|
||||
|
||||
# Links *within* the page content should remain relative
|
||||
for i in range(NUM_CATEGORIES):
|
||||
cat_name = f"Category-{i+1}"
|
||||
cat_folder_name = quote(cat_name.lower().replace(" ", "-"))
|
||||
# This path is relative to the current directory (index.html)
|
||||
cat_relative_page_path = f"{cat_folder_name}/index.html"
|
||||
home_body += f'<li><a class="category-link" href="{cat_relative_page_path}">{cat_name}</a> - {generate_lorem(10)}</li>'
|
||||
home_body += "</ul>"
|
||||
create_html_page(base_dir / "index.html", "Homepage", home_body, []) # No breadcrumbs *on* the homepage itself
|
||||
|
||||
# --- Levels 1-5 ---
|
||||
for i in range(NUM_CATEGORIES):
|
||||
cat_name = f"Category-{i+1}"
|
||||
cat_folder_name = quote(cat_name.lower().replace(" ", "-"))
|
||||
cat_dir = base_dir / cat_folder_name
|
||||
# This is the *absolute* path for the breadcrumb link
|
||||
cat_link_path = f"{full_base_path}/{cat_folder_name}/index.html"
|
||||
# Update breadcrumbs list for this level
|
||||
breadcrumbs_cat = breadcrumbs_home + [{"name": cat_name, "link": cat_link_path}]
|
||||
|
||||
# --- Level 1: Category Page ---
|
||||
cat_body = f"<p>{generate_lorem(15)} for {cat_name}.</p><h3>Sub-Categories:</h3>\n<ul>"
|
||||
for j in range(NUM_SUBCATEGORIES_PER_CAT):
|
||||
subcat_name = f"{cat_name}-Sub-{j+1}"
|
||||
subcat_folder_name = quote(subcat_name.lower().replace(" ", "-"))
|
||||
# Path relative to the category page
|
||||
subcat_relative_page_path = f"{subcat_folder_name}/index.html"
|
||||
cat_body += f'<li><a class="subcategory-link" href="{subcat_relative_page_path}">{subcat_name}</a> - {generate_lorem(8)}</li>'
|
||||
cat_body += "</ul>"
|
||||
# Pass the updated breadcrumbs list
|
||||
create_html_page(cat_dir / "index.html", cat_name, cat_body, breadcrumbs_home) # Parent breadcrumb needed here
|
||||
|
||||
for j in range(NUM_SUBCATEGORIES_PER_CAT):
|
||||
subcat_name = f"{cat_name}-Sub-{j+1}"
|
||||
subcat_folder_name = quote(subcat_name.lower().replace(" ", "-"))
|
||||
subcat_dir = cat_dir / subcat_folder_name
|
||||
# Absolute path for the breadcrumb link
|
||||
subcat_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/index.html"
|
||||
# Update breadcrumbs list for this level
|
||||
breadcrumbs_subcat = breadcrumbs_cat + [{"name": subcat_name, "link": subcat_link_path}]
|
||||
|
||||
# --- Level 2: Sub-Category Page (Product List) ---
|
||||
subcat_body = f"<p>Explore products in {subcat_name}. {generate_lorem(12)}</p><h3>Products:</h3>\n<ul class='product-list'>"
|
||||
for k in range(NUM_PRODUCTS_PER_SUBCAT):
|
||||
prod_id = f"P{i+1}{j+1}{k+1:03d}" # e.g., P11001
|
||||
prod_name = f"{subcat_name} Product {k+1} ({prod_id})"
|
||||
# Filename relative to the subcategory page
|
||||
prod_filename = f"product_{prod_id}.html"
|
||||
# Absolute path for the breadcrumb link
|
||||
prod_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/{prod_filename}"
|
||||
|
||||
# Preview on list page (link remains relative)
|
||||
subcat_body += f"""
|
||||
<li>
|
||||
<div class="product-preview">
|
||||
<a class="product-link" href="{prod_filename}"><strong>{prod_name}</strong></a>
|
||||
<p>{generate_lorem(10)}</p>
|
||||
<span class="product-price">£{random.uniform(10, 500):.2f}</span>
|
||||
</div>
|
||||
</li>"""
|
||||
|
||||
# --- Level 3: Product Page ---
|
||||
prod_price = random.uniform(10, 500)
|
||||
prod_desc = generate_lorem(40)
|
||||
prod_specs = {f"Spec {s+1}": generate_lorem(3) for s in range(random.randint(3,6))}
|
||||
prod_reviews_count = random.randint(0, 150)
|
||||
# Relative filenames for links on this page
|
||||
details_filename_relative = f"product_{prod_id}_details.html"
|
||||
reviews_filename_relative = f"product_{prod_id}_reviews.html"
|
||||
|
||||
prod_body = f"""
|
||||
<p class="product-price">Price: £{prod_price:.2f}</p>
|
||||
<div class="product-description">
|
||||
<h2>Description</h2>
|
||||
<p>{prod_desc}</p>
|
||||
</div>
|
||||
<div class="product-specs">
|
||||
<h2>Specifications</h2>
|
||||
<ul>
|
||||
{''.join(f'<li><span class="spec-name">{name}</span>: <span class="spec-value">{value}</span></li>' for name, value in prod_specs.items())}
|
||||
</ul>
|
||||
</div>
|
||||
<div class="product-reviews">
|
||||
<h2>Reviews</h2>
|
||||
<p>Total Reviews: <span class="review-count">{prod_reviews_count}</span></p>
|
||||
</div>
|
||||
<hr>
|
||||
<p>
|
||||
<a class="details-link" href="{details_filename_relative}">View More Details</a> |
|
||||
<a class="reviews-link" href="{reviews_filename_relative}">See All Reviews</a>
|
||||
</p>
|
||||
"""
|
||||
# Update breadcrumbs list for this level
|
||||
breadcrumbs_prod = breadcrumbs_subcat + [{"name": prod_name, "link": prod_link_path}]
|
||||
# Pass the updated breadcrumbs list
|
||||
create_html_page(subcat_dir / prod_filename, prod_name, prod_body, breadcrumbs_subcat) # Parent breadcrumb needed here
|
||||
|
||||
# --- Level 4: Product Details Page ---
|
||||
details_filename = f"product_{prod_id}_details.html" # Actual filename
|
||||
# Absolute path for the breadcrumb link
|
||||
details_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/{details_filename}"
|
||||
details_body = f"<p>This page contains extremely detailed information about {prod_name}.</p>{generate_lorem(100)}"
|
||||
# Update breadcrumbs list for this level
|
||||
breadcrumbs_details = breadcrumbs_prod + [{"name": "Details", "link": details_link_path}]
|
||||
# Pass the updated breadcrumbs list
|
||||
create_html_page(subcat_dir / details_filename, f"{prod_name} - Details", details_body, breadcrumbs_prod) # Parent breadcrumb needed here
|
||||
|
||||
# --- Level 5: Product Reviews Page ---
|
||||
reviews_filename = f"product_{prod_id}_reviews.html" # Actual filename
|
||||
# Absolute path for the breadcrumb link
|
||||
reviews_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/{reviews_filename}"
|
||||
reviews_body = f"<p>All {prod_reviews_count} reviews for {prod_name} are listed here.</p><ul>"
|
||||
for r in range(prod_reviews_count):
|
||||
reviews_body += f"<li>Review {r+1}: {generate_lorem(random.randint(15, 50))}</li>"
|
||||
reviews_body += "</ul>"
|
||||
# Update breadcrumbs list for this level
|
||||
breadcrumbs_reviews = breadcrumbs_prod + [{"name": "Reviews", "link": reviews_link_path}]
|
||||
# Pass the updated breadcrumbs list
|
||||
create_html_page(subcat_dir / reviews_filename, f"{prod_name} - Reviews", reviews_body, breadcrumbs_prod) # Parent breadcrumb needed here
|
||||
|
||||
|
||||
subcat_body += "</ul>" # Close product-list ul
|
||||
# Pass the correct breadcrumbs list for the subcategory index page
|
||||
create_html_page(subcat_dir / "index.html", subcat_name, subcat_body, breadcrumbs_cat) # Parent breadcrumb needed here
|
||||
|
||||
|
||||
# --- Main Execution ---
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Generate a dummy multi-level retail website.")
|
||||
parser.add_argument(
|
||||
"-o", "--output-dir",
|
||||
type=str,
|
||||
default="dummy_retail_site",
|
||||
help="Directory to generate the website in."
|
||||
)
|
||||
parser.add_argument(
|
||||
"-n", "--site-name",
|
||||
type=str,
|
||||
default="FakeShop",
|
||||
help="Name of the fake shop."
|
||||
)
|
||||
parser.add_argument(
|
||||
"-b", "--base-path",
|
||||
type=str,
|
||||
default="",
|
||||
help="Base path for hosting the site (e.g., 'samples/deepcrawl'). Leave empty if hosted at the root."
|
||||
)
|
||||
# Optional: Add more args to configure counts if needed
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
output_directory = Path(args.output_dir)
|
||||
site_name = args.site_name
|
||||
base_path = args.base_path
|
||||
|
||||
print(f"Generating dummy site '{site_name}' in '{output_directory}'...")
|
||||
# Pass the base_path to the generation function
|
||||
generate_site(output_directory, site_name, base_path)
|
||||
print(f"\nCreated {sum(1 for _ in output_directory.rglob('*.html'))} HTML pages.")
|
||||
print("Dummy site generation complete.")
|
||||
print(f"To serve locally (example): python -m http.server --directory {output_directory} 8000")
|
||||
if base_path:
|
||||
print(f"Access the site at: http://localhost:8000/{base_path.strip('/')}/index.html")
|
||||
else:
|
||||
print(f"Access the site at: http://localhost:8000/index.html")
|
||||
Reference in New Issue
Block a user