Merge branch 'next' into 2025-MAR-ALPHA-1
This commit is contained in:
47
Dockerfile
47
Dockerfile
@@ -24,7 +24,7 @@ ARG TARGETARCH
|
|||||||
|
|
||||||
LABEL maintainer="unclecode"
|
LABEL maintainer="unclecode"
|
||||||
LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
|
LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
|
||||||
LABEL version="1.0"
|
LABEL version="1.0"
|
||||||
|
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
build-essential \
|
build-essential \
|
||||||
@@ -38,6 +38,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
|||||||
libjpeg-dev \
|
libjpeg-dev \
|
||||||
redis-server \
|
redis-server \
|
||||||
supervisor \
|
supervisor \
|
||||||
|
&& apt-get clean \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
@@ -62,11 +63,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
|||||||
libcairo2 \
|
libcairo2 \
|
||||||
libasound2 \
|
libasound2 \
|
||||||
libatspi2.0-0 \
|
libatspi2.0-0 \
|
||||||
|
&& apt-get clean \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETARCH" = "amd64" ] ; then \
|
RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETARCH" = "amd64" ] ; then \
|
||||||
apt-get update && apt-get install -y --no-install-recommends \
|
apt-get update && apt-get install -y --no-install-recommends \
|
||||||
nvidia-cuda-toolkit \
|
nvidia-cuda-toolkit \
|
||||||
|
&& apt-get clean \
|
||||||
&& rm -rf /var/lib/apt/lists/* ; \
|
&& rm -rf /var/lib/apt/lists/* ; \
|
||||||
else \
|
else \
|
||||||
echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \
|
echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \
|
||||||
@@ -76,16 +79,24 @@ RUN if [ "$TARGETARCH" = "arm64" ]; then \
|
|||||||
echo "🦾 Installing ARM-specific optimizations"; \
|
echo "🦾 Installing ARM-specific optimizations"; \
|
||||||
apt-get update && apt-get install -y --no-install-recommends \
|
apt-get update && apt-get install -y --no-install-recommends \
|
||||||
libopenblas-dev \
|
libopenblas-dev \
|
||||||
|
&& apt-get clean \
|
||||||
&& rm -rf /var/lib/apt/lists/*; \
|
&& rm -rf /var/lib/apt/lists/*; \
|
||||||
elif [ "$TARGETARCH" = "amd64" ]; then \
|
elif [ "$TARGETARCH" = "amd64" ]; then \
|
||||||
echo "🖥️ Installing AMD64-specific optimizations"; \
|
echo "🖥️ Installing AMD64-specific optimizations"; \
|
||||||
apt-get update && apt-get install -y --no-install-recommends \
|
apt-get update && apt-get install -y --no-install-recommends \
|
||||||
libomp-dev \
|
libomp-dev \
|
||||||
|
&& apt-get clean \
|
||||||
&& rm -rf /var/lib/apt/lists/*; \
|
&& rm -rf /var/lib/apt/lists/*; \
|
||||||
else \
|
else \
|
||||||
echo "Skipping platform-specific optimizations (unsupported platform)"; \
|
echo "Skipping platform-specific optimizations (unsupported platform)"; \
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Create a non-root user and group
|
||||||
|
RUN groupadd -r appuser && useradd --no-log-init -r -g appuser appuser
|
||||||
|
|
||||||
|
# Create and set permissions for appuser home directory
|
||||||
|
RUN mkdir -p /home/appuser && chown -R appuser:appuser /home/appuser
|
||||||
|
|
||||||
WORKDIR ${APP_HOME}
|
WORKDIR ${APP_HOME}
|
||||||
|
|
||||||
RUN echo '#!/bin/bash\n\
|
RUN echo '#!/bin/bash\n\
|
||||||
@@ -103,6 +114,7 @@ fi' > /tmp/install.sh && chmod +x /tmp/install.sh
|
|||||||
|
|
||||||
COPY . /tmp/project/
|
COPY . /tmp/project/
|
||||||
|
|
||||||
|
# Copy supervisor config first (might need root later, but okay for now)
|
||||||
COPY deploy/docker/supervisord.conf .
|
COPY deploy/docker/supervisord.conf .
|
||||||
|
|
||||||
COPY deploy/docker/requirements.txt .
|
COPY deploy/docker/requirements.txt .
|
||||||
@@ -131,16 +143,31 @@ RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
|
|||||||
else \
|
else \
|
||||||
pip install "/tmp/project" ; \
|
pip install "/tmp/project" ; \
|
||||||
fi
|
fi
|
||||||
|
|
||||||
RUN pip install --no-cache-dir --upgrade pip && \
|
RUN pip install --no-cache-dir --upgrade pip && \
|
||||||
/tmp/install.sh && \
|
/tmp/install.sh && \
|
||||||
python -c "import crawl4ai; print('✅ crawl4ai is ready to rock!')" && \
|
python -c "import crawl4ai; print('✅ crawl4ai is ready to rock!')" && \
|
||||||
python -c "from playwright.sync_api import sync_playwright; print('✅ Playwright is feeling dramatic!')"
|
python -c "from playwright.sync_api import sync_playwright; print('✅ Playwright is feeling dramatic!')"
|
||||||
|
|
||||||
RUN playwright install --with-deps chromium
|
|
||||||
|
|
||||||
|
RUN crawl4ai-setup
|
||||||
|
|
||||||
|
RUN playwright install --with-deps
|
||||||
|
|
||||||
|
RUN mkdir -p /home/appuser/.cache/ms-playwright \
|
||||||
|
&& cp -r /root/.cache/ms-playwright/chromium-* /home/appuser/.cache/ms-playwright/ \
|
||||||
|
&& chown -R appuser:appuser /home/appuser/.cache/ms-playwright
|
||||||
|
|
||||||
|
RUN crawl4ai-doctor
|
||||||
|
|
||||||
|
# Copy application code
|
||||||
COPY deploy/docker/* ${APP_HOME}/
|
COPY deploy/docker/* ${APP_HOME}/
|
||||||
|
|
||||||
|
# Change ownership of the application directory to the non-root user
|
||||||
|
RUN chown -R appuser:appuser ${APP_HOME}
|
||||||
|
|
||||||
|
# give permissions to redis persistence dirs if used
|
||||||
|
RUN mkdir -p /var/lib/redis /var/log/redis && chown -R appuser:appuser /var/lib/redis /var/log/redis
|
||||||
|
|
||||||
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
||||||
CMD bash -c '\
|
CMD bash -c '\
|
||||||
MEM=$(free -m | awk "/^Mem:/{print \$2}"); \
|
MEM=$(free -m | awk "/^Mem:/{print \$2}"); \
|
||||||
@@ -149,8 +176,14 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
|||||||
exit 1; \
|
exit 1; \
|
||||||
fi && \
|
fi && \
|
||||||
redis-cli ping > /dev/null && \
|
redis-cli ping > /dev/null && \
|
||||||
curl -f http://localhost:8000/health || exit 1'
|
curl -f http://localhost:11235/health || exit 1'
|
||||||
|
|
||||||
EXPOSE 6379
|
EXPOSE 6379
|
||||||
CMD ["supervisord", "-c", "supervisord.conf"]
|
# Switch to the non-root user before starting the application
|
||||||
|
USER appuser
|
||||||
|
|
||||||
|
# Set environment variables to ptoduction
|
||||||
|
ENV PYTHON_ENV=production
|
||||||
|
|
||||||
|
# Start the application using supervisord
|
||||||
|
CMD ["supervisord", "-c", "supervisord.conf"]
|
||||||
@@ -2,7 +2,7 @@
|
|||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
from .async_webcrawler import AsyncWebCrawler, CacheMode
|
from .async_webcrawler import AsyncWebCrawler, CacheMode
|
||||||
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig
|
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig
|
||||||
|
|
||||||
from .content_scraping_strategy import (
|
from .content_scraping_strategy import (
|
||||||
ContentScrapingStrategy,
|
ContentScrapingStrategy,
|
||||||
@@ -121,6 +121,7 @@ __all__ = [
|
|||||||
"Crawl4aiDockerClient",
|
"Crawl4aiDockerClient",
|
||||||
"ProxyRotationStrategy",
|
"ProxyRotationStrategy",
|
||||||
"RoundRobinProxyStrategy",
|
"RoundRobinProxyStrategy",
|
||||||
|
"ProxyConfig"
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ from .config import (
|
|||||||
MIN_WORD_THRESHOLD,
|
MIN_WORD_THRESHOLD,
|
||||||
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
|
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
|
||||||
PROVIDER_MODELS,
|
PROVIDER_MODELS,
|
||||||
|
PROVIDER_MODELS_PREFIXES,
|
||||||
SCREENSHOT_HEIGHT_TRESHOLD,
|
SCREENSHOT_HEIGHT_TRESHOLD,
|
||||||
PAGE_TIMEOUT,
|
PAGE_TIMEOUT,
|
||||||
IMAGE_SCORE_THRESHOLD,
|
IMAGE_SCORE_THRESHOLD,
|
||||||
@@ -27,11 +28,8 @@ import inspect
|
|||||||
from typing import Any, Dict, Optional
|
from typing import Any, Dict, Optional
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
|
|
||||||
from .proxy_strategy import ProxyConfig
|
# from .proxy_strategy import ProxyConfig
|
||||||
try:
|
|
||||||
from .browser.models import DockerConfig
|
|
||||||
except ImportError:
|
|
||||||
DockerConfig = None
|
|
||||||
|
|
||||||
|
|
||||||
def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
|
def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
|
||||||
@@ -122,23 +120,25 @@ def from_serializable_dict(data: Any) -> Any:
|
|||||||
# Handle typed data
|
# Handle typed data
|
||||||
if isinstance(data, dict) and "type" in data:
|
if isinstance(data, dict) and "type" in data:
|
||||||
# Handle plain dictionaries
|
# Handle plain dictionaries
|
||||||
if data["type"] == "dict":
|
if data["type"] == "dict" and "value" in data:
|
||||||
return {k: from_serializable_dict(v) for k, v in data["value"].items()}
|
return {k: from_serializable_dict(v) for k, v in data["value"].items()}
|
||||||
|
|
||||||
# Import from crawl4ai for class instances
|
# Import from crawl4ai for class instances
|
||||||
import crawl4ai
|
import crawl4ai
|
||||||
|
|
||||||
cls = getattr(crawl4ai, data["type"])
|
if hasattr(crawl4ai, data["type"]):
|
||||||
|
cls = getattr(crawl4ai, data["type"])
|
||||||
|
|
||||||
# Handle Enum
|
# Handle Enum
|
||||||
if issubclass(cls, Enum):
|
if issubclass(cls, Enum):
|
||||||
return cls(data["params"])
|
return cls(data["params"])
|
||||||
|
|
||||||
# Handle class instances
|
if "params" in data:
|
||||||
constructor_args = {
|
# Handle class instances
|
||||||
k: from_serializable_dict(v) for k, v in data["params"].items()
|
constructor_args = {
|
||||||
}
|
k: from_serializable_dict(v) for k, v in data["params"].items()
|
||||||
return cls(**constructor_args)
|
}
|
||||||
|
return cls(**constructor_args)
|
||||||
|
|
||||||
# Handle lists
|
# Handle lists
|
||||||
if isinstance(data, list):
|
if isinstance(data, list):
|
||||||
@@ -159,6 +159,117 @@ def is_empty_value(value: Any) -> bool:
|
|||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
class ProxyConfig:
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
server: str,
|
||||||
|
username: Optional[str] = None,
|
||||||
|
password: Optional[str] = None,
|
||||||
|
ip: Optional[str] = None,
|
||||||
|
):
|
||||||
|
"""Configuration class for a single proxy.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
server: Proxy server URL (e.g., "http://127.0.0.1:8080")
|
||||||
|
username: Optional username for proxy authentication
|
||||||
|
password: Optional password for proxy authentication
|
||||||
|
ip: Optional IP address for verification purposes
|
||||||
|
"""
|
||||||
|
self.server = server
|
||||||
|
self.username = username
|
||||||
|
self.password = password
|
||||||
|
|
||||||
|
# Extract IP from server if not explicitly provided
|
||||||
|
self.ip = ip or self._extract_ip_from_server()
|
||||||
|
|
||||||
|
def _extract_ip_from_server(self) -> Optional[str]:
|
||||||
|
"""Extract IP address from server URL."""
|
||||||
|
try:
|
||||||
|
# Simple extraction assuming http://ip:port format
|
||||||
|
if "://" in self.server:
|
||||||
|
parts = self.server.split("://")[1].split(":")
|
||||||
|
return parts[0]
|
||||||
|
else:
|
||||||
|
parts = self.server.split(":")
|
||||||
|
return parts[0]
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def from_string(proxy_str: str) -> "ProxyConfig":
|
||||||
|
"""Create a ProxyConfig from a string in the format 'ip:port:username:password'."""
|
||||||
|
parts = proxy_str.split(":")
|
||||||
|
if len(parts) == 4: # ip:port:username:password
|
||||||
|
ip, port, username, password = parts
|
||||||
|
return ProxyConfig(
|
||||||
|
server=f"http://{ip}:{port}",
|
||||||
|
username=username,
|
||||||
|
password=password,
|
||||||
|
ip=ip
|
||||||
|
)
|
||||||
|
elif len(parts) == 2: # ip:port only
|
||||||
|
ip, port = parts
|
||||||
|
return ProxyConfig(
|
||||||
|
server=f"http://{ip}:{port}",
|
||||||
|
ip=ip
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Invalid proxy string format: {proxy_str}")
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def from_dict(proxy_dict: Dict) -> "ProxyConfig":
|
||||||
|
"""Create a ProxyConfig from a dictionary."""
|
||||||
|
return ProxyConfig(
|
||||||
|
server=proxy_dict.get("server"),
|
||||||
|
username=proxy_dict.get("username"),
|
||||||
|
password=proxy_dict.get("password"),
|
||||||
|
ip=proxy_dict.get("ip")
|
||||||
|
)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def from_env(env_var: str = "PROXIES") -> List["ProxyConfig"]:
|
||||||
|
"""Load proxies from environment variable.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
env_var: Name of environment variable containing comma-separated proxy strings
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of ProxyConfig objects
|
||||||
|
"""
|
||||||
|
proxies = []
|
||||||
|
try:
|
||||||
|
proxy_list = os.getenv(env_var, "").split(",")
|
||||||
|
for proxy in proxy_list:
|
||||||
|
if not proxy:
|
||||||
|
continue
|
||||||
|
proxies.append(ProxyConfig.from_string(proxy))
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error loading proxies from environment: {e}")
|
||||||
|
return proxies
|
||||||
|
|
||||||
|
def to_dict(self) -> Dict:
|
||||||
|
"""Convert to dictionary representation."""
|
||||||
|
return {
|
||||||
|
"server": self.server,
|
||||||
|
"username": self.username,
|
||||||
|
"password": self.password,
|
||||||
|
"ip": self.ip
|
||||||
|
}
|
||||||
|
|
||||||
|
def clone(self, **kwargs) -> "ProxyConfig":
|
||||||
|
"""Create a copy of this configuration with updated values.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
**kwargs: Key-value pairs of configuration options to update
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
ProxyConfig: A new instance with the specified updates
|
||||||
|
"""
|
||||||
|
config_dict = self.to_dict()
|
||||||
|
config_dict.update(kwargs)
|
||||||
|
return ProxyConfig.from_dict(config_dict)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class BrowserConfig:
|
class BrowserConfig:
|
||||||
"""
|
"""
|
||||||
@@ -195,8 +306,6 @@ class BrowserConfig:
|
|||||||
Default: None.
|
Default: None.
|
||||||
proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
|
proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
|
||||||
If None, no additional proxy config. Default: None.
|
If None, no additional proxy config. Default: None.
|
||||||
docker_config (DockerConfig or dict or None): Configuration for Docker-based browser automation.
|
|
||||||
Contains settings for Docker container operation. Default: None.
|
|
||||||
viewport_width (int): Default viewport width for pages. Default: 1080.
|
viewport_width (int): Default viewport width for pages. Default: 1080.
|
||||||
viewport_height (int): Default viewport height for pages. Default: 600.
|
viewport_height (int): Default viewport height for pages. Default: 600.
|
||||||
viewport (dict): Default viewport dimensions for pages. If set, overrides viewport_width and viewport_height.
|
viewport (dict): Default viewport dimensions for pages. If set, overrides viewport_width and viewport_height.
|
||||||
@@ -242,7 +351,6 @@ class BrowserConfig:
|
|||||||
channel: str = "chromium",
|
channel: str = "chromium",
|
||||||
proxy: str = None,
|
proxy: str = None,
|
||||||
proxy_config: Union[ProxyConfig, dict, None] = None,
|
proxy_config: Union[ProxyConfig, dict, None] = None,
|
||||||
docker_config: Union[DockerConfig, dict, None] = None,
|
|
||||||
viewport_width: int = 1080,
|
viewport_width: int = 1080,
|
||||||
viewport_height: int = 600,
|
viewport_height: int = 600,
|
||||||
viewport: dict = None,
|
viewport: dict = None,
|
||||||
@@ -283,15 +391,7 @@ class BrowserConfig:
|
|||||||
self.chrome_channel = ""
|
self.chrome_channel = ""
|
||||||
self.proxy = proxy
|
self.proxy = proxy
|
||||||
self.proxy_config = proxy_config
|
self.proxy_config = proxy_config
|
||||||
|
|
||||||
# Handle docker configuration
|
|
||||||
if isinstance(docker_config, dict) and DockerConfig is not None:
|
|
||||||
self.docker_config = DockerConfig.from_kwargs(docker_config)
|
|
||||||
else:
|
|
||||||
self.docker_config = docker_config
|
|
||||||
|
|
||||||
if self.docker_config:
|
|
||||||
self.user_data_dir = self.docker_config.user_data_dir
|
|
||||||
|
|
||||||
self.viewport_width = viewport_width
|
self.viewport_width = viewport_width
|
||||||
self.viewport_height = viewport_height
|
self.viewport_height = viewport_height
|
||||||
@@ -362,7 +462,6 @@ class BrowserConfig:
|
|||||||
channel=kwargs.get("channel", "chromium"),
|
channel=kwargs.get("channel", "chromium"),
|
||||||
proxy=kwargs.get("proxy"),
|
proxy=kwargs.get("proxy"),
|
||||||
proxy_config=kwargs.get("proxy_config", None),
|
proxy_config=kwargs.get("proxy_config", None),
|
||||||
docker_config=kwargs.get("docker_config", None),
|
|
||||||
viewport_width=kwargs.get("viewport_width", 1080),
|
viewport_width=kwargs.get("viewport_width", 1080),
|
||||||
viewport_height=kwargs.get("viewport_height", 600),
|
viewport_height=kwargs.get("viewport_height", 600),
|
||||||
accept_downloads=kwargs.get("accept_downloads", False),
|
accept_downloads=kwargs.get("accept_downloads", False),
|
||||||
@@ -419,13 +518,7 @@ class BrowserConfig:
|
|||||||
"debugging_port": self.debugging_port,
|
"debugging_port": self.debugging_port,
|
||||||
"host": self.host,
|
"host": self.host,
|
||||||
}
|
}
|
||||||
|
|
||||||
# Include docker_config if it exists
|
|
||||||
if hasattr(self, "docker_config") and self.docker_config is not None:
|
|
||||||
if hasattr(self.docker_config, "to_dict"):
|
|
||||||
result["docker_config"] = self.docker_config.to_dict()
|
|
||||||
else:
|
|
||||||
result["docker_config"] = self.docker_config
|
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
@@ -1178,9 +1271,18 @@ class LLMConfig:
|
|||||||
elif api_token and api_token.startswith("env:"):
|
elif api_token and api_token.startswith("env:"):
|
||||||
self.api_token = os.getenv(api_token[4:])
|
self.api_token = os.getenv(api_token[4:])
|
||||||
else:
|
else:
|
||||||
self.api_token = PROVIDER_MODELS.get(provider, "no-token") or os.getenv(
|
# Check if given provider starts with any of key in PROVIDER_MODELS_PREFIXES
|
||||||
DEFAULT_PROVIDER_API_KEY
|
# If not, check if it is in PROVIDER_MODELS
|
||||||
)
|
prefixes = PROVIDER_MODELS_PREFIXES.keys()
|
||||||
|
if any(provider.startswith(prefix) for prefix in prefixes):
|
||||||
|
selected_prefix = next(
|
||||||
|
(prefix for prefix in prefixes if provider.startswith(prefix)),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
self.api_token = PROVIDER_MODELS_PREFIXES.get(selected_prefix)
|
||||||
|
else:
|
||||||
|
self.provider = DEFAULT_PROVIDER
|
||||||
|
self.api_token = os.getenv(DEFAULT_PROVIDER_API_KEY)
|
||||||
self.base_url = base_url
|
self.base_url = base_url
|
||||||
self.temprature = temprature
|
self.temprature = temprature
|
||||||
self.max_tokens = max_tokens
|
self.max_tokens = max_tokens
|
||||||
|
|||||||
@@ -36,7 +36,7 @@ from .markdown_generation_strategy import (
|
|||||||
)
|
)
|
||||||
from .deep_crawling import DeepCrawlDecorator
|
from .deep_crawling import DeepCrawlDecorator
|
||||||
from .async_logger import AsyncLogger, AsyncLoggerBase
|
from .async_logger import AsyncLogger, AsyncLoggerBase
|
||||||
from .async_configs import BrowserConfig, CrawlerRunConfig
|
from .async_configs import BrowserConfig, CrawlerRunConfig, ProxyConfig
|
||||||
from .async_dispatcher import * # noqa: F403
|
from .async_dispatcher import * # noqa: F403
|
||||||
from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter
|
from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter
|
||||||
|
|
||||||
@@ -291,12 +291,12 @@ class AsyncWebCrawler:
|
|||||||
|
|
||||||
# Update proxy configuration from rotation strategy if available
|
# Update proxy configuration from rotation strategy if available
|
||||||
if config and config.proxy_rotation_strategy:
|
if config and config.proxy_rotation_strategy:
|
||||||
next_proxy = await config.proxy_rotation_strategy.get_next_proxy()
|
next_proxy : ProxyConfig = await config.proxy_rotation_strategy.get_next_proxy()
|
||||||
if next_proxy:
|
if next_proxy:
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
message="Switch proxy: {proxy}",
|
message="Switch proxy: {proxy}",
|
||||||
tag="PROXY",
|
tag="PROXY",
|
||||||
params={"proxy": next_proxy.server},
|
params={"proxy": next_proxy.server}
|
||||||
)
|
)
|
||||||
config.proxy_config = next_proxy
|
config.proxy_config = next_proxy
|
||||||
# config = config.clone(proxy_config=next_proxy)
|
# config = config.clone(proxy_config=next_proxy)
|
||||||
|
|||||||
@@ -94,6 +94,7 @@ class ManagedBrowser:
|
|||||||
host: str = "localhost",
|
host: str = "localhost",
|
||||||
debugging_port: int = 9222,
|
debugging_port: int = 9222,
|
||||||
cdp_url: Optional[str] = None,
|
cdp_url: Optional[str] = None,
|
||||||
|
browser_config: Optional[BrowserConfig] = None,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Initialize the ManagedBrowser instance.
|
Initialize the ManagedBrowser instance.
|
||||||
@@ -109,17 +110,19 @@ class ManagedBrowser:
|
|||||||
host (str): Host for debugging the browser. Default: "localhost".
|
host (str): Host for debugging the browser. Default: "localhost".
|
||||||
debugging_port (int): Port for debugging the browser. Default: 9222.
|
debugging_port (int): Port for debugging the browser. Default: 9222.
|
||||||
cdp_url (str or None): CDP URL to connect to the browser. Default: None.
|
cdp_url (str or None): CDP URL to connect to the browser. Default: None.
|
||||||
|
browser_config (BrowserConfig): Configuration object containing all browser settings. Default: None.
|
||||||
"""
|
"""
|
||||||
self.browser_type = browser_type
|
self.browser_type = browser_config.browser_type
|
||||||
self.user_data_dir = user_data_dir
|
self.user_data_dir = browser_config.user_data_dir
|
||||||
self.headless = headless
|
self.headless = browser_config.headless
|
||||||
self.browser_process = None
|
self.browser_process = None
|
||||||
self.temp_dir = None
|
self.temp_dir = None
|
||||||
self.debugging_port = debugging_port
|
self.debugging_port = browser_config.debugging_port
|
||||||
self.host = host
|
self.host = browser_config.host
|
||||||
self.logger = logger
|
self.logger = logger
|
||||||
self.shutting_down = False
|
self.shutting_down = False
|
||||||
self.cdp_url = cdp_url
|
self.cdp_url = browser_config.cdp_url
|
||||||
|
self.browser_config = browser_config
|
||||||
|
|
||||||
async def start(self) -> str:
|
async def start(self) -> str:
|
||||||
"""
|
"""
|
||||||
@@ -142,6 +145,9 @@ class ManagedBrowser:
|
|||||||
# Get browser path and args based on OS and browser type
|
# Get browser path and args based on OS and browser type
|
||||||
# browser_path = self._get_browser_path()
|
# browser_path = self._get_browser_path()
|
||||||
args = await self._get_browser_args()
|
args = await self._get_browser_args()
|
||||||
|
|
||||||
|
if self.browser_config.extra_args:
|
||||||
|
args.extend(self.browser_config.extra_args)
|
||||||
|
|
||||||
# Start browser process
|
# Start browser process
|
||||||
try:
|
try:
|
||||||
@@ -477,6 +483,7 @@ class BrowserManager:
|
|||||||
logger=self.logger,
|
logger=self.logger,
|
||||||
debugging_port=self.config.debugging_port,
|
debugging_port=self.config.debugging_port,
|
||||||
cdp_url=self.config.cdp_url,
|
cdp_url=self.config.cdp_url,
|
||||||
|
browser_config=self.config,
|
||||||
)
|
)
|
||||||
|
|
||||||
async def start(self):
|
async def start(self):
|
||||||
@@ -491,10 +498,12 @@ class BrowserManager:
|
|||||||
|
|
||||||
Note: This method should be called in a separate task to avoid blocking the main event loop.
|
Note: This method should be called in a separate task to avoid blocking the main event loop.
|
||||||
"""
|
"""
|
||||||
if self.playwright is None:
|
if self.playwright is not None:
|
||||||
from playwright.async_api import async_playwright
|
await self.close()
|
||||||
|
|
||||||
|
from playwright.async_api import async_playwright
|
||||||
|
|
||||||
self.playwright = await async_playwright().start()
|
self.playwright = await async_playwright().start()
|
||||||
|
|
||||||
if self.config.cdp_url or self.config.use_managed_browser:
|
if self.config.cdp_url or self.config.use_managed_browser:
|
||||||
self.config.use_managed_browser = True
|
self.config.use_managed_browser = True
|
||||||
|
|||||||
@@ -29,6 +29,14 @@ PROVIDER_MODELS = {
|
|||||||
'gemini/gemini-2.0-flash-lite-preview-02-05': os.getenv("GEMINI_API_KEY"),
|
'gemini/gemini-2.0-flash-lite-preview-02-05': os.getenv("GEMINI_API_KEY"),
|
||||||
"deepseek/deepseek-chat": os.getenv("DEEPSEEK_API_KEY"),
|
"deepseek/deepseek-chat": os.getenv("DEEPSEEK_API_KEY"),
|
||||||
}
|
}
|
||||||
|
PROVIDER_MODELS_PREFIXES = {
|
||||||
|
"ollama": "no-token-needed", # Any model from Ollama no need for API token
|
||||||
|
"groq": os.getenv("GROQ_API_KEY"),
|
||||||
|
"openai": os.getenv("OPENAI_API_KEY"),
|
||||||
|
"anthropic": os.getenv("ANTHROPIC_API_KEY"),
|
||||||
|
"gemini": os.getenv("GEMINI_API_KEY"),
|
||||||
|
"deepseek": os.getenv("DEEPSEEK_API_KEY"),
|
||||||
|
}
|
||||||
|
|
||||||
# Chunk token threshold
|
# Chunk token threshold
|
||||||
CHUNK_TOKEN_THRESHOLD = 2**11 # 2048 tokens
|
CHUNK_TOKEN_THRESHOLD = 2**11 # 2048 tokens
|
||||||
|
|||||||
@@ -7,7 +7,9 @@ import time
|
|||||||
|
|
||||||
from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION, JSON_SCHEMA_BUILDER_XPATH, PROMPT_EXTRACT_INFERRED_SCHEMA
|
from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION, JSON_SCHEMA_BUILDER_XPATH, PROMPT_EXTRACT_INFERRED_SCHEMA
|
||||||
from .config import (
|
from .config import (
|
||||||
DEFAULT_PROVIDER, CHUNK_TOKEN_THRESHOLD,
|
DEFAULT_PROVIDER,
|
||||||
|
DEFAULT_PROVIDER_API_KEY,
|
||||||
|
CHUNK_TOKEN_THRESHOLD,
|
||||||
OVERLAP_RATE,
|
OVERLAP_RATE,
|
||||||
WORD_TOKEN_RATE,
|
WORD_TOKEN_RATE,
|
||||||
)
|
)
|
||||||
@@ -542,6 +544,11 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
|||||||
"""
|
"""
|
||||||
super().__init__( input_format=input_format, **kwargs)
|
super().__init__( input_format=input_format, **kwargs)
|
||||||
self.llm_config = llm_config
|
self.llm_config = llm_config
|
||||||
|
if not self.llm_config:
|
||||||
|
self.llm_config = create_llm_config(
|
||||||
|
provider=DEFAULT_PROVIDER,
|
||||||
|
api_token=os.environ.get(DEFAULT_PROVIDER_API_KEY),
|
||||||
|
)
|
||||||
self.instruction = instruction
|
self.instruction = instruction
|
||||||
self.extract_type = extraction_type
|
self.extract_type = extraction_type
|
||||||
self.schema = schema
|
self.schema = schema
|
||||||
|
|||||||
@@ -40,10 +40,25 @@ def setup_home_directory():
|
|||||||
f.write("")
|
f.write("")
|
||||||
|
|
||||||
def post_install():
|
def post_install():
|
||||||
"""Run all post-installation tasks"""
|
"""
|
||||||
|
Run all post-installation tasks.
|
||||||
|
Checks CRAWL4AI_MODE environment variable. If set to 'api',
|
||||||
|
skips Playwright browser installation.
|
||||||
|
"""
|
||||||
logger.info("Running post-installation setup...", tag="INIT")
|
logger.info("Running post-installation setup...", tag="INIT")
|
||||||
setup_home_directory()
|
setup_home_directory()
|
||||||
install_playwright()
|
|
||||||
|
# Check environment variable to conditionally skip Playwright install
|
||||||
|
run_mode = os.getenv('CRAWL4AI_MODE')
|
||||||
|
if run_mode == 'api':
|
||||||
|
logger.warning(
|
||||||
|
"CRAWL4AI_MODE=api detected. Skipping Playwright browser installation.",
|
||||||
|
tag="SETUP"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Proceed with installation only if mode is not 'api'
|
||||||
|
install_playwright()
|
||||||
|
|
||||||
run_migration()
|
run_migration()
|
||||||
# TODO: Will be added in the future
|
# TODO: Will be added in the future
|
||||||
# setup_builtin_browser()
|
# setup_builtin_browser()
|
||||||
|
|||||||
@@ -4,6 +4,9 @@ from itertools import cycle
|
|||||||
import os
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
########### ATTENTION PEOPLE OF EARTH ###########
|
||||||
|
# I have moved this config to async_configs.py, kept it here, in case someone still importing it, however
|
||||||
|
# be a dear and follow `from crawl4ai import ProxyConfig` instead :)
|
||||||
class ProxyConfig:
|
class ProxyConfig:
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -119,12 +122,12 @@ class ProxyRotationStrategy(ABC):
|
|||||||
"""Base abstract class for proxy rotation strategies"""
|
"""Base abstract class for proxy rotation strategies"""
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
async def get_next_proxy(self) -> Optional[Dict]:
|
async def get_next_proxy(self) -> Optional[ProxyConfig]:
|
||||||
"""Get next proxy configuration from the strategy"""
|
"""Get next proxy configuration from the strategy"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def add_proxies(self, proxies: List[Dict]):
|
def add_proxies(self, proxies: List[ProxyConfig]):
|
||||||
"""Add proxy configurations to the strategy"""
|
"""Add proxy configurations to the strategy"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|||||||
@@ -9,83 +9,44 @@ from urllib.parse import urlparse
|
|||||||
import OpenSSL.crypto
|
import OpenSSL.crypto
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
# === Inherit from dict ===
|
||||||
class SSLCertificate:
|
class SSLCertificate(dict):
|
||||||
"""
|
"""
|
||||||
A class representing an SSL certificate with methods to export in various formats.
|
A class representing an SSL certificate, behaving like a dictionary
|
||||||
|
for direct JSON serialization. It stores the certificate information internally
|
||||||
|
and provides methods for export and property access.
|
||||||
|
|
||||||
Attributes:
|
Inherits from dict, so instances are directly JSON serializable.
|
||||||
cert_info (Dict[str, Any]): The certificate information.
|
|
||||||
|
|
||||||
Methods:
|
|
||||||
from_url(url: str, timeout: int = 10) -> Optional['SSLCertificate']: Create SSLCertificate instance from a URL.
|
|
||||||
from_file(file_path: str) -> Optional['SSLCertificate']: Create SSLCertificate instance from a file.
|
|
||||||
from_binary(binary_data: bytes) -> Optional['SSLCertificate']: Create SSLCertificate instance from binary data.
|
|
||||||
export_as_pem() -> str: Export the certificate as PEM format.
|
|
||||||
export_as_der() -> bytes: Export the certificate as DER format.
|
|
||||||
export_as_json() -> Dict[str, Any]: Export the certificate as JSON format.
|
|
||||||
export_as_text() -> str: Export the certificate as text format.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# Use __slots__ for potential memory optimization if desired, though less common when inheriting dict
|
||||||
|
# __slots__ = ("_cert_info",) # If using slots, be careful with dict inheritance interaction
|
||||||
|
|
||||||
def __init__(self, cert_info: Dict[str, Any]):
|
def __init__(self, cert_info: Dict[str, Any]):
|
||||||
self._cert_info = self._decode_cert_data(cert_info)
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def from_url(url: str, timeout: int = 10) -> Optional["SSLCertificate"]:
|
|
||||||
"""
|
"""
|
||||||
Create SSLCertificate instance from a URL.
|
Initializes the SSLCertificate object.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
url (str): URL of the website.
|
cert_info (Dict[str, Any]): The raw certificate dictionary.
|
||||||
timeout (int): Timeout for the connection (default: 10).
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Optional[SSLCertificate]: SSLCertificate instance if successful, None otherwise.
|
|
||||||
"""
|
"""
|
||||||
try:
|
# 1. Decode the data (handle bytes -> str)
|
||||||
hostname = urlparse(url).netloc
|
decoded_info = self._decode_cert_data(cert_info)
|
||||||
if ":" in hostname:
|
|
||||||
hostname = hostname.split(":")[0]
|
|
||||||
|
|
||||||
context = ssl.create_default_context()
|
# 2. Store the decoded info internally (optional but good practice)
|
||||||
with socket.create_connection((hostname, 443), timeout=timeout) as sock:
|
# self._cert_info = decoded_info # You can keep this if methods rely on it
|
||||||
with context.wrap_socket(sock, server_hostname=hostname) as ssock:
|
|
||||||
cert_binary = ssock.getpeercert(binary_form=True)
|
|
||||||
x509 = OpenSSL.crypto.load_certificate(
|
|
||||||
OpenSSL.crypto.FILETYPE_ASN1, cert_binary
|
|
||||||
)
|
|
||||||
|
|
||||||
cert_info = {
|
# 3. Initialize the dictionary part of the object with the decoded data
|
||||||
"subject": dict(x509.get_subject().get_components()),
|
super().__init__(decoded_info)
|
||||||
"issuer": dict(x509.get_issuer().get_components()),
|
|
||||||
"version": x509.get_version(),
|
|
||||||
"serial_number": hex(x509.get_serial_number()),
|
|
||||||
"not_before": x509.get_notBefore(),
|
|
||||||
"not_after": x509.get_notAfter(),
|
|
||||||
"fingerprint": x509.digest("sha256").hex(),
|
|
||||||
"signature_algorithm": x509.get_signature_algorithm(),
|
|
||||||
"raw_cert": base64.b64encode(cert_binary),
|
|
||||||
}
|
|
||||||
|
|
||||||
# Add extensions
|
|
||||||
extensions = []
|
|
||||||
for i in range(x509.get_extension_count()):
|
|
||||||
ext = x509.get_extension(i)
|
|
||||||
extensions.append(
|
|
||||||
{"name": ext.get_short_name(), "value": str(ext)}
|
|
||||||
)
|
|
||||||
cert_info["extensions"] = extensions
|
|
||||||
|
|
||||||
return SSLCertificate(cert_info)
|
|
||||||
|
|
||||||
except Exception:
|
|
||||||
return None
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _decode_cert_data(data: Any) -> Any:
|
def _decode_cert_data(data: Any) -> Any:
|
||||||
"""Helper method to decode bytes in certificate data."""
|
"""Helper method to decode bytes in certificate data."""
|
||||||
if isinstance(data, bytes):
|
if isinstance(data, bytes):
|
||||||
return data.decode("utf-8")
|
try:
|
||||||
|
# Try UTF-8 first, fallback to latin-1 for arbitrary bytes
|
||||||
|
return data.decode("utf-8")
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
return data.decode("latin-1") # Or handle as needed, maybe hex representation
|
||||||
elif isinstance(data, dict):
|
elif isinstance(data, dict):
|
||||||
return {
|
return {
|
||||||
(
|
(
|
||||||
@@ -97,36 +58,119 @@ class SSLCertificate:
|
|||||||
return [SSLCertificate._decode_cert_data(item) for item in data]
|
return [SSLCertificate._decode_cert_data(item) for item in data]
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def from_url(url: str, timeout: int = 10) -> Optional["SSLCertificate"]:
|
||||||
|
"""
|
||||||
|
Create SSLCertificate instance from a URL. Fetches cert info and initializes.
|
||||||
|
(Fetching logic remains the same)
|
||||||
|
"""
|
||||||
|
cert_info_raw = None # Variable to hold the fetched dict
|
||||||
|
try:
|
||||||
|
hostname = urlparse(url).netloc
|
||||||
|
if ":" in hostname:
|
||||||
|
hostname = hostname.split(":")[0]
|
||||||
|
|
||||||
|
context = ssl.create_default_context()
|
||||||
|
# Set check_hostname to False and verify_mode to CERT_NONE temporarily
|
||||||
|
# for potentially problematic certificates during fetch, but parse the result regardless.
|
||||||
|
# context.check_hostname = False
|
||||||
|
# context.verify_mode = ssl.CERT_NONE
|
||||||
|
|
||||||
|
with socket.create_connection((hostname, 443), timeout=timeout) as sock:
|
||||||
|
with context.wrap_socket(sock, server_hostname=hostname) as ssock:
|
||||||
|
cert_binary = ssock.getpeercert(binary_form=True)
|
||||||
|
if not cert_binary:
|
||||||
|
print(f"Warning: No certificate returned for {hostname}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
x509 = OpenSSL.crypto.load_certificate(
|
||||||
|
OpenSSL.crypto.FILETYPE_ASN1, cert_binary
|
||||||
|
)
|
||||||
|
|
||||||
|
# Create the dictionary directly
|
||||||
|
cert_info_raw = {
|
||||||
|
"subject": dict(x509.get_subject().get_components()),
|
||||||
|
"issuer": dict(x509.get_issuer().get_components()),
|
||||||
|
"version": x509.get_version(),
|
||||||
|
"serial_number": hex(x509.get_serial_number()),
|
||||||
|
"not_before": x509.get_notBefore(), # Keep as bytes initially, _decode handles it
|
||||||
|
"not_after": x509.get_notAfter(), # Keep as bytes initially
|
||||||
|
"fingerprint": x509.digest("sha256").hex(), # hex() is already string
|
||||||
|
"signature_algorithm": x509.get_signature_algorithm(), # Keep as bytes
|
||||||
|
"raw_cert": base64.b64encode(cert_binary), # Base64 is bytes, _decode handles it
|
||||||
|
}
|
||||||
|
|
||||||
|
# Add extensions
|
||||||
|
extensions = []
|
||||||
|
for i in range(x509.get_extension_count()):
|
||||||
|
ext = x509.get_extension(i)
|
||||||
|
# get_short_name() returns bytes, str(ext) handles value conversion
|
||||||
|
extensions.append(
|
||||||
|
{"name": ext.get_short_name(), "value": str(ext)}
|
||||||
|
)
|
||||||
|
cert_info_raw["extensions"] = extensions
|
||||||
|
|
||||||
|
except ssl.SSLCertVerificationError as e:
|
||||||
|
print(f"SSL Verification Error for {url}: {e}")
|
||||||
|
# Decide if you want to proceed or return None based on your needs
|
||||||
|
# You might try fetching without verification here if needed, but be cautious.
|
||||||
|
return None
|
||||||
|
except socket.gaierror:
|
||||||
|
print(f"Could not resolve hostname: {hostname}")
|
||||||
|
return None
|
||||||
|
except socket.timeout:
|
||||||
|
print(f"Connection timed out for {url}")
|
||||||
|
return None
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error fetching/processing certificate for {url}: {e}")
|
||||||
|
# Log the full error details if needed: logging.exception("Cert fetch error")
|
||||||
|
return None
|
||||||
|
|
||||||
|
# If successful, create the SSLCertificate instance from the dictionary
|
||||||
|
if cert_info_raw:
|
||||||
|
return SSLCertificate(cert_info_raw)
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
# --- Properties now access the dictionary items directly via self[] ---
|
||||||
|
@property
|
||||||
|
def issuer(self) -> Dict[str, str]:
|
||||||
|
return self.get("issuer", {}) # Use self.get for safety
|
||||||
|
|
||||||
|
@property
|
||||||
|
def subject(self) -> Dict[str, str]:
|
||||||
|
return self.get("subject", {})
|
||||||
|
|
||||||
|
@property
|
||||||
|
def valid_from(self) -> str:
|
||||||
|
return self.get("not_before", "")
|
||||||
|
|
||||||
|
@property
|
||||||
|
def valid_until(self) -> str:
|
||||||
|
return self.get("not_after", "")
|
||||||
|
|
||||||
|
@property
|
||||||
|
def fingerprint(self) -> str:
|
||||||
|
return self.get("fingerprint", "")
|
||||||
|
|
||||||
|
# --- Export methods can use `self` directly as it is the dict ---
|
||||||
def to_json(self, filepath: Optional[str] = None) -> Optional[str]:
|
def to_json(self, filepath: Optional[str] = None) -> Optional[str]:
|
||||||
"""
|
"""Export certificate as JSON."""
|
||||||
Export certificate as JSON.
|
# `self` is already the dictionary we want to serialize
|
||||||
|
json_str = json.dumps(self, indent=2, ensure_ascii=False)
|
||||||
Args:
|
|
||||||
filepath (Optional[str]): Path to save the JSON file (default: None).
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Optional[str]: JSON string if successful, None otherwise.
|
|
||||||
"""
|
|
||||||
json_str = json.dumps(self._cert_info, indent=2, ensure_ascii=False)
|
|
||||||
if filepath:
|
if filepath:
|
||||||
Path(filepath).write_text(json_str, encoding="utf-8")
|
Path(filepath).write_text(json_str, encoding="utf-8")
|
||||||
return None
|
return None
|
||||||
return json_str
|
return json_str
|
||||||
|
|
||||||
def to_pem(self, filepath: Optional[str] = None) -> Optional[str]:
|
def to_pem(self, filepath: Optional[str] = None) -> Optional[str]:
|
||||||
"""
|
"""Export certificate as PEM."""
|
||||||
Export certificate as PEM.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
filepath (Optional[str]): Path to save the PEM file (default: None).
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Optional[str]: PEM string if successful, None otherwise.
|
|
||||||
"""
|
|
||||||
try:
|
try:
|
||||||
|
# Decode the raw_cert (which should be string due to _decode)
|
||||||
|
raw_cert_bytes = base64.b64decode(self.get("raw_cert", ""))
|
||||||
x509 = OpenSSL.crypto.load_certificate(
|
x509 = OpenSSL.crypto.load_certificate(
|
||||||
OpenSSL.crypto.FILETYPE_ASN1,
|
OpenSSL.crypto.FILETYPE_ASN1, raw_cert_bytes
|
||||||
base64.b64decode(self._cert_info["raw_cert"]),
|
|
||||||
)
|
)
|
||||||
pem_data = OpenSSL.crypto.dump_certificate(
|
pem_data = OpenSSL.crypto.dump_certificate(
|
||||||
OpenSSL.crypto.FILETYPE_PEM, x509
|
OpenSSL.crypto.FILETYPE_PEM, x509
|
||||||
@@ -136,49 +180,25 @@ class SSLCertificate:
|
|||||||
Path(filepath).write_text(pem_data, encoding="utf-8")
|
Path(filepath).write_text(pem_data, encoding="utf-8")
|
||||||
return None
|
return None
|
||||||
return pem_data
|
return pem_data
|
||||||
except Exception:
|
except Exception as e:
|
||||||
return None
|
print(f"Error converting to PEM: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
def to_der(self, filepath: Optional[str] = None) -> Optional[bytes]:
|
def to_der(self, filepath: Optional[str] = None) -> Optional[bytes]:
|
||||||
"""
|
"""Export certificate as DER."""
|
||||||
Export certificate as DER.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
filepath (Optional[str]): Path to save the DER file (default: None).
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Optional[bytes]: DER bytes if successful, None otherwise.
|
|
||||||
"""
|
|
||||||
try:
|
try:
|
||||||
der_data = base64.b64decode(self._cert_info["raw_cert"])
|
# Decode the raw_cert (which should be string due to _decode)
|
||||||
|
der_data = base64.b64decode(self.get("raw_cert", ""))
|
||||||
if filepath:
|
if filepath:
|
||||||
Path(filepath).write_bytes(der_data)
|
Path(filepath).write_bytes(der_data)
|
||||||
return None
|
return None
|
||||||
return der_data
|
return der_data
|
||||||
except Exception:
|
except Exception as e:
|
||||||
return None
|
print(f"Error converting to DER: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
@property
|
# Optional: Add __repr__ for better debugging
|
||||||
def issuer(self) -> Dict[str, str]:
|
def __repr__(self) -> str:
|
||||||
"""Get certificate issuer information."""
|
subject_cn = self.subject.get('CN', 'N/A')
|
||||||
return self._cert_info.get("issuer", {})
|
issuer_cn = self.issuer.get('CN', 'N/A')
|
||||||
|
return f"<SSLCertificate Subject='{subject_cn}' Issuer='{issuer_cn}'>"
|
||||||
@property
|
|
||||||
def subject(self) -> Dict[str, str]:
|
|
||||||
"""Get certificate subject information."""
|
|
||||||
return self._cert_info.get("subject", {})
|
|
||||||
|
|
||||||
@property
|
|
||||||
def valid_from(self) -> str:
|
|
||||||
"""Get certificate validity start date."""
|
|
||||||
return self._cert_info.get("not_before", "")
|
|
||||||
|
|
||||||
@property
|
|
||||||
def valid_until(self) -> str:
|
|
||||||
"""Get certificate validity end date."""
|
|
||||||
return self._cert_info.get("not_after", "")
|
|
||||||
|
|
||||||
@property
|
|
||||||
def fingerprint(self) -> str:
|
|
||||||
"""Get certificate fingerprint."""
|
|
||||||
return self._cert_info.get("fingerprint", "")
|
|
||||||
644
deploy/docker/README-new.md
Normal file
644
deploy/docker/README-new.md
Normal file
@@ -0,0 +1,644 @@
|
|||||||
|
# Crawl4AI Docker Guide 🐳
|
||||||
|
|
||||||
|
## Table of Contents
|
||||||
|
- [Prerequisites](#prerequisites)
|
||||||
|
- [Installation](#installation)
|
||||||
|
- [Option 1: Using Docker Compose (Recommended)](#option-1-using-docker-compose-recommended)
|
||||||
|
- [Option 2: Manual Local Build & Run](#option-2-manual-local-build--run)
|
||||||
|
- [Option 3: Using Pre-built Docker Hub Images](#option-3-using-pre-built-docker-hub-images)
|
||||||
|
- [Dockerfile Parameters](#dockerfile-parameters)
|
||||||
|
- [Using the API](#using-the-api)
|
||||||
|
- [Understanding Request Schema](#understanding-request-schema)
|
||||||
|
- [REST API Examples](#rest-api-examples)
|
||||||
|
- [Python SDK](#python-sdk)
|
||||||
|
- [Metrics & Monitoring](#metrics--monitoring)
|
||||||
|
- [Deployment Scenarios](#deployment-scenarios)
|
||||||
|
- [Complete Examples](#complete-examples)
|
||||||
|
- [Server Configuration](#server-configuration)
|
||||||
|
- [Understanding config.yml](#understanding-configyml)
|
||||||
|
- [JWT Authentication](#jwt-authentication)
|
||||||
|
- [Configuration Tips and Best Practices](#configuration-tips-and-best-practices)
|
||||||
|
- [Customizing Your Configuration](#customizing-your-configuration)
|
||||||
|
- [Configuration Recommendations](#configuration-recommendations)
|
||||||
|
- [Getting Help](#getting-help)
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
Before we dive in, make sure you have:
|
||||||
|
- Docker installed and running (version 20.10.0 or higher), including `docker compose` (usually bundled with Docker Desktop).
|
||||||
|
- `git` for cloning the repository.
|
||||||
|
- At least 4GB of RAM available for the container (more recommended for heavy use).
|
||||||
|
- Python 3.10+ (if using the Python SDK).
|
||||||
|
- Node.js 16+ (if using the Node.js examples).
|
||||||
|
|
||||||
|
> 💡 **Pro tip**: Run `docker info` to check your Docker installation and available resources.
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
We offer several ways to get the Crawl4AI server running. Docker Compose is the easiest way to manage local builds and runs.
|
||||||
|
|
||||||
|
### Option 1: Using Docker Compose (Recommended)
|
||||||
|
|
||||||
|
Docker Compose simplifies building and running the service, especially for local development and testing across different platforms.
|
||||||
|
|
||||||
|
#### 1. Clone Repository
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git clone https://github.com/unclecode/crawl4ai.git
|
||||||
|
cd crawl4ai
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 2. Environment Setup (API Keys)
|
||||||
|
|
||||||
|
If you plan to use LLMs, copy the example environment file and add your API keys. This file should be in the **project root directory**.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Make sure you are in the 'crawl4ai' root directory
|
||||||
|
cp deploy/docker/.llm.env.example .llm.env
|
||||||
|
|
||||||
|
# Now edit .llm.env and add your API keys
|
||||||
|
# Example content:
|
||||||
|
# OPENAI_API_KEY=sk-your-key
|
||||||
|
# ANTHROPIC_API_KEY=your-anthropic-key
|
||||||
|
# ...
|
||||||
|
```
|
||||||
|
> 🔑 **Note**: Keep your API keys secure! Never commit `.llm.env` to version control.
|
||||||
|
|
||||||
|
#### 3. Build and Run with Compose
|
||||||
|
|
||||||
|
The `docker-compose.yml` file in the project root defines services for different scenarios using **profiles**.
|
||||||
|
|
||||||
|
* **Build and Run Locally (AMD64):**
|
||||||
|
```bash
|
||||||
|
# Builds the image locally using Dockerfile and runs it
|
||||||
|
docker compose --profile local-amd64 up --build -d
|
||||||
|
```
|
||||||
|
|
||||||
|
* **Build and Run Locally (ARM64):**
|
||||||
|
```bash
|
||||||
|
# Builds the image locally using Dockerfile and runs it
|
||||||
|
docker compose --profile local-arm64 up --build -d
|
||||||
|
```
|
||||||
|
|
||||||
|
* **Run Pre-built Image from Docker Hub (AMD64):**
|
||||||
|
```bash
|
||||||
|
# Pulls and runs the specified AMD64 image from Docker Hub
|
||||||
|
# (Set VERSION env var for specific tags, e.g., VERSION=0.5.1-d1)
|
||||||
|
docker compose --profile hub-amd64 up -d
|
||||||
|
```
|
||||||
|
|
||||||
|
* **Run Pre-built Image from Docker Hub (ARM64):**
|
||||||
|
```bash
|
||||||
|
# Pulls and runs the specified ARM64 image from Docker Hub
|
||||||
|
docker compose --profile hub-arm64 up -d
|
||||||
|
```
|
||||||
|
|
||||||
|
> The server will be available at `http://localhost:11235`.
|
||||||
|
|
||||||
|
#### 4. Stopping Compose Services
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Stop the service(s) associated with a profile (e.g., local-amd64)
|
||||||
|
docker compose --profile local-amd64 down
|
||||||
|
```
|
||||||
|
|
||||||
|
### Option 2: Manual Local Build & Run
|
||||||
|
|
||||||
|
If you prefer not to use Docker Compose for local builds.
|
||||||
|
|
||||||
|
#### 1. Clone Repository & Setup Environment
|
||||||
|
|
||||||
|
Follow steps 1 and 2 from the Docker Compose section above (clone repo, `cd crawl4ai`, create `.llm.env` in the root).
|
||||||
|
|
||||||
|
#### 2. Build the Image (Multi-Arch)
|
||||||
|
|
||||||
|
Use `docker buildx` to build the image. This example builds for multiple platforms and loads the image matching your host architecture into the local Docker daemon.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Make sure you are in the 'crawl4ai' root directory
|
||||||
|
docker buildx build --platform linux/amd64,linux/arm64 -t crawl4ai-local:latest --load .
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 3. Run the Container
|
||||||
|
|
||||||
|
* **Basic run (no LLM support):**
|
||||||
|
```bash
|
||||||
|
# Replace --platform if your host is ARM64
|
||||||
|
docker run -d \
|
||||||
|
-p 11235:11235 \
|
||||||
|
--name crawl4ai-standalone \
|
||||||
|
--shm-size=1g \
|
||||||
|
--platform linux/amd64 \
|
||||||
|
crawl4ai-local:latest
|
||||||
|
```
|
||||||
|
|
||||||
|
* **With LLM support:**
|
||||||
|
```bash
|
||||||
|
# Make sure .llm.env is in the current directory (project root)
|
||||||
|
# Replace --platform if your host is ARM64
|
||||||
|
docker run -d \
|
||||||
|
-p 11235:11235 \
|
||||||
|
--name crawl4ai-standalone \
|
||||||
|
--env-file .llm.env \
|
||||||
|
--shm-size=1g \
|
||||||
|
--platform linux/amd64 \
|
||||||
|
crawl4ai-local:latest
|
||||||
|
```
|
||||||
|
|
||||||
|
> The server will be available at `http://localhost:11235`.
|
||||||
|
|
||||||
|
#### 4. Stopping the Manual Container
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker stop crawl4ai-standalone && docker rm crawl4ai-standalone
|
||||||
|
```
|
||||||
|
|
||||||
|
### Option 3: Using Pre-built Docker Hub Images
|
||||||
|
|
||||||
|
Pull and run images directly from Docker Hub without building locally.
|
||||||
|
|
||||||
|
#### 1. Pull the Image
|
||||||
|
|
||||||
|
We use a versioning scheme like `LIBRARY_VERSION-dREVISION` (e.g., `0.5.1-d1`). The `latest` tag points to the most recent stable release. Images are built with multi-arch manifests, so Docker usually pulls the correct version for your system automatically.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Pull a specific version (recommended for stability)
|
||||||
|
docker pull unclecode/crawl4ai:0.5.1-d1
|
||||||
|
|
||||||
|
# Or pull the latest stable version
|
||||||
|
docker pull unclecode/crawl4ai:latest
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 2. Setup Environment (API Keys)
|
||||||
|
|
||||||
|
If using LLMs, create the `.llm.env` file in a directory of your choice, similar to Step 2 in the Compose section.
|
||||||
|
|
||||||
|
#### 3. Run the Container
|
||||||
|
|
||||||
|
* **Basic run:**
|
||||||
|
```bash
|
||||||
|
docker run -d \
|
||||||
|
-p 11235:11235 \
|
||||||
|
--name crawl4ai-hub \
|
||||||
|
--shm-size=1g \
|
||||||
|
unclecode/crawl4ai:0.5.1-d1 # Or use :latest
|
||||||
|
```
|
||||||
|
|
||||||
|
* **With LLM support:**
|
||||||
|
```bash
|
||||||
|
# Make sure .llm.env is in the current directory you are running docker from
|
||||||
|
docker run -d \
|
||||||
|
-p 11235:11235 \
|
||||||
|
--name crawl4ai-hub \
|
||||||
|
--env-file .llm.env \
|
||||||
|
--shm-size=1g \
|
||||||
|
unclecode/crawl4ai:0.5.1-d1 # Or use :latest
|
||||||
|
```
|
||||||
|
|
||||||
|
> The server will be available at `http://localhost:11235`.
|
||||||
|
|
||||||
|
#### 4. Stopping the Hub Container
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker stop crawl4ai-hub && docker rm crawl4ai-hub
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Docker Hub Versioning Explained
|
||||||
|
|
||||||
|
* **Image Name:** `unclecode/crawl4ai`
|
||||||
|
* **Tag Format:** `LIBRARY_VERSION-dREVISION`
|
||||||
|
* `LIBRARY_VERSION`: The Semantic Version of the core `crawl4ai` Python library included (e.g., `0.5.1`).
|
||||||
|
* `dREVISION`: An incrementing number (starting at `d1`) for Docker build changes made *without* changing the library version (e.g., base image updates, dependency fixes). Resets to `d1` for each new `LIBRARY_VERSION`.
|
||||||
|
* **Example:** `unclecode/crawl4ai:0.5.1-d1`
|
||||||
|
* **`latest` Tag:** Points to the most recent stable `LIBRARY_VERSION-dREVISION`.
|
||||||
|
* **Multi-Arch:** Images support `linux/amd64` and `linux/arm64`. Docker automatically selects the correct architecture.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
*(Rest of the document remains largely the same, but with key updates below)*
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Dockerfile Parameters
|
||||||
|
|
||||||
|
You can customize the image build process using build arguments (`--build-arg`). These are typically used via `docker buildx build` or within the `docker-compose.yml` file.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Example: Build with 'all' features using buildx
|
||||||
|
docker buildx build \
|
||||||
|
--platform linux/amd64,linux/arm64 \
|
||||||
|
--build-arg INSTALL_TYPE=all \
|
||||||
|
-t yourname/crawl4ai-all:latest \
|
||||||
|
--load \
|
||||||
|
. # Build from root context
|
||||||
|
```
|
||||||
|
|
||||||
|
### Build Arguments Explained
|
||||||
|
|
||||||
|
| Argument | Description | Default | Options |
|
||||||
|
| :----------- | :--------------------------------------- | :-------- | :--------------------------------- |
|
||||||
|
| INSTALL_TYPE | Feature set | `default` | `default`, `all`, `torch`, `transformer` |
|
||||||
|
| ENABLE_GPU | GPU support (CUDA for AMD64) | `false` | `true`, `false` |
|
||||||
|
| APP_HOME | Install path inside container (advanced) | `/app` | any valid path |
|
||||||
|
| USE_LOCAL | Install library from local source | `true` | `true`, `false` |
|
||||||
|
| GITHUB_REPO | Git repo to clone if USE_LOCAL=false | *(see Dockerfile)* | any git URL |
|
||||||
|
| GITHUB_BRANCH| Git branch to clone if USE_LOCAL=false | `main` | any branch name |
|
||||||
|
|
||||||
|
*(Note: PYTHON_VERSION is fixed by the `FROM` instruction in the Dockerfile)*
|
||||||
|
|
||||||
|
### Build Best Practices
|
||||||
|
|
||||||
|
1. **Choose the Right Install Type**
|
||||||
|
* `default`: Basic installation, smallest image size. Suitable for most standard web scraping and markdown generation.
|
||||||
|
* `all`: Full features including `torch` and `transformers` for advanced extraction strategies (e.g., CosineStrategy, certain LLM filters). Significantly larger image. Ensure you need these extras.
|
||||||
|
2. **Platform Considerations**
|
||||||
|
* Use `buildx` for building multi-architecture images, especially for pushing to registries.
|
||||||
|
* Use `docker compose` profiles (`local-amd64`, `local-arm64`) for easy platform-specific local builds.
|
||||||
|
3. **Performance Optimization**
|
||||||
|
* The image automatically includes platform-specific optimizations (OpenMP for AMD64, OpenBLAS for ARM64).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Using the API
|
||||||
|
|
||||||
|
Communicate with the running Docker server via its REST API (defaulting to `http://localhost:11235`). You can use the Python SDK or make direct HTTP requests.
|
||||||
|
|
||||||
|
### Python SDK
|
||||||
|
|
||||||
|
Install the SDK: `pip install crawl4ai`
|
||||||
|
|
||||||
|
```python
|
||||||
|
import asyncio
|
||||||
|
from crawl4ai.docker_client import Crawl4aiDockerClient
|
||||||
|
from crawl4ai import BrowserConfig, CrawlerRunConfig, CacheMode # Assuming you have crawl4ai installed
|
||||||
|
|
||||||
|
async def main():
|
||||||
|
# Point to the correct server port
|
||||||
|
async with Crawl4aiDockerClient(base_url="http://localhost:11235", verbose=True) as client:
|
||||||
|
# If JWT is enabled on the server, authenticate first:
|
||||||
|
# await client.authenticate("user@example.com") # See Server Configuration section
|
||||||
|
|
||||||
|
# Example Non-streaming crawl
|
||||||
|
print("--- Running Non-Streaming Crawl ---")
|
||||||
|
results = await client.crawl(
|
||||||
|
["https://httpbin.org/html"],
|
||||||
|
browser_config=BrowserConfig(headless=True), # Use library classes for config aid
|
||||||
|
crawler_config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||||
|
)
|
||||||
|
if results: # client.crawl returns None on failure
|
||||||
|
print(f"Non-streaming results success: {results.success}")
|
||||||
|
if results.success:
|
||||||
|
for result in results: # Iterate through the CrawlResultContainer
|
||||||
|
print(f"URL: {result.url}, Success: {result.success}")
|
||||||
|
else:
|
||||||
|
print("Non-streaming crawl failed.")
|
||||||
|
|
||||||
|
|
||||||
|
# Example Streaming crawl
|
||||||
|
print("\n--- Running Streaming Crawl ---")
|
||||||
|
stream_config = CrawlerRunConfig(stream=True, cache_mode=CacheMode.BYPASS)
|
||||||
|
try:
|
||||||
|
async for result in await client.crawl( # client.crawl returns an async generator for streaming
|
||||||
|
["https://httpbin.org/html", "https://httpbin.org/links/5/0"],
|
||||||
|
browser_config=BrowserConfig(headless=True),
|
||||||
|
crawler_config=stream_config
|
||||||
|
):
|
||||||
|
print(f"Streamed result: URL: {result.url}, Success: {result.success}")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Streaming crawl failed: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
# Example Get schema
|
||||||
|
print("\n--- Getting Schema ---")
|
||||||
|
schema = await client.get_schema()
|
||||||
|
print(f"Schema received: {bool(schema)}") # Print whether schema was received
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
asyncio.run(main())
|
||||||
|
```
|
||||||
|
|
||||||
|
*(SDK parameters like timeout, verify_ssl etc. remain the same)*
|
||||||
|
|
||||||
|
### Second Approach: Direct API Calls
|
||||||
|
|
||||||
|
Crucially, when sending configurations directly via JSON, they **must** follow the `{"type": "ClassName", "params": {...}}` structure for any non-primitive value (like config objects or strategies). Dictionaries must be wrapped as `{"type": "dict", "value": {...}}`.
|
||||||
|
|
||||||
|
*(Keep the detailed explanation of Configuration Structure, Basic Pattern, Simple vs Complex, Strategy Pattern, Complex Nested Example, Quick Grammar Overview, Important Rules, Pro Tip)*
|
||||||
|
|
||||||
|
#### More Examples *(Ensure Schema example uses type/value wrapper)*
|
||||||
|
|
||||||
|
**Advanced Crawler Configuration**
|
||||||
|
*(Keep example, ensure cache_mode uses valid enum value like "bypass")*
|
||||||
|
|
||||||
|
**Extraction Strategy**
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"crawler_config": {
|
||||||
|
"type": "CrawlerRunConfig",
|
||||||
|
"params": {
|
||||||
|
"extraction_strategy": {
|
||||||
|
"type": "JsonCssExtractionStrategy",
|
||||||
|
"params": {
|
||||||
|
"schema": {
|
||||||
|
"type": "dict",
|
||||||
|
"value": {
|
||||||
|
"baseSelector": "article.post",
|
||||||
|
"fields": [
|
||||||
|
{"name": "title", "selector": "h1", "type": "text"},
|
||||||
|
{"name": "content", "selector": ".content", "type": "html"}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**LLM Extraction Strategy** *(Keep example, ensure schema uses type/value wrapper)*
|
||||||
|
*(Keep Deep Crawler Example)*
|
||||||
|
|
||||||
|
### REST API Examples
|
||||||
|
|
||||||
|
Update URLs to use port `11235`.
|
||||||
|
|
||||||
|
#### Simple Crawl
|
||||||
|
|
||||||
|
```python
|
||||||
|
import requests
|
||||||
|
|
||||||
|
# Configuration objects converted to the required JSON structure
|
||||||
|
browser_config_payload = {
|
||||||
|
"type": "BrowserConfig",
|
||||||
|
"params": {"headless": True}
|
||||||
|
}
|
||||||
|
crawler_config_payload = {
|
||||||
|
"type": "CrawlerRunConfig",
|
||||||
|
"params": {"stream": False, "cache_mode": "bypass"} # Use string value of enum
|
||||||
|
}
|
||||||
|
|
||||||
|
crawl_payload = {
|
||||||
|
"urls": ["https://httpbin.org/html"],
|
||||||
|
"browser_config": browser_config_payload,
|
||||||
|
"crawler_config": crawler_config_payload
|
||||||
|
}
|
||||||
|
response = requests.post(
|
||||||
|
"http://localhost:11235/crawl", # Updated port
|
||||||
|
# headers={"Authorization": f"Bearer {token}"}, # If JWT is enabled
|
||||||
|
json=crawl_payload
|
||||||
|
)
|
||||||
|
print(f"Status Code: {response.status_code}")
|
||||||
|
if response.ok:
|
||||||
|
print(response.json())
|
||||||
|
else:
|
||||||
|
print(f"Error: {response.text}")
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Streaming Results
|
||||||
|
|
||||||
|
```python
|
||||||
|
import json
|
||||||
|
import httpx # Use httpx for async streaming example
|
||||||
|
|
||||||
|
async def test_stream_crawl(token: str = None): # Made token optional
|
||||||
|
"""Test the /crawl/stream endpoint with multiple URLs."""
|
||||||
|
url = "http://localhost:11235/crawl/stream" # Updated port
|
||||||
|
payload = {
|
||||||
|
"urls": [
|
||||||
|
"https://httpbin.org/html",
|
||||||
|
"https://httpbin.org/links/5/0",
|
||||||
|
],
|
||||||
|
"browser_config": {
|
||||||
|
"type": "BrowserConfig",
|
||||||
|
"params": {"headless": True, "viewport": {"type": "dict", "value": {"width": 1200, "height": 800}}} # Viewport needs type:dict
|
||||||
|
},
|
||||||
|
"crawler_config": {
|
||||||
|
"type": "CrawlerRunConfig",
|
||||||
|
"params": {"stream": True, "cache_mode": "bypass"}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
headers = {}
|
||||||
|
# if token:
|
||||||
|
# headers = {"Authorization": f"Bearer {token}"} # If JWT is enabled
|
||||||
|
|
||||||
|
try:
|
||||||
|
async with httpx.AsyncClient() as client:
|
||||||
|
async with client.stream("POST", url, json=payload, headers=headers, timeout=120.0) as response:
|
||||||
|
print(f"Status: {response.status_code} (Expected: 200)")
|
||||||
|
response.raise_for_status() # Raise exception for bad status codes
|
||||||
|
|
||||||
|
# Read streaming response line-by-line (NDJSON)
|
||||||
|
async for line in response.aiter_lines():
|
||||||
|
if line:
|
||||||
|
try:
|
||||||
|
data = json.loads(line)
|
||||||
|
# Check for completion marker
|
||||||
|
if data.get("status") == "completed":
|
||||||
|
print("Stream completed.")
|
||||||
|
break
|
||||||
|
print(f"Streamed Result: {json.dumps(data, indent=2)}")
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
print(f"Warning: Could not decode JSON line: {line}")
|
||||||
|
|
||||||
|
except httpx.HTTPStatusError as e:
|
||||||
|
print(f"HTTP error occurred: {e.response.status_code} - {e.response.text}")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error in streaming crawl test: {str(e)}")
|
||||||
|
|
||||||
|
# To run this example:
|
||||||
|
# import asyncio
|
||||||
|
# asyncio.run(test_stream_crawl())
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Metrics & Monitoring
|
||||||
|
|
||||||
|
Keep an eye on your crawler with these endpoints:
|
||||||
|
|
||||||
|
- `/health` - Quick health check
|
||||||
|
- `/metrics` - Detailed Prometheus metrics
|
||||||
|
- `/schema` - Full API schema
|
||||||
|
|
||||||
|
Example health check:
|
||||||
|
```bash
|
||||||
|
curl http://localhost:11235/health
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
*(Deployment Scenarios and Complete Examples sections remain the same, maybe update links if examples moved)*
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Server Configuration
|
||||||
|
|
||||||
|
The server's behavior can be customized through the `config.yml` file.
|
||||||
|
|
||||||
|
### Understanding config.yml
|
||||||
|
|
||||||
|
The configuration file is loaded from `/app/config.yml` inside the container. By default, the file from `deploy/docker/config.yml` in the repository is copied there during the build.
|
||||||
|
|
||||||
|
Here's a detailed breakdown of the configuration options (using defaults from `deploy/docker/config.yml`):
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# Application Configuration
|
||||||
|
app:
|
||||||
|
title: "Crawl4AI API"
|
||||||
|
version: "1.0.0" # Consider setting this to match library version, e.g., "0.5.1"
|
||||||
|
host: "0.0.0.0"
|
||||||
|
port: 8020 # NOTE: This port is used ONLY when running server.py directly. Gunicorn overrides this (see supervisord.conf).
|
||||||
|
reload: False # Default set to False - suitable for production
|
||||||
|
timeout_keep_alive: 300
|
||||||
|
|
||||||
|
# Default LLM Configuration
|
||||||
|
llm:
|
||||||
|
provider: "openai/gpt-4o-mini"
|
||||||
|
api_key_env: "OPENAI_API_KEY"
|
||||||
|
# api_key: sk-... # If you pass the API key directly then api_key_env will be ignored
|
||||||
|
|
||||||
|
# Redis Configuration (Used by internal Redis server managed by supervisord)
|
||||||
|
redis:
|
||||||
|
host: "localhost"
|
||||||
|
port: 6379
|
||||||
|
db: 0
|
||||||
|
password: ""
|
||||||
|
# ... other redis options ...
|
||||||
|
|
||||||
|
# Rate Limiting Configuration
|
||||||
|
rate_limiting:
|
||||||
|
enabled: True
|
||||||
|
default_limit: "1000/minute"
|
||||||
|
trusted_proxies: []
|
||||||
|
storage_uri: "memory://" # Use "redis://localhost:6379" if you need persistent/shared limits
|
||||||
|
|
||||||
|
# Security Configuration
|
||||||
|
security:
|
||||||
|
enabled: false # Master toggle for security features
|
||||||
|
jwt_enabled: false # Enable JWT authentication (requires security.enabled=true)
|
||||||
|
https_redirect: false # Force HTTPS (requires security.enabled=true)
|
||||||
|
trusted_hosts: ["*"] # Allowed hosts (use specific domains in production)
|
||||||
|
headers: # Security headers (applied if security.enabled=true)
|
||||||
|
x_content_type_options: "nosniff"
|
||||||
|
x_frame_options: "DENY"
|
||||||
|
content_security_policy: "default-src 'self'"
|
||||||
|
strict_transport_security: "max-age=63072000; includeSubDomains"
|
||||||
|
|
||||||
|
# Crawler Configuration
|
||||||
|
crawler:
|
||||||
|
memory_threshold_percent: 95.0
|
||||||
|
rate_limiter:
|
||||||
|
base_delay: [1.0, 2.0] # Min/max delay between requests in seconds for dispatcher
|
||||||
|
timeouts:
|
||||||
|
stream_init: 30.0 # Timeout for stream initialization
|
||||||
|
batch_process: 300.0 # Timeout for non-streaming /crawl processing
|
||||||
|
|
||||||
|
# Logging Configuration
|
||||||
|
logging:
|
||||||
|
level: "INFO"
|
||||||
|
format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
||||||
|
|
||||||
|
# Observability Configuration
|
||||||
|
observability:
|
||||||
|
prometheus:
|
||||||
|
enabled: True
|
||||||
|
endpoint: "/metrics"
|
||||||
|
health_check:
|
||||||
|
endpoint: "/health"
|
||||||
|
```
|
||||||
|
|
||||||
|
*(JWT Authentication section remains the same, just note the default port is now 11235 for requests)*
|
||||||
|
|
||||||
|
*(Configuration Tips and Best Practices remain the same)*
|
||||||
|
|
||||||
|
### Customizing Your Configuration
|
||||||
|
|
||||||
|
You can override the default `config.yml`.
|
||||||
|
|
||||||
|
#### Method 1: Modify Before Build
|
||||||
|
|
||||||
|
1. Edit the `deploy/docker/config.yml` file in your local repository clone.
|
||||||
|
2. Build the image using `docker buildx` or `docker compose --profile local-... up --build`. The modified file will be copied into the image.
|
||||||
|
|
||||||
|
#### Method 2: Runtime Mount (Recommended for Custom Deploys)
|
||||||
|
|
||||||
|
1. Create your custom configuration file, e.g., `my-custom-config.yml` locally. Ensure it contains all necessary sections.
|
||||||
|
2. Mount it when running the container:
|
||||||
|
|
||||||
|
* **Using `docker run`:**
|
||||||
|
```bash
|
||||||
|
# Assumes my-custom-config.yml is in the current directory
|
||||||
|
docker run -d -p 11235:11235 \
|
||||||
|
--name crawl4ai-custom-config \
|
||||||
|
--env-file .llm.env \
|
||||||
|
--shm-size=1g \
|
||||||
|
-v $(pwd)/my-custom-config.yml:/app/config.yml \
|
||||||
|
unclecode/crawl4ai:latest # Or your specific tag
|
||||||
|
```
|
||||||
|
|
||||||
|
* **Using `docker-compose.yml`:** Add a `volumes` section to the service definition:
|
||||||
|
```yaml
|
||||||
|
services:
|
||||||
|
crawl4ai-hub-amd64: # Or your chosen service
|
||||||
|
image: unclecode/crawl4ai:latest
|
||||||
|
profiles: ["hub-amd64"]
|
||||||
|
<<: *base-config
|
||||||
|
volumes:
|
||||||
|
# Mount local custom config over the default one in the container
|
||||||
|
- ./my-custom-config.yml:/app/config.yml
|
||||||
|
# Keep the shared memory volume from base-config
|
||||||
|
- /dev/shm:/dev/shm
|
||||||
|
```
|
||||||
|
*(Note: Ensure `my-custom-config.yml` is in the same directory as `docker-compose.yml`)*
|
||||||
|
|
||||||
|
> 💡 When mounting, your custom file *completely replaces* the default one. Ensure it's a valid and complete configuration.
|
||||||
|
|
||||||
|
### Configuration Recommendations
|
||||||
|
|
||||||
|
1. **Security First** 🔒
|
||||||
|
- Always enable security in production
|
||||||
|
- Use specific trusted_hosts instead of wildcards
|
||||||
|
- Set up proper rate limiting to protect your server
|
||||||
|
- Consider your environment before enabling HTTPS redirect
|
||||||
|
|
||||||
|
2. **Resource Management** 💻
|
||||||
|
- Adjust memory_threshold_percent based on available RAM
|
||||||
|
- Set timeouts according to your content size and network conditions
|
||||||
|
- Use Redis for rate limiting in multi-container setups
|
||||||
|
|
||||||
|
3. **Monitoring** 📊
|
||||||
|
- Enable Prometheus if you need metrics
|
||||||
|
- Set DEBUG logging in development, INFO in production
|
||||||
|
- Regular health check monitoring is crucial
|
||||||
|
|
||||||
|
4. **Performance Tuning** ⚡
|
||||||
|
- Start with conservative rate limiter delays
|
||||||
|
- Increase batch_process timeout for large content
|
||||||
|
- Adjust stream_init timeout based on initial response times
|
||||||
|
|
||||||
|
## Getting Help
|
||||||
|
|
||||||
|
We're here to help you succeed with Crawl4AI! Here's how to get support:
|
||||||
|
|
||||||
|
- 📖 Check our [full documentation](https://docs.crawl4ai.com)
|
||||||
|
- 🐛 Found a bug? [Open an issue](https://github.com/unclecode/crawl4ai/issues)
|
||||||
|
- 💬 Join our [Discord community](https://discord.gg/crawl4ai)
|
||||||
|
- ⭐ Star us on GitHub to show support!
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
In this guide, we've covered everything you need to get started with Crawl4AI's Docker deployment:
|
||||||
|
- Building and running the Docker container
|
||||||
|
- Configuring the environment
|
||||||
|
- Making API requests with proper typing
|
||||||
|
- Using the Python SDK
|
||||||
|
- Monitoring your deployment
|
||||||
|
|
||||||
|
Remember, the examples in the `examples` folder are your friends - they show real-world usage patterns that you can adapt for your needs.
|
||||||
|
|
||||||
|
Keep exploring, and don't hesitate to reach out if you need help! We're building something amazing together. 🚀
|
||||||
|
|
||||||
|
Happy crawling! 🕷️
|
||||||
@@ -391,21 +391,25 @@ async def handle_crawl_request(
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
crawler: AsyncWebCrawler = AsyncWebCrawler(config=browser_config)
|
||||||
results = []
|
await crawler.start()
|
||||||
func = getattr(crawler, "arun" if len(urls) == 1 else "arun_many")
|
results = []
|
||||||
partial_func = partial(func,
|
func = getattr(crawler, "arun" if len(urls) == 1 else "arun_many")
|
||||||
urls[0] if len(urls) == 1 else urls,
|
partial_func = partial(func,
|
||||||
config=crawler_config,
|
urls[0] if len(urls) == 1 else urls,
|
||||||
dispatcher=dispatcher)
|
config=crawler_config,
|
||||||
results = await partial_func()
|
dispatcher=dispatcher)
|
||||||
return {
|
results = await partial_func()
|
||||||
"success": True,
|
await crawler.close()
|
||||||
"results": [result.model_dump() for result in results]
|
return {
|
||||||
}
|
"success": True,
|
||||||
|
"results": [result.model_dump() for result in results]
|
||||||
|
}
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Crawl error: {str(e)}", exc_info=True)
|
logger.error(f"Crawl error: {str(e)}", exc_info=True)
|
||||||
|
if 'crawler' in locals():
|
||||||
|
await crawler.close()
|
||||||
raise HTTPException(
|
raise HTTPException(
|
||||||
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
||||||
detail=str(e)
|
detail=str(e)
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ app:
|
|||||||
version: "1.0.0"
|
version: "1.0.0"
|
||||||
host: "0.0.0.0"
|
host: "0.0.0.0"
|
||||||
port: 8020
|
port: 8020
|
||||||
reload: True
|
reload: False
|
||||||
timeout_keep_alive: 300
|
timeout_keep_alive: 300
|
||||||
|
|
||||||
# Default LLM Configuration
|
# Default LLM Configuration
|
||||||
|
|||||||
@@ -1,4 +1,3 @@
|
|||||||
crawl4ai
|
|
||||||
fastapi
|
fastapi
|
||||||
uvicorn
|
uvicorn
|
||||||
gunicorn>=23.0.0
|
gunicorn>=23.0.0
|
||||||
|
|||||||
@@ -1,12 +1,28 @@
|
|||||||
[supervisord]
|
[supervisord]
|
||||||
nodaemon=true
|
nodaemon=true ; Run supervisord in the foreground
|
||||||
|
logfile=/dev/null ; Log supervisord output to stdout/stderr
|
||||||
|
logfile_maxbytes=0
|
||||||
|
|
||||||
[program:redis]
|
[program:redis]
|
||||||
command=redis-server
|
command=/usr/bin/redis-server --loglevel notice ; Path to redis-server on Alpine
|
||||||
|
user=appuser ; Run redis as our non-root user
|
||||||
autorestart=true
|
autorestart=true
|
||||||
priority=10
|
priority=10
|
||||||
|
stdout_logfile=/dev/stdout ; Redirect redis stdout to container stdout
|
||||||
|
stdout_logfile_maxbytes=0
|
||||||
|
stderr_logfile=/dev/stderr ; Redirect redis stderr to container stderr
|
||||||
|
stderr_logfile_maxbytes=0
|
||||||
|
|
||||||
[program:gunicorn]
|
[program:gunicorn]
|
||||||
command=gunicorn --bind 0.0.0.0:8000 --workers 4 --threads 2 --timeout 300 --graceful-timeout 60 --keep-alive 65 --log-level debug --worker-class uvicorn.workers.UvicornWorker --max-requests 1000 --max-requests-jitter 50 server:app
|
command=/usr/local/bin/gunicorn --bind 0.0.0.0:11235 --workers 2 --threads 2 --timeout 120 --graceful-timeout 30 --keep-alive 60 --log-level info --worker-class uvicorn.workers.UvicornWorker server:app
|
||||||
|
directory=/app ; Working directory for the app
|
||||||
|
user=appuser ; Run gunicorn as our non-root user
|
||||||
autorestart=true
|
autorestart=true
|
||||||
priority=20
|
priority=20
|
||||||
|
environment=PYTHONUNBUFFERED=1 ; Ensure Python output is sent straight to logs
|
||||||
|
stdout_logfile=/dev/stdout ; Redirect gunicorn stdout to container stdout
|
||||||
|
stdout_logfile_maxbytes=0
|
||||||
|
stderr_logfile=/dev/stderr ; Redirect gunicorn stderr to container stderr
|
||||||
|
stderr_logfile_maxbytes=0
|
||||||
|
|
||||||
|
# Optional: Add filebeat or other logging agents here if needed
|
||||||
@@ -1,15 +1,30 @@
|
|||||||
# Base configuration (not a service, just a reusable config block)
|
# docker-compose.yml
|
||||||
|
|
||||||
|
# Base configuration anchor for reusability
|
||||||
x-base-config: &base-config
|
x-base-config: &base-config
|
||||||
ports:
|
ports:
|
||||||
|
# Map host port 11235 to container port 11235 (where Gunicorn will listen)
|
||||||
- "11235:11235"
|
- "11235:11235"
|
||||||
- "8000:8000"
|
# - "8080:8080" # Uncomment if needed
|
||||||
- "9222:9222"
|
|
||||||
- "8080:8080"
|
# Load API keys primarily from .llm.env file
|
||||||
|
# Create .llm.env in the root directory .llm.env.example
|
||||||
|
env_file:
|
||||||
|
- .llm.env
|
||||||
|
|
||||||
|
# Define environment variables, allowing overrides from host environment
|
||||||
|
# Syntax ${VAR:-} uses host env var 'VAR' if set, otherwise uses value from .llm.env
|
||||||
environment:
|
environment:
|
||||||
- CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-}
|
|
||||||
- OPENAI_API_KEY=${OPENAI_API_KEY:-}
|
- OPENAI_API_KEY=${OPENAI_API_KEY:-}
|
||||||
- CLAUDE_API_KEY=${CLAUDE_API_KEY:-}
|
- DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-}
|
||||||
|
- ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
|
||||||
|
- GROQ_API_KEY=${GROQ_API_KEY:-}
|
||||||
|
- TOGETHER_API_KEY=${TOGETHER_API_KEY:-}
|
||||||
|
- MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
|
||||||
|
- GEMINI_API_TOKEN=${GEMINI_API_TOKEN:-}
|
||||||
|
|
||||||
volumes:
|
volumes:
|
||||||
|
# Mount /dev/shm for Chromium/Playwright performance
|
||||||
- /dev/shm:/dev/shm
|
- /dev/shm:/dev/shm
|
||||||
deploy:
|
deploy:
|
||||||
resources:
|
resources:
|
||||||
@@ -19,47 +34,47 @@ x-base-config: &base-config
|
|||||||
memory: 1G
|
memory: 1G
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
healthcheck:
|
healthcheck:
|
||||||
|
# IMPORTANT: Ensure Gunicorn binds to 11235 in supervisord.conf
|
||||||
test: ["CMD", "curl", "-f", "http://localhost:11235/health"]
|
test: ["CMD", "curl", "-f", "http://localhost:11235/health"]
|
||||||
interval: 30s
|
interval: 30s
|
||||||
timeout: 10s
|
timeout: 10s
|
||||||
retries: 3
|
retries: 3
|
||||||
start_period: 40s
|
start_period: 40s # Give the server time to start
|
||||||
|
# Run the container as the non-root user defined in the Dockerfile
|
||||||
|
user: "appuser"
|
||||||
|
|
||||||
services:
|
services:
|
||||||
# Local build services for different platforms
|
# --- Local Build Services ---
|
||||||
crawl4ai-amd64:
|
crawl4ai-local-amd64:
|
||||||
build:
|
build:
|
||||||
context: .
|
context: . # Build context is the root directory
|
||||||
dockerfile: Dockerfile
|
dockerfile: Dockerfile # Dockerfile is in the root directory
|
||||||
args:
|
args:
|
||||||
PYTHON_VERSION: "3.10"
|
INSTALL_TYPE: ${INSTALL_TYPE:-default}
|
||||||
INSTALL_TYPE: ${INSTALL_TYPE:-basic}
|
ENABLE_GPU: ${ENABLE_GPU:-false}
|
||||||
ENABLE_GPU: false
|
# PYTHON_VERSION arg is omitted as it's fixed by 'FROM python:3.10-slim' in Dockerfile
|
||||||
platforms:
|
platform: linux/amd64
|
||||||
- linux/amd64
|
|
||||||
profiles: ["local-amd64"]
|
profiles: ["local-amd64"]
|
||||||
<<: *base-config # extends yerine doğrudan yapılandırmayı dahil ettik
|
<<: *base-config # Inherit base configuration
|
||||||
|
|
||||||
crawl4ai-arm64:
|
crawl4ai-local-arm64:
|
||||||
build:
|
build:
|
||||||
context: .
|
context: . # Build context is the root directory
|
||||||
dockerfile: Dockerfile
|
dockerfile: Dockerfile # Dockerfile is in the root directory
|
||||||
args:
|
args:
|
||||||
PYTHON_VERSION: "3.10"
|
INSTALL_TYPE: ${INSTALL_TYPE:-default}
|
||||||
INSTALL_TYPE: ${INSTALL_TYPE:-basic}
|
ENABLE_GPU: ${ENABLE_GPU:-false}
|
||||||
ENABLE_GPU: false
|
platform: linux/arm64
|
||||||
platforms:
|
|
||||||
- linux/arm64
|
|
||||||
profiles: ["local-arm64"]
|
profiles: ["local-arm64"]
|
||||||
<<: *base-config
|
<<: *base-config
|
||||||
|
|
||||||
# Hub services for different platforms and versions
|
# --- Docker Hub Image Services ---
|
||||||
crawl4ai-hub-amd64:
|
crawl4ai-hub-amd64:
|
||||||
image: unclecode/crawl4ai:${VERSION:-basic}-amd64
|
image: unclecode/crawl4ai:${VERSION:-latest}-amd64
|
||||||
profiles: ["hub-amd64"]
|
profiles: ["hub-amd64"]
|
||||||
<<: *base-config
|
<<: *base-config
|
||||||
|
|
||||||
crawl4ai-hub-arm64:
|
crawl4ai-hub-arm64:
|
||||||
image: unclecode/crawl4ai:${VERSION:-basic}-arm64
|
image: unclecode/crawl4ai:${VERSION:-latest}-arm64
|
||||||
profiles: ["hub-arm64"]
|
profiles: ["hub-arm64"]
|
||||||
<<: *base-config
|
<<: *base-config
|
||||||
@@ -357,8 +357,7 @@ async def demo_performance_analysis():
|
|||||||
async with AsyncWebCrawler() as crawler:
|
async with AsyncWebCrawler() as crawler:
|
||||||
config = CrawlerRunConfig(
|
config = CrawlerRunConfig(
|
||||||
capture_network_requests=True,
|
capture_network_requests=True,
|
||||||
wait_until="networkidle",
|
page_timeout=60 * 2 * 1000 # 120 seconds
|
||||||
page_timeout=60000 # 60 seconds
|
|
||||||
)
|
)
|
||||||
|
|
||||||
result = await crawler.arun(
|
result = await crawler.arun(
|
||||||
@@ -406,6 +405,13 @@ async def demo_performance_analysis():
|
|||||||
"url": url,
|
"url": url,
|
||||||
"duration_ms": duration
|
"duration_ms": duration
|
||||||
})
|
})
|
||||||
|
if isinstance(timing, dict) and "requestStart" in timing and "responseStart" in timing and "startTime" in timing:
|
||||||
|
# Convert to milliseconds
|
||||||
|
duration = (timing["responseStart"] - timing["requestStart"]) * 1000
|
||||||
|
resource_timings[resource_type].append({
|
||||||
|
"url": url,
|
||||||
|
"duration_ms": duration
|
||||||
|
})
|
||||||
|
|
||||||
# Calculate statistics for each resource type
|
# Calculate statistics for each resource type
|
||||||
print("\nPerformance by resource type:")
|
print("\nPerformance by resource type:")
|
||||||
@@ -455,14 +461,14 @@ async def main():
|
|||||||
os.makedirs(os.path.join(__cur_dir__, "tmp"), exist_ok=True)
|
os.makedirs(os.path.join(__cur_dir__, "tmp"), exist_ok=True)
|
||||||
|
|
||||||
# Run basic examples
|
# Run basic examples
|
||||||
await demo_basic_network_capture()
|
# await demo_basic_network_capture()
|
||||||
await demo_basic_console_capture()
|
await demo_basic_console_capture()
|
||||||
await demo_combined_capture()
|
# await demo_combined_capture()
|
||||||
|
|
||||||
# Run advanced examples
|
# Run advanced examples
|
||||||
await analyze_spa_network_traffic()
|
# await analyze_spa_network_traffic()
|
||||||
await demo_security_analysis()
|
# await demo_security_analysis()
|
||||||
await demo_performance_analysis()
|
# await demo_performance_analysis()
|
||||||
|
|
||||||
print("\n=== Examples Complete ===")
|
print("\n=== Examples Complete ===")
|
||||||
print(f"Check the tmp directory for output files: {os.path.join(__cur_dir__, 'tmp')}")
|
print(f"Check the tmp directory for output files: {os.path.join(__cur_dir__, 'tmp')}")
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ import json
|
|||||||
import base64
|
import base64
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List
|
from typing import List
|
||||||
from crawl4ai.proxy_strategy import ProxyConfig
|
from crawl4ai import ProxyConfig
|
||||||
|
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, CrawlResult
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, CrawlResult
|
||||||
from crawl4ai import RoundRobinProxyStrategy
|
from crawl4ai import RoundRobinProxyStrategy
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ from crawl4ai.deep_crawling import (
|
|||||||
)
|
)
|
||||||
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
|
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
|
||||||
from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
|
from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
|
||||||
from crawl4ai.proxy_strategy import ProxyConfig
|
from crawl4ai import ProxyConfig
|
||||||
from crawl4ai import RoundRobinProxyStrategy
|
from crawl4ai import RoundRobinProxyStrategy
|
||||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||||
from crawl4ai import DefaultMarkdownGenerator
|
from crawl4ai import DefaultMarkdownGenerator
|
||||||
|
|||||||
444
docs/md_v2/ask_ai/ask-ai.css
Normal file
444
docs/md_v2/ask_ai/ask-ai.css
Normal file
@@ -0,0 +1,444 @@
|
|||||||
|
/* ==== File: docs/ask_ai/ask_ai.css ==== */
|
||||||
|
|
||||||
|
/* --- Basic Reset & Font --- */
|
||||||
|
body {
|
||||||
|
/* Attempt to inherit variables from parent window (iframe context) */
|
||||||
|
/* Fallback values if variables are not inherited */
|
||||||
|
--fallback-bg: #070708;
|
||||||
|
--fallback-font: #e8e9ed;
|
||||||
|
--fallback-secondary: #a3abba;
|
||||||
|
--fallback-primary: #50ffff;
|
||||||
|
--fallback-primary-dimmed: #09b5a5;
|
||||||
|
--fallback-border: #1d1d20;
|
||||||
|
--fallback-code-bg: #1e1e1e;
|
||||||
|
--fallback-invert-font: #222225;
|
||||||
|
--font-stack: dm, Monaco, Courier New, monospace, serif;
|
||||||
|
|
||||||
|
font-family: var(--font-stack, "Courier New", monospace); /* Use theme font stack */
|
||||||
|
background-color: var(--background-color, var(--fallback-bg));
|
||||||
|
color: var(--font-color, var(--fallback-font));
|
||||||
|
margin: 0;
|
||||||
|
padding: 0;
|
||||||
|
font-size: 14px; /* Match global font size */
|
||||||
|
line-height: 1.5em; /* Match global line height */
|
||||||
|
height: 100vh; /* Ensure body takes full height */
|
||||||
|
overflow: hidden; /* Prevent body scrollbars, panels handle scroll */
|
||||||
|
display: flex; /* Use flex for the main container */
|
||||||
|
}
|
||||||
|
|
||||||
|
a {
|
||||||
|
color: var(--secondary-color, var(--fallback-secondary));
|
||||||
|
text-decoration: none;
|
||||||
|
transition: color 0.2s;
|
||||||
|
}
|
||||||
|
a:hover {
|
||||||
|
color: var(--primary-color, var(--fallback-primary));
|
||||||
|
}
|
||||||
|
|
||||||
|
/* --- Main Container Layout --- */
|
||||||
|
.ai-assistant-container {
|
||||||
|
display: flex;
|
||||||
|
width: 100%;
|
||||||
|
height: 100%;
|
||||||
|
background-color: var(--background-color, var(--fallback-bg));
|
||||||
|
}
|
||||||
|
|
||||||
|
/* --- Sidebar Styling --- */
|
||||||
|
.sidebar {
|
||||||
|
flex-shrink: 0; /* Prevent sidebars from shrinking */
|
||||||
|
height: 100%;
|
||||||
|
display: flex;
|
||||||
|
flex-direction: column;
|
||||||
|
/* background-color: var(--code-bg-color, var(--fallback-code-bg)); */
|
||||||
|
overflow-y: hidden; /* Header fixed, list scrolls */
|
||||||
|
}
|
||||||
|
|
||||||
|
.left-sidebar {
|
||||||
|
flex-basis: 240px; /* Width of history panel */
|
||||||
|
border-right: 1px solid var(--progress-bar-background, var(--fallback-border));
|
||||||
|
}
|
||||||
|
|
||||||
|
.right-sidebar {
|
||||||
|
flex-basis: 280px; /* Width of citations panel */
|
||||||
|
border-left: 1px solid var(--progress-bar-background, var(--fallback-border));
|
||||||
|
}
|
||||||
|
|
||||||
|
.sidebar header {
|
||||||
|
padding: 0.6em 1em;
|
||||||
|
border-bottom: 1px solid var(--progress-bar-background, var(--fallback-border));
|
||||||
|
flex-shrink: 0;
|
||||||
|
display: flex;
|
||||||
|
justify-content: space-between;
|
||||||
|
align-items: center;
|
||||||
|
}
|
||||||
|
|
||||||
|
.sidebar header h3 {
|
||||||
|
margin: 0;
|
||||||
|
font-size: 1.1em;
|
||||||
|
color: var(--font-color, var(--fallback-font));
|
||||||
|
}
|
||||||
|
|
||||||
|
.sidebar ul {
|
||||||
|
list-style: none;
|
||||||
|
padding: 0;
|
||||||
|
margin: 0;
|
||||||
|
overflow-y: auto; /* Enable scrolling for the list */
|
||||||
|
flex-grow: 1; /* Allow list to take remaining space */
|
||||||
|
padding: 0.5em 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
.sidebar ul li {
|
||||||
|
padding: 0.3em 1em;
|
||||||
|
}
|
||||||
|
.sidebar ul li.no-citations,
|
||||||
|
.sidebar ul li.no-history {
|
||||||
|
color: var(--secondary-color, var(--fallback-secondary));
|
||||||
|
font-style: italic;
|
||||||
|
font-size: 0.9em;
|
||||||
|
padding-left: 1em;
|
||||||
|
}
|
||||||
|
|
||||||
|
.sidebar ul li a {
|
||||||
|
color: var(--secondary-color, var(--fallback-secondary));
|
||||||
|
text-decoration: none;
|
||||||
|
display: block;
|
||||||
|
padding: 0.2em 0.5em;
|
||||||
|
border-radius: 3px;
|
||||||
|
transition: background-color 0.2s, color 0.2s;
|
||||||
|
}
|
||||||
|
|
||||||
|
.sidebar ul li a:hover {
|
||||||
|
color: var(--primary-color, var(--fallback-primary));
|
||||||
|
background-color: rgba(80, 255, 255, 0.08); /* Use primary color with alpha */
|
||||||
|
}
|
||||||
|
/* Style for active history item */
|
||||||
|
#history-list li.active a {
|
||||||
|
color: var(--primary-dimmed-color, var(--fallback-primary-dimmed));
|
||||||
|
font-weight: bold;
|
||||||
|
background-color: rgba(80, 255, 255, 0.12);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* --- Chat Panel Styling --- */
|
||||||
|
#chat-panel {
|
||||||
|
flex-grow: 1; /* Take remaining space */
|
||||||
|
display: flex;
|
||||||
|
flex-direction: column;
|
||||||
|
height: 100%;
|
||||||
|
overflow: hidden; /* Prevent overflow, internal elements handle scroll */
|
||||||
|
}
|
||||||
|
|
||||||
|
#chat-messages {
|
||||||
|
flex-grow: 1;
|
||||||
|
overflow-y: auto; /* Scrollable chat history */
|
||||||
|
padding: 1em 1.5em;
|
||||||
|
border-bottom: 1px solid var(--progress-bar-background, var(--fallback-border));
|
||||||
|
}
|
||||||
|
|
||||||
|
.message {
|
||||||
|
margin-bottom: 1em;
|
||||||
|
padding: 0.8em 1.2em;
|
||||||
|
border-radius: 8px;
|
||||||
|
max-width: 90%; /* Slightly wider */
|
||||||
|
line-height: 1.6;
|
||||||
|
/* Apply pre-wrap for better handling of spaces/newlines AND wrapping */
|
||||||
|
white-space: pre-wrap;
|
||||||
|
word-wrap: break-word; /* Ensure long words break */
|
||||||
|
}
|
||||||
|
|
||||||
|
.user-message {
|
||||||
|
background-color: var(--progress-bar-background, var(--fallback-border)); /* User message background */
|
||||||
|
color: var(--font-color, var(--fallback-font));
|
||||||
|
margin-left: auto; /* Align user messages to the right */
|
||||||
|
text-align: left;
|
||||||
|
}
|
||||||
|
|
||||||
|
.ai-message {
|
||||||
|
background-color: var(--code-bg-color, var(--fallback-code-bg)); /* AI message background */
|
||||||
|
color: var(--font-color, var(--fallback-font));
|
||||||
|
margin-right: auto; /* Align AI messages to the left */
|
||||||
|
border: 1px solid var(--progress-bar-background, var(--fallback-border));
|
||||||
|
}
|
||||||
|
.ai-message.welcome-message {
|
||||||
|
border: none;
|
||||||
|
background-color: transparent;
|
||||||
|
max-width: 100%;
|
||||||
|
text-align: center;
|
||||||
|
color: var(--secondary-color, var(--fallback-secondary));
|
||||||
|
white-space: normal;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Styles for code within messages */
|
||||||
|
.ai-message code {
|
||||||
|
background-color: var(--invert-font-color, var(--fallback-invert-font)) !important; /* Use light bg for code */
|
||||||
|
/* color: var(--background-color, var(--fallback-bg)) !important; Dark text */
|
||||||
|
padding: 0.1em 0.4em;
|
||||||
|
border-radius: 4px;
|
||||||
|
font-size: 0.9em;
|
||||||
|
}
|
||||||
|
.ai-message pre {
|
||||||
|
background-color: var(--invert-font-color, var(--fallback-invert-font)) !important;
|
||||||
|
color: var(--background-color, var(--fallback-bg)) !important;
|
||||||
|
padding: 1em;
|
||||||
|
border-radius: 5px;
|
||||||
|
overflow-x: auto;
|
||||||
|
margin: 0.8em 0;
|
||||||
|
white-space: pre;
|
||||||
|
}
|
||||||
|
.ai-message pre code {
|
||||||
|
background-color: transparent !important;
|
||||||
|
padding: 0;
|
||||||
|
font-size: inherit;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Override white-space for specific elements generated by Markdown */
|
||||||
|
.ai-message p,
|
||||||
|
.ai-message ul,
|
||||||
|
.ai-message ol,
|
||||||
|
.ai-message blockquote {
|
||||||
|
white-space: normal; /* Allow standard wrapping for block elements */
|
||||||
|
}
|
||||||
|
|
||||||
|
/* --- Markdown Element Styling within Messages --- */
|
||||||
|
.message p {
|
||||||
|
margin-top: 0;
|
||||||
|
margin-bottom: 0.5em;
|
||||||
|
}
|
||||||
|
.message p:last-child {
|
||||||
|
margin-bottom: 0;
|
||||||
|
}
|
||||||
|
.message ul,
|
||||||
|
.message ol {
|
||||||
|
margin: 0.5em 0 0.5em 1.5em;
|
||||||
|
padding: 0;
|
||||||
|
}
|
||||||
|
.message li {
|
||||||
|
margin-bottom: 0.2em;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Code block styling (adjusts previous rules slightly) */
|
||||||
|
.message code {
|
||||||
|
/* Inline code */
|
||||||
|
background-color: var(--invert-font-color, var(--fallback-invert-font)) !important;
|
||||||
|
color: var(--font-color);
|
||||||
|
padding: 0.1em 0.4em;
|
||||||
|
border-radius: 4px;
|
||||||
|
font-size: 0.9em;
|
||||||
|
/* Ensure inline code breaks nicely */
|
||||||
|
word-break: break-all;
|
||||||
|
white-space: normal; /* Allow inline code to wrap if needed */
|
||||||
|
}
|
||||||
|
.message pre {
|
||||||
|
/* Code block container */
|
||||||
|
background-color: var(--invert-font-color, var(--fallback-invert-font)) !important;
|
||||||
|
color: var(--background-color, var(--fallback-bg)) !important;
|
||||||
|
padding: 1em;
|
||||||
|
border-radius: 5px;
|
||||||
|
overflow-x: auto;
|
||||||
|
margin: 0.8em 0;
|
||||||
|
font-size: 0.9em; /* Slightly smaller code blocks */
|
||||||
|
}
|
||||||
|
.message pre code {
|
||||||
|
/* Code within code block */
|
||||||
|
background-color: transparent !important;
|
||||||
|
padding: 0;
|
||||||
|
font-size: inherit;
|
||||||
|
word-break: normal; /* Don't break words in code blocks */
|
||||||
|
white-space: pre; /* Preserve whitespace strictly in code blocks */
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Thinking indicator */
|
||||||
|
.message-thinking {
|
||||||
|
display: inline-block;
|
||||||
|
width: 5px;
|
||||||
|
height: 5px;
|
||||||
|
background-color: var(--primary-color, var(--fallback-primary));
|
||||||
|
border-radius: 50%;
|
||||||
|
margin-left: 8px;
|
||||||
|
vertical-align: middle;
|
||||||
|
animation: thinking 1s infinite ease-in-out;
|
||||||
|
}
|
||||||
|
@keyframes thinking {
|
||||||
|
0%,
|
||||||
|
100% {
|
||||||
|
opacity: 0.5;
|
||||||
|
transform: scale(0.8);
|
||||||
|
}
|
||||||
|
50% {
|
||||||
|
opacity: 1;
|
||||||
|
transform: scale(1.2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* --- Thinking Indicator (Blinking Cursor Style) --- */
|
||||||
|
.thinking-indicator-cursor {
|
||||||
|
display: inline-block;
|
||||||
|
width: 10px; /* Width of the cursor */
|
||||||
|
height: 1.1em; /* Match line height */
|
||||||
|
background-color: var(--primary-color, var(--fallback-primary));
|
||||||
|
margin-left: 5px;
|
||||||
|
vertical-align: text-bottom; /* Align with text baseline */
|
||||||
|
animation: blink-cursor 1s step-end infinite;
|
||||||
|
}
|
||||||
|
|
||||||
|
@keyframes blink-cursor {
|
||||||
|
from,
|
||||||
|
to {
|
||||||
|
background-color: transparent;
|
||||||
|
}
|
||||||
|
50% {
|
||||||
|
background-color: var(--primary-color, var(--fallback-primary));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Chat input row: auto-growing textarea plus send button, pinned to the
   bottom of the chat panel. */
#chat-input-area {
  flex-shrink: 0; /* Prevent input area from shrinking */
  padding: 1em 1.5em;
  display: flex;
  align-items: flex-end; /* Align items to bottom */
  gap: 10px;
  background-color: var(--code-bg-color, var(--fallback-code-bg)); /* Match sidebars */
}

#chat-input-area textarea {
  flex-grow: 1;
  padding: 0.8em 1em;
  border: 1px solid var(--progress-bar-background, var(--fallback-border));
  background-color: var(--background-color, var(--fallback-bg));
  color: var(--font-color, var(--fallback-font));
  border-radius: 5px;
  resize: none; /* Disable manual resize; JS grows it instead */
  font-family: inherit;
  font-size: 1em;
  line-height: 1.4;
  max-height: 150px; /* Limit excessive height */
  overflow-y: auto;
  /* rows: 2; */
}

#chat-input-area button {
  /* Basic button styling - maybe inherit from main theme? */
  padding: 0.6em 1.2em;
  border: 1px solid var(--primary-dimmed-color, var(--fallback-primary-dimmed));
  background-color: var(--primary-dimmed-color, var(--fallback-primary-dimmed));
  color: var(--background-color, var(--fallback-bg));
  border-radius: 5px;
  cursor: pointer;
  font-size: 0.9em;
  transition: background-color 0.2s, border-color 0.2s;
  height: min-content; /* Align with bottom of textarea */
}

#chat-input-area button:hover {
  background-color: var(--primary-color, var(--fallback-primary));
  border-color: var(--primary-color, var(--fallback-primary));
}

#chat-input-area button:disabled {
  opacity: 0.6;
  cursor: not-allowed;
}

/* Inline "Thinking..." text shown while waiting (currently optional) */
.loading-indicator {
  font-size: 0.9em;
  color: var(--secondary-color, var(--fallback-secondary));
  margin-right: 10px;
  align-self: center;
}
|
||||||
|
|
||||||
|
/* --- Buttons --- */
/* Inherit some button styles if possible; used e.g. by "New Chat". */
.btn.btn-sm {
  color: var(--font-color, var(--fallback-font));
  padding: 0.2em 0.5em;
  font-size: 0.8em;
  border: 1px solid var(--secondary-color, var(--fallback-secondary));
  background: none;
  border-radius: 3px;
  cursor: pointer;
}

.btn.btn-sm:hover {
  border-color: var(--font-color, var(--fallback-font));
  background-color: var(--progress-bar-background, var(--fallback-border));
}
|
||||||
|
|
||||||
|
/* --- Basic Responsiveness --- */
/* Mid-width screens: shrink both sidebars rather than hiding them. */
@media screen and (max-width: 900px) {
  .left-sidebar {
    flex-basis: 200px; /* Shrink history */
  }
  .right-sidebar {
    flex-basis: 240px; /* Shrink citations */
  }
}

@media screen and (max-width: 768px) {
  /* Stack layout on mobile? Or hide sidebars? Hiding for now */
  .sidebar {
    display: none; /* Hide sidebars on small screens */
  }
  /* Could add toggle buttons later */
}
|
||||||
|
|
||||||
|
|
||||||
|
/* ==== File: docs/ask_ai/ask-ai.css (Updates V4 - Delete Button) ==== */

/* History entries: flex row so the load-link and the delete button share
   one line, with the link taking all remaining width. */
.sidebar ul li {
  /* Use flexbox to align link and delete button */
  display: flex;
  justify-content: space-between;
  align-items: center;
  padding: 0; /* Remove padding from li, add to link/button */
  margin: 0.1em 0; /* Small vertical margin */
}

.sidebar ul li a {
  /* Link takes most space */
  flex-grow: 1;
  padding: 0.3em 0.5em 0.3em 1em; /* Adjust padding */
  /* Make ellipsis work for long titles */
  white-space: nowrap;
  overflow: hidden;
  text-overflow: ellipsis;
  /* Keep existing link styles */
  color: var(--secondary-color, var(--fallback-secondary));
  text-decoration: none;
  display: block;
  border-radius: 3px;
  transition: background-color 0.2s, color 0.2s;
}

.sidebar ul li a:hover {
  color: var(--primary-color, var(--fallback-primary));
  background-color: rgba(80, 255, 255, 0.08);
}

/* Style for active history item's link */
#history-list li.active a {
  color: var(--primary-dimmed-color, var(--fallback-primary-dimmed));
  font-weight: bold;
  background-color: rgba(80, 255, 255, 0.12);
}

/* --- Delete Chat Button --- */
/* Dimmed by default; revealed when its row (or the button) is hovered. */
.delete-chat-btn {
  flex-shrink: 0; /* Don't shrink */
  background: none;
  border: none;
  color: var(--secondary-color, var(--fallback-secondary));
  cursor: pointer;
  padding: 0.4em 0.8em; /* Padding around icon */
  font-size: 0.9em;
  opacity: 0.5; /* Dimmed by default */
  transition: opacity 0.2s, color 0.2s;
  margin-left: 5px; /* Space between link and button */
  border-radius: 3px;
}

.sidebar ul li:hover .delete-chat-btn,
.delete-chat-btn:hover {
  opacity: 1; /* Show fully on hover */
  color: var(--error-color, #ff3c74); /* Use error color on hover */
}

.delete-chat-btn:focus {
  outline: 1px dashed var(--error-color, #ff3c74); /* Accessibility */
  opacity: 1;
}
|
||||||
603
docs/md_v2/ask_ai/ask-ai.js
Normal file
603
docs/md_v2/ask_ai/ask-ai.js
Normal file
@@ -0,0 +1,603 @@
|
|||||||
|
// ==== File: docs/ask_ai/ask-ai.js (Marked, Streaming, History) ====

// Everything runs inside DOMContentLoaded so the panel elements exist.
// NOTE(review): assumes the globals `marked` (markdown) and `hljs`
// (syntax highlighting) are loaded by the embedding page — confirm.
document.addEventListener("DOMContentLoaded", () => {
    console.log("AI Assistant JS V2 Loaded");

    // --- DOM Element Selectors ---
    const historyList = document.getElementById("history-list");
    const newChatButton = document.getElementById("new-chat-button");
    const chatMessages = document.getElementById("chat-messages");
    const chatInput = document.getElementById("chat-input");
    const sendButton = document.getElementById("send-button");
    const citationsList = document.getElementById("citations-list");

    // --- Constants ---
    // localStorage keys: one index entry listing all chats, plus one
    // record per chat stored under CHAT_PREFIX + chatId.
    const CHAT_INDEX_KEY = "aiAssistantChatIndex_v1";
    const CHAT_PREFIX = "aiAssistantChat_v1_";

    // --- State ---
    let currentChatId = null;
    let conversationHistory = []; // Holds message objects { sender: 'user'/'ai', text: '...' }
    let isThinking = false; // True while an AI reply is pending/streaming
    let streamInterval = null; // To control the streaming interval

    // --- Event Listeners ---
    sendButton.addEventListener("click", handleSendMessage);
    chatInput.addEventListener("keydown", handleInputKeydown);
    newChatButton.addEventListener("click", handleNewChat);
    chatInput.addEventListener("input", autoGrowTextarea);

    // --- Initialization ---
    loadChatHistoryIndex(); // Load history list on startup
    // NOTE(review): reads the PARENT frame's location — assumes same-origin
    // embedding (iframe inside the docs site); confirm.
    const initialQuery = checkForInitialQuery(window.parent.location); // Check for query param
    if (!initialQuery) {
        loadInitialChat(); // Load normally if no query
    }
|
||||||
|
|
||||||
|
// --- Core Functions ---
|
||||||
|
|
||||||
|
    /**
     * Sends the text currently in the input box.
     * Pushes the user message into state/UI, creates an empty AI bubble with
     * a thinking cursor, and (for now) streams a hard-coded simulated
     * response into it. The real backend call is still a TODO.
     */
    function handleSendMessage() {
        const userMessageText = chatInput.value.trim();
        if (!userMessageText || isThinking) return;

        setThinking(true); // Start thinking state

        // Add user message to state and UI
        const userMessage = { sender: "user", text: userMessageText };
        conversationHistory.push(userMessage);
        addMessageToChat(userMessage, false); // Add user message without parsing markdown

        chatInput.value = "";
        autoGrowTextarea(); // Reset textarea height

        // Prepare for AI response (create empty div)
        const aiMessageDiv = addMessageToChat({ sender: "ai", text: "" }, true); // Add empty div with thinking indicator

        // TODO: Generate fingerprint/JWT here

        // TODO: Send `conversationHistory` + JWT to backend API
        // Replace placeholder below with actual API call
        // The backend should ideally return a stream of text tokens

        // --- Placeholder Streaming Simulation ---
        const simulatedFullResponse = `Okay, Here’s a minimal Python script that creates an AsyncWebCrawler, fetches a webpage, and prints the first 300 characters of its Markdown output:

\`\`\`python
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://example.com")
        print(result.markdown[:300]) # Print first 300 chars

if __name__ == "__main__":
    asyncio.run(main())
\`\`\`

A code snippet: \`crawler.run()\`. Check the [quickstart](/core/quickstart).`;

        // Simulate receiving the response stream
        streamSimulatedResponse(aiMessageDiv, simulatedFullResponse);

        // // Simulate receiving citations *after* stream starts (or with first chunk)
        // setTimeout(() => {
        //   addCitations([
        //     { title: "Simulated Doc 1", url: "#sim1" },
        //     { title: "Another Concept", url: "#sim2" },
        //   ]);
        // }, 500); // Citations appear shortly after thinking starts
    }
|
||||||
|
|
||||||
|
function handleInputKeydown(event) {
|
||||||
|
if (event.key === "Enter" && !event.shiftKey) {
|
||||||
|
event.preventDefault();
|
||||||
|
handleSendMessage();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
    /**
     * Appends a message bubble to the chat pane.
     * @param message { sender: "user"|"ai", text: string } — non-empty text
     *   is rendered through the global marked.parse().
     * @param addThinkingIndicator when true and the AI message has no text
     *   yet, a blinking-cursor placeholder is appended (pending reply).
     * @returns the created message <div> so callers can stream into it.
     */
    function addMessageToChat(message, addThinkingIndicator = false) {
        const messageDiv = document.createElement("div");
        messageDiv.classList.add("message", `${message.sender}-message`);

        // Parse markdown and set HTML
        // NOTE(review): user messages also pass through marked.parse here,
        // despite the "plain text" comment below — confirm intent.
        messageDiv.innerHTML = message.text ? marked.parse(message.text) : "";

        if (message.sender === "ai") {
            // Apply Syntax Highlighting AFTER setting innerHTML
            messageDiv.querySelectorAll("pre code:not(.hljs)").forEach((block) => {
                if (typeof hljs !== "undefined") {
                    // Check if already highlighted to prevent double-highlighting issues
                    if (!block.classList.contains("hljs")) {
                        hljs.highlightElement(block);
                    }
                } else {
                    console.warn("highlight.js (hljs) not found for syntax highlighting.");
                }
            });

            // Add thinking indicator if needed (and not already present)
            if (addThinkingIndicator && !message.text && !messageDiv.querySelector(".thinking-indicator-cursor")) {
                const thinkingDiv = document.createElement("div");
                thinkingDiv.className = "thinking-indicator-cursor";
                messageDiv.appendChild(thinkingDiv);
            }
        } else {
            // User messages remain plain text
            // messageDiv.textContent = message.text;
        }

        // Wrap each <pre> in a div.terminal (styling hook for code blocks)
        messageDiv.querySelectorAll("pre").forEach((block) => {
            const wrapper = document.createElement("div");
            wrapper.className = "terminal";
            block.parentNode.insertBefore(wrapper, block);
            wrapper.appendChild(block);
        });

        chatMessages.appendChild(messageDiv);
        // Scroll only if user is near the bottom? (More advanced)
        // Simple scroll for now:
        scrollToBottom();
        return messageDiv; // Return the created element
    }
|
||||||
|
|
||||||
|
    /**
     * Simulates a token-by-token streaming AI reply into messageDiv.
     * Splits fullText on whitespace (keeping the separators as tokens),
     * then appends one token every 50 ms, re-rendering the accumulated
     * markdown with a trailing cursor span. On completion: final render
     * without cursor, syntax highlighting, citation extraction, history
     * push, persistence, and thinking-state reset.
     * Uses the shared `streamInterval` handle so setThinking() can cancel
     * an in-flight stream.
     */
    function streamSimulatedResponse(messageDiv, fullText) {
        const thinkingIndicator = messageDiv.querySelector(".thinking-indicator-cursor");
        if (thinkingIndicator) thinkingIndicator.remove();

        // Capturing split keeps whitespace runs, so spacing is preserved.
        const tokens = fullText.split(/(\s+)/);
        let currentText = "";
        let tokenIndex = 0;
        // Clear previous interval just in case
        if (streamInterval) clearInterval(streamInterval);

        streamInterval = setInterval(() => {
            const cursorSpan = '<span class="thinking-indicator-cursor"></span>'; // Cursor for streaming
            if (tokenIndex < tokens.length) {
                currentText += tokens[tokenIndex];
                // Render intermediate markdown + cursor
                messageDiv.innerHTML = marked.parse(currentText + cursorSpan);
                // Re-highlight code blocks on each stream update - might be slightly inefficient
                // but ensures partial code blocks look okay. Highlight only final on completion.
                // messageDiv.querySelectorAll('pre code:not(.hljs)').forEach((block) => {
                //   hljs.highlightElement(block);
                // });
                scrollToBottom(); // Keep scrolling as content streams
                tokenIndex++;
            } else {
                // Streaming finished
                clearInterval(streamInterval);
                streamInterval = null;

                // Final render without cursor
                messageDiv.innerHTML = marked.parse(currentText);

                // === Final Syntax Highlighting ===
                messageDiv.querySelectorAll("pre code:not(.hljs)").forEach((block) => {
                    if (typeof hljs !== "undefined" && !block.classList.contains("hljs")) {
                        hljs.highlightElement(block);
                    }
                });

                // === Extract Citations ===
                const citations = extractMarkdownLinks(currentText);

                // Wrap each pre in a div.terminal
                messageDiv.querySelectorAll("pre").forEach((block) => {
                    const wrapper = document.createElement("div");
                    wrapper.className = "terminal";
                    block.parentNode.insertBefore(wrapper, block);
                    wrapper.appendChild(block);
                });

                const aiMessage = { sender: "ai", text: currentText, citations: citations };
                conversationHistory.push(aiMessage);
                updateCitationsDisplay();
                saveCurrentChat();
                setThinking(false);
            }
        }, 50); // Adjust speed
    }
|
||||||
|
|
||||||
|
// === NEW Function to Extract Links ===
|
||||||
|
function extractMarkdownLinks(markdownText) {
|
||||||
|
const regex = /\[([^\]]+)\]\(([^)]+)\)/g; // [text](url)
|
||||||
|
const citations = [];
|
||||||
|
let match;
|
||||||
|
while ((match = regex.exec(markdownText)) !== null) {
|
||||||
|
// Avoid adding self-links from within the citations list if AI includes them
|
||||||
|
if (!match[2].startsWith("#citation-")) {
|
||||||
|
citations.push({
|
||||||
|
title: match[1].trim(),
|
||||||
|
url: match[2].trim(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Optional: Deduplicate links based on URL
|
||||||
|
const uniqueCitations = citations.filter(
|
||||||
|
(citation, index, self) => index === self.findIndex((c) => c.url === citation.url)
|
||||||
|
);
|
||||||
|
return uniqueCitations;
|
||||||
|
}
|
||||||
|
|
||||||
|
// === REVISED Function to Display Citations ===
|
||||||
|
function updateCitationsDisplay() {
|
||||||
|
let lastCitations = null;
|
||||||
|
// Find the most recent AI message with citations
|
||||||
|
for (let i = conversationHistory.length - 1; i >= 0; i--) {
|
||||||
|
if (
|
||||||
|
conversationHistory[i].sender === "ai" &&
|
||||||
|
conversationHistory[i].citations &&
|
||||||
|
conversationHistory[i].citations.length > 0
|
||||||
|
) {
|
||||||
|
lastCitations = conversationHistory[i].citations;
|
||||||
|
break; // Found the latest citations
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
citationsList.innerHTML = ""; // Clear previous
|
||||||
|
if (!lastCitations) {
|
||||||
|
citationsList.innerHTML = '<li class="no-citations">No citations available.</li>';
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
lastCitations.forEach((citation, index) => {
|
||||||
|
const li = document.createElement("li");
|
||||||
|
const a = document.createElement("a");
|
||||||
|
// Generate a unique ID for potential internal linking if needed
|
||||||
|
// a.id = `citation-${index}`;
|
||||||
|
a.href = citation.url || "#";
|
||||||
|
a.textContent = citation.title;
|
||||||
|
a.target = "_top"; // Open in main window
|
||||||
|
li.appendChild(a);
|
||||||
|
citationsList.appendChild(li);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
    /**
     * Renders a citations array into the right-hand sidebar list.
     * Each { title, url } becomes a link that opens in the top window; a
     * null/empty array shows the "no citations" placeholder instead.
     */
    function addCitations(citations) {
        citationsList.innerHTML = ""; // Clear
        if (!citations || citations.length === 0) {
            citationsList.innerHTML = '<li class="no-citations">No citations available.</li>';
            return;
        }
        citations.forEach((citation) => {
            const li = document.createElement("li");
            const a = document.createElement("a");
            a.href = citation.url || "#";
            a.textContent = citation.title;
            a.target = "_top"; // Open in main window
            li.appendChild(a);
            citationsList.appendChild(li);
        });
    }
|
||||||
|
|
||||||
|
function setThinking(thinking) {
|
||||||
|
isThinking = thinking;
|
||||||
|
sendButton.disabled = thinking;
|
||||||
|
chatInput.disabled = thinking;
|
||||||
|
chatInput.placeholder = thinking ? "AI is responding..." : "Ask about Crawl4AI...";
|
||||||
|
// Stop any existing stream if we start thinking again (e.g., rapid resend)
|
||||||
|
if (thinking && streamInterval) {
|
||||||
|
clearInterval(streamInterval);
|
||||||
|
streamInterval = null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function autoGrowTextarea() {
|
||||||
|
chatInput.style.height = "auto";
|
||||||
|
chatInput.style.height = `${chatInput.scrollHeight}px`;
|
||||||
|
}
|
||||||
|
|
||||||
|
function scrollToBottom() {
|
||||||
|
chatMessages.scrollTop = chatMessages.scrollHeight;
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- Query Parameter Handling ---
|
||||||
|
function checkForInitialQuery(locationToCheck) {
|
||||||
|
// <-- Receive location object
|
||||||
|
if (!locationToCheck) {
|
||||||
|
console.warn("Ask AI: Could not access parent window location.");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
const urlParams = new URLSearchParams(locationToCheck.search); // <-- Use passed location's search string
|
||||||
|
const encodedQuery = urlParams.get("qq"); // <-- Use 'qq'
|
||||||
|
|
||||||
|
if (encodedQuery) {
|
||||||
|
console.log("Initial query found (qq):", encodedQuery);
|
||||||
|
try {
|
||||||
|
const decodedText = decodeURIComponent(escape(atob(encodedQuery)));
|
||||||
|
console.log("Decoded query:", decodedText);
|
||||||
|
|
||||||
|
// Start new chat immediately
|
||||||
|
handleNewChat(true);
|
||||||
|
|
||||||
|
// Delay setting input and sending message slightly
|
||||||
|
setTimeout(() => {
|
||||||
|
chatInput.value = decodedText;
|
||||||
|
autoGrowTextarea();
|
||||||
|
handleSendMessage();
|
||||||
|
|
||||||
|
// Clean the PARENT window's URL
|
||||||
|
try {
|
||||||
|
const cleanUrl = locationToCheck.pathname;
|
||||||
|
// Use parent's history object
|
||||||
|
window.parent.history.replaceState({}, window.parent.document.title, cleanUrl);
|
||||||
|
} catch (e) {
|
||||||
|
console.warn("Ask AI: Could not clean parent URL using replaceState.", e);
|
||||||
|
// This might fail due to cross-origin restrictions if served differently,
|
||||||
|
// but should work fine with mkdocs serve on the same origin.
|
||||||
|
}
|
||||||
|
}, 100);
|
||||||
|
|
||||||
|
return true; // Query processed
|
||||||
|
} catch (e) {
|
||||||
|
console.error("Error decoding initial query (qq):", e);
|
||||||
|
// Clean the PARENT window's URL even on error
|
||||||
|
try {
|
||||||
|
const cleanUrl = locationToCheck.pathname;
|
||||||
|
window.parent.history.replaceState({}, window.parent.document.title, cleanUrl);
|
||||||
|
} catch (cleanError) {
|
||||||
|
console.warn("Ask AI: Could not clean parent URL after decode error.", cleanError);
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false; // No 'qq' query found
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- History Management ---
|
||||||
|
|
||||||
|
    /**
     * Starts a brand-new chat session.
     * @param isFromQuery true when triggered by the "qq" URL parameter; in
     *   that case the current chat is NOT saved first and no welcome bubble
     *   is shown (the decoded query is submitted right after by the caller).
     */
    function handleNewChat(isFromQuery = false) {
        if (isThinking) return; // Don't allow new chat while responding

        // Only save if NOT triggered immediately by a query parameter load
        if (!isFromQuery) {
            saveCurrentChat();
        }

        currentChatId = `chat_${Date.now()}`;
        conversationHistory = []; // Clear message history state
        chatMessages.innerHTML = ""; // Start with clean slate for query
        if (!isFromQuery) {
            // Show welcome only if manually started
            chatMessages.innerHTML =
                '<div class="message ai-message welcome-message">Started a new chat! Ask me anything about Crawl4AI.</div>';
        }
        addCitations([]); // Clear citations
        updateCitationsDisplay(); // Clear UI

        // Add to index and save
        let index = loadChatIndex();
        // Generate a generic title initially; saveCurrentChat() upgrades it
        // to "Chat about: ..." once a first user message exists.
        const newTitle = isFromQuery ? "Chat from Selection" : `Chat ${new Date().toLocaleString()}`;
        // index.unshift({ id: currentChatId, title: `Chat ${new Date().toLocaleString()}` }); // Add to start
        index.unshift({ id: currentChatId, title: newTitle });
        saveChatIndex(index);

        renderHistoryList(index); // Update UI
        setActiveHistoryItem(currentChatId);
        saveCurrentChat(); // Save the empty new chat state
    }
|
||||||
|
|
||||||
|
    /**
     * Switches the UI to a previously saved chat.
     * No-ops while the AI is responding or if chatId is already active.
     * If the stored data is missing, the stale index entry is removed and
     * the next available chat (or a fresh one) is loaded instead.
     */
    function loadChat(chatId) {
        if (isThinking || chatId === currentChatId) return;

        // Check if chat data actually exists before proceeding
        const storedChat = localStorage.getItem(CHAT_PREFIX + chatId);
        if (storedChat === null) {
            console.warn(`Attempted to load non-existent chat: ${chatId}. Removing from index.`);
            deleteChatData(chatId); // Clean up index
            loadChatHistoryIndex(); // Reload history list
            loadInitialChat(); // Load next available chat
            return;
        }

        console.log(`Loading chat: ${chatId}`);
        saveCurrentChat(); // Save current before switching

        try {
            conversationHistory = JSON.parse(storedChat);
            currentChatId = chatId;
            renderChatMessages(conversationHistory);
            updateCitationsDisplay();
            setActiveHistoryItem(chatId);
        } catch (e) {
            // Corrupt JSON: warn the user and fall back to an empty view.
            console.error("Error loading chat:", chatId, e);
            alert("Failed to load chat data.");
            conversationHistory = [];
            renderChatMessages(conversationHistory);
            updateCitationsDisplay();
        }
    }
|
||||||
|
|
||||||
|
    /**
     * Persists the active conversation to localStorage.
     * Also upgrades the chat's index title to a "Chat about: ..." snippet of
     * the first user message the first time one exists. An active chat with
     * no messages is saved as an empty array so it can be re-opened later.
     */
    function saveCurrentChat() {
        if (currentChatId && conversationHistory.length > 0) {
            try {
                localStorage.setItem(CHAT_PREFIX + currentChatId, JSON.stringify(conversationHistory));
                console.log(`Chat ${currentChatId} saved.`);

                // Update title in index (e.g., use first user message)
                let index = loadChatIndex();
                const currentItem = index.find((item) => item.id === currentChatId);
                if (
                    currentItem &&
                    conversationHistory[0]?.sender === "user" &&
                    !currentItem.title.startsWith("Chat about:")
                ) {
                    currentItem.title = `Chat about: ${conversationHistory[0].text.substring(0, 30)}...`;
                    saveChatIndex(index);
                    // Re-render history list if title changed - small optimization needed here maybe
                    renderHistoryList(index);
                    setActiveHistoryItem(currentChatId); // Re-set active after re-render
                }
            } catch (e) {
                console.error("Error saving chat:", currentChatId, e);
                // Handle potential storage full errors
                if (e.name === "QuotaExceededError") {
                    alert("Local storage is full. Cannot save chat history.");
                    // Consider implementing history pruning logic here
                }
            }
        } else if (currentChatId) {
            // Save empty state for newly created chats if needed, or remove?
            localStorage.setItem(CHAT_PREFIX + currentChatId, JSON.stringify([]));
        }
    }
|
||||||
|
|
||||||
|
function loadChatIndex() {
|
||||||
|
try {
|
||||||
|
const storedIndex = localStorage.getItem(CHAT_INDEX_KEY);
|
||||||
|
return storedIndex ? JSON.parse(storedIndex) : [];
|
||||||
|
} catch (e) {
|
||||||
|
console.error("Error loading chat index:", e);
|
||||||
|
return []; // Return empty array on error
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function saveChatIndex(indexArray) {
|
||||||
|
try {
|
||||||
|
localStorage.setItem(CHAT_INDEX_KEY, JSON.stringify(indexArray));
|
||||||
|
} catch (e) {
|
||||||
|
console.error("Error saving chat index:", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
    /**
     * Rebuilds the history sidebar from the chat index.
     * Each entry gets a load link plus a delete button; an empty index
     * shows a "no past chats" placeholder instead.
     */
    function renderHistoryList(indexArray) {
        historyList.innerHTML = ""; // Clear existing
        if (!indexArray || indexArray.length === 0) {
            historyList.innerHTML = '<li class="no-history">No past chats found.</li>';
            return;
        }
        indexArray.forEach((item) => {
            const li = document.createElement("li");
            li.dataset.chatId = item.id; // Add ID to li for easier selection

            const a = document.createElement("a");
            a.href = "#";
            a.dataset.chatId = item.id;
            a.textContent = item.title || `Chat ${item.id.split("_")[1] || item.id}`;
            a.title = a.textContent; // Tooltip for potentially long titles
            a.addEventListener("click", (e) => {
                e.preventDefault();
                loadChat(item.id);
            });

            // === Add Delete Button ===
            const deleteBtn = document.createElement("button");
            deleteBtn.className = "delete-chat-btn";
            deleteBtn.innerHTML = "✕"; // Cross icon (or use text/SVG/FontAwesome)
            deleteBtn.title = "Delete Chat";
            deleteBtn.dataset.chatId = item.id; // Store ID on button too
            deleteBtn.addEventListener("click", handleDeleteChat);

            li.appendChild(a);
            li.appendChild(deleteBtn); // Append button to the list item
            historyList.appendChild(li);
        });
    }
|
||||||
|
|
||||||
|
function renderChatMessages(messages) {
|
||||||
|
chatMessages.innerHTML = ""; // Clear existing messages
|
||||||
|
messages.forEach((message) => {
|
||||||
|
// Ensure highlighting is applied when loading from history
|
||||||
|
addMessageToChat(message, false);
|
||||||
|
});
|
||||||
|
if (messages.length === 0) {
|
||||||
|
chatMessages.innerHTML =
|
||||||
|
'<div class="message ai-message welcome-message">Chat history loaded. Ask a question!</div>';
|
||||||
|
}
|
||||||
|
// Scroll to bottom after loading messages
|
||||||
|
scrollToBottom();
|
||||||
|
}
|
||||||
|
|
||||||
|
function setActiveHistoryItem(chatId) {
|
||||||
|
document.querySelectorAll("#history-list li").forEach((li) => li.classList.remove("active"));
|
||||||
|
// Select the LI element directly now
|
||||||
|
const activeLi = document.querySelector(`#history-list li[data-chat-id="${chatId}"]`);
|
||||||
|
if (activeLi) {
|
||||||
|
activeLi.classList.add("active");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function loadInitialChat() {
|
||||||
|
const index = loadChatIndex();
|
||||||
|
if (index.length > 0) {
|
||||||
|
loadChat(index[0].id);
|
||||||
|
} else {
|
||||||
|
// Check if handleNewChat wasn't already called by query handler
|
||||||
|
if (!currentChatId) {
|
||||||
|
handleNewChat();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function loadChatHistoryIndex() {
|
||||||
|
const index = loadChatIndex();
|
||||||
|
renderHistoryList(index);
|
||||||
|
if (currentChatId) setActiveHistoryItem(currentChatId);
|
||||||
|
}
|
||||||
|
|
||||||
|
// === NEW Function to Handle Delete Click ===
|
||||||
|
    /**
     * Click handler for a history entry's delete button.
     * Confirms with the user, removes the chat's data and index entry, then
     * either re-renders the list (another chat was deleted) or switches to
     * the most recent remaining chat / a fresh one (active chat deleted).
     */
    function handleDeleteChat(event) {
        event.stopPropagation(); // Prevent triggering loadChat on the link behind it
        const button = event.currentTarget;
        const chatIdToDelete = button.dataset.chatId;

        if (!chatIdToDelete) return;

        // Confirmation dialog (shows the entry's title when available)
        if (
            window.confirm(
                `Are you sure you want to delete this chat session?\n"${
                    button.previousElementSibling?.textContent || "Chat " + chatIdToDelete
                }"`
            )
        ) {
            console.log(`Deleting chat: ${chatIdToDelete}`);

            // Perform deletion
            const updatedIndex = deleteChatData(chatIdToDelete);

            // If the deleted chat was the currently active one, load another chat
            if (currentChatId === chatIdToDelete) {
                currentChatId = null; // Reset current ID
                conversationHistory = []; // Clear state
                if (updatedIndex.length > 0) {
                    // Load the new top chat (most recent remaining)
                    loadChat(updatedIndex[0].id);
                } else {
                    // No chats left, start a new one
                    handleNewChat();
                }
            } else {
                // If a different chat was deleted, just re-render the list
                renderHistoryList(updatedIndex);
                // Re-apply active state in case IDs shifted (though they shouldn't)
                setActiveHistoryItem(currentChatId);
            }
        }
    }
|
||||||
|
|
||||||
|
// === NEW Function to Delete Chat Data ===
|
||||||
|
function deleteChatData(chatId) {
|
||||||
|
// Remove chat data
|
||||||
|
localStorage.removeItem(CHAT_PREFIX + chatId);
|
||||||
|
|
||||||
|
// Update index
|
||||||
|
let index = loadChatIndex();
|
||||||
|
index = index.filter((item) => item.id !== chatId);
|
||||||
|
saveChatIndex(index);
|
||||||
|
|
||||||
|
console.log(`Chat ${chatId} data and index entry removed.`);
|
||||||
|
return index; // Return the updated index
|
||||||
|
}
|
||||||
|
|
||||||
|
    // --- Virtual Scrolling Placeholder ---
    // NOTE: Virtual scrolling is complex. For now, we do direct rendering.
    // If performance becomes an issue with very long chats/history,
    // investigate libraries like 'simple-virtual-scroll' or 'virtual-scroller'.
    // You would replace parts of `renderChatMessages` and `renderHistoryList`
    // to work with the chosen library's API (providing data and item renderers).
    console.warn("Virtual scrolling not implemented. Performance may degrade with very long chat histories.");
});
|
||||||
64
docs/md_v2/ask_ai/index.html
Normal file
64
docs/md_v2/ask_ai/index.html
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||||
|
<title>Crawl4AI Assistant</title>
|
||||||
|
<!-- Link main styles first for variable access -->
|
||||||
|
<link rel="stylesheet" href="../assets/layout.css">
|
||||||
|
<link rel="stylesheet" href="../assets/styles.css">
|
||||||
|
<!-- Link specific AI styles -->
|
||||||
|
<link rel="stylesheet" href="../assets/highlight.css">
|
||||||
|
<link rel="stylesheet" href="ask-ai.css">
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<div class="ai-assistant-container">
|
||||||
|
|
||||||
|
<!-- Left Sidebar: Conversation History -->
|
||||||
|
<aside id="history-panel" class="sidebar left-sidebar">
|
||||||
|
<header>
|
||||||
|
<h3>History</h3>
|
||||||
|
<button id="new-chat-button" class="btn btn-sm">New Chat</button>
|
||||||
|
</header>
|
||||||
|
<ul id="history-list">
|
||||||
|
<!-- History items populated by JS -->
|
||||||
|
</ul>
|
||||||
|
</aside>
|
||||||
|
|
||||||
|
<!-- Main Area: Chat Interface -->
|
||||||
|
<main id="chat-panel">
|
||||||
|
<div id="chat-messages">
|
||||||
|
<!-- Chat messages populated by JS -->
|
||||||
|
<div class="message ai-message welcome-message">
|
||||||
|
Welcome to the Crawl4AI Assistant! How can I help you today?
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div id="chat-input-area">
|
||||||
|
<!-- Loading indicator for general waiting (optional) -->
|
||||||
|
<!-- <div class="loading-indicator" style="display: none;">Thinking...</div> -->
|
||||||
|
<textarea id="chat-input" placeholder="Ask about Crawl4AI..." rows="2"></textarea>
|
||||||
|
<button id="send-button">Send</button>
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
|
||||||
|
<!-- Right Sidebar: Citations / Context -->
|
||||||
|
<aside id="citations-panel" class="sidebar right-sidebar">
|
||||||
|
<header>
|
||||||
|
<h3>Citations</h3>
|
||||||
|
</header>
|
||||||
|
<ul id="citations-list">
|
||||||
|
<!-- Citations populated by JS -->
|
||||||
|
<li class="no-citations">No citations for this response yet.</li>
|
||||||
|
</ul>
|
||||||
|
</aside>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Include Marked.js library -->
|
||||||
|
<script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
|
||||||
|
<script src="../assets/highlight.min.js"></script>
|
||||||
|
|
||||||
|
<!-- Your AI Assistant Logic -->
|
||||||
|
<script src="ask-ai.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
62
docs/md_v2/assets/copy_code.js
Normal file
62
docs/md_v2/assets/copy_code.js
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
// ==== File: docs/assets/copy_code.js ====
|
||||||
|
|
||||||
|
document.addEventListener('DOMContentLoaded', () => {
|
||||||
|
// Target specifically code blocks within the main content area
|
||||||
|
const codeBlocks = document.querySelectorAll('#terminal-mkdocs-main-content pre > code');
|
||||||
|
|
||||||
|
codeBlocks.forEach((codeElement) => {
|
||||||
|
const preElement = codeElement.parentElement; // The <pre> tag
|
||||||
|
|
||||||
|
// Ensure the <pre> tag can contain a positioned button
|
||||||
|
if (window.getComputedStyle(preElement).position === 'static') {
|
||||||
|
preElement.style.position = 'relative';
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create the button
|
||||||
|
const copyButton = document.createElement('button');
|
||||||
|
copyButton.className = 'copy-code-button';
|
||||||
|
copyButton.type = 'button';
|
||||||
|
copyButton.setAttribute('aria-label', 'Copy code to clipboard');
|
||||||
|
copyButton.title = 'Copy code to clipboard';
|
||||||
|
copyButton.innerHTML = 'Copy'; // Or use an icon like an SVG or FontAwesome class
|
||||||
|
|
||||||
|
// Append the button to the <pre> element
|
||||||
|
preElement.appendChild(copyButton);
|
||||||
|
|
||||||
|
// Add click event listener
|
||||||
|
copyButton.addEventListener('click', () => {
|
||||||
|
copyCodeToClipboard(codeElement, copyButton);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
async function copyCodeToClipboard(codeElement, button) {
|
||||||
|
// Use innerText to get the rendered text content, preserving line breaks
|
||||||
|
const textToCopy = codeElement.innerText;
|
||||||
|
|
||||||
|
try {
|
||||||
|
await navigator.clipboard.writeText(textToCopy);
|
||||||
|
|
||||||
|
// Visual feedback
|
||||||
|
button.innerHTML = 'Copied!';
|
||||||
|
button.classList.add('copied');
|
||||||
|
button.disabled = true; // Temporarily disable
|
||||||
|
|
||||||
|
// Revert button state after a short delay
|
||||||
|
setTimeout(() => {
|
||||||
|
button.innerHTML = 'Copy';
|
||||||
|
button.classList.remove('copied');
|
||||||
|
button.disabled = false;
|
||||||
|
}, 2000); // Show "Copied!" for 2 seconds
|
||||||
|
|
||||||
|
} catch (err) {
|
||||||
|
console.error('Failed to copy code: ', err);
|
||||||
|
// Optional: Provide error feedback on the button
|
||||||
|
button.innerHTML = 'Error';
|
||||||
|
setTimeout(() => {
|
||||||
|
button.innerHTML = 'Copy';
|
||||||
|
}, 2000);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log("Copy Code Button script loaded.");
|
||||||
|
});
|
||||||
39
docs/md_v2/assets/floating_ask_ai_button.js
Normal file
39
docs/md_v2/assets/floating_ask_ai_button.js
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
// ==== File: docs/assets/floating_ask_ai_button.js ====
|
||||||
|
|
||||||
|
document.addEventListener('DOMContentLoaded', () => {
|
||||||
|
const askAiPagePath = '/core/ask-ai/'; // IMPORTANT: Adjust this path if needed!
|
||||||
|
const currentPath = window.location.pathname;
|
||||||
|
|
||||||
|
// Determine the base URL for constructing the link correctly,
|
||||||
|
// especially if deployed in a sub-directory.
|
||||||
|
// This assumes a simple structure; adjust if needed.
|
||||||
|
const baseUrl = window.location.origin + (currentPath.startsWith('/core/') ? '../..' : '');
|
||||||
|
|
||||||
|
|
||||||
|
// Check if the current page IS the Ask AI page
|
||||||
|
// Use includes() for flexibility (handles trailing slash or .html)
|
||||||
|
if (currentPath.includes(askAiPagePath.replace(/\/$/, ''))) { // Remove trailing slash for includes check
|
||||||
|
console.log("Floating Ask AI Button: Not adding button on the Ask AI page itself.");
|
||||||
|
return; // Don't add the button on the target page
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- Create the button ---
|
||||||
|
const fabLink = document.createElement('a');
|
||||||
|
fabLink.className = 'floating-ask-ai-button';
|
||||||
|
fabLink.href = askAiPagePath; // Construct the correct URL
|
||||||
|
fabLink.title = 'Ask Crawl4AI Assistant';
|
||||||
|
fabLink.setAttribute('aria-label', 'Ask Crawl4AI Assistant');
|
||||||
|
|
||||||
|
// Add content (using SVG icon for better visuals)
|
||||||
|
fabLink.innerHTML = `
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" width="24" height="24" fill="currentColor">
|
||||||
|
<path d="M20 2H4c-1.1 0-2 .9-2 2v12c0 1.1.9 2 2 2h14l4 4V4c0-1.1-.9-2-2-2zm-2 12H6v-2h12v2zm0-3H6V9h12v2zm0-3H6V6h12v2z"/>
|
||||||
|
</svg>
|
||||||
|
<span>Ask AI</span>
|
||||||
|
`;
|
||||||
|
|
||||||
|
// Append to body
|
||||||
|
document.body.appendChild(fabLink);
|
||||||
|
|
||||||
|
console.log("Floating Ask AI Button added.");
|
||||||
|
});
|
||||||
119
docs/md_v2/assets/github_stats.js
Normal file
119
docs/md_v2/assets/github_stats.js
Normal file
@@ -0,0 +1,119 @@
|
|||||||
|
// ==== File: assets/github_stats.js ====
|
||||||
|
|
||||||
|
document.addEventListener('DOMContentLoaded', async () => {
|
||||||
|
// --- Configuration ---
|
||||||
|
const targetHeaderSelector = '.terminal .container:first-child'; // Selector for your header container
|
||||||
|
const insertBeforeSelector = '.terminal-nav'; // Selector for the element to insert the badge BEFORE (e.g., the main nav)
|
||||||
|
// Or set to null to append at the end of the header.
|
||||||
|
|
||||||
|
// --- Find elements ---
|
||||||
|
const headerContainer = document.querySelector(targetHeaderSelector);
|
||||||
|
if (!headerContainer) {
|
||||||
|
console.warn('GitHub Stats: Header container not found with selector:', targetHeaderSelector);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const repoLinkElement = headerContainer.querySelector('a[href*="github.com/"]'); // Find the existing GitHub link
|
||||||
|
let repoUrl = 'https://github.com/unclecode/crawl4ai';
|
||||||
|
// if (repoLinkElement) {
|
||||||
|
// repoUrl = repoLinkElement.href;
|
||||||
|
// } else {
|
||||||
|
// // Fallback: Try finding from config (requires template injection - harder)
|
||||||
|
// // Or hardcode if necessary, but reading from the link is better.
|
||||||
|
// console.warn('GitHub Stats: GitHub repo link not found in header.');
|
||||||
|
// // Try to get repo_url from mkdocs config if available globally (less likely)
|
||||||
|
// // repoUrl = window.mkdocs_config?.repo_url; // Requires setting this variable
|
||||||
|
// // if (!repoUrl) return; // Exit if still no URL
|
||||||
|
// return; // Exit for now if link isn't found
|
||||||
|
// }
|
||||||
|
|
||||||
|
|
||||||
|
// --- Extract Repo Owner/Name ---
|
||||||
|
let owner = '';
|
||||||
|
let repo = '';
|
||||||
|
try {
|
||||||
|
const url = new URL(repoUrl);
|
||||||
|
const pathParts = url.pathname.split('/').filter(part => part.length > 0);
|
||||||
|
if (pathParts.length >= 2) {
|
||||||
|
owner = pathParts[0];
|
||||||
|
repo = pathParts[1];
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
console.error('GitHub Stats: Could not parse repository URL:', repoUrl, e);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!owner || !repo) {
|
||||||
|
console.warn('GitHub Stats: Could not extract owner/repo from URL:', repoUrl);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- Get Version (Attempt to extract from site title) ---
|
||||||
|
let version = '';
|
||||||
|
const siteTitleElement = headerContainer.querySelector('.terminal-title, .site-title'); // Adjust selector based on theme's title element
|
||||||
|
// Example title: "Crawl4AI Documentation (v0.5.x)"
|
||||||
|
if (siteTitleElement) {
|
||||||
|
const match = siteTitleElement.textContent.match(/\((v?[^)]+)\)/); // Look for text in parentheses starting with 'v' (optional)
|
||||||
|
if (match && match[1]) {
|
||||||
|
version = match[1].trim();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!version) {
|
||||||
|
console.info('GitHub Stats: Could not extract version from title. You might need to adjust the selector or regex.');
|
||||||
|
// You could fallback to config.extra.version if injected into JS
|
||||||
|
// version = window.mkdocs_config?.extra?.version || 'N/A';
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// --- Fetch GitHub API Data ---
|
||||||
|
let stars = '...';
|
||||||
|
let forks = '...';
|
||||||
|
try {
|
||||||
|
const apiUrl = `https://api.github.com/repos/${owner}/${repo}`;
|
||||||
|
const response = await fetch(apiUrl);
|
||||||
|
|
||||||
|
if (response.ok) {
|
||||||
|
const data = await response.json();
|
||||||
|
// Format large numbers (optional)
|
||||||
|
stars = data.stargazers_count > 1000 ? `${(data.stargazers_count / 1000).toFixed(1)}k` : data.stargazers_count;
|
||||||
|
forks = data.forks_count > 1000 ? `${(data.forks_count / 1000).toFixed(1)}k` : data.forks_count;
|
||||||
|
} else {
|
||||||
|
console.warn(`GitHub Stats: API request failed with status ${response.status}. Rate limit exceeded?`);
|
||||||
|
stars = 'N/A';
|
||||||
|
forks = 'N/A';
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
console.error('GitHub Stats: Error fetching repository data:', error);
|
||||||
|
stars = 'N/A';
|
||||||
|
forks = 'N/A';
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- Create Badge HTML ---
|
||||||
|
const badgeContainer = document.createElement('div');
|
||||||
|
badgeContainer.className = 'github-stats-badge';
|
||||||
|
|
||||||
|
// Use innerHTML for simplicity, including potential icons (requires FontAwesome or similar)
|
||||||
|
// Ensure your theme loads FontAwesome or add it yourself if you want icons.
|
||||||
|
badgeContainer.innerHTML = `
|
||||||
|
<a href="${repoUrl}" target="_blank" rel="noopener">
|
||||||
|
<!-- Optional Icon (FontAwesome example) -->
|
||||||
|
<!-- <i class="fab fa-github"></i> -->
|
||||||
|
<span class="repo-name">${owner}/${repo}</span>
|
||||||
|
${version ? `<span class="stat version"><i class="fas fa-tag"></i> ${version}</span>` : ''}
|
||||||
|
<span class="stat stars"><i class="fas fa-star"></i> ${stars}</span>
|
||||||
|
<span class="stat forks"><i class="fas fa-code-branch"></i> ${forks}</span>
|
||||||
|
</a>
|
||||||
|
`;
|
||||||
|
|
||||||
|
// --- Inject Badge into Header ---
|
||||||
|
const insertBeforeElement = insertBeforeSelector ? headerContainer.querySelector(insertBeforeSelector) : null;
|
||||||
|
if (insertBeforeElement) {
|
||||||
|
// headerContainer.insertBefore(badgeContainer, insertBeforeElement);
|
||||||
|
headerContainer.querySelector(insertBeforeSelector).appendChild(badgeContainer);
|
||||||
|
} else {
|
||||||
|
headerContainer.appendChild(badgeContainer);
|
||||||
|
}
|
||||||
|
|
||||||
|
console.info('GitHub Stats: Badge added to header.');
|
||||||
|
|
||||||
|
});
|
||||||
441
docs/md_v2/assets/layout.css
Normal file
441
docs/md_v2/assets/layout.css
Normal file
@@ -0,0 +1,441 @@
|
|||||||
|
/* ==== File: assets/layout.css (Non-Fluid Centered Layout) ==== */
|
||||||
|
|
||||||
|
:root {
|
||||||
|
--header-height: 55px; /* Adjust if needed */
|
||||||
|
--sidebar-width: 280px; /* Adjust if needed */
|
||||||
|
--toc-width: 340px; /* As specified */
|
||||||
|
--content-max-width: 90em; /* Max width for the centered content */
|
||||||
|
--layout-transition-speed: 0.2s;
|
||||||
|
--global-space: 10px;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* --- Basic Setup --- */
|
||||||
|
html {
|
||||||
|
scroll-behavior: smooth;
|
||||||
|
scroll-padding-top: calc(var(--header-height) + 15px);
|
||||||
|
box-sizing: border-box;
|
||||||
|
}
|
||||||
|
*, *:before, *:after {
|
||||||
|
box-sizing: inherit;
|
||||||
|
}
|
||||||
|
|
||||||
|
body {
|
||||||
|
padding-top: 0;
|
||||||
|
padding-bottom: 0;
|
||||||
|
background-color: var(--background-color);
|
||||||
|
color: var(--font-color);
|
||||||
|
/* Prevents horizontal scrollbars during transitions */
|
||||||
|
overflow-x: hidden;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* --- Fixed Header --- */
|
||||||
|
/* Full width, fixed header */
|
||||||
|
.terminal .container:first-child { /* Assuming this targets the header container */
|
||||||
|
position: fixed;
|
||||||
|
top: 0;
|
||||||
|
left: 0;
|
||||||
|
right: 0;
|
||||||
|
height: var(--header-height);
|
||||||
|
background-color: var(--background-color);
|
||||||
|
z-index: 1000;
|
||||||
|
border-bottom: 1px solid var(--progress-bar-background);
|
||||||
|
max-width: none; /* Override any container max-width */
|
||||||
|
padding: 0 calc(var(--global-space) * 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* --- Main Layout Container (Below Header) --- */
|
||||||
|
/* This container just provides space for the fixed header */
|
||||||
|
.container:has(.terminal-mkdocs-main-grid) {
|
||||||
|
margin: 0 auto;
|
||||||
|
padding: 0;
|
||||||
|
padding-top: var(--header-height); /* Space for fixed header */
|
||||||
|
}
|
||||||
|
|
||||||
|
/* --- Flex Container: Grid holding content and toc (CENTERED) --- */
|
||||||
|
/* THIS is the main centered block */
|
||||||
|
.terminal-mkdocs-main-grid {
|
||||||
|
display: flex;
|
||||||
|
align-items: flex-start;
|
||||||
|
/* Enforce max-width and center */
|
||||||
|
max-width: var(--content-max-width);
|
||||||
|
margin-left: auto;
|
||||||
|
margin-right: auto;
|
||||||
|
position: relative;
|
||||||
|
/* Apply side padding within the centered block */
|
||||||
|
padding-left: calc(var(--global-space) * 2);
|
||||||
|
padding-right: calc(var(--global-space) * 2);
|
||||||
|
/* Add margin-left to clear the fixed sidebar */
|
||||||
|
margin-left: var(--sidebar-width);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* --- 1. Fixed Left Sidebar (Viewport Relative) --- */
|
||||||
|
#terminal-mkdocs-side-panel {
|
||||||
|
position: fixed;
|
||||||
|
top: var(--header-height);
|
||||||
|
left: max(0px, calc((90vw - var(--content-max-width)) / 2));
|
||||||
|
bottom: 0;
|
||||||
|
width: var(--sidebar-width);
|
||||||
|
background-color: var(--background-color);
|
||||||
|
border-right: 1px solid var(--progress-bar-background);
|
||||||
|
overflow-y: auto;
|
||||||
|
z-index: 900;
|
||||||
|
padding: 1em calc(var(--global-space) * 2);
|
||||||
|
padding-bottom: 2em;
|
||||||
|
/* transition: left var(--layout-transition-speed) ease-in-out; */
|
||||||
|
}
|
||||||
|
|
||||||
|
/* --- 2. Main Content Area (Within Centered Grid) --- */
|
||||||
|
#terminal-mkdocs-main-content {
|
||||||
|
flex-grow: 1;
|
||||||
|
flex-shrink: 1;
|
||||||
|
min-width: 0; /* Flexbox shrink fix */
|
||||||
|
|
||||||
|
/* No left/right margins needed here - handled by parent grid */
|
||||||
|
margin-left: 0;
|
||||||
|
margin-right: 0;
|
||||||
|
|
||||||
|
/* Internal Padding */
|
||||||
|
padding: 1.5em 2em;
|
||||||
|
|
||||||
|
position: relative;
|
||||||
|
z-index: 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* --- 3. Right Table of Contents (Sticky, Within Centered Grid) --- */
|
||||||
|
#toc-sidebar {
|
||||||
|
flex-basis: var(--toc-width);
|
||||||
|
flex-shrink: 0;
|
||||||
|
width: var(--toc-width);
|
||||||
|
|
||||||
|
position: sticky; /* Sticks within the centered grid */
|
||||||
|
top: var(--header-height);
|
||||||
|
align-self: stretch;
|
||||||
|
height: calc(100vh - var(--header-height));
|
||||||
|
overflow-y: auto;
|
||||||
|
|
||||||
|
padding: 1.5em 1em;
|
||||||
|
font-size: 0.85em;
|
||||||
|
border-left: 1px solid var(--progress-bar-background);
|
||||||
|
z-index: 800;
|
||||||
|
/* display: none; /* JS handles */
|
||||||
|
}
|
||||||
|
|
||||||
|
/* (ToC link styles remain the same) */
|
||||||
|
#toc-sidebar h4 { margin-top: 0; margin-bottom: 1em; font-size: 1.1em; color: var(--secondary-color); padding-left: 0.8em; }
|
||||||
|
#toc-sidebar ul { list-style: none; padding: 0; margin: 0; }
|
||||||
|
#toc-sidebar ul li a { display: block; padding: 0.3em 0; color: var(--secondary-color); text-decoration: none; border-left: 3px solid transparent; padding-left: 0.8em; transition: all 0.1s ease-in-out; line-height: 1.4; word-break: break-word; }
|
||||||
|
#toc-sidebar ul li.toc-level-3 a { padding-left: 1.8em; }
|
||||||
|
#toc-sidebar ul li.toc-level-4 a { padding-left: 2.8em; }
|
||||||
|
#toc-sidebar ul li a:hover { color: var(--font-color); background-color: rgba(255, 255, 255, 0.05); }
|
||||||
|
#toc-sidebar ul li a.active { color: var(--primary-color); border-left-color: var(--primary-color); background-color: rgba(80, 255, 255, 0.08); }
|
||||||
|
|
||||||
|
|
||||||
|
/* --- Footer Styling (Respects Centered Layout) --- */
|
||||||
|
footer {
|
||||||
|
background-color: var(--code-bg-color);
|
||||||
|
color: var(--secondary-color);
|
||||||
|
position: relative;
|
||||||
|
z-index: 10;
|
||||||
|
margin-top: 2em;
|
||||||
|
|
||||||
|
/* Apply margin-left to clear the fixed sidebar */
|
||||||
|
margin-left: var(--sidebar-width);
|
||||||
|
|
||||||
|
/* Constrain width relative to the centered grid it follows */
|
||||||
|
max-width: calc(var(--content-max-width) - var(--sidebar-width));
|
||||||
|
margin-right: auto; /* Keep it left-aligned within the space next to sidebar */
|
||||||
|
|
||||||
|
/* Use padding consistent with the grid */
|
||||||
|
padding: 2em calc(var(--global-space) * 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Adjust footer grid if needed */
|
||||||
|
.terminal-mkdocs-footer-grid {
|
||||||
|
display: grid;
|
||||||
|
grid-template-columns: 1fr auto;
|
||||||
|
gap: 1em;
|
||||||
|
align-items: center;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ==========================================================================
|
||||||
|
RESPONSIVENESS (Adapting the Non-Fluid Layout)
|
||||||
|
========================================================================== */
|
||||||
|
|
||||||
|
/* --- Medium screens: Hide ToC --- */
|
||||||
|
@media screen and (max-width: 1200px) {
|
||||||
|
#toc-sidebar {
|
||||||
|
display: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
.terminal-mkdocs-main-grid {
|
||||||
|
/* Grid adjusts automatically as ToC is removed */
|
||||||
|
/* Ensure grid padding remains */
|
||||||
|
padding-left: calc(var(--global-space) * 2);
|
||||||
|
padding-right: calc(var(--global-space) * 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
#terminal-mkdocs-main-content {
|
||||||
|
/* Content area naturally expands */
|
||||||
|
}
|
||||||
|
|
||||||
|
footer {
|
||||||
|
/* Footer still respects the left sidebar and overall max width */
|
||||||
|
margin-left: var(--sidebar-width);
|
||||||
|
max-width: calc(var(--content-max-width) - var(--sidebar-width));
|
||||||
|
/* Padding remains consistent */
|
||||||
|
padding-left: calc(var(--global-space) * 2);
|
||||||
|
padding-right: calc(var(--global-space) * 2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* --- Small screens: Hide left sidebar, full width content & footer --- */
|
||||||
|
@media screen and (max-width: 768px) {
|
||||||
|
|
||||||
|
#terminal-mkdocs-side-panel {
|
||||||
|
left: calc(-1 * var(--sidebar-width));
|
||||||
|
z-index: 1100;
|
||||||
|
box-shadow: 2px 0 10px rgba(0,0,0,0.3);
|
||||||
|
}
|
||||||
|
#terminal-mkdocs-side-panel.sidebar-visible {
|
||||||
|
left: 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
.terminal-mkdocs-main-grid {
|
||||||
|
/* Grid now takes full width (minus body padding) */
|
||||||
|
margin-left: 0; /* Override sidebar margin */
|
||||||
|
margin-right: 0; /* Override auto margin */
|
||||||
|
max-width: 100%; /* Allow full width */
|
||||||
|
padding-left: var(--global-space); /* Reduce padding */
|
||||||
|
padding-right: var(--global-space);
|
||||||
|
}
|
||||||
|
|
||||||
|
#terminal-mkdocs-main-content {
|
||||||
|
padding: 1.5em 1em; /* Adjust internal padding */
|
||||||
|
}
|
||||||
|
|
||||||
|
footer {
|
||||||
|
margin-left: 0; /* Full width footer */
|
||||||
|
max-width: 100%; /* Allow full width */
|
||||||
|
padding: 2em 1em; /* Adjust internal padding */
|
||||||
|
}
|
||||||
|
|
||||||
|
.terminal-mkdocs-footer-grid {
|
||||||
|
grid-template-columns: 1fr; /* Stack footer items */
|
||||||
|
text-align: center;
|
||||||
|
gap: 0.5em;
|
||||||
|
}
|
||||||
|
/* Remember JS for toggle button & overlay */
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* ==== GitHub Stats Badge Styling ==== */
|
||||||
|
|
||||||
|
.github-stats-badge {
|
||||||
|
display: inline-block; /* Or flex if needed */
|
||||||
|
margin-left: 2em; /* Adjust spacing */
|
||||||
|
vertical-align: middle; /* Align with other header items */
|
||||||
|
font-size: 0.9em; /* Slightly smaller font */
|
||||||
|
}
|
||||||
|
|
||||||
|
.github-stats-badge a {
|
||||||
|
color: var(--secondary-color); /* Use secondary color */
|
||||||
|
text-decoration: none;
|
||||||
|
display: flex; /* Use flex for alignment */
|
||||||
|
align-items: center;
|
||||||
|
gap: 0.8em; /* Space between items */
|
||||||
|
padding: 0.2em 0.5em;
|
||||||
|
border: 1px solid var(--progress-bar-background); /* Subtle border */
|
||||||
|
border-radius: 4px;
|
||||||
|
transition: color 0.2s, background-color 0.2s;
|
||||||
|
}
|
||||||
|
|
||||||
|
.github-stats-badge a:hover {
|
||||||
|
color: var(--font-color); /* Brighter color on hover */
|
||||||
|
background-color: var(--progress-bar-background); /* Subtle background on hover */
|
||||||
|
}
|
||||||
|
|
||||||
|
.github-stats-badge .repo-name {
|
||||||
|
color: var(--font-color); /* Make repo name stand out slightly */
|
||||||
|
font-weight: 500; /* Optional bolder weight */
|
||||||
|
}
|
||||||
|
|
||||||
|
.github-stats-badge .stat {
|
||||||
|
/* Styles for individual stats (version, stars, forks) */
|
||||||
|
white-space: nowrap; /* Prevent wrapping */
|
||||||
|
}
|
||||||
|
|
||||||
|
.github-stats-badge .stat i {
|
||||||
|
/* Optional: Style for FontAwesome icons */
|
||||||
|
margin-right: 0.3em;
|
||||||
|
color: var(--secondary-dimmed-color); /* Dimmer color for icons */
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Adjust positioning relative to search/nav if needed */
|
||||||
|
/* Example: If search is floated right */
|
||||||
|
/* .terminal-nav { float: left; } */
|
||||||
|
/* .github-stats-badge { float: left; } */
|
||||||
|
/* #mkdocs-search-query { float: right; } */
|
||||||
|
|
||||||
|
/* --- Responsive adjustments --- */
|
||||||
|
@media screen and (max-width: 900px) { /* Example breakpoint */
|
||||||
|
.github-stats-badge .repo-name {
|
||||||
|
display: none; /* Hide full repo name on smaller screens */
|
||||||
|
}
|
||||||
|
.github-stats-badge {
|
||||||
|
margin-left: 1em;
|
||||||
|
}
|
||||||
|
.github-stats-badge a {
|
||||||
|
gap: 0.5em;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@media screen and (max-width: 768px) {
|
||||||
|
/* Further hide or simplify on mobile if needed */
|
||||||
|
.github-stats-badge {
|
||||||
|
display: none; /* Example: Hide completely on smallest screens */
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* --- Ask AI Selection Button --- */
|
||||||
|
.ask-ai-selection-button {
|
||||||
|
background-color: var(--primary-dimmed-color, #09b5a5);
|
||||||
|
color: var(--background-color, #070708);
|
||||||
|
border: none;
|
||||||
|
padding: 4px 8px;
|
||||||
|
font-size: 0.8em;
|
||||||
|
border-radius: 4px;
|
||||||
|
cursor: pointer;
|
||||||
|
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.3);
|
||||||
|
transition: background-color 0.2s ease;
|
||||||
|
white-space: nowrap;
|
||||||
|
}
|
||||||
|
|
||||||
|
.ask-ai-selection-button:hover {
|
||||||
|
background-color: var(--primary-color, #50ffff);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ==== File: docs/assets/layout.css (Additions) ==== */
|
||||||
|
|
||||||
|
/* ... (keep all existing layout CSS) ... */
|
||||||
|
|
||||||
|
/* --- Copy Code Button Styling --- */
|
||||||
|
|
||||||
|
/* Ensure the parent <pre> can contain the absolutely positioned button */
|
||||||
|
#terminal-mkdocs-main-content pre {
|
||||||
|
position: relative; /* Needed for absolute positioning of child */
|
||||||
|
/* Add a little padding top/right to make space for the button */
|
||||||
|
padding-top: 2.5em;
|
||||||
|
padding-right: 1em; /* Ensure padding is sufficient */
|
||||||
|
}
|
||||||
|
|
||||||
|
.copy-code-button {
|
||||||
|
position: absolute;
|
||||||
|
top: 0.5em; /* Adjust spacing from top */
|
||||||
|
left: 0.5em; /* Adjust spacing from left */
|
||||||
|
z-index: 1; /* Sit on top of code */
|
||||||
|
|
||||||
|
background-color: var(--progress-bar-background, #444); /* Use a background */
|
||||||
|
color: var(--font-color, #eaeaea);
|
||||||
|
border: 1px solid var(--secondary-color, #727578);
|
||||||
|
padding: 3px 8px;
|
||||||
|
font-size: 0.8em;
|
||||||
|
font-family: var(--font-stack, monospace);
|
||||||
|
border-radius: 4px;
|
||||||
|
cursor: pointer;
|
||||||
|
opacity: 0; /* Hidden by default */
|
||||||
|
transition: opacity 0.2s ease-in-out, background-color 0.2s ease, color 0.2s ease;
|
||||||
|
white-space: nowrap;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Show button on hover of the <pre> container */
|
||||||
|
#terminal-mkdocs-main-content pre:hover .copy-code-button {
|
||||||
|
opacity: 0.8; /* Show partially */
|
||||||
|
}
|
||||||
|
|
||||||
|
.copy-code-button:hover {
|
||||||
|
opacity: 1; /* Fully visible on button hover */
|
||||||
|
background-color: var(--secondary-color, #727578);
|
||||||
|
}
|
||||||
|
|
||||||
|
.copy-code-button:focus {
|
||||||
|
opacity: 1; /* Ensure visible when focused */
|
||||||
|
outline: 1px dashed var(--primary-color);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Style for "Copied!" state */
|
||||||
|
.copy-code-button.copied {
|
||||||
|
background-color: var(--primary-dimmed-color, #09b5a5);
|
||||||
|
color: var(--background-color, #070708);
|
||||||
|
border-color: var(--primary-dimmed-color, #09b5a5);
|
||||||
|
opacity: 1; /* Ensure visible */
|
||||||
|
}
|
||||||
|
.copy-code-button.copied:hover {
|
||||||
|
background-color: var(--primary-dimmed-color, #09b5a5); /* Prevent hover change */
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ==== File: docs/assets/layout.css (Additions) ==== */
|
||||||
|
|
||||||
|
/* ... (keep all existing layout CSS) ... */
|
||||||
|
|
||||||
|
/* --- Floating Ask AI Button --- */
|
||||||
|
.floating-ask-ai-button {
|
||||||
|
position: fixed;
|
||||||
|
bottom: 25px;
|
||||||
|
right: 25px;
|
||||||
|
z-index: 1050; /* Below modals, above most content */
|
||||||
|
|
||||||
|
background-color: var(--primary-dimmed-color, #09b5a5);
|
||||||
|
color: var(--background-color, #070708);
|
||||||
|
border: none;
|
||||||
|
border-radius: 50%; /* Make it circular */
|
||||||
|
width: 60px; /* Adjust size */
|
||||||
|
height: 60px; /* Adjust size */
|
||||||
|
padding: 10px; /* Adjust padding */
|
||||||
|
box-shadow: 0 4px 10px rgba(0, 0, 0, 0.4);
|
||||||
|
cursor: pointer;
|
||||||
|
transition: background-color 0.2s ease, transform 0.2s ease;
|
||||||
|
|
||||||
|
display: flex;
|
||||||
|
flex-direction: column; /* Stack icon and text */
|
||||||
|
align-items: center;
|
||||||
|
justify-content: center;
|
||||||
|
text-decoration: none;
|
||||||
|
text-align: center;
|
||||||
|
}
|
||||||
|
|
||||||
|
.floating-ask-ai-button svg {
|
||||||
|
width: 24px; /* Control icon size */
|
||||||
|
height: 24px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.floating-ask-ai-button span {
|
||||||
|
font-size: 0.7em;
|
||||||
|
margin-top: 2px; /* Space between icon and text */
|
||||||
|
display: block; /* Ensure it takes space */
|
||||||
|
line-height: 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
.floating-ask-ai-button:hover {
|
||||||
|
background-color: var(--primary-color, #50ffff);
|
||||||
|
transform: scale(1.05); /* Slight grow effect */
|
||||||
|
}
|
||||||
|
|
||||||
|
.floating-ask-ai-button:focus {
|
||||||
|
outline: 2px solid var(--primary-color);
|
||||||
|
outline-offset: 2px;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Optional: Hide text on smaller screens if needed */
|
||||||
|
@media screen and (max-width: 768px) {
|
||||||
|
.floating-ask-ai-button span {
|
||||||
|
/* display: none; */ /* Uncomment to hide text */
|
||||||
|
}
|
||||||
|
.floating-ask-ai-button {
|
||||||
|
width: 55px;
|
||||||
|
height: 55px;
|
||||||
|
bottom: 20px;
|
||||||
|
right: 20px;
|
||||||
|
}
|
||||||
|
}
|
||||||
109
docs/md_v2/assets/selection_ask_ai.js
Normal file
109
docs/md_v2/assets/selection_ask_ai.js
Normal file
@@ -0,0 +1,109 @@
|
|||||||
|
// ==== File: docs/assets/selection_ask_ai.js ====
|
||||||
|
|
||||||
|
document.addEventListener('DOMContentLoaded', () => {
|
||||||
|
let askAiButton = null;
|
||||||
|
const askAiPageUrl = '/core/ask-ai/'; // Adjust if your Ask AI page path is different
|
||||||
|
|
||||||
|
function createAskAiButton() {
|
||||||
|
const button = document.createElement('button');
|
||||||
|
button.id = 'ask-ai-selection-btn';
|
||||||
|
button.className = 'ask-ai-selection-button';
|
||||||
|
button.textContent = 'Ask AI'; // Or use an icon
|
||||||
|
button.style.display = 'none'; // Initially hidden
|
||||||
|
button.style.position = 'absolute';
|
||||||
|
button.style.zIndex = '1500'; // Ensure it's on top
|
||||||
|
document.body.appendChild(button);
|
||||||
|
|
||||||
|
button.addEventListener('click', handleAskAiClick);
|
||||||
|
return button;
|
||||||
|
}
|
||||||
|
|
||||||
|
function getSafeSelectedText() {
|
||||||
|
const selection = window.getSelection();
|
||||||
|
if (!selection || selection.rangeCount === 0) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
// Avoid selecting text within the button itself if it was somehow selected
|
||||||
|
const container = selection.getRangeAt(0).commonAncestorContainer;
|
||||||
|
if (askAiButton && askAiButton.contains(container)) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
const text = selection.toString().trim();
|
||||||
|
return text.length > 0 ? text : null;
|
||||||
|
}
|
||||||
|
|
||||||
|
function positionButton(event) {
|
||||||
|
const selection = window.getSelection();
|
||||||
|
if (!selection || selection.rangeCount === 0 || selection.isCollapsed) {
|
||||||
|
hideButton();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const range = selection.getRangeAt(0);
|
||||||
|
const rect = range.getBoundingClientRect();
|
||||||
|
|
||||||
|
// Calculate position: top-right of the selection
|
||||||
|
const scrollX = window.scrollX;
|
||||||
|
const scrollY = window.scrollY;
|
||||||
|
const buttonTop = rect.top + scrollY - askAiButton.offsetHeight - 5; // 5px above
|
||||||
|
const buttonLeft = rect.right + scrollX + 5; // 5px to the right
|
||||||
|
|
||||||
|
askAiButton.style.top = `${buttonTop}px`;
|
||||||
|
askAiButton.style.left = `${buttonLeft}px`;
|
||||||
|
askAiButton.style.display = 'block'; // Show the button
|
||||||
|
}
|
||||||
|
|
||||||
|
function hideButton() {
|
||||||
|
if (askAiButton) {
|
||||||
|
askAiButton.style.display = 'none';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function handleAskAiClick(event) {
|
||||||
|
event.stopPropagation(); // Prevent mousedown from hiding button immediately
|
||||||
|
const selectedText = getSafeSelectedText();
|
||||||
|
if (selectedText) {
|
||||||
|
console.log("Selected Text:", selectedText);
|
||||||
|
// Base64 encode for URL safety (handles special chars, line breaks)
|
||||||
|
// Use encodeURIComponent first for proper Unicode handling before btoa
|
||||||
|
const encodedText = btoa(unescape(encodeURIComponent(selectedText)));
|
||||||
|
const targetUrl = `${askAiPageUrl}?qq=${encodedText}`;
|
||||||
|
console.log("Navigating to:", targetUrl);
|
||||||
|
window.location.href = targetUrl; // Navigate to Ask AI page
|
||||||
|
}
|
||||||
|
hideButton(); // Hide after click
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- Event Listeners ---
|
||||||
|
|
||||||
|
// Show button on mouse up after selection
|
||||||
|
document.addEventListener('mouseup', (event) => {
|
||||||
|
// Slight delay to ensure selection is registered
|
||||||
|
setTimeout(() => {
|
||||||
|
const selectedText = getSafeSelectedText();
|
||||||
|
if (selectedText) {
|
||||||
|
if (!askAiButton) {
|
||||||
|
askAiButton = createAskAiButton();
|
||||||
|
}
|
||||||
|
// Don't position if the click was ON the button itself
|
||||||
|
if (event.target !== askAiButton) {
|
||||||
|
positionButton(event);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
hideButton();
|
||||||
|
}
|
||||||
|
}, 10); // Small delay
|
||||||
|
});
|
||||||
|
|
||||||
|
// Hide button on scroll or click elsewhere
|
||||||
|
document.addEventListener('mousedown', (event) => {
|
||||||
|
// Hide if clicking anywhere EXCEPT the button itself
|
||||||
|
if (askAiButton && event.target !== askAiButton) {
|
||||||
|
hideButton();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
document.addEventListener('scroll', hideButton, true); // Capture scroll events
|
||||||
|
|
||||||
|
console.log("Selection Ask AI script loaded.");
|
||||||
|
});
|
||||||
@@ -6,8 +6,8 @@
|
|||||||
}
|
}
|
||||||
|
|
||||||
:root {
|
:root {
|
||||||
--global-font-size: 16px;
|
--global-font-size: 14px;
|
||||||
--global-code-font-size: 16px;
|
--global-code-font-size: 13px;
|
||||||
--global-line-height: 1.5em;
|
--global-line-height: 1.5em;
|
||||||
--global-space: 10px;
|
--global-space: 10px;
|
||||||
--font-stack: Menlo, Monaco, Lucida Console, Liberation Mono, DejaVu Sans Mono, Bitstream Vera Sans Mono,
|
--font-stack: Menlo, Monaco, Lucida Console, Liberation Mono, DejaVu Sans Mono, Bitstream Vera Sans Mono,
|
||||||
@@ -50,8 +50,17 @@
|
|||||||
--display-h1-decoration: none;
|
--display-h1-decoration: none;
|
||||||
|
|
||||||
--display-h1-decoration: none;
|
--display-h1-decoration: none;
|
||||||
|
|
||||||
|
--header-height: 65px; /* Adjust based on your actual header height */
|
||||||
|
--sidebar-width: 280px; /* Adjust based on your desired sidebar width */
|
||||||
|
--toc-width: 240px; /* Adjust based on your desired ToC width */
|
||||||
|
--layout-transition-speed: 0.2s; /* For potential future animations */
|
||||||
|
|
||||||
|
--page-width : 100em; /* Adjust based on your design */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/* body {
|
/* body {
|
||||||
background-color: var(--background-color);
|
background-color: var(--background-color);
|
||||||
color: var(--font-color);
|
color: var(--font-color);
|
||||||
@@ -256,4 +265,6 @@ div.badges a {
|
|||||||
}
|
}
|
||||||
div.badges a > img {
|
div.badges a > img {
|
||||||
width: auto;
|
width: auto;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
144
docs/md_v2/assets/toc.js
Normal file
144
docs/md_v2/assets/toc.js
Normal file
@@ -0,0 +1,144 @@
|
|||||||
|
// ==== File: assets/toc.js ====
|
||||||
|
|
||||||
|
document.addEventListener('DOMContentLoaded', () => {
|
||||||
|
const mainContent = document.getElementById('terminal-mkdocs-main-content');
|
||||||
|
const tocContainer = document.getElementById('toc-sidebar');
|
||||||
|
const mainGrid = document.querySelector('.terminal-mkdocs-main-grid'); // Get the flex container
|
||||||
|
|
||||||
|
if (!mainContent) {
|
||||||
|
console.warn("TOC Generator: Main content area '#terminal-mkdocs-main-content' not found.");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- Create ToC container if it doesn't exist ---
|
||||||
|
let tocElement = tocContainer;
|
||||||
|
if (!tocElement) {
|
||||||
|
if (!mainGrid) {
|
||||||
|
console.warn("TOC Generator: Flex container '.terminal-mkdocs-main-grid' not found to append ToC.");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
tocElement = document.createElement('aside');
|
||||||
|
tocElement.id = 'toc-sidebar';
|
||||||
|
tocElement.style.display = 'none'; // Keep hidden initially
|
||||||
|
// Append it as the last child of the flex grid
|
||||||
|
mainGrid.appendChild(tocElement);
|
||||||
|
console.info("TOC Generator: Created '#toc-sidebar' element.");
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- Find Headings (h2, h3, h4 are common for ToC) ---
|
||||||
|
const headings = mainContent.querySelectorAll('h2, h3, h4');
|
||||||
|
if (headings.length === 0) {
|
||||||
|
console.info("TOC Generator: No headings found on this page. ToC not generated.");
|
||||||
|
tocElement.style.display = 'none'; // Ensure it's hidden
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- Generate ToC List ---
|
||||||
|
const tocList = document.createElement('ul');
|
||||||
|
const observerTargets = []; // Store headings for IntersectionObserver
|
||||||
|
|
||||||
|
headings.forEach((heading, index) => {
|
||||||
|
// Ensure heading has an ID for linking
|
||||||
|
if (!heading.id) {
|
||||||
|
// Create a simple slug-like ID
|
||||||
|
heading.id = `toc-heading-${index}-${heading.textContent.toLowerCase().replace(/\s+/g, '-').replace(/[^a-z0-9-]/g, '')}`;
|
||||||
|
}
|
||||||
|
|
||||||
|
const listItem = document.createElement('li');
|
||||||
|
const link = document.createElement('a');
|
||||||
|
|
||||||
|
link.href = `#${heading.id}`;
|
||||||
|
link.textContent = heading.textContent;
|
||||||
|
|
||||||
|
// Add class for styling based on heading level
|
||||||
|
const level = parseInt(heading.tagName.substring(1), 10); // Get 2, 3, or 4
|
||||||
|
listItem.classList.add(`toc-level-${level}`);
|
||||||
|
|
||||||
|
listItem.appendChild(link);
|
||||||
|
tocList.appendChild(listItem);
|
||||||
|
observerTargets.push(heading); // Add to observer list
|
||||||
|
});
|
||||||
|
|
||||||
|
// --- Populate and Show ToC ---
|
||||||
|
// Optional: Add a title
|
||||||
|
const tocTitle = document.createElement('h4');
|
||||||
|
tocTitle.textContent = 'On this page'; // Customize title if needed
|
||||||
|
|
||||||
|
tocElement.innerHTML = ''; // Clear previous content if any
|
||||||
|
tocElement.appendChild(tocTitle);
|
||||||
|
tocElement.appendChild(tocList);
|
||||||
|
tocElement.style.display = ''; // Show the ToC container
|
||||||
|
|
||||||
|
console.info(`TOC Generator: Generated ToC with ${headings.length} items.`);
|
||||||
|
|
||||||
|
// --- Scroll Spy using Intersection Observer ---
|
||||||
|
const tocLinks = tocElement.querySelectorAll('a');
|
||||||
|
let activeLink = null; // Keep track of the current active link
|
||||||
|
|
||||||
|
const observerOptions = {
|
||||||
|
// Observe changes relative to the viewport, offset by the header height
|
||||||
|
// Negative top margin pushes the intersection trigger point down
|
||||||
|
// Negative bottom margin ensures elements low on the screen can trigger before they exit
|
||||||
|
rootMargin: `-${getComputedStyle(document.documentElement).getPropertyValue('--header-height').trim()} 0px -60% 0px`,
|
||||||
|
threshold: 0 // Trigger as soon as any part enters/exits the boundary
|
||||||
|
};
|
||||||
|
|
||||||
|
const observerCallback = (entries) => {
|
||||||
|
let topmostVisibleHeading = null;
|
||||||
|
|
||||||
|
entries.forEach(entry => {
|
||||||
|
const link = tocElement.querySelector(`a[href="#${entry.target.id}"]`);
|
||||||
|
if (!link) return;
|
||||||
|
|
||||||
|
// Check if the heading is intersecting (partially or fully visible within rootMargin)
|
||||||
|
if (entry.isIntersecting) {
|
||||||
|
// Among visible headings, find the one closest to the top edge (within the rootMargin)
|
||||||
|
if (!topmostVisibleHeading || entry.boundingClientRect.top < topmostVisibleHeading.boundingClientRect.top) {
|
||||||
|
topmostVisibleHeading = entry.target;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// If we found a topmost visible heading, activate its link
|
||||||
|
if (topmostVisibleHeading) {
|
||||||
|
const newActiveLink = tocElement.querySelector(`a[href="#${topmostVisibleHeading.id}"]`);
|
||||||
|
if (newActiveLink && newActiveLink !== activeLink) {
|
||||||
|
// Remove active class from previous link
|
||||||
|
if (activeLink) {
|
||||||
|
activeLink.classList.remove('active');
|
||||||
|
activeLink.parentElement.classList.remove('active-parent'); // Optional parent styling
|
||||||
|
}
|
||||||
|
// Add active class to the new link
|
||||||
|
newActiveLink.classList.add('active');
|
||||||
|
newActiveLink.parentElement.classList.add('active-parent'); // Optional parent styling
|
||||||
|
activeLink = newActiveLink;
|
||||||
|
|
||||||
|
// Optional: Scroll the ToC sidebar to keep the active link visible
|
||||||
|
// newActiveLink.scrollIntoView({ behavior: 'smooth', block: 'nearest' });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// If no headings are intersecting (scrolled past the last one?), maybe deactivate all
|
||||||
|
// Or keep the last one active - depends on desired behavior. Current logic keeps last active.
|
||||||
|
};
|
||||||
|
|
||||||
|
const observer = new IntersectionObserver(observerCallback, observerOptions);
|
||||||
|
|
||||||
|
// Observe all target headings
|
||||||
|
observerTargets.forEach(heading => observer.observe(heading));
|
||||||
|
|
||||||
|
// Initial check in case a heading is already in view on load
|
||||||
|
// (Requires slight delay for accurate layout calculation)
|
||||||
|
setTimeout(() => {
|
||||||
|
observerCallback(observer.takeRecords()); // Process initial state
|
||||||
|
}, 100);
|
||||||
|
|
||||||
|
// move footer and the hr before footer to the end of the main content
|
||||||
|
const footer = document.querySelector('footer');
|
||||||
|
const hr = footer.previousElementSibling;
|
||||||
|
if (hr && hr.tagName === 'HR') {
|
||||||
|
mainContent.appendChild(hr);
|
||||||
|
}
|
||||||
|
mainContent.appendChild(footer);
|
||||||
|
console.info("TOC Generator: Footer moved to the end of the main content.");
|
||||||
|
|
||||||
|
});
|
||||||
@@ -251,7 +251,7 @@ from crawl4ai import (
|
|||||||
RoundRobinProxyStrategy,
|
RoundRobinProxyStrategy,
|
||||||
)
|
)
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai.proxy_strategy import ProxyConfig
|
from crawl4ai import ProxyConfig
|
||||||
async def main():
|
async def main():
|
||||||
# Load proxies and create rotation strategy
|
# Load proxies and create rotation strategy
|
||||||
proxies = ProxyConfig.from_env()
|
proxies = ProxyConfig.from_env()
|
||||||
|
|||||||
74
docs/md_v2/core/ask-ai.md
Normal file
74
docs/md_v2/core/ask-ai.md
Normal file
@@ -0,0 +1,74 @@
|
|||||||
|
<div class="ask-ai-container">
|
||||||
|
<iframe id="ask-ai-frame" src="../../ask_ai/index.html" width="100%" style="border:none; display: block;" title="Crawl4AI Assistant"></iframe>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<script>
|
||||||
|
// Iframe height adjustment
|
||||||
|
function resizeAskAiIframe() {
|
||||||
|
const iframe = document.getElementById('ask-ai-frame');
|
||||||
|
if (iframe) {
|
||||||
|
const headerHeight = parseFloat(getComputedStyle(document.documentElement).getPropertyValue('--header-height') || '55');
|
||||||
|
// Footer is removed by JS below, so calculate height based on header + small buffer
|
||||||
|
const topOffset = headerHeight + 20; // Header + buffer/margin
|
||||||
|
|
||||||
|
const availableHeight = window.innerHeight - topOffset;
|
||||||
|
iframe.style.height = Math.max(600, availableHeight) + 'px'; // Min height 600px
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Run immediately and on resize/load
|
||||||
|
resizeAskAiIframe(); // Initial call
|
||||||
|
let resizeTimer;
|
||||||
|
window.addEventListener('load', resizeAskAiIframe);
|
||||||
|
window.addEventListener('resize', () => {
|
||||||
|
clearTimeout(resizeTimer);
|
||||||
|
resizeTimer = setTimeout(resizeAskAiIframe, 150);
|
||||||
|
});
|
||||||
|
|
||||||
|
// Remove Footer & HR from parent page (DOM Ready might be safer)
|
||||||
|
document.addEventListener('DOMContentLoaded', () => {
|
||||||
|
setTimeout(() => { // Add slight delay just in case elements render slowly
|
||||||
|
const footer = window.parent.document.querySelector('footer'); // Target parent document
|
||||||
|
if (footer) {
|
||||||
|
const hrBeforeFooter = footer.previousElementSibling;
|
||||||
|
if (hrBeforeFooter && hrBeforeFooter.tagName === 'HR') {
|
||||||
|
hrBeforeFooter.remove();
|
||||||
|
}
|
||||||
|
footer.remove();
|
||||||
|
// Trigger resize again after removing footer
|
||||||
|
resizeAskAiIframe();
|
||||||
|
} else {
|
||||||
|
console.warn("Ask AI Page: Could not find footer in parent document to remove.");
|
||||||
|
}
|
||||||
|
}, 100); // Shorter delay
|
||||||
|
});
|
||||||
|
</script>
|
||||||
|
|
||||||
|
<style>
|
||||||
|
#terminal-mkdocs-main-content {
|
||||||
|
padding: 0 !important;
|
||||||
|
margin: 0;
|
||||||
|
width: 100%;
|
||||||
|
height: 100%;
|
||||||
|
overflow: hidden; /* Prevent body scrollbars, panels handle scroll */
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Ensure iframe container takes full space */
|
||||||
|
#terminal-mkdocs-main-content .ask-ai-container {
|
||||||
|
/* Remove negative margins if footer removal handles space */
|
||||||
|
margin: 0;
|
||||||
|
padding: 0;
|
||||||
|
max-width: none;
|
||||||
|
/* Let the JS set the height */
|
||||||
|
/* height: 600px; Initial fallback height */
|
||||||
|
overflow: hidden; /* Hide potential overflow before JS resize */
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Hide title/paragraph if they were part of the markdown */
|
||||||
|
/* Alternatively, just remove them from the .md file directly */
|
||||||
|
/* #terminal-mkdocs-main-content > h1,
|
||||||
|
#terminal-mkdocs-main-content > p:first-of-type {
|
||||||
|
display: none;
|
||||||
|
} */
|
||||||
|
|
||||||
|
</style>
|
||||||
File diff suppressed because it is too large
Load Diff
0
docs/tutorials/coming_soon.md
Normal file
0
docs/tutorials/coming_soon.md
Normal file
11
mkdocs.yml
11
mkdocs.yml
@@ -7,10 +7,11 @@ docs_dir: docs/md_v2
|
|||||||
|
|
||||||
nav:
|
nav:
|
||||||
- Home: 'index.md'
|
- Home: 'index.md'
|
||||||
|
- "Ask AI": "core/ask-ai.md"
|
||||||
|
- "Quick Start": "core/quickstart.md"
|
||||||
- Setup & Installation:
|
- Setup & Installation:
|
||||||
- "Installation": "core/installation.md"
|
- "Installation": "core/installation.md"
|
||||||
- "Docker Deployment": "core/docker-deployment.md"
|
- "Docker Deployment": "core/docker-deployment.md"
|
||||||
- "Quick Start": "core/quickstart.md"
|
|
||||||
- "Blog & Changelog":
|
- "Blog & Changelog":
|
||||||
- "Blog Home": "blog/index.md"
|
- "Blog Home": "blog/index.md"
|
||||||
- "Changelog": "https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md"
|
- "Changelog": "https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md"
|
||||||
@@ -76,6 +77,7 @@ extra:
|
|||||||
version: !ENV [CRAWL4AI_VERSION, 'development']
|
version: !ENV [CRAWL4AI_VERSION, 'development']
|
||||||
|
|
||||||
extra_css:
|
extra_css:
|
||||||
|
- assets/layout.css
|
||||||
- assets/styles.css
|
- assets/styles.css
|
||||||
- assets/highlight.css
|
- assets/highlight.css
|
||||||
- assets/dmvendor.css
|
- assets/dmvendor.css
|
||||||
@@ -83,4 +85,9 @@ extra_css:
|
|||||||
extra_javascript:
|
extra_javascript:
|
||||||
- assets/highlight.min.js
|
- assets/highlight.min.js
|
||||||
- assets/highlight_init.js
|
- assets/highlight_init.js
|
||||||
- https://buttons.github.io/buttons.js
|
- https://buttons.github.io/buttons.js
|
||||||
|
- assets/toc.js
|
||||||
|
- assets/github_stats.js
|
||||||
|
- assets/selection_ask_ai.js
|
||||||
|
- assets/copy_code.js
|
||||||
|
- assets/floating_ask_ai_button.js
|
||||||
@@ -1,20 +0,0 @@
|
|||||||
The file /docs/md_v2/api/parameters.md should be updated to include the new network and console capturing parameters.
|
|
||||||
|
|
||||||
Here's what needs to be updated:
|
|
||||||
|
|
||||||
1. Change section title from:
|
|
||||||
```
|
|
||||||
### G) **Debug & Logging**
|
|
||||||
```
|
|
||||||
to:
|
|
||||||
```
|
|
||||||
### G) **Debug, Logging & Capturing**
|
|
||||||
```
|
|
||||||
|
|
||||||
2. Add new parameters to the table:
|
|
||||||
```
|
|
||||||
| **`capture_network_requests`** | `bool` (False) | Captures all network requests, responses, and failures during the crawl. Available in `result.network_requests`. |
|
|
||||||
| **`capture_console_messages`** | `bool` (False) | Captures all browser console messages (logs, warnings, errors) during the crawl. Available in `result.console_messages`. |
|
|
||||||
```
|
|
||||||
|
|
||||||
These changes demonstrate how to use the new network and console capturing features in the CrawlerRunConfig.
|
|
||||||
596
tests/docker/test_rest_api_deep_crawl.py
Normal file
596
tests/docker/test_rest_api_deep_crawl.py
Normal file
@@ -0,0 +1,596 @@
|
|||||||
|
# ==== File: test_rest_api_deep_crawl.py ====
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import pytest_asyncio
|
||||||
|
import httpx
|
||||||
|
import json
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
from typing import List, Dict, Any, AsyncGenerator
|
||||||
|
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
load_dotenv() # Load environment variables from .env file if present
|
||||||
|
|
||||||
|
# --- Test Configuration ---
|
||||||
|
BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235") # If server is running in Docker, use the host's IP
|
||||||
|
BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020") # If server is running in dev debug mode
|
||||||
|
DEEP_CRAWL_BASE_URL = "https://docs.crawl4ai.com/samples/deepcrawl/"
|
||||||
|
DEEP_CRAWL_DOMAIN = "docs.crawl4ai.com" # Used for domain filter
|
||||||
|
|
||||||
|
# --- Helper Functions ---
|
||||||
|
def load_proxies_from_env() -> List[Dict]:
|
||||||
|
"""Load proxies from PROXIES environment variable"""
|
||||||
|
proxies = []
|
||||||
|
proxies_str = os.getenv("PROXIES", "")
|
||||||
|
if not proxies_str:
|
||||||
|
print("PROXIES environment variable not set or empty.")
|
||||||
|
return proxies
|
||||||
|
try:
|
||||||
|
proxy_list = proxies_str.split(",")
|
||||||
|
for proxy in proxy_list:
|
||||||
|
proxy = proxy.strip()
|
||||||
|
if not proxy:
|
||||||
|
continue
|
||||||
|
parts = proxy.split(":")
|
||||||
|
if len(parts) == 4:
|
||||||
|
ip, port, username, password = parts
|
||||||
|
proxies.append({
|
||||||
|
"server": f"http://{ip}:{port}", # Assuming http, adjust if needed
|
||||||
|
"username": username,
|
||||||
|
"password": password,
|
||||||
|
"ip": ip # Store original IP if available
|
||||||
|
})
|
||||||
|
elif len(parts) == 2: # ip:port only
|
||||||
|
ip, port = parts
|
||||||
|
proxies.append({
|
||||||
|
"server": f"http://{ip}:{port}",
|
||||||
|
"ip": ip
|
||||||
|
})
|
||||||
|
else:
|
||||||
|
print(f"Skipping invalid proxy string format: {proxy}")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error loading proxies from environment: {e}")
|
||||||
|
return proxies
|
||||||
|
|
||||||
|
|
||||||
|
async def check_server_health(client: httpx.AsyncClient):
|
||||||
|
"""Check if the server is healthy before running tests."""
|
||||||
|
try:
|
||||||
|
response = await client.get("/health")
|
||||||
|
response.raise_for_status()
|
||||||
|
print(f"\nServer healthy: {response.json()}")
|
||||||
|
return True
|
||||||
|
except (httpx.RequestError, httpx.HTTPStatusError) as e:
|
||||||
|
pytest.fail(f"Server health check failed: {e}. Is the server running at {BASE_URL}?", pytrace=False)
|
||||||
|
|
||||||
|
async def assert_crawl_result_structure(result: Dict[str, Any], check_ssl=False):
|
||||||
|
"""Asserts the basic structure of a single crawl result."""
|
||||||
|
assert isinstance(result, dict)
|
||||||
|
assert "url" in result
|
||||||
|
assert "success" in result
|
||||||
|
assert "html" in result # Basic crawls should return HTML
|
||||||
|
assert "metadata" in result
|
||||||
|
assert isinstance(result["metadata"], dict)
|
||||||
|
assert "depth" in result["metadata"] # Deep crawls add depth
|
||||||
|
|
||||||
|
if check_ssl:
|
||||||
|
assert "ssl_certificate" in result # Check if SSL info is present
|
||||||
|
assert isinstance(result["ssl_certificate"], dict) or result["ssl_certificate"] is None
|
||||||
|
|
||||||
|
|
||||||
|
async def process_streaming_response(response: httpx.Response) -> List[Dict[str, Any]]:
|
||||||
|
"""Processes an NDJSON streaming response."""
|
||||||
|
results = []
|
||||||
|
completed = False
|
||||||
|
async for line in response.aiter_lines():
|
||||||
|
if line:
|
||||||
|
try:
|
||||||
|
data = json.loads(line)
|
||||||
|
if data.get("status") == "completed":
|
||||||
|
completed = True
|
||||||
|
break # Stop processing after completion marker
|
||||||
|
elif data.get("url"): # Ensure it looks like a result object
|
||||||
|
results.append(data)
|
||||||
|
else:
|
||||||
|
print(f"Received non-result JSON line: {data}") # Log other status messages if needed
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
pytest.fail(f"Failed to decode JSON line: {line}")
|
||||||
|
assert completed, "Streaming response did not end with a completion marker."
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
# --- Pytest Fixtures ---
|
||||||
|
@pytest_asyncio.fixture(scope="function")
|
||||||
|
async def async_client() -> AsyncGenerator[httpx.AsyncClient, None]:
|
||||||
|
"""Provides an async HTTP client"""
|
||||||
|
# Increased timeout for potentially longer deep crawls
|
||||||
|
async with httpx.AsyncClient(base_url=BASE_URL, timeout=300.0) as client:
|
||||||
|
yield client
|
||||||
|
# No explicit close needed with 'async with'
|
||||||
|
|
||||||
|
# --- Test Class ---
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
class TestDeepCrawlEndpoints:
|
||||||
|
|
||||||
|
@pytest_asyncio.fixture(autouse=True)
|
||||||
|
async def check_health_before_tests(self, async_client: httpx.AsyncClient):
|
||||||
|
"""Fixture to ensure server is healthy before each test in the class."""
|
||||||
|
await check_server_health(async_client)
|
||||||
|
|
||||||
|
# 1. Basic Deep Crawl
|
||||||
|
async def test_deep_crawl_basic_bfs(self, async_client: httpx.AsyncClient):
|
||||||
|
"""Test BFS deep crawl with limited depth and pages."""
|
||||||
|
max_depth = 1
|
||||||
|
max_pages = 3 # start_url + 2 more
|
||||||
|
payload = {
|
||||||
|
"urls": [DEEP_CRAWL_BASE_URL],
|
||||||
|
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||||
|
"crawler_config": {
|
||||||
|
"type": "CrawlerRunConfig",
|
||||||
|
"params": {
|
||||||
|
"stream": False,
|
||||||
|
"cache_mode": "BYPASS", # Use string value for CacheMode
|
||||||
|
"deep_crawl_strategy": {
|
||||||
|
"type": "BFSDeepCrawlStrategy",
|
||||||
|
"params": {
|
||||||
|
"max_depth": max_depth,
|
||||||
|
"max_pages": max_pages,
|
||||||
|
# Minimal filters for basic test
|
||||||
|
"filter_chain": {
|
||||||
|
"type": "FilterChain",
|
||||||
|
"params": {
|
||||||
|
"filters": [
|
||||||
|
{
|
||||||
|
"type": "DomainFilter",
|
||||||
|
"params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
response = await async_client.post("/crawl", json=payload)
|
||||||
|
response.raise_for_status()
|
||||||
|
data = response.json()
|
||||||
|
|
||||||
|
assert data["success"] is True
|
||||||
|
assert isinstance(data["results"], list)
|
||||||
|
assert len(data["results"]) > 1 # Should be more than just the start URL
|
||||||
|
assert len(data["results"]) <= max_pages # Respect max_pages
|
||||||
|
|
||||||
|
found_depth_0 = False
|
||||||
|
found_depth_1 = False
|
||||||
|
for result in data["results"]:
|
||||||
|
await assert_crawl_result_structure(result)
|
||||||
|
assert result["success"] is True
|
||||||
|
assert DEEP_CRAWL_DOMAIN in result["url"]
|
||||||
|
depth = result["metadata"]["depth"]
|
||||||
|
assert depth <= max_depth
|
||||||
|
if depth == 0: found_depth_0 = True
|
||||||
|
if depth == 1: found_depth_1 = True
|
||||||
|
|
||||||
|
assert found_depth_0
|
||||||
|
assert found_depth_1
|
||||||
|
|
||||||
|
# 2. Deep Crawl with Filtering
|
||||||
|
async def test_deep_crawl_with_filters(self, async_client: httpx.AsyncClient):
|
||||||
|
"""Test BFS deep crawl with content type and domain filters."""
|
||||||
|
max_depth = 1
|
||||||
|
max_pages = 5
|
||||||
|
payload = {
|
||||||
|
"urls": [DEEP_CRAWL_BASE_URL],
|
||||||
|
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||||
|
"crawler_config": {
|
||||||
|
"type": "CrawlerRunConfig",
|
||||||
|
"params": {
|
||||||
|
"stream": False,
|
||||||
|
"cache_mode": "BYPASS",
|
||||||
|
"deep_crawl_strategy": {
|
||||||
|
"type": "BFSDeepCrawlStrategy",
|
||||||
|
"params": {
|
||||||
|
"max_depth": max_depth,
|
||||||
|
"max_pages": max_pages,
|
||||||
|
"filter_chain": {
|
||||||
|
"type": "FilterChain",
|
||||||
|
"params": {
|
||||||
|
"filters": [
|
||||||
|
{
|
||||||
|
"type": "DomainFilter",
|
||||||
|
"params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "ContentTypeFilter",
|
||||||
|
"params": {"allowed_types": ["text/html"]}
|
||||||
|
},
|
||||||
|
# Example: Exclude specific paths using regex
|
||||||
|
{
|
||||||
|
"type": "URLPatternFilter",
|
||||||
|
"params": {
|
||||||
|
"patterns": ["*/category-3/*"], # Block category 3
|
||||||
|
"reverse": True # Block if match
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
response = await async_client.post("/crawl", json=payload)
|
||||||
|
response.raise_for_status()
|
||||||
|
data = response.json()
|
||||||
|
|
||||||
|
assert data["success"] is True
|
||||||
|
assert len(data["results"]) > 0
|
||||||
|
assert len(data["results"]) <= max_pages
|
||||||
|
|
||||||
|
for result in data["results"]:
|
||||||
|
await assert_crawl_result_structure(result)
|
||||||
|
assert result["success"] is True
|
||||||
|
assert DEEP_CRAWL_DOMAIN in result["url"]
|
||||||
|
assert "category-3" not in result["url"] # Check if filter worked
|
||||||
|
assert result["metadata"]["depth"] <= max_depth
|
||||||
|
|
||||||
|
# 3. Deep Crawl with Scoring
|
||||||
|
async def test_deep_crawl_with_scoring(self, async_client: httpx.AsyncClient):
|
||||||
|
"""Test BFS deep crawl with URL scoring."""
|
||||||
|
max_depth = 1
|
||||||
|
max_pages = 4
|
||||||
|
payload = {
|
||||||
|
"urls": [DEEP_CRAWL_BASE_URL],
|
||||||
|
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||||
|
"crawler_config": {
|
||||||
|
"type": "CrawlerRunConfig",
|
||||||
|
"params": {
|
||||||
|
"stream": False,
|
||||||
|
"cache_mode": "BYPASS",
|
||||||
|
"deep_crawl_strategy": {
|
||||||
|
"type": "BFSDeepCrawlStrategy",
|
||||||
|
"params": {
|
||||||
|
"max_depth": max_depth,
|
||||||
|
"max_pages": max_pages,
|
||||||
|
"filter_chain": { # Keep basic domain filter
|
||||||
|
"type": "FilterChain",
|
||||||
|
"params": { "filters": [{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}]}
|
||||||
|
},
|
||||||
|
"url_scorer": { # Add scorer
|
||||||
|
"type": "CompositeScorer",
|
||||||
|
"params": {
|
||||||
|
"scorers": [
|
||||||
|
{ # Favor pages with 'product' in the URL
|
||||||
|
"type": "KeywordRelevanceScorer",
|
||||||
|
"params": {"keywords": ["product"], "weight": 1.0}
|
||||||
|
},
|
||||||
|
{ # Penalize deep paths slightly
|
||||||
|
"type": "PathDepthScorer",
|
||||||
|
"params": {"optimal_depth": 2, "weight": -0.2}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
# Set a threshold if needed: "score_threshold": 0.1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
response = await async_client.post("/crawl", json=payload)
|
||||||
|
response.raise_for_status()
|
||||||
|
data = response.json()
|
||||||
|
|
||||||
|
assert data["success"] is True
|
||||||
|
assert len(data["results"]) > 0
|
||||||
|
assert len(data["results"]) <= max_pages
|
||||||
|
|
||||||
|
# Check if results seem biased towards products (harder to assert strictly without knowing exact scores)
|
||||||
|
product_urls_found = any("product_" in result["url"] for result in data["results"] if result["metadata"]["depth"] > 0)
|
||||||
|
print(f"Product URLs found among depth > 0 results: {product_urls_found}")
|
||||||
|
# We expect scoring to prioritize product pages if available within limits
|
||||||
|
# assert product_urls_found # This might be too strict depending on site structure and limits
|
||||||
|
|
||||||
|
for result in data["results"]:
|
||||||
|
await assert_crawl_result_structure(result)
|
||||||
|
assert result["success"] is True
|
||||||
|
assert result["metadata"]["depth"] <= max_depth
|
||||||
|
|
||||||
|
    # 4. Deep Crawl with CSS Extraction
    async def test_deep_crawl_with_css_extraction(self, async_client: httpx.AsyncClient):
        """Test BFS deep crawl combined with JsonCssExtractionStrategy.

        Crawls the dummy product site and applies a CSS extraction schema to
        every crawled page; asserts at least one product page yielded data.
        """
        max_depth = 6  # Go deep enough to reach product pages
        max_pages = 20
        # Schema to extract product details
        product_schema = {
            "name": "ProductDetails",
            "baseSelector": "div.container",  # Base for product page
            "fields": [
                {"name": "product_title", "selector": "h1", "type": "text"},
                {"name": "price", "selector": ".product-price", "type": "text"},
                {"name": "description", "selector": ".product-description p", "type": "text"},
                # Nested list field: one entry per ".product-specs li" element
                {"name": "specs", "selector": ".product-specs li", "type": "list", "fields": [
                    {"name": "spec_name", "selector": ".spec-name", "type": "text"},
                    {"name": "spec_value", "selector": ".spec-value", "type": "text"}
                ]}
            ]
        }
        payload = {
            "urls": [DEEP_CRAWL_BASE_URL],
            "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
            "crawler_config": {
                "type": "CrawlerRunConfig",
                "params": {
                    "stream": False,
                    "cache_mode": "BYPASS",
                    "extraction_strategy": {  # Apply extraction to ALL crawled pages
                        "type": "JsonCssExtractionStrategy",
                        # Schema dicts must be wrapped in the {"type": "dict", "value": ...} envelope
                        "params": {"schema": {"type": "dict", "value": product_schema}}
                    },
                    "deep_crawl_strategy": {
                        "type": "BFSDeepCrawlStrategy",
                        "params": {
                            "max_depth": max_depth,
                            "max_pages": max_pages,
                            "filter_chain": {  # Only crawl HTML on our domain
                                "type": "FilterChain",
                                "params": {
                                    "filters": [
                                        {"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}},
                                        {"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}}
                                    ]
                                }
                            }
                            # Optional: Add scoring to prioritize product pages for extraction
                        }
                    }
                }
            }
        }
        response = await async_client.post("/crawl", json=payload)
        response.raise_for_status()
        data = response.json()

        assert data["success"] is True
        assert len(data["results"]) > 0
        # assert len(data["results"]) <= max_pages

        found_extracted_product = False
        for result in data["results"]:
            await assert_crawl_result_structure(result)
            assert result["success"] is True
            assert "extracted_content" in result
            if "product_" in result["url"]:  # Check product pages specifically
                assert result["extracted_content"] is not None
                try:
                    extracted = json.loads(result["extracted_content"])
                    # Schema returns list even if one base match
                    assert isinstance(extracted, list)
                    if extracted:
                        item = extracted[0]
                        assert "product_title" in item and item["product_title"]
                        assert "price" in item and item["price"]
                        # Specs might be empty list if not found
                        assert "specs" in item and isinstance(item["specs"], list)
                        found_extracted_product = True
                        print(f"Extracted product: {item.get('product_title')}")
                except (json.JSONDecodeError, AssertionError, IndexError) as e:
                    pytest.fail(f"Extraction validation failed for {result['url']}: {e}\nContent: {result['extracted_content']}")
            # else:
            #     # Non-product pages might have None or empty list depending on schema match
            #     assert result["extracted_content"] is None or json.loads(result["extracted_content"]) == []

        assert found_extracted_product, "Did not find any pages where product data was successfully extracted."
|
||||||
|
|
||||||
|
    # 5. Deep Crawl with LLM Extraction (Requires Server LLM Setup)
    async def test_deep_crawl_with_llm_extraction(self, async_client: httpx.AsyncClient):
        """Test BFS deep crawl combined with LLMExtractionStrategy.

        Keeps depth/page limits tiny since every crawled page triggers an
        LLM call; requires the server to have a working LLM API key.
        """
        max_depth = 1  # Limit depth to keep LLM calls manageable
        max_pages = 3
        payload = {
            "urls": [DEEP_CRAWL_BASE_URL],
            "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
            "crawler_config": {
                "type": "CrawlerRunConfig",
                "params": {
                    "stream": False,
                    "cache_mode": "BYPASS",
                    "extraction_strategy": {  # Apply LLM extraction to crawled pages
                        "type": "LLMExtractionStrategy",
                        "params": {
                            "instruction": "Extract the main H1 title and the text content of the first paragraph.",
                            "llm_config": {  # Example override, rely on server default if possible
                                "type": "LLMConfig",
                                "params": {"provider": "openai/gpt-4.1-mini"}  # Use a cheaper model for testing
                            },
                            "schema": {  # Expected JSON output
                                "type": "dict",
                                "value": {
                                    "title": "PageContent", "type": "object",
                                    "properties": {
                                        "h1_title": {"type": "string"},
                                        "first_paragraph": {"type": "string"}
                                    }
                                }
                            }
                        }
                    },
                    "deep_crawl_strategy": {
                        "type": "BFSDeepCrawlStrategy",
                        "params": {
                            "max_depth": max_depth,
                            "max_pages": max_pages,
                            "filter_chain": {
                                "type": "FilterChain",
                                "params": {
                                    "filters": [
                                        {"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}},
                                        {"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}}
                                    ]
                                }
                            }
                        }
                    }
                }
            }
        }

        try:
            response = await async_client.post("/crawl", json=payload)
            response.raise_for_status()
            data = response.json()
        except httpx.HTTPStatusError as e:
            pytest.fail(f"Deep Crawl + LLM extraction request failed: {e}. Response: {e.response.text}. Check server logs and LLM API key setup.")
        except httpx.RequestError as e:
            pytest.fail(f"Deep Crawl + LLM extraction request failed: {e}.")

        assert data["success"] is True
        assert len(data["results"]) > 0
        assert len(data["results"]) <= max_pages

        found_llm_extraction = False
        for result in data["results"]:
            await assert_crawl_result_structure(result)
            assert result["success"] is True
            assert "extracted_content" in result
            assert result["extracted_content"] is not None
            try:
                extracted = json.loads(result["extracted_content"])
                if isinstance(extracted, list): extracted = extracted[0]  # Handle list output
                assert isinstance(extracted, dict)
                assert "h1_title" in extracted  # Check keys based on schema
                assert "first_paragraph" in extracted
                found_llm_extraction = True
                print(f"LLM extracted from {result['url']}: Title='{extracted.get('h1_title')}'")
            except (json.JSONDecodeError, AssertionError, IndexError, TypeError) as e:
                pytest.fail(f"LLM extraction validation failed for {result['url']}: {e}\nContent: {result['extracted_content']}")

        assert found_llm_extraction, "LLM extraction did not yield expected data on any crawled page."
|
||||||
|
|
||||||
|
|
||||||
|
    # 6. Deep Crawl with SSL Certificate Fetching
    async def test_deep_crawl_with_ssl(self, async_client: httpx.AsyncClient):
        """Test BFS deep crawl with fetch_ssl_certificate enabled.

        Crawls only the start URL (max_depth=0) and, when a certificate is
        returned, checks it is a dict carrying the expected keys.
        """
        max_depth = 0  # Only fetch for start URL to keep test fast
        max_pages = 1
        payload = {
            "urls": [DEEP_CRAWL_BASE_URL],
            "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
            "crawler_config": {
                "type": "CrawlerRunConfig",
                "params": {
                    "stream": False,
                    "cache_mode": "BYPASS",
                    "fetch_ssl_certificate": True,  # <-- Enable SSL fetching
                    "deep_crawl_strategy": {
                        "type": "BFSDeepCrawlStrategy",
                        "params": {
                            "max_depth": max_depth,
                            "max_pages": max_pages,
                        }
                    }
                }
            }
        }
        response = await async_client.post("/crawl", json=payload)
        response.raise_for_status()
        data = response.json()

        assert data["success"] is True
        assert len(data["results"]) == 1
        result = data["results"][0]

        await assert_crawl_result_structure(result, check_ssl=True)  # <-- Tell helper to check SSL field
        assert result["success"] is True
        # Check if SSL info was actually retrieved
        if result["ssl_certificate"]:
            # Assert directly using dictionary keys
            assert isinstance(result["ssl_certificate"], dict)  # Verify it's a dict
            assert "issuer" in result["ssl_certificate"]
            assert "subject" in result["ssl_certificate"]
            # --- MODIFIED ASSERTIONS ---
            assert "not_before" in result["ssl_certificate"]  # Check for the actual key
            assert "not_after" in result["ssl_certificate"]  # Check for the actual key
            # --- END MODIFICATIONS ---
            assert "fingerprint" in result["ssl_certificate"]  # Check another key

            # This print statement using .get() already works correctly with dictionaries
            print(f"SSL Issuer Org: {result['ssl_certificate'].get('issuer', {}).get('O', 'N/A')}")
            print(f"SSL Valid From: {result['ssl_certificate'].get('not_before', 'N/A')}")
        else:
            # This part remains the same
            print("SSL Certificate was null in the result.")
|
||||||
|
|
||||||
|
|
||||||
|
    # 7. Deep Crawl with Proxy Rotation (Requires PROXIES env var)
    async def test_deep_crawl_with_proxies(self, async_client: httpx.AsyncClient):
        """Test BFS deep crawl using proxy rotation.

        Skipped unless the PROXIES environment variable yields at least one
        proxy. The primary assertion is that the crawl succeeds at all with
        a proxy_rotation_strategy configured.
        """
        proxies = load_proxies_from_env()
        if not proxies:
            pytest.skip("Skipping proxy test: PROXIES environment variable not set or empty.")

        print(f"\nTesting with {len(proxies)} proxies loaded from environment.")

        max_depth = 1
        max_pages = 3
        payload = {
            "urls": [DEEP_CRAWL_BASE_URL],  # Use the dummy site
            # Use a BrowserConfig that *might* pick up proxy if set, but rely on CrawlerRunConfig
            "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
            "crawler_config": {
                "type": "CrawlerRunConfig",
                "params": {
                    "stream": False,
                    "cache_mode": "BYPASS",
                    "proxy_rotation_strategy": {  # <-- Define the strategy
                        "type": "RoundRobinProxyStrategy",
                        "params": {
                            # Convert ProxyConfig dicts back to the serialized format expected by server
                            "proxies": [{"type": "ProxyConfig", "params": p} for p in proxies]
                        }
                    },
                    "deep_crawl_strategy": {
                        "type": "BFSDeepCrawlStrategy",
                        "params": {
                            "max_depth": max_depth,
                            "max_pages": max_pages,
                            "filter_chain": {
                                "type": "FilterChain",
                                "params": { "filters": [{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}]}
                            }
                        }
                    }
                }
            }
        }
        try:
            response = await async_client.post("/crawl", json=payload)
            response.raise_for_status()
            data = response.json()
        except httpx.HTTPStatusError as e:
            # Proxies often cause connection errors, catch them
            pytest.fail(f"Proxy deep crawl failed: {e}. Response: {e.response.text}. Are proxies valid and accessible by the server?")
        except httpx.RequestError as e:
            pytest.fail(f"Proxy deep crawl request failed: {e}. Are proxies valid and accessible?")

        assert data["success"] is True
        assert len(data["results"]) > 0
        assert len(data["results"]) <= max_pages
        # Primary assertion is that the crawl succeeded *with* proxy config
        print(f"Proxy deep crawl completed successfully for {len(data['results'])} pages.")

        # Verifying specific proxy usage requires server logs or custom headers/responses
|
||||||
|
|
||||||
|
|
||||||
|
# --- Main Execution Block (for running script directly) ---
if __name__ == "__main__":
    # Run pytest against this file: verbose (-v) with stdout passthrough (-s).
    cli_args = ["-v", "-s", __file__]
    # Example: Run only proxy test
    # cli_args.append("-k test_deep_crawl_with_proxies")
    print(f"Running pytest with args: {cli_args}")
    status = pytest.main(cli_args)
    print(f"Pytest finished with exit code: {status}")
|
||||||
655
tests/docker/test_server_requests.py
Normal file
655
tests/docker/test_server_requests.py
Normal file
@@ -0,0 +1,655 @@
|
|||||||
|
import pytest
|
||||||
|
import pytest_asyncio
|
||||||
|
import httpx
|
||||||
|
import json
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
from typing import List, Dict, Any, AsyncGenerator
|
||||||
|
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
|
||||||
|
# Optional: Import crawl4ai classes directly for reference/easier payload creation aid
|
||||||
|
# You don't strictly NEED these imports for the tests to run against the server,
|
||||||
|
# but they help in understanding the structure you are mimicking in JSON.
|
||||||
|
from crawl4ai import (
|
||||||
|
BrowserConfig,
|
||||||
|
CrawlerRunConfig,
|
||||||
|
CacheMode,
|
||||||
|
DefaultMarkdownGenerator,
|
||||||
|
PruningContentFilter,
|
||||||
|
BM25ContentFilter,
|
||||||
|
BFSDeepCrawlStrategy,
|
||||||
|
FilterChain,
|
||||||
|
ContentTypeFilter,
|
||||||
|
DomainFilter,
|
||||||
|
CompositeScorer,
|
||||||
|
KeywordRelevanceScorer,
|
||||||
|
PathDepthScorer,
|
||||||
|
JsonCssExtractionStrategy,
|
||||||
|
LLMExtractionStrategy,
|
||||||
|
LLMConfig
|
||||||
|
)
|
||||||
|
|
||||||
|
# --- Test Configuration ---
|
||||||
|
# BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020") # Make base URL configurable
|
||||||
|
BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235") # Make base URL configurable
|
||||||
|
# Use a known simple HTML page for basic tests
|
||||||
|
SIMPLE_HTML_URL = "https://httpbin.org/html"
|
||||||
|
# Use a site suitable for scraping tests
|
||||||
|
SCRAPE_TARGET_URL = "http://books.toscrape.com/"
|
||||||
|
# Use a site with internal links for deep crawl tests
|
||||||
|
DEEP_CRAWL_URL = "https://python.org"
|
||||||
|
|
||||||
|
# --- Pytest Fixtures ---
|
||||||
|
|
||||||
|
# Use the built-in event_loop fixture from pytest_asyncio
|
||||||
|
# The custom implementation was causing issues with closing the loop
|
||||||
|
|
||||||
|
@pytest_asyncio.fixture(scope="function")  # Function scope to avoid event loop issues
async def async_client() -> AsyncGenerator[httpx.AsyncClient, None]:
    """Yield an async HTTP client bound to the test server's BASE_URL.

    Uses the client as an async context manager so the connection pool is
    closed even if teardown is interrupted by an exception, instead of
    relying on an explicit aclose() call after the yield.
    """
    # Generous timeout: crawl requests can legitimately take a long time.
    async with httpx.AsyncClient(base_url=BASE_URL, timeout=120.0) as client:
        yield client
|
||||||
|
|
||||||
|
# --- Helper Functions ---
|
||||||
|
|
||||||
|
async def check_server_health(client: httpx.AsyncClient):
    """Ping the /health endpoint; abort the whole test (pytest.fail) if unreachable."""
    try:
        resp = await client.get("/health")
        resp.raise_for_status()
    except (httpx.RequestError, httpx.HTTPStatusError) as e:
        # pytrace=False keeps the failure message short — the traceback adds nothing here.
        pytest.fail(f"Server health check failed: {e}. Is the server running at {BASE_URL}?", pytrace=False)
    else:
        print(f"\nServer healthy: {resp.json()}")
        return True
|
||||||
|
|
||||||
|
async def assert_crawl_result_structure(result: Dict[str, Any], check_ssl: bool = False):
    """Assert the basic structure of a single crawl result dict.

    Args:
        result: One entry from the server response's "results" list.
        check_ssl: When True, additionally require the "ssl_certificate" key
            to be present (for tests that enable fetch_ssl_certificate).
            Defaults to False, so existing call sites are unaffected.

    Raises:
        AssertionError: If result is not a dict or a required key is missing.
    """
    assert isinstance(result, dict)
    assert "url" in result
    assert "success" in result
    assert "html" in result
    if check_ssl:
        # The field must exist on SSL-enabled crawls; its value may still be
        # None when the certificate could not be fetched.
        assert "ssl_certificate" in result
    # Add more common checks if needed
|
||||||
|
|
||||||
|
async def process_streaming_response(response: httpx.Response) -> List[Dict[str, Any]]:
    """Consume an NDJSON stream and return every record seen before the completion marker.

    Fails the test if any line is not valid JSON, or if the stream ends
    without a {"status": "completed"} record.
    """
    records: List[Dict[str, Any]] = []
    saw_completion = False
    async for raw_line in response.aiter_lines():
        if not raw_line:
            continue  # skip blank keep-alive lines between records
        try:
            record = json.loads(raw_line)
        except json.JSONDecodeError:
            pytest.fail(f"Failed to decode JSON line: {raw_line}")
        if record.get("status") == "completed":
            saw_completion = True
            break  # stop processing after completion marker
        records.append(record)
    assert saw_completion, "Streaming response did not end with a completion marker."
    return records
|
||||||
|
|
||||||
|
|
||||||
|
# --- Test Class ---
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
class TestCrawlEndpoints:
|
||||||
|
|
||||||
|
    @pytest_asyncio.fixture(autouse=True)
    async def check_health_before_tests(self, async_client: httpx.AsyncClient):
        """Autouse fixture: verify the server is healthy before each test in the class."""
        # check_server_health() calls pytest.fail() on any connection/HTTP error,
        # aborting the test before it issues real crawl requests.
        await check_server_health(async_client)
|
||||||
|
|
||||||
|
    # 1. Simple Requests (Primitives)
    async def test_simple_crawl_single_url(self, async_client: httpx.AsyncClient):
        """Test /crawl with a single URL and simple config values."""
        payload = {
            "urls": [SIMPLE_HTML_URL],
            "browser_config": {
                "type": "BrowserConfig",
                "params": {
                    "headless": True,
                }
            },
            "crawler_config": {
                "type": "CrawlerRunConfig",
                "params": {
                    "stream": False,  # Explicitly false for /crawl
                    "screenshot": False,
                    "cache_mode": CacheMode.BYPASS.value  # Use enum value
                }
            }
        }
        try:
            response = await async_client.post("/crawl", json=payload)
            print(f"Response status: {response.status_code}")
            response.raise_for_status()
            data = response.json()
        except httpx.HTTPStatusError as e:
            # Dump the server's error body before re-raising so failures are debuggable
            print(f"Server error: {e}")
            print(f"Response content: {e.response.text}")
            raise

        assert data["success"] is True
        assert isinstance(data["results"], list)
        assert len(data["results"]) == 1
        result = data["results"][0]
        await assert_crawl_result_structure(result)
        assert result["success"] is True
        assert result["url"] == SIMPLE_HTML_URL
        # httpbin.org/html always serves this fixed Moby-Dick page
        assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"]
        # We don't specify a markdown generator in this test, so don't make assumptions about markdown field
        # It might be null, missing, or populated depending on the server's default behavior
|
||||||
|
|
||||||
|
    async def test_simple_crawl_single_url_streaming(self, async_client: httpx.AsyncClient):
        """Test /crawl/stream with a single URL and simple config values."""
        payload = {
            "urls": [SIMPLE_HTML_URL],
            "browser_config": {
                "type": "BrowserConfig",
                "params": {
                    "headless": True,
                }
            },
            "crawler_config": {
                "type": "CrawlerRunConfig",
                "params": {
                    "stream": True,  # Must be true for /crawl/stream
                    "screenshot": False,
                    "cache_mode": CacheMode.BYPASS.value
                }
            }
        }
        # Stream the NDJSON response and collect records until the completion marker
        async with async_client.stream("POST", "/crawl/stream", json=payload) as response:
            response.raise_for_status()
            results = await process_streaming_response(response)

        assert len(results) == 1
        result = results[0]
        await assert_crawl_result_structure(result)
        assert result["success"] is True
        assert result["url"] == SIMPLE_HTML_URL
        assert "<h1>Herman Melville - Moby-Dick</h1>" in result["html"]
|
||||||
|
|
||||||
|
|
||||||
|
# 2. Multi-URL and Dispatcher
|
||||||
|
async def test_multi_url_crawl(self, async_client: httpx.AsyncClient):
|
||||||
|
"""Test /crawl with multiple URLs, implicitly testing dispatcher."""
|
||||||
|
urls = [SIMPLE_HTML_URL, "https://httpbin.org/links/10/0"]
|
||||||
|
payload = {
|
||||||
|
"urls": urls,
|
||||||
|
"browser_config": {
|
||||||
|
"type": "BrowserConfig",
|
||||||
|
"params": {"headless": True}
|
||||||
|
},
|
||||||
|
"crawler_config": {
|
||||||
|
"type": "CrawlerRunConfig",
|
||||||
|
"params": {"stream": False, "cache_mode": CacheMode.BYPASS.value}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
try:
|
||||||
|
print(f"Sending deep crawl request to server...")
|
||||||
|
response = await async_client.post("/crawl", json=payload)
|
||||||
|
print(f"Response status: {response.status_code}")
|
||||||
|
|
||||||
|
if response.status_code >= 400:
|
||||||
|
error_detail = response.json().get('detail', 'No detail provided')
|
||||||
|
print(f"Error detail: {error_detail}")
|
||||||
|
print(f"Full response: {response.text}")
|
||||||
|
|
||||||
|
response.raise_for_status()
|
||||||
|
data = response.json()
|
||||||
|
except httpx.HTTPStatusError as e:
|
||||||
|
print(f"Server error status: {e.response.status_code}")
|
||||||
|
print(f"Server error response: {e.response.text}")
|
||||||
|
try:
|
||||||
|
error_json = e.response.json()
|
||||||
|
print(f"Parsed error: {error_json}")
|
||||||
|
except:
|
||||||
|
print("Could not parse error response as JSON")
|
||||||
|
raise
|
||||||
|
|
||||||
|
assert data["success"] is True
|
||||||
|
assert isinstance(data["results"], list)
|
||||||
|
assert len(data["results"]) == len(urls)
|
||||||
|
for result in data["results"]:
|
||||||
|
await assert_crawl_result_structure(result)
|
||||||
|
assert result["success"] is True
|
||||||
|
assert result["url"] in urls
|
||||||
|
|
||||||
|
    async def test_multi_url_crawl_streaming(self, async_client: httpx.AsyncClient):
        """Test /crawl/stream with multiple URLs."""
        urls = [SIMPLE_HTML_URL, "https://httpbin.org/links/10/0"]
        payload = {
            "urls": urls,
            "browser_config": {
                "type": "BrowserConfig",
                "params": {"headless": True}
            },
            "crawler_config": {
                "type": "CrawlerRunConfig",
                "params": {"stream": True, "cache_mode": CacheMode.BYPASS.value}
            }
        }
        async with async_client.stream("POST", "/crawl/stream", json=payload) as response:
            response.raise_for_status()
            results = await process_streaming_response(response)

        assert len(results) == len(urls)
        # Streaming order is not guaranteed, so track which URLs were seen
        processed_urls = set()
        for result in results:
            await assert_crawl_result_structure(result)
            assert result["success"] is True
            assert result["url"] in urls
            processed_urls.add(result["url"])
        assert processed_urls == set(urls)  # Ensure all URLs were processed
|
||||||
|
|
||||||
|
|
||||||
|
# 3. Class Values and Nested Classes (Markdown Generator)
|
||||||
|
async def test_crawl_with_markdown_pruning_filter(self, async_client: httpx.AsyncClient):
|
||||||
|
"""Test /crawl with MarkdownGenerator using PruningContentFilter."""
|
||||||
|
payload = {
|
||||||
|
"urls": [SIMPLE_HTML_URL],
|
||||||
|
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||||
|
"crawler_config": {
|
||||||
|
"type": "CrawlerRunConfig",
|
||||||
|
"params": {
|
||||||
|
"cache_mode": CacheMode.ENABLED.value, # Test different cache mode
|
||||||
|
"markdown_generator": {
|
||||||
|
"type": "DefaultMarkdownGenerator",
|
||||||
|
"params": {
|
||||||
|
"content_filter": {
|
||||||
|
"type": "PruningContentFilter",
|
||||||
|
"params": {
|
||||||
|
"threshold": 0.5, # Example param
|
||||||
|
"threshold_type": "relative"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
try:
|
||||||
|
print(f"Sending deep crawl request to server...")
|
||||||
|
response = await async_client.post("/crawl", json=payload)
|
||||||
|
print(f"Response status: {response.status_code}")
|
||||||
|
|
||||||
|
if response.status_code >= 400:
|
||||||
|
error_detail = response.json().get('detail', 'No detail provided')
|
||||||
|
print(f"Error detail: {error_detail}")
|
||||||
|
print(f"Full response: {response.text}")
|
||||||
|
|
||||||
|
response.raise_for_status()
|
||||||
|
data = response.json()
|
||||||
|
except httpx.HTTPStatusError as e:
|
||||||
|
print(f"Server error status: {e.response.status_code}")
|
||||||
|
print(f"Server error response: {e.response.text}")
|
||||||
|
try:
|
||||||
|
error_json = e.response.json()
|
||||||
|
print(f"Parsed error: {error_json}")
|
||||||
|
except:
|
||||||
|
print("Could not parse error response as JSON")
|
||||||
|
raise
|
||||||
|
|
||||||
|
assert data["success"] is True
|
||||||
|
assert len(data["results"]) == 1
|
||||||
|
result = data["results"][0]
|
||||||
|
await assert_crawl_result_structure(result)
|
||||||
|
assert result["success"] is True
|
||||||
|
assert "markdown" in result
|
||||||
|
assert isinstance(result["markdown"], dict)
|
||||||
|
assert "raw_markdown" in result["markdown"]
|
||||||
|
assert "fit_markdown" in result["markdown"] # Pruning creates fit_markdown
|
||||||
|
assert "Moby-Dick" in result["markdown"]["raw_markdown"]
|
||||||
|
# Fit markdown content might be different/shorter due to pruning
|
||||||
|
assert len(result["markdown"]["fit_markdown"]) <= len(result["markdown"]["raw_markdown"])
|
||||||
|
|
||||||
|
async def test_crawl_with_markdown_bm25_filter(self, async_client: httpx.AsyncClient):
|
||||||
|
"""Test /crawl with MarkdownGenerator using BM25ContentFilter."""
|
||||||
|
payload = {
|
||||||
|
"urls": [SIMPLE_HTML_URL],
|
||||||
|
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||||
|
"crawler_config": {
|
||||||
|
"type": "CrawlerRunConfig",
|
||||||
|
"params": {
|
||||||
|
"markdown_generator": {
|
||||||
|
"type": "DefaultMarkdownGenerator",
|
||||||
|
"params": {
|
||||||
|
"content_filter": {
|
||||||
|
"type": "BM25ContentFilter",
|
||||||
|
"params": {
|
||||||
|
"user_query": "Herman Melville", # Query for BM25
|
||||||
|
"bm25_threshold": 0.1, # Lower threshold to increase matches
|
||||||
|
"language": "english" # Valid parameters
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
try:
|
||||||
|
print(f"Payload for BM25 test: {json.dumps(payload)}")
|
||||||
|
response = await async_client.post("/crawl", json=payload)
|
||||||
|
print(f"Response status: {response.status_code}")
|
||||||
|
|
||||||
|
if response.status_code >= 400:
|
||||||
|
error_detail = response.json().get('detail', 'No detail provided')
|
||||||
|
print(f"Error detail: {error_detail}")
|
||||||
|
print(f"Full response: {response.text}")
|
||||||
|
|
||||||
|
response.raise_for_status()
|
||||||
|
data = response.json()
|
||||||
|
except httpx.HTTPStatusError as e:
|
||||||
|
print(f"Server error status: {e.response.status_code}")
|
||||||
|
print(f"Server error response: {e.response.text}")
|
||||||
|
try:
|
||||||
|
error_json = e.response.json()
|
||||||
|
print(f"Parsed error: {error_json}")
|
||||||
|
except:
|
||||||
|
print("Could not parse error response as JSON")
|
||||||
|
raise
|
||||||
|
|
||||||
|
assert data["success"] is True
|
||||||
|
assert len(data["results"]) == 1
|
||||||
|
result = data["results"][0]
|
||||||
|
await assert_crawl_result_structure(result)
|
||||||
|
assert result["success"] is True
|
||||||
|
assert "markdown" in result
|
||||||
|
assert isinstance(result["markdown"], dict)
|
||||||
|
assert "raw_markdown" in result["markdown"]
|
||||||
|
assert "fit_markdown" in result["markdown"] # BM25 creates fit_markdown
|
||||||
|
|
||||||
|
# Print values for debug
|
||||||
|
print(f"Raw markdown length: {len(result['markdown']['raw_markdown'])}")
|
||||||
|
print(f"Fit markdown length: {len(result['markdown']['fit_markdown'])}")
|
||||||
|
|
||||||
|
# Either fit_markdown has content (possibly including our query terms)
|
||||||
|
# or it might be empty if no good BM25 matches were found
|
||||||
|
# Don't assert specific content since it can be environment-dependent
|
||||||
|
|
||||||
|
|
||||||
|
# 4. Deep Crawling
|
||||||
|
async def test_deep_crawl(self, async_client: httpx.AsyncClient):
|
||||||
|
"""Test /crawl with a deep crawl strategy."""
|
||||||
|
payload = {
|
||||||
|
"urls": [DEEP_CRAWL_URL], # Start URL
|
||||||
|
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||||
|
"crawler_config": {
|
||||||
|
"type": "CrawlerRunConfig",
|
||||||
|
"params": {
|
||||||
|
"stream": False,
|
||||||
|
"cache_mode": CacheMode.BYPASS.value,
|
||||||
|
"deep_crawl_strategy": {
|
||||||
|
"type": "BFSDeepCrawlStrategy",
|
||||||
|
"params": {
|
||||||
|
"max_depth": 1, # Limit depth for testing speed
|
||||||
|
"max_pages": 5, # Limit pages to crawl
|
||||||
|
"filter_chain": {
|
||||||
|
"type": "FilterChain",
|
||||||
|
"params": {
|
||||||
|
"filters": [
|
||||||
|
{
|
||||||
|
"type": "ContentTypeFilter",
|
||||||
|
"params": {"allowed_types": ["text/html"]}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "DomainFilter",
|
||||||
|
"params": {"allowed_domains": ["python.org", "docs.python.org"]} # Include important subdomains
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"url_scorer": {
|
||||||
|
"type": "CompositeScorer",
|
||||||
|
"params": {
|
||||||
|
"scorers": [
|
||||||
|
{
|
||||||
|
"type": "KeywordRelevanceScorer",
|
||||||
|
"params": {"keywords": ["documentation", "tutorial"]}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "PathDepthScorer",
|
||||||
|
"params": {"weight": 0.5, "optimal_depth": 2}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
try:
|
||||||
|
print(f"Sending deep crawl request to server...")
|
||||||
|
response = await async_client.post("/crawl", json=payload)
|
||||||
|
print(f"Response status: {response.status_code}")
|
||||||
|
|
||||||
|
if response.status_code >= 400:
|
||||||
|
error_detail = response.json().get('detail', 'No detail provided')
|
||||||
|
print(f"Error detail: {error_detail}")
|
||||||
|
print(f"Full response: {response.text}")
|
||||||
|
|
||||||
|
response.raise_for_status()
|
||||||
|
data = response.json()
|
||||||
|
except httpx.HTTPStatusError as e:
|
||||||
|
print(f"Server error status: {e.response.status_code}")
|
||||||
|
print(f"Server error response: {e.response.text}")
|
||||||
|
try:
|
||||||
|
error_json = e.response.json()
|
||||||
|
print(f"Parsed error: {error_json}")
|
||||||
|
except:
|
||||||
|
print("Could not parse error response as JSON")
|
||||||
|
raise
|
||||||
|
|
||||||
|
assert data["success"] is True
|
||||||
|
assert isinstance(data["results"], list)
|
||||||
|
# Expect more than 1 result due to deep crawl (start URL + crawled links)
|
||||||
|
assert len(data["results"]) > 1
|
||||||
|
assert len(data["results"]) <= 6 # Start URL + max_links=5
|
||||||
|
|
||||||
|
start_url_found = False
|
||||||
|
crawled_urls_found = False
|
||||||
|
for result in data["results"]:
|
||||||
|
await assert_crawl_result_structure(result)
|
||||||
|
assert result["success"] is True
|
||||||
|
|
||||||
|
# Print URL for debugging
|
||||||
|
print(f"Crawled URL: {result['url']}")
|
||||||
|
|
||||||
|
# Allow URLs that contain python.org (including subdomains like docs.python.org)
|
||||||
|
assert "python.org" in result["url"]
|
||||||
|
if result["url"] == DEEP_CRAWL_URL:
|
||||||
|
start_url_found = True
|
||||||
|
else:
|
||||||
|
crawled_urls_found = True
|
||||||
|
|
||||||
|
assert start_url_found
|
||||||
|
assert crawled_urls_found
|
||||||
|
|
||||||
|
|
||||||
|
# 5. Extraction without LLM (JSON/CSS)
|
||||||
|
async def test_json_css_extraction(self, async_client: httpx.AsyncClient):
|
||||||
|
"""Test /crawl with JsonCssExtractionStrategy."""
|
||||||
|
payload = {
|
||||||
|
"urls": [SCRAPE_TARGET_URL],
|
||||||
|
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||||
|
"crawler_config": {
|
||||||
|
"type": "CrawlerRunConfig",
|
||||||
|
"params": {
|
||||||
|
"cache_mode": CacheMode.BYPASS.value,
|
||||||
|
"extraction_strategy": {
|
||||||
|
"type": "JsonCssExtractionStrategy",
|
||||||
|
"params": {
|
||||||
|
"schema": {
|
||||||
|
"type": "dict", # IMPORTANT: Wrap schema dict with type/value structure
|
||||||
|
"value": {
|
||||||
|
"name": "BookList",
|
||||||
|
"baseSelector": "ol.row li.col-xs-6", # Select each book item
|
||||||
|
"fields": [
|
||||||
|
{"name": "title", "selector": "article.product_pod h3 a", "type": "attribute", "attribute": "title"},
|
||||||
|
{"name": "price", "selector": "article.product_pod .price_color", "type": "text"},
|
||||||
|
{"name": "rating", "selector": "article.product_pod p.star-rating", "type": "attribute", "attribute": "class"}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
try:
|
||||||
|
print(f"Sending deep crawl request to server...")
|
||||||
|
response = await async_client.post("/crawl", json=payload)
|
||||||
|
print(f"Response status: {response.status_code}")
|
||||||
|
|
||||||
|
if response.status_code >= 400:
|
||||||
|
error_detail = response.json().get('detail', 'No detail provided')
|
||||||
|
print(f"Error detail: {error_detail}")
|
||||||
|
print(f"Full response: {response.text}")
|
||||||
|
|
||||||
|
response.raise_for_status()
|
||||||
|
data = response.json()
|
||||||
|
except httpx.HTTPStatusError as e:
|
||||||
|
print(f"Server error status: {e.response.status_code}")
|
||||||
|
print(f"Server error response: {e.response.text}")
|
||||||
|
try:
|
||||||
|
error_json = e.response.json()
|
||||||
|
print(f"Parsed error: {error_json}")
|
||||||
|
except:
|
||||||
|
print("Could not parse error response as JSON")
|
||||||
|
raise
|
||||||
|
|
||||||
|
assert data["success"] is True
|
||||||
|
assert len(data["results"]) == 1
|
||||||
|
result = data["results"][0]
|
||||||
|
await assert_crawl_result_structure(result)
|
||||||
|
assert result["success"] is True
|
||||||
|
assert "extracted_content" in result
|
||||||
|
assert result["extracted_content"] is not None
|
||||||
|
|
||||||
|
# Extracted content should be a JSON string representing a list of dicts
|
||||||
|
try:
|
||||||
|
extracted_data = json.loads(result["extracted_content"])
|
||||||
|
assert isinstance(extracted_data, list)
|
||||||
|
assert len(extracted_data) > 0 # Should find some books
|
||||||
|
# Check structure of the first extracted item
|
||||||
|
first_item = extracted_data[0]
|
||||||
|
assert "title" in first_item
|
||||||
|
assert "price" in first_item
|
||||||
|
assert "rating" in first_item
|
||||||
|
assert "star-rating" in first_item["rating"] # e.g., "star-rating Three"
|
||||||
|
except (json.JSONDecodeError, AssertionError) as e:
|
||||||
|
pytest.fail(f"Extracted content parsing or validation failed: {e}\nContent: {result['extracted_content']}")
|
||||||
|
|
||||||
|
|
||||||
|
# 6. Extraction with LLM
|
||||||
|
async def test_llm_extraction(self, async_client: httpx.AsyncClient):
|
||||||
|
"""
|
||||||
|
Test /crawl with LLMExtractionStrategy.
|
||||||
|
NOTE: Requires the server to have appropriate LLM API keys (e.g., OPENAI_API_KEY)
|
||||||
|
configured via .llm.env or environment variables.
|
||||||
|
This test uses the default provider configured in the server's config.yml.
|
||||||
|
"""
|
||||||
|
payload = {
|
||||||
|
"urls": [SIMPLE_HTML_URL],
|
||||||
|
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||||
|
"crawler_config": {
|
||||||
|
"type": "CrawlerRunConfig",
|
||||||
|
"params": {
|
||||||
|
"cache_mode": CacheMode.BYPASS.value,
|
||||||
|
"extraction_strategy": {
|
||||||
|
"type": "LLMExtractionStrategy",
|
||||||
|
"params": {
|
||||||
|
"instruction": "Extract the main title and the author mentioned in the text into JSON.",
|
||||||
|
# LLMConfig is implicitly defined by server's config.yml and .llm.env
|
||||||
|
# If you needed to override provider/token PER REQUEST:
|
||||||
|
"llm_config": {
|
||||||
|
"type": "LLMConfig",
|
||||||
|
"params": {
|
||||||
|
"provider": "openai/gpt-4o", # Example override
|
||||||
|
"api_token": os.getenv("OPENAI_API_KEY") # Example override
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"schema": { # Optional: Provide a schema for structured output
|
||||||
|
"type": "dict", # IMPORTANT: Wrap schema dict
|
||||||
|
"value": {
|
||||||
|
"title": "Book Info",
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"title": {"type": "string", "description": "The main title of the work"},
|
||||||
|
"author": {"type": "string", "description": "The author of the work"}
|
||||||
|
},
|
||||||
|
"required": ["title", "author"]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
try:
|
||||||
|
response = await async_client.post("/crawl", json=payload)
|
||||||
|
response.raise_for_status() # Will raise if server returns 500 (e.g., bad API key)
|
||||||
|
data = response.json()
|
||||||
|
except httpx.HTTPStatusError as e:
|
||||||
|
# Catch potential server errors (like 500 due to missing/invalid API keys)
|
||||||
|
pytest.fail(f"LLM extraction request failed: {e}. Response: {e.response.text}. Check server logs and ensure API keys are correctly configured for the server.")
|
||||||
|
except httpx.RequestError as e:
|
||||||
|
pytest.fail(f"LLM extraction request failed: {e}.")
|
||||||
|
|
||||||
|
assert data["success"] is True
|
||||||
|
assert len(data["results"]) == 1
|
||||||
|
result = data["results"][0]
|
||||||
|
await assert_crawl_result_structure(result)
|
||||||
|
assert result["success"] is True
|
||||||
|
assert "extracted_content" in result
|
||||||
|
assert result["extracted_content"] is not None
|
||||||
|
|
||||||
|
# Extracted content should be JSON (because we provided a schema)
|
||||||
|
try:
|
||||||
|
extracted_data = json.loads(result["extracted_content"])
|
||||||
|
print(f"\nLLM Extracted Data: {extracted_data}") # Print for verification
|
||||||
|
|
||||||
|
# Handle both dict and list formats (server returns a list)
|
||||||
|
if isinstance(extracted_data, list):
|
||||||
|
assert len(extracted_data) > 0
|
||||||
|
extracted_item = extracted_data[0] # Take first item
|
||||||
|
assert isinstance(extracted_item, dict)
|
||||||
|
assert "title" in extracted_item
|
||||||
|
assert "author" in extracted_item
|
||||||
|
assert "Moby-Dick" in extracted_item.get("title", "")
|
||||||
|
assert "Herman Melville" in extracted_item.get("author", "")
|
||||||
|
else:
|
||||||
|
assert isinstance(extracted_data, dict)
|
||||||
|
assert "title" in extracted_data
|
||||||
|
assert "author" in extracted_data
|
||||||
|
assert "Moby-Dick" in extracted_data.get("title", "")
|
||||||
|
assert "Herman Melville" in extracted_data.get("author", "")
|
||||||
|
except (json.JSONDecodeError, AssertionError) as e:
|
||||||
|
pytest.fail(f"LLM extracted content parsing or validation failed: {e}\nContent: {result['extracted_content']}")
|
||||||
|
except Exception as e: # Catch any other unexpected error
|
||||||
|
pytest.fail(f"An unexpected error occurred during LLM result processing: {e}\nContent: {result['extracted_content']}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
# Define arguments for pytest programmatically
|
||||||
|
# -v: verbose output
|
||||||
|
# -s: show print statements immediately (useful for debugging)
|
||||||
|
# __file__: tells pytest to run tests in the current file
|
||||||
|
pytest_args = ["-v", "-s", __file__]
|
||||||
|
|
||||||
|
# You can add more pytest arguments here if needed, for example:
|
||||||
|
# '-k test_llm_extraction': Run only the LLM test function
|
||||||
|
# pytest_args.append("-k test_llm_extraction")
|
||||||
|
|
||||||
|
print(f"Running pytest with args: {pytest_args}")
|
||||||
|
|
||||||
|
# Execute pytest
|
||||||
|
exit_code = pytest.main(pytest_args)
|
||||||
|
|
||||||
|
print(f"Pytest finished with exit code: {exit_code}")
|
||||||
335
tests/general/generate_dummy_site.py
Normal file
335
tests/general/generate_dummy_site.py
Normal file
@@ -0,0 +1,335 @@
|
|||||||
|
# ==== File: build_dummy_site.py ====
|
||||||
|
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
import argparse
|
||||||
|
from pathlib import Path
|
||||||
|
from urllib.parse import quote
|
||||||
|
|
||||||
|
# --- Configuration ---
|
||||||
|
NUM_CATEGORIES = 3
|
||||||
|
NUM_SUBCATEGORIES_PER_CAT = 2 # Results in NUM_CATEGORIES * NUM_SUBCATEGORIES_PER_CAT total L2 categories
|
||||||
|
NUM_PRODUCTS_PER_SUBCAT = 5 # Products listed on L3 pages
|
||||||
|
MAX_DEPTH_TARGET = 5 # Explicitly set target depth
|
||||||
|
|
||||||
|
# --- Helper Functions ---
|
||||||
|
|
||||||
|
def generate_lorem(words=20):
|
||||||
|
"""Generates simple placeholder text."""
|
||||||
|
lorem_words = ["lorem", "ipsum", "dolor", "sit", "amet", "consectetur",
|
||||||
|
"adipiscing", "elit", "sed", "do", "eiusmod", "tempor",
|
||||||
|
"incididunt", "ut", "labore", "et", "dolore", "magna", "aliqua"]
|
||||||
|
return " ".join(random.choice(lorem_words) for _ in range(words)).capitalize() + "."
|
||||||
|
|
||||||
|
def create_html_page(filepath: Path, title: str, body_content: str, breadcrumbs: list = [], head_extras: str = ""):
|
||||||
|
"""Creates an HTML file with basic structure and inline CSS."""
|
||||||
|
os.makedirs(filepath.parent, exist_ok=True)
|
||||||
|
|
||||||
|
# Generate breadcrumb HTML using the 'link' provided in the breadcrumbs list
|
||||||
|
breadcrumb_html = ""
|
||||||
|
if breadcrumbs:
|
||||||
|
links_html = " » ".join(f'<a href="{bc["link"]}">{bc["name"]}</a>' for bc in breadcrumbs)
|
||||||
|
breadcrumb_html = f"<nav class='breadcrumbs'>{links_html} » {title}</nav>"
|
||||||
|
|
||||||
|
# Basic CSS for structure identification (kept the same)
|
||||||
|
css = """
|
||||||
|
<style>
|
||||||
|
body {
|
||||||
|
font-family: sans-serif;
|
||||||
|
padding: 20px;
|
||||||
|
background-color: #1e1e1e;
|
||||||
|
color: #d1d1d1;
|
||||||
|
}
|
||||||
|
|
||||||
|
.container {
|
||||||
|
max-width: 960px;
|
||||||
|
margin: auto;
|
||||||
|
background: #2c2c2c;
|
||||||
|
padding: 20px;
|
||||||
|
border-radius: 5px;
|
||||||
|
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.5);
|
||||||
|
}
|
||||||
|
|
||||||
|
h1, h2 {
|
||||||
|
color: #ccc;
|
||||||
|
}
|
||||||
|
|
||||||
|
a {
|
||||||
|
color: #9bcdff;
|
||||||
|
text-decoration: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
a:hover {
|
||||||
|
text-decoration: underline;
|
||||||
|
}
|
||||||
|
|
||||||
|
ul {
|
||||||
|
list-style: none;
|
||||||
|
padding-left: 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
li {
|
||||||
|
margin-bottom: 10px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.category-link,
|
||||||
|
.subcategory-link,
|
||||||
|
.product-link,
|
||||||
|
.details-link,
|
||||||
|
.reviews-link {
|
||||||
|
display: block;
|
||||||
|
padding: 8px;
|
||||||
|
background-color: #3a3a3a;
|
||||||
|
border-radius: 3px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.product-preview {
|
||||||
|
border: 1px solid #444;
|
||||||
|
padding: 10px;
|
||||||
|
margin-bottom: 10px;
|
||||||
|
border-radius: 4px;
|
||||||
|
background-color: #2a2a2a;
|
||||||
|
}
|
||||||
|
|
||||||
|
.product-title {
|
||||||
|
color: #d1d1d1;
|
||||||
|
}
|
||||||
|
|
||||||
|
.product-price {
|
||||||
|
font-weight: bold;
|
||||||
|
color: #85e085;
|
||||||
|
}
|
||||||
|
|
||||||
|
.product-description,
|
||||||
|
.product-specs,
|
||||||
|
.product-reviews {
|
||||||
|
margin-top: 15px;
|
||||||
|
line-height: 1.6;
|
||||||
|
}
|
||||||
|
|
||||||
|
.product-specs li {
|
||||||
|
margin-bottom: 5px;
|
||||||
|
font-size: 0.9em;
|
||||||
|
}
|
||||||
|
|
||||||
|
.spec-name {
|
||||||
|
font-weight: bold;
|
||||||
|
}
|
||||||
|
|
||||||
|
.breadcrumbs {
|
||||||
|
margin-bottom: 20px;
|
||||||
|
font-size: 0.9em;
|
||||||
|
color: #888;
|
||||||
|
}
|
||||||
|
|
||||||
|
.breadcrumbs a {
|
||||||
|
color: #9bcdff;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
"""
|
||||||
|
html_content = f"""<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||||
|
<title>{title} - FakeShop</title>
|
||||||
|
{head_extras}
|
||||||
|
{css}
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<div class="container">
|
||||||
|
{breadcrumb_html}
|
||||||
|
<h1>{title}</h1>
|
||||||
|
{body_content}
|
||||||
|
</div>
|
||||||
|
</body>
|
||||||
|
</html>"""
|
||||||
|
with open(filepath, "w", encoding="utf-8") as f:
|
||||||
|
f.write(html_content)
|
||||||
|
# Keep print statement concise for clarity
|
||||||
|
# print(f"Created: {filepath}")
|
||||||
|
|
||||||
|
def generate_site(base_dir: Path, site_name: str = "FakeShop", base_path: str = ""):
|
||||||
|
"""Generates the dummy website structure."""
|
||||||
|
base_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
# --- Clean and prepare the base path for URL construction ---
|
||||||
|
# Ensure it starts with '/' if not empty, and remove any trailing '/'
|
||||||
|
if base_path:
|
||||||
|
full_base_path = "/" + base_path.strip('/')
|
||||||
|
else:
|
||||||
|
full_base_path = "" # Represents the root
|
||||||
|
|
||||||
|
print(f"Using base path for links: '{full_base_path}'")
|
||||||
|
|
||||||
|
# --- Level 0: Homepage ---
|
||||||
|
home_body = "<h2>Welcome to FakeShop!</h2><p>Your one-stop shop for imaginary items.</p><h3>Categories:</h3>\n<ul>"
|
||||||
|
# Define the *actual* link path for the homepage breadcrumb
|
||||||
|
home_link_path = f"{full_base_path}/index.html"
|
||||||
|
breadcrumbs_home = [{"name": "Home", "link": home_link_path}] # Base breadcrumb
|
||||||
|
|
||||||
|
# Links *within* the page content should remain relative
|
||||||
|
for i in range(NUM_CATEGORIES):
|
||||||
|
cat_name = f"Category-{i+1}"
|
||||||
|
cat_folder_name = quote(cat_name.lower().replace(" ", "-"))
|
||||||
|
# This path is relative to the current directory (index.html)
|
||||||
|
cat_relative_page_path = f"{cat_folder_name}/index.html"
|
||||||
|
home_body += f'<li><a class="category-link" href="{cat_relative_page_path}">{cat_name}</a> - {generate_lorem(10)}</li>'
|
||||||
|
home_body += "</ul>"
|
||||||
|
create_html_page(base_dir / "index.html", "Homepage", home_body, []) # No breadcrumbs *on* the homepage itself
|
||||||
|
|
||||||
|
# --- Levels 1-5 ---
|
||||||
|
for i in range(NUM_CATEGORIES):
|
||||||
|
cat_name = f"Category-{i+1}"
|
||||||
|
cat_folder_name = quote(cat_name.lower().replace(" ", "-"))
|
||||||
|
cat_dir = base_dir / cat_folder_name
|
||||||
|
# This is the *absolute* path for the breadcrumb link
|
||||||
|
cat_link_path = f"{full_base_path}/{cat_folder_name}/index.html"
|
||||||
|
# Update breadcrumbs list for this level
|
||||||
|
breadcrumbs_cat = breadcrumbs_home + [{"name": cat_name, "link": cat_link_path}]
|
||||||
|
|
||||||
|
# --- Level 1: Category Page ---
|
||||||
|
cat_body = f"<p>{generate_lorem(15)} for {cat_name}.</p><h3>Sub-Categories:</h3>\n<ul>"
|
||||||
|
for j in range(NUM_SUBCATEGORIES_PER_CAT):
|
||||||
|
subcat_name = f"{cat_name}-Sub-{j+1}"
|
||||||
|
subcat_folder_name = quote(subcat_name.lower().replace(" ", "-"))
|
||||||
|
# Path relative to the category page
|
||||||
|
subcat_relative_page_path = f"{subcat_folder_name}/index.html"
|
||||||
|
cat_body += f'<li><a class="subcategory-link" href="{subcat_relative_page_path}">{subcat_name}</a> - {generate_lorem(8)}</li>'
|
||||||
|
cat_body += "</ul>"
|
||||||
|
# Pass the updated breadcrumbs list
|
||||||
|
create_html_page(cat_dir / "index.html", cat_name, cat_body, breadcrumbs_home) # Parent breadcrumb needed here
|
||||||
|
|
||||||
|
for j in range(NUM_SUBCATEGORIES_PER_CAT):
|
||||||
|
subcat_name = f"{cat_name}-Sub-{j+1}"
|
||||||
|
subcat_folder_name = quote(subcat_name.lower().replace(" ", "-"))
|
||||||
|
subcat_dir = cat_dir / subcat_folder_name
|
||||||
|
# Absolute path for the breadcrumb link
|
||||||
|
subcat_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/index.html"
|
||||||
|
# Update breadcrumbs list for this level
|
||||||
|
breadcrumbs_subcat = breadcrumbs_cat + [{"name": subcat_name, "link": subcat_link_path}]
|
||||||
|
|
||||||
|
# --- Level 2: Sub-Category Page (Product List) ---
|
||||||
|
subcat_body = f"<p>Explore products in {subcat_name}. {generate_lorem(12)}</p><h3>Products:</h3>\n<ul class='product-list'>"
|
||||||
|
for k in range(NUM_PRODUCTS_PER_SUBCAT):
|
||||||
|
prod_id = f"P{i+1}{j+1}{k+1:03d}" # e.g., P11001
|
||||||
|
prod_name = f"{subcat_name} Product {k+1} ({prod_id})"
|
||||||
|
# Filename relative to the subcategory page
|
||||||
|
prod_filename = f"product_{prod_id}.html"
|
||||||
|
# Absolute path for the breadcrumb link
|
||||||
|
prod_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/{prod_filename}"
|
||||||
|
|
||||||
|
# Preview on list page (link remains relative)
|
||||||
|
subcat_body += f"""
|
||||||
|
<li>
|
||||||
|
<div class="product-preview">
|
||||||
|
<a class="product-link" href="{prod_filename}"><strong>{prod_name}</strong></a>
|
||||||
|
<p>{generate_lorem(10)}</p>
|
||||||
|
<span class="product-price">£{random.uniform(10, 500):.2f}</span>
|
||||||
|
</div>
|
||||||
|
</li>"""
|
||||||
|
|
||||||
|
# --- Level 3: Product Page ---
|
||||||
|
prod_price = random.uniform(10, 500)
|
||||||
|
prod_desc = generate_lorem(40)
|
||||||
|
prod_specs = {f"Spec {s+1}": generate_lorem(3) for s in range(random.randint(3,6))}
|
||||||
|
prod_reviews_count = random.randint(0, 150)
|
||||||
|
# Relative filenames for links on this page
|
||||||
|
details_filename_relative = f"product_{prod_id}_details.html"
|
||||||
|
reviews_filename_relative = f"product_{prod_id}_reviews.html"
|
||||||
|
|
||||||
|
prod_body = f"""
|
||||||
|
<p class="product-price">Price: £{prod_price:.2f}</p>
|
||||||
|
<div class="product-description">
|
||||||
|
<h2>Description</h2>
|
||||||
|
<p>{prod_desc}</p>
|
||||||
|
</div>
|
||||||
|
<div class="product-specs">
|
||||||
|
<h2>Specifications</h2>
|
||||||
|
<ul>
|
||||||
|
{''.join(f'<li><span class="spec-name">{name}</span>: <span class="spec-value">{value}</span></li>' for name, value in prod_specs.items())}
|
||||||
|
</ul>
|
||||||
|
</div>
|
||||||
|
<div class="product-reviews">
|
||||||
|
<h2>Reviews</h2>
|
||||||
|
<p>Total Reviews: <span class="review-count">{prod_reviews_count}</span></p>
|
||||||
|
</div>
|
||||||
|
<hr>
|
||||||
|
<p>
|
||||||
|
<a class="details-link" href="{details_filename_relative}">View More Details</a> |
|
||||||
|
<a class="reviews-link" href="{reviews_filename_relative}">See All Reviews</a>
|
||||||
|
</p>
|
||||||
|
"""
|
||||||
|
# Update breadcrumbs list for this level
|
||||||
|
breadcrumbs_prod = breadcrumbs_subcat + [{"name": prod_name, "link": prod_link_path}]
|
||||||
|
# Pass the updated breadcrumbs list
|
||||||
|
create_html_page(subcat_dir / prod_filename, prod_name, prod_body, breadcrumbs_subcat) # Parent breadcrumb needed here
|
||||||
|
|
||||||
|
# --- Level 4: Product Details Page ---
|
||||||
|
details_filename = f"product_{prod_id}_details.html" # Actual filename
|
||||||
|
# Absolute path for the breadcrumb link
|
||||||
|
details_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/{details_filename}"
|
||||||
|
details_body = f"<p>This page contains extremely detailed information about {prod_name}.</p>{generate_lorem(100)}"
|
||||||
|
# Update breadcrumbs list for this level
|
||||||
|
breadcrumbs_details = breadcrumbs_prod + [{"name": "Details", "link": details_link_path}]
|
||||||
|
# Pass the updated breadcrumbs list
|
||||||
|
create_html_page(subcat_dir / details_filename, f"{prod_name} - Details", details_body, breadcrumbs_prod) # Parent breadcrumb needed here
|
||||||
|
|
||||||
|
# --- Level 5: Product Reviews Page ---
|
||||||
|
reviews_filename = f"product_{prod_id}_reviews.html" # Actual filename
|
||||||
|
# Absolute path for the breadcrumb link
|
||||||
|
reviews_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/{reviews_filename}"
|
||||||
|
reviews_body = f"<p>All {prod_reviews_count} reviews for {prod_name} are listed here.</p><ul>"
|
||||||
|
for r in range(prod_reviews_count):
|
||||||
|
reviews_body += f"<li>Review {r+1}: {generate_lorem(random.randint(15, 50))}</li>"
|
||||||
|
reviews_body += "</ul>"
|
||||||
|
# Update breadcrumbs list for this level
|
||||||
|
breadcrumbs_reviews = breadcrumbs_prod + [{"name": "Reviews", "link": reviews_link_path}]
|
||||||
|
# Pass the updated breadcrumbs list
|
||||||
|
create_html_page(subcat_dir / reviews_filename, f"{prod_name} - Reviews", reviews_body, breadcrumbs_prod) # Parent breadcrumb needed here
|
||||||
|
|
||||||
|
|
||||||
|
subcat_body += "</ul>" # Close product-list ul
|
||||||
|
# Pass the correct breadcrumbs list for the subcategory index page
|
||||||
|
create_html_page(subcat_dir / "index.html", subcat_name, subcat_body, breadcrumbs_cat) # Parent breadcrumb needed here
|
||||||
|
|
||||||
|
|
||||||
|
# --- Main Execution ---
|
||||||
|
if __name__ == "__main__":
|
||||||
|
parser = argparse.ArgumentParser(description="Generate a dummy multi-level retail website.")
|
||||||
|
parser.add_argument(
|
||||||
|
"-o", "--output-dir",
|
||||||
|
type=str,
|
||||||
|
default="dummy_retail_site",
|
||||||
|
help="Directory to generate the website in."
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"-n", "--site-name",
|
||||||
|
type=str,
|
||||||
|
default="FakeShop",
|
||||||
|
help="Name of the fake shop."
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"-b", "--base-path",
|
||||||
|
type=str,
|
||||||
|
default="",
|
||||||
|
help="Base path for hosting the site (e.g., 'samples/deepcrawl'). Leave empty if hosted at the root."
|
||||||
|
)
|
||||||
|
# Optional: Add more args to configure counts if needed
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
output_directory = Path(args.output_dir)
|
||||||
|
site_name = args.site_name
|
||||||
|
base_path = args.base_path
|
||||||
|
|
||||||
|
print(f"Generating dummy site '{site_name}' in '{output_directory}'...")
|
||||||
|
# Pass the base_path to the generation function
|
||||||
|
generate_site(output_directory, site_name, base_path)
|
||||||
|
print(f"\nCreated {sum(1 for _ in output_directory.rglob('*.html'))} HTML pages.")
|
||||||
|
print("Dummy site generation complete.")
|
||||||
|
print(f"To serve locally (example): python -m http.server --directory {output_directory} 8000")
|
||||||
|
if base_path:
|
||||||
|
print(f"Access the site at: http://localhost:8000/{base_path.strip('/')}/index.html")
|
||||||
|
else:
|
||||||
|
print(f"Access the site at: http://localhost:8000/index.html")
|
||||||
Reference in New Issue
Block a user