diff --git a/Dockerfile b/Dockerfile
index 2997590a..4ad605ae 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,32 +1,31 @@
-# syntax=docker/dockerfile:1.4
+FROM python:3.10-slim
-ARG TARGETPLATFORM
-ARG BUILDPLATFORM
+# Set build arguments
+ARG APP_HOME=/app
+ARG GITHUB_REPO=https://github.com/yourusername/crawl4ai.git
+ARG GITHUB_BRANCH=main
+ARG USE_LOCAL=true
-# Other build arguments
-ARG PYTHON_VERSION=3.10
-
-# Base stage with system dependencies
-FROM python:${PYTHON_VERSION}-slim as base
-
-# Declare ARG variables again within the build stage
-ARG INSTALL_TYPE=all
-ARG ENABLE_GPU=false
-
-# Platform-specific labels
-LABEL maintainer="unclecode"
-LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
-LABEL version="1.0"
-
-# Environment setup
-ENV PYTHONUNBUFFERED=1 \
- PYTHONDONTWRITEBYTECODE=1 \
+ENV PYTHONFAULTHANDLER=1 \
+ PYTHONHASHSEED=random \
+ PYTHONUNBUFFERED=1 \
PIP_NO_CACHE_DIR=1 \
+ PYTHONDONTWRITEBYTECODE=1 \
PIP_DISABLE_PIP_VERSION_CHECK=1 \
PIP_DEFAULT_TIMEOUT=100 \
- DEBIAN_FRONTEND=noninteractive
+ DEBIAN_FRONTEND=noninteractive \
+ REDIS_HOST=localhost \
+ REDIS_PORT=6379
+
+ARG PYTHON_VERSION=3.10
+ARG INSTALL_TYPE=default
+ARG ENABLE_GPU=false
+ARG TARGETARCH
+
+LABEL maintainer="unclecode"
+LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
+LABEL version="1.0"
-# Install system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
curl \
@@ -37,10 +36,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
pkg-config \
python3-dev \
libjpeg-dev \
- libpng-dev \
+ redis-server \
&& rm -rf /var/lib/apt/lists/*
-# Playwright system dependencies for Linux
RUN apt-get update && apt-get install -y --no-install-recommends \
libglib2.0-0 \
libnss3 \
@@ -65,8 +63,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
libatspi2.0-0 \
&& rm -rf /var/lib/apt/lists/*
-# GPU support if enabled and architecture is supported
-RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETPLATFORM" = "linux/amd64" ] ; then \
+RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETARCH" = "amd64" ] ; then \
apt-get update && apt-get install -y --no-install-recommends \
nvidia-cuda-toolkit \
&& rm -rf /var/lib/apt/lists/* ; \
@@ -74,19 +71,40 @@ else \
echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \
fi
-# Create and set working directory
-WORKDIR /app
+RUN if [ "$TARGETARCH" = "arm64" ]; then \
+    echo "🦾 Installing ARM-specific optimizations"; \
+ apt-get update && apt-get install -y --no-install-recommends \
+ libopenblas-dev \
+ && rm -rf /var/lib/apt/lists/*; \
+elif [ "$TARGETARCH" = "amd64" ]; then \
+    echo "🖥️ Installing AMD64-specific optimizations"; \
+ apt-get update && apt-get install -y --no-install-recommends \
+ libomp-dev \
+ && rm -rf /var/lib/apt/lists/*; \
+else \
+ echo "Skipping platform-specific optimizations (unsupported platform)"; \
+fi
-# Copy the entire project
-COPY . .
+WORKDIR ${APP_HOME}
-# Install base requirements
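+# Build-time install helper: use the local source tree or clone from GitHub, depending on USE_LOCAL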
+RUN echo '#!/bin/bash\n\
+if [ "$USE_LOCAL" = "true" ]; then\n\
+    echo "📦 Installing from local source..."\n\
+ pip install --no-cache-dir /tmp/project/\n\
+else\n\
+    echo "🌐 Installing from GitHub..."\n\
+ for i in {1..3}; do \n\
+ git clone --branch ${GITHUB_BRANCH} ${GITHUB_REPO} /tmp/crawl4ai && break || \n\
+    { echo "Attempt $i/3 failed! Taking a short break... ☕"; sleep 5; }; \n\
+ done\n\
+ pip install --no-cache-dir /tmp/crawl4ai\n\
+fi' > /tmp/install.sh && chmod +x /tmp/install.sh
+
+COPY . /tmp/project/
+
+COPY deploy/docker/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
-# Install required library for FastAPI
-RUN pip install fastapi uvicorn psutil
-
-# Install ML dependencies first for better layer caching
RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
pip install --no-cache-dir \
torch \
@@ -99,38 +117,50 @@ RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
python -m nltk.downloader punkt stopwords ; \
fi
-# Install the package
RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
- pip install ".[all]" && \
+ pip install "/tmp/project/[all]" && \
python -m crawl4ai.model_loader ; \
elif [ "$INSTALL_TYPE" = "torch" ] ; then \
- pip install ".[torch]" ; \
+ pip install "/tmp/project/[torch]" ; \
elif [ "$INSTALL_TYPE" = "transformer" ] ; then \
- pip install ".[transformer]" && \
+ pip install "/tmp/project/[transformer]" && \
python -m crawl4ai.model_loader ; \
else \
- pip install "." ; \
+ pip install "/tmp/project" ; \
fi
+
+RUN pip install --no-cache-dir --upgrade pip && \
+ /tmp/install.sh && \
+    python -c "import crawl4ai; print('✅ crawl4ai is ready to rock!')" && \
+    python -c "from playwright.sync_api import sync_playwright; print('✅ Playwright is feeling dramatic!')"
+
+RUN playwright install --with-deps chromium
- # Install MkDocs and required plugins
-RUN pip install --no-cache-dir \
- mkdocs \
- mkdocs-material \
- mkdocs-terminal \
- pymdown-extensions
+COPY deploy/docker/* ${APP_HOME}/
-# Build MkDocs documentation
-RUN mkdocs build
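+# Health check: require at least 2GB of RAM, a responding Redis, and a healthy API endpoint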
+HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
+ CMD bash -c '\
+ MEM=$(free -m | awk "/^Mem:/{print \$2}"); \
+ if [ $MEM -lt 2048 ]; then \
+        echo "⚠️ Warning: Less than 2GB RAM available! Your container might need a memory boost! 🚀"; \
+ exit 1; \
+ fi && \
+ redis-cli ping > /dev/null && \
+ curl -f http://localhost:8000/health || exit 1'
-# Install Playwright and browsers
-RUN if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
- playwright install chromium; \
- elif [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
- playwright install chromium; \
- fi
+COPY deploy/docker/docker-entrypoint.sh /usr/local/bin/
+RUN chmod +x /usr/local/bin/docker-entrypoint.sh
-# Expose port
-EXPOSE 8000 11235 9222 8080
+EXPOSE 8000 6379
-# Start the FastAPI server
-CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "11235"]
\ No newline at end of file
+ENTRYPOINT ["docker-entrypoint.sh"]
+
+CMD service redis-server start && gunicorn \
+ --bind 0.0.0.0:8000 \
+ --workers 4 \
+ --threads 2 \
+ --timeout 120 \
+ --graceful-timeout 30 \
+ --log-level info \
+ --worker-class uvicorn.workers.UvicornWorker \
+ server:app
diff --git a/Dockerfile_old b/Dockerfile_old
new file mode 100644
index 00000000..2997590a
--- /dev/null
+++ b/Dockerfile_old
@@ -0,0 +1,136 @@
+# syntax=docker/dockerfile:1.4
+
+ARG TARGETPLATFORM
+ARG BUILDPLATFORM
+
+# Other build arguments
+ARG PYTHON_VERSION=3.10
+
+# Base stage with system dependencies
+FROM python:${PYTHON_VERSION}-slim as base
+
+# Declare ARG variables again within the build stage
+ARG INSTALL_TYPE=all
+ARG ENABLE_GPU=false
+
+# Platform-specific labels
+LABEL maintainer="unclecode"
+LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
+LABEL version="1.0"
+
+# Environment setup
+ENV PYTHONUNBUFFERED=1 \
+ PYTHONDONTWRITEBYTECODE=1 \
+ PIP_NO_CACHE_DIR=1 \
+ PIP_DISABLE_PIP_VERSION_CHECK=1 \
+ PIP_DEFAULT_TIMEOUT=100 \
+ DEBIAN_FRONTEND=noninteractive
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ build-essential \
+ curl \
+ wget \
+ gnupg \
+ git \
+ cmake \
+ pkg-config \
+ python3-dev \
+ libjpeg-dev \
+ libpng-dev \
+ && rm -rf /var/lib/apt/lists/*
+
+# Playwright system dependencies for Linux
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ libglib2.0-0 \
+ libnss3 \
+ libnspr4 \
+ libatk1.0-0 \
+ libatk-bridge2.0-0 \
+ libcups2 \
+ libdrm2 \
+ libdbus-1-3 \
+ libxcb1 \
+ libxkbcommon0 \
+ libx11-6 \
+ libxcomposite1 \
+ libxdamage1 \
+ libxext6 \
+ libxfixes3 \
+ libxrandr2 \
+ libgbm1 \
+ libpango-1.0-0 \
+ libcairo2 \
+ libasound2 \
+ libatspi2.0-0 \
+ && rm -rf /var/lib/apt/lists/*
+
+# GPU support if enabled and architecture is supported
+RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETPLATFORM" = "linux/amd64" ] ; then \
+ apt-get update && apt-get install -y --no-install-recommends \
+ nvidia-cuda-toolkit \
+ && rm -rf /var/lib/apt/lists/* ; \
+else \
+ echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \
+fi
+
+# Create and set working directory
+WORKDIR /app
+
+# Copy the entire project
+COPY . .
+
+# Install base requirements
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Install required library for FastAPI
+RUN pip install fastapi uvicorn psutil
+
+# Install ML dependencies first for better layer caching
+RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
+ pip install --no-cache-dir \
+ torch \
+ torchvision \
+ torchaudio \
+ scikit-learn \
+ nltk \
+ transformers \
+ tokenizers && \
+ python -m nltk.downloader punkt stopwords ; \
+ fi
+
+# Install the package
+RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
+ pip install ".[all]" && \
+ python -m crawl4ai.model_loader ; \
+ elif [ "$INSTALL_TYPE" = "torch" ] ; then \
+ pip install ".[torch]" ; \
+ elif [ "$INSTALL_TYPE" = "transformer" ] ; then \
+ pip install ".[transformer]" && \
+ python -m crawl4ai.model_loader ; \
+ else \
+ pip install "." ; \
+ fi
+
+ # Install MkDocs and required plugins
+RUN pip install --no-cache-dir \
+ mkdocs \
+ mkdocs-material \
+ mkdocs-terminal \
+ pymdown-extensions
+
+# Build MkDocs documentation
+RUN mkdocs build
+
+# Install Playwright and browsers
+RUN if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
+ playwright install chromium; \
+ elif [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
+ playwright install chromium; \
+ fi
+
+# Expose port
+EXPOSE 8000 11235 9222 8080
+
+# Start the FastAPI server
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "11235"]
\ No newline at end of file
diff --git a/crawl4ai/content_filter_strategy.py b/crawl4ai/content_filter_strategy.py
index 799bd221..55c60e8e 100644
--- a/crawl4ai/content_filter_strategy.py
+++ b/crawl4ai/content_filter_strategy.py
@@ -5,7 +5,7 @@ from typing import List, Tuple, Dict, Optional
from rank_bm25 import BM25Okapi
from collections import deque
from bs4 import NavigableString, Comment
-from .utils import clean_tokens, perform_completion_with_backoff, escape_json_string, sanitize_html, get_home_folder, extract_xml_data
+from .utils import clean_tokens, perform_completion_with_backoff, escape_json_string, sanitize_html, get_home_folder, extract_xml_data, merge_chunks
from abc import ABC, abstractmethod
import math
from snowballstemmer import stemmer
@@ -23,7 +23,14 @@ from colorama import Fore, Style
class RelevantContentFilter(ABC):
"""Abstract base class for content filtering strategies"""
- def __init__(self, user_query: str = None):
+ def __init__(self, user_query: str = None, verbose: bool = False, logger: Optional[AsyncLogger] = None):
+ """
+ Initializes the RelevantContentFilter class with optional user query.
+
+ Args:
+ user_query (str): User query for filtering (optional).
+            verbose (bool): Enable verbose logging (default: False).
+            logger (AsyncLogger): Logger used for structured output (optional).
+ """
self.user_query = user_query
self.included_tags = {
# Primary structure
@@ -92,6 +99,8 @@ class RelevantContentFilter(ABC):
r"nav|footer|header|sidebar|ads|comment|promo|advert|social|share", re.I
)
self.min_word_count = 2
+        self.verbose = verbose
+ self.logger = logger
@abstractmethod
def filter_content(self, html: str) -> List[str]:
@@ -755,8 +764,11 @@ class LLMContentFilter(RelevantContentFilter):
base_url: Optional[str] = None,
api_base: Optional[str] = None,
extra_args: Dict = None,
+ # char_token_rate: float = WORD_TOKEN_RATE * 5,
+ # chunk_mode: str = "char",
verbose: bool = False,
logger: Optional[AsyncLogger] = None,
+ ignore_cache: bool = False,
):
super().__init__(None)
self.provider = provider
@@ -768,10 +780,15 @@ class LLMContentFilter(RelevantContentFilter):
self.instruction = instruction
self.chunk_token_threshold = chunk_token_threshold
self.overlap_rate = overlap_rate
- self.word_token_rate = word_token_rate
+ self.word_token_rate = word_token_rate or WORD_TOKEN_RATE
+ # self.chunk_mode: str = chunk_mode
+ # self.char_token_rate = char_token_rate or word_token_rate / 5
+ # self.token_rate = word_token_rate if chunk_mode == "word" else self.char_token_rate
+ self.token_rate = word_token_rate or WORD_TOKEN_RATE
self.base_url = base_url
self.api_base = api_base or base_url
self.extra_args = extra_args or {}
+ self.ignore_cache = ignore_cache
self.verbose = verbose
# Setup logger with custom styling for LLM operations
@@ -779,7 +796,7 @@ class LLMContentFilter(RelevantContentFilter):
self.logger = logger
elif verbose:
self.logger = AsyncLogger(
- verbose=True,
+ verbose=verbose,
icons={
                    **AsyncLogger.DEFAULT_ICONS,
                    "LLM": "★",  # Star for LLM operations
@@ -803,45 +820,25 @@ class LLMContentFilter(RelevantContentFilter):
return hashlib.md5(content.encode()).hexdigest()
def _merge_chunks(self, text: str) -> List[str]:
- """Split text into chunks with overlap"""
- # Calculate tokens and sections
- total_tokens = len(text.split()) * self.word_token_rate
- num_sections = max(1, math.floor(total_tokens / self.chunk_token_threshold))
- adjusted_chunk_threshold = total_tokens / num_sections
+ """Split text into chunks with overlap using char or word mode."""
+ ov = int(self.chunk_token_threshold * self.overlap_rate)
+ sections = merge_chunks(
+            docs=[text],
+            target_size=self.chunk_token_threshold,
+ overlap=ov,
+ word_token_ratio=self.word_token_rate
+ )
+ return sections
+
+
- # Split into words
- words = text.split()
- chunks = []
- current_chunk = []
- current_token_count = 0
-
- for word in words:
- word_tokens = len(word) * self.word_token_rate
- if current_token_count + word_tokens <= adjusted_chunk_threshold:
- current_chunk.append(word)
- current_token_count += word_tokens
- else:
- # Add overlap if not the last chunk
- if chunks and self.overlap_rate > 0:
- overlap_size = int(len(current_chunk) * self.overlap_rate)
- current_chunk.extend(current_chunk[-overlap_size:])
-
- chunks.append(" ".join(current_chunk))
- current_chunk = [word]
- current_token_count = word_tokens
-
- if current_chunk:
- chunks.append(" ".join(current_chunk))
-
- return chunks
-
- def filter_content(self, html: str, ignore_cache: bool = False) -> List[str]:
+    def filter_content(self, html: str, ignore_cache: Optional[bool] = None) -> List[str]:
if not html or not isinstance(html, str):
return []
if self.logger:
self.logger.info(
- "Starting LLM content filtering process",
+ "Starting LLM markdown content filtering process",
tag="LLM",
params={"provider": self.provider},
colors={"provider": Fore.CYAN}
@@ -853,9 +850,12 @@ class LLMContentFilter(RelevantContentFilter):
cache_key = self._get_cache_key(html, self.instruction or "")
cache_file = cache_dir / f"{cache_key}.json"
+        # Fall back to the instance-level setting when not explicitly overridden
+        if ignore_cache is None:
+            ignore_cache = self.ignore_cache
+
if not ignore_cache and cache_file.exists():
if self.logger:
- self.logger.info("Found cached result", tag="CACHE")
+ self.logger.info("Found cached markdown result", tag="CACHE")
try:
with cache_file.open('r') as f:
cached_data = json.load(f)
@@ -867,13 +867,13 @@ class LLMContentFilter(RelevantContentFilter):
return cached_data['blocks']
except Exception as e:
if self.logger:
- self.logger.error(f"Cache read error: {str(e)}", tag="CACHE")
+ self.logger.error(f"LLM markdown: Cache read error: {str(e)}", tag="CACHE")
# Split into chunks
html_chunks = self._merge_chunks(html)
if self.logger:
self.logger.info(
- "Split content into {chunk_count} chunks",
+ "LLM markdown: Split content into {chunk_count} chunks",
tag="CHUNK",
params={"chunk_count": len(html_chunks)},
colors={"chunk_count": Fore.YELLOW}
@@ -887,7 +887,7 @@ class LLMContentFilter(RelevantContentFilter):
for i, chunk in enumerate(html_chunks):
if self.logger:
self.logger.debug(
- "Processing chunk {chunk_num}/{total_chunks}",
+ "LLM markdown: Processing chunk {chunk_num}/{total_chunks}",
tag="CHUNK",
params={
"chunk_num": i + 1,
@@ -904,16 +904,38 @@ class LLMContentFilter(RelevantContentFilter):
for var, value in prompt_variables.items():
prompt = prompt.replace("{" + var + "}", value)
+ def _proceed_with_chunk(
+ provider: str,
+ prompt: str,
+ api_token: str,
+ base_url: Optional[str] = None,
+ extra_args: Dict = {}
+ ) -> List[str]:
+ if self.logger:
+ self.logger.info(
+ "LLM Markdown: Processing chunk {chunk_num}",
+ tag="CHUNK",
+ params={"chunk_num": i + 1}
+ )
+ return perform_completion_with_backoff(
+ provider,
+ prompt,
+ api_token,
+ base_url=base_url,
+ extra_args=extra_args
+ )
+
future = executor.submit(
- perform_completion_with_backoff,
+ _proceed_with_chunk,
self.provider,
prompt,
self.api_token,
- base_url=self.api_base,
- extra_args=self.extra_args
+ self.api_base,
+ self.extra_args
)
futures.append((i, future))
+
# Collect results in order
ordered_results = []
for i, future in sorted(futures):
@@ -940,14 +962,14 @@ class LLMContentFilter(RelevantContentFilter):
ordered_results.append(blocks)
if self.logger:
self.logger.success(
- "Successfully processed chunk {chunk_num}",
+ "LLM markdown: Successfully processed chunk {chunk_num}",
tag="CHUNK",
params={"chunk_num": i + 1}
)
except Exception as e:
if self.logger:
self.logger.error(
- "Error processing chunk {chunk_num}: {error}",
+ "LLM markdown: Error processing chunk {chunk_num}: {error}",
tag="CHUNK",
params={
"chunk_num": i + 1,
@@ -958,7 +980,7 @@ class LLMContentFilter(RelevantContentFilter):
end_time = time.time()
if self.logger:
self.logger.success(
- "Completed processing in {time:.2f}s",
+ "LLM markdown: Completed processing in {time:.2f}s",
tag="LLM",
params={"time": end_time - start_time},
colors={"time": Fore.YELLOW}
diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py
index 2ba06f00..8b9dc9cf 100644
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -21,6 +21,9 @@ from .utils import (
extract_xml_data,
split_and_parse_json_objects,
sanitize_input_encode,
+ chunk_documents,
+ merge_chunks,
+ advanced_split,
)
from .models import * # noqa: F403
@@ -501,6 +504,10 @@ class LLMExtractionStrategy(ExtractionStrategy):
instruction: str = None,
schema: Dict = None,
extraction_type="block",
+ chunk_token_threshold=CHUNK_TOKEN_THRESHOLD,
+ overlap_rate=OVERLAP_RATE,
+ word_token_rate=WORD_TOKEN_RATE,
+ apply_chunking=True,
**kwargs,
):
"""
@@ -652,53 +659,16 @@ class LLMExtractionStrategy(ExtractionStrategy):
)
return blocks
- def _merge(self, documents, chunk_token_threshold, overlap):
+ def _merge(self, documents, chunk_token_threshold, overlap) -> List[str]:
"""
Merge documents into sections based on chunk_token_threshold and overlap.
"""
- # chunks = []
- sections = []
- total_tokens = 0
-
- # Calculate the total tokens across all documents
- for document in documents:
- total_tokens += len(document.split(" ")) * self.word_token_rate
-
- # Calculate the number of sections needed
- num_sections = math.floor(total_tokens / chunk_token_threshold)
- if num_sections < 1:
- num_sections = 1 # Ensure there is at least one section
- adjusted_chunk_threshold = total_tokens / num_sections
-
- total_token_so_far = 0
- current_chunk = []
-
- for document in documents:
- tokens = document.split(" ")
- token_count = len(tokens) * self.word_token_rate
-
- if total_token_so_far + token_count <= adjusted_chunk_threshold:
- current_chunk.extend(tokens)
- total_token_so_far += token_count
- else:
- # Ensure to handle the last section properly
- if len(sections) == num_sections - 1:
- current_chunk.extend(tokens)
- continue
-
- # Add overlap if specified
- if overlap > 0 and current_chunk:
- overlap_tokens = current_chunk[-overlap:]
- current_chunk.extend(overlap_tokens)
-
- sections.append(" ".join(current_chunk))
- current_chunk = tokens
- total_token_so_far = token_count
-
- # Add the last chunk
- if current_chunk:
- sections.append(" ".join(current_chunk))
-
+ sections = merge_chunks(
+            docs=documents,
+            target_size=chunk_token_threshold,
+ overlap=overlap,
+ word_token_ratio=self.word_token_rate
+ )
return sections
def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]:
diff --git a/crawl4ai/markdown_generation_strategy.py b/crawl4ai/markdown_generation_strategy.py
index 1e3f0554..f90aa665 100644
--- a/crawl4ai/markdown_generation_strategy.py
+++ b/crawl4ai/markdown_generation_strategy.py
@@ -1,4 +1,5 @@
from abc import ABC, abstractmethod
from typing import Optional, Dict, Any, Tuple
from .models import MarkdownGenerationResult
from .html2text import CustomHTML2Text
@@ -29,9 +30,11 @@ class MarkdownGenerationStrategy(ABC):
self,
content_filter: Optional[RelevantContentFilter] = None,
options: Optional[Dict[str, Any]] = None,
+ verbose: bool = False,
):
self.content_filter = content_filter
self.options = options or {}
+ self.verbose = verbose
@abstractmethod
def generate_markdown(
diff --git a/crawl4ai/prompts.py b/crawl4ai/prompts.py
index be5e0310..99e0f854 100644
--- a/crawl4ai/prompts.py
+++ b/crawl4ai/prompts.py
@@ -206,17 +206,6 @@ Output the final list of JSON objects, wrapped in ... XML tags.
PROMPT_FILTER_CONTENT = """Your task is to filter and convert HTML content into clean, focused markdown that's optimized for use with LLMs and information retrieval systems.
-INPUT HTML:
-<|HTML_CONTENT_START|>
-{HTML}
-<|HTML_CONTENT_END|>
-
-
-SPECIFIC INSTRUCTION:
-<|USER_INSTRUCTION_START|>
-{REQUEST}
-<|USER_INSTRUCTION_END|>
-
TASK DETAILS:
1. Content Selection
- DO: Keep essential information, main content, key details
@@ -240,15 +229,7 @@ TASK DETAILS:
- DON'T: Fragment related content
- DON'T: Duplicate information
-Example Input:
-
Setup Guide
Follow these steps...
-
-
-Example Output:
-# Setup Guide
-Follow these steps...
-
-IMPORTANT: If specific instruction is provided above, prioritize those requirements over these general guidelines.
+IMPORTANT: If a user-specific instruction is provided, prioritize those requirements over the general guidelines above.
OUTPUT FORMAT:
Wrap your response in tags. Use proper markdown throughout.
@@ -256,7 +237,18 @@ Wrap your response in tags. Use proper markdown throughout.
[Your markdown content here]
-Begin filtering now."""
+Begin filtering now.
+
+--------------------------------------------
+
+<|HTML_CONTENT_START|>
+{HTML}
+<|HTML_CONTENT_END|>
+
+<|USER_INSTRUCTION_START|>
+{REQUEST}
+<|USER_INSTRUCTION_END|>
+"""
JSON_SCHEMA_BUILDER= """
# HTML Schema Generation Instructions
diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
index 19975f24..41deffb0 100644
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -1,3 +1,4 @@
import time
from urllib.parse import urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -8,9 +9,10 @@ import re
import os
import platform
from .prompts import PROMPT_EXTRACT_BLOCKS
+from array import array
from .config import *
from pathlib import Path
-from typing import Dict, Any
+from typing import Dict, Any, List, Tuple, Union, Optional, Callable
from urllib.parse import urljoin
import requests
from requests.exceptions import InvalidSchema
@@ -31,6 +33,154 @@ import aiohttp
from pathlib import Path
from packaging import version
from . import __version__
+from typing import Sequence, Callable, Generator, Iterable, List, Optional
+from itertools import chain
+from collections import deque
+
+def chunk_documents(
+ documents: Iterable[str],
+ chunk_token_threshold: int,
+ overlap: int,
+ word_token_rate: float = 0.75,
+ tokenizer: Optional[Callable[[str], List[str]]] = None,
+) -> Generator[str, None, None]:
+ """
+ Efficiently chunks documents into token-limited sections with overlap between chunks.
+
+ Args:
+ documents: Iterable of document strings
+ chunk_token_threshold: Maximum tokens per chunk
+ overlap: Number of tokens to overlap between chunks
+ word_token_rate: Token estimate per word when not using a tokenizer
+ tokenizer: Function that splits text into tokens (if available)
+
+ Yields:
+ Text chunks as strings
+ """
+ token_queue = deque()
+ contribution_queue = deque()
+ current_token_count = 0.0
+
+ for doc in documents:
+ # Tokenize document
+ if tokenizer:
+ tokens = tokenizer(doc)
+ contributions = [1.0] * len(tokens)
+ else:
+ tokens = doc.split()
+ contributions = [word_token_rate] * len(tokens)
+
+ # Add to processing queues
+ token_queue.extend(tokens)
+ contribution_queue.extend(contributions)
+ current_token_count += sum(contributions)
+
+ # Process full chunks
+ while current_token_count >= chunk_token_threshold:
+ # Find chunk split point
+ chunk_tokens = []
+ chunk_contrib = []
+ chunk_total = 0.0
+
+ # Build chunk up to threshold
+ while contribution_queue:
+ next_contrib = contribution_queue[0]
+ if chunk_total + next_contrib > chunk_token_threshold:
+ break
+
+ chunk_total += next_contrib
+ chunk_contrib.append(contribution_queue.popleft())
+ chunk_tokens.append(token_queue.popleft())
+
+ # Handle edge case where first token exceeds threshold
+ if not chunk_contrib: # Single token exceeds threshold
+ chunk_contrib.append(contribution_queue.popleft())
+ chunk_tokens.append(token_queue.popleft())
+
+ # Calculate overlap
+ overlap_total = 0.0
+ overlap_idx = 0
+ for contrib in reversed(chunk_contrib):
+ if overlap_total + contrib > overlap:
+ break
+ overlap_total += contrib
+ overlap_idx += 1
+
+ # Prepend overlap to queues
+ if overlap_idx > 0:
+ overlap_tokens = chunk_tokens[-overlap_idx:]
+ overlap_contrib = chunk_contrib[-overlap_idx:]
+
+ token_queue.extendleft(reversed(overlap_tokens))
+ contribution_queue.extendleft(reversed(overlap_contrib))
+ current_token_count += overlap_total
+
+ # Update current token count and yield chunk
+ current_token_count -= sum(chunk_contrib)
+ yield " ".join(chunk_tokens[:len(chunk_tokens)-overlap_idx] if overlap_idx else chunk_tokens)
+
+ # Yield remaining tokens
+ if token_queue:
+ yield " ".join(token_queue)
+
+def merge_chunks(
+ docs: Sequence[str],
+ target_size: int,
+ overlap: int = 0,
+ word_token_ratio: float = 1.0,
+ splitter: Callable = None
+) -> List[str]:
+ """Merges documents into chunks of specified token size.
+
+ Args:
+ docs: Input documents
+ target_size: Desired token count per chunk
+ overlap: Number of tokens to overlap between chunks
+ word_token_ratio: Multiplier for word->token conversion
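+
+    Example:
+        >>> merge_chunks(["one two three four"], target_size=2)
+        ['one two', 'three four']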
+ """
+ # Pre-tokenize all docs and store token counts
+ splitter = splitter or str.split
+ token_counts = array('I')
+ all_tokens: List[List[str]] = []
+ total_tokens = 0
+
+ for doc in docs:
+        tokens = splitter(doc)
+ count = int(len(tokens) * word_token_ratio)
+ if count: # Skip empty docs
+ token_counts.append(count)
+ all_tokens.append(tokens)
+ total_tokens += count
+
+ if not total_tokens:
+ return []
+
+ # Pre-allocate chunks
+ num_chunks = max(1, (total_tokens + target_size - 1) // target_size)
+ chunks: List[List[str]] = [[] for _ in range(num_chunks)]
+
+ curr_chunk = 0
+ curr_size = 0
+
+ # Distribute tokens
+    for token in chain.from_iterable(all_tokens):
+ if curr_size >= target_size and curr_chunk < num_chunks - 1:
+ if overlap > 0:
+ overlap_tokens = chunks[curr_chunk][-overlap:]
+ curr_chunk += 1
+ chunks[curr_chunk].extend(overlap_tokens)
+ curr_size = len(overlap_tokens)
+ else:
+ curr_chunk += 1
+ curr_size = 0
+
+        chunks[curr_chunk].append(token)
+ curr_size += 1
+
+ # Return only non-empty chunks
+ return [' '.join(chunk) for chunk in chunks if chunk]
class VersionManager:
@@ -189,6 +339,77 @@ class InvalidCSSSelectorError(Exception):
pass
+SPLITS = bytearray([
+ # Control chars (0-31) + space (32)
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ # Special chars (33-47): ! " # $ % & ' ( ) * + , - . /
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ # Numbers (48-57): Treat as non-splits
+ 0,0,0,0,0,0,0,0,0,0,
+ # More special chars (58-64): : ; < = > ? @
+ 1,1,1,1,1,1,1,
+ # Uppercase (65-90): Keep
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ # More special chars (91-96): [ \ ] ^ _ `
+ 1,1,1,1,1,1,
+ # Lowercase (97-122): Keep
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ # Special chars (123-126): { | } ~
+ 1,1,1,1,
+ # Extended ASCII
+ *([1] * 128)
+])
+
+# Additional split chars for HTML/code
+HTML_CODE_CHARS = {
+ # HTML specific
+    '•', '►', '▼', '©', '®', '™', '→', '←', '↔', '≤', '≥',
+ # Programming symbols
+ '+=', '-=', '*=', '/=', '=>', '<=>', '!=', '==', '===',
+ '++', '--', '<<', '>>', '&&', '||', '??', '?:', '?.',
+ # Common Unicode
+    '…', '“', '”', '‘', '’', '«', '»', '–', '—',
+ # Additional splits
+ '+', '=', '~', '@', '#', '$', '%', '^', '&', '*',
+ '(', ')', '{', '}', '[', ']', '|', '\\', '/', '`',
+ '<', '>', ',', '.', '?', '!', ':', ';', '-', '_'
+}
+
+def advanced_split(text: str) -> list[str]:
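+    """Split text into tokens, breaking on ASCII split characters (see SPLITS) and on two-character symbols listed in HTML_CODE_CHARS."""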
+ result = []
+ word = array('u')
+
+ i = 0
+ text_len = len(text)
+
+ while i < text_len:
+ char = text[i]
+ o = ord(char)
+
+ # Fast path for ASCII
+ if o < 256 and SPLITS[o]:
+ if word:
+ result.append(word.tounicode())
+ word = array('u')
+ # Check for multi-char symbols
+ elif i < text_len - 1:
+ two_chars = char + text[i + 1]
+ if two_chars in HTML_CODE_CHARS:
+ if word:
+ result.append(word.tounicode())
+ word = array('u')
+ i += 1 # Skip next char since we used it
+ else:
+ word.append(char)
+ else:
+ word.append(char)
+ i += 1
+
+ if word:
+ result.append(word.tounicode())
+
+ return result
+
def create_box_message(
message: str,
type: str = "info",
diff --git a/deploy/docker/Dockerfile b/deploy/docker/Dockerfile.bak
similarity index 100%
rename from deploy/docker/Dockerfile
rename to deploy/docker/Dockerfile.bak
diff --git a/deploy/docker/README.md b/deploy/docker/README.md
index b7d9533e..30f9fc13 100644
--- a/deploy/docker/README.md
+++ b/deploy/docker/README.md
@@ -1,113 +1,764 @@
-# Crawl4AI Docker Setup
+# Crawl4AI Docker Guide 🐳
-## Quick Start
-1. Build the Docker image:
- ```bash
- docker build -t crawl4ai-server:prod .
- ```
+## Table of Contents
+- [Prerequisites](#prerequisites)
+- [Installation](#installation)
+ - [Local Build](#local-build)
+ - [Docker Hub](#docker-hub)
+- [Dockerfile Parameters](#dockerfile-parameters)
+- [Using the API](#using-the-api)
+ - [Understanding Request Schema](#understanding-request-schema)
+ - [REST API Examples](#rest-api-examples)
+ - [Python SDK](#python-sdk)
+- [Metrics & Monitoring](#metrics--monitoring)
+- [Deployment Scenarios](#deployment-scenarios)
+- [Complete Examples](#complete-examples)
+- [Server Configuration](#server-configuration)
+- [Getting Help](#getting-help)
-2. Run the container:
- ```bash
- docker run -d -p 8000:8000 \
- --env-file .llm.env \
- --name crawl4ai \
- crawl4ai-server:prod
- ```
+## Prerequisites
----
+Before we dive in, make sure you have:
+- Docker installed and running (version 20.10.0 or higher)
+- At least 4GB of RAM available for the container
+- Python 3.10+ (if using the Python SDK)
+- Node.js 16+ (if using the Node.js examples)
-## Configuration Options
+> 💡 **Pro tip**: Run `docker info` to check your Docker installation and available resources.
+
+## Installation
+
+### Local Build
+
+Let's get your local environment set up step by step!
+
+#### 1. Building the Image
+
+First, clone the repository and build the Docker image:
-### 1. **Using .llm.env File**
-Create a `.llm.env` file with your API keys:
```bash
-OPENAI_API_KEY=sk-your-key
-DEEPSEEK_API_KEY=your-deepseek-key
+# Clone the repository
+git clone https://github.com/unclecode/crawl4ai.git
+cd crawl4ai
+
+# Build the Docker image
+docker build -t crawl4ai-server:prod \
+ --build-arg PYTHON_VERSION=3.10 \
+ --build-arg INSTALL_TYPE=all \
+ --build-arg ENABLE_GPU=false \
+  .
```
-Run with:
+#### 2. Environment Setup
+
+If you plan to use LLMs (Language Models), you'll need to set up your API keys. Create a `.llm.env` file:
+
+```env
+# OpenAI
+OPENAI_API_KEY=sk-your-key
+
+# Anthropic
+ANTHROPIC_API_KEY=your-anthropic-key
+
+# DeepSeek
+DEEPSEEK_API_KEY=your-deepseek-key
+
+# Check out https://docs.litellm.ai/docs/providers for more providers!
+```
+
+> 🔑 **Note**: Keep your API keys secure! Never commit them to version control.
+
+#### 3. Running the Container
+
+You have several options for running the container:
+
+Basic run (no LLM support):
+```bash
+docker run -d -p 8000:8000 --name crawl4ai crawl4ai-server:prod
+```
+
+With LLM support:
```bash
docker run -d -p 8000:8000 \
--env-file .llm.env \
+ --name crawl4ai \
crawl4ai-server:prod
```
-### 2. **Direct Environment Variables**
-Pass keys directly:
+Using host environment variables (Not a good practice, but works for local testing):
```bash
docker run -d -p 8000:8000 \
- -e OPENAI_API_KEY="sk-your-key" \
- -e DEEPSEEK_API_KEY="your-deepseek-key" \
+  -e OPENAI_API_KEY \
+  -e DEEPSEEK_API_KEY \
+ --name crawl4ai \
crawl4ai-server:prod
```
-### 3. **Copy Host Environment Variables**
-Use the `--copy-env` flag to copy `.llm.env` from the host:
+### More on Building
+
+You have several options for building the Docker image based on your needs:
+
+#### Basic Build
```bash
-docker run -d -p 8000:8000 \
- --copy-env \
- crawl4ai-server:prod
+# Clone the repository
+git clone https://github.com/unclecode/crawl4ai.git
+cd crawl4ai
+
+# Simple build with defaults
+docker build -t crawl4ai-server:prod .
```
-### 4. **Advanced: Docker Compose**
-Create a `docker-compose.yml`:
-```yaml
-version: '3.8'
-services:
- crawl4ai:
- image: crawl4ai-server:prod
- ports:
- - "8000:8000"
- env_file:
- - .llm.env
- restart: unless-stopped
-```
-
-Run with:
+#### Advanced Build Options
```bash
-docker-compose up -d
+# Build with custom parameters
+docker build -t crawl4ai-server:prod \
+ --build-arg PYTHON_VERSION=3.10 \
+ --build-arg INSTALL_TYPE=all \
+ --build-arg ENABLE_GPU=false \
+  .
```
----
+#### Platform-Specific Builds
+The Dockerfile includes optimizations for different architectures (ARM64 and AMD64). Docker automatically detects your platform, but you can specify it explicitly:
-## Supported Environment Variables
-| Variable | Description |
-|------------------------|--------------------------------------|
-| `OPENAI_API_KEY` | OpenAI API key |
-| `DEEPSEEK_API_KEY` | DeepSeek API key |
-| `ANTHROPIC_API_KEY` | Anthropic API key |
-| `GROQ_API_KEY` | Groq API key |
-| `TOGETHER_API_KEY` | Together API key |
-| `LLAMA_CLOUD_API_KEY` | Llama Cloud API key |
-| `COHERE_API_KEY` | Cohere API key |
-| `MISTRAL_API_KEY` | Mistral API key |
-| `PERPLEXITY_API_KEY` | Perplexity API key |
-| `VERTEXAI_PROJECT_ID` | Google Vertex AI project ID |
-| `VERTEXAI_LOCATION` | Google Vertex AI location |
+```bash
+# Build for ARM64
+docker build --platform linux/arm64 -t crawl4ai-server:arm64 .
----
+# Build for AMD64
+docker build --platform linux/amd64 -t crawl4ai-server:amd64 .
+```
-## Healthcheck
-The container includes a healthcheck:
+#### Multi-Platform Build
+For distributing your image across different architectures, use `buildx`:
+
+```bash
+# Set up buildx builder
+docker buildx create --use
+
+# Build for multiple platforms
+docker buildx build \
+ --platform linux/amd64,linux/arm64 \
+ -t yourusername/crawl4ai-server:multi \
+ --push \
+  .
+```
+
+> 💡 **Note**: Multi-platform builds require Docker Buildx and need to be pushed to a registry.
+
+#### Development Build
+For development, you might want to enable all features:
+
+```bash
+docker build -t crawl4ai-server:dev \
+ --build-arg INSTALL_TYPE=all \
+ --build-arg PYTHON_VERSION=3.10 \
+ --build-arg ENABLE_GPU=true \
+  .
+```
+
+#### GPU-Enabled Build
+If you plan to use GPU acceleration:
+
+```bash
+docker build -t crawl4ai-server:gpu \
+ --build-arg ENABLE_GPU=true \
+  .
+```
+
+### Build Arguments Explained
+
+| Argument | Description | Default | Options |
+|----------|-------------|---------|----------|
+| PYTHON_VERSION | Python version | 3.10 | 3.8, 3.9, 3.10 |
+| INSTALL_TYPE | Feature set | default | default, all, torch, transformer |
+| ENABLE_GPU | GPU support | false | true, false |
+| APP_HOME | Install path | /app | any valid path |
+
+### Build Best Practices
+
+1. **Choose the Right Install Type**
+   - `default`: Basic installation with the smallest image; to be honest, this is what I use most of the time.
+   - `all`: Full feature set and a larger image (includes transformers and NLTK; make sure you really need them)
+
+2. **Platform Considerations**
+ - Let Docker auto-detect platform unless you need cross-compilation
+ - Use --platform for specific architecture requirements
+ - Consider buildx for multi-architecture distribution
+
+3. **Development vs Production**
+ - Use `INSTALL_TYPE=all` for development
+ - Stick to `default` for production if you don't need extra features
+ - Enable GPU only if you have compatible hardware
+
+4. **Performance Optimization**
+ - The image automatically includes platform-specific optimizations
+ - AMD64 gets OpenMP optimizations
+ - ARM64 gets OpenBLAS optimizations
+
+### Docker Hub
+
+> 🚧 Coming soon! The image will be available at `crawl4ai/server`. Stay tuned!
+
+## Dockerfile Parameters
+
+Configure your build with these parameters:
+
+| Parameter | Description | Default | Options |
+|-----------|-------------|---------|----------|
+| PYTHON_VERSION | Python version to use | 3.10 | 3.8, 3.9, 3.10 |
+| INSTALL_TYPE | Installation profile | default | default, all, torch, transformer |
+| ENABLE_GPU | Enable GPU support | false | true, false |
+| APP_HOME | Application directory | /app | any valid path |
+| TARGETARCH | Target architecture | auto-detected | amd64, arm64 |
+
+## Using the API
+
+### Understanding Request Schema
+
+This is super important! The API expects a specific structure that matches our Python classes. Let me show you how it works.
+
+#### The Magic of Type Matching
+
+When you send a request, each configuration object needs a "type" field that matches the exact class name from the library. Here's an example:
+
+```python
+# First, let's create objects the normal way
+from crawl4ai import BrowserConfig, CrawlerRunConfig, PruningContentFilter
+
+# Create some config objects
+browser_config = BrowserConfig(headless=True, viewport={"width": 1200, "height": 800})
+content_filter = PruningContentFilter(threshold=0.48, threshold_type="fixed")
+
+# Use dump() to see the serialized format
+print(browser_config.dump())
+```
+
+This will output something like:
+```json
+{
+ "type": "BrowserConfig",
+ "params": {
+ "headless": true,
+ "viewport": {
+ "width": 1200,
+ "height": 800
+ }
+ }
+}
+```
+
+#### Making API Requests
+
+So when making a request, your JSON should look like this:
+
+```json
+{
+ "urls": ["https://example.com"],
+ "browser_config": {
+ "type": "BrowserConfig",
+ "params": {
+ "headless": true,
+ "viewport": {"width": 1200, "height": 800}
+ }
+ },
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "cache_mode": "bypass",
+ "markdown_generator": {
+ "type": "DefaultMarkdownGenerator",
+ "params": {
+ "content_filter": {
+ "type": "PruningContentFilter",
+ "params": {
+ "threshold": 0.48,
+ "threshold_type": "fixed",
+ "min_word_threshold": 0
+ }
+ }
+ }
+ }
+ }
+ }
+}
+```
+
+> 💡 **Pro tip**: Look at the class names in the library documentation - they map directly to the "type" fields in your requests!
+
+### REST API Examples
+
+Let's look at some practical examples:
+
+#### Simple Crawl
+
+```python
+import requests
+
+response = requests.post(
+ "http://localhost:8000/crawl",
+ json={
+ "urls": ["https://example.com"],
+ "browser_config": {
+ "type": "BrowserConfig",
+ "params": {"headless": True}
+ }
+ }
+)
+print(response.json())
+```
+
+#### Streaming Results
+
+```python
+import requests
+
+response = requests.post(
+ "http://localhost:8000/crawl",
+ json={
+ "urls": ["https://example.com"],
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {"stream": True}
+ }
+ },
+ stream=True
+)
+
+for line in response.iter_lines():
+ if line:
+ print(line.decode())
+```
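+
+Each line of the stream is a self-contained JSON object, and the server ends the stream with a `{"status": "completed"}` marker (see `stream_results` in `deploy/docker/api.py`). A minimal sketch for consuming it — field names other than `status` come from the crawl result model and may vary:
+
+```python
+import json
+import requests
+
+response = requests.post(
+    "http://localhost:8000/crawl",
+    json={
+        "urls": ["https://example.com"],
+        "crawler_config": {"type": "CrawlerRunConfig", "params": {"stream": True}}
+    },
+    stream=True,
+)
+
+for line in response.iter_lines():
+    if not line:
+        continue  # skip keep-alive blank lines
+    data = json.loads(line)
+    if data.get("status") == "completed":
+        break  # end-of-stream marker
+    print(data.get("url"), "->", data.get("success"))
+```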
+
+### Python SDK
+
+The SDK makes things even easier! Here's how to use it:
+
+```python
+from crawl4ai.docker_client import Crawl4aiDockerClient
+from crawl4ai import BrowserConfig, CrawlerRunConfig
+
+async with Crawl4aiDockerClient() as client:
+ # The SDK handles serialization for you!
+ result = await client.crawl(
+ urls=["https://example.com"],
+ browser_config=BrowserConfig(headless=True),
+ crawler_config=CrawlerRunConfig(stream=False)
+ )
+ print(result.markdown)
+```
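+
+When `stream=True` is set on the crawler config, the same `crawl()` call can be iterated asynchronously. A short sketch mirroring the streaming test in the Complete Examples below:
+
+```python
+import asyncio
+from crawl4ai.docker_client import Crawl4aiDockerClient
+from crawl4ai import BrowserConfig, CrawlerRunConfig
+
+async def main():
+    async with Crawl4aiDockerClient() as client:
+        async for result in await client.crawl(
+            urls=["https://example.com"],
+            browser_config=BrowserConfig(headless=True),
+            crawler_config=CrawlerRunConfig(stream=True),
+        ):
+            print(result.url, "->", result.success)
+
+asyncio.run(main())
+```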
+
+## Metrics & Monitoring
+
+Keep an eye on your crawler with these endpoints:
+
+- `/health` - Quick health check
+- `/metrics` - Detailed Prometheus metrics
+- `/schema` - Full API schema
+
+Example health check:
```bash
curl http://localhost:8000/health
```
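+
+If you prefer to check these endpoints from Python (say, in a smoke test), here is a minimal sketch using `requests` — the endpoint paths come from the list above, the host and port are whatever you mapped:
+
+```python
+import requests
+
+BASE_URL = "http://localhost:8000"  # adjust to your deployment
+
+# Quick health check - raises if the server is unreachable or unhealthy
+health = requests.get(f"{BASE_URL}/health", timeout=5)
+health.raise_for_status()
+print("Health:", health.text)
+
+# Prometheus metrics come back as plain text; show the first few lines
+metrics = requests.get(f"{BASE_URL}/metrics", timeout=5)
+print("\n".join(metrics.text.splitlines()[:5]))
+```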
----
+## Deployment Scenarios
-## Troubleshooting
-1. **Missing Keys**: Ensure all required keys are set in `.llm.env`.
-2. **Permissions**: Run `chmod +x docker-entrypoint.sh` if permissions are denied.
-3. **Logs**: Check logs with:
- ```bash
- docker logs crawl4ai
+> 🚧 Coming soon! We'll cover:
+> - Kubernetes deployment
+> - Cloud provider setups (AWS, GCP, Azure)
+> - High-availability configurations
+> - Load balancing strategies
+
+## Complete Examples
+
+Check out the `examples` folder in our repository for full working examples! Here's one to get you started:
+
+```python
+import requests
+import time
+import httpx
+import asyncio
+from typing import Dict, Any
+from crawl4ai import (
+ BrowserConfig, CrawlerRunConfig, DefaultMarkdownGenerator,
+ PruningContentFilter, JsonCssExtractionStrategy, LLMContentFilter, CacheMode
+)
+from crawl4ai.docker_client import Crawl4aiDockerClient
+
+class Crawl4AiTester:
+ def __init__(self, base_url: str = "http://localhost:11235"):
+ self.base_url = base_url
+
+ def submit_and_wait(
+ self, request_data: Dict[str, Any], timeout: int = 300
+ ) -> Dict[str, Any]:
+ # Submit crawl job
+ response = requests.post(f"{self.base_url}/crawl", json=request_data)
+ task_id = response.json()["task_id"]
+ print(f"Task ID: {task_id}")
+
+ # Poll for result
+ start_time = time.time()
+ while True:
+ if time.time() - start_time > timeout:
+ raise TimeoutError(
+ f"Task {task_id} did not complete within {timeout} seconds"
+ )
+
+ result = requests.get(f"{self.base_url}/task/{task_id}")
+ status = result.json()
+
+ if status["status"] == "failed":
+ print("Task failed:", status.get("error"))
+ raise Exception(f"Task failed: {status.get('error')}")
+
+ if status["status"] == "completed":
+ return status
+
+ time.sleep(2)
+
+async def test_direct_api():
+ """Test direct API endpoints without using the client SDK"""
+ print("\n=== Testing Direct API Calls ===")
+
+ # Test 1: Basic crawl with content filtering
+ browser_config = BrowserConfig(
+ headless=True,
+ viewport_width=1200,
+ viewport_height=800
+ )
+
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ markdown_generator=DefaultMarkdownGenerator(
+ content_filter=PruningContentFilter(
+ threshold=0.48,
+ threshold_type="fixed",
+ min_word_threshold=0
+ ),
+ options={"ignore_links": True}
+ )
+ )
+
+ request_data = {
+ "urls": ["https://example.com"],
+ "browser_config": browser_config.dump(),
+ "crawler_config": crawler_config.dump()
+ }
+
+ # Make direct API call
+ async with httpx.AsyncClient() as client:
+ response = await client.post(
+ "http://localhost:8000/crawl",
+ json=request_data,
+ timeout=300
+ )
+ assert response.status_code == 200
+ result = response.json()
+ print("Basic crawl result:", result["success"])
+
+ # Test 2: Structured extraction with JSON CSS
+ schema = {
+ "baseSelector": "article.post",
+ "fields": [
+ {"name": "title", "selector": "h1", "type": "text"},
+ {"name": "content", "selector": ".content", "type": "html"}
+ ]
+ }
+
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ extraction_strategy=JsonCssExtractionStrategy(schema=schema)
+ )
+
+ request_data["crawler_config"] = crawler_config.dump()
+
+ async with httpx.AsyncClient() as client:
+ response = await client.post(
+ "http://localhost:8000/crawl",
+ json=request_data
+ )
+ assert response.status_code == 200
+ result = response.json()
+ print("Structured extraction result:", result["success"])
+
+ # Test 3: Get schema
+ # async with httpx.AsyncClient() as client:
+ # response = await client.get("http://localhost:8000/schema")
+ # assert response.status_code == 200
+ # schemas = response.json()
+ # print("Retrieved schemas for:", list(schemas.keys()))
+
+async def test_with_client():
+ """Test using the Crawl4AI Docker client SDK"""
+ print("\n=== Testing Client SDK ===")
+
+ async with Crawl4aiDockerClient(verbose=True) as client:
+ # Test 1: Basic crawl
+ browser_config = BrowserConfig(headless=True)
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ markdown_generator=DefaultMarkdownGenerator(
+ content_filter=PruningContentFilter(
+ threshold=0.48,
+ threshold_type="fixed"
+ )
+ )
+ )
+
+ result = await client.crawl(
+ urls=["https://example.com"],
+ browser_config=browser_config,
+ crawler_config=crawler_config
+ )
+ print("Client SDK basic crawl:", result.success)
+
+ # Test 2: LLM extraction with streaming
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ markdown_generator=DefaultMarkdownGenerator(
+ content_filter=LLMContentFilter(
+                    provider="openai/gpt-4o",
+ instruction="Extract key technical concepts"
+ )
+ ),
+ stream=True
+ )
+
+ async for result in await client.crawl(
+ urls=["https://example.com"],
+ browser_config=browser_config,
+ crawler_config=crawler_config
+ ):
+ print(f"Streaming result for: {result.url}")
+
+ # # Test 3: Get schema
+ # schemas = await client.get_schema()
+ # print("Retrieved client schemas for:", list(schemas.keys()))
+
+async def main():
+ """Run all tests"""
+ # Test direct API
+ print("Testing direct API calls...")
+ await test_direct_api()
+
+ # Test client SDK
+ print("\nTesting client SDK...")
+ await test_with_client()
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+## Server Configuration
+
+The server's behavior can be customized through the `config.yml` file. Let's explore how to configure your Crawl4AI server for optimal performance and security.
+
+### Understanding config.yml
+
+The configuration file is located at `deploy/docker/config.yml`. You can either modify this file before building the image or mount a custom configuration when running the container.
+
+Here's a detailed breakdown of the configuration options:
+
+```yaml
+# Application Configuration
+app:
+ title: "Crawl4AI API" # Server title in OpenAPI docs
+ version: "1.0.0" # API version
+ host: "0.0.0.0" # Listen on all interfaces
+ port: 8000 # Server port
+ reload: True # Enable hot reloading (development only)
+ timeout_keep_alive: 300 # Keep-alive timeout in seconds
+
+# Rate Limiting Configuration
+rate_limiting:
+ enabled: True # Enable/disable rate limiting
+ default_limit: "100/minute" # Rate limit format: "number/timeunit"
+ trusted_proxies: [] # List of trusted proxy IPs
+ storage_uri: "memory://" # Use "redis://localhost:6379" for production
+
+# Security Configuration
+security:
+ enabled: false # Master toggle for security features
+ https_redirect: True # Force HTTPS
+ trusted_hosts: ["*"] # Allowed hosts (use specific domains in production)
+ headers: # Security headers
+ x_content_type_options: "nosniff"
+ x_frame_options: "DENY"
+ content_security_policy: "default-src 'self'"
+ strict_transport_security: "max-age=63072000; includeSubDomains"
+
+# Crawler Configuration
+crawler:
+ memory_threshold_percent: 95.0 # Memory usage threshold
+ rate_limiter:
+ base_delay: [1.0, 2.0] # Min and max delay between requests
+ timeouts:
+ stream_init: 30.0 # Stream initialization timeout
+ batch_process: 300.0 # Batch processing timeout
+
+# Logging Configuration
+logging:
+ level: "INFO" # Log level (DEBUG, INFO, WARNING, ERROR)
+ format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+
+# Observability Configuration
+observability:
+ prometheus:
+ enabled: True # Enable Prometheus metrics
+ endpoint: "/metrics" # Metrics endpoint
+ health_check:
+ endpoint: "/health" # Health check endpoint
+```
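+
+Before baking an edited file into the image, it can help to sanity-check it from Python. A quick sketch, assuming PyYAML is installed — the keys checked here are the ones shown in the sample above:
+
+```python
+import yaml  # pip install pyyaml
+
+with open("deploy/docker/config.yml") as f:
+    cfg = yaml.safe_load(f)
+
+# Spot-check a few values before building
+assert isinstance(cfg["app"]["port"], int)
+assert isinstance(cfg["rate_limiting"]["enabled"], bool)
+print("Server will listen on", f'{cfg["app"]["host"]}:{cfg["app"]["port"]}')
+```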
+
+### Configuration Tips and Best Practices
+
+1. **Production Settings** 🏭
+ ```yaml
+ app:
+ reload: False # Disable reload in production
+ timeout_keep_alive: 120 # Lower timeout for better resource management
+
+ rate_limiting:
+ storage_uri: "redis://redis:6379" # Use Redis for distributed rate limiting
+ default_limit: "50/minute" # More conservative rate limit
+
+ security:
+ enabled: true # Enable all security features
+ trusted_hosts: ["your-domain.com"] # Restrict to your domain
```
----
+2. **Development Settings** 🛠️
+ ```yaml
+ app:
+ reload: True # Enable hot reloading
+ timeout_keep_alive: 300 # Longer timeout for debugging
+
+ logging:
+ level: "DEBUG" # More verbose logging
+ ```
-## Security Best Practices
-- Never commit `.llm.env` to version control.
-- Use Docker secrets in production (Swarm/K8s).
-- Rotate keys regularly.
+3. **High-Traffic Settings** 🚦
+ ```yaml
+ crawler:
+ memory_threshold_percent: 85.0 # More conservative memory limit
+ rate_limiter:
+ base_delay: [2.0, 4.0] # More aggressive rate limiting
+ ```
+### Customizing Your Configuration
+#### Method 1: Pre-build Configuration
+```bash
+# Copy and modify config before building
+cp deploy/docker/config.yml custom-config.yml
+vim custom-config.yml
+
+# Build with custom config
+docker build -t crawl4ai-server:prod \
+ --build-arg CONFIG_PATH=custom-config.yml .
+```
+
+#### Method 2: Runtime Configuration
+```bash
+# Mount custom config at runtime
+docker run -d -p 8000:8000 \
+ -v $(pwd)/custom-config.yml:/app/config.yml \
+ crawl4ai-server:prod
+```
+
+### Configuration Recommendations
+
+1. **Security First** 🔒
+ - Always enable security in production
+ - Use specific trusted_hosts instead of wildcards
+ - Set up proper rate limiting to protect your server
+ - Consider your environment before enabling HTTPS redirect
+
+2. **Resource Management** 💻
+ - Adjust memory_threshold_percent based on available RAM
+ - Set timeouts according to your content size and network conditions
+ - Use Redis for rate limiting in multi-container setups
+
+3. **Monitoring** 📊
+ - Enable Prometheus if you need metrics
+ - Set DEBUG logging in development, INFO in production
+ - Regular health check monitoring is crucial
+
+4. **Performance Tuning** ⚡
+ - Start with conservative rate limiter delays
+ - Increase batch_process timeout for large content
+ - Adjust stream_init timeout based on initial response times
+
+### Configuration Migration
+
+When upgrading Crawl4AI, follow these steps:
+
+1. Back up your current config:
+ ```bash
+ cp /app/config.yml /app/config.yml.backup
+ ```
+
+2. Use version control:
+ ```bash
+ git add config.yml
+ git commit -m "Save current server configuration"
+ ```
+
+3. Test in staging first:
+ ```bash
+   # Use a different port for staging
+   docker run -d -p 8001:8000 \
+ -v $(pwd)/new-config.yml:/app/config.yml \
+ crawl4ai-server:prod
+ ```
+
+### Common Configuration Scenarios
+
+1. **Basic Development Setup**
+ ```yaml
+ security:
+ enabled: false
+ logging:
+ level: "DEBUG"
+ ```
+
+2. **Production API Server**
+ ```yaml
+ security:
+ enabled: true
+ trusted_hosts: ["api.yourdomain.com"]
+ rate_limiting:
+ enabled: true
+ default_limit: "50/minute"
+ ```
+
+3. **High-Performance Crawler**
+ ```yaml
+ crawler:
+ memory_threshold_percent: 90.0
+ timeouts:
+ batch_process: 600.0
+ ```
+
+## Getting Help
+
+We're here to help you succeed with Crawl4AI! Here's how to get support:
+
+- 📖 Check our [full documentation](https://docs.crawl4ai.com)
+- 🐛 Found a bug? [Open an issue](https://github.com/unclecode/crawl4ai/issues)
+- 💬 Join our [Discord community](https://discord.gg/crawl4ai)
+- ⭐ Star us on GitHub to show support!
+
+## Summary
+
+In this guide, we've covered everything you need to get started with Crawl4AI's Docker deployment:
+- Building and running the Docker container
+- Configuring the environment
+- Making API requests with proper typing
+- Using the Python SDK
+- Monitoring your deployment
+
+Remember, the examples in the `examples` folder are your friends - they show real-world usage patterns that you can adapt for your needs.
+
+Keep exploring, and don't hesitate to reach out if you need help! We're building something amazing together. 🚀
+
+Happy crawling! 🕷️
\ No newline at end of file
diff --git a/deploy/docker/api.py b/deploy/docker/api.py
new file mode 100644
index 00000000..8838f19f
--- /dev/null
+++ b/deploy/docker/api.py
@@ -0,0 +1,305 @@
+import os
+import json
+import logging
+from typing import Optional, AsyncGenerator
+from urllib.parse import unquote
+from fastapi import HTTPException, Request, status
+from fastapi.background import BackgroundTasks
+from fastapi.responses import JSONResponse
+from redis import asyncio as aioredis
+
+from crawl4ai import (
+ AsyncWebCrawler,
+ CrawlerRunConfig,
+ LLMExtractionStrategy,
+ CacheMode
+)
+from crawl4ai.content_filter_strategy import (
+ PruningContentFilter,
+ BM25ContentFilter,
+ LLMContentFilter
+)
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
+
+from utils import (
+ TaskStatus,
+ FilterType,
+ get_base_url,
+ is_task_id,
+ should_cleanup_task,
+ decode_redis_hash
+)
+
+logger = logging.getLogger(__name__)
+
+async def process_llm_extraction(
+ redis: aioredis.Redis,
+ config: dict,
+ task_id: str,
+ url: str,
+ instruction: str,
+ schema: Optional[str] = None,
+ cache: str = "0"
+) -> None:
+ """Process LLM extraction in background."""
+ try:
+ llm_strategy = LLMExtractionStrategy(
+ provider=config["llm"]["provider"],
+ api_token=os.environ.get(config["llm"].get("api_key_env", None), ""),
+ instruction=instruction,
+ schema=json.loads(schema) if schema else None,
+ )
+
+ cache_mode = CacheMode.ENABLED if cache == "1" else CacheMode.BYPASS
+
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ url=url,
+ config=CrawlerRunConfig(
+ extraction_strategy=llm_strategy,
+ scraping_strategy=LXMLWebScrapingStrategy(),
+ cache_mode=cache_mode
+ )
+ )
+
+ if not result.success:
+ await redis.hset(f"task:{task_id}", mapping={
+ "status": TaskStatus.FAILED,
+ "error": result.error_message
+ })
+ return
+
+ content = json.loads(result.extracted_content)
+ await redis.hset(f"task:{task_id}", mapping={
+ "status": TaskStatus.COMPLETED,
+ "result": json.dumps(content)
+ })
+
+ except Exception as e:
+ logger.error(f"LLM extraction error: {str(e)}", exc_info=True)
+ await redis.hset(f"task:{task_id}", mapping={
+ "status": TaskStatus.FAILED,
+ "error": str(e)
+ })
+
+async def handle_markdown_request(
+ url: str,
+ filter_type: FilterType,
+ query: Optional[str] = None,
+ cache: str = "0",
+ config: Optional[dict] = None
+) -> str:
+ """Handle markdown generation requests."""
+ try:
+ decoded_url = unquote(url)
+ if not decoded_url.startswith(('http://', 'https://')):
+ decoded_url = 'https://' + decoded_url
+
+ if filter_type == FilterType.RAW:
+ md_generator = DefaultMarkdownGenerator()
+ else:
+ content_filter = {
+ FilterType.FIT: PruningContentFilter(),
+ FilterType.BM25: BM25ContentFilter(user_query=query or ""),
+ FilterType.LLM: LLMContentFilter(
+ provider=config["llm"]["provider"],
+ api_token=os.environ.get(config["llm"].get("api_key_env", None), ""),
+ instruction=query or "Extract main content"
+ )
+ }[filter_type]
+ md_generator = DefaultMarkdownGenerator(content_filter=content_filter)
+
+ cache_mode = CacheMode.ENABLED if cache == "1" else CacheMode.BYPASS
+
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ url=decoded_url,
+ config=CrawlerRunConfig(
+ markdown_generator=md_generator,
+ scraping_strategy=LXMLWebScrapingStrategy(),
+ cache_mode=cache_mode
+ )
+ )
+
+ if not result.success:
+ raise HTTPException(
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+ detail=result.error_message
+ )
+
+ return (result.markdown_v2.raw_markdown
+ if filter_type == FilterType.RAW
+ else result.markdown_v2.fit_markdown)
+
+ except Exception as e:
+ logger.error(f"Markdown error: {str(e)}", exc_info=True)
+ raise HTTPException(
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+ detail=str(e)
+ )
+
+async def handle_llm_request(
+ redis: aioredis.Redis,
+ background_tasks: BackgroundTasks,
+ request: Request,
+ input_path: str,
+ query: Optional[str] = None,
+ schema: Optional[str] = None,
+ cache: str = "0",
+ config: Optional[dict] = None
+) -> JSONResponse:
+ """Handle LLM extraction requests."""
+ base_url = get_base_url(request)
+
+ try:
+ if is_task_id(input_path):
+ return await handle_task_status(
+ redis, input_path, base_url
+ )
+
+ if not query:
+ return JSONResponse({
+ "message": "Please provide an instruction",
+ "_links": {
+ "example": {
+ "href": f"{base_url}/llm/{input_path}?q=Extract+main+content",
+ "title": "Try this example"
+ }
+ }
+ })
+
+ return await create_new_task(
+ redis,
+ background_tasks,
+ input_path,
+ query,
+ schema,
+ cache,
+ base_url,
+ config
+ )
+
+ except Exception as e:
+ logger.error(f"LLM endpoint error: {str(e)}", exc_info=True)
+ return JSONResponse({
+ "error": str(e),
+ "_links": {
+ "retry": {"href": str(request.url)}
+ }
+ }, status_code=status.HTTP_500_INTERNAL_SERVER_ERROR)
+
+async def handle_task_status(
+ redis: aioredis.Redis,
+ task_id: str,
+ base_url: str
+) -> JSONResponse:
+ """Handle task status check requests."""
+ task = await redis.hgetall(f"task:{task_id}")
+ if not task:
+ raise HTTPException(
+ status_code=status.HTTP_404_NOT_FOUND,
+ detail="Task not found"
+ )
+
+ task = decode_redis_hash(task)
+ response = create_task_response(task, task_id, base_url)
+
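+ # Serve finished tasks one last time, then delete them once the cleanup window has passed (see should_cleanup_task)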
+ if task["status"] in [TaskStatus.COMPLETED, TaskStatus.FAILED]:
+ if should_cleanup_task(task["created_at"]):
+ await redis.delete(f"task:{task_id}")
+
+ return JSONResponse(response)
+
+async def create_new_task(
+ redis: aioredis.Redis,
+ background_tasks: BackgroundTasks,
+ input_path: str,
+ query: str,
+ schema: Optional[str],
+ cache: str,
+ base_url: str,
+ config: dict
+) -> JSONResponse:
+ """Create and initialize a new task."""
+ decoded_url = unquote(input_path)
+ if not decoded_url.startswith(('http://', 'https://')):
+ decoded_url = 'https://' + decoded_url
+
+ from datetime import datetime
+ task_id = f"llm_{int(datetime.now().timestamp())}_{id(background_tasks)}"
+
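+ # Record the task as PROCESSING up front so status polls succeed before the background job finishes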
+ await redis.hset(f"task:{task_id}", mapping={
+ "status": TaskStatus.PROCESSING,
+ "created_at": datetime.now().isoformat(),
+ "url": decoded_url
+ })
+
+ background_tasks.add_task(
+ process_llm_extraction,
+ redis,
+ config,
+ task_id,
+ decoded_url,
+ query,
+ schema,
+ cache
+ )
+
+ return JSONResponse({
+ "task_id": task_id,
+ "status": TaskStatus.PROCESSING,
+ "url": decoded_url,
+ "_links": {
+ "self": {"href": f"{base_url}/llm/{task_id}"},
+ "status": {"href": f"{base_url}/llm/{task_id}"}
+ }
+ })
+
+def create_task_response(task: dict, task_id: str, base_url: str) -> dict:
+ """Create response for task status check."""
+ response = {
+ "task_id": task_id,
+ "status": task["status"],
+ "created_at": task["created_at"],
+ "url": task["url"],
+ "_links": {
+ "self": {"href": f"{base_url}/llm/{task_id}"},
+ "refresh": {"href": f"{base_url}/llm/{task_id}"}
+ }
+ }
+
+ if task["status"] == TaskStatus.COMPLETED:
+ response["result"] = json.loads(task["result"])
+ elif task["status"] == TaskStatus.FAILED:
+ response["error"] = task["error"]
+
+ return response
+
+async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator) -> AsyncGenerator[bytes, None]:
+ """Stream results with heartbeats and completion markers."""
+ import asyncio
+ import json
+ from utils import datetime_handler
+
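+ # Stream newline-delimited JSON: one result object per line, then a final {"status": "completed"} marker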
+ try:
+ async for result in results_gen:
+ try:
+ result_dict = result.model_dump()
+ logger.info(f"Streaming result for {result_dict.get('url', 'unknown')}")
+ data = json.dumps(result_dict, default=datetime_handler) + "\n"
+ yield data.encode('utf-8')
+ except Exception as e:
+ logger.error(f"Serialization error: {e}")
+ error_response = {"error": str(e), "url": getattr(result, 'url', 'unknown')}
+ yield (json.dumps(error_response) + "\n").encode('utf-8')
+
+ yield json.dumps({"status": "completed"}).encode('utf-8')
+
+ except asyncio.CancelledError:
+ logger.warning("Client disconnected during streaming")
+ finally:
+ try:
+ await crawler.close()
+ except Exception as e:
+ logger.error(f"Crawler cleanup error: {e}")
\ No newline at end of file
diff --git a/deploy/docker/config.yml b/deploy/docker/config.yml
new file mode 100644
index 00000000..d74ddd08
--- /dev/null
+++ b/deploy/docker/config.yml
@@ -0,0 +1,69 @@
+# Application Configuration
+app:
+ title: "Crawl4AI API"
+ version: "1.0.0"
+ host: "0.0.0.0"
+ port: 8000
+ reload: True
+ timeout_keep_alive: 300
+
+# Default LLM Configuration
+llm:
+ provider: "openai/gpt-4o-mini"
+ api_key_env: "OPENAI_API_KEY"
+
+# Redis Configuration
+redis:
+ host: "localhost"
+ port: 6379
+ db: 0
+ password: ""
+ ssl: False
+ ssl_cert_reqs: null
+ ssl_ca_certs: null
+ ssl_certfile: null
+ ssl_keyfile: null
+
+# Rate Limiting Configuration
+rate_limiting:
+ enabled: True
+ default_limit: "1000/minute"
+ trusted_proxies: []
+ storage_uri: "memory://" # Use "redis://localhost:6379" for production
+
+# Security Configuration
+security:
+ enabled: false
+ https_redirect: True
+ trusted_hosts: ["*"]
+ headers:
+ x_content_type_options: "nosniff"
+ x_frame_options: "DENY"
+ content_security_policy: "default-src 'self'"
+ strict_transport_security: "max-age=63072000; includeSubDomains"
+
+# Crawler Configuration
+crawler:
+ memory_threshold_percent: 95.0
+ rate_limiter:
+ base_delay: [1.0, 2.0]
+ timeouts:
+ stream_init: 30.0 # Timeout for stream initialization
+ batch_process: 300.0 # Timeout for batch processing
+
+# Logging Configuration
+logging:
+ level: "INFO"
+ format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+
+# Observability Configuration
+observability:
+ prometheus:
+ enabled: True
+ endpoint: "/metrics"
+ health_check:
+ endpoint: "/health"
\ No newline at end of file
diff --git a/deploy/docker/requirements.txt b/deploy/docker/requirements.txt
index a395fa85..389652f4 100644
--- a/deploy/docker/requirements.txt
+++ b/deploy/docker/requirements.txt
@@ -1,4 +1,7 @@
crawl4ai
fastapi
uvicorn
-gunicorn>=23.0.0
\ No newline at end of file
+gunicorn>=23.0.0
+slowapi>=0.1.9
+prometheus-fastapi-instrumentator>=7.0.2
+redis>=5.2.1
\ No newline at end of file
diff --git a/deploy/docker/server.py b/deploy/docker/server.py
index 7ec662a3..7a0bca15 100644
--- a/deploy/docker/server.py
+++ b/deploy/docker/server.py
@@ -1,120 +1,237 @@
import os
import sys
+import time
+from typing import List, Optional
+
sys.path.append(os.path.dirname(os.path.realpath(__file__)))
-from fastapi import FastAPI, HTTPException
-from fastapi.responses import StreamingResponse
-import json
-import asyncio
-from typing import AsyncGenerator
-from crawl4ai import (
- BrowserConfig,
- CrawlerRunConfig,
- AsyncWebCrawler,
- MemoryAdaptiveDispatcher,
- RateLimiter,
+
+from redis import asyncio as aioredis
+from fastapi import FastAPI, HTTPException, Request, status
+from fastapi.responses import StreamingResponse, RedirectResponse
+from fastapi.middleware.httpsredirect import HTTPSRedirectMiddleware
+from fastapi.middleware.trustedhost import TrustedHostMiddleware
+from pydantic import BaseModel, Field
+from slowapi import Limiter
+from slowapi.util import get_remote_address
+from prometheus_fastapi_instrumentator import Instrumentator
+from fastapi.responses import PlainTextResponse
+from fastapi.responses import JSONResponse
+from fastapi.background import BackgroundTasks
+from typing import Dict
+
+from utils import (
+ FilterType,
+ load_config,
+ setup_logging
+)
+from api import (
+ handle_markdown_request,
+ handle_llm_request
)
-from typing import List, Optional
-from pydantic import BaseModel
+# Load configuration and setup
+config = load_config()
+setup_logging(config)
+
+# Initialize Redis
+redis = aioredis.from_url(f"redis://{config['redis']['host']}:{config['redis']['port']}")
+
+# Initialize rate limiter
+limiter = Limiter(
+ key_func=get_remote_address,
+ default_limits=[config["rate_limiting"]["default_limit"]],
+ storage_uri=config["rate_limiting"]["storage_uri"]
+)
+
+app = FastAPI(
+ title=config["app"]["title"],
+ version=config["app"]["version"]
+)
+
+# Configure middleware
+if config["security"]["enabled"]:
+ if config["security"]["https_redirect"]:
+ app.add_middleware(HTTPSRedirectMiddleware)
+ if config["security"]["trusted_hosts"] and config["security"]["trusted_hosts"] != ["*"]:
+ app.add_middleware(
+ TrustedHostMiddleware,
+ allowed_hosts=config["security"]["trusted_hosts"]
+ )
+
+# Prometheus instrumentation
+if config["observability"]["prometheus"]["enabled"]:
+ Instrumentator().instrument(app).expose(app)
class CrawlRequest(BaseModel):
- urls: List[str]
- browser_config: Optional[dict] = None
- crawler_config: Optional[dict] = None
-
-class CrawlResponse(BaseModel):
- success: bool
- results: List[dict]
-
- class Config:
- arbitrary_types_allowed = True
-
-app = FastAPI(title="Crawl4AI API")
-
-async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator) -> AsyncGenerator[bytes, None]:
- """Stream results and manage crawler lifecycle"""
- def datetime_handler(obj):
- """Custom handler for datetime objects during JSON serialization"""
- if hasattr(obj, 'isoformat'):
- return obj.isoformat()
- raise TypeError(f"Object of type {type(obj)} is not JSON serializable")
-
- try:
- async for result in results_gen:
- try:
- # Use dump method for serialization
- result_dict = result.model_dump()
- print(f"Streaming result for URL: {result_dict['url']}, Success: {result_dict['success']}")
- # Use custom JSON encoder with datetime handler
- yield (json.dumps(result_dict, default=datetime_handler) + "\n").encode('utf-8')
- except Exception as e:
- print(f"Error serializing result: {e}")
- error_response = {
- "error": str(e),
- "url": getattr(result, 'url', 'unknown')
- }
- yield (json.dumps(error_response, default=datetime_handler) + "\n").encode('utf-8')
- except asyncio.CancelledError:
- print("Client disconnected, cleaning up...")
- finally:
- try:
- await crawler.close()
- except Exception as e:
- print(f"Error closing crawler: {e}")
-
-@app.post("/crawl")
-async def crawl(request: CrawlRequest):
- # Load configs using our new utilities
- browser_config = BrowserConfig.load(request.browser_config)
- crawler_config = CrawlerRunConfig.load(request.crawler_config)
-
- dispatcher = MemoryAdaptiveDispatcher(
- memory_threshold_percent=95.0,
- rate_limiter=RateLimiter(base_delay=(1.0, 2.0)),
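+ # Accept between 1 and 100 non-empty URLs; browser_config and crawler_config are passed through as plain dicts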
+ urls: List[str] = Field(
+ min_length=1,
+ max_length=100,
+ json_schema_extra={
+ "items": {"type": "string", "maxLength": 2000, "pattern": "\\S"}
+ }
+ )
+ browser_config: Optional[Dict] = Field(
+ default_factory=dict,
+ example={"headless": True, "viewport": {"width": 1200}}
+ )
+ crawler_config: Optional[Dict] = Field(
+ default_factory=dict,
+ example={"stream": True, "cache_mode": "aggressive"}
)
- try:
- if crawler_config.stream:
- crawler = AsyncWebCrawler(config=browser_config)
- await crawler.start()
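+# Attach the security headers from config.yml to every response when security is enabled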
+@app.middleware("http")
+async def add_security_headers(request: Request, call_next):
+ response = await call_next(request)
+ if config["security"]["enabled"]:
+ # Config keys are snake_case; translate them to hyphenated HTTP header names before attaching
+ response.headers.update({k.replace("_", "-"): v for k, v in config["security"]["headers"].items()})
+ return response
- results_gen = await crawler.arun_many(
- urls=request.urls,
- config=crawler_config,
- dispatcher=dispatcher
- )
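+# GET /md/{url}: "f" selects the filter (raw|fit|bm25|llm), "q" is the filter query, "c" toggles the cache
+# e.g. curl "http://localhost:8000/md/example.com?f=fit" against the default config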
+@app.get("/md/{url:path}")
+@limiter.limit(config["rate_limiting"]["default_limit"])
+async def get_markdown(
+ request: Request,
+ url: str,
+ f: FilterType = FilterType.FIT,
+ q: Optional[str] = None,
+ c: Optional[str] = "0"
+):
+ """Get markdown from URL with optional filtering."""
+ result = await handle_markdown_request(url, f, q, c, config)
+ return PlainTextResponse(result)
- return StreamingResponse(
- stream_results(crawler, results_gen),
- media_type='application/x-ndjson'
- )
- else:
- async with AsyncWebCrawler(config=browser_config) as crawler:
- results = await crawler.arun_many(
- urls=request.urls,
- config=crawler_config,
- dispatcher=dispatcher
- )
- # Use dump method for each result
- results_dict = [result.model_dump() for result in results]
- return CrawlResponse(success=True, results=results_dict)
- except Exception as e:
- raise HTTPException(status_code=500, detail=str(e))
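+# GET /llm/{url or task id}: schedules extraction in the background and returns a task id to poll on the same route
+# e.g. curl "http://localhost:8000/llm/example.com?q=Extract+main+content"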
+@app.get("/llm/{input:path}")
+@limiter.limit(config["rate_limiting"]["default_limit"])
+async def llm_endpoint(
+ request: Request,
+ background_tasks: BackgroundTasks,
+ input: str,
+ q: Optional[str] = None,
+ s: Optional[str] = None,
+ c: Optional[str] = "0"
+):
+ """Handle LLM extraction requests."""
+ return await handle_llm_request(
+ redis, background_tasks, request, input, q, s, c, config
+ )
@app.get("/schema")
async def get_schema():
- """Return config schemas for client validation"""
+ """Endpoint for client-side validation schema."""
+ from crawl4ai import BrowserConfig, CrawlerRunConfig
return {
"browser": BrowserConfig.model_json_schema(),
"crawler": CrawlerRunConfig.model_json_schema()
}
-@app.get("/health")
+@app.get(config["observability"]["health_check"]["endpoint"])
async def health():
- return {"status": "ok"}
+ """Health check endpoint."""
+ return {"status": "ok", "timestamp": time.time()}
+@app.get(config["observability"]["prometheus"]["endpoint"])
+async def metrics():
+ """Prometheus metrics endpoint."""
+ return RedirectResponse(url=config["observability"]["prometheus"]["endpoint"])
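+# POST /crawl: streams NDJSON when crawler_config.stream is true, otherwise returns a single JSON batch
+# e.g. curl -X POST "http://localhost:8000/crawl" -H "Content-Type: application/json" -d '{"urls": ["https://example.com"]}'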
+@app.post("/crawl")
+@limiter.limit(config["rate_limiting"]["default_limit"])
+async def crawl(request: Request, crawl_request: CrawlRequest):
+ """Handle crawl requests."""
+ from crawl4ai import (
+ AsyncWebCrawler,
+ BrowserConfig,
+ CrawlerRunConfig,
+ MemoryAdaptiveDispatcher,
+ RateLimiter
+ )
+ import asyncio
+ import logging
+
+ logger = logging.getLogger(__name__)
+ crawler = None
+
+ try:
+ if not crawl_request.urls:
+ logger.error("Empty URL list received")
+ raise HTTPException(
+ status_code=status.HTTP_400_BAD_REQUEST,
+ detail="At least one URL required"
+ )
+
+ browser_config = BrowserConfig.load(crawl_request.browser_config)
+ crawler_config = CrawlerRunConfig.load(crawl_request.crawler_config)
+
+ dispatcher = MemoryAdaptiveDispatcher(
+ memory_threshold_percent=config["crawler"]["memory_threshold_percent"],
+ rate_limiter=RateLimiter(
+ base_delay=tuple(config["crawler"]["rate_limiter"]["base_delay"])
+ )
+ )
+
+ if crawler_config.stream:
+ crawler = AsyncWebCrawler(config=browser_config)
+ await crawler.start()
+
+ results_gen = await asyncio.wait_for(
+ crawler.arun_many(
+ urls=crawl_request.urls,
+ config=crawler_config,
+ dispatcher=dispatcher
+ ),
+ timeout=config["crawler"]["timeouts"]["stream_init"]
+ )
+
+ from api import stream_results
+ return StreamingResponse(
+ stream_results(crawler, results_gen),
+ media_type='application/x-ndjson',
+ headers={
+ 'Cache-Control': 'no-cache',
+ 'Connection': 'keep-alive',
+ 'X-Stream-Status': 'active'
+ }
+ )
+ else:
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ results = await asyncio.wait_for(
+ crawler.arun_many(
+ urls=crawl_request.urls,
+ config=crawler_config,
+ dispatcher=dispatcher
+ ),
+ timeout=config["crawler"]["timeouts"]["batch_process"]
+ )
+ return JSONResponse({
+ "success": True,
+ "results": [result.model_dump() for result in results]
+ })
+
+ except asyncio.TimeoutError as e:
+ logger.error(f"Operation timed out: {str(e)}")
+ raise HTTPException(
+ status_code=status.HTTP_504_GATEWAY_TIMEOUT,
+ detail="Processing timeout"
+ )
+ except Exception as e:
+ logger.error(f"Server error: {str(e)}", exc_info=True)
+ raise HTTPException(
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+ detail="Internal server error"
+ )
+ finally:
+ if crawler:
+ try:
+ await crawler.close()
+ except Exception as e:
+ logger.error(f"Final crawler cleanup error: {e}")
if __name__ == "__main__":
import uvicorn
- uvicorn.run("server:app", host="0.0.0.0", port=8000, reload=True)
\ No newline at end of file
+ uvicorn.run(
+ "server:app",
+ host=config["app"]["host"],
+ port=config["app"]["port"],
+ reload=config["app"]["reload"],
+ timeout_keep_alive=config["app"]["timeout_keep_alive"]
+ )
\ No newline at end of file
diff --git a/deploy/docker/utils.py b/deploy/docker/utils.py
new file mode 100644
index 00000000..61a36e6c
--- /dev/null
+++ b/deploy/docker/utils.py
@@ -0,0 +1,54 @@
+import logging
+import yaml
+from datetime import datetime
+from enum import Enum
+from pathlib import Path
+from fastapi import Request
+from typing import Any, Dict
+
+class TaskStatus(str, Enum):
+ PROCESSING = "processing"
+ FAILED = "failed"
+ COMPLETED = "completed"
+
+class FilterType(str, Enum):
+ RAW = "raw"
+ FIT = "fit"
+ BM25 = "bm25"
+ LLM = "llm"
+
+def load_config() -> Dict:
+ """Load and return application configuration."""
+ config_path = Path(__file__).parent / "config.yml"
+ with open(config_path, "r") as config_file:
+ return yaml.safe_load(config_file)
+
+def setup_logging(config: Dict) -> None:
+ """Configure application logging."""
+ logging.basicConfig(
+ level=config["logging"]["level"],
+ format=config["logging"]["format"]
+ )
+
+def get_base_url(request: Request) -> str:
+ """Get base URL including scheme and host."""
+ return f"{request.url.scheme}://{request.url.netloc}"
+
+def is_task_id(value: str) -> bool:
+ """Check if the value matches task ID pattern."""
+ return value.startswith("llm_") and "_" in value
+
+def datetime_handler(obj: Any) -> str:
+ """Handle datetime serialization for JSON."""
+ if hasattr(obj, 'isoformat'):
+ return obj.isoformat()
+ raise TypeError(f"Object of type {type(obj)} is not JSON serializable")
+
+def should_cleanup_task(created_at: str) -> bool:
+ """Check if task should be cleaned up based on creation time."""
+ created = datetime.fromisoformat(created_at)
+ return (datetime.now() - created).total_seconds() > 3600
+
+def decode_redis_hash(hash_data: Dict[bytes, bytes]) -> Dict[str, str]:
+ """Decode Redis hash data from bytes to strings."""
+ return {k.decode('utf-8'): v.decode('utf-8') for k, v in hash_data.items()}
\ No newline at end of file
diff --git a/docs/examples/llm_markdown_generator.py b/docs/examples/llm_markdown_generator.py
index 60b8549d..8c673734 100644
--- a/docs/examples/llm_markdown_generator.py
+++ b/docs/examples/llm_markdown_generator.py
@@ -46,6 +46,7 @@ async def test_llm_filter():
provider="openai/gpt-4o",
api_token=os.getenv('OPENAI_API_KEY'),
chunk_token_threshold=2 ** 12 * 2, # 2048 * 2
+ ignore_cache=True,
instruction="""
Extract the main educational content while preserving its original wording and substance completely. Your task is to:
@@ -68,7 +69,7 @@ async def test_llm_filter():
)
# Apply filtering
- filtered_content = filter.filter_content(html, ignore_cache = True)
+ filtered_content = filter.filter_content(html)
# Show results
print("\nFiltered Content Length:", len(filtered_content))