refactor(pdf): improve PDF processor dependency handling

Make PyPDF2 an optional dependency and improve import handling in PDF processor.
Move imports inside methods to allow for lazy loading and better error handling.
Add new 'pdf' optional dependency group in pyproject.toml.
Clean up unused imports and remove deprecated files.

BREAKING CHANGE: PyPDF2 is now an optional dependency. Users need to install with 'pip install crawl4ai[pdf]' to use PDF processing features.
This commit is contained in:
UncleCode
2025-02-25 22:27:55 +08:00
parent 71ce01c9e1
commit 4bcd4cbda1
8 changed files with 67 additions and 388 deletions

View File

@@ -151,24 +151,6 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
redis-cli ping > /dev/null && \ redis-cli ping > /dev/null && \
curl -f http://localhost:8000/health || exit 1' curl -f http://localhost:8000/health || exit 1'
# COPY deploy/docker/docker-entrypoint.sh /usr/local/bin/
# RUN chmod +x /usr/local/bin/docker-entrypoint.sh
EXPOSE 6379 EXPOSE 6379
# ENTRYPOINT ["docker-entrypoint.sh"]
# CMD service redis-server start && gunicorn \
# --bind 0.0.0.0:8000 \
# --workers 4 \
# --threads 2 \
# --timeout 120 \
# --graceful-timeout 30 \
# --log-level info \
# --worker-class uvicorn.workers.UvicornWorker \
# server:app
# ENTRYPOINT ["docker-entrypoint.sh"]
CMD ["supervisord", "-c", "supervisord.conf"] CMD ["supervisord", "-c", "supervisord.conf"]

View File

@@ -10,7 +10,7 @@ import asyncio
# from contextlib import nullcontext, asynccontextmanager # from contextlib import nullcontext, asynccontextmanager
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
from .models import CrawlResult, MarkdownGenerationResult,DispatchResult from .models import CrawlResult, MarkdownGenerationResult, DispatchResult, ScrapingResult
from .async_database import async_db_manager from .async_database import async_db_manager
from .chunking_strategy import * # noqa: F403 from .chunking_strategy import * # noqa: F403
from .chunking_strategy import RegexChunking, ChunkingStrategy, IdentityChunking from .chunking_strategy import RegexChunking, ChunkingStrategy, IdentityChunking
@@ -537,7 +537,7 @@ class AsyncWebCrawler:
################################ ################################
# Scraping Strategy Execution # # Scraping Strategy Execution #
################################ ################################
result = scraping_strategy.scrap(url, html, **params) result : ScrapingResult = scraping_strategy.scrap(url, html, **params)
if result is None: if result is None:
raise ValueError( raise ValueError(

View File

@@ -1,210 +0,0 @@
from typing import List, Optional, Union, AsyncGenerator, Dict, Any
import httpx
import json
from urllib.parse import urljoin
from .async_configs import BrowserConfig, CrawlerRunConfig
from .models import CrawlResult
from .async_logger import AsyncLogger, LogLevel
class Crawl4aiClientError(Exception):
    """Root of the Crawl4ai Docker client exception hierarchy."""


class ConnectionError(Crawl4aiClientError):
    # NOTE(review): this name shadows the builtin ConnectionError; kept for
    # backward compatibility with existing callers catching it by this name.
    """Signals that the Crawl4AI Docker server could not be reached."""


class RequestError(Crawl4aiClientError):
    """Signals that the server answered with an error response."""
class Crawl4aiDockerClient:
    """
    Client for interacting with Crawl4AI Docker server.

    Wraps an ``httpx.AsyncClient`` and exposes high-level ``crawl`` /
    ``get_schema`` operations, with both batch and line-delimited-JSON
    streaming modes. Use as an async context manager to ensure the
    underlying HTTP session is closed.

    Args:
        base_url (str): Base URL of the Crawl4AI Docker server
        timeout (float): Default timeout for requests in seconds
        verify_ssl (bool): Whether to verify SSL certificates
        verbose (bool): Whether to show logging output
        log_file (str, optional): Path to log file if file logging is desired
    """

    def __init__(
        self,
        base_url: str = "http://localhost:8000",
        timeout: float = 30.0,
        verify_ssl: bool = True,
        verbose: bool = True,
        log_file: Optional[str] = None
    ) -> None:
        # Strip any trailing slash so "{base_url}/endpoint" joins cleanly.
        self.base_url = base_url.rstrip('/')
        self.timeout = timeout
        self._http_client = httpx.AsyncClient(
            timeout=timeout,
            verify=verify_ssl,
            headers={"Content-Type": "application/json"}
        )
        self.logger = AsyncLogger(
            log_file=log_file,
            log_level=LogLevel.DEBUG,
            verbose=verbose
        )

    async def _check_server_connection(self) -> bool:
        """Check if server is reachable via its /health endpoint.

        Returns:
            bool: True when the server responds with a 2xx; False on any
            failure (network error or non-success status) — errors are
            logged, never raised.
        """
        try:
            self.logger.info("Checking server connection...", tag="INIT")
            response = await self._http_client.get(f"{self.base_url}/health")
            response.raise_for_status()
            self.logger.success(f"Connected to server at {self.base_url}", tag="READY")
            return True
        except Exception as e:
            self.logger.error(f"Failed to connect to server: {str(e)}", tag="ERROR")
            return False

    def _prepare_request_data(
        self,
        urls: List[str],
        browser_config: Optional[BrowserConfig] = None,
        crawler_config: Optional[CrawlerRunConfig] = None
    ) -> Dict[str, Any]:
        """Prepare request data from configs using dump methods.

        Missing configs are serialized as empty dicts so the server can
        apply its own defaults.
        """
        self.logger.debug("Preparing request data", tag="INIT")
        data = {
            "urls": urls,
            "browser_config": browser_config.dump() if browser_config else {},
            "crawler_config": crawler_config.dump() if crawler_config else {}
        }
        self.logger.debug(f"Request data prepared for {len(urls)} URLs", tag="READY")
        return data

    async def _make_request(
        self,
        method: str,
        endpoint: str,
        **kwargs
    ) -> httpx.Response:
        """Make an HTTP request to the server with error handling.

        Returns:
            httpx.Response: the raw successful response (callers decode it).

        Raises:
            ConnectionError: on timeout or transport-level failure.
            RequestError: when the server returns a non-2xx status; the
                server's JSON ``detail`` field is included when parseable.
        """
        # NOTE(review): urljoin drops any path component of base_url when the
        # endpoint starts with "/" — fine for host-only base URLs, confirm if
        # base_url may ever carry a path prefix.
        url = urljoin(self.base_url, endpoint)
        try:
            self.logger.debug(f"Making {method} request to {endpoint}", tag="FETCH")
            response = await self._http_client.request(method, url, **kwargs)
            response.raise_for_status()
            self.logger.success(f"Request to {endpoint} successful", tag="COMPLETE")
            return response
        except httpx.TimeoutException as e:
            error_msg = f"Request timed out: {str(e)}"
            self.logger.error(error_msg, tag="ERROR")
            raise ConnectionError(error_msg)
        except httpx.RequestError as e:
            error_msg = f"Failed to connect to server: {str(e)}"
            self.logger.error(error_msg, tag="ERROR")
            raise ConnectionError(error_msg)
        except httpx.HTTPStatusError as e:
            # Prefer the server-provided error detail; fall back to the raw
            # exception text when the body is not valid JSON.
            error_detail = ""
            try:
                error_data = e.response.json()
                error_detail = error_data.get('detail', str(e))
            except (json.JSONDecodeError, AttributeError) as json_err:
                error_detail = f"{str(e)} (Failed to parse error response: {str(json_err)})"
            error_msg = f"Server returned error {e.response.status_code}: {error_detail}"
            self.logger.error(error_msg, tag="ERROR")
            raise RequestError(error_msg)

    async def crawl(
        self,
        urls: List[str],
        browser_config: Optional[BrowserConfig] = None,
        crawler_config: Optional[CrawlerRunConfig] = None
    ) -> Union[CrawlResult, List[CrawlResult], AsyncGenerator[CrawlResult, None]]:
        """Execute a crawl operation through the Docker server.

        When ``crawler_config.stream`` is truthy, returns an async generator
        that yields ``CrawlResult`` objects as newline-delimited JSON arrives;
        otherwise blocks and returns a single ``CrawlResult`` (one URL) or a
        list of results (multiple URLs).

        Raises:
            ConnectionError: when the server health check or transport fails.
            RequestError: when the server reports a failed crawl.
        """
        # Check server connection first
        if not await self._check_server_connection():
            raise ConnectionError("Cannot proceed with crawl - server is not reachable")

        request_data = self._prepare_request_data(urls, browser_config, crawler_config)
        is_streaming = crawler_config.stream if crawler_config else False
        self.logger.info(
            f"Starting crawl for {len(urls)} URLs {'(streaming)' if is_streaming else ''}",
            tag="INIT"
        )

        if is_streaming:
            async def result_generator() -> AsyncGenerator[CrawlResult, None]:
                try:
                    # timeout=None: a long crawl may legitimately stream for
                    # longer than the client's default request timeout.
                    async with self._http_client.stream(
                        "POST",
                        f"{self.base_url}/crawl",
                        json=request_data,
                        timeout=None
                    ) as response:
                        response.raise_for_status()
                        async for line in response.aiter_lines():
                            if line.strip():
                                try:
                                    result_dict = json.loads(line)
                                    # Per-URL errors are logged and skipped so
                                    # one failure does not abort the stream.
                                    if "error" in result_dict:
                                        self.logger.error_status(
                                            url=result_dict.get('url', 'unknown'),
                                            error=result_dict['error']
                                        )
                                        continue
                                    self.logger.url_status(
                                        url=result_dict.get('url', 'unknown'),
                                        success=True,
                                        timing=result_dict.get('timing', 0.0)
                                    )
                                    yield CrawlResult(**result_dict)
                                except json.JSONDecodeError as e:
                                    self.logger.error(f"Failed to parse server response: {e}", tag="ERROR")
                                    continue
                except httpx.StreamError as e:
                    error_msg = f"Stream connection error: {str(e)}"
                    self.logger.error(error_msg, tag="ERROR")
                    raise ConnectionError(error_msg)
                except Exception as e:
                    error_msg = f"Unexpected error during streaming: {str(e)}"
                    self.logger.error(error_msg, tag="ERROR")
                    raise Crawl4aiClientError(error_msg)
            return result_generator()

        # Non-streaming: one POST, whole result set in the JSON body.
        response = await self._make_request("POST", "/crawl", json=request_data)
        response_data = response.json()

        if not response_data.get("success", False):
            error_msg = f"Crawl operation failed: {response_data.get('error', 'Unknown error')}"
            self.logger.error(error_msg, tag="ERROR")
            raise RequestError(error_msg)

        results = [CrawlResult(**result_dict) for result_dict in response_data.get("results", [])]
        self.logger.success(f"Crawl completed successfully with {len(results)} results", tag="COMPLETE")
        # Single-URL convenience: unwrap a one-element list.
        return results[0] if len(results) == 1 else results

    async def get_schema(self) -> Dict[str, Any]:
        """Retrieve the configuration schemas from the server."""
        self.logger.info("Retrieving schema from server", tag="FETCH")
        response = await self._make_request("GET", "/schema")
        self.logger.success("Schema retrieved successfully", tag="COMPLETE")
        return response.json()

    async def close(self) -> None:
        """Close the HTTP client session."""
        self.logger.info("Closing client connection", tag="COMPLETE")
        await self._http_client.aclose()

    async def __aenter__(self) -> "Crawl4aiDockerClient":
        # Async context manager entry; connection is checked lazily by crawl().
        return self

    async def __aexit__(self, exc_type: Optional[type], exc_val: Optional[Exception], exc_tb: Optional[Any]) -> None:
        await self.close()

View File

@@ -1,8 +1,6 @@
from abc import ABC, abstractmethod
from pathlib import Path from pathlib import Path
from typing import Dict, List, Optional import asyncio
from dataclasses import asdict from dataclasses import asdict
from crawl4ai.async_logger import AsyncLogger from crawl4ai.async_logger import AsyncLogger
from crawl4ai.async_crawler_strategy import AsyncCrawlerStrategy from crawl4ai.async_crawler_strategy import AsyncCrawlerStrategy
from crawl4ai.models import AsyncCrawlResponse, ScrapingResult from crawl4ai.models import AsyncCrawlResponse, ScrapingResult

View File

@@ -5,21 +5,22 @@ from datetime import datetime
from pathlib import Path from pathlib import Path
from time import time from time import time
from dataclasses import dataclass, asdict, field from dataclasses import dataclass, asdict, field
from typing import Dict, List, Optional, Tuple from typing import Dict, List, Optional, Any, Union
import PyPDF2
from PIL import Image
from PyPDF2 import PdfReader
from .utils import *
import base64 import base64
import tempfile import tempfile
from .utils import *
from .utils import (
apply_png_predictor,
clean_pdf_text,
clean_pdf_text_to_html,
)
# Remove direct PyPDF2 imports from the top
# import PyPDF2
# from PyPDF2 import PdfReader
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
from dataclasses import dataclass, field
from datetime import datetime
from typing import List, Optional, Dict, Any
from pathlib import Path
@dataclass @dataclass
class PDFMetadata: class PDFMetadata:
title: Optional[str] = None title: Optional[str] = None
@@ -35,8 +36,8 @@ class PDFMetadata:
class PDFPage: class PDFPage:
page_number: int page_number: int
raw_text: str = "" raw_text: str = ""
markdown: str = "" # Added per your request markdown: str = ""
html: str = "" # Added per your request html: str = ""
images: List[Dict] = field(default_factory=list) images: List[Dict] = field(default_factory=list)
links: List[str] = field(default_factory=list) links: List[str] = field(default_factory=list)
layout: List[Dict] = field(default_factory=list) layout: List[Dict] = field(default_factory=list)
@@ -56,6 +57,12 @@ class PDFProcessorStrategy(ABC):
class NaivePDFProcessorStrategy(PDFProcessorStrategy): class NaivePDFProcessorStrategy(PDFProcessorStrategy):
def __init__(self, image_dpi: int = 144, image_quality: int = 85, extract_images: bool = True, def __init__(self, image_dpi: int = 144, image_quality: int = 85, extract_images: bool = True,
save_images_locally: bool = False, image_save_dir: Optional[Path] = None, batch_size: int = 4): save_images_locally: bool = False, image_save_dir: Optional[Path] = None, batch_size: int = 4):
# Import check at initialization time
try:
import PyPDF2
except ImportError:
raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
self.image_dpi = image_dpi self.image_dpi = image_dpi
self.image_quality = image_quality self.image_quality = image_quality
self.current_page_number = 0 self.current_page_number = 0
@@ -66,6 +73,12 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):
self._temp_dir = None self._temp_dir = None
def process(self, pdf_path: Path) -> PDFProcessResult: def process(self, pdf_path: Path) -> PDFProcessResult:
# Import inside method to allow dependency to be optional
try:
from PyPDF2 import PdfReader
except ImportError:
raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
start_time = time() start_time = time()
result = PDFProcessResult( result = PDFProcessResult(
metadata=PDFMetadata(), metadata=PDFMetadata(),
@@ -110,6 +123,13 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):
def process_batch(self, pdf_path: Path) -> PDFProcessResult: def process_batch(self, pdf_path: Path) -> PDFProcessResult:
"""Like process() but processes PDF pages in parallel batches""" """Like process() but processes PDF pages in parallel batches"""
# Import inside method to allow dependency to be optional
try:
from PyPDF2 import PdfReader
import PyPDF2 # For type checking
except ImportError:
raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
import concurrent.futures import concurrent.futures
import threading import threading
@@ -212,6 +232,12 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):
return pdf_page return pdf_page
def _extract_images(self, page, image_dir: Optional[Path]) -> List[Dict]: def _extract_images(self, page, image_dir: Optional[Path]) -> List[Dict]:
# Import PyPDF2 for type checking only when needed
try:
import PyPDF2
except ImportError:
raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
if not self.extract_images: if not self.extract_images:
return [] return []
@@ -262,6 +288,7 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):
data = apply_png_predictor(data, width, bits, colors) data = apply_png_predictor(data, width, bits, colors)
# Create PIL Image # Create PIL Image
from PIL import Image
mode = 'RGB' if color_space == '/DeviceRGB' else 'L' mode = 'RGB' if color_space == '/DeviceRGB' else 'L'
img = Image.frombytes(mode, (width, height), data) img = Image.frombytes(mode, (width, height), data)
@@ -385,9 +412,14 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):
print(f"Link error: {str(e)}") print(f"Link error: {str(e)}")
return links return links
def _extract_metadata(self, pdf_path: Path, reader: PdfReader = None) -> PDFMetadata: def _extract_metadata(self, pdf_path: Path, reader = None) -> PDFMetadata:
if not reader: # Import inside method to allow dependency to be optional
reader = PdfReader(pdf_path) if reader is None:
try:
from PyPDF2 import PdfReader
reader = PdfReader(pdf_path)
except ImportError:
raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
meta = reader.metadata or {} meta = reader.metadata or {}
created = self._parse_pdf_date(meta.get('/CreationDate', '')) created = self._parse_pdf_date(meta.get('/CreationDate', ''))
@@ -425,6 +457,15 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):
if __name__ == "__main__": if __name__ == "__main__":
import json import json
from pathlib import Path from pathlib import Path
try:
# Import PyPDF2 only when running the file directly
import PyPDF2
from PyPDF2 import PdfReader
except ImportError:
print("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
exit(1)
current_dir = Path(__file__).resolve().parent current_dir = Path(__file__).resolve().parent
pdf_path = f'{current_dir}/test.pdf' pdf_path = f'{current_dir}/test.pdf'

View File

@@ -1,137 +0,0 @@
FROM python:3.10-slim
# ---- Build-time configuration -------------------------------------------
ARG APP_HOME=/app
ARG GITHUB_REPO=https://github.com/unclecode/crawl4ai.git
ARG GITHUB_BRANCH=next
# NOTE(review): USE_LOCAL is declared but never referenced below — confirm
# whether it is still needed or can be removed.
ARG USE_LOCAL=False
ARG CONFIG_PATH=""
# Fail-fast Python runtime settings, quieter pip, and local Redis defaults.
ENV PYTHONFAULTHANDLER=1 \
    PYTHONHASHSEED=random \
    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_DEFAULT_TIMEOUT=100 \
    DEBIAN_FRONTEND=noninteractive \
    REDIS_HOST=localhost \
    REDIS_PORT=6379
# NOTE(review): PYTHON_VERSION is declared but unused in this file (the base
# image pins 3.10 directly) — verify before relying on it.
ARG PYTHON_VERSION=3.10
ARG INSTALL_TYPE=default
ARG ENABLE_GPU=false
# TARGETARCH is populated automatically by docker buildx (amd64 / arm64).
ARG TARGETARCH
LABEL maintainer="unclecode"
LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
LABEL version="1.0"
# Base build/runtime packages, plus Redis and supervisor for in-container
# process management.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    curl \
    wget \
    gnupg \
    git \
    cmake \
    pkg-config \
    python3-dev \
    libjpeg-dev \
    redis-server \
    supervisor \
    && rm -rf /var/lib/apt/lists/*
# Shared libraries required by headless Chromium (Playwright).
RUN apt-get update && apt-get install -y --no-install-recommends \
    libglib2.0-0 \
    libnss3 \
    libnspr4 \
    libatk1.0-0 \
    libatk-bridge2.0-0 \
    libcups2 \
    libdrm2 \
    libdbus-1-3 \
    libxcb1 \
    libxkbcommon0 \
    libx11-6 \
    libxcomposite1 \
    libxdamage1 \
    libxext6 \
    libxfixes3 \
    libxrandr2 \
    libgbm1 \
    libpango-1.0-0 \
    libcairo2 \
    libasound2 \
    libatspi2.0-0 \
    && rm -rf /var/lib/apt/lists/*
# Optional CUDA toolkit — only when GPU support is requested AND the target
# platform is amd64 (no CUDA packages on arm64 here).
RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETARCH" = "amd64" ] ; then \
    apt-get update && apt-get install -y --no-install-recommends \
    nvidia-cuda-toolkit \
    && rm -rf /var/lib/apt/lists/* ; \
else \
    echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \
fi
# Per-architecture BLAS/OpenMP optimizations.
RUN if [ "$TARGETARCH" = "arm64" ]; then \
    echo "🦾 Installing ARM-specific optimizations"; \
    apt-get update && apt-get install -y --no-install-recommends \
    libopenblas-dev \
    && rm -rf /var/lib/apt/lists/*; \
elif [ "$TARGETARCH" = "amd64" ]; then \
    echo "🖥️ Installing AMD64-specific optimizations"; \
    apt-get update && apt-get install -y --no-install-recommends \
    libomp-dev \
    && rm -rf /var/lib/apt/lists/*; \
else \
    echo "Skipping platform-specific optimizations (unsupported platform)"; \
fi
WORKDIR ${APP_HOME}
# Source is cloned from GitHub (branch configurable), then installed below.
RUN git clone --branch ${GITHUB_BRANCH} ${GITHUB_REPO} /tmp/crawl4ai
COPY docker/supervisord.conf .
COPY docker/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Install crawl4ai with the extras selected by INSTALL_TYPE; some extras also
# need NLTK corpora and pre-downloaded models.
RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
        pip install "/tmp/crawl4ai/[all]" && \
        python -m nltk.downloader punkt stopwords && \
        python -m crawl4ai.model_loader ; \
    elif [ "$INSTALL_TYPE" = "torch" ] ; then \
        pip install "/tmp/crawl4ai/[torch]" ; \
    elif [ "$INSTALL_TYPE" = "transformer" ] ; then \
        pip install "/tmp/crawl4ai/[transformer]" && \
        python -m crawl4ai.model_loader ; \
    else \
        pip install "/tmp/crawl4ai" ; \
    fi
# Smoke-test the installation before baking the image.
RUN pip install --no-cache-dir --upgrade pip && \
    python -c "import crawl4ai; print('✅ crawl4ai is ready to rock!')" && \
    python -c "from playwright.sync_api import sync_playwright; print('✅ Playwright is feeling dramatic!')"
RUN playwright install --with-deps chromium
COPY docker/* ${APP_HOME}/
# Allow a caller-supplied config file to override the default.
RUN if [ -n "$CONFIG_PATH" ] && [ -f "$CONFIG_PATH" ]; then \
    echo "Using custom config from $CONFIG_PATH" && \
    cp $CONFIG_PATH /app/config.yml; \
fi
# Health is: >=2GB RAM available, Redis answering PING, and the HTTP /health
# endpoint responding.
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD bash -c '\
    MEM=$(free -m | awk "/^Mem:/{print \$2}"); \
    if [ $MEM -lt 2048 ]; then \
        echo "⚠️ Warning: Less than 2GB RAM available! Your container might need a memory boost! 🚀"; \
        exit 1; \
    fi && \
    redis-cli ping > /dev/null && \
    curl -f http://localhost:8000/health || exit 1'
# EXPOSE 6379
# supervisord manages both redis-server and the API server processes.
CMD ["supervisord", "-c", "supervisord.conf"]

View File

@@ -6,6 +6,7 @@ from crawl4ai import (
CacheMode, CacheMode,
DefaultMarkdownGenerator, DefaultMarkdownGenerator,
PruningContentFilter, PruningContentFilter,
CrawlResult
) )
@@ -20,10 +21,12 @@ async def main():
) )
), ),
) )
result = await crawler.arun( result : CrawlResult = await crawler.arun(
url="https://www.helloworld.org", config=crawler_config # url="https://www.helloworld.org", config=crawler_config
url="https://www.kidocode.com", config=crawler_config
) )
print(result.markdown_v2.raw_markdown[:500]) print(result.markdown_v2.raw_markdown[:500])
# print(result.model_dump())
if __name__ == "__main__": if __name__ == "__main__":

View File

@@ -56,6 +56,7 @@ classifiers = [
] ]
[project.optional-dependencies] [project.optional-dependencies]
pdf = ["PyPDF2"]
torch = ["torch", "nltk", "scikit-learn"] torch = ["torch", "nltk", "scikit-learn"]
transformer = ["transformers", "tokenizers"] transformer = ["transformers", "tokenizers"]
cosine = ["torch", "transformers", "nltk"] cosine = ["torch", "transformers", "nltk"]
@@ -66,7 +67,8 @@ all = [
"scikit-learn", "scikit-learn",
"transformers", "transformers",
"tokenizers", "tokenizers",
"selenium" "selenium",
"PyPDF2"
] ]
[project.scripts] [project.scripts]