From 4bcd4cbda18eaf45cf2c1ff3e3fe34f11c7eff11 Mon Sep 17 00:00:00 2001
From: UncleCode
Date: Tue, 25 Feb 2025 22:27:55 +0800
Subject: [PATCH] refactor(pdf): improve PDF processor dependency handling

Make PyPDF2 an optional dependency and improve import handling in the
PDF processor. Move imports inside methods to allow lazy loading and
better error handling. Add a new 'pdf' optional dependency group in
pyproject.toml. Clean up unused imports and remove deprecated files.

BREAKING CHANGE: PyPDF2 is now an optional dependency. Users need to
install with 'pip install crawl4ai[pdf]' to use PDF processing features.
---
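Note for reviewers (this sits below the scissors line, so 'git am' drops
it from the commit): the patch defers every PyPDF2 import to the call
sites that need it, so 'import crawl4ai' no longer hard-requires the
package. A minimal sketch of that deferred-import pattern; the helper
name _require_pypdf2 is illustrative only and not part of the change:

    def _require_pypdf2():
        # Resolve the optional dependency at call time, not at module import.
        try:
            from PyPDF2 import PdfReader
        except ImportError:
            raise ImportError(
                "PyPDF2 is required for PDF processing. "
                "Install with 'pip install crawl4ai[pdf]'"
            )
        return PdfReader

    class LazyPDFReaderExample:
        def process(self, pdf_path):
            PdfReader = _require_pypdf2()  # fails here, not at package import
            return PdfReader(pdf_path)

The same try/except block recurs in __init__, process, process_batch,
_extract_images and _extract_metadata below, so the error message always
points users at the new 'pdf' extra.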
 Dockerfile                           |  18 ---
 crawl4ai/async_webcrawler.py         |   4 +-
 crawl4ai/docker_client copy.py       | 210 ---------------------------
 crawl4ai/processors/pdf/__init__.py  |   4 +-
 crawl4ai/processors/pdf/processor.py |  71 +++++++--
 deploy/Dockerfile                    | 137 -----------------
 docs/examples/hello_world.py         |   7 +-
 pyproject.toml                       |   4 +-
 8 files changed, 67 insertions(+), 388 deletions(-)
 delete mode 100644 crawl4ai/docker_client copy.py
 delete mode 100644 deploy/Dockerfile

diff --git a/Dockerfile b/Dockerfile
index b7e5e07f..9796bcb6 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -151,24 +151,6 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
     redis-cli ping > /dev/null && \
     curl -f http://localhost:8000/health || exit 1'
 
-# COPY deploy/docker/docker-entrypoint.sh /usr/local/bin/
-# RUN chmod +x /usr/local/bin/docker-entrypoint.sh
-
 EXPOSE 6379
-
-# ENTRYPOINT ["docker-entrypoint.sh"]
-
-# CMD service redis-server start && gunicorn \
-#     --bind 0.0.0.0:8000 \
-#     --workers 4 \
-#     --threads 2 \
-#     --timeout 120 \
-#     --graceful-timeout 30 \
-#     --log-level info \
-#     --worker-class uvicorn.workers.UvicornWorker \
-#     server:app
-
-# ENTRYPOINT ["docker-entrypoint.sh"]
-
 CMD ["supervisord", "-c", "supervisord.conf"]
diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index 1a4cdcef..47c6778a 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -10,7 +10,7 @@ import asyncio
 
 # from contextlib import nullcontext, asynccontextmanager
 from contextlib import asynccontextmanager
-from .models import CrawlResult, MarkdownGenerationResult,DispatchResult
+from .models import CrawlResult, MarkdownGenerationResult, DispatchResult, ScrapingResult
 from .async_database import async_db_manager
 from .chunking_strategy import *  # noqa: F403
 from .chunking_strategy import RegexChunking, ChunkingStrategy, IdentityChunking
@@ -537,7 +537,7 @@ class AsyncWebCrawler:
             ################################
             # Scraping Strategy Execution #
             ################################
-            result = scraping_strategy.scrap(url, html, **params)
+            result : ScrapingResult = scraping_strategy.scrap(url, html, **params)
 
             if result is None:
                 raise ValueError(
diff --git a/crawl4ai/docker_client copy.py b/crawl4ai/docker_client copy.py
deleted file mode 100644
index 7c0fce1c..00000000
--- a/crawl4ai/docker_client copy.py
+++ /dev/null
@@ -1,210 +0,0 @@
-from typing import List, Optional, Union, AsyncGenerator, Dict, Any
-import httpx
-import json
-from urllib.parse import urljoin
-
-from .async_configs import BrowserConfig, CrawlerRunConfig
-from .models import CrawlResult
-from .async_logger import AsyncLogger, LogLevel
-
-
-class Crawl4aiClientError(Exception):
-    """Base exception for Crawl4ai Docker client errors."""
-    pass
-
-
-class ConnectionError(Crawl4aiClientError):
-    """Raised when connection to the Docker server fails."""
-    pass
-
-
-class RequestError(Crawl4aiClientError):
-    """Raised when the server returns an error response."""
-    pass
-
-
-class Crawl4aiDockerClient:
-    """
-    Client for interacting with Crawl4AI Docker server.
-
-    Args:
-        base_url (str): Base URL of the Crawl4AI Docker server
-        timeout (float): Default timeout for requests in seconds
-        verify_ssl (bool): Whether to verify SSL certificates
-        verbose (bool): Whether to show logging output
-        log_file (str, optional): Path to log file if file logging is desired
-    """
-
-    def __init__(
-        self,
-        base_url: str = "http://localhost:8000",
-        timeout: float = 30.0,
-        verify_ssl: bool = True,
-        verbose: bool = True,
-        log_file: Optional[str] = None
-    ) -> None:
-        self.base_url = base_url.rstrip('/')
-        self.timeout = timeout
-        self._http_client = httpx.AsyncClient(
-            timeout=timeout,
-            verify=verify_ssl,
-            headers={"Content-Type": "application/json"}
-        )
-        self.logger = AsyncLogger(
-            log_file=log_file,
-            log_level=LogLevel.DEBUG,
-            verbose=verbose
-        )
-
-    async def _check_server_connection(self) -> bool:
-        """Check if server is reachable."""
-        try:
-            self.logger.info("Checking server connection...", tag="INIT")
-            response = await self._http_client.get(f"{self.base_url}/health")
-            response.raise_for_status()
-            self.logger.success(f"Connected to server at {self.base_url}", tag="READY")
-            return True
-        except Exception as e:
-            self.logger.error(f"Failed to connect to server: {str(e)}", tag="ERROR")
-            return False
-
-    def _prepare_request_data(
-        self,
-        urls: List[str],
-        browser_config: Optional[BrowserConfig] = None,
-        crawler_config: Optional[CrawlerRunConfig] = None
-    ) -> Dict[str, Any]:
-        """Prepare request data from configs using dump methods."""
-        self.logger.debug("Preparing request data", tag="INIT")
-        data = {
-            "urls": urls,
-            "browser_config": browser_config.dump() if browser_config else {},
-            "crawler_config": crawler_config.dump() if crawler_config else {}
-        }
-        self.logger.debug(f"Request data prepared for {len(urls)} URLs", tag="READY")
-        return data
-
-    async def _make_request(
-        self,
-        method: str,
-        endpoint: str,
-        **kwargs
-    ) -> Union[Dict, AsyncGenerator]:
-        """Make HTTP request to the server with error handling."""
-        url = urljoin(self.base_url, endpoint)
-
-        try:
-            self.logger.debug(f"Making {method} request to {endpoint}", tag="FETCH")
-            response = await self._http_client.request(method, url, **kwargs)
-            response.raise_for_status()
-            self.logger.success(f"Request to {endpoint} successful", tag="COMPLETE")
-            return response
-        except httpx.TimeoutException as e:
-            error_msg = f"Request timed out: {str(e)}"
-            self.logger.error(error_msg, tag="ERROR")
-            raise ConnectionError(error_msg)
-        except httpx.RequestError as e:
-            error_msg = f"Failed to connect to server: {str(e)}"
-            self.logger.error(error_msg, tag="ERROR")
-            raise ConnectionError(error_msg)
-        except httpx.HTTPStatusError as e:
-            error_detail = ""
-            try:
-                error_data = e.response.json()
-                error_detail = error_data.get('detail', str(e))
-            except (json.JSONDecodeError, AttributeError) as json_err:
-                error_detail = f"{str(e)} (Failed to parse error response: {str(json_err)})"
-
-            error_msg = f"Server returned error {e.response.status_code}: {error_detail}"
-            self.logger.error(error_msg, tag="ERROR")
-            raise RequestError(error_msg)
-
-    async def crawl(
-        self,
-        urls: List[str],
-        browser_config: Optional[BrowserConfig] = None,
-        crawler_config: Optional[CrawlerRunConfig] = None
-    ) -> Union[CrawlResult, AsyncGenerator[CrawlResult, None]]:
-        """Execute a crawl operation through the Docker server."""
-        # Check server connection first
-        if not await self._check_server_connection():
-            raise ConnectionError("Cannot proceed with crawl - server is not reachable")
-
-        request_data = self._prepare_request_data(urls, browser_config, crawler_config)
-        is_streaming = crawler_config.stream if crawler_config else False
-
-        self.logger.info(
-            f"Starting crawl for {len(urls)} URLs {'(streaming)' if is_streaming else ''}",
-            tag="INIT"
-        )
-
-        if is_streaming:
-            async def result_generator() -> AsyncGenerator[CrawlResult, None]:
-                try:
-                    async with self._http_client.stream(
-                        "POST",
-                        f"{self.base_url}/crawl",
-                        json=request_data,
-                        timeout=None
-                    ) as response:
-                        response.raise_for_status()
-                        async for line in response.aiter_lines():
-                            if line.strip():
-                                try:
-                                    result_dict = json.loads(line)
-                                    if "error" in result_dict:
-                                        self.logger.error_status(
-                                            url=result_dict.get('url', 'unknown'),
-                                            error=result_dict['error']
-                                        )
-                                        continue
-
-                                    self.logger.url_status(
-                                        url=result_dict.get('url', 'unknown'),
-                                        success=True,
-                                        timing=result_dict.get('timing', 0.0)
-                                    )
-                                    yield CrawlResult(**result_dict)
-                                except json.JSONDecodeError as e:
-                                    self.logger.error(f"Failed to parse server response: {e}", tag="ERROR")
-                                    continue
-                except httpx.StreamError as e:
-                    error_msg = f"Stream connection error: {str(e)}"
-                    self.logger.error(error_msg, tag="ERROR")
-                    raise ConnectionError(error_msg)
-                except Exception as e:
-                    error_msg = f"Unexpected error during streaming: {str(e)}"
-                    self.logger.error(error_msg, tag="ERROR")
-                    raise Crawl4aiClientError(error_msg)
-
-            return result_generator()
-
-        response = await self._make_request("POST", "/crawl", json=request_data)
-        response_data = response.json()
-
-        if not response_data.get("success", False):
-            error_msg = f"Crawl operation failed: {response_data.get('error', 'Unknown error')}"
-            self.logger.error(error_msg, tag="ERROR")
-            raise RequestError(error_msg)
-
-        results = [CrawlResult(**result_dict) for result_dict in response_data.get("results", [])]
-        self.logger.success(f"Crawl completed successfully with {len(results)} results", tag="COMPLETE")
-        return results[0] if len(results) == 1 else results
-
-    async def get_schema(self) -> Dict[str, Any]:
-        """Retrieve the configuration schemas from the server."""
-        self.logger.info("Retrieving schema from server", tag="FETCH")
-        response = await self._make_request("GET", "/schema")
-        self.logger.success("Schema retrieved successfully", tag="COMPLETE")
-        return response.json()
-
-    async def close(self) -> None:
-        """Close the HTTP client session."""
-        self.logger.info("Closing client connection", tag="COMPLETE")
-        await self._http_client.aclose()
-
-    async def __aenter__(self) -> "Crawl4aiDockerClient":
-        return self
-
-    async def __aexit__(self, exc_type: Optional[type], exc_val: Optional[Exception], exc_tb: Optional[Any]) -> None:
-        await self.close()
\ No newline at end of file
diff --git a/crawl4ai/processors/pdf/__init__.py b/crawl4ai/processors/pdf/__init__.py
index ac55e9de..947641cb 100644
--- a/crawl4ai/processors/pdf/__init__.py
+++ b/crawl4ai/processors/pdf/__init__.py
@@ -1,8 +1,6 @@
-from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import Dict, List, Optional
+import asyncio
 from dataclasses import asdict
-
 from crawl4ai.async_logger import AsyncLogger
 from crawl4ai.async_crawler_strategy import AsyncCrawlerStrategy
 from crawl4ai.models import AsyncCrawlResponse, ScrapingResult
diff --git a/crawl4ai/processors/pdf/processor.py b/crawl4ai/processors/pdf/processor.py
index 9dc58a2e..16963f70 100644
--- a/crawl4ai/processors/pdf/processor.py
+++ b/crawl4ai/processors/pdf/processor.py
@@ -5,21 +5,22 @@ from datetime import datetime
 from pathlib import Path
 from time import time
 from dataclasses import dataclass, asdict, field
-from typing import Dict, List, Optional, Tuple
-import PyPDF2
-from PIL import Image
-from PyPDF2 import PdfReader
-from .utils import *
+from typing import Dict, List, Optional, Any, Union
 import base64
 import tempfile
+from .utils import *
+from .utils import (
+    apply_png_predictor,
+    clean_pdf_text,
+    clean_pdf_text_to_html,
+)
+
+# Remove direct PyPDF2 imports from the top
+# import PyPDF2
+# from PyPDF2 import PdfReader
 
 logger = logging.getLogger(__name__)
 
-from dataclasses import dataclass, field
-from datetime import datetime
-from typing import List, Optional, Dict, Any
-from pathlib import Path
-
 @dataclass
 class PDFMetadata:
     title: Optional[str] = None
@@ -35,8 +36,8 @@ class PDFPage:
     page_number: int
     raw_text: str = ""
-    markdown: str = ""  # Added per your request
-    html: str = ""  # Added per your request
+    markdown: str = ""
+    html: str = ""
     images: List[Dict] = field(default_factory=list)
     links: List[str] = field(default_factory=list)
     layout: List[Dict] = field(default_factory=list)
@@ -56,6 +57,12 @@ class PDFProcessorStrategy(ABC):
 class NaivePDFProcessorStrategy(PDFProcessorStrategy):
     def __init__(self, image_dpi: int = 144, image_quality: int = 85, extract_images: bool = True,
                  save_images_locally: bool = False, image_save_dir: Optional[Path] = None, batch_size: int = 4):
+        # Import check at initialization time
+        try:
+            import PyPDF2
+        except ImportError:
+            raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
+
         self.image_dpi = image_dpi
         self.image_quality = image_quality
         self.current_page_number = 0
@@ -66,6 +73,12 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):
         self._temp_dir = None
 
     def process(self, pdf_path: Path) -> PDFProcessResult:
+        # Import inside method to allow dependency to be optional
+        try:
+            from PyPDF2 import PdfReader
+        except ImportError:
+            raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
+
         start_time = time()
         result = PDFProcessResult(
             metadata=PDFMetadata(),
@@ -110,6 +123,13 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):
 
     def process_batch(self, pdf_path: Path) -> PDFProcessResult:
         """Like process() but processes PDF pages in parallel batches"""
+        # Import inside method to allow dependency to be optional
+        try:
+            from PyPDF2 import PdfReader
+            import PyPDF2  # For type checking
+        except ImportError:
+            raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
+
         import concurrent.futures
         import threading
@@ -212,6 +232,12 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):
         return pdf_page
 
     def _extract_images(self, page, image_dir: Optional[Path]) -> List[Dict]:
+        # Import PyPDF2 for type checking only when needed
+        try:
+            import PyPDF2
+        except ImportError:
+            raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
+
         if not self.extract_images:
             return []
@@ -262,6 +288,7 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):
                     data = apply_png_predictor(data, width, bits, colors)
 
                 # Create PIL Image
+                from PIL import Image
                 mode = 'RGB' if color_space == '/DeviceRGB' else 'L'
                 img = Image.frombytes(mode, (width, height), data)
@@ -385,9 +412,14 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):
             print(f"Link error: {str(e)}")
         return links
 
-    def _extract_metadata(self, pdf_path: Path, reader: PdfReader = None) -> PDFMetadata:
-        if not reader:
-            reader = PdfReader(pdf_path)
+    def _extract_metadata(self, pdf_path: Path, reader = None) -> PDFMetadata:
+        # Import inside method to allow dependency to be optional
+        if reader is None:
+            try:
+                from PyPDF2 import PdfReader
+                reader = PdfReader(pdf_path)
+            except ImportError:
+                raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
 
         meta = reader.metadata or {}
         created = self._parse_pdf_date(meta.get('/CreationDate', ''))
@@ -425,6 +457,15 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):
 if __name__ == "__main__":
     import json
     from pathlib import Path
+
+    try:
+        # Import PyPDF2 only when running the file directly
+        import PyPDF2
+        from PyPDF2 import PdfReader
+    except ImportError:
+        print("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
+        exit(1)
+
     current_dir = Path(__file__).resolve().parent
     pdf_path = f'{current_dir}/test.pdf'
diff --git a/deploy/Dockerfile b/deploy/Dockerfile
deleted file mode 100644
index 3043bd57..00000000
--- a/deploy/Dockerfile
+++ /dev/null
@@ -1,137 +0,0 @@
-FROM python:3.10-slim
-
-# Set build arguments
-ARG APP_HOME=/app
-ARG GITHUB_REPO=https://github.com/unclecode/crawl4ai.git
-ARG GITHUB_BRANCH=next
-ARG USE_LOCAL=False
-ARG CONFIG_PATH=""
-
-ENV PYTHONFAULTHANDLER=1 \
-    PYTHONHASHSEED=random \
-    PYTHONUNBUFFERED=1 \
-    PIP_NO_CACHE_DIR=1 \
-    PYTHONDONTWRITEBYTECODE=1 \
-    PIP_DISABLE_PIP_VERSION_CHECK=1 \
-    PIP_DEFAULT_TIMEOUT=100 \
-    DEBIAN_FRONTEND=noninteractive \
-    REDIS_HOST=localhost \
-    REDIS_PORT=6379
-
-ARG PYTHON_VERSION=3.10
-ARG INSTALL_TYPE=default
-ARG ENABLE_GPU=false
-ARG TARGETARCH
-
-LABEL maintainer="unclecode"
-LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
-LABEL version="1.0"
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    build-essential \
-    curl \
-    wget \
-    gnupg \
-    git \
-    cmake \
-    pkg-config \
-    python3-dev \
-    libjpeg-dev \
-    redis-server \
-    supervisor \
-    && rm -rf /var/lib/apt/lists/*
-
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    libglib2.0-0 \
-    libnss3 \
-    libnspr4 \
-    libatk1.0-0 \
-    libatk-bridge2.0-0 \
-    libcups2 \
-    libdrm2 \
-    libdbus-1-3 \
-    libxcb1 \
-    libxkbcommon0 \
-    libx11-6 \
-    libxcomposite1 \
-    libxdamage1 \
-    libxext6 \
-    libxfixes3 \
-    libxrandr2 \
-    libgbm1 \
-    libpango-1.0-0 \
-    libcairo2 \
-    libasound2 \
-    libatspi2.0-0 \
-    && rm -rf /var/lib/apt/lists/*
-
-RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETARCH" = "amd64" ] ; then \
-    apt-get update && apt-get install -y --no-install-recommends \
-    nvidia-cuda-toolkit \
-    && rm -rf /var/lib/apt/lists/* ; \
-else \
-    echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \
-fi
-
-RUN if [ "$TARGETARCH" = "arm64" ]; then \
-    echo "🦾 Installing ARM-specific optimizations"; \
-    apt-get update && apt-get install -y --no-install-recommends \
-    libopenblas-dev \
-    && rm -rf /var/lib/apt/lists/*; \
-elif [ "$TARGETARCH" = "amd64" ]; then \
-    echo "🖥️ Installing AMD64-specific optimizations"; \
-    apt-get update && apt-get install -y --no-install-recommends \
-    libomp-dev \
-    && rm -rf /var/lib/apt/lists/*; \
-else \
-    echo "Skipping platform-specific optimizations (unsupported platform)"; \
-fi
-
-WORKDIR ${APP_HOME}
-
-RUN git clone --branch ${GITHUB_BRANCH} ${GITHUB_REPO} /tmp/crawl4ai
-
-COPY docker/supervisord.conf .
-COPY docker/requirements.txt .
-
-RUN pip install --no-cache-dir -r requirements.txt
-
-RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
-        pip install "/tmp/crawl4ai/[all]" && \
-        python -m nltk.downloader punkt stopwords && \
-        python -m crawl4ai.model_loader ; \
-    elif [ "$INSTALL_TYPE" = "torch" ] ; then \
-        pip install "/tmp/crawl4ai/[torch]" ; \
-    elif [ "$INSTALL_TYPE" = "transformer" ] ; then \
-        pip install "/tmp/crawl4ai/[transformer]" && \
-        python -m crawl4ai.model_loader ; \
-    else \
-        pip install "/tmp/crawl4ai" ; \
-    fi
-
-RUN pip install --no-cache-dir --upgrade pip && \
-    python -c "import crawl4ai; print('✅ crawl4ai is ready to rock!')" && \
-    python -c "from playwright.sync_api import sync_playwright; print('✅ Playwright is feeling dramatic!')"
-
-RUN playwright install --with-deps chromium
-
-COPY docker/* ${APP_HOME}/
-RUN if [ -n "$CONFIG_PATH" ] && [ -f "$CONFIG_PATH" ]; then \
-    echo "Using custom config from $CONFIG_PATH" && \
-    cp $CONFIG_PATH /app/config.yml; \
-fi
-
-HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
-    CMD bash -c '\
-    MEM=$(free -m | awk "/^Mem:/{print \$2}"); \
-    if [ $MEM -lt 2048 ]; then \
-        echo "⚠️ Warning: Less than 2GB RAM available! Your container might need a memory boost! 🚀"; \
-        exit 1; \
-    fi && \
-    redis-cli ping > /dev/null && \
-    curl -f http://localhost:8000/health || exit 1'
-
-# EXPOSE 6379
-
-CMD ["supervisord", "-c", "supervisord.conf"]
-
diff --git a/docs/examples/hello_world.py b/docs/examples/hello_world.py
index 89f3188a..ae74a4b3 100644
--- a/docs/examples/hello_world.py
+++ b/docs/examples/hello_world.py
@@ -6,6 +6,7 @@ from crawl4ai import (
     CacheMode,
     DefaultMarkdownGenerator,
     PruningContentFilter,
+    CrawlResult
 )
 
 
@@ -20,10 +21,12 @@ async def main():
             )
         ),
     )
-    result = await crawler.arun(
-        url="https://www.helloworld.org", config=crawler_config
+    result : CrawlResult = await crawler.arun(
+        # url="https://www.helloworld.org", config=crawler_config
+        url="https://www.kidocode.com", config=crawler_config
     )
     print(result.markdown_v2.raw_markdown[:500])
+    # print(result.model_dump())
 
 
 if __name__ == "__main__":
diff --git a/pyproject.toml b/pyproject.toml
index f59eabd1..bcee2974 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -56,6 +56,7 @@ classifiers = [
 ]
 
 [project.optional-dependencies]
+pdf = ["PyPDF2"]
 torch = ["torch", "nltk", "scikit-learn"]
 transformer = ["transformers", "tokenizers"]
 cosine = ["torch", "transformers", "nltk"]
@@ -66,7 +67,8 @@ all = [
     "scikit-learn",
     "transformers",
     "tokenizers",
-    "selenium"
+    "selenium",
+    "PyPDF2"
 ]
 
 [project.scripts]
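-- 
Usage note (trailer commentary, not part of the commit): with the new
'pdf' extra in pyproject.toml, PDF processing becomes opt-in. A minimal
sketch against the strategy class this patch touches; 'sample.pdf' is a
placeholder path, and the printed field follows the PDFMetadata dataclass
in processor.py:

    # pip install "crawl4ai[pdf]"
    from pathlib import Path
    from crawl4ai.processors.pdf.processor import NaivePDFProcessorStrategy

    # The constructor now raises ImportError with the install hint
    # if PyPDF2 is missing.
    processor = NaivePDFProcessorStrategy(extract_images=False)
    result = processor.process(Path("sample.pdf"))
    print(result.metadata.title)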