diff --git a/Dockerfile b/Dockerfile index b7e5e07f..9796bcb6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -151,24 +151,6 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ redis-cli ping > /dev/null && \ curl -f http://localhost:8000/health || exit 1' -# COPY deploy/docker/docker-entrypoint.sh /usr/local/bin/ -# RUN chmod +x /usr/local/bin/docker-entrypoint.sh - EXPOSE 6379 - -# ENTRYPOINT ["docker-entrypoint.sh"] - -# CMD service redis-server start && gunicorn \ -# --bind 0.0.0.0:8000 \ -# --workers 4 \ -# --threads 2 \ -# --timeout 120 \ -# --graceful-timeout 30 \ -# --log-level info \ -# --worker-class uvicorn.workers.UvicornWorker \ -# server:app - -# ENTRYPOINT ["docker-entrypoint.sh"] - CMD ["supervisord", "-c", "supervisord.conf"] diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 1a4cdcef..47c6778a 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -10,7 +10,7 @@ import asyncio # from contextlib import nullcontext, asynccontextmanager from contextlib import asynccontextmanager -from .models import CrawlResult, MarkdownGenerationResult,DispatchResult +from .models import CrawlResult, MarkdownGenerationResult, DispatchResult, ScrapingResult from .async_database import async_db_manager from .chunking_strategy import * # noqa: F403 from .chunking_strategy import RegexChunking, ChunkingStrategy, IdentityChunking @@ -537,7 +537,7 @@ class AsyncWebCrawler: ################################ # Scraping Strategy Execution # ################################ - result = scraping_strategy.scrap(url, html, **params) + result : ScrapingResult = scraping_strategy.scrap(url, html, **params) if result is None: raise ValueError( diff --git a/crawl4ai/docker_client copy.py b/crawl4ai/docker_client copy.py deleted file mode 100644 index 7c0fce1c..00000000 --- a/crawl4ai/docker_client copy.py +++ /dev/null @@ -1,210 +0,0 @@ -from typing import List, Optional, Union, AsyncGenerator, Dict, Any -import httpx -import json -from urllib.parse import urljoin - -from .async_configs import BrowserConfig, CrawlerRunConfig -from .models import CrawlResult -from .async_logger import AsyncLogger, LogLevel - - -class Crawl4aiClientError(Exception): - """Base exception for Crawl4ai Docker client errors.""" - pass - - -class ConnectionError(Crawl4aiClientError): - """Raised when connection to the Docker server fails.""" - pass - - -class RequestError(Crawl4aiClientError): - """Raised when the server returns an error response.""" - pass - - -class Crawl4aiDockerClient: - """ - Client for interacting with Crawl4AI Docker server. - - Args: - base_url (str): Base URL of the Crawl4AI Docker server - timeout (float): Default timeout for requests in seconds - verify_ssl (bool): Whether to verify SSL certificates - verbose (bool): Whether to show logging output - log_file (str, optional): Path to log file if file logging is desired - """ - - def __init__( - self, - base_url: str = "http://localhost:8000", - timeout: float = 30.0, - verify_ssl: bool = True, - verbose: bool = True, - log_file: Optional[str] = None - ) -> None: - self.base_url = base_url.rstrip('/') - self.timeout = timeout - self._http_client = httpx.AsyncClient( - timeout=timeout, - verify=verify_ssl, - headers={"Content-Type": "application/json"} - ) - self.logger = AsyncLogger( - log_file=log_file, - log_level=LogLevel.DEBUG, - verbose=verbose - ) - - async def _check_server_connection(self) -> bool: - """Check if server is reachable.""" - try: - self.logger.info("Checking server connection...", tag="INIT") - response = await self._http_client.get(f"{self.base_url}/health") - response.raise_for_status() - self.logger.success(f"Connected to server at {self.base_url}", tag="READY") - return True - except Exception as e: - self.logger.error(f"Failed to connect to server: {str(e)}", tag="ERROR") - return False - - def _prepare_request_data( - self, - urls: List[str], - browser_config: Optional[BrowserConfig] = None, - crawler_config: Optional[CrawlerRunConfig] = None - ) -> Dict[str, Any]: - """Prepare request data from configs using dump methods.""" - self.logger.debug("Preparing request data", tag="INIT") - data = { - "urls": urls, - "browser_config": browser_config.dump() if browser_config else {}, - "crawler_config": crawler_config.dump() if crawler_config else {} - } - self.logger.debug(f"Request data prepared for {len(urls)} URLs", tag="READY") - return data - - async def _make_request( - self, - method: str, - endpoint: str, - **kwargs - ) -> Union[Dict, AsyncGenerator]: - """Make HTTP request to the server with error handling.""" - url = urljoin(self.base_url, endpoint) - - try: - self.logger.debug(f"Making {method} request to {endpoint}", tag="FETCH") - response = await self._http_client.request(method, url, **kwargs) - response.raise_for_status() - self.logger.success(f"Request to {endpoint} successful", tag="COMPLETE") - return response - except httpx.TimeoutException as e: - error_msg = f"Request timed out: {str(e)}" - self.logger.error(error_msg, tag="ERROR") - raise ConnectionError(error_msg) - except httpx.RequestError as e: - error_msg = f"Failed to connect to server: {str(e)}" - self.logger.error(error_msg, tag="ERROR") - raise ConnectionError(error_msg) - except httpx.HTTPStatusError as e: - error_detail = "" - try: - error_data = e.response.json() - error_detail = error_data.get('detail', str(e)) - except (json.JSONDecodeError, AttributeError) as json_err: - error_detail = f"{str(e)} (Failed to parse error response: {str(json_err)})" - - error_msg = f"Server returned error {e.response.status_code}: {error_detail}" - self.logger.error(error_msg, tag="ERROR") - raise RequestError(error_msg) - - async def crawl( - self, - urls: List[str], - browser_config: Optional[BrowserConfig] = None, - crawler_config: Optional[CrawlerRunConfig] = None - ) -> Union[CrawlResult, AsyncGenerator[CrawlResult, None]]: - """Execute a crawl operation through the Docker server.""" - # Check server connection first - if not await self._check_server_connection(): - raise ConnectionError("Cannot proceed with crawl - server is not reachable") - - request_data = self._prepare_request_data(urls, browser_config, crawler_config) - is_streaming = crawler_config.stream if crawler_config else False - - self.logger.info( - f"Starting crawl for {len(urls)} URLs {'(streaming)' if is_streaming else ''}", - tag="INIT" - ) - - if is_streaming: - async def result_generator() -> AsyncGenerator[CrawlResult, None]: - try: - async with self._http_client.stream( - "POST", - f"{self.base_url}/crawl", - json=request_data, - timeout=None - ) as response: - response.raise_for_status() - async for line in response.aiter_lines(): - if line.strip(): - try: - result_dict = json.loads(line) - if "error" in result_dict: - self.logger.error_status( - url=result_dict.get('url', 'unknown'), - error=result_dict['error'] - ) - continue - - self.logger.url_status( - url=result_dict.get('url', 'unknown'), - success=True, - timing=result_dict.get('timing', 0.0) - ) - yield CrawlResult(**result_dict) - except json.JSONDecodeError as e: - self.logger.error(f"Failed to parse server response: {e}", tag="ERROR") - continue - except httpx.StreamError as e: - error_msg = f"Stream connection error: {str(e)}" - self.logger.error(error_msg, tag="ERROR") - raise ConnectionError(error_msg) - except Exception as e: - error_msg = f"Unexpected error during streaming: {str(e)}" - self.logger.error(error_msg, tag="ERROR") - raise Crawl4aiClientError(error_msg) - - return result_generator() - - response = await self._make_request("POST", "/crawl", json=request_data) - response_data = response.json() - - if not response_data.get("success", False): - error_msg = f"Crawl operation failed: {response_data.get('error', 'Unknown error')}" - self.logger.error(error_msg, tag="ERROR") - raise RequestError(error_msg) - - results = [CrawlResult(**result_dict) for result_dict in response_data.get("results", [])] - self.logger.success(f"Crawl completed successfully with {len(results)} results", tag="COMPLETE") - return results[0] if len(results) == 1 else results - - async def get_schema(self) -> Dict[str, Any]: - """Retrieve the configuration schemas from the server.""" - self.logger.info("Retrieving schema from server", tag="FETCH") - response = await self._make_request("GET", "/schema") - self.logger.success("Schema retrieved successfully", tag="COMPLETE") - return response.json() - - async def close(self) -> None: - """Close the HTTP client session.""" - self.logger.info("Closing client connection", tag="COMPLETE") - await self._http_client.aclose() - - async def __aenter__(self) -> "Crawl4aiDockerClient": - return self - - async def __aexit__(self, exc_type: Optional[type], exc_val: Optional[Exception], exc_tb: Optional[Any]) -> None: - await self.close() \ No newline at end of file diff --git a/crawl4ai/processors/pdf/__init__.py b/crawl4ai/processors/pdf/__init__.py index ac55e9de..947641cb 100644 --- a/crawl4ai/processors/pdf/__init__.py +++ b/crawl4ai/processors/pdf/__init__.py @@ -1,8 +1,6 @@ -from abc import ABC, abstractmethod from pathlib import Path -from typing import Dict, List, Optional +import asyncio from dataclasses import asdict - from crawl4ai.async_logger import AsyncLogger from crawl4ai.async_crawler_strategy import AsyncCrawlerStrategy from crawl4ai.models import AsyncCrawlResponse, ScrapingResult diff --git a/crawl4ai/processors/pdf/processor.py b/crawl4ai/processors/pdf/processor.py index 9dc58a2e..16963f70 100644 --- a/crawl4ai/processors/pdf/processor.py +++ b/crawl4ai/processors/pdf/processor.py @@ -5,21 +5,22 @@ from datetime import datetime from pathlib import Path from time import time from dataclasses import dataclass, asdict, field -from typing import Dict, List, Optional, Tuple -import PyPDF2 -from PIL import Image -from PyPDF2 import PdfReader -from .utils import * +from typing import Dict, List, Optional, Any, Union import base64 import tempfile +from .utils import * +from .utils import ( + apply_png_predictor, + clean_pdf_text, + clean_pdf_text_to_html, +) + +# Remove direct PyPDF2 imports from the top +# import PyPDF2 +# from PyPDF2 import PdfReader logger = logging.getLogger(__name__) -from dataclasses import dataclass, field -from datetime import datetime -from typing import List, Optional, Dict, Any -from pathlib import Path - @dataclass class PDFMetadata: title: Optional[str] = None @@ -35,8 +36,8 @@ class PDFMetadata: class PDFPage: page_number: int raw_text: str = "" - markdown: str = "" # Added per your request - html: str = "" # Added per your request + markdown: str = "" + html: str = "" images: List[Dict] = field(default_factory=list) links: List[str] = field(default_factory=list) layout: List[Dict] = field(default_factory=list) @@ -56,6 +57,12 @@ class PDFProcessorStrategy(ABC): class NaivePDFProcessorStrategy(PDFProcessorStrategy): def __init__(self, image_dpi: int = 144, image_quality: int = 85, extract_images: bool = True, save_images_locally: bool = False, image_save_dir: Optional[Path] = None, batch_size: int = 4): + # Import check at initialization time + try: + import PyPDF2 + except ImportError: + raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'") + self.image_dpi = image_dpi self.image_quality = image_quality self.current_page_number = 0 @@ -66,6 +73,12 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy): self._temp_dir = None def process(self, pdf_path: Path) -> PDFProcessResult: + # Import inside method to allow dependency to be optional + try: + from PyPDF2 import PdfReader + except ImportError: + raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'") + start_time = time() result = PDFProcessResult( metadata=PDFMetadata(), @@ -110,6 +123,13 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy): def process_batch(self, pdf_path: Path) -> PDFProcessResult: """Like process() but processes PDF pages in parallel batches""" + # Import inside method to allow dependency to be optional + try: + from PyPDF2 import PdfReader + import PyPDF2 # For type checking + except ImportError: + raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'") + import concurrent.futures import threading @@ -212,6 +232,12 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy): return pdf_page def _extract_images(self, page, image_dir: Optional[Path]) -> List[Dict]: + # Import PyPDF2 for type checking only when needed + try: + import PyPDF2 + except ImportError: + raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'") + if not self.extract_images: return [] @@ -262,6 +288,7 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy): data = apply_png_predictor(data, width, bits, colors) # Create PIL Image + from PIL import Image mode = 'RGB' if color_space == '/DeviceRGB' else 'L' img = Image.frombytes(mode, (width, height), data) @@ -385,9 +412,14 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy): print(f"Link error: {str(e)}") return links - def _extract_metadata(self, pdf_path: Path, reader: PdfReader = None) -> PDFMetadata: - if not reader: - reader = PdfReader(pdf_path) + def _extract_metadata(self, pdf_path: Path, reader = None) -> PDFMetadata: + # Import inside method to allow dependency to be optional + if reader is None: + try: + from PyPDF2 import PdfReader + reader = PdfReader(pdf_path) + except ImportError: + raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'") meta = reader.metadata or {} created = self._parse_pdf_date(meta.get('/CreationDate', '')) @@ -425,6 +457,15 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy): if __name__ == "__main__": import json from pathlib import Path + + try: + # Import PyPDF2 only when running the file directly + import PyPDF2 + from PyPDF2 import PdfReader + except ImportError: + print("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'") + exit(1) + current_dir = Path(__file__).resolve().parent pdf_path = f'{current_dir}/test.pdf' diff --git a/deploy/Dockerfile b/deploy/Dockerfile deleted file mode 100644 index 3043bd57..00000000 --- a/deploy/Dockerfile +++ /dev/null @@ -1,137 +0,0 @@ -FROM python:3.10-slim - -# Set build arguments -ARG APP_HOME=/app -ARG GITHUB_REPO=https://github.com/unclecode/crawl4ai.git -ARG GITHUB_BRANCH=next -ARG USE_LOCAL=False -ARG CONFIG_PATH="" - -ENV PYTHONFAULTHANDLER=1 \ - PYTHONHASHSEED=random \ - PYTHONUNBUFFERED=1 \ - PIP_NO_CACHE_DIR=1 \ - PYTHONDONTWRITEBYTECODE=1 \ - PIP_DISABLE_PIP_VERSION_CHECK=1 \ - PIP_DEFAULT_TIMEOUT=100 \ - DEBIAN_FRONTEND=noninteractive \ - REDIS_HOST=localhost \ - REDIS_PORT=6379 - -ARG PYTHON_VERSION=3.10 -ARG INSTALL_TYPE=default -ARG ENABLE_GPU=false -ARG TARGETARCH - -LABEL maintainer="unclecode" -LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper" -LABEL version="1.0" - -RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - curl \ - wget \ - gnupg \ - git \ - cmake \ - pkg-config \ - python3-dev \ - libjpeg-dev \ - redis-server \ - supervisor \ - && rm -rf /var/lib/apt/lists/* - -RUN apt-get update && apt-get install -y --no-install-recommends \ - libglib2.0-0 \ - libnss3 \ - libnspr4 \ - libatk1.0-0 \ - libatk-bridge2.0-0 \ - libcups2 \ - libdrm2 \ - libdbus-1-3 \ - libxcb1 \ - libxkbcommon0 \ - libx11-6 \ - libxcomposite1 \ - libxdamage1 \ - libxext6 \ - libxfixes3 \ - libxrandr2 \ - libgbm1 \ - libpango-1.0-0 \ - libcairo2 \ - libasound2 \ - libatspi2.0-0 \ - && rm -rf /var/lib/apt/lists/* - -RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETARCH" = "amd64" ] ; then \ - apt-get update && apt-get install -y --no-install-recommends \ - nvidia-cuda-toolkit \ - && rm -rf /var/lib/apt/lists/* ; \ -else \ - echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \ -fi - -RUN if [ "$TARGETARCH" = "arm64" ]; then \ - echo "🦾 Installing ARM-specific optimizations"; \ - apt-get update && apt-get install -y --no-install-recommends \ - libopenblas-dev \ - && rm -rf /var/lib/apt/lists/*; \ -elif [ "$TARGETARCH" = "amd64" ]; then \ - echo "🖥️ Installing AMD64-specific optimizations"; \ - apt-get update && apt-get install -y --no-install-recommends \ - libomp-dev \ - && rm -rf /var/lib/apt/lists/*; \ -else \ - echo "Skipping platform-specific optimizations (unsupported platform)"; \ -fi - -WORKDIR ${APP_HOME} - -RUN git clone --branch ${GITHUB_BRANCH} ${GITHUB_REPO} /tmp/crawl4ai - -COPY docker/supervisord.conf . -COPY docker/requirements.txt . - -RUN pip install --no-cache-dir -r requirements.txt - -RUN if [ "$INSTALL_TYPE" = "all" ] ; then \ - pip install "/tmp/crawl4ai/[all]" && \ - python -m nltk.downloader punkt stopwords && \ - python -m crawl4ai.model_loader ; \ - elif [ "$INSTALL_TYPE" = "torch" ] ; then \ - pip install "/tmp/crawl4ai/[torch]" ; \ - elif [ "$INSTALL_TYPE" = "transformer" ] ; then \ - pip install "/tmp/crawl4ai/[transformer]" && \ - python -m crawl4ai.model_loader ; \ - else \ - pip install "/tmp/crawl4ai" ; \ - fi - -RUN pip install --no-cache-dir --upgrade pip && \ - python -c "import crawl4ai; print('✅ crawl4ai is ready to rock!')" && \ - python -c "from playwright.sync_api import sync_playwright; print('✅ Playwright is feeling dramatic!')" - -RUN playwright install --with-deps chromium - -COPY docker/* ${APP_HOME}/ -RUN if [ -n "$CONFIG_PATH" ] && [ -f "$CONFIG_PATH" ]; then \ - echo "Using custom config from $CONFIG_PATH" && \ - cp $CONFIG_PATH /app/config.yml; \ -fi - -HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ - CMD bash -c '\ - MEM=$(free -m | awk "/^Mem:/{print \$2}"); \ - if [ $MEM -lt 2048 ]; then \ - echo "⚠️ Warning: Less than 2GB RAM available! Your container might need a memory boost! 🚀"; \ - exit 1; \ - fi && \ - redis-cli ping > /dev/null && \ - curl -f http://localhost:8000/health || exit 1' - -# EXPOSE 6379 - -CMD ["supervisord", "-c", "supervisord.conf"] - diff --git a/docs/examples/hello_world.py b/docs/examples/hello_world.py index 89f3188a..ae74a4b3 100644 --- a/docs/examples/hello_world.py +++ b/docs/examples/hello_world.py @@ -6,6 +6,7 @@ from crawl4ai import ( CacheMode, DefaultMarkdownGenerator, PruningContentFilter, + CrawlResult ) @@ -20,10 +21,12 @@ async def main(): ) ), ) - result = await crawler.arun( - url="https://www.helloworld.org", config=crawler_config + result : CrawlResult = await crawler.arun( + # url="https://www.helloworld.org", config=crawler_config + url="https://www.kidocode.com", config=crawler_config ) print(result.markdown_v2.raw_markdown[:500]) + # print(result.model_dump()) if __name__ == "__main__": diff --git a/pyproject.toml b/pyproject.toml index f59eabd1..bcee2974 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,6 +56,7 @@ classifiers = [ ] [project.optional-dependencies] +pdf = ["PyPDF2"] torch = ["torch", "nltk", "scikit-learn"] transformer = ["transformers", "tokenizers"] cosine = ["torch", "transformers", "nltk"] @@ -66,7 +67,8 @@ all = [ "scikit-learn", "transformers", "tokenizers", - "selenium" + "selenium", + "PyPDF2" ] [project.scripts]