refactor(pdf): improve PDF processor dependency handling

Make PyPDF2 an optional dependency and improve import handling in the PDF processor. Move the PyPDF2 and PIL imports inside the methods that use them so they are loaded lazily and a missing PyPDF2 raises an ImportError with install instructions. Add a new 'pdf' optional dependency group in pyproject.toml. Clean up unused imports and remove deprecated files. BREAKING CHANGE: PyPDF2 is now an optional dependency. Users need to install with 'pip install crawl4ai[pdf]' to use PDF processing features.
2025-02-25 22:27:55 +08:00
parent 71ce01c9e1
commit 4bcd4cbda1
8 changed files with 67 additions and 388 deletions
--- a/crawl4ai/processors/pdf/init.py
+++ b/crawl4ai/processors/pdf/init.py
@@ -1,8 +1,6 @@
-from abc import ABC, abstractmethod
 from pathlib import Path
-from typing import Dict, List, Optional
+import asyncio
 from dataclasses import asdict
-
 from crawl4ai.async_logger import AsyncLogger
 from crawl4ai.async_crawler_strategy import AsyncCrawlerStrategy
 from crawl4ai.models import AsyncCrawlResponse, ScrapingResult 
--- a/crawl4ai/processors/pdf/processor.py
+++ b/crawl4ai/processors/pdf/processor.py
@@ -5,21 +5,22 @@ from datetime import datetime
 from pathlib import Path
 from time import time
 from dataclasses import dataclass, asdict, field
-from typing import Dict, List, Optional, Tuple
-import PyPDF2
-from PIL import Image
-from PyPDF2 import PdfReader
-from .utils import *
+from typing import Dict, List, Optional, Any, Union
 import base64
 import tempfile
+from .utils import *
+from .utils import (
+    apply_png_predictor,
+    clean_pdf_text,
+    clean_pdf_text_to_html,
+)
+
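+# PyPDF2 is an optional dependency (the 'pdf' extra), so it is deliberately
+# not imported at module level. Each method that needs it imports it locally
+# and raises an ImportError with install instructions when it is missing.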

 logger = logging.getLogger(__name__)

-from dataclasses import dataclass, field
-from datetime import datetime
-from typing import List, Optional, Dict, Any
-from pathlib import Path
-
@dataclass
 class PDFMetadata:
    title: Optional[str] = None
@@ -35,8 +36,8 @@ class PDFMetadata:
 class PDFPage:
    page_number: int
    raw_text: str = ""
-    markdown: str = ""  # Added per your request
-    html: str = ""  # Added per your request
+    markdown: str = ""
+    html: str = ""
    images: List[Dict] = field(default_factory=list)
    links: List[str] = field(default_factory=list)
    layout: List[Dict] = field(default_factory=list)
@@ -56,6 +57,12 @@ class PDFProcessorStrategy(ABC):
 class NaivePDFProcessorStrategy(PDFProcessorStrategy):
    def __init__(self, image_dpi: int = 144, image_quality: int = 85, extract_images: bool = True, 
                 save_images_locally: bool = False, image_save_dir: Optional[Path] = None, batch_size: int = 4):
+        # Import check at initialization time
+        try:
+            import PyPDF2
+        except ImportError:
+            raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
+            
        self.image_dpi = image_dpi
        self.image_quality = image_quality
        self.current_page_number = 0
@@ -66,6 +73,12 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):
        self._temp_dir = None

    def process(self, pdf_path: Path) -> PDFProcessResult:
+        # Import inside method to allow dependency to be optional
+        try:
+            from PyPDF2 import PdfReader
+        except ImportError:
+            raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
+            
        start_time = time()
        result = PDFProcessResult(
            metadata=PDFMetadata(),
@@ -110,6 +123,13 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):

    def process_batch(self, pdf_path: Path) -> PDFProcessResult:
        """Like process() but processes PDF pages in parallel batches"""
+        # Import inside method to allow dependency to be optional
+        try:
+            from PyPDF2 import PdfReader
+            import PyPDF2  # For type checking
+        except ImportError:
+            raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
+            
        import concurrent.futures
        import threading
        
@@ -212,6 +232,12 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):
        return pdf_page

    def _extract_images(self, page, image_dir: Optional[Path]) -> List[Dict]:
+        # Import PyPDF2 for type checking only when needed
+        try:
+            import PyPDF2
+        except ImportError:
+            raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
+            
        if not self.extract_images:
            return []

@@ -262,6 +288,7 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):
                                                data = apply_png_predictor(data, width, bits, colors)

                                            # Create PIL Image
+                                            from PIL import Image
                                            mode = 'RGB' if color_space == '/DeviceRGB' else 'L'
                                            img = Image.frombytes(mode, (width, height), data)
                                            
@@ -385,9 +412,14 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):
                print(f"Link error: {str(e)}")
        return links

-    def _extract_metadata(self, pdf_path: Path, reader: PdfReader = None) -> PDFMetadata:
-        if not reader:
-            reader = PdfReader(pdf_path)
+    def _extract_metadata(self, pdf_path: Path, reader = None) -> PDFMetadata:
+        # Import inside method to allow dependency to be optional 
+        if reader is None:
+            try:
+                from PyPDF2 import PdfReader
+                reader = PdfReader(pdf_path)
+            except ImportError:
+                raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")

        meta = reader.metadata or {}
        created = self._parse_pdf_date(meta.get('/CreationDate', ''))
@@ -425,6 +457,15 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):
 if __name__ == "__main__":
    import json
    from pathlib import Path
+    
+    try:
+        # Import PyPDF2 only when running the file directly
+        import PyPDF2
+        from PyPDF2 import PdfReader
+    except ImportError:
+        print("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
+        exit(1)
+        
    current_dir = Path(__file__).resolve().parent
    pdf_path = f'{current_dir}/test.pdf'