refactor: replace PyPDF2 with pypdf across the codebase. ref #1412

This commit is contained in:
ntohidi
2025-12-03 10:59:18 +01:00
parent f32cfc6db0
commit df4d87ed78
4 changed files with 26 additions and 26 deletions

View File

@@ -15,9 +15,9 @@ from .utils import (
clean_pdf_text_to_html, clean_pdf_text_to_html,
) )
# Remove direct PyPDF2 imports from the top # Remove direct pypdf imports from the top
# import PyPDF2 # import pypdf
# from PyPDF2 import PdfReader # from pypdf import PdfReader
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -59,9 +59,9 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):
save_images_locally: bool = False, image_save_dir: Optional[Path] = None, batch_size: int = 4): save_images_locally: bool = False, image_save_dir: Optional[Path] = None, batch_size: int = 4):
# Import check at initialization time # Import check at initialization time
try: try:
import PyPDF2 import pypdf
except ImportError: except ImportError:
raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'") raise ImportError("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
self.image_dpi = image_dpi self.image_dpi = image_dpi
self.image_quality = image_quality self.image_quality = image_quality
@@ -75,9 +75,9 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):
def process(self, pdf_path: Path) -> PDFProcessResult: def process(self, pdf_path: Path) -> PDFProcessResult:
# Import inside method to allow dependency to be optional # Import inside method to allow dependency to be optional
try: try:
from PyPDF2 import PdfReader from pypdf import PdfReader
except ImportError: except ImportError:
raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'") raise ImportError("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
start_time = time() start_time = time()
result = PDFProcessResult( result = PDFProcessResult(
@@ -125,15 +125,15 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):
"""Like process() but processes PDF pages in parallel batches""" """Like process() but processes PDF pages in parallel batches"""
# Import inside method to allow dependency to be optional # Import inside method to allow dependency to be optional
try: try:
from PyPDF2 import PdfReader from pypdf import PdfReader
import PyPDF2 # For type checking import pypdf # For type checking
except ImportError: except ImportError:
raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'") raise ImportError("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
import concurrent.futures import concurrent.futures
import threading import threading
# Initialize PyPDF2 thread support # Initialize pypdf thread support
if not hasattr(threading.current_thread(), "_children"): if not hasattr(threading.current_thread(), "_children"):
threading.current_thread()._children = set() threading.current_thread()._children = set()
@@ -232,11 +232,11 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):
return pdf_page return pdf_page
def _extract_images(self, page, image_dir: Optional[Path]) -> List[Dict]: def _extract_images(self, page, image_dir: Optional[Path]) -> List[Dict]:
# Import PyPDF2 for type checking only when needed # Import pypdf for type checking only when needed
try: try:
import PyPDF2 from pypdf.generic import IndirectObject
except ImportError: except ImportError:
raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'") raise ImportError("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
if not self.extract_images: if not self.extract_images:
return [] return []
@@ -266,7 +266,7 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):
width = xobj.get('/Width', 0) width = xobj.get('/Width', 0)
height = xobj.get('/Height', 0) height = xobj.get('/Height', 0)
color_space = xobj.get('/ColorSpace', '/DeviceRGB') color_space = xobj.get('/ColorSpace', '/DeviceRGB')
if isinstance(color_space, PyPDF2.generic.IndirectObject): if isinstance(color_space, IndirectObject):
color_space = color_space.get_object() color_space = color_space.get_object()
# Handle different image encodings # Handle different image encodings
@@ -277,7 +277,7 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):
if '/FlateDecode' in filters: if '/FlateDecode' in filters:
try: try:
decode_parms = xobj.get('/DecodeParms', {}) decode_parms = xobj.get('/DecodeParms', {})
if isinstance(decode_parms, PyPDF2.generic.IndirectObject): if isinstance(decode_parms, IndirectObject):
decode_parms = decode_parms.get_object() decode_parms = decode_parms.get_object()
predictor = decode_parms.get('/Predictor', 1) predictor = decode_parms.get('/Predictor', 1)
@@ -416,10 +416,10 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):
# Import inside method to allow dependency to be optional # Import inside method to allow dependency to be optional
if reader is None: if reader is None:
try: try:
from PyPDF2 import PdfReader from pypdf import PdfReader
reader = PdfReader(pdf_path) reader = PdfReader(pdf_path)
except ImportError: except ImportError:
raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'") raise ImportError("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
meta = reader.metadata or {} meta = reader.metadata or {}
created = self._parse_pdf_date(meta.get('/CreationDate', '')) created = self._parse_pdf_date(meta.get('/CreationDate', ''))
@@ -459,11 +459,11 @@ if __name__ == "__main__":
from pathlib import Path from pathlib import Path
try: try:
# Import PyPDF2 only when running the file directly # Import pypdf only when running the file directly
import PyPDF2 import pypdf
from PyPDF2 import PdfReader from pypdf import PdfReader
except ImportError: except ImportError:
print("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'") print("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
exit(1) exit(1)
current_dir = Path(__file__).resolve().parent current_dir = Path(__file__).resolve().parent

View File

@@ -59,13 +59,13 @@ classifiers = [
] ]
[project.optional-dependencies] [project.optional-dependencies]
pdf = ["PyPDF2"] pdf = ["pypdf"]
torch = ["torch", "nltk", "scikit-learn"] torch = ["torch", "nltk", "scikit-learn"]
transformer = ["transformers", "tokenizers", "sentence-transformers"] transformer = ["transformers", "tokenizers", "sentence-transformers"]
cosine = ["torch", "transformers", "nltk", "sentence-transformers"] cosine = ["torch", "transformers", "nltk", "sentence-transformers"]
sync = ["selenium"] sync = ["selenium"]
all = [ all = [
"PyPDF2", "pypdf",
"torch", "torch",
"nltk", "nltk",
"scikit-learn", "scikit-learn",

View File

@@ -33,4 +33,4 @@ shapely>=2.0.0
fake-useragent>=2.2.0 fake-useragent>=2.2.0
pdf2image>=1.17.0 pdf2image>=1.17.0
PyPDF2>=3.0.1 pypdf>=6.0.0

View File

@@ -71,7 +71,7 @@ PACKAGE_MAPPINGS = {
'sentence_transformers': 'sentence-transformers', 'sentence_transformers': 'sentence-transformers',
'rank_bm25': 'rank-bm25', 'rank_bm25': 'rank-bm25',
'snowballstemmer': 'snowballstemmer', 'snowballstemmer': 'snowballstemmer',
'PyPDF2': 'PyPDF2', 'pypdf': 'pypdf',
'pdf2image': 'pdf2image', 'pdf2image': 'pdf2image',
} }