refactor: replace PyPDF2 with pypdf across the codebase. ref #1412
This commit is contained in:
@@ -15,9 +15,9 @@ from .utils import (
|
|||||||
clean_pdf_text_to_html,
|
clean_pdf_text_to_html,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Remove direct PyPDF2 imports from the top
|
# Remove direct pypdf imports from the top
|
||||||
# import PyPDF2
|
# import pypdf
|
||||||
# from PyPDF2 import PdfReader
|
# from pypdf import PdfReader
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -59,9 +59,9 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):
|
|||||||
save_images_locally: bool = False, image_save_dir: Optional[Path] = None, batch_size: int = 4):
|
save_images_locally: bool = False, image_save_dir: Optional[Path] = None, batch_size: int = 4):
|
||||||
# Import check at initialization time
|
# Import check at initialization time
|
||||||
try:
|
try:
|
||||||
import PyPDF2
|
import pypdf
|
||||||
except ImportError:
|
except ImportError:
|
||||||
raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
|
raise ImportError("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
|
||||||
|
|
||||||
self.image_dpi = image_dpi
|
self.image_dpi = image_dpi
|
||||||
self.image_quality = image_quality
|
self.image_quality = image_quality
|
||||||
@@ -75,9 +75,9 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):
|
|||||||
def process(self, pdf_path: Path) -> PDFProcessResult:
|
def process(self, pdf_path: Path) -> PDFProcessResult:
|
||||||
# Import inside method to allow dependency to be optional
|
# Import inside method to allow dependency to be optional
|
||||||
try:
|
try:
|
||||||
from PyPDF2 import PdfReader
|
from pypdf import PdfReader
|
||||||
except ImportError:
|
except ImportError:
|
||||||
raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
|
raise ImportError("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
|
||||||
|
|
||||||
start_time = time()
|
start_time = time()
|
||||||
result = PDFProcessResult(
|
result = PDFProcessResult(
|
||||||
@@ -125,15 +125,15 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):
|
|||||||
"""Like process() but processes PDF pages in parallel batches"""
|
"""Like process() but processes PDF pages in parallel batches"""
|
||||||
# Import inside method to allow dependency to be optional
|
# Import inside method to allow dependency to be optional
|
||||||
try:
|
try:
|
||||||
from PyPDF2 import PdfReader
|
from pypdf import PdfReader
|
||||||
import PyPDF2 # For type checking
|
import pypdf # For type checking
|
||||||
except ImportError:
|
except ImportError:
|
||||||
raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
|
raise ImportError("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
|
||||||
|
|
||||||
import concurrent.futures
|
import concurrent.futures
|
||||||
import threading
|
import threading
|
||||||
|
|
||||||
# Initialize PyPDF2 thread support
|
# Initialize pypdf thread support
|
||||||
if not hasattr(threading.current_thread(), "_children"):
|
if not hasattr(threading.current_thread(), "_children"):
|
||||||
threading.current_thread()._children = set()
|
threading.current_thread()._children = set()
|
||||||
|
|
||||||
@@ -232,11 +232,11 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):
|
|||||||
return pdf_page
|
return pdf_page
|
||||||
|
|
||||||
def _extract_images(self, page, image_dir: Optional[Path]) -> List[Dict]:
|
def _extract_images(self, page, image_dir: Optional[Path]) -> List[Dict]:
|
||||||
# Import PyPDF2 for type checking only when needed
|
# Import pypdf for type checking only when needed
|
||||||
try:
|
try:
|
||||||
import PyPDF2
|
from pypdf.generic import IndirectObject
|
||||||
except ImportError:
|
except ImportError:
|
||||||
raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
|
raise ImportError("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
|
||||||
|
|
||||||
if not self.extract_images:
|
if not self.extract_images:
|
||||||
return []
|
return []
|
||||||
@@ -266,7 +266,7 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):
|
|||||||
width = xobj.get('/Width', 0)
|
width = xobj.get('/Width', 0)
|
||||||
height = xobj.get('/Height', 0)
|
height = xobj.get('/Height', 0)
|
||||||
color_space = xobj.get('/ColorSpace', '/DeviceRGB')
|
color_space = xobj.get('/ColorSpace', '/DeviceRGB')
|
||||||
if isinstance(color_space, PyPDF2.generic.IndirectObject):
|
if isinstance(color_space, IndirectObject):
|
||||||
color_space = color_space.get_object()
|
color_space = color_space.get_object()
|
||||||
|
|
||||||
# Handle different image encodings
|
# Handle different image encodings
|
||||||
@@ -277,7 +277,7 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):
|
|||||||
if '/FlateDecode' in filters:
|
if '/FlateDecode' in filters:
|
||||||
try:
|
try:
|
||||||
decode_parms = xobj.get('/DecodeParms', {})
|
decode_parms = xobj.get('/DecodeParms', {})
|
||||||
if isinstance(decode_parms, PyPDF2.generic.IndirectObject):
|
if isinstance(decode_parms, IndirectObject):
|
||||||
decode_parms = decode_parms.get_object()
|
decode_parms = decode_parms.get_object()
|
||||||
|
|
||||||
predictor = decode_parms.get('/Predictor', 1)
|
predictor = decode_parms.get('/Predictor', 1)
|
||||||
@@ -416,10 +416,10 @@ class NaivePDFProcessorStrategy(PDFProcessorStrategy):
|
|||||||
# Import inside method to allow dependency to be optional
|
# Import inside method to allow dependency to be optional
|
||||||
if reader is None:
|
if reader is None:
|
||||||
try:
|
try:
|
||||||
from PyPDF2 import PdfReader
|
from pypdf import PdfReader
|
||||||
reader = PdfReader(pdf_path)
|
reader = PdfReader(pdf_path)
|
||||||
except ImportError:
|
except ImportError:
|
||||||
raise ImportError("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
|
raise ImportError("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
|
||||||
|
|
||||||
meta = reader.metadata or {}
|
meta = reader.metadata or {}
|
||||||
created = self._parse_pdf_date(meta.get('/CreationDate', ''))
|
created = self._parse_pdf_date(meta.get('/CreationDate', ''))
|
||||||
@@ -459,11 +459,11 @@ if __name__ == "__main__":
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Import PyPDF2 only when running the file directly
|
# Import pypdf only when running the file directly
|
||||||
import PyPDF2
|
import pypdf
|
||||||
from PyPDF2 import PdfReader
|
from pypdf import PdfReader
|
||||||
except ImportError:
|
except ImportError:
|
||||||
print("PyPDF2 is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
|
print("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
|
||||||
exit(1)
|
exit(1)
|
||||||
|
|
||||||
current_dir = Path(__file__).resolve().parent
|
current_dir = Path(__file__).resolve().parent
|
||||||
|
|||||||
@@ -59,13 +59,13 @@ classifiers = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
[project.optional-dependencies]
|
[project.optional-dependencies]
|
||||||
pdf = ["PyPDF2"]
|
pdf = ["pypdf"]
|
||||||
torch = ["torch", "nltk", "scikit-learn"]
|
torch = ["torch", "nltk", "scikit-learn"]
|
||||||
transformer = ["transformers", "tokenizers", "sentence-transformers"]
|
transformer = ["transformers", "tokenizers", "sentence-transformers"]
|
||||||
cosine = ["torch", "transformers", "nltk", "sentence-transformers"]
|
cosine = ["torch", "transformers", "nltk", "sentence-transformers"]
|
||||||
sync = ["selenium"]
|
sync = ["selenium"]
|
||||||
all = [
|
all = [
|
||||||
"PyPDF2",
|
"pypdf",
|
||||||
"torch",
|
"torch",
|
||||||
"nltk",
|
"nltk",
|
||||||
"scikit-learn",
|
"scikit-learn",
|
||||||
|
|||||||
@@ -33,4 +33,4 @@ shapely>=2.0.0
|
|||||||
|
|
||||||
fake-useragent>=2.2.0
|
fake-useragent>=2.2.0
|
||||||
pdf2image>=1.17.0
|
pdf2image>=1.17.0
|
||||||
PyPDF2>=3.0.1
|
pypdf>=6.0.0
|
||||||
@@ -71,7 +71,7 @@ PACKAGE_MAPPINGS = {
|
|||||||
'sentence_transformers': 'sentence-transformers',
|
'sentence_transformers': 'sentence-transformers',
|
||||||
'rank_bm25': 'rank-bm25',
|
'rank_bm25': 'rank-bm25',
|
||||||
'snowballstemmer': 'snowballstemmer',
|
'snowballstemmer': 'snowballstemmer',
|
||||||
'PyPDF2': 'PyPDF2',
|
'pypdf': 'pypdf',
|
||||||
'pdf2image': 'pdf2image',
|
'pdf2image': 'pdf2image',
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user