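"""PDF processing strategies for crawl4ai.

Extracts text, images, links, and metadata from PDF files via pypdf (an
optional dependency), producing per-page markdown and HTML.
"""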
import logging
import re
from abc import ABC, abstractmethod
from datetime import datetime
from pathlib import Path
from time import time
from dataclasses import dataclass, asdict, field
from typing import Dict, List, Optional, Any, Union
import base64
import tempfile
from .utils import *
from .utils import (
    apply_png_predictor,
    clean_pdf_text,
    clean_pdf_text_to_html,
)

# Direct pypdf imports are intentionally omitted at the top so the
# dependency stays optional:
# import pypdf
# from pypdf import PdfReader

logger = logging.getLogger(__name__)

@dataclass
class PDFMetadata:
    title: Optional[str] = None
    author: Optional[str] = None
    producer: Optional[str] = None
    created: Optional[datetime] = None
    modified: Optional[datetime] = None
    pages: int = 0
    encrypted: bool = False
    file_size: Optional[int] = None


@dataclass
class PDFPage:
    page_number: int
    raw_text: str = ""
    markdown: str = ""
    html: str = ""
    images: List[Dict] = field(default_factory=list)
    links: List[str] = field(default_factory=list)
    layout: List[Dict] = field(default_factory=list)


@dataclass
class PDFProcessResult:
    metadata: PDFMetadata
    pages: List[PDFPage]
    processing_time: float = 0.0
    version: str = "1.0"


class PDFProcessorStrategy(ABC):
    @abstractmethod
    def process(self, pdf_path: Path) -> PDFProcessResult:
        pass
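
# A new backend plugs in by subclassing PDFProcessorStrategy. A minimal,
# illustrative sketch (NoOpPDFProcessorStrategy is hypothetical, not part of
# this module):
#
#     class NoOpPDFProcessorStrategy(PDFProcessorStrategy):
#         def process(self, pdf_path: Path) -> PDFProcessResult:
#             # Return an empty, well-formed result without reading the file.
#             return PDFProcessResult(metadata=PDFMetadata(), pages=[])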

class NaivePDFProcessorStrategy(PDFProcessorStrategy):
    def __init__(self, image_dpi: int = 144, image_quality: int = 85, extract_images: bool = True,
                 save_images_locally: bool = False, image_save_dir: Optional[Path] = None, batch_size: int = 4):
        # Import check at initialization time
        try:
            import pypdf
        except ImportError:
            raise ImportError("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")

        self.image_dpi = image_dpi          # Not currently consulted during extraction
        self.image_quality = image_quality  # Not currently consulted during extraction
        self.current_page_number = 0
        self.extract_images = extract_images
        self.save_images_locally = save_images_locally
        self.image_save_dir = image_save_dir
        self.batch_size = batch_size        # Worker count for process_batch()
        self._temp_dir = None

    def process(self, pdf_path: Path) -> PDFProcessResult:
        # Import inside method to allow dependency to be optional
        try:
            from pypdf import PdfReader
        except ImportError:
            raise ImportError("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")

        start_time = time()
        result = PDFProcessResult(
            metadata=PDFMetadata(),
            pages=[],
            version="1.1"
        )

        try:
            with pdf_path.open('rb') as file:
                reader = PdfReader(file)
                result.metadata = self._extract_metadata(pdf_path, reader)

                # Handle image directory
                image_dir = None
                if self.extract_images and self.save_images_locally:
                    if self.image_save_dir:
                        image_dir = Path(self.image_save_dir)
                        image_dir.mkdir(exist_ok=True, parents=True)
                    else:
                        self._temp_dir = tempfile.mkdtemp(prefix='pdf_images_')
                        image_dir = Path(self._temp_dir)

                for page_num, page in enumerate(reader.pages):
                    self.current_page_number = page_num + 1
                    pdf_page = self._process_page(page, image_dir)
                    result.pages.append(pdf_page)

        except Exception as e:
            logger.error(f"Failed to process PDF: {str(e)}")
            raise
        finally:
            # Cleanup temp directory if it was created
            if self._temp_dir and not self.image_save_dir:
                import shutil
                try:
                    shutil.rmtree(self._temp_dir)
                except Exception as e:
                    logger.error(f"Failed to cleanup temp directory: {str(e)}")

        result.processing_time = time() - start_time
        return result

    def process_batch(self, pdf_path: Path) -> PDFProcessResult:
        """Like process(), but processes PDF pages in parallel batches."""
        # Import inside method to allow dependency to be optional
        try:
            from pypdf import PdfReader
            import pypdf  # For type checking
        except ImportError:
            raise ImportError("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")

        import concurrent.futures
        import threading

        # Initialize pypdf thread support
        if not hasattr(threading.current_thread(), "_children"):
            threading.current_thread()._children = set()

        start_time = time()
        result = PDFProcessResult(
            metadata=PDFMetadata(),
            pages=[],
            version="1.1"
        )

        try:
            # Get metadata and page count from the main thread
            with pdf_path.open('rb') as file:
                reader = PdfReader(file)
                result.metadata = self._extract_metadata(pdf_path, reader)
                total_pages = len(reader.pages)

            # Handle image directory setup
            image_dir = None
            if self.extract_images and self.save_images_locally:
                if self.image_save_dir:
                    image_dir = Path(self.image_save_dir)
                    image_dir.mkdir(exist_ok=True, parents=True)
                else:
                    self._temp_dir = tempfile.mkdtemp(prefix='pdf_images_')
                    image_dir = Path(self._temp_dir)

            def process_page_safely(page_num: int):
                # Each thread opens its own file handle; the page number is
                # passed explicitly so concurrent workers don't race on
                # self.current_page_number.
                with pdf_path.open('rb') as file:
                    thread_reader = PdfReader(file)
                    page = thread_reader.pages[page_num]
                    return self._process_page(page, image_dir, page_number=page_num + 1)

            # Process pages in parallel batches
            with concurrent.futures.ThreadPoolExecutor(max_workers=self.batch_size) as executor:
                futures = []
                for page_num in range(total_pages):
                    future = executor.submit(process_page_safely, page_num)
                    futures.append((page_num + 1, future))

                # Collect results in order
                result.pages = [None] * total_pages
                for page_num, future in futures:
                    try:
                        pdf_page = future.result()
                        result.pages[page_num - 1] = pdf_page
                    except Exception as e:
                        logger.error(f"Failed to process page {page_num}: {str(e)}")
                        raise

        except Exception as e:
            logger.error(f"Failed to process PDF: {str(e)}")
            raise
        finally:
            # Cleanup temp directory if it was created
            if self._temp_dir and not self.image_save_dir:
                import shutil
                try:
                    shutil.rmtree(self._temp_dir)
                except Exception as e:
                    logger.error(f"Failed to cleanup temp directory: {str(e)}")

        result.processing_time = time() - start_time
        return result

    def _process_page(self, page, image_dir: Optional[Path], page_number: Optional[int] = None) -> PDFPage:
        if page_number is None:
            page_number = self.current_page_number
        pdf_page = PDFPage(
            page_number=page_number,
        )

        # Text and font extraction. pypdf's visitor callback receives the
        # current transformation matrix (cm) and text matrix (tm); tm[4] and
        # tm[5] are the x/y translation components, i.e. the text position.
        def visitor_text(text, cm, tm, font_dict, font_size):
            pdf_page.raw_text += text
            pdf_page.layout.append({
                "type": "text",
                "text": text,
                "x": tm[4],
                "y": tm[5],
            })

        page.extract_text(visitor_text=visitor_text)

        # Image extraction
        if self.extract_images:
            pdf_page.images = self._extract_images(page, image_dir, page_number)

        # Link extraction
        pdf_page.links = self._extract_links(page)

        # Add markdown and HTML content
        pdf_page.markdown = clean_pdf_text(page_number, pdf_page.raw_text)
        pdf_page.html = clean_pdf_text_to_html(page_number, pdf_page.raw_text)

        return pdf_page

    def _extract_images(self, page, image_dir: Optional[Path], page_number: Optional[int] = None) -> List[Dict]:
        # Import pypdf for type checking only when needed
        try:
            from pypdf.generic import IndirectObject
        except ImportError:
            raise ImportError("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")

        if not self.extract_images:
            return []

        if page_number is None:
            page_number = self.current_page_number

        images = []
        try:
            resources = page.get("/Resources")
            if resources:  # Check if resources exist
                resources = resources.get_object()  # Resolve IndirectObject
                if '/XObject' in resources:
                    xobjects = resources['/XObject'].get_object()
                    img_count = 0
                    for obj_name in xobjects:
                        xobj = xobjects[obj_name]
                        if hasattr(xobj, 'get_object') and callable(xobj.get_object):
                            xobj = xobj.get_object()
                        if xobj.get('/Subtype') == '/Image':
                            try:
                                img_count += 1
                                img_filename = f"page_{page_number}_img_{img_count}"
                                data = xobj.get_data()
                                filters = xobj.get('/Filter', [])
                                if not isinstance(filters, list):
                                    filters = [filters]

                                # Resolve IndirectObjects in properties
                                width = xobj.get('/Width', 0)
                                height = xobj.get('/Height', 0)
                                if isinstance(width, IndirectObject):
                                    width = width.get_object()
                                if isinstance(height, IndirectObject):
                                    height = height.get_object()
                                color_space = xobj.get('/ColorSpace', '/DeviceRGB')
                                if isinstance(color_space, IndirectObject):
                                    color_space = color_space.get_object()

                                # Handle different image encodings
                                success = False
                                image_format = 'bin'
                                image_data = None

                                if '/FlateDecode' in filters:
                                    try:
                                        decode_parms = xobj.get('/DecodeParms', {})
                                        if isinstance(decode_parms, IndirectObject):
                                            decode_parms = decode_parms.get_object()

                                        predictor = decode_parms.get('/Predictor', 1)
                                        bits = xobj.get('/BitsPerComponent', 8)
                                        colors = 3 if color_space == '/DeviceRGB' else 1

                                        # Predictor values >= 10 mean a PNG-style
                                        # predictor was applied before deflate
                                        if predictor >= 10:
                                            data = apply_png_predictor(data, width, bits, colors)

                                        # Create PIL Image
                                        from PIL import Image
                                        mode = 'RGB' if color_space == '/DeviceRGB' else 'L'
                                        img = Image.frombytes(mode, (width, height), data)

                                        if self.save_images_locally:
                                            final_path = (image_dir / img_filename).with_suffix('.png')
                                            img.save(final_path)
                                            image_data = str(final_path)
                                        else:
                                            import io
                                            img_byte_arr = io.BytesIO()
                                            img.save(img_byte_arr, format='PNG')
                                            image_data = base64.b64encode(img_byte_arr.getvalue()).decode('utf-8')

                                        success = True
                                        image_format = 'png'
                                    except Exception as e:
                                        logger.error(f"FlateDecode error: {str(e)}")

                                elif '/DCTDecode' in filters:
                                    # JPEG image: the stream data is already a valid JPEG file
                                    try:
                                        if self.save_images_locally:
                                            final_path = (image_dir / img_filename).with_suffix('.jpg')
                                            with open(final_path, 'wb') as f:
                                                f.write(data)
                                            image_data = str(final_path)
                                        else:
                                            image_data = base64.b64encode(data).decode('utf-8')
                                        success = True
                                        image_format = 'jpeg'
                                    except Exception as e:
                                        logger.error(f"JPEG save error: {str(e)}")

                                elif '/CCITTFaxDecode' in filters:
                                    try:
                                        if data[:4] != b'II*\x00':
                                            # Wrap bare CCITT data in a minimal little-endian
                                            # TIFF header so viewers can decode it
                                            tiff_header = b'II*\x00\x08\x00\x00\x00\x0e\x00\x00\x01\x03\x00\x01\x00\x00\x00' + \
                                                width.to_bytes(4, 'little') + \
                                                b'\x01\x03\x00\x01\x00\x00\x00' + \
                                                height.to_bytes(4, 'little') + \
                                                b'\x01\x12\x00\x03\x00\x00\x00\x01\x00\x01\x00\x00\x01\x17\x00\x04\x00\x00\x00\x01\x00\x00\x00J\x01\x1B\x00\x05\x00\x00\x00\x01\x00\x00\x00R\x01\x28\x00\x03\x00\x00\x00\x01\x00\x02\x00\x00'
                                            data = tiff_header + data

                                        if self.save_images_locally:
                                            final_path = (image_dir / img_filename).with_suffix('.tiff')
                                            with open(final_path, 'wb') as f:
                                                f.write(data)
                                            image_data = str(final_path)
                                        else:
                                            image_data = base64.b64encode(data).decode('utf-8')
                                        success = True
                                        image_format = 'tiff'
                                    except Exception as e:
                                        logger.error(f"CCITT save error: {str(e)}")

                                elif '/JPXDecode' in filters:
                                    # JPEG 2000
                                    try:
                                        if self.save_images_locally:
                                            final_path = (image_dir / img_filename).with_suffix('.jp2')
                                            with open(final_path, 'wb') as f:
                                                f.write(data)
                                            image_data = str(final_path)
                                        else:
                                            image_data = base64.b64encode(data).decode('utf-8')
                                        success = True
                                        image_format = 'jpeg2000'
                                    except Exception as e:
                                        logger.error(f"JPEG2000 save error: {str(e)}")

                                if success and image_data:
                                    image_info = {
                                        "format": image_format,
                                        "width": width,
                                        "height": height,
                                        "color_space": str(color_space),
                                        "bits_per_component": xobj.get('/BitsPerComponent', 1)
                                    }

                                    if self.save_images_locally:
                                        image_info["path"] = image_data
                                    else:
                                        image_info["data"] = image_data

                                    images.append(image_info)
                                else:
                                    # Fallback: save the raw, undecoded stream data
                                    image_info = {
                                        "format": "bin",
                                        "width": width,
                                        "height": height,
                                        "color_space": str(color_space),
                                        "bits_per_component": xobj.get('/BitsPerComponent', 1),
                                    }
                                    if self.save_images_locally:
                                        final_path = (image_dir / img_filename).with_suffix('.bin')
                                        with open(final_path, 'wb') as f:
                                            f.write(data)
                                        logger.warning(f"Saved raw image data to {final_path}")
                                        image_info["path"] = str(final_path)
                                    else:
                                        image_info["data"] = base64.b64encode(data).decode('utf-8')
                                    images.append(image_info)

                            except Exception as e:
                                logger.error(f"Error processing image: {str(e)}")
        except Exception as e:
            logger.error(f"Image extraction error: {str(e)}")

        return images

    def _extract_links(self, page) -> List[str]:
        links = []
        if '/Annots' in page:
            try:
                for annot in page['/Annots']:
                    a = annot.get_object()
                    if '/A' in a and '/URI' in a['/A']:
                        links.append(a['/A']['/URI'])
            except Exception as e:
                logger.error(f"Link error: {str(e)}")
        return links

    def _extract_metadata(self, pdf_path: Path, reader=None) -> PDFMetadata:
        # Import inside method to allow dependency to be optional
        if reader is None:
            try:
                from pypdf import PdfReader
                reader = PdfReader(pdf_path)
            except ImportError:
                raise ImportError("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")

        meta = reader.metadata or {}
        created = self._parse_pdf_date(meta.get('/CreationDate', ''))
        modified = self._parse_pdf_date(meta.get('/ModDate', ''))

        return PDFMetadata(
            title=meta.get('/Title'),
            author=meta.get('/Author'),
            producer=meta.get('/Producer'),
            created=created,
            modified=modified,
            pages=len(reader.pages),
            encrypted=reader.is_encrypted,
            file_size=pdf_path.stat().st_size
        )

    def _parse_pdf_date(self, date_str: str) -> Optional[datetime]:
        """Parse a PDF date string such as 'D:20240131120000' into a datetime."""
        try:
            match = re.match(r'D:(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})', date_str)
            if not match:
                return None

            return datetime(
                year=int(match[1]),
                month=int(match[2]),
                day=int(match[3]),
                hour=int(match[4]),
                minute=int(match[5]),
                second=int(match[6])
            )
        except Exception:
            return None


# Usage example
if __name__ == "__main__":
    import json

    try:
        # Import pypdf only when running the file directly
        import pypdf
        from pypdf import PdfReader
    except ImportError:
        print("pypdf is required for PDF processing. Install with 'pip install crawl4ai[pdf]'")
        exit(1)

    current_dir = Path(__file__).resolve().parent
    pdf_path = f'{current_dir}/test.pdf'

    strategy = NaivePDFProcessorStrategy()
    result = strategy.process(Path(pdf_path))

    # Convert to JSON
    json_output = asdict(result)
    print(json.dumps(json_output, indent=2, default=str))

    # result.pages holds PDFPage dataclasses, so use attribute access
    with open(f'{current_dir}/test.html', 'w') as f:
        for page in result.pages:
            f.write(f'<h1>Page {page.page_number}</h1>')
            f.write(page.html)
    with open(f'{current_dir}/test.md', 'w') as f:
        for page in result.pages:
            f.write(f'# Page {page.page_number}\n\n')
            f.write(clean_pdf_text(page.page_number, page.raw_text))
            f.write('\n\n')
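
    # A sketch of the parallel path: process_batch() takes the same input and
    # fans pages out across batch_size worker threads. The batch size here is
    # illustrative; it reuses the test.pdf path from above.
    batch_strategy = NaivePDFProcessorStrategy(batch_size=4)
    batch_result = batch_strategy.process_batch(Path(pdf_path))
    print(f"Batch processed {len(batch_result.pages)} pages "
          f"in {batch_result.processing_time:.2f}s")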
|