feat(pdf): add PDF processing capabilities
Add a new PDF processing module with the following features:
- PDF text extraction and formatting to HTML/Markdown
- Image extraction with multiple format support (JPEG, PNG, TIFF)
- Link extraction from PDF documents
- Metadata extraction, including title, author, and dates
- Support for both local and remote PDF files

Also includes:
- New configuration options for HTML attribute handling
- Internal/external link filtering improvements
- Version bump to 0.4.300b4
This commit is contained in:
164
crawl4ai/processors/pdf/__init__.py
Normal file
164
crawl4ai/processors/pdf/__init__.py
Normal file
@@ -0,0 +1,164 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
from dataclasses import asdict
|
||||
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
from crawl4ai.async_crawler_strategy import AsyncCrawlerStrategy
|
||||
from crawl4ai.models import AsyncCrawlResponse, ScrapingResult
|
||||
from crawl4ai.content_scraping_strategy import ContentScrapingStrategy
|
||||
from .processor import NaivePDFProcessorStrategy # Assuming your current PDF code is in pdf_processor.py
|
||||
|
||||
class PDFCrawlerStrategy(AsyncCrawlerStrategy):
    """Crawler strategy for PDF URLs that performs no fetching itself.

    ``crawl`` hands back a placeholder response with empty HTML; per the
    original inline comments, the scraping strategy is expected to do the
    actual PDF processing afterwards.
    """

    def __init__(self, logger: AsyncLogger = None):
        # The logger is only stored; nothing in this class writes to it.
        self.logger = logger

    async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
        """Return an empty-bodied response tagged as ``application/pdf``.

        Args:
            url: Location of the PDF. Not accessed by this method.
            **kwargs: Accepted for interface compatibility and ignored.

        Returns:
            AsyncCrawlResponse: Status 200, empty ``html`` and a PDF
            ``Content-Type`` header.
        """
        pdf_headers = {"Content-Type": "application/pdf"}
        placeholder = AsyncCrawlResponse(
            html="",
            response_headers=pdf_headers,
            status_code=200,
        )
        return placeholder

    async def close(self):
        """No-op: this strategy holds no resources."""
        return None

    async def __aenter__(self):
        """Enter the async context and return this strategy."""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Leave the async context, delegating to ``close``."""
        await self.close()
|
||||
|
||||
class PDFContentScrapingStrategy(ContentScrapingStrategy):
    """
    A content scraping strategy for PDF files.

    Attributes:
        logger (AsyncLogger): Logger instance supplied by the caller (stored only).
        pdf_processor (NaivePDFProcessorStrategy): Processor configured from the
            constructor arguments below.

    Constructor arguments:
        save_images_locally (bool): Whether to save images locally.
        extract_images (bool): Whether to extract images from PDF.
        image_save_dir (str): Directory to save extracted images.
        logger (AsyncLogger): Logger instance for recording events and errors.

    Methods:
        scrap(url: str, html: str, **params) -> ScrapingResult:
            Scrap content from a PDF file.
        ascrap(url: str, html: str, **kwargs) -> ScrapingResult:
            Asynchronous version of scrap.

    Usage:
        strategy = PDFContentScrapingStrategy(
            save_images_locally=False,
            extract_images=False,
            image_save_dir=None,
            logger=logger
        )
    """

    # Seconds to wait for the server to connect / send data when downloading
    # a remote PDF, so a stalled server cannot block the scraper forever.
    _DOWNLOAD_TIMEOUT_SECONDS = 30

    def __init__(self,
                 save_images_locally=False,
                 extract_images=False,
                 image_save_dir=None,
                 logger: AsyncLogger = None):
        self.logger = logger
        # Forward the caller's settings; previously these were hard-coded to
        # False/False/None, silently ignoring the constructor arguments.
        self.pdf_processor = NaivePDFProcessorStrategy(
            save_images_locally=save_images_locally,
            extract_images=extract_images,
            image_save_dir=image_save_dir,
        )

    def scrap(self, url: str, html: str, **params) -> ScrapingResult:
        """
        Scrap content from a PDF file.

        Args:
            url (str): The URL or local path of the PDF file.
            html (str): Unused here; present to satisfy the strategy interface.
            **params: Additional parameters (unused).

        Returns:
            ScrapingResult: Combined per-page HTML, images and links (each
            tagged with its page number) and the document metadata.

        Raises:
            RuntimeError: If a remote PDF cannot be downloaded.
        """
        # Download if URL or use local path
        pdf_path = self._get_pdf_path(url)
        try:
            result = self.pdf_processor.process(Path(pdf_path))

            # One <div> per page, numbered from 1.
            pages_html = ''.join(
                f'<div class="pdf-page" data-page="{index}">{page.html}</div>'
                for index, page in enumerate(result.pages, start=1)
            )
            cleaned_html = (
                "\n<html>\n"
                f'<head><meta name="pdf-pages" content="{len(result.pages)}"></head>\n'
                "<body>\n"
                f"{pages_html}\n"
                "</body>\n"
                "</html>\n"
            )

            # Accumulate media and links with page numbers
            media = {"images": []}
            links = {"urls": []}
            for page in result.pages:
                for img in page.images:
                    img["page"] = page.page_number
                    media["images"].append(img)
                for link in page.links:
                    links["urls"].append({
                        "url": link,
                        "page": page.page_number,
                    })

            return ScrapingResult(
                cleaned_html=cleaned_html,
                success=True,
                media=media,
                links=links,
                metadata=asdict(result.metadata),
            )
        finally:
            # Only downloaded PDFs live in a temp file that we own.
            if url.startswith(("http://", "https://")):
                Path(pdf_path).unlink(missing_ok=True)

    async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
        """Asynchronous version of ``scrap``: runs it in a worker thread."""
        # Imported locally because the module does not import asyncio at the
        # top level (the original referenced it without any import).
        import asyncio

        return await asyncio.to_thread(self.scrap, url, html, **kwargs)

    def _get_pdf_path(self, url: str) -> str:
        """Resolve ``url`` to a path on the local filesystem.

        ``http(s)://`` URLs are streamed into a new temporary ``.pdf`` file
        whose removal is left to the caller; ``file://`` URLs have the
        scheme stripped; anything else is returned unchanged as a local path.

        Raises:
            RuntimeError: If the download fails (the temp file is removed
            first and the original exception is chained as the cause).
        """
        if url.startswith(("http://", "https://")):
            import tempfile
            import requests

            # delete=False: the file must outlive this method.
            temp_file = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
            try:
                # Write through the handle we already hold and close it
                # deterministically instead of leaking it and re-opening
                # the file by name.
                with temp_file:
                    with requests.get(url, stream=True,
                                      timeout=self._DOWNLOAD_TIMEOUT_SECONDS) as response:
                        response.raise_for_status()
                        for chunk in response.iter_content(chunk_size=8192):
                            temp_file.write(chunk)
                return temp_file.name
            except Exception as e:
                # Clean up temp file if download fails
                Path(temp_file.name).unlink(missing_ok=True)
                raise RuntimeError(f"Failed to download PDF from {url}: {str(e)}") from e

        elif url.startswith("file://"):
            return url[7:]  # Strip file:// prefix

        return url  # Assume local path
|
||||
|
||||
|
||||
# Public API of this package: the crawler and scraping strategies defined above.
__all__ = ["PDFCrawlerStrategy", "PDFContentScrapingStrategy"]
|
||||
372
crawl4ai/processors/pdf/processor.py
Normal file
372
crawl4ai/processors/pdf/processor.py
Normal file
@@ -0,0 +1,372 @@
|
||||
import logging
|
||||
import re
|
||||
from abc import ABC, abstractmethod
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from time import time
|
||||
from dataclasses import dataclass, asdict, field
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
import PyPDF2
|
||||
from PIL import Image
|
||||
from PyPDF2 import PdfReader
|
||||
from .utils import *
|
||||
import base64
|
||||
import tempfile
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from typing import List, Optional, Dict, Any
|
||||
from pathlib import Path
|
||||
|
||||
@dataclass
class PDFMetadata:
    """Document-level information about a PDF; every field has a default."""

    # Entries of the document information dictionary.
    title: Optional[str] = None
    author: Optional[str] = None
    producer: Optional[str] = None

    # Creation / modification timestamps.
    created: Optional[datetime] = None
    modified: Optional[datetime] = None

    # Structural facts about the file.
    pages: int = 0
    encrypted: bool = False
    file_size: Optional[int] = None
|
||||
|
||||
@dataclass
class PDFPage:
    """Content extracted from a single PDF page.

    Only ``page_number`` is required; text fields default to empty strings
    and each list field gets its own fresh list per instance.
    """

    page_number: int

    # Text in three renderings of the same page.
    raw_text: str = ""
    markdown: str = ""
    html: str = ""

    # Per-page collections (independent list per instance).
    images: List[Dict] = field(default_factory=list)
    links: List[str] = field(default_factory=list)
    layout: List[Dict] = field(default_factory=list)
|
||||
|
||||
@dataclass
class PDFProcessResult:
    """Outcome of processing one PDF: metadata plus the extracted pages."""

    # Required: document metadata and the per-page results.
    metadata: PDFMetadata
    pages: List[PDFPage]

    # Optional bookkeeping values.
    processing_time: float = 0.0
    version: str = "1.0"
|
||||
|
||||
class PDFProcessorStrategy(ABC):
    """Abstract interface for PDF processors."""

    @abstractmethod
    def process(self, pdf_path: Path) -> PDFProcessResult:
        """Process the PDF at ``pdf_path``; concrete subclasses must override."""
|
||||
|
||||
class NaivePDFProcessorStrategy(PDFProcessorStrategy):
    """PyPDF2-based processor that extracts text, images, links and metadata.

    Images are returned base64-encoded unless ``save_images_locally`` is
    set, in which case they are written to ``image_save_dir`` (or to a
    temporary directory when no directory is given).
    """

    # Filters whose stream bytes are stored as-is:
    # (PDF filter name, file suffix, reported format, log label on failure).
    _RAW_IMAGE_FILTERS = (
        ('/DCTDecode', '.jpg', 'jpeg', 'JPEG save error'),
        ('/CCITTFaxDecode', '.tiff', 'tiff', 'CCITT save error'),
        ('/JPXDecode', '.jp2', 'jpeg2000', 'JPEG2000 save error'),
    )

    def __init__(self, image_dpi: int = 144, image_quality: int = 85, extract_images: bool = True,
                 save_images_locally: bool = False, image_save_dir: Optional[Path] = None):
        # image_dpi / image_quality are stored but not read by any method of
        # this class.
        self.image_dpi = image_dpi
        self.image_quality = image_quality
        # Page currently being processed (1-based); read by the helpers.
        self.current_page_number = 0
        self.extract_images = extract_images
        self.save_images_locally = save_images_locally
        self.image_save_dir = image_save_dir
        self._temp_dir = None

    def process(self, pdf_path: Path) -> PDFProcessResult:
        """Process every page of the PDF at ``pdf_path``.

        Returns:
            PDFProcessResult: Metadata, one ``PDFPage`` per page and the
            elapsed processing time; ``version`` is set to ``"1.1"``.

        Raises:
            Exception: Any failure is logged and re-raised unchanged.
        """
        start_time = time()
        result = PDFProcessResult(
            metadata=PDFMetadata(),
            pages=[],
            version="1.1",
        )

        try:
            with pdf_path.open('rb') as file:
                reader = PdfReader(file)
                result.metadata = self._extract_metadata(pdf_path, reader)

                # Decide where extracted images are written (if anywhere).
                image_dir = None
                if self.extract_images and self.save_images_locally:
                    if self.image_save_dir:
                        image_dir = Path(self.image_save_dir)
                        image_dir.mkdir(exist_ok=True, parents=True)
                    else:
                        self._temp_dir = tempfile.mkdtemp(prefix='pdf_images_')
                        image_dir = Path(self._temp_dir)

                for page_num, page in enumerate(reader.pages):
                    self.current_page_number = page_num + 1
                    result.pages.append(self._process_page(page, image_dir, reader))

        except Exception as e:
            logger.error("Failed to process PDF: %s", e)
            raise
        finally:
            # Cleanup temp directory if it was created.
            # NOTE(review): image paths recorded for this run point inside
            # the directory removed here - confirm this is intended.
            if self._temp_dir and not self.image_save_dir:
                import shutil
                try:
                    shutil.rmtree(self._temp_dir)
                except Exception as e:
                    logger.error("Failed to cleanup temp directory: %s", e)
                # Forget the directory so a later process() call does not
                # try to delete it a second time.
                self._temp_dir = None

        result.processing_time = time() - start_time
        return result

    def _process_page(self, page, image_dir: Optional[Path], reader) -> PDFPage:
        """Build a ``PDFPage`` for ``page``; ``reader`` is accepted but unused."""
        pdf_page = PDFPage(
            page_number=self.current_page_number,
        )

        # Collect each text fragment reported by PyPDF2 together with the
        # translation components (indices 4 and 5) of its text matrix.
        def visitor_text(text, cm, tm, font_dict, font_size):
            pdf_page.raw_text += text
            pdf_page.layout.append({
                "type": "text",
                "text": text,
                "x": tm[4],
                "y": tm[5],
            })

        page.extract_text(visitor_text=visitor_text)

        if self.extract_images:
            pdf_page.images = self._extract_images(page, image_dir)

        pdf_page.links = self._extract_links(page)

        # Both renderings are derived from the same raw text.
        pdf_page.markdown = clean_pdf_text(self.current_page_number, pdf_page.raw_text)
        pdf_page.html = clean_pdf_text_to_html(self.current_page_number, pdf_page.raw_text)

        return pdf_page

    def _extract_images(self, page, image_dir: Optional[Path]) -> List[Dict]:
        """Return one info dict per image XObject found in the page resources.

        Errors are logged and never raised: a failure on one image skips
        that image only; a failure while walking the resources ends the
        scan and returns what was collected so far.
        """
        if not self.extract_images:
            return []

        images = []
        try:
            resources = page.get("/Resources")
            if not resources:
                return images
            resources = resources.get_object()  # Resolve IndirectObject
            if '/XObject' not in resources:
                return images

            xobjects = resources['/XObject'].get_object()
            img_count = 0
            for obj_name in xobjects:
                xobj = xobjects[obj_name]
                if hasattr(xobj, 'get_object') and callable(xobj.get_object):
                    xobj = xobj.get_object()
                if xobj.get('/Subtype') != '/Image':
                    continue
                try:
                    img_count += 1
                    image_info = self._extract_single_image(xobj, image_dir, img_count)
                    if image_info is not None:
                        images.append(image_info)
                except Exception as e:
                    logger.error("Error processing image: %s", e)
        except Exception as e:
            logger.error("Image extraction error: %s", e)

        return images

    def _extract_single_image(self, xobj, image_dir: Optional[Path], img_count: int) -> Optional[Dict]:
        """Decode or store one image XObject and describe it.

        Returns the info dict for the image, or ``None`` when the raw
        fallback wrote a ``.bin`` file locally (that case is logged but not
        reported in the results).
        """
        img_filename = f"page_{self.current_page_number}_img_{img_count}"
        data = xobj.get_data()
        filters = xobj.get('/Filter', [])
        if not isinstance(filters, list):
            filters = [filters]

        width = xobj.get('/Width', 0)
        height = xobj.get('/Height', 0)
        color_space = xobj.get('/ColorSpace', '/DeviceRGB')
        if isinstance(color_space, PyPDF2.generic.IndirectObject):
            color_space = color_space.get_object()

        success = False
        image_format = 'bin'
        image_data = None

        if '/FlateDecode' in filters:
            try:
                decode_parms = xobj.get('/DecodeParms', {})
                if isinstance(decode_parms, PyPDF2.generic.IndirectObject):
                    decode_parms = decode_parms.get_object()

                predictor = decode_parms.get('/Predictor', 1)
                bits = xobj.get('/BitsPerComponent', 8)
                colors = 3 if color_space == '/DeviceRGB' else 1

                if predictor >= 10:
                    data = apply_png_predictor(data, width, bits, colors)

                # Re-encode the raw samples as PNG via PIL.
                mode = 'RGB' if color_space == '/DeviceRGB' else 'L'
                img = Image.frombytes(mode, (width, height), data)

                if self.save_images_locally:
                    final_path = (image_dir / img_filename).with_suffix('.png')
                    img.save(final_path)
                    image_data = str(final_path)
                else:
                    import io
                    png_buffer = io.BytesIO()
                    img.save(png_buffer, format='PNG')
                    image_data = base64.b64encode(png_buffer.getvalue()).decode('utf-8')

                success = True
                image_format = 'png'
            except Exception as e:
                logger.error("FlateDecode error: %s", e)
        else:
            # Only the first matching filter is handled, mirroring an
            # if/elif chain over DCT -> CCITT -> JPX.
            for filter_name, suffix, fmt, error_label in self._RAW_IMAGE_FILTERS:
                if filter_name not in filters:
                    continue
                try:
                    if filter_name == '/CCITTFaxDecode' and data[:4] != b'II*\x00':
                        # Add TIFF header if missing
                        data = self._ccitt_tiff_header(width, height) + data
                    image_data = self._store_image_bytes(data, image_dir, img_filename, suffix)
                    success = True
                    image_format = fmt
                except Exception as e:
                    logger.error("%s: %s", error_label, e)
                break

        if success and image_data:
            image_info = {
                "format": image_format,
                "width": width,
                "height": height,
                "color_space": str(color_space),
                "bits_per_component": xobj.get('/BitsPerComponent', 1),
            }
            image_info["path" if self.save_images_locally else "data"] = image_data
            return image_info

        # Fallback: keep the raw stream bytes.
        if self.save_images_locally:
            final_path = (image_dir / img_filename).with_suffix('.bin')
            with open(final_path, 'wb') as f:
                f.write(data)
            logger.warning("Saved raw image data to %s", final_path)
            # NOTE(review): unlike the base64 fallback below, this file is
            # not reported in the results - confirm that is intended.
            return None

        return {
            "format": "bin",
            "width": width,
            "height": height,
            "color_space": str(color_space),
            "bits_per_component": xobj.get('/BitsPerComponent', 1),
            "data": base64.b64encode(data).decode('utf-8'),
        }

    def _store_image_bytes(self, data: bytes, image_dir: Optional[Path],
                           img_filename: str, suffix: str) -> str:
        """Write ``data`` to ``image_dir`` and return the path, or return it base64-encoded."""
        if self.save_images_locally:
            final_path = (image_dir / img_filename).with_suffix(suffix)
            with open(final_path, 'wb') as f:
                f.write(data)
            return str(final_path)
        return base64.b64encode(data).decode('utf-8')

    @staticmethod
    def _ccitt_tiff_header(width: int, height: int) -> bytes:
        """Return the fixed TIFF header prepended to header-less CCITT data."""
        return (
            b'II*\x00\x08\x00\x00\x00\x0e\x00\x00\x01\x03\x00\x01\x00\x00\x00'
            + width.to_bytes(4, 'little')
            + b'\x01\x03\x00\x01\x00\x00\x00'
            + height.to_bytes(4, 'little')
            + b'\x01\x12\x00\x03\x00\x00\x00\x01\x00\x01\x00\x00\x01\x17\x00\x04\x00\x00\x00\x01\x00\x00\x00J\x01\x1B\x00\x05\x00\x00\x00\x01\x00\x00\x00R\x01\x28\x00\x03\x00\x00\x00\x01\x00\x02\x00\x00'
        )

    def _extract_links(self, page) -> List[str]:
        """Return the ``/URI`` targets of the page's annotations.

        A failure while reading annotations is logged and the links
        gathered up to that point are returned.
        """
        links = []
        if '/Annots' in page:
            try:
                for annot in page['/Annots']:
                    a = annot.get_object()
                    if '/A' in a and '/URI' in a['/A']:
                        links.append(a['/A']['/URI'])
            except Exception as e:
                # Use the module logger like the rest of the class rather
                # than printing to stdout.
                logger.error("Link error: %s", e)
        return links

    def _extract_metadata(self, pdf_path: Path, reader: PdfReader = None) -> PDFMetadata:
        """Build ``PDFMetadata`` from the reader's info dictionary and the file size.

        A new ``PdfReader`` is opened on ``pdf_path`` when none is supplied.
        """
        if not reader:
            reader = PdfReader(pdf_path)

        meta = reader.metadata or {}
        created = self._parse_pdf_date(meta.get('/CreationDate', ''))
        modified = self._parse_pdf_date(meta.get('/ModDate', ''))

        return PDFMetadata(
            title=meta.get('/Title'),
            author=meta.get('/Author'),
            producer=meta.get('/Producer'),
            created=created,
            modified=modified,
            pages=len(reader.pages),
            encrypted=reader.is_encrypted,
            file_size=pdf_path.stat().st_size,
        )

    def _parse_pdf_date(self, date_str: str) -> Optional[datetime]:
        """Parse a ``D:YYYYMMDDHHMMSS...`` string into a naive ``datetime``.

        Anything after the seconds is ignored. Returns ``None`` when the
        value does not match, is not a string, or names an impossible
        date/time.
        """
        try:
            match = re.match(r'D:(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})', date_str)
            if not match:
                return None

            return datetime(
                year=int(match[1]),
                month=int(match[2]),
                day=int(match[3]),
                hour=int(match[4]),
                minute=int(match[5]),
                second=int(match[6]),
            )
        except (TypeError, ValueError):
            # TypeError: non-string input; ValueError: out-of-range fields.
            return None
|
||||
|
||||
# Usage example
|
||||
if __name__ == "__main__":
    # Manual smoke test: process test.pdf next to this file, print the
    # result as JSON, and write HTML and Markdown renderings beside it.
    import json
    from pathlib import Path

    current_dir = Path(__file__).resolve().parent
    pdf_path = current_dir / 'test.pdf'

    strategy = NaivePDFProcessorStrategy()
    result = strategy.process(pdf_path)

    # Convert to JSON
    json_output = asdict(result)
    print(json.dumps(json_output, indent=2, default=str))

    # result.pages holds PDFPage dataclass instances, so fields are read as
    # attributes (subscripting them raises TypeError).
    with open(current_dir / 'test.html', 'w', encoding='utf-8') as f:
        for page in result.pages:
            f.write(f'<h1>Page {page.page_number}</h1>')
            f.write(page.html)

    with open(current_dir / 'test.md', 'w', encoding='utf-8') as f:
        for page in result.pages:
            f.write(f'# Page {page.page_number}\n\n')
            f.write(clean_pdf_text(page.page_number, page.raw_text))
            f.write('\n\n')
|
||||
350
crawl4ai/processors/pdf/utils.py
Normal file
350
crawl4ai/processors/pdf/utils.py
Normal file
@@ -0,0 +1,350 @@
|
||||
import re
|
||||
|
||||
def apply_png_predictor(data, width, bits, color_channels):
    """Decode PNG predictor (PDF 1.5+ filter).

    ``data`` is a sequence of scanlines, each made of one filter-type byte
    followed by the filtered row bytes. Filter types 0-4 (None, Sub, Up,
    Average, Paeth) are reversed row by row.

    Args:
        data: Filtered bytes (filter byte + row bytes, repeated).
        width: Image width in pixels.
        bits: Bits per colour component.
        color_channels: Number of colour components per pixel.

    Returns:
        bytes: The unfiltered rows, concatenated without filter bytes.

    Raises:
        ValueError: If ``len(data)`` is not a whole number of scanlines or
            a row uses a filter type outside 0-4.
    """
    bits_per_pixel = bits * color_channels
    # Distance (in bytes) to the "left" neighbour: whole bytes per pixel,
    # rounded up.
    bytes_per_pixel = (bits_per_pixel + 7) // 8
    # Row length is the packed bit count rounded up to whole bytes. Using
    # width * bytes_per_pixel over-counts for sub-byte depths (e.g. 1-bit
    # images pack 8 pixels per byte).
    stride = (width * bits_per_pixel + 7) // 8
    scanline_length = stride + 1  # +1 for filter byte

    if len(data) % scanline_length != 0:
        raise ValueError("Invalid scanline structure")

    output = bytearray()
    prev_line = bytes(stride)  # the row above the first row is all zeros

    for start in range(0, len(data), scanline_length):
        filter_type = data[start]
        decoded = bytearray(data[start + 1:start + scanline_length])

        if filter_type == 0:  # None
            pass
        elif filter_type == 1:  # Sub
            for j in range(bytes_per_pixel, stride):
                decoded[j] = (decoded[j] + decoded[j - bytes_per_pixel]) % 256
        elif filter_type == 2:  # Up
            for j in range(stride):
                decoded[j] = (decoded[j] + prev_line[j]) % 256
        elif filter_type == 3:  # Average
            for j in range(stride):
                left = decoded[j - bytes_per_pixel] if j >= bytes_per_pixel else 0
                decoded[j] = (decoded[j] + (left + prev_line[j]) // 2) % 256
        elif filter_type == 4:  # Paeth
            for j in range(stride):
                if j >= bytes_per_pixel:
                    left = decoded[j - bytes_per_pixel]
                    up_left = prev_line[j - bytes_per_pixel]
                else:
                    left = up_left = 0
                paeth = paeth_predictor(left, prev_line[j], up_left)
                decoded[j] = (decoded[j] + paeth) % 256
        else:
            raise ValueError(f"Unsupported filter type: {filter_type}")

        output.extend(decoded)
        prev_line = decoded

    return bytes(output)
|
||||
|
||||
def paeth_predictor(a, b, c):
    """Return whichever of ``a``, ``b``, ``c`` is closest to ``a + b - c``.

    Ties are resolved in the order ``a``, then ``b``, then ``c``.
    """
    estimate = a + b - c
    dist_a = abs(estimate - a)
    dist_b = abs(estimate - b)
    dist_c = abs(estimate - c)

    if dist_a <= dist_b and dist_a <= dist_c:
        return a
    return b if dist_b <= dist_c else c
|
||||
|
||||
import re
|
||||
import html
|
||||
|
||||
def clean_pdf_text_to_html(page_number, text):
    """Convert raw text from one PDF page into simple, heuristic HTML.

    Lines are classified in this order: title (first line of 3-8 words),
    numbered header after a blank line, author list (page 1 only),
    affiliation, e-mail group, section header, quote; everything else is
    gathered into paragraphs. All text is HTML-escaped and blocks appear in
    document order.

    Args:
        page_number: 1-based page number; author detection runs on page 1 only.
        text: Raw page text.

    Returns:
        str: HTML fragments joined by newlines.
    """
    # Decode Unicode escapes and handle surrogate pairs
    try:
        decoded = text.encode('latin-1').decode('unicode-escape')
        decoded = decoded.encode('utf-16', 'surrogatepass').decode('utf-16')
    except UnicodeError:
        decoded = text  # Fallback if decoding fails

    lines = decoded.split('\n')
    output = []
    current_paragraph = []
    in_header = False
    email_pattern = re.compile(r'\{.*?\}')
    affiliation_pattern = re.compile(r'^†')
    quote_pattern = re.compile(r'^["“]')
    author_pattern = re.compile(
        r'^\s*[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?'
        r'(?:,\s*[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?)*'
        r'(?:,\s*(?:and|&)\s+[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?)?\s*$'
    )
    numbered_header_pattern = re.compile(r'^(\d+(?:\.\d+)*)\s+(.+)$')
    section_header_pattern = re.compile(r'^(Abstract|\d+\s+[A-Z]|References|Appendix|Figure|Table)')

    def flush_paragraph():
        """Emit the pending paragraph lines (if any) as one escaped block."""
        if current_paragraph:
            para = ' '.join(current_paragraph)
            para = re.sub(r'\s+', ' ', para).strip()
            if para:
                # Escape like every other branch so markup characters in
                # the PDF text cannot inject HTML.
                output.append(
                    f'<div class="paragraph"><p>{html.escape(para)}</p></div><hr/>'
                )
            current_paragraph.clear()

    for i, line in enumerate(lines):
        line = line.strip()

        # Handle empty lines
        if not line:
            flush_paragraph()
            continue

        # Detect article title (first line with reasonable length)
        if i == 0 and 3 <= len(line.split()) <= 8 and len(lines) > 1:
            output.append(f'<h2>{html.escape(line)}</h2>')
            continue

        # Detect numbered headers like "2.1 Background"
        numbered_header = numbered_header_pattern.match(line)
        if i > 0 and not lines[i - 1].strip() and numbered_header:
            flush_paragraph()
            level = numbered_header.group(1).count('.') + 1
            md_level = min(level + 1, 6)
            escaped_header = html.escape(numbered_header.group(2))
            output.append(f'<h{md_level}>{escaped_header}</h{md_level}>')
            in_header = True
            continue

        # Detect authors
        if page_number == 1 and author_pattern.match(line):
            # Flush first so earlier paragraph text stays ahead of this line.
            flush_paragraph()
            authors = re.sub(r'[†â€]', '', line)
            formatted_authors = [
                f"<strong>{html.escape(' '.join(author.split()))}</strong>"
                for author in re.split(r', | and ', authors)
                if author.strip()
            ]
            if len(formatted_authors) > 1:
                joined = ', '.join(formatted_authors[:-1]) + ' and ' + formatted_authors[-1]
            else:
                joined = formatted_authors[0]
            output.append(f'<p>{joined}</p>')
            continue

        # Detect affiliation
        if affiliation_pattern.match(line):
            flush_paragraph()
            output.append(f'<p><em>{html.escape(line)}</em></p>')
            continue

        # Detect emails
        if email_pattern.match(line):
            flush_paragraph()
            output.append(f'<p><code>{html.escape(line)}</code></p>')
            continue

        # Detect section headers
        if section_header_pattern.match(line):
            flush_paragraph()
            output.append(f'<h2 class="section-header"><em>{html.escape(line)}</em></h2>')
            in_header = True
            continue

        # Handle quotes
        if quote_pattern.match(line):
            flush_paragraph()
            output.append(f'<blockquote><p>{html.escape(line)}</p></blockquote>')
            continue

        # Handle hyphenated words
        if line.endswith('-'):
            current_paragraph.append(line[:-1].strip())
        else:
            current_paragraph.append(line)

        # Handle paragraph breaks after headers
        if in_header and not line.endswith(('.', '!', '?')):
            flush_paragraph()
            in_header = False

    flush_paragraph()

    # Post-process HTML
    html_output = '\n'.join(output)

    # Fix common citation patterns
    html_output = re.sub(r'\(([A-Z][a-z]+ et al\. \d{4})\)', r'<cite>\1</cite>', html_output)

    # Fix escaped characters
    html_output = html_output.replace('\\ud835', '').replace('\\u2020', '†')

    # Remove leftover hyphens and fix spacing
    html_output = re.sub(r'\s+-\s+', '', html_output)
    html_output = re.sub(r'\s+([.,!?)])', r'\1', html_output)

    return html_output
|
||||
|
||||
def clean_pdf_text(page_number, text):
    """Convert raw text from one PDF page into heuristic Markdown.

    Lines are classified in this order: headline (first line of 3-8 words),
    numbered header after a blank line, author list (page 1 only),
    affiliation, e-mail group, section header, quote; everything else is
    gathered into paragraphs. Blocks appear in document order, separated by
    blank lines.

    Args:
        page_number: 1-based page number; author detection runs on page 1 only.
        text: Raw page text.

    Returns:
        str: The Markdown rendering of the page.
    """
    # Decode Unicode escapes and handle surrogate pairs
    try:
        decoded = text.encode('latin-1').decode('unicode-escape')
        decoded = decoded.encode('utf-16', 'surrogatepass').decode('utf-16')
    except UnicodeError:
        decoded = text  # Fallback if decoding fails

    # A line ending in a full stop is treated as the end of a paragraph.
    decoded = re.sub(r'\.\n', '.\n\n', decoded)
    lines = decoded.split('\n')
    output = []
    current_paragraph = []
    in_header = False
    email_pattern = re.compile(r'\{.*?\}')
    affiliation_pattern = re.compile(r'^†')
    quote_pattern = re.compile(r'^["“]')
    author_pattern = re.compile(
        r'^\s*[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?'
        r'(?:,\s*[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?)*'
        r'(?:,\s*(?:and|&)\s+[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?)?\s*$'
    )
    numbered_header_pattern = re.compile(r'^(\d+(?:\.\d+)*)\s+(.+)$')
    section_header_pattern = re.compile(r'^(Abstract|\d+\s+[A-Z]|References|Appendix|Figure|Table)')

    def flush_paragraph():
        """Emit the pending paragraph lines (if any) as one block."""
        if current_paragraph:
            para = ' '.join(current_paragraph)
            para = re.sub(r'\s+', ' ', para).strip()
            if para:
                output.append(para)
            current_paragraph.clear()

    for i, line in enumerate(lines):
        line = line.strip()

        # Blank line: paragraph boundary
        if not line:
            flush_paragraph()
            continue

        # Detect headline (first line, reasonable length)
        if i == 0 and 3 <= len(line.split()) <= 8 and len(lines) > 1:
            output.append(f'## {line}')
            continue

        # Detect numbered headers like "2.1 Background". The i > 0 guard
        # stops lines[i - 1] from wrapping round to the last line when the
        # very first line is examined.
        numbered_header = numbered_header_pattern.match(line)
        if i > 0 and not lines[i - 1].strip() and numbered_header:
            flush_paragraph()
            level = numbered_header.group(1).count('.') + 1  # Convert 2.1 -> level 2
            md_level = min(level + 1, 6)  # 1 -> ##, 2 -> ###, capped at ######
            output.append(f'{"#" * md_level} {numbered_header.group(2)}')
            in_header = True
            continue

        # Detect authors
        if page_number == 1 and author_pattern.match(line):
            # Flush first so earlier paragraph text stays ahead of this line.
            flush_paragraph()
            authors = re.sub(r'[†â€]', '', line)  # Remove affiliation markers
            formatted_authors = [
                f"**{' '.join(author.split())}**"
                for author in re.split(r', | and ', authors)
                if author.strip()
            ]
            # Join with commas and "and"
            if len(formatted_authors) > 1:
                joined = ', '.join(formatted_authors[:-1]) + ' and ' + formatted_authors[-1]
            else:
                joined = formatted_authors[0]
            output.append(joined)
            continue

        # Detect affiliation
        if affiliation_pattern.match(line):
            flush_paragraph()
            output.append(f'*{line}*')
            continue

        # Detect emails
        if email_pattern.match(line):
            flush_paragraph()
            output.append(f'`{line}`')
            continue

        # Detect section headers
        if section_header_pattern.match(line):
            flush_paragraph()
            output.append(f'_[{line}]_')
            in_header = True
            continue

        # Handle quotes
        if quote_pattern.match(line):
            flush_paragraph()
            output.append(f'> {line}')
            continue

        # Handle hyphenated words
        if line.endswith('-'):
            current_paragraph.append(line[:-1].strip())
        else:
            current_paragraph.append(line)

        # Handle paragraph breaks after headers
        if in_header and not line.endswith(('.', '!', '?')):
            flush_paragraph()
            in_header = False

    flush_paragraph()

    # Post-processing
    markdown = '\n\n'.join(output)

    # Fix common citation patterns
    markdown = re.sub(r'\(([A-Z][a-z]+ et al\. \d{4})\)', r'[\1]', markdown)

    # Fix escaped characters
    markdown = markdown.replace('\\ud835', '').replace('\\u2020', '†')

    # Remove leftover hyphens and fix spacing
    markdown = re.sub(r'\s+-\s+', '', markdown)  # Join hyphenated words
    markdown = re.sub(r'\s+([.,!?)])', r'\1', markdown)  # Fix punctuation spacing

    return markdown
|
||||
Reference in New Issue
Block a user