feat(pdf): add PDF processing capabilities

Add new PDF processing module with the following features:
- PDF text extraction and formatting to HTML/Markdown
- Image extraction with multiple format support (JPEG, PNG, TIFF)
- Link extraction from PDF documents
- Metadata extraction including title, author, dates
- Support for both local and remote PDF files

Also includes:
- New configuration options for HTML attribute handling
- Internal/external link filtering improvements
- Version bump to 0.4.300b4
This commit is contained in:
UncleCode
2025-01-27 21:24:15 +08:00
parent 54c84079c4
commit f8fd9d9eff
9 changed files with 933 additions and 49 deletions

View File

@@ -0,0 +1,164 @@
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Dict, List, Optional
from dataclasses import asdict
from crawl4ai.async_logger import AsyncLogger
from crawl4ai.async_crawler_strategy import AsyncCrawlerStrategy
from crawl4ai.models import AsyncCrawlResponse, ScrapingResult
from crawl4ai.content_scraping_strategy import ContentScrapingStrategy
from .processor import NaivePDFProcessorStrategy # Assuming your current PDF code is in pdf_processor.py
class PDFCrawlerStrategy(AsyncCrawlerStrategy):
    """Crawler strategy that performs no fetching for PDF URLs.

    ``crawl`` returns a fixed placeholder response; the PDF itself is read
    later by the content scraping strategy.
    """

    def __init__(self, logger: AsyncLogger = None):
        self.logger = logger

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.close()

    async def close(self):
        """Nothing to release; present to satisfy the strategy interface."""
        return None

    async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
        """Return an empty-HTML, status-200 response tagged as a PDF.

        ``url`` and ``kwargs`` are accepted but not used here.
        """
        placeholder = {
            "html": "",  # Scraper will handle the real work
            "response_headers": {"Content-Type": "application/pdf"},
            "status_code": 200,
        }
        return AsyncCrawlResponse(**placeholder)
class PDFContentScrapingStrategy(ContentScrapingStrategy):
    """
    A content scraping strategy for PDF files.

    Args:
        save_images_locally (bool): Whether to save images locally.
        extract_images (bool): Whether to extract images from PDF.
        image_save_dir (str): Directory to save extracted images.
        logger (AsyncLogger): Logger instance for recording events and errors.

    Attributes:
        logger (AsyncLogger): The logger passed to the constructor.
        pdf_processor (NaivePDFProcessorStrategy): Processor configured with
            the image options above.

    Usage:
        strategy = PDFContentScrapingStrategy(
            save_images_locally=False,
            extract_images=False,
            image_save_dir=None,
            logger=logger
        )
    """

    def __init__(self,
                 save_images_locally=False,
                 extract_images=False,
                 image_save_dir=None,
                 logger: AsyncLogger = None):
        self.logger = logger
        # Forward the caller's options; previously hard-coded values were
        # passed here, so the constructor arguments had no effect.
        self.pdf_processor = NaivePDFProcessorStrategy(
            save_images_locally=save_images_locally,
            extract_images=extract_images,
            image_save_dir=image_save_dir
        )

    def scrap(self, url: str, html: str, **params) -> ScrapingResult:
        """
        Scrap content from a PDF file.

        Args:
            url (str): http(s) URL, ``file://`` URL or local path of the PDF.
            html (str): Ignored; kept for interface compatibility.
            **params: Additional parameters (unused).

        Returns:
            ScrapingResult: Combined per-page HTML, images and links (each
            tagged with its page number) and the document metadata.

        Raises:
            RuntimeError: If a remote PDF cannot be downloaded.
        """
        # Download if URL or use local path
        pdf_path = self._get_pdf_path(url)
        try:
            result = self.pdf_processor.process(Path(pdf_path))

            # One <div> per page, numbered from 1 in document order.
            pages_html = ''.join(
                f'<div class="pdf-page" data-page="{i+1}">{page.html}</div>'
                for i, page in enumerate(result.pages)
            )
            cleaned_html = f"""
<html>
<head><meta name="pdf-pages" content="{len(result.pages)}"></head>
<body>
{pages_html}
</body>
</html>
"""

            # Accumulate media and links with page numbers
            media = {"images": []}
            links = {"urls": []}
            for page in result.pages:
                for img in page.images:
                    img["page"] = page.page_number
                    media["images"].append(img)
                for link in page.links:
                    links["urls"].append({
                        "url": link,
                        "page": page.page_number
                    })

            return ScrapingResult(
                cleaned_html=cleaned_html,
                success=True,
                media=media,
                links=links,
                metadata=asdict(result.metadata)
            )
        finally:
            # Only downloaded PDFs live in a temp file that we own.
            if url.startswith(("http://", "https://")):
                Path(pdf_path).unlink(missing_ok=True)

    async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
        """Asynchronous version of ``scrap``; runs it in a worker thread."""
        # Imported here because the module does not import asyncio at the top.
        import asyncio

        return await asyncio.to_thread(self.scrap, url, html, **kwargs)

    def _get_pdf_path(self, url: str) -> str:
        """Return a local filesystem path for ``url``.

        http(s) URLs are streamed into a new ``.pdf`` temp file (the caller
        deletes it); a ``file://`` prefix is stripped; anything else is
        returned unchanged as a local path.

        Raises:
            RuntimeError: If the download fails; the temp file is removed first.
        """
        if url.startswith(("http://", "https://")):
            import tempfile
            import requests

            # delete=False: the file must outlive this handle so it can be
            # re-opened by path for processing.
            temp_file = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
            try:
                # Both the file handle and the HTTP connection are closed on
                # exit, including on error.
                # NOTE(review): no request timeout is set; a stalled server
                # can block this call indefinitely.
                with temp_file, requests.get(url, stream=True) as response:
                    response.raise_for_status()
                    for chunk in response.iter_content(chunk_size=8192):
                        temp_file.write(chunk)
                return temp_file.name
            except Exception as e:
                # Clean up temp file if download fails
                Path(temp_file.name).unlink(missing_ok=True)
                raise RuntimeError(f"Failed to download PDF from {url}: {str(e)}") from e
        elif url.startswith("file://"):
            return url[7:]  # Strip file:// prefix
        return url  # Assume local path
# Names exported by a star-import of this module.
__all__ = ["PDFCrawlerStrategy", "PDFContentScrapingStrategy"]

View File

@@ -0,0 +1,372 @@
import logging
import re
from abc import ABC, abstractmethod
from datetime import datetime
from pathlib import Path
from time import time
from dataclasses import dataclass, asdict, field
from typing import Dict, List, Optional, Tuple
import PyPDF2
from PIL import Image
from PyPDF2 import PdfReader
from .utils import *
import base64
import tempfile
logger = logging.getLogger(__name__)
from dataclasses import dataclass, field
from datetime import datetime
from typing import List, Optional, Dict, Any
from pathlib import Path
@dataclass
class PDFMetadata:
    """Document-level facts about a PDF.

    Every field has a default, so ``PDFMetadata()`` is a valid empty record.
    """

    # Descriptive entries from the document information dictionary.
    title: Optional[str] = None
    author: Optional[str] = None
    producer: Optional[str] = None

    # Creation / modification timestamps, or None when absent or unparseable.
    created: Optional[datetime] = None
    modified: Optional[datetime] = None

    # Structural facts about the file.
    pages: int = 0
    encrypted: bool = False
    file_size: Optional[int] = None
@dataclass
class PDFPage:
    """Content extracted from a single PDF page.

    Only ``page_number`` is required; text fields default to empty strings and
    each list field gets its own fresh list per instance.
    """

    page_number: int

    # The same page text in three renderings.
    raw_text: str = ""
    markdown: str = ""
    html: str = ""

    # Per-page collections.
    images: List[Dict] = field(default_factory=list)
    links: List[str] = field(default_factory=list)
    layout: List[Dict] = field(default_factory=list)
@dataclass
class PDFProcessResult:
    """Outcome of processing one PDF: its metadata plus the extracted pages."""

    # Required: document metadata and the list of processed pages.
    metadata: PDFMetadata
    pages: List[PDFPage]

    # Optional bookkeeping with defaults.
    processing_time: float = 0.0
    version: str = "1.0"
class PDFProcessorStrategy(ABC):
    """Abstract interface for PDF processors."""

    @abstractmethod
    def process(self, pdf_path: Path) -> PDFProcessResult:
        """Process the PDF at ``pdf_path`` and return a ``PDFProcessResult``."""
class NaivePDFProcessorStrategy(PDFProcessorStrategy):
    """PyPDF2-based processor that extracts text, images, links and metadata.

    Args:
        image_dpi: Stored on the instance; not used by the code in this class.
        image_quality: Stored on the instance; not used by the code in this class.
        extract_images: Whether image XObjects are extracted from each page.
        save_images_locally: If True, images are written to disk and reported
            by ``"path"``; otherwise they are reported base64-encoded under
            ``"data"``.
        image_save_dir: Target directory for saved images. When unset, a
            temporary directory is used for the duration of ``process``.
    """

    def __init__(self, image_dpi: int = 144, image_quality: int = 85, extract_images: bool = True,
                 save_images_locally: bool = False, image_save_dir: Optional[Path] = None):
        self.image_dpi = image_dpi
        self.image_quality = image_quality
        self.current_page_number = 0
        self.extract_images = extract_images
        self.save_images_locally = save_images_locally
        self.image_save_dir = image_save_dir
        self._temp_dir = None

    def process(self, pdf_path: Path) -> PDFProcessResult:
        """Process every page of ``pdf_path``.

        Returns:
            A ``PDFProcessResult`` (version ``"1.1"``) with metadata, one
            ``PDFPage`` per page and the elapsed processing time in seconds.

        Raises:
            Exception: Any error raised while opening or reading the PDF is
                logged and re-raised unchanged.
        """
        start_time = time()
        result = PDFProcessResult(
            metadata=PDFMetadata(),
            pages=[],
            version="1.1"
        )
        try:
            with pdf_path.open('rb') as file:
                reader = PdfReader(file)
                result.metadata = self._extract_metadata(pdf_path, reader)
                image_dir = self._prepare_image_dir()
                for page_num, page in enumerate(reader.pages, start=1):
                    self.current_page_number = page_num
                    result.pages.append(self._process_page(page, image_dir, reader))
        except Exception as e:
            logger.error(f"Failed to process PDF: {str(e)}")
            raise
        finally:
            self._cleanup_temp_dir()
        result.processing_time = time() - start_time
        return result

    def _prepare_image_dir(self) -> Optional[Path]:
        """Return the directory images are written to, or None if not saving."""
        if not (self.extract_images and self.save_images_locally):
            return None
        if self.image_save_dir:
            image_dir = Path(self.image_save_dir)
            image_dir.mkdir(exist_ok=True, parents=True)
            return image_dir
        self._temp_dir = tempfile.mkdtemp(prefix='pdf_images_')
        return Path(self._temp_dir)

    def _cleanup_temp_dir(self) -> None:
        """Remove the temporary image directory created by ``process``, if any.

        NOTE(review): image ``"path"`` values recorded during this run point
        into this directory, which no longer exists once ``process`` returns.
        Confirm whether callers need those files to survive.
        """
        if self._temp_dir and not self.image_save_dir:
            import shutil
            try:
                shutil.rmtree(self._temp_dir)
            except Exception as e:
                logger.error(f"Failed to cleanup temp directory: {str(e)}")
        # Forget the directory so a later process() call does not try to
        # delete it a second time.
        self._temp_dir = None

    def _process_page(self, page, image_dir: Optional[Path], reader) -> PDFPage:
        """Build a ``PDFPage`` for ``page`` (numbered ``current_page_number``).

        ``reader`` is accepted for interface compatibility and is not used.
        """
        pdf_page = PDFPage(
            page_number=self.current_page_number,
        )

        # PyPDF2 calls this for each text fragment; tm[4] / tm[5] are the
        # translation components of the text matrix.
        def visitor_text(text, cm, tm, font_dict, font_size):
            pdf_page.raw_text += text
            pdf_page.layout.append({
                "type": "text",
                "text": text,
                "x": tm[4],
                "y": tm[5],
            })

        page.extract_text(visitor_text=visitor_text)

        if self.extract_images:
            pdf_page.images = self._extract_images(page, image_dir)
        pdf_page.links = self._extract_links(page)

        pdf_page.markdown = clean_pdf_text(self.current_page_number, pdf_page.raw_text)
        pdf_page.html = clean_pdf_text_to_html(self.current_page_number, pdf_page.raw_text)
        return pdf_page

    def _extract_images(self, page, image_dir: Optional[Path]) -> List[Dict]:
        """Return one info dict per image XObject found in the page resources.

        Errors are logged, never raised: a failure on one image skips that
        image, a failure while walking the resources ends extraction for the
        page with whatever was collected so far.
        """
        if not self.extract_images:
            return []
        images = []
        try:
            resources = page.get("/Resources")
            if not resources:
                return images
            resources = resources.get_object()  # Resolve IndirectObject
            if '/XObject' not in resources:
                return images
            xobjects = resources['/XObject'].get_object()
            img_count = 0
            for obj_name in xobjects:
                xobj = xobjects[obj_name]
                if hasattr(xobj, 'get_object') and callable(xobj.get_object):
                    xobj = xobj.get_object()
                if xobj.get('/Subtype') != '/Image':
                    continue
                try:
                    img_count += 1
                    img_filename = f"page_{self.current_page_number}_img_{img_count}"
                    images.append(self._decode_image(xobj, image_dir, img_filename))
                except Exception as e:
                    logger.error(f"Error processing image: {str(e)}")
        except Exception as e:
            logger.error(f"Image extraction error: {str(e)}")
        return images

    def _store_image_bytes(self, data, image_dir: Optional[Path], img_filename: str, suffix: str) -> str:
        """Write ``data`` to ``image_dir`` and return its path, or return it base64-encoded."""
        if self.save_images_locally:
            final_path = (image_dir / img_filename).with_suffix(suffix)
            with open(final_path, 'wb') as f:
                f.write(data)
            return str(final_path)
        return base64.b64encode(data).decode('utf-8')

    def _decode_image(self, xobj, image_dir: Optional[Path], img_filename: str) -> Dict:
        """Convert one image XObject into an info dict.

        The first matching filter among FlateDecode (PNG), DCTDecode (JPEG),
        CCITTFaxDecode (TIFF) and JPXDecode (JPEG 2000) is tried. If it fails
        or no filter matches, the stream bytes are emitted with format "bin".
        The result carries ``"path"`` when saving locally, else ``"data"``.
        """
        data = xobj.get_data()
        filters = xobj.get('/Filter', [])
        if not isinstance(filters, list):
            filters = [filters]

        width = xobj.get('/Width', 0)
        height = xobj.get('/Height', 0)
        color_space = xobj.get('/ColorSpace', '/DeviceRGB')
        if isinstance(color_space, PyPDF2.generic.IndirectObject):
            color_space = color_space.get_object()

        success = False
        image_format = 'bin'
        image_data = None

        if '/FlateDecode' in filters:
            try:
                decode_parms = xobj.get('/DecodeParms', {})
                if isinstance(decode_parms, PyPDF2.generic.IndirectObject):
                    decode_parms = decode_parms.get_object()
                predictor = decode_parms.get('/Predictor', 1)
                bits = xobj.get('/BitsPerComponent', 8)
                # Only DeviceRGB is treated as 3-channel; everything else is
                # decoded as single-channel greyscale and falls back to raw
                # output if the byte count does not fit.
                is_rgb = color_space == '/DeviceRGB'
                if predictor >= 10:
                    data = apply_png_predictor(data, width, bits, 3 if is_rgb else 1)
                img = Image.frombytes('RGB' if is_rgb else 'L', (width, height), data)
                if self.save_images_locally:
                    final_path = (image_dir / img_filename).with_suffix('.png')
                    img.save(final_path)
                    image_data = str(final_path)
                else:
                    import io
                    buffer = io.BytesIO()
                    img.save(buffer, format='PNG')
                    image_data = base64.b64encode(buffer.getvalue()).decode('utf-8')
                success = True
                image_format = 'png'
            except Exception as e:
                logger.error(f"FlateDecode error: {str(e)}")
        elif '/DCTDecode' in filters:
            # JPEG image: the stream is already a complete JPEG file.
            try:
                image_data = self._store_image_bytes(data, image_dir, img_filename, '.jpg')
                success = True
                image_format = 'jpeg'
            except Exception as e:
                logger.error(f"JPEG save error: {str(e)}")
        elif '/CCITTFaxDecode' in filters:
            try:
                if data[:4] != b'II*\x00':
                    # Add TIFF header if missing
                    tiff_header = b'II*\x00\x08\x00\x00\x00\x0e\x00\x00\x01\x03\x00\x01\x00\x00\x00' + \
                        width.to_bytes(4, 'little') + \
                        b'\x01\x03\x00\x01\x00\x00\x00' + \
                        height.to_bytes(4, 'little') + \
                        b'\x01\x12\x00\x03\x00\x00\x00\x01\x00\x01\x00\x00\x01\x17\x00\x04\x00\x00\x00\x01\x00\x00\x00J\x01\x1B\x00\x05\x00\x00\x00\x01\x00\x00\x00R\x01\x28\x00\x03\x00\x00\x00\x01\x00\x02\x00\x00'
                    data = tiff_header + data
                image_data = self._store_image_bytes(data, image_dir, img_filename, '.tiff')
                success = True
                image_format = 'tiff'
            except Exception as e:
                logger.error(f"CCITT save error: {str(e)}")
        elif '/JPXDecode' in filters:
            # JPEG 2000
            try:
                image_data = self._store_image_bytes(data, image_dir, img_filename, '.jp2')
                success = True
                image_format = 'jpeg2000'
            except Exception as e:
                logger.error(f"JPEG2000 save error: {str(e)}")

        image_info = {
            "format": image_format if (success and image_data) else "bin",
            "width": width,
            "height": height,
            "color_space": str(color_space),
            "bits_per_component": xobj.get('/BitsPerComponent', 1)
        }
        if success and image_data:
            image_info["path" if self.save_images_locally else "data"] = image_data
            return image_info

        # Fallback: emit the stream bytes as-is. The saved file's path is now
        # recorded under "path", matching the decoded-image entries.
        if self.save_images_locally:
            final_path = (image_dir / img_filename).with_suffix('.bin')
            with open(final_path, 'wb') as f:
                f.write(data)
            logger.warning(f"Saved raw image data to {final_path}")
            image_info["path"] = str(final_path)
        else:
            image_info["data"] = base64.b64encode(data).decode('utf-8')
        return image_info

    def _extract_links(self, page) -> List[str]:
        """Return the ``/URI`` targets of the page's annotations.

        Errors are logged and the links gathered so far are returned.
        """
        links = []
        if '/Annots' in page:
            try:
                for annot in page['/Annots']:
                    a = annot.get_object()
                    if '/A' in a and '/URI' in a['/A']:
                        links.append(a['/A']['/URI'])
            except Exception as e:
                # Use the module logger like every other error path here.
                logger.error(f"Link error: {str(e)}")
        return links

    def _extract_metadata(self, pdf_path: Path, reader: PdfReader = None) -> PDFMetadata:
        """Build ``PDFMetadata`` from the reader's info dictionary and the file size.

        A reader is opened from ``pdf_path`` when none is supplied.
        """
        if reader is None:
            reader = PdfReader(pdf_path)
        meta = reader.metadata or {}
        return PDFMetadata(
            title=meta.get('/Title'),
            author=meta.get('/Author'),
            producer=meta.get('/Producer'),
            created=self._parse_pdf_date(meta.get('/CreationDate', '')),
            modified=self._parse_pdf_date(meta.get('/ModDate', '')),
            pages=len(reader.pages),
            encrypted=reader.is_encrypted,
            file_size=pdf_path.stat().st_size
        )

    def _parse_pdf_date(self, date_str: str) -> Optional[datetime]:
        """Parse a PDF date string (``D:YYYYMMDDHHmmSS...``) into a naive datetime.

        Only the year is mandatory; missing month/day default to 1 and missing
        time fields to 0. Any timezone suffix is ignored. Returns None for
        non-strings, strings without the ``D:YYYY`` prefix, or out-of-range
        field values.
        """
        try:
            match = re.match(r'D:(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?', date_str)
            if not match:
                return None
            year, month, day, hour, minute, second = match.groups()
            return datetime(
                year=int(year),
                month=int(month or 1),
                day=int(day or 1),
                hour=int(hour or 0),
                minute=int(minute or 0),
                second=int(second or 0)
            )
        except (TypeError, ValueError):
            # TypeError: date_str is not a string; ValueError: invalid field.
            return None
# Usage example: process test.pdf next to this file and dump JSON, HTML and
# Markdown renderings alongside it.
if __name__ == "__main__":
    import json
    from pathlib import Path

    current_dir = Path(__file__).resolve().parent
    pdf_path = f'{current_dir}/test.pdf'

    strategy = NaivePDFProcessorStrategy()
    result = strategy.process(Path(pdf_path))

    # Convert to JSON
    json_output = asdict(result)
    print(json.dumps(json_output, indent=2, default=str))

    # result.pages holds PDFPage dataclass instances, so fields are read as
    # attributes (subscripting them raises TypeError).
    with open(f'{current_dir}/test.html', 'w', encoding='utf-8') as f:
        for page in result.pages:
            f.write(f'<h1>Page {page.page_number}</h1>')
            f.write(page.html)

    with open(f'{current_dir}/test.md', 'w', encoding='utf-8') as f:
        for page in result.pages:
            f.write(f'# Page {page.page_number}\n\n')
            f.write(clean_pdf_text(page.page_number, page.raw_text))
            f.write('\n\n')

View File

@@ -0,0 +1,350 @@
import re
def apply_png_predictor(data, width, bits, color_channels):
    """Decode PNG predictor (PDF 1.5+ filter).

    Args:
        data: Bytes made of rows, each one filter-type byte followed by the
            filtered row bytes.
        width: Image width in pixels.
        bits: Bits per colour component.
        color_channels: Number of colour components per pixel.

    Returns:
        bytes: The unfiltered rows concatenated, without the filter bytes.

    Raises:
        ValueError: If ``len(data)`` is not a whole number of rows, or a row
            uses a filter type other than 0-4.
    """
    bits_per_pixel = bits * color_channels
    # Filters look back a whole number of bytes; pixels narrower than a byte
    # use a distance of one byte.
    bytes_per_pixel = max(1, (bits_per_pixel + 7) // 8)
    # A row is the packed pixel bits rounded up to a byte, not
    # width * bytes_per_pixel (those differ for e.g. 1-bit or 12-bit pixels).
    stride = (width * bits_per_pixel + 7) // 8
    scanline_length = stride + 1  # +1 for filter byte
    if len(data) % scanline_length != 0:
        raise ValueError("Invalid scanline structure")

    output = bytearray()
    prev_line = b'\x00' * stride  # the row above the first row is all zeros
    for start in range(0, len(data), scanline_length):
        filter_type = data[start]
        filtered = data[start + 1:start + scanline_length]

        if filter_type == 0:  # None
            decoded = filtered
        elif filter_type == 1:  # Sub
            decoded = bytearray(filtered)
            for j in range(bytes_per_pixel, len(decoded)):
                decoded[j] = (decoded[j] + decoded[j - bytes_per_pixel]) % 256
        elif filter_type == 2:  # Up
            decoded = bytearray((cur + up) % 256 for cur, up in zip(filtered, prev_line))
        elif filter_type == 3:  # Average
            decoded = bytearray(filtered)
            for j in range(len(decoded)):
                left = decoded[j - bytes_per_pixel] if j >= bytes_per_pixel else 0
                decoded[j] = (decoded[j] + (left + prev_line[j]) // 2) % 256
        elif filter_type == 4:  # Paeth
            decoded = bytearray(filtered)
            for j in range(len(decoded)):
                has_left = j >= bytes_per_pixel
                left = decoded[j - bytes_per_pixel] if has_left else 0
                up_left = prev_line[j - bytes_per_pixel] if has_left else 0
                decoded[j] = (decoded[j] + paeth_predictor(left, prev_line[j], up_left)) % 256
        else:
            raise ValueError(f"Unsupported filter type: {filter_type}")

        output.extend(decoded)
        prev_line = decoded
    return bytes(output)
def paeth_predictor(a, b, c):
    """Return whichever of ``a``, ``b``, ``c`` is closest to ``a + b - c``.

    Ties are resolved in favour of ``a``, then ``b``, then ``c``.
    """
    estimate = a + b - c
    dist_a = abs(estimate - a)
    dist_b = abs(estimate - b)
    dist_c = abs(estimate - c)
    if dist_a <= min(dist_b, dist_c):
        return a
    return b if dist_b <= dist_c else c
import re
import html
def clean_pdf_text_to_html(page_number, text):
    """Convert raw text extracted from one PDF page into simple HTML.

    Lines are classified heuristically (title, numbered header, author list,
    affiliation, e-mail group, section header, quote); everything else is
    gathered into paragraphs. All text taken from the PDF is HTML-escaped.

    Args:
        page_number: 1-based page number; author detection only runs on page 1.
        text: Raw page text with ``\\n`` line breaks.

    Returns:
        str: HTML fragments joined by newlines.
    """
    # Decode Unicode escapes and handle surrogate pairs
    try:
        decoded = text.encode('latin-1').decode('unicode-escape')
        decoded = decoded.encode('utf-16', 'surrogatepass').decode('utf-16')
    except UnicodeError:
        decoded = text  # Fallback if decoding fails

    lines = decoded.split('\n')
    output = []
    current_paragraph = []
    in_header = False

    email_pattern = re.compile(r'\{.*?\}')
    affiliation_pattern = re.compile(r'^†')
    quote_pattern = re.compile(r'^["“]')
    author_pattern = re.compile(
        r'^\s*[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?'
        r'(?:,\s*[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?)*'
        r'(?:,\s*(?:and|&)\s+[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?)?\s*$'
    )

    def flush_paragraph():
        """Emit the pending paragraph lines as one escaped block, then reset."""
        if current_paragraph:
            para = re.sub(r'\s+', ' ', ' '.join(current_paragraph)).strip()
            if para:
                # PDF text is untrusted: escape it like every other fragment.
                output.append(f'<div class="paragraph"><p>{html.escape(para)}</p></div><hr/>')
            current_paragraph.clear()

    for i, line in enumerate(lines):
        line = line.strip()

        # Handle empty lines
        if not line:
            flush_paragraph()
            continue

        # Detect article title (first line with reasonable length)
        if i == 0 and 3 <= len(line.split()) <= 8 and len(lines) > 1:
            flush_paragraph()
            output.append(f'<h2>{html.escape(line)}</h2>')
            continue

        # Detect numbered headers like "2.1 Background" that follow a blank line
        numbered_header = re.match(r'^(\d+(?:\.\d+)*)\s+(.+)$', line)
        if i > 0 and not lines[i-1].strip() and numbered_header:
            flush_paragraph()
            level = numbered_header.group(1).count('.') + 1
            md_level = min(level + 1, 6)
            escaped_header = html.escape(numbered_header.group(2))
            output.append(f'<h{md_level}>{escaped_header}</h{md_level}>')
            in_header = True
            continue

        # Detect authors
        if page_number == 1 and author_pattern.match(line):
            # Flush first so the pending paragraph keeps its place in the output.
            flush_paragraph()
            authors = re.split(r', | and ', re.sub(r'[†â€]', '', line))
            formatted_authors = [
                f'<strong>{html.escape(" ".join(author.split()))}</strong>'
                for author in authors if author.strip()
            ]
            if len(formatted_authors) > 1:
                joined = ', '.join(formatted_authors[:-1]) + ' and ' + formatted_authors[-1]
            else:
                joined = formatted_authors[0]
            output.append(f'<p>{joined}</p>')
            continue

        # Detect affiliation
        if affiliation_pattern.match(line):
            flush_paragraph()
            output.append(f'<p><em>{html.escape(line)}</em></p>')
            continue

        # Detect emails
        if email_pattern.match(line):
            flush_paragraph()
            output.append(f'<p><code>{html.escape(line)}</code></p>')
            continue

        # Detect section headers
        if re.match(r'^(Abstract|\d+\s+[A-Z]|References|Appendix|Figure|Table)', line):
            flush_paragraph()
            output.append(f'<h2 class="section-header"><em>{html.escape(line)}</em></h2>')
            in_header = True
            continue

        # Handle quotes
        if quote_pattern.match(line):
            flush_paragraph()
            output.append(f'<blockquote><p>{html.escape(line)}</p></blockquote>')
            continue

        # Handle hyphenated words: drop a trailing line-break hyphen
        if line.endswith('-'):
            current_paragraph.append(line[:-1].strip())
        else:
            current_paragraph.append(line)

        # Handle paragraph breaks after headers
        if in_header and not line.endswith(('.', '!', '?')):
            flush_paragraph()
            in_header = False

    flush_paragraph()

    # Post-process HTML
    html_output = '\n'.join(output)
    # Fix common citation patterns
    html_output = re.sub(r'\(([A-Z][a-z]+ et al\. \d{4})\)', r'<cite>\1</cite>', html_output)
    # Fix escaped characters
    html_output = html_output.replace('\\ud835', '').replace('\\u2020', '')
    # Remove leftover hyphens and fix spacing
    html_output = re.sub(r'\s+-\s+', '', html_output)
    html_output = re.sub(r'\s+([.,!?)])', r'\1', html_output)
    return html_output
def clean_pdf_text(page_number, text):
    """Convert raw text extracted from one PDF page into Markdown.

    Lines are classified heuristically (title, numbered header, author list,
    affiliation, e-mail group, section header, quote); everything else is
    gathered into paragraphs.

    Args:
        page_number: 1-based page number; author detection only runs on page 1.
        text: Raw page text with ``\\n`` line breaks.

    Returns:
        str: Markdown blocks separated by blank lines.
    """
    # Decode Unicode escapes and handle surrogate pairs
    try:
        decoded = text.encode('latin-1').decode('unicode-escape')
        decoded = decoded.encode('utf-16', 'surrogatepass').decode('utf-16')
    except UnicodeError:
        decoded = text  # Fallback if decoding fails

    # A line ending in a full stop also ends its paragraph.
    decoded = re.sub(r'\.\n', '.\n\n', decoded)
    lines = decoded.split('\n')
    output = []
    current_paragraph = []
    in_header = False

    email_pattern = re.compile(r'\{.*?\}')
    affiliation_pattern = re.compile(r'^†')
    quote_pattern = re.compile(r'^["“]')
    author_pattern = re.compile(
        r'^\s*[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?'
        r'(?:,\s*[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?)*'
        r'(?:,\s*(?:and|&)\s+[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?)?\s*$'
    )

    def flush_paragraph():
        """Emit the pending paragraph lines as one block, then reset."""
        if current_paragraph:
            para = re.sub(r'\s+', ' ', ' '.join(current_paragraph)).strip()
            if para:
                output.append(para)
            current_paragraph.clear()

    for i, line in enumerate(lines):
        line = line.strip()

        # Blank line: paragraph boundary
        if not line:
            flush_paragraph()
            continue

        # Detect headline (first line, reasonable length)
        if i == 0 and 3 <= len(line.split()) <= 8 and len(lines) > 1:
            flush_paragraph()
            output.append(f'## {line}')
            continue

        # Detect numbered headers like "2.1 Background" that follow a blank
        # line. The i > 0 guard stops lines[i-1] from wrapping round to the
        # last line of the page when i == 0.
        numbered_header = re.match(r'^(\d+(?:\.\d+)*)\s+(.+)$', line)
        if i > 0 and not lines[i-1].strip() and numbered_header:
            flush_paragraph()
            level = numbered_header.group(1).count('.') + 1  # Convert 2.1 → level 2
            md_level = min(level + 1, 6)  # 1 → ##, 2 → ###, 3 → #### etc
            output.append(f'{"#" * md_level} {numbered_header.group(2)}')
            in_header = True
            continue

        # Detect authors
        if page_number == 1 and author_pattern.match(line):
            # Flush first so the pending paragraph keeps its place in the output.
            flush_paragraph()
            # Remove affiliation markers, then split the list of names
            authors = re.split(r', | and ', re.sub(r'[†â€]', '', line))
            formatted_authors = [
                f'**{" ".join(author.split())}**'
                for author in authors if author.strip()
            ]
            # Join with commas and "and"
            if len(formatted_authors) > 1:
                joined = ', '.join(formatted_authors[:-1]) + ' and ' + formatted_authors[-1]
            else:
                joined = formatted_authors[0]
            output.append(joined)
            continue

        # Detect affiliation
        if affiliation_pattern.match(line):
            flush_paragraph()
            output.append(f'*{line}*')
            continue

        # Detect emails
        if email_pattern.match(line):
            flush_paragraph()
            output.append(f'`{line}`')
            continue

        # Detect section headers
        if re.match(r'^(Abstract|\d+\s+[A-Z]|References|Appendix|Figure|Table)', line):
            flush_paragraph()
            output.append(f'_[{line}]_')
            in_header = True
            continue

        # Handle quotes
        if quote_pattern.match(line):
            flush_paragraph()
            output.append(f'> {line}')
            continue

        # Handle hyphenated words: drop a trailing line-break hyphen
        if line.endswith('-'):
            current_paragraph.append(line[:-1].strip())
        else:
            current_paragraph.append(line)

        # Handle paragraph breaks after headers
        if in_header and not line.endswith(('.', '!', '?')):
            flush_paragraph()
            in_header = False

    flush_paragraph()

    # Post-processing
    markdown = '\n\n'.join(output)
    # Fix common citation patterns
    markdown = re.sub(r'\(([A-Z][a-z]+ et al\. \d{4})\)', r'[\1]', markdown)
    # Fix escaped characters
    markdown = markdown.replace('\\ud835', '').replace('\\u2020', '')
    # Remove leftover hyphens and fix spacing
    markdown = re.sub(r'\s+-\s+', '', markdown)  # Join hyphenated words
    markdown = re.sub(r'\s+([.,!?)])', r'\1', markdown)  # Fix punctuation spacing
    return markdown