feat(pdf): add PDF processing capabilities

Add a new PDF processing module with the following features:
- PDF text extraction and formatting to HTML/Markdown
- Image extraction with multiple format support (JPEG, PNG, TIFF)
- Link extraction from PDF documents
- Metadata extraction including title, author, dates
- Support for both local and remote PDF files

Also includes:
- New configuration options for HTML attribute handling
- Internal/external link filtering improvements
- Version bump to 0.4.300b4
2025-01-27 21:24:15 +08:00
parent 54c84079c4
commit f8fd9d9eff
9 changed files with 933 additions and 49 deletions
--- a/crawl4ai/processors/pdf/init.py
+++ b/crawl4ai/processors/pdf/init.py
@@ -0,0 +1,164 @@
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Dict, List, Optional
+from dataclasses import asdict
+
+from crawl4ai.async_logger import AsyncLogger
+from crawl4ai.async_crawler_strategy import AsyncCrawlerStrategy
+from crawl4ai.models import AsyncCrawlResponse, ScrapingResult 
+from crawl4ai.content_scraping_strategy import ContentScrapingStrategy
+from .processor import NaivePDFProcessorStrategy  # Assuming your current PDF code is in pdf_processor.py
+
+class PDFCrawlerStrategy(AsyncCrawlerStrategy):
+    """
+    Crawler strategy for PDF targets that performs no fetching of its own.
+
+    ``crawl`` returns a fixed placeholder response (empty HTML, PDF content
+    type, status 200); the URL is not read here.
+    """
+
+    def __init__(self, logger: AsyncLogger = None):
+        self.logger = logger
+
+    async def __aenter__(self):
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        await self.close()
+
+    async def close(self):
+        """Nothing to release; present so the strategy can be closed uniformly."""
+        return None
+
+    async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
+        """Return an empty placeholder response; ``url`` and ``kwargs`` are not used."""
+        pdf_headers = {"Content-Type": "application/pdf"}
+        placeholder = AsyncCrawlResponse(
+            html="",  # intentionally empty: the scraper does the real work
+            response_headers=pdf_headers,
+            status_code=200,
+        )
+        return placeholder
+
+class PDFContentScrapingStrategy(ContentScrapingStrategy):
+    """
+    A content scraping strategy for PDF files.
+
+    The PDF is located from the URL (downloaded to a temporary file for
+    http/https, used in place for ``file://`` URLs and plain paths), handed to
+    a ``NaivePDFProcessorStrategy`` and the per-page output is merged into a
+    single ``ScrapingResult``.
+
+    Attributes:
+        save_images_locally (bool): Whether to save images locally.
+        extract_images (bool): Whether to extract images from PDF.
+        image_save_dir (str): Directory to save extracted images.
+        logger (AsyncLogger): Logger instance for recording events and errors.
+        pdf_processor (NaivePDFProcessorStrategy): Processor configured with
+            the three image options above.
+
+    Methods:
+        scrap(url: str, html: str, **params) -> ScrapingResult:
+            Scrap content from a PDF file.
+        ascrap(url: str, html: str, **kwargs) -> ScrapingResult:
+            Asynchronous version of scrap.
+
+    Usage:
+        strategy = PDFContentScrapingStrategy(
+            save_images_locally=False,
+            extract_images=False,
+            image_save_dir=None,
+            logger=logger
+        )
+
+    """
+
+    # Seconds `requests` may wait for the connection / for the next bytes of
+    # the body; without a timeout a stalled server blocks the worker forever.
+    DOWNLOAD_TIMEOUT = 30
+
+    def __init__(self,
+                 save_images_locally=False,
+                 extract_images=False,
+                 image_save_dir=None,
+                 logger: AsyncLogger = None):
+        self.logger = logger
+        self.save_images_locally = save_images_locally
+        self.extract_images = extract_images
+        self.image_save_dir = image_save_dir
+        # Forward the caller's options; they used to be replaced by hard-coded
+        # False/False/None, which silently disabled image extraction.
+        self.pdf_processor = NaivePDFProcessorStrategy(
+            save_images_locally=save_images_locally,
+            extract_images=extract_images,
+            image_save_dir=image_save_dir
+        )
+
+    def scrap(self, url: str, html: str, **params) -> ScrapingResult:
+        """
+        Scrap content from a PDF file.
+
+        Args:
+            url (str): http(s) URL, ``file://`` URL or local path of the PDF.
+            html (str): Unused; accepted for interface compatibility.
+            **params: Additional parameters (unused).
+
+        Returns:
+            ScrapingResult: Combined page HTML, images and links (each tagged
+            with its page number) and the document metadata as a dict.
+
+        Raises:
+            RuntimeError: If a remote PDF cannot be downloaded.
+        """
+        # Download if URL or use local path
+        pdf_path = self._get_pdf_path(url)
+        try:
+            result = self.pdf_processor.process(Path(pdf_path))
+
+            # Combine page HTML, one wrapper div per page
+            pages_html = ''.join(
+                f'<div class="pdf-page" data-page="{i+1}">{page.html}</div>'
+                for i, page in enumerate(result.pages)
+            )
+            cleaned_html = f"""
+        <html>
+            <head><meta name="pdf-pages" content="{len(result.pages)}"></head>
+            <body>
+                {pages_html}
+            </body>
+        </html>
+        """
+
+            # Accumulate media and links with page numbers
+            media = {"images": []}
+            links = {"urls": []}
+
+            for page in result.pages:
+                for img in page.images:
+                    img["page"] = page.page_number
+                    media["images"].append(img)
+
+                for link in page.links:
+                    links["urls"].append({
+                        "url": link,
+                        "page": page.page_number
+                    })
+
+            return ScrapingResult(
+                cleaned_html=cleaned_html,
+                success=True,
+                media=media,
+                links=links,
+                metadata=asdict(result.metadata)
+            )
+        finally:
+            # Only remote PDFs live in a temp file that we own and must delete
+            if url.startswith(("http://", "https://")):
+                Path(pdf_path).unlink(missing_ok=True)
+
+    async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
+        """Run ``scrap`` in a worker thread so the event loop is not blocked."""
+        # Imported locally (like the other optional imports in this class):
+        # the module never imported asyncio, so this call raised NameError.
+        import asyncio
+
+        return await asyncio.to_thread(self.scrap, url, html, **kwargs)
+
+    def _get_pdf_path(self, url: str) -> str:
+        """
+        Resolve ``url`` to a path on the local filesystem.
+
+        http/https URLs are streamed into a new temporary ``.pdf`` file whose
+        path is returned (the caller deletes it); ``file://`` URLs have the
+        prefix stripped; anything else is returned unchanged.
+
+        Raises:
+            RuntimeError: If the download fails (original error is chained).
+        """
+        if url.startswith(("http://", "https://")):
+            import tempfile
+            import requests
+
+            # Create temp file with .pdf extension; delete=False because the
+            # processor re-opens it by name after this method returns.
+            temp_file = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
+
+            try:
+                # Both context managers close on exit, so neither the file
+                # handle nor the HTTP connection is leaked.
+                with temp_file, requests.get(
+                    url, stream=True, timeout=self.DOWNLOAD_TIMEOUT
+                ) as response:
+                    response.raise_for_status()
+                    for chunk in response.iter_content(chunk_size=8192):
+                        temp_file.write(chunk)
+
+                return temp_file.name
+
+            except Exception as e:
+                # Clean up temp file if download fails
+                Path(temp_file.name).unlink(missing_ok=True)
+                raise RuntimeError(f"Failed to download PDF from {url}: {str(e)}") from e
+
+        elif url.startswith("file://"):
+            return url[7:]  # Strip file:// prefix
+
+        return url  # Assume local path
+    
+
+__all__ = ["PDFCrawlerStrategy", "PDFContentScrapingStrategy"]
--- a/crawl4ai/processors/pdf/processor.py
+++ b/crawl4ai/processors/pdf/processor.py
@@ -0,0 +1,372 @@
+import logging
+import re
+from abc import ABC, abstractmethod
+from datetime import datetime
+from pathlib import Path
+from time import time
+from dataclasses import dataclass, asdict, field
+from typing import Dict, List, Optional, Tuple
+import PyPDF2
+from PIL import Image
+from PyPDF2 import PdfReader
+from .utils import *
+import base64
+import tempfile
+
+logger = logging.getLogger(__name__)
+
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import List, Optional, Dict, Any
+from pathlib import Path
+
+@dataclass
+class PDFMetadata:
+    """Document-level information about a PDF; every field has a default."""
+    title: Optional[str] = None  # '/Title' entry of the document metadata
+    author: Optional[str] = None  # '/Author' entry
+    producer: Optional[str] = None  # '/Producer' entry
+    created: Optional[datetime] = None  # parsed from '/CreationDate'; None if unparsable
+    modified: Optional[datetime] = None  # parsed from '/ModDate'; None if unparsable
+    pages: int = 0  # number of pages reported by the reader
+    encrypted: bool = False  # reader's is_encrypted flag
+    file_size: Optional[int] = None  # st_size of the file on disk
+
+@dataclass
+class PDFPage:
+    """Everything extracted from a single page of a PDF."""
+    page_number: int  # 1-based (process() assigns enumerate index + 1)
+    raw_text: str = ""  # concatenation of every text fragment seen by the text visitor
+    markdown: str = ""  # raw_text passed through clean_pdf_text
+    html: str = ""  # raw_text passed through clean_pdf_text_to_html
+    images: List[Dict] = field(default_factory=list)  # one dict per extracted image
+    links: List[str] = field(default_factory=list)  # '/URI' targets of the page's annotations
+    layout: List[Dict] = field(default_factory=list)  # {"type", "text", "x", "y"} per text fragment
+
+@dataclass
+class PDFProcessResult:
+    """Outcome of processing one PDF: metadata plus the per-page results."""
+    metadata: PDFMetadata
+    pages: List[PDFPage]
+    processing_time: float = 0.0  # wall-clock seconds, measured with time.time()
+    version: str = "1.0"  # result format version; NaivePDFProcessorStrategy.process sets "1.1"
+
+class PDFProcessorStrategy(ABC):
+    """Interface for components that turn a PDF file into a PDFProcessResult."""
+
+    @abstractmethod
+    def process(self, pdf_path: Path) -> PDFProcessResult:
+        """Process the PDF at ``pdf_path`` and return the extracted result."""
+
+class NaivePDFProcessorStrategy(PDFProcessorStrategy):
+    def __init__(self, image_dpi: int = 144, image_quality: int = 85, extract_images: bool = True, 
+                 save_images_locally: bool = False, image_save_dir: Optional[Path] = None):
+        """Store the extraction options and reset the per-run bookkeeping."""
+        # Image options supplied by the caller
+        self.extract_images = extract_images
+        self.save_images_locally = save_images_locally
+        self.image_save_dir = image_save_dir
+        self.image_dpi = image_dpi
+        self.image_quality = image_quality
+
+        # State that process() updates while it runs
+        self.current_page_number = 0
+        self._temp_dir = None
+
+    def process(self, pdf_path: Path) -> PDFProcessResult:
+        """
+        Extract metadata and per-page content from the PDF at ``pdf_path``.
+
+        Args:
+            pdf_path: Path of the PDF file to open.
+
+        Returns:
+            PDFProcessResult with version "1.1", the document metadata, one
+            PDFPage per page and the elapsed processing time in seconds.
+
+        Raises:
+            Exception: Any error raised while opening or reading the PDF is
+                logged and re-raised unchanged.
+        """
+        start_time = time()
+        result = PDFProcessResult(
+            metadata=PDFMetadata(),
+            pages=[],
+            version="1.1"
+        )
+        # Forget the directory of a previous run: it has already been removed,
+        # and leaving it set made a later failing run try to delete it again.
+        self._temp_dir = None
+
+        try:
+            with pdf_path.open('rb') as file:
+                reader = PdfReader(file)
+                result.metadata = self._extract_metadata(pdf_path, reader)
+
+                # Handle image directory
+                image_dir = None
+                if self.extract_images and self.save_images_locally:
+                    if self.image_save_dir:
+                        image_dir = Path(self.image_save_dir)
+                        image_dir.mkdir(exist_ok=True, parents=True)
+                    else:
+                        self._temp_dir = tempfile.mkdtemp(prefix='pdf_images_')
+                        image_dir = Path(self._temp_dir)
+
+                for page_num, page in enumerate(reader.pages):
+                    self.current_page_number = page_num + 1
+                    pdf_page = self._process_page(page, image_dir, reader)
+                    result.pages.append(pdf_page)
+
+        except Exception as e:
+            logger.error(f"Failed to process PDF: {str(e)}")
+            raise
+        finally:
+            # Cleanup temp directory if it was created
+            # NOTE(review): images saved into this directory are deleted here,
+            # so the "path" entries returned for them no longer exist once this
+            # method returns - confirm whether callers should pass
+            # image_save_dir whenever save_images_locally is set.
+            if self._temp_dir and not self.image_save_dir:
+                import shutil
+                try:
+                    shutil.rmtree(self._temp_dir)
+                except Exception as e:
+                    logger.error(f"Failed to cleanup temp directory: {str(e)}")
+                finally:
+                    self._temp_dir = None
+
+        result.processing_time = time() - start_time
+        return result
+
+    def _process_page(self, page, image_dir: Optional[Path], reader) -> PDFPage:
+        """Build the PDFPage for ``page``: text, layout, images, links, markdown and HTML."""
+        page_no = self.current_page_number
+        pdf_page = PDFPage(page_number=page_no)
+
+        def collect_fragment(text, cm, tm, font_dict, font_size):
+            # Called by extract_text for each text fragment; tm[4] / tm[5] are
+            # stored as the fragment's x / y position.
+            pdf_page.raw_text += text
+            fragment = {"type": "text", "text": text, "x": tm[4], "y": tm[5]}
+            pdf_page.layout.append(fragment)
+
+        page.extract_text(visitor_text=collect_fragment)
+
+        # Images are optional; links are always collected
+        if self.extract_images:
+            pdf_page.images = self._extract_images(page, image_dir)
+        pdf_page.links = self._extract_links(page)
+
+        # Derived renderings of the accumulated raw text
+        raw = pdf_page.raw_text
+        pdf_page.markdown = clean_pdf_text(page_no, raw)
+        pdf_page.html = clean_pdf_text_to_html(page_no, raw)
+
+        return pdf_page
+
+    def _extract_images(self, page, image_dir: Optional[Path]) -> List[Dict]:
+        """
+        Collect the image XObjects referenced by ``page``.
+
+        Each image becomes a dict with "format", "width", "height",
+        "color_space" and "bits_per_component", plus either "path" (when
+        ``save_images_locally`` is set) or "data" (base64 text). The encoding
+        is chosen from the stream's '/Filter': FlateDecode -> PNG,
+        DCTDecode -> JPEG, CCITTFaxDecode -> TIFF, JPXDecode -> JPEG 2000.
+        Anything else, or a branch that failed, falls back to the raw bytes
+        with format "bin".
+
+        Errors are logged and never raised; the images gathered so far are
+        returned.
+
+        Args:
+            page: Page object whose '/Resources' -> '/XObject' entries are scanned.
+            image_dir: Directory files are written to when saving locally.
+
+        Returns:
+            List of image dicts (empty when extraction is disabled).
+        """
+        if not self.extract_images:
+            return []
+
+        images = []
+        try:
+            resources = page.get("/Resources")
+            if resources:  # Check if resources exist
+                resources = resources.get_object()  # Resolve IndirectObject
+                if '/XObject' in resources:
+                    xobjects = resources['/XObject'].get_object()
+                    img_count = 0
+                    for obj_name in xobjects:
+                        xobj = xobjects[obj_name]
+                        # Entries without a callable get_object are skipped entirely
+                        if hasattr(xobj, 'get_object') and callable(xobj.get_object):
+                            xobj = xobj.get_object()
+                            if xobj.get('/Subtype') == '/Image':
+                                try:
+                                    # Counted before decoding, so a failed image still consumes a number
+                                    img_count += 1
+                                    img_filename = f"page_{self.current_page_number}_img_{img_count}"
+                                    data = xobj.get_data()
+                                    # '/Filter' may be a single name or a list; normalize to a list
+                                    filters = xobj.get('/Filter', [])
+                                    if not isinstance(filters, list):
+                                        filters = [filters]
+
+                                    # Resolve IndirectObjects in properties
+                                    width = xobj.get('/Width', 0)
+                                    height = xobj.get('/Height', 0)
+                                    color_space = xobj.get('/ColorSpace', '/DeviceRGB')
+                                    if isinstance(color_space, PyPDF2.generic.IndirectObject):
+                                        color_space = color_space.get_object()
+
+                                    # Handle different image encodings
+                                    success = False
+                                    image_format = 'bin'
+                                    image_data = None
+                                    
+                                    if '/FlateDecode' in filters:
+                                        try:
+                                            # NOTE(review): .get() below assumes a dict-like
+                                            # '/DecodeParms'; any other shape raises and lands
+                                            # in the raw-data fallback - confirm intended.
+                                            decode_parms = xobj.get('/DecodeParms', {})
+                                            if isinstance(decode_parms, PyPDF2.generic.IndirectObject):
+                                                decode_parms = decode_parms.get_object()
+                                            
+                                            predictor = decode_parms.get('/Predictor', 1)
+                                            bits = xobj.get('/BitsPerComponent', 8)
+                                            # Only '/DeviceRGB' is treated as 3-channel; everything else as 1
+                                            colors = 3 if color_space == '/DeviceRGB' else 1
+
+                                            # Predictor values of 10 and above are handled as PNG predictors
+                                            if predictor >= 10:
+                                                data = apply_png_predictor(data, width, bits, colors)
+
+                                            # Create PIL Image
+                                            mode = 'RGB' if color_space == '/DeviceRGB' else 'L'
+                                            img = Image.frombytes(mode, (width, height), data)
+                                            
+                                            if self.save_images_locally:
+                                                final_path = (image_dir / img_filename).with_suffix('.png')
+                                                img.save(final_path)
+                                                image_data = str(final_path)
+                                            else:
+                                                import io
+                                                img_byte_arr = io.BytesIO()
+                                                img.save(img_byte_arr, format='PNG')
+                                                image_data = base64.b64encode(img_byte_arr.getvalue()).decode('utf-8')
+                                            
+                                            success = True
+                                            image_format = 'png'
+                                        except Exception as e:
+                                            logger.error(f"FlateDecode error: {str(e)}")
+
+                                    elif '/DCTDecode' in filters:
+                                        # JPEG image
+                                        try:
+                                            if self.save_images_locally:
+                                                final_path = (image_dir / img_filename).with_suffix('.jpg')
+                                                with open(final_path, 'wb') as f:
+                                                    f.write(data)
+                                                image_data = str(final_path)
+                                            else:
+                                                image_data = base64.b64encode(data).decode('utf-8')
+                                            success = True
+                                            image_format = 'jpeg'
+                                        except Exception as e:
+                                            logger.error(f"JPEG save error: {str(e)}")
+
+                                    elif '/CCITTFaxDecode' in filters:
+                                        try:
+                                            # b'II*\x00' is the little-endian TIFF signature
+                                            if data[:4] != b'II*\x00':
+                                                # Add TIFF header if missing
+                                                tiff_header = b'II*\x00\x08\x00\x00\x00\x0e\x00\x00\x01\x03\x00\x01\x00\x00\x00' + \
+                                                            width.to_bytes(4, 'little') + \
+                                                            b'\x01\x03\x00\x01\x00\x00\x00' + \
+                                                            height.to_bytes(4, 'little') + \
+                                                            b'\x01\x12\x00\x03\x00\x00\x00\x01\x00\x01\x00\x00\x01\x17\x00\x04\x00\x00\x00\x01\x00\x00\x00J\x01\x1B\x00\x05\x00\x00\x00\x01\x00\x00\x00R\x01\x28\x00\x03\x00\x00\x00\x01\x00\x02\x00\x00'
+                                                data = tiff_header + data
+                                            
+                                            if self.save_images_locally:
+                                                final_path = (image_dir / img_filename).with_suffix('.tiff')
+                                                with open(final_path, 'wb') as f:
+                                                    f.write(data)
+                                                image_data = str(final_path)
+                                            else:
+                                                image_data = base64.b64encode(data).decode('utf-8')
+                                            success = True
+                                            image_format = 'tiff'
+                                        except Exception as e:
+                                            logger.error(f"CCITT save error: {str(e)}")
+
+                                    elif '/JPXDecode' in filters:
+                                        # JPEG 2000
+                                        try:
+                                            if self.save_images_locally:
+                                                final_path = (image_dir / img_filename).with_suffix('.jp2')
+                                                with open(final_path, 'wb') as f:
+                                                    f.write(data)
+                                                image_data = str(final_path)
+                                            else:
+                                                image_data = base64.b64encode(data).decode('utf-8')
+                                            success = True
+                                            image_format = 'jpeg2000'
+                                        except Exception as e:
+                                            logger.error(f"JPEG2000 save error: {str(e)}")
+
+                                    if success and image_data:
+                                        # NOTE(review): the default here is 1, while the
+                                        # FlateDecode branch defaults '/BitsPerComponent' to 8.
+                                        image_info = {
+                                            "format": image_format,
+                                            "width": width,
+                                            "height": height,
+                                            "color_space": str(color_space),
+                                            "bits_per_component": xobj.get('/BitsPerComponent', 1)
+                                        }
+                                        
+                                        if self.save_images_locally:
+                                            image_info["path"] = image_data
+                                        else:
+                                            image_info["data"] = image_data
+                                            
+                                        images.append(image_info)
+                                    else:
+                                        # Fallback: Save raw data
+                                        # NOTE(review): when saving locally the .bin file is
+                                        # written but no entry is appended to `images`, unlike
+                                        # the in-memory case below.
+                                        if self.save_images_locally:
+                                            final_path = (image_dir / img_filename).with_suffix('.bin')
+                                            with open(final_path, 'wb') as f:
+                                                f.write(data)
+                                            logger.warning(f"Saved raw image data to {final_path}")
+                                        else:
+                                            image_data = base64.b64encode(data).decode('utf-8')
+                                            images.append({
+                                                "format": "bin",
+                                                "width": width,
+                                                "height": height,
+                                                "color_space": str(color_space),
+                                                "bits_per_component": xobj.get('/BitsPerComponent', 1),
+                                                "data": image_data
+                                            })
+
+                                except Exception as e:
+                                    # One broken image must not stop the remaining ones
+                                    logger.error(f"Error processing image: {str(e)}")
+        except Exception as e:
+            logger.error(f"Image extraction error: {str(e)}")
+        
+        return images
+
+    def _extract_links(self, page) -> List[str]:
+        """
+        Return the '/URI' target of every annotation on ``page`` that has one.
+
+        Failures are logged through the module logger (not printed) and never
+        raised. A single unreadable annotation is skipped without losing the
+        links of the annotations that follow it.
+        """
+        links = []
+        if '/Annots' not in page:
+            return links
+
+        try:
+            for annot in page['/Annots']:
+                try:
+                    a = annot.get_object()
+                    if '/A' in a and '/URI' in a['/A']:
+                        links.append(a['/A']['/URI'])
+                except Exception as e:
+                    logger.error(f"Link error: {str(e)}")
+        except Exception as e:
+            # Reading or iterating the annotation array itself failed
+            logger.error(f"Link error: {str(e)}")
+        return links
+
+    def _extract_metadata(self, pdf_path: Path, reader: PdfReader = None) -> PDFMetadata:
+        """
+        Build the PDFMetadata for ``pdf_path``.
+
+        A PdfReader is created from ``pdf_path`` when none is supplied. Missing
+        metadata is treated as an empty mapping, so absent entries become None.
+        """
+        pdf_reader = reader if reader else PdfReader(pdf_path)
+        info = pdf_reader.metadata or {}
+
+        creation_date = self._parse_pdf_date(info.get('/CreationDate', ''))
+        modification_date = self._parse_pdf_date(info.get('/ModDate', ''))
+
+        return PDFMetadata(
+            title=info.get('/Title'),
+            author=info.get('/Author'),
+            producer=info.get('/Producer'),
+            created=creation_date,
+            modified=modification_date,
+            pages=len(pdf_reader.pages),
+            encrypted=pdf_reader.is_encrypted,
+            file_size=pdf_path.stat().st_size,
+        )
+
+    def _parse_pdf_date(self, date_str: str) -> Optional[datetime]:
+        """
+        Parse a PDF date string (``D:YYYYMMDDHHmmSS...``) into a naive datetime.
+
+        Only the year is required; omitted trailing fields default to month 1,
+        day 1 and 00:00:00. Any time-zone suffix is ignored.
+
+        Returns:
+            The parsed datetime, or None when ``date_str`` is not a string,
+            does not start with a PDF date, or names an impossible date.
+        """
+        try:
+            match = re.match(
+                r'D:(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?', date_str
+            )
+            if not match:
+                return None
+
+            year, month, day, hour, minute, second = match.groups()
+            return datetime(
+                year=int(year),
+                month=int(month or 1),
+                day=int(day or 1),
+                hour=int(hour or 0),
+                minute=int(minute or 0),
+                second=int(second or 0)
+            )
+        except (TypeError, ValueError):
+            # TypeError: date_str is not a string (e.g. None).
+            # ValueError: out-of-range component such as month 13.
+            return None
+
+# Usage example: processes test.pdf next to this file and writes test.html /
+# test.md beside it. The module uses a relative import, so it has to be run as
+# part of its package (python -m ...), not as a standalone script.
+if __name__ == "__main__":
+    import json
+    from pathlib import Path
+    current_dir = Path(__file__).resolve().parent
+    pdf_path = f'{current_dir}/test.pdf'
+
+    strategy = NaivePDFProcessorStrategy()
+    result = strategy.process(Path(pdf_path))
+
+    # Convert to JSON (default=str renders datetimes and other non-JSON values)
+    json_output = asdict(result)
+    print(json.dumps(json_output, indent=2, default=str))
+
+    # result.pages holds PDFPage dataclass instances, so fields are read as
+    # attributes; subscripting them raised TypeError.
+    with open(f'{current_dir}/test.html', 'w', encoding='utf-8') as f:
+        for page in result.pages:
+            f.write(f'<h1>Page {page.page_number}</h1>')
+            f.write(page.html)
+    with open(f'{current_dir}/test.md', 'w', encoding='utf-8') as f:
+        for page in result.pages:
+            f.write(f'# Page {page.page_number}\n\n')
+            f.write(clean_pdf_text(page.page_number, page.raw_text))
+            f.write('\n\n')
--- a/crawl4ai/processors/pdf/utils.py
+++ b/crawl4ai/processors/pdf/utils.py
@@ -0,0 +1,350 @@
+import re
+
+def apply_png_predictor(data, width, bits, color_channels):
+    """
+    Decode PNG predictor (PDF 1.5+ filter).
+
+    ``data`` is a sequence of scanlines, each one filter-type byte followed by
+    the filtered row. The filter (0 None, 1 Sub, 2 Up, 3 Average, 4 Paeth) is
+    undone row by row and the rows are returned concatenated, without the
+    filter bytes.
+
+    Args:
+        data: Filtered image bytes.
+        width: Image width in pixels.
+        bits: Bits per colour component.
+        color_channels: Number of colour components per pixel.
+
+    Returns:
+        bytes: The unfiltered rows.
+
+    Raises:
+        ValueError: If ``data`` is not a whole number of scanlines or a row
+            uses an unknown filter type.
+    """
+    bits_per_pixel = bits * color_channels
+    # The filters work on whole bytes: the "previous pixel" distance is the
+    # pixel size rounded up, and never less than one byte.
+    bytes_per_pixel = max(1, (bits_per_pixel + 7) // 8)
+    # Rows are packed bit-wise and padded to a byte boundary. The old
+    # width * bytes_per_pixel over-counted rows of sub-byte pixels (e.g. 1-bit
+    # images), so valid data was rejected.
+    stride = (width * bits_per_pixel + 7) // 8
+    scanline_length = stride + 1  # +1 for filter byte
+
+    if len(data) % scanline_length != 0:
+        raise ValueError("Invalid scanline structure")
+
+    output = bytearray()
+    prev_line = bytes(stride)  # the row above the first row is all zeros
+
+    for start in range(0, len(data), scanline_length):
+        filter_type = data[start]
+        decoded = bytearray(data[start + 1:start + scanline_length])
+
+        if filter_type == 0:  # None
+            pass
+        elif filter_type == 1:  # Sub
+            for j in range(bytes_per_pixel, stride):
+                decoded[j] = (decoded[j] + decoded[j - bytes_per_pixel]) % 256
+        elif filter_type == 2:  # Up
+            for j in range(stride):
+                decoded[j] = (decoded[j] + prev_line[j]) % 256
+        elif filter_type == 3:  # Average
+            for j in range(stride):
+                left = decoded[j - bytes_per_pixel] if j >= bytes_per_pixel else 0
+                up = prev_line[j]
+                decoded[j] = (decoded[j] + (left + up) // 2) % 256
+        elif filter_type == 4:  # Paeth
+            for j in range(stride):
+                left = decoded[j - bytes_per_pixel] if j >= bytes_per_pixel else 0
+                up = prev_line[j]
+                up_left = prev_line[j - bytes_per_pixel] if j >= bytes_per_pixel else 0
+                decoded[j] = (decoded[j] + paeth_predictor(left, up, up_left)) % 256
+        else:
+            raise ValueError(f"Unsupported filter type: {filter_type}")
+
+        output.extend(decoded)
+        prev_line = decoded
+
+    return bytes(output)
+
+def paeth_predictor(a, b, c):
+    """
+    Return whichever of ``a``, ``b``, ``c`` is closest to ``a + b - c``.
+
+    Ties are resolved in the order a, then b, then c.
+    """
+    estimate = a + b - c
+    dist_a = abs(estimate - a)
+    dist_b = abs(estimate - b)
+    dist_c = abs(estimate - c)
+
+    if dist_a <= min(dist_b, dist_c):
+        return a
+    return b if dist_b <= dist_c else c
+
+import re
+import html
+
+def clean_pdf_text_to_html(page_number, text):
+    """
+    Convert the raw text of one PDF page into simple HTML using line heuristics.
+
+    Lines are classified in order as: article title (first line, 3-8 words),
+    numbered header after a blank line, author list (page 1 only),
+    affiliation (starts with a dagger), e-mail group (starts with "{...}"),
+    section header, quotation, or ordinary paragraph text. Consecutive
+    paragraph lines are joined; a trailing hyphen on a line is dropped.
+
+    All text placed in the output is HTML-escaped, and blocks are emitted in
+    the order they occur in ``text``.
+
+    Args:
+        page_number: Page number of the text; authors are only detected on page 1.
+        text: Raw text extracted from the page.
+
+    Returns:
+        str: HTML fragments joined by newlines.
+    """
+    # Decode Unicode escapes and handle surrogate pairs
+    try:
+        decoded = text.encode('latin-1').decode('unicode-escape')
+        decoded = decoded.encode('utf-16', 'surrogatepass').decode('utf-16')
+    except Exception:
+        decoded = text  # Fallback if decoding fails
+
+    lines = decoded.split('\n')
+    output = []
+    current_paragraph = []
+    in_header = False
+    email_pattern = re.compile(r'\{.*?\}')
+    affiliation_pattern = re.compile(r'^†')
+    quote_pattern = re.compile(r'^["“]')
+    author_pattern = re.compile(
+        r'^\s*[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?'
+        r'(?:,\s*[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?)*'
+        r'(?:,\s*(?:and|&)\s+[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?)?\s*$'
+    )
+
+    def flush_paragraph():
+        # Emit the pending paragraph lines as one block and start a new one
+        if current_paragraph:
+            para = ' '.join(current_paragraph)
+            para = re.sub(r'\s+', ' ', para).strip()
+            if para:
+                # PDF text is untrusted input: escape it like every other
+                # branch does, otherwise "<", ">" and "&" corrupt the markup.
+                escaped_para = html.escape(para)
+                output.append(f'<div class="paragraph"><p>{escaped_para}</p></div><hr/>')
+            current_paragraph.clear()
+
+    for i, line in enumerate(lines):
+        line = line.strip()
+
+        # Handle empty lines
+        if not line:
+            flush_paragraph()
+            continue
+
+        # Detect article title (first line with reasonable length)
+        if i == 0 and 3 <= len(line.split()) <= 8 and len(lines) > 1:
+            flush_paragraph()
+            escaped_line = html.escape(line)
+            output.append(f'<h2>{escaped_line}</h2>')
+            continue
+
+        # Detect numbered headers like "2.1 Background"
+        numbered_header = re.match(r'^(\d+(?:\.\d+)*)\s+(.+)$', line)
+        if i > 0 and not lines[i-1].strip() and numbered_header:
+            flush_paragraph()
+            level = numbered_header.group(1).count('.') + 1
+            header_text = numbered_header.group(2)
+            md_level = min(level + 1, 6)
+            escaped_header = html.escape(header_text)
+            output.append(f'<h{md_level}>{escaped_header}</h{md_level}>')
+            in_header = True
+            continue
+
+        # Detect authors
+        if page_number == 1 and author_pattern.match(line):
+            # Flush first so the preceding paragraph stays ahead of this block
+            flush_paragraph()
+            authors = re.sub(r'[†â€]', '', line)
+            authors = re.split(r', | and ', authors)
+            formatted_authors = []
+            for author in authors:
+                if author.strip():
+                    parts = [p for p in author.strip().split() if p]
+                    formatted = ' '.join(parts)
+                    escaped_author = html.escape(formatted)
+                    formatted_authors.append(f'<strong>{escaped_author}</strong>')
+
+            if len(formatted_authors) > 1:
+                joined = ', '.join(formatted_authors[:-1]) + ' and ' + formatted_authors[-1]
+            else:
+                joined = formatted_authors[0]
+
+            output.append(f'<p>{joined}</p>')
+            continue
+
+        # Detect affiliation
+        if affiliation_pattern.match(line):
+            flush_paragraph()
+            escaped_line = html.escape(line)
+            output.append(f'<p><em>{escaped_line}</em></p>')
+            continue
+
+        # Detect emails
+        if email_pattern.match(line):
+            flush_paragraph()
+            escaped_line = html.escape(line)
+            output.append(f'<p><code>{escaped_line}</code></p>')
+            continue
+
+        # Detect section headers
+        if re.match(r'^(Abstract|\d+\s+[A-Z]|References|Appendix|Figure|Table)', line):
+            flush_paragraph()
+            escaped_line = html.escape(line)
+            output.append(f'<h2 class="section-header"><em>{escaped_line}</em></h2>')
+            in_header = True
+            continue
+
+        # Handle quotes
+        if quote_pattern.match(line):
+            flush_paragraph()
+            escaped_line = html.escape(line)
+            output.append(f'<blockquote><p>{escaped_line}</p></blockquote>')
+            continue
+
+        # Handle hyphenated words
+        if line.endswith('-'):
+            current_paragraph.append(line[:-1].strip())
+        else:
+            current_paragraph.append(line)
+
+        # Handle paragraph breaks after headers
+        if in_header and not line.endswith(('.', '!', '?')):
+            flush_paragraph()
+            in_header = False
+
+    flush_paragraph()
+
+    # Post-process HTML
+    html_output = '\n'.join(output)
+
+    # Fix common citation patterns
+    html_output = re.sub(r'\(([A-Z][a-z]+ et al\. \d{4})\)', r'<cite>\1</cite>', html_output)
+
+    # Fix escaped characters
+    html_output = html_output.replace('\\ud835', '').replace('\\u2020', '†')
+
+    # Remove leftover hyphens and fix spacing
+    html_output = re.sub(r'\s+-\s+', '', html_output)
+    html_output = re.sub(r'\s+([.,!?)])', r'\1', html_output)
+
+    return html_output
+
def _md_join_fragments(fragments):
    """Join wrapped text lines into one whitespace-normalised paragraph.

    A trailing hyphen is always dropped. When the hyphen directly follows a
    letter and the next fragment starts with a lowercase letter, the two
    pieces are glued together without a space ("exam-" + "ple" -> "example");
    otherwise the fragments are joined with a single space.
    """
    merged = []
    glue_next = False
    for fragment in fragments:
        has_hyphen = fragment.endswith('-')
        body = fragment[:-1].strip() if has_hyphen else fragment
        if glue_next and merged and body[:1].islower():
            merged[-1] += body
        else:
            merged.append(body)
        # Only a hyphen that directly follows a letter marks a split word.
        glue_next = has_hyphen and fragment[-2:-1].isalpha()
    return re.sub(r'\s+', ' ', ' '.join(merged)).strip()


def _md_format_authors(line):
    """Render an author line as bold Markdown names joined by ', ' and ' and '.

    Returns an empty string when no name survives the clean-up.
    """
    # Drop the dagger affiliation marker; 'â' and '€' are presumably the
    # remnants of a mis-decoded '†' (NOTE(review): confirm against real input).
    cleaned = re.sub(r'[†â€]', '', line)
    # Split on "," (optionally followed by "and"/"&") or on a bare "and"/"&",
    # so "A, B, and C" does not yield an author called "and C".
    pieces = re.split(r'\s*,\s*(?:(?:and|&)\s+)?|\s+(?:and|&)\s+', cleaned)
    names = [f"**{' '.join(piece.split())}**" for piece in pieces if piece.strip()]
    if not names:
        return ''
    if len(names) == 1:
        return names[0]
    return ', '.join(names[:-1]) + ' and ' + names[-1]


def clean_pdf_text(page_number: int, text: str) -> str:
    """Convert raw text extracted from one PDF page into Markdown.

    The text is processed line by line with a set of heuristics: a short first
    line becomes a ``##`` headline, numbered headers ("2.1 Background") become
    Markdown headings, author lines (page 1 only) are bolded, affiliation
    lines starting with a dagger are italicised, lines starting with "{" are
    rendered as inline code, section keywords and quotes get their own markup,
    and every other line is accumulated into paragraphs.

    Args:
        page_number: Page number of the text; author detection only runs
            when this equals 1.
        text: Raw text of the page.

    Returns:
        The Markdown rendering, with blocks separated by blank lines. Empty
        input yields an empty string.
    """
    if not text:
        return ''

    # Best-effort: turn literal backslash escapes into characters and merge
    # surrogate pairs. Any failure (e.g. non-latin-1 input) keeps the text as is.
    try:
        decoded = text.encode('latin-1').decode('unicode-escape')
        decoded = decoded.encode('utf-16', 'surrogatepass').decode('utf-16')
    except Exception:
        decoded = text

    # A line break right after a full stop is treated as a paragraph break.
    decoded = re.sub(r'\.\n', '.\n\n', decoded)
    lines = decoded.split('\n')
    output = []
    current_paragraph = []
    in_header = False

    email_pattern = re.compile(r'\{.*?\}')
    affiliation_pattern = re.compile(r'^†')
    quote_pattern = re.compile(r'^["“]')
    numbered_header_pattern = re.compile(r'^(\d+(?:\.\d+)*)\s+(.+)$')
    section_pattern = re.compile(
        r'^(Abstract|\d+\s+[A-Z]|References|Appendix|Figure|Table)'
    )
    author_pattern = re.compile(
        r'^\s*[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?'
        r'(?:,\s*[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?)*'
        r'(?:,\s*(?:and|&)\s+[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?)?\s*$'
    )

    def flush_paragraph():
        """Emit the accumulated paragraph lines (if any) as one block."""
        if current_paragraph:
            para = _md_join_fragments(current_paragraph)
            if para:
                output.append(para)
            current_paragraph.clear()

    for i, raw_line in enumerate(lines):
        line = raw_line.strip()

        # A blank line ends the current paragraph.
        if not line:
            flush_paragraph()
            continue

        # Headline: the very first line, 3-8 words, with more text after it.
        if i == 0 and 3 <= len(line.split()) <= 8 and len(lines) > 1:
            output.append(f'## {line}')
            continue

        # Numbered headers like "2.1 Background", only right after a blank
        # line. The first line counts as "after a blank line" so that i - 1
        # never wraps around to the last line of the page.
        numbered_header = numbered_header_pattern.match(line)
        if numbered_header and (i == 0 or not lines[i - 1].strip()):
            flush_paragraph()
            level = numbered_header.group(1).count('.') + 1  # "2.1" -> level 2
            md_level = min(level + 1, 6)  # 1 -> ##, 2 -> ###, capped at ######
            output.append(f'{"#" * md_level} {numbered_header.group(2)}')
            in_header = True
            continue

        # Author line (first page only).
        if page_number == 1 and author_pattern.match(line):
            flush_paragraph()  # keep preceding text ahead of the author line
            joined = _md_format_authors(line)
            if joined:
                output.append(joined)
            continue

        # Affiliation line (starts with a dagger).
        if affiliation_pattern.match(line):
            flush_paragraph()
            output.append(f'*{line}*')
            continue

        # Email group such as "{a,b}@example.org".
        if email_pattern.match(line):
            flush_paragraph()
            output.append(f'`{line}`')
            continue

        # Section keywords and unnumbered-style headers.
        if section_pattern.match(line):
            flush_paragraph()
            output.append(f'_[{line}]_')
            in_header = True
            continue

        # Quotes.
        if quote_pattern.match(line):
            flush_paragraph()
            output.append(f'> {line}')
            continue

        # Ordinary text; trailing hyphens are resolved when the paragraph
        # is flushed.
        current_paragraph.append(line)

        # A line following a header that does not end a sentence is emitted
        # as its own block.
        if in_header and not line.endswith(('.', '!', '?')):
            flush_paragraph()
            in_header = False

    flush_paragraph()

    markdown = '\n\n'.join(output)

    # "(Smith et al. 2020)" -> "[Smith et al. 2020]"
    markdown = re.sub(r'\(([A-Z][a-z]+ et al\. \d{4})\)', r'[\1]', markdown)

    # Clean up escape sequences that survived decoding as literal text.
    markdown = markdown.replace('\\ud835', '').replace('\\u2020', '†')

    # Remove leftover spaced hyphens and whitespace before punctuation.
    # Only horizontal whitespace is matched so the blank lines separating
    # blocks are never swallowed.
    markdown = re.sub(r'[^\S\n]+-[^\S\n]+', '', markdown)
    markdown = re.sub(r'[^\S\n]+([.,!?)])', r'\1', markdown)

    return markdown