feat(pdf): add PDF processing capabilities

Add new PDF processing module with the following features:
- PDF text extraction and formatting to HTML/Markdown
- Image extraction with multiple format support (JPEG, PNG, TIFF)
- Link extraction from PDF documents
- Metadata extraction including title, author, dates
- Support for both local and remote PDF files

Also includes:
- New configuration options for HTML attribute handling
- Internal/external link filtering improvements
- Version bump to 0.4.300b4
Author: UncleCode
Date: 2025-01-27 21:24:15 +08:00
Parent: 54c84079c4
Commit: f8fd9d9eff
9 changed files with 933 additions and 49 deletions
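For context, a minimal usage sketch of the PDF pipeline this commit introduces. The import path for the new module is an assumption (it is not visible in this view), and the URL is illustrative:

import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
# Hypothetical import path for the two classes added below.
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy

async def main():
    config = CrawlerRunConfig(scraping_strategy=PDFContentScrapingStrategy())
    async with AsyncWebCrawler(crawler_strategy=PDFCrawlerStrategy()) as crawler:
        result = await crawler.arun("https://example.com/paper.pdf", config=config)
        print(result.metadata)   # title, author, dates, page count
        print(result.markdown)   # per-page Markdown rendering

asyncio.run(main())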

View File

@@ -1,2 +1,3 @@
# crawl4ai/_version.py
__version__ = "0.4.3b3"
# __version__ = "0.4.3b3"
__version__ = "0.4.300b4"

View File

@@ -271,6 +271,8 @@ class CrawlerRunConfig:
Default: None.
keep_data_attributes (bool): If True, retain `data-*` attributes while removing unwanted attributes.
Default: False.
keep_attrs (list of str): List of HTML attributes to keep during processing.
Default: [].
remove_forms (bool): If True, remove all `<form>` elements from the HTML.
Default: False.
prettiify (bool): If True, apply `fast_format_html` to produce prettified HTML output.
@@ -282,6 +284,8 @@ class CrawlerRunConfig:
proxy_config (dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
If None, no additional proxy config. Default: None.
# SSL Parameters
fetch_ssl_certificate: bool = False,
# Caching Parameters
cache_mode (CacheMode or None): Defines how caching is handled.
If None, defaults to CacheMode.ENABLED internally.
@@ -363,10 +367,14 @@ class CrawlerRunConfig:
Default: SOCIAL_MEDIA_DOMAINS (from config).
exclude_external_links (bool): If True, exclude all external links from the results.
Default: False.
exclude_social_media_links (bool): If True, exclude links pointing to social media domains.
Default: False.
exclude_domains (list of str): List of specific domains to exclude from results.
Default: [].
exclude_internal_links (bool): If True, exclude internal links from the results.
Default: False.
# Debugging and Logging Parameters
verbose (bool): Enable verbose logging.
@@ -402,6 +410,7 @@ class CrawlerRunConfig:
excluded_tags: list = None,
excluded_selector: str = None,
keep_data_attributes: bool = False,
keep_attrs: list = None,
remove_forms: bool = False,
prettiify: bool = False,
parser_type: str = "lxml",
@@ -451,6 +460,7 @@ class CrawlerRunConfig:
exclude_external_links: bool = False,
exclude_social_media_links: bool = False,
exclude_domains: list = None,
exclude_internal_links: bool = False,
# Debugging and Logging Parameters
verbose: bool = True,
log_console: bool = False,
@@ -475,6 +485,7 @@ class CrawlerRunConfig:
self.excluded_tags = excluded_tags or []
self.excluded_selector = excluded_selector or ""
self.keep_data_attributes = keep_data_attributes
self.keep_attrs = keep_attrs or []
self.remove_forms = remove_forms
self.prettiify = prettiify
self.parser_type = parser_type
@@ -532,6 +543,7 @@ class CrawlerRunConfig:
self.exclude_external_links = exclude_external_links
self.exclude_social_media_links = exclude_social_media_links
self.exclude_domains = exclude_domains or []
self.exclude_internal_links = exclude_internal_links
# Debugging and Logging Parameters
self.verbose = verbose
@@ -580,6 +592,7 @@ class CrawlerRunConfig:
excluded_tags=kwargs.get("excluded_tags", []),
excluded_selector=kwargs.get("excluded_selector", ""),
keep_data_attributes=kwargs.get("keep_data_attributes", False),
keep_attrs=kwargs.get("keep_attrs", []),
remove_forms=kwargs.get("remove_forms", False),
prettiify=kwargs.get("prettiify", False),
parser_type=kwargs.get("parser_type", "lxml"),
@@ -638,6 +651,7 @@ class CrawlerRunConfig:
exclude_external_links=kwargs.get("exclude_external_links", False),
exclude_social_media_links=kwargs.get("exclude_social_media_links", False),
exclude_domains=kwargs.get("exclude_domains", []),
exclude_internal_links=kwargs.get("exclude_internal_links", False),
# Debugging and Logging Parameters
verbose=kwargs.get("verbose", True),
log_console=kwargs.get("log_console", False),
@@ -663,6 +677,7 @@ class CrawlerRunConfig:
"excluded_tags": self.excluded_tags,
"excluded_selector": self.excluded_selector,
"keep_data_attributes": self.keep_data_attributes,
"keep_attrs": self.keep_attrs,
"remove_forms": self.remove_forms,
"prettiify": self.prettiify,
"parser_type": self.parser_type,
@@ -706,6 +721,7 @@ class CrawlerRunConfig:
"exclude_external_links": self.exclude_external_links,
"exclude_social_media_links": self.exclude_social_media_links,
"exclude_domains": self.exclude_domains,
"exclude_internal_links": self.exclude_internal_links,
"verbose": self.verbose,
"log_console": self.log_console,
"stream": self.stream,

View File

@@ -319,14 +319,6 @@ class AsyncWebCrawler:
try:
# Handle configuration
if crawler_config is not None:
# if any(param is not None for param in [
# word_count_threshold, extraction_strategy, chunking_strategy,
# content_filter, cache_mode, css_selector, screenshot, pdf
# ]):
# self.logger.warning(
# message="Both crawler_config and legacy parameters provided. crawler_config will take precedence.",
# tag="WARNING"
# )
config = crawler_config
else:
# Merge all parameters into a single kwargs dict for config creation
@@ -350,14 +342,6 @@ class AsyncWebCrawler:
# Handle deprecated cache parameters
if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]):
if kwargs.get("warning", True):
warnings.warn(
"Cache control boolean flags are deprecated and will be removed in version 0.5.0. "
"Use 'cache_mode' parameter instead.",
DeprecationWarning,
stacklevel=2,
)
# Convert legacy parameters if cache_mode not provided
if config.cache_mode is None:
config.cache_mode = _legacy_to_cache_mode(
@@ -430,7 +414,9 @@ class AsyncWebCrawler:
response_headers={"X-Robots-Status": "Blocked by robots.txt"}
)
# Pass config to crawl method
##############################
# Call CrawlerStrategy.crawl #
##############################
async_response = await self.crawler_strategy.crawl(
url,
config=config, # Pass the entire config object
@@ -448,7 +434,9 @@ class AsyncWebCrawler:
tag="FETCH",
)
# Process the HTML content
###############################################################
# Process the HTML content, Call CrawlerStrategy.process_html #
###############################################################
crawl_result : CrawlResult = await self.aprocess_html(
url=url,
html=html,
@@ -469,26 +457,6 @@ class AsyncWebCrawler:
async_response.ssl_certificate
) # Add SSL certificate
# # Check and set values from async_response to crawl_result
# try:
# for key in vars(async_response):
# if hasattr(crawl_result, key):
# value = getattr(async_response, key, None)
# current_value = getattr(crawl_result, key, None)
# if value is not None and not current_value:
# try:
# setattr(crawl_result, key, value)
# except Exception as e:
# self.logger.warning(
# message=f"Failed to set attribute {key}: {str(e)}",
# tag="WARNING"
# )
# except Exception as e:
# self.logger.warning(
# message=f"Error copying response attributes: {str(e)}",
# tag="WARNING"
# )
crawl_result.success = bool(html)
crawl_result.session_id = getattr(config, "session_id", None)
@@ -538,8 +506,6 @@ class AsyncWebCrawler:
f"Error: {str(e)}\n\n"
f"Code context:\n{error_context['code_context']}"
)
# if not hasattr(e, "msg"):
# e.msg = str(e)
self.logger.error_status(
url=url,
@@ -578,6 +544,7 @@ class AsyncWebCrawler:
Returns:
CrawlResult: Processed result containing extracted and formatted content
"""
cleaned_html = ""
try:
_url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
t1 = time.perf_counter()
@@ -592,6 +559,10 @@ class AsyncWebCrawler:
# Add keys from kwargs to params that don't already exist in params
params.update({k: v for k, v in kwargs.items() if k not in params.keys()})
################################
# Scraping Strategy Execution #
################################
result = scraping_strategy.scrap(url, html, **params)
if result is None:
@@ -618,7 +589,9 @@ class AsyncWebCrawler:
links = result.links.model_dump()
metadata = result.metadata
# Markdown Generation
################################
# Generate Markdown #
################################
markdown_generator: Optional[MarkdownGenerationStrategy] = (
config.markdown_generator or DefaultMarkdownGenerator()
)
@@ -644,14 +617,15 @@ class AsyncWebCrawler:
params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000)},
)
# Handle content extraction if needed
################################
# Structured Content Extraction #
################################
if (
not bool(extracted_content)
and config.extraction_strategy
and not isinstance(config.extraction_strategy, NoExtractionStrategy)
):
t1 = time.perf_counter()
# Choose content based on input_format
content_format = config.extraction_strategy.input_format
if content_format == "fit_markdown" and not markdown_result.fit_markdown:
@@ -665,6 +639,7 @@ class AsyncWebCrawler:
content = {
"markdown": markdown,
"html": html,
"cleaned_html": cleaned_html,
"fit_markdown": markdown_result.raw_markdown,
}.get(content_format, markdown)
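With cleaned_html now in the mapping, an extraction strategy can consume the cleaned HTML instead of Markdown. A sketch, assuming the strategy forwards input_format to the ExtractionStrategy base as usual:

from crawl4ai import CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

schema = {
    "name": "items",
    "baseSelector": "div.item",
    "fields": [{"name": "title", "selector": "h2", "type": "text"}],
}
# input_format selects which representation the strategy receives.
strategy = JsonCssExtractionStrategy(schema, input_format="cleaned_html")
config = CrawlerRunConfig(extraction_strategy=strategy)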

View File

@@ -529,6 +529,9 @@ class WebScrapingStrategy(ContentScrapingStrategy):
if normalized_href not in external_links_dict:
external_links_dict[normalized_href] = link_data
else:
if kwargs.get("exclude_internal_links", False):
element.decompose()
return False
if normalized_href not in internal_links_dict:
internal_links_dict[normalized_href] = link_data
@@ -629,7 +632,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
try:
self.remove_unwanted_attributes(
element, IMPORTANT_ATTRS, kwargs.get("keep_data_attributes", False)
element, IMPORTANT_ATTRS + kwargs.get("keep_attrs", []) , kwargs.get("keep_data_attributes", False)
)
except Exception as e:
# print('Error removing unwanted attributes:', str(e))
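Stand-alone illustration of the new exclude_internal_links branch (a minimal sketch using BeautifulSoup directly; the real strategy runs this inside its link-walking loop):

from bs4 import BeautifulSoup
from urllib.parse import urlparse

html = '<p><a href="/about">About</a> <a href="https://other.com/x">Other</a></p>'
soup = BeautifulSoup(html, "lxml")
base_domain = "example.com"

for a in soup.find_all("a", href=True):
    host = urlparse(a["href"]).netloc
    if host in ("", base_domain):  # internal link
        a.decompose()              # removed from the tree, as in the strategy

print(soup.get_text(" ", strip=True))  # -> Other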

View File

@@ -1098,17 +1098,19 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
user_message = {
"role": "user",
"content": f"""
Instructions:
{prompt_template}
HTML to analyze:
```html
{html}
```
{"Extract the following data: " + query if query else "Please analyze the HTML structure and create the most appropriate schema for data extraction."}
Instructions to extract schema for the above given HTML:
{prompt_template}
"""
}
if query:
user_message["content"] += f"\n\nImportant Notes to Consider:\n{query}"
try:
# Call LLM with backoff handling

View File

@@ -143,6 +143,7 @@ class AsyncCrawlResponse(BaseModel):
###############################
class MediaItem(BaseModel):
src: Optional[str] = ""
data: Optional[str] = ""
alt: Optional[str] = ""
desc: Optional[str] = ""
score: Optional[int] = 0
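The new data field gives in-memory extractions a home on the media model; a PDF image extracted with save_images_locally=False arrives roughly as (values illustrative):

from crawl4ai.models import MediaItem

item = MediaItem(
    src="",                   # no file path in in-memory mode
    data="iVBORw0KGgo...",    # truncated base64 PNG payload
    alt="Figure 1",
    desc="page 1 image",
    score=0,
)
print(item.model_dump())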

View File

@@ -0,0 +1,164 @@
import asyncio
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Dict, List, Optional
from dataclasses import asdict
from crawl4ai.async_logger import AsyncLogger
from crawl4ai.async_crawler_strategy import AsyncCrawlerStrategy
from crawl4ai.models import AsyncCrawlResponse, ScrapingResult
from crawl4ai.content_scraping_strategy import ContentScrapingStrategy
from .processor import NaivePDFProcessorStrategy
class PDFCrawlerStrategy(AsyncCrawlerStrategy):
def __init__(self, logger: AsyncLogger = None):
self.logger = logger
async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
# Just pass through with empty HTML - scraper will handle actual processing
return AsyncCrawlResponse(
html="", # Scraper will handle the real work
response_headers={"Content-Type": "application/pdf"},
status_code=200
)
async def close(self):
pass
async def __aenter__(self):
return self
async def __aexit__(self, exc_type, exc_val, exc_tb):
await self.close()
class PDFContentScrapingStrategy(ContentScrapingStrategy):
"""
A content scraping strategy for PDF files.
Attributes:
save_images_locally (bool): Whether to save images locally.
extract_images (bool): Whether to extract images from PDF.
image_save_dir (str): Directory to save extracted images.
logger (AsyncLogger): Logger instance for recording events and errors.
Methods:
scrap(url: str, html: str, **params) -> ScrapingResult:
Scrape content from a PDF file.
ascrap(url: str, html: str, **kwargs) -> ScrapingResult:
Asynchronous version of scrap.
Usage:
strategy = PDFContentScrapingStrategy(
save_images_locally=False,
extract_images=False,
image_save_dir=None,
logger=logger
)
"""
def __init__(self,
save_images_locally=False,
extract_images=False,
image_save_dir=None,
logger: AsyncLogger = None):
self.logger = logger
self.pdf_processor = NaivePDFProcessorStrategy(
save_images_locally=save_images_locally,
extract_images=extract_images,
image_save_dir=image_save_dir
)
def scrap(self, url: str, html: str, **params) -> ScrapingResult:
"""
Scrape content from a PDF file.
Args:
url (str): The URL of the PDF file.
html (str): Unused for PDFs; the crawler strategy supplies an empty string.
**params: Additional parameters.
Returns:
ScrapingResult: The scraped content.
"""
# Download if URL or use local path
pdf_path = self._get_pdf_path(url)
try:
# Process PDF
result = self.pdf_processor.process(Path(pdf_path))
# Combine page HTML
cleaned_html = f"""
<html>
<head><meta name="pdf-pages" content="{len(result.pages)}"></head>
<body>
{''.join(f'<div class="pdf-page" data-page="{i+1}">{page.html}</div>'
for i, page in enumerate(result.pages))}
</body>
</html>
"""
# Accumulate media and links with page numbers
media = {"images": []}
links = {"urls": []}
for page in result.pages:
# Add page number to each image
for img in page.images:
img["page"] = page.page_number
media["images"].append(img)
# Add page number to each link
for link in page.links:
links["urls"].append({
"url": link,
"page": page.page_number
})
return ScrapingResult(
cleaned_html=cleaned_html,
success=True,
media=media,
links=links,
metadata=asdict(result.metadata)
)
finally:
# Cleanup temp file if downloaded
if url.startswith(("http://", "https://")):
Path(pdf_path).unlink(missing_ok=True)
async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
# Run the synchronous scrap in a worker thread to avoid blocking the event loop
return await asyncio.to_thread(self.scrap, url, html, **kwargs)
def _get_pdf_path(self, url: str) -> str:
if url.startswith(("http://", "https://")):
import tempfile
import requests
# Create temp file with .pdf extension
temp_file = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
temp_file.close()  # keep only the path; reopening by name is safe cross-platform
try:
# Download PDF with streaming
response = requests.get(url, stream=True)
response.raise_for_status()
# Write to temp file
with open(temp_file.name, 'wb') as f:
for chunk in response.iter_content(chunk_size=8192):
f.write(chunk)
return temp_file.name
except Exception as e:
# Clean up temp file if download fails
Path(temp_file.name).unlink(missing_ok=True)
raise RuntimeError(f"Failed to download PDF from {url}: {str(e)}")
elif url.startswith("file://"):
return url[7:] # Strip file:// prefix
return url # Assume local path
__all__ = ["PDFCrawlerStrategy", "PDFContentScrapingStrategy"]
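Outside a crawler, the scraping strategy can also be driven directly; a sketch with a hypothetical local file and the same assumed module path as above:

from crawl4ai.processors.pdf import PDFContentScrapingStrategy  # assumed path

strategy = PDFContentScrapingStrategy(extract_images=True, save_images_locally=False)
result = strategy.scrap("file:///tmp/paper.pdf", html="")
print(result.metadata.get("title"))
print(result.cleaned_html[:200])  # per-page <div class="pdf-page"> wrappers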

View File

@@ -0,0 +1,372 @@
import logging
import re
from abc import ABC, abstractmethod
from datetime import datetime
from pathlib import Path
from time import time
from dataclasses import dataclass, asdict, field
from typing import Dict, List, Optional, Tuple
import PyPDF2
from PIL import Image
from PyPDF2 import PdfReader
from .utils import *
import base64
import tempfile
logger = logging.getLogger(__name__)
@dataclass
class PDFMetadata:
title: Optional[str] = None
author: Optional[str] = None
producer: Optional[str] = None
created: Optional[datetime] = None
modified: Optional[datetime] = None
pages: int = 0
encrypted: bool = False
file_size: Optional[int] = None
@dataclass
class PDFPage:
page_number: int
raw_text: str = ""
markdown: str = "" # Added per your request
html: str = "" # Added per your request
images: List[Dict] = field(default_factory=list)
links: List[str] = field(default_factory=list)
layout: List[Dict] = field(default_factory=list)
@dataclass
class PDFProcessResult:
metadata: PDFMetadata
pages: List[PDFPage]
processing_time: float = 0.0
version: str = "1.0"
class PDFProcessorStrategy(ABC):
@abstractmethod
def process(self, pdf_path: Path) -> PDFProcessResult:
pass
class NaivePDFProcessorStrategy(PDFProcessorStrategy):
def __init__(self, image_dpi: int = 144, image_quality: int = 85, extract_images: bool = True,
save_images_locally: bool = False, image_save_dir: Optional[Path] = None):
self.image_dpi = image_dpi
self.image_quality = image_quality
self.current_page_number = 0
self.extract_images = extract_images
self.save_images_locally = save_images_locally
self.image_save_dir = image_save_dir
self._temp_dir = None
def process(self, pdf_path: Path) -> PDFProcessResult:
start_time = time()
result = PDFProcessResult(
metadata=PDFMetadata(),
pages=[],
version="1.1"
)
try:
with pdf_path.open('rb') as file:
reader = PdfReader(file)
result.metadata = self._extract_metadata(pdf_path, reader)
# Handle image directory
image_dir = None
if self.extract_images and self.save_images_locally:
if self.image_save_dir:
image_dir = Path(self.image_save_dir)
image_dir.mkdir(exist_ok=True, parents=True)
else:
self._temp_dir = tempfile.mkdtemp(prefix='pdf_images_')
image_dir = Path(self._temp_dir)
for page_num, page in enumerate(reader.pages):
self.current_page_number = page_num + 1
pdf_page = self._process_page(page, image_dir, reader)
result.pages.append(pdf_page)
except Exception as e:
logger.error(f"Failed to process PDF: {str(e)}")
raise
finally:
# Cleanup temp directory if it was created
if self._temp_dir and not self.image_save_dir:
import shutil
try:
shutil.rmtree(self._temp_dir)
except Exception as e:
logger.error(f"Failed to cleanup temp directory: {str(e)}")
result.processing_time = time() - start_time
return result
def _process_page(self, page, image_dir: Optional[Path], reader) -> PDFPage:
pdf_page = PDFPage(
page_number=self.current_page_number,
)
# Text and font extraction
def visitor_text(text, cm, tm, font_dict, font_size):
pdf_page.raw_text += text
pdf_page.layout.append({
"type": "text",
"text": text,
"x": tm[4],
"y": tm[5],
})
page.extract_text(visitor_text=visitor_text)
# Image extraction
if self.extract_images:
pdf_page.images = self._extract_images(page, image_dir)
# Link extraction
pdf_page.links = self._extract_links(page)
# Add markdown content
pdf_page.markdown = clean_pdf_text(self.current_page_number, pdf_page.raw_text)
pdf_page.html = clean_pdf_text_to_html(self.current_page_number, pdf_page.raw_text)
return pdf_page
def _extract_images(self, page, image_dir: Optional[Path]) -> List[Dict]:
if not self.extract_images:
return []
images = []
try:
resources = page.get("/Resources")
if resources: # Check if resources exist
resources = resources.get_object() # Resolve IndirectObject
if '/XObject' in resources:
xobjects = resources['/XObject'].get_object()
img_count = 0
for obj_name in xobjects:
xobj = xobjects[obj_name]
if hasattr(xobj, 'get_object') and callable(xobj.get_object):
xobj = xobj.get_object()
if xobj.get('/Subtype') == '/Image':
try:
img_count += 1
img_filename = f"page_{self.current_page_number}_img_{img_count}"
data = xobj.get_data()
filters = xobj.get('/Filter', [])
if not isinstance(filters, list):
filters = [filters]
# Resolve IndirectObjects in properties
width = xobj.get('/Width', 0)
height = xobj.get('/Height', 0)
color_space = xobj.get('/ColorSpace', '/DeviceRGB')
if isinstance(color_space, PyPDF2.generic.IndirectObject):
color_space = color_space.get_object()
# Handle different image encodings
success = False
image_format = 'bin'
image_data = None
if '/FlateDecode' in filters:
try:
decode_parms = xobj.get('/DecodeParms', {})
if isinstance(decode_parms, PyPDF2.generic.IndirectObject):
decode_parms = decode_parms.get_object()
predictor = decode_parms.get('/Predictor', 1)
bits = xobj.get('/BitsPerComponent', 8)
colors = 3 if color_space == '/DeviceRGB' else 1
if predictor >= 10:
data = apply_png_predictor(data, width, bits, colors)
# Create PIL Image
mode = 'RGB' if color_space == '/DeviceRGB' else 'L'
img = Image.frombytes(mode, (width, height), data)
if self.save_images_locally:
final_path = (image_dir / img_filename).with_suffix('.png')
img.save(final_path)
image_data = str(final_path)
else:
import io
img_byte_arr = io.BytesIO()
img.save(img_byte_arr, format='PNG')
image_data = base64.b64encode(img_byte_arr.getvalue()).decode('utf-8')
success = True
image_format = 'png'
except Exception as e:
logger.error(f"FlateDecode error: {str(e)}")
elif '/DCTDecode' in filters:
# JPEG image
try:
if self.save_images_locally:
final_path = (image_dir / img_filename).with_suffix('.jpg')
with open(final_path, 'wb') as f:
f.write(data)
image_data = str(final_path)
else:
image_data = base64.b64encode(data).decode('utf-8')
success = True
image_format = 'jpeg'
except Exception as e:
logger.error(f"JPEG save error: {str(e)}")
elif '/CCITTFaxDecode' in filters:
try:
if data[:4] != b'II*\x00':
# Add TIFF header if missing
tiff_header = b'II*\x00\x08\x00\x00\x00\x0e\x00\x00\x01\x03\x00\x01\x00\x00\x00' + \
width.to_bytes(4, 'little') + \
b'\x01\x03\x00\x01\x00\x00\x00' + \
height.to_bytes(4, 'little') + \
b'\x01\x12\x00\x03\x00\x00\x00\x01\x00\x01\x00\x00\x01\x17\x00\x04\x00\x00\x00\x01\x00\x00\x00J\x01\x1B\x00\x05\x00\x00\x00\x01\x00\x00\x00R\x01\x28\x00\x03\x00\x00\x00\x01\x00\x02\x00\x00'
data = tiff_header + data
if self.save_images_locally:
final_path = (image_dir / img_filename).with_suffix('.tiff')
with open(final_path, 'wb') as f:
f.write(data)
image_data = str(final_path)
else:
image_data = base64.b64encode(data).decode('utf-8')
success = True
image_format = 'tiff'
except Exception as e:
logger.error(f"CCITT save error: {str(e)}")
elif '/JPXDecode' in filters:
# JPEG 2000
try:
if self.save_images_locally:
final_path = (image_dir / img_filename).with_suffix('.jp2')
with open(final_path, 'wb') as f:
f.write(data)
image_data = str(final_path)
else:
image_data = base64.b64encode(data).decode('utf-8')
success = True
image_format = 'jpeg2000'
except Exception as e:
logger.error(f"JPEG2000 save error: {str(e)}")
if success and image_data:
image_info = {
"format": image_format,
"width": width,
"height": height,
"color_space": str(color_space),
"bits_per_component": xobj.get('/BitsPerComponent', 1)
}
if self.save_images_locally:
image_info["path"] = image_data
else:
image_info["data"] = image_data
images.append(image_info)
else:
# Fallback: Save raw data
if self.save_images_locally:
final_path = (image_dir / img_filename).with_suffix('.bin')
with open(final_path, 'wb') as f:
f.write(data)
logger.warning(f"Saved raw image data to {final_path}")
else:
image_data = base64.b64encode(data).decode('utf-8')
images.append({
"format": "bin",
"width": width,
"height": height,
"color_space": str(color_space),
"bits_per_component": xobj.get('/BitsPerComponent', 1),
"data": image_data
})
except Exception as e:
logger.error(f"Error processing image: {str(e)}")
except Exception as e:
logger.error(f"Image extraction error: {str(e)}")
return images
def _extract_links(self, page) -> List[str]:
links = []
if '/Annots' in page:
try:
for annot in page['/Annots']:
a = annot.get_object()
if '/A' in a and '/URI' in a['/A']:
links.append(a['/A']['/URI'])
except Exception as e:
print(f"Link error: {str(e)}")
return links
def _extract_metadata(self, pdf_path: Path, reader: PdfReader = None) -> PDFMetadata:
if not reader:
reader = PdfReader(pdf_path)
meta = reader.metadata or {}
created = self._parse_pdf_date(meta.get('/CreationDate', ''))
modified = self._parse_pdf_date(meta.get('/ModDate', ''))
return PDFMetadata(
title=meta.get('/Title'),
author=meta.get('/Author'),
producer=meta.get('/Producer'),
created=created,
modified=modified,
pages=len(reader.pages),
encrypted=reader.is_encrypted,
file_size=pdf_path.stat().st_size
)
def _parse_pdf_date(self, date_str: str) -> Optional[datetime]:
try:
match = re.match(r'D:(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})', date_str)
if not match:
return None
return datetime(
year=int(match[1]),
month=int(match[2]),
day=int(match[3]),
hour=int(match[4]),
minute=int(match[5]),
second=int(match[6])
)
except Exception:
return None
# Usage example
if __name__ == "__main__":
import json
from pathlib import Path
current_dir = Path(__file__).resolve().parent
pdf_path = f'{current_dir}/test.pdf'
strategy = NaivePDFProcessorStrategy()
result = strategy.process(Path(pdf_path))
# Convert to JSON
json_output = asdict(result)
print(json.dumps(json_output, indent=2, default=str))
with open(f'{current_dir}/test.html', 'w') as f:
    for page in result.pages:
        f.write(f'<h1>Page {page.page_number}</h1>')  # pages are dataclasses, not dicts
        f.write(page.html)
with open(f'{current_dir}/test.md', 'w') as f:
    for page in result.pages:
        f.write(f'# Page {page.page_number}\n\n')
        f.write(page.markdown)  # already cleaned in _process_page
        f.write('\n\n')
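_parse_pdf_date follows the PDF "D:YYYYMMDDHHMMSS" convention; a quick check against the class defined above (timezone suffixes beyond seconds are ignored by the regex):

strategy = NaivePDFProcessorStrategy()
print(strategy._parse_pdf_date("D:20250127212415+08'00'"))
# -> 2025-01-27 21:24:15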

View File

@@ -0,0 +1,350 @@
import re
def apply_png_predictor(data, width, bits, color_channels):
"""Decode PNG predictor (PDF 1.5+ filter)"""
bytes_per_pixel = (bits * color_channels) // 8
if (bits * color_channels) % 8 != 0:
bytes_per_pixel += 1
stride = width * bytes_per_pixel
scanline_length = stride + 1 # +1 for filter byte
if len(data) % scanline_length != 0:
raise ValueError("Invalid scanline structure")
num_lines = len(data) // scanline_length
output = bytearray()
prev_line = b'\x00' * stride
for i in range(num_lines):
line = data[i*scanline_length:(i+1)*scanline_length]
filter_type = line[0]
filtered = line[1:]
if filter_type == 0: # None
decoded = filtered
elif filter_type == 1: # Sub
decoded = bytearray(filtered)
for j in range(bytes_per_pixel, len(decoded)):
decoded[j] = (decoded[j] + decoded[j - bytes_per_pixel]) % 256
elif filter_type == 2: # Up
decoded = bytearray([(filtered[j] + prev_line[j]) % 256
for j in range(len(filtered))])
elif filter_type == 3: # Average
decoded = bytearray(filtered)
for j in range(len(decoded)):
left = decoded[j - bytes_per_pixel] if j >= bytes_per_pixel else 0
up = prev_line[j]
avg = (left + up) // 2
decoded[j] = (decoded[j] + avg) % 256
elif filter_type == 4: # Paeth
decoded = bytearray(filtered)
for j in range(len(decoded)):
left = decoded[j - bytes_per_pixel] if j >= bytes_per_pixel else 0
up = prev_line[j]
up_left = prev_line[j - bytes_per_pixel] if j >= bytes_per_pixel else 0
paeth = paeth_predictor(left, up, up_left)
decoded[j] = (decoded[j] + paeth) % 256
else:
raise ValueError(f"Unsupported filter type: {filter_type}")
output.extend(decoded)
prev_line = decoded
return bytes(output)
def paeth_predictor(a, b, c):
p = a + b - c
pa = abs(p - a)
pb = abs(p - b)
pc = abs(p - c)
if pa <= pb and pa <= pc:
return a
elif pb <= pc:
return b
else:
return c
import html
def clean_pdf_text_to_html(page_number, text):
# Decode Unicode escapes and handle surrogate pairs
try:
decoded = text.encode('latin-1').decode('unicode-escape')
decoded = decoded.encode('utf-16', 'surrogatepass').decode('utf-16')
except Exception as e:
decoded = text # Fallback if decoding fails
article_title_detected = False
lines = decoded.split('\n')
output = []
current_paragraph = []
in_header = False
email_pattern = re.compile(r'\{.*?\}')
affiliation_pattern = re.compile(r'^†')
quote_pattern = re.compile(r'^["“]')
author_pattern = re.compile(
r'^\s*[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?'
r'(?:,\s*[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?)*'
r'(?:,\s*(?:and|&)\s+[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?)?\s*$'
)
def flush_paragraph():
if current_paragraph:
para = ' '.join(current_paragraph)
para = re.sub(r'\s+', ' ', para).strip()
if para:
escaped_para = para  # deliberately left unescaped so inline markup survives
# Split on sentence-final paragraph breaks and wrap each piece in its own <p>
escaped_para = escaped_para.split('.\n\n')
escaped_para = [f'<p>{part}</p>' for part in escaped_para]
output.append(f'<div class="paragraph">{"".join(escaped_para)}</div><hr/>')
current_paragraph.clear()
for i, line in enumerate(lines):
line = line.strip()
# Handle empty lines
if not line:
flush_paragraph()
continue
# Detect article title (first line with reasonable length)
if not article_title_detected and i == 0 and 3 <= len(line.split()) <= 8 and len(lines) > 1:
flush_paragraph()
escaped_line = html.escape(line)
output.append(f'<h2>{escaped_line}</h2>')
article_title_detected = True
continue
# Detect numbered headers like "2.1 Background"
numbered_header = re.match(r'^(\d+(?:\.\d+)*)\s+(.+)$', line)
if i > 0 and not lines[i-1].strip() and numbered_header:
flush_paragraph()
level = numbered_header.group(1).count('.') + 1
header_text = numbered_header.group(2)
md_level = min(level + 1, 6)
escaped_header = html.escape(header_text)
output.append(f'<h{md_level}>{escaped_header}</h{md_level}>')
in_header = True
continue
# Detect authors
if page_number == 1 and author_pattern.match(line):
authors = re.sub(r'[†â€]', '', line)
authors = re.split(r', | and ', authors)
formatted_authors = []
for author in authors:
if author.strip():
parts = [p for p in author.strip().split() if p]
formatted = ' '.join(parts)
escaped_author = html.escape(formatted)
formatted_authors.append(f'<strong>{escaped_author}</strong>')
if len(formatted_authors) > 1:
joined = ', '.join(formatted_authors[:-1]) + ' and ' + formatted_authors[-1]
else:
joined = formatted_authors[0]
output.append(f'<p>{joined}</p>')
continue
# Detect affiliation
if affiliation_pattern.match(line):
escaped_line = html.escape(line)
output.append(f'<p><em>{escaped_line}</em></p>')
continue
# Detect emails
if email_pattern.match(line):
escaped_line = html.escape(line)
output.append(f'<p><code>{escaped_line}</code></p>')
continue
# Detect section headers
if re.match(r'^(Abstract|\d+\s+[A-Z]|References|Appendix|Figure|Table)', line):
flush_paragraph()
escaped_line = html.escape(line)
output.append(f'<h2 class="section-header"><em>{escaped_line}</em></h2>')
in_header = True
continue
# Handle quotes
if quote_pattern.match(line):
flush_paragraph()
escaped_line = html.escape(line)
output.append(f'<blockquote><p>{escaped_line}</p></blockquote>')
continue
# Handle hyphenated words
if line.endswith('-'):
current_paragraph.append(line[:-1].strip())
else:
current_paragraph.append(line)
# Handle paragraph breaks after headers
if in_header and not line.endswith(('.', '!', '?')):
flush_paragraph()
in_header = False
flush_paragraph()
# Post-process HTML
html_output = '\n'.join(output)
# Fix common citation patterns
html_output = re.sub(r'\(([A-Z][a-z]+ et al\. \d{4})\)', r'<cite>\1</cite>', html_output)
# Fix escaped characters
html_output = html_output.replace('\\ud835', '').replace('\\u2020', '')
# Remove leftover hyphens and fix spacing
html_output = re.sub(r'\s+-\s+', '', html_output)
html_output = re.sub(r'\s+([.,!?)])', r'\1', html_output)
return html_output
def clean_pdf_text(page_number, text):
# Decode Unicode escapes and handle surrogate pairs
try:
decoded = text.encode('latin-1').decode('unicode-escape')
decoded = decoded.encode('utf-16', 'surrogatepass').decode('utf-16')
except Exception as e:
decoded = text # Fallback if decoding fails
article_title_detected = False
decoded = re.sub(r'\.\n', '.\n\n', decoded)
lines = decoded.split('\n')
output = []
current_paragraph = []
in_header = False
email_pattern = re.compile(r'\{.*?\}')
affiliation_pattern = re.compile(r'^†')
quote_pattern = re.compile(r'^["“]')
author_pattern = re.compile(
r'^\s*[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?'
r'(?:,\s*[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?)*'
r'(?:,\s*(?:and|&)\s+[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?)?\s*$'
)
def flush_paragraph():
if current_paragraph:
para = ' '.join(current_paragraph)
para = re.sub(r'\s+', ' ', para).strip()
if para:
output.append(para)
current_paragraph.clear()
for i, line in enumerate(lines):
line = line.strip()
# Handle special patterns
if not line:
flush_paragraph()
continue
# Detect headline (first line, reasonable length, surrounded by empty lines)
if not article_title_detected and i == 0 and 3 <= len(line.split()) <= 8 and len(lines) > 1:
    flush_paragraph()
    output.append(f'## {line}')
    article_title_detected = True
    continue
# Detect numbered headers like "2.1 Background"
numbered_header = re.match(r'^(\d+(?:\.\d+)*)\s+(.+)$', line)
if i > 0 and not lines[i-1].strip() and numbered_header:
flush_paragraph()
level = numbered_header.group(1).count('.') + 1 # Convert 2.1 → level 2
header_text = numbered_header.group(2)
# Never go beyond ### for subsections
md_level = min(level + 1, 6) # 1 → ##, 2 → ###, 3 → #### etc
output.append(f'{"#" * md_level} {header_text}')
in_header = True
continue
# Detect authors
if page_number == 1 and author_pattern.match(line):
# Clean and format author names
authors = re.sub(r'[†â€]', '', line) # Remove affiliation markers
authors = re.split(r', | and ', authors)
formatted_authors = []
for author in authors:
if author.strip():
# Handle "First Last" formatting
parts = [p for p in author.strip().split() if p]
formatted = ' '.join(parts)
formatted_authors.append(f'**{formatted}**')
# Join with commas and "and"
if len(formatted_authors) > 1:
joined = ', '.join(formatted_authors[:-1]) + ' and ' + formatted_authors[-1]
else:
joined = formatted_authors[0]
output.append(joined)
continue
# Detect affiliation
if affiliation_pattern.match(line):
output.append(f'*{line}*')
continue
# Detect emails
if email_pattern.match(line):
output.append(f'`{line}`')
continue
# Detect section headers
if re.match(r'^(Abstract|\d+\s+[A-Z]|References|Appendix|Figure|Table)', line):
flush_paragraph()
output.append(f'_[{line}]_')
in_header = True
continue
# Handle quotes
if quote_pattern.match(line):
flush_paragraph()
output.append(f'> {line}')
continue
# Handle hyphenated words
if line.endswith('-'):
current_paragraph.append(line[:-1].strip())
else:
current_paragraph.append(line)
# Handle paragraph breaks after headers
if in_header and not line.endswith(('.', '!', '?')):
flush_paragraph()
in_header = False
flush_paragraph()
# Post-processing
markdown = '\n\n'.join(output)
# Fix common citation patterns
markdown = re.sub(r'\(([A-Z][a-z]+ et al\. \d{4})\)', r'[\1]', markdown)
# Fix escaped characters
markdown = markdown.replace('\\ud835', '').replace('\\u2020', '')
# Remove leftover hyphens and fix spacing
markdown = re.sub(r'\s+-\s+', '', markdown) # Join hyphenated words
markdown = re.sub(r'\s+([.,!?)])', r'\1', markdown) # Fix punctuation spacing
return markdown
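A quick sanity check of the Markdown cleaner on a synthetic first page (input illustrative; real text arrives from PyPDF2's visitor callback):

sample = "A Study of Things\n\n1 Introduction\nThis paper studies things in detail.\n"
print(clean_pdf_text(1, sample))
# ## A Study of Things
#
# ## Introduction
#
# This paper studies things in detail.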