feat(pdf): add PDF processing capabilities
Add new PDF processing module with the following features: - PDF text extraction and formatting to HTML/Markdown - Image extraction with multiple format support (JPEG, PNG, TIFF) - Link extraction from PDF documents - Metadata extraction including title, author, dates - Support for both local and remote PDF files Also includes: - New configuration options for HTML attribute handling - Internal/external link filtering improvements - Version bump to 0.4.300b4
This commit is contained in:
@@ -1,2 +1,3 @@
|
|||||||
# crawl4ai/_version.py
|
# crawl4ai/_version.py
|
||||||
__version__ = "0.4.3b3"
|
# __version__ = "0.4.3b3"
|
||||||
|
__version__ = "0.4.300b4"
|
||||||
|
|||||||
@@ -271,6 +271,8 @@ class CrawlerRunConfig:
|
|||||||
Default: None.
|
Default: None.
|
||||||
keep_data_attributes (bool): If True, retain `data-*` attributes while removing unwanted attributes.
|
keep_data_attributes (bool): If True, retain `data-*` attributes while removing unwanted attributes.
|
||||||
Default: False.
|
Default: False.
|
||||||
|
keep_attrs (list of str): List of HTML attributes to keep during processing.
|
||||||
|
Default: [].
|
||||||
remove_forms (bool): If True, remove all `<form>` elements from the HTML.
|
remove_forms (bool): If True, remove all `<form>` elements from the HTML.
|
||||||
Default: False.
|
Default: False.
|
||||||
prettiify (bool): If True, apply `fast_format_html` to produce prettified HTML output.
|
prettiify (bool): If True, apply `fast_format_html` to produce prettified HTML output.
|
||||||
@@ -282,6 +284,8 @@ class CrawlerRunConfig:
|
|||||||
proxy_config (dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
|
proxy_config (dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
|
||||||
If None, no additional proxy config. Default: None.
|
If None, no additional proxy config. Default: None.
|
||||||
|
|
||||||
|
# SSL Parameters
|
||||||
|
fetch_ssl_certificate: bool = False,
|
||||||
# Caching Parameters
|
# Caching Parameters
|
||||||
cache_mode (CacheMode or None): Defines how caching is handled.
|
cache_mode (CacheMode or None): Defines how caching is handled.
|
||||||
If None, defaults to CacheMode.ENABLED internally.
|
If None, defaults to CacheMode.ENABLED internally.
|
||||||
@@ -363,10 +367,14 @@ class CrawlerRunConfig:
|
|||||||
Default: SOCIAL_MEDIA_DOMAINS (from config).
|
Default: SOCIAL_MEDIA_DOMAINS (from config).
|
||||||
exclude_external_links (bool): If True, exclude all external links from the results.
|
exclude_external_links (bool): If True, exclude all external links from the results.
|
||||||
Default: False.
|
Default: False.
|
||||||
|
exclude_internal_links (bool): If True, exclude internal links from the results.
|
||||||
|
Default: False.
|
||||||
exclude_social_media_links (bool): If True, exclude links pointing to social media domains.
|
exclude_social_media_links (bool): If True, exclude links pointing to social media domains.
|
||||||
Default: False.
|
Default: False.
|
||||||
exclude_domains (list of str): List of specific domains to exclude from results.
|
exclude_domains (list of str): List of specific domains to exclude from results.
|
||||||
Default: [].
|
Default: [].
|
||||||
|
exclude_internal_links (bool): If True, exclude internal links from the results.
|
||||||
|
Default: False.
|
||||||
|
|
||||||
# Debugging and Logging Parameters
|
# Debugging and Logging Parameters
|
||||||
verbose (bool): Enable verbose logging.
|
verbose (bool): Enable verbose logging.
|
||||||
@@ -402,6 +410,7 @@ class CrawlerRunConfig:
|
|||||||
excluded_tags: list = None,
|
excluded_tags: list = None,
|
||||||
excluded_selector: str = None,
|
excluded_selector: str = None,
|
||||||
keep_data_attributes: bool = False,
|
keep_data_attributes: bool = False,
|
||||||
|
keep_attrs: list = None,
|
||||||
remove_forms: bool = False,
|
remove_forms: bool = False,
|
||||||
prettiify: bool = False,
|
prettiify: bool = False,
|
||||||
parser_type: str = "lxml",
|
parser_type: str = "lxml",
|
||||||
@@ -451,6 +460,7 @@ class CrawlerRunConfig:
|
|||||||
exclude_external_links: bool = False,
|
exclude_external_links: bool = False,
|
||||||
exclude_social_media_links: bool = False,
|
exclude_social_media_links: bool = False,
|
||||||
exclude_domains: list = None,
|
exclude_domains: list = None,
|
||||||
|
exclude_internal_links: bool = False,
|
||||||
# Debugging and Logging Parameters
|
# Debugging and Logging Parameters
|
||||||
verbose: bool = True,
|
verbose: bool = True,
|
||||||
log_console: bool = False,
|
log_console: bool = False,
|
||||||
@@ -475,6 +485,7 @@ class CrawlerRunConfig:
|
|||||||
self.excluded_tags = excluded_tags or []
|
self.excluded_tags = excluded_tags or []
|
||||||
self.excluded_selector = excluded_selector or ""
|
self.excluded_selector = excluded_selector or ""
|
||||||
self.keep_data_attributes = keep_data_attributes
|
self.keep_data_attributes = keep_data_attributes
|
||||||
|
self.keep_attrs = keep_attrs or []
|
||||||
self.remove_forms = remove_forms
|
self.remove_forms = remove_forms
|
||||||
self.prettiify = prettiify
|
self.prettiify = prettiify
|
||||||
self.parser_type = parser_type
|
self.parser_type = parser_type
|
||||||
@@ -532,6 +543,7 @@ class CrawlerRunConfig:
|
|||||||
self.exclude_external_links = exclude_external_links
|
self.exclude_external_links = exclude_external_links
|
||||||
self.exclude_social_media_links = exclude_social_media_links
|
self.exclude_social_media_links = exclude_social_media_links
|
||||||
self.exclude_domains = exclude_domains or []
|
self.exclude_domains = exclude_domains or []
|
||||||
|
self.exclude_internal_links = exclude_internal_links
|
||||||
|
|
||||||
# Debugging and Logging Parameters
|
# Debugging and Logging Parameters
|
||||||
self.verbose = verbose
|
self.verbose = verbose
|
||||||
@@ -580,6 +592,7 @@ class CrawlerRunConfig:
|
|||||||
excluded_tags=kwargs.get("excluded_tags", []),
|
excluded_tags=kwargs.get("excluded_tags", []),
|
||||||
excluded_selector=kwargs.get("excluded_selector", ""),
|
excluded_selector=kwargs.get("excluded_selector", ""),
|
||||||
keep_data_attributes=kwargs.get("keep_data_attributes", False),
|
keep_data_attributes=kwargs.get("keep_data_attributes", False),
|
||||||
|
keep_attrs=kwargs.get("keep_attrs", []),
|
||||||
remove_forms=kwargs.get("remove_forms", False),
|
remove_forms=kwargs.get("remove_forms", False),
|
||||||
prettiify=kwargs.get("prettiify", False),
|
prettiify=kwargs.get("prettiify", False),
|
||||||
parser_type=kwargs.get("parser_type", "lxml"),
|
parser_type=kwargs.get("parser_type", "lxml"),
|
||||||
@@ -638,6 +651,7 @@ class CrawlerRunConfig:
|
|||||||
exclude_external_links=kwargs.get("exclude_external_links", False),
|
exclude_external_links=kwargs.get("exclude_external_links", False),
|
||||||
exclude_social_media_links=kwargs.get("exclude_social_media_links", False),
|
exclude_social_media_links=kwargs.get("exclude_social_media_links", False),
|
||||||
exclude_domains=kwargs.get("exclude_domains", []),
|
exclude_domains=kwargs.get("exclude_domains", []),
|
||||||
|
exclude_internal_links=kwargs.get("exclude_internal_links", False),
|
||||||
# Debugging and Logging Parameters
|
# Debugging and Logging Parameters
|
||||||
verbose=kwargs.get("verbose", True),
|
verbose=kwargs.get("verbose", True),
|
||||||
log_console=kwargs.get("log_console", False),
|
log_console=kwargs.get("log_console", False),
|
||||||
@@ -663,6 +677,7 @@ class CrawlerRunConfig:
|
|||||||
"excluded_tags": self.excluded_tags,
|
"excluded_tags": self.excluded_tags,
|
||||||
"excluded_selector": self.excluded_selector,
|
"excluded_selector": self.excluded_selector,
|
||||||
"keep_data_attributes": self.keep_data_attributes,
|
"keep_data_attributes": self.keep_data_attributes,
|
||||||
|
"keep_attrs": self.keep_attrs,
|
||||||
"remove_forms": self.remove_forms,
|
"remove_forms": self.remove_forms,
|
||||||
"prettiify": self.prettiify,
|
"prettiify": self.prettiify,
|
||||||
"parser_type": self.parser_type,
|
"parser_type": self.parser_type,
|
||||||
@@ -706,6 +721,7 @@ class CrawlerRunConfig:
|
|||||||
"exclude_external_links": self.exclude_external_links,
|
"exclude_external_links": self.exclude_external_links,
|
||||||
"exclude_social_media_links": self.exclude_social_media_links,
|
"exclude_social_media_links": self.exclude_social_media_links,
|
||||||
"exclude_domains": self.exclude_domains,
|
"exclude_domains": self.exclude_domains,
|
||||||
|
"exclude_internal_links": self.exclude_internal_links,
|
||||||
"verbose": self.verbose,
|
"verbose": self.verbose,
|
||||||
"log_console": self.log_console,
|
"log_console": self.log_console,
|
||||||
"stream": self.stream,
|
"stream": self.stream,
|
||||||
|
|||||||
@@ -319,14 +319,6 @@ class AsyncWebCrawler:
|
|||||||
try:
|
try:
|
||||||
# Handle configuration
|
# Handle configuration
|
||||||
if crawler_config is not None:
|
if crawler_config is not None:
|
||||||
# if any(param is not None for param in [
|
|
||||||
# word_count_threshold, extraction_strategy, chunking_strategy,
|
|
||||||
# content_filter, cache_mode, css_selector, screenshot, pdf
|
|
||||||
# ]):
|
|
||||||
# self.logger.warning(
|
|
||||||
# message="Both crawler_config and legacy parameters provided. crawler_config will take precedence.",
|
|
||||||
# tag="WARNING"
|
|
||||||
# )
|
|
||||||
config = crawler_config
|
config = crawler_config
|
||||||
else:
|
else:
|
||||||
# Merge all parameters into a single kwargs dict for config creation
|
# Merge all parameters into a single kwargs dict for config creation
|
||||||
@@ -350,14 +342,6 @@ class AsyncWebCrawler:
|
|||||||
|
|
||||||
# Handle deprecated cache parameters
|
# Handle deprecated cache parameters
|
||||||
if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]):
|
if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]):
|
||||||
if kwargs.get("warning", True):
|
|
||||||
warnings.warn(
|
|
||||||
"Cache control boolean flags are deprecated and will be removed in version 0.5.0. "
|
|
||||||
"Use 'cache_mode' parameter instead.",
|
|
||||||
DeprecationWarning,
|
|
||||||
stacklevel=2,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Convert legacy parameters if cache_mode not provided
|
# Convert legacy parameters if cache_mode not provided
|
||||||
if config.cache_mode is None:
|
if config.cache_mode is None:
|
||||||
config.cache_mode = _legacy_to_cache_mode(
|
config.cache_mode = _legacy_to_cache_mode(
|
||||||
@@ -430,7 +414,9 @@ class AsyncWebCrawler:
|
|||||||
response_headers={"X-Robots-Status": "Blocked by robots.txt"}
|
response_headers={"X-Robots-Status": "Blocked by robots.txt"}
|
||||||
)
|
)
|
||||||
|
|
||||||
# Pass config to crawl method
|
##############################
|
||||||
|
# Call CrawlerStrategy.crawl #
|
||||||
|
##############################
|
||||||
async_response = await self.crawler_strategy.crawl(
|
async_response = await self.crawler_strategy.crawl(
|
||||||
url,
|
url,
|
||||||
config=config, # Pass the entire config object
|
config=config, # Pass the entire config object
|
||||||
@@ -448,7 +434,9 @@ class AsyncWebCrawler:
|
|||||||
tag="FETCH",
|
tag="FETCH",
|
||||||
)
|
)
|
||||||
|
|
||||||
# Process the HTML content
|
###############################################################
|
||||||
|
# Process the HTML content, Call CrawlerStrategy.process_html #
|
||||||
|
###############################################################
|
||||||
crawl_result : CrawlResult = await self.aprocess_html(
|
crawl_result : CrawlResult = await self.aprocess_html(
|
||||||
url=url,
|
url=url,
|
||||||
html=html,
|
html=html,
|
||||||
@@ -469,26 +457,6 @@ class AsyncWebCrawler:
|
|||||||
async_response.ssl_certificate
|
async_response.ssl_certificate
|
||||||
) # Add SSL certificate
|
) # Add SSL certificate
|
||||||
|
|
||||||
# # Check and set values from async_response to crawl_result
|
|
||||||
# try:
|
|
||||||
# for key in vars(async_response):
|
|
||||||
# if hasattr(crawl_result, key):
|
|
||||||
# value = getattr(async_response, key, None)
|
|
||||||
# current_value = getattr(crawl_result, key, None)
|
|
||||||
# if value is not None and not current_value:
|
|
||||||
# try:
|
|
||||||
# setattr(crawl_result, key, value)
|
|
||||||
# except Exception as e:
|
|
||||||
# self.logger.warning(
|
|
||||||
# message=f"Failed to set attribute {key}: {str(e)}",
|
|
||||||
# tag="WARNING"
|
|
||||||
# )
|
|
||||||
# except Exception as e:
|
|
||||||
# self.logger.warning(
|
|
||||||
# message=f"Error copying response attributes: {str(e)}",
|
|
||||||
# tag="WARNING"
|
|
||||||
# )
|
|
||||||
|
|
||||||
crawl_result.success = bool(html)
|
crawl_result.success = bool(html)
|
||||||
crawl_result.session_id = getattr(config, "session_id", None)
|
crawl_result.session_id = getattr(config, "session_id", None)
|
||||||
|
|
||||||
@@ -538,8 +506,6 @@ class AsyncWebCrawler:
|
|||||||
f"Error: {str(e)}\n\n"
|
f"Error: {str(e)}\n\n"
|
||||||
f"Code context:\n{error_context['code_context']}"
|
f"Code context:\n{error_context['code_context']}"
|
||||||
)
|
)
|
||||||
# if not hasattr(e, "msg"):
|
|
||||||
# e.msg = str(e)
|
|
||||||
|
|
||||||
self.logger.error_status(
|
self.logger.error_status(
|
||||||
url=url,
|
url=url,
|
||||||
@@ -578,6 +544,7 @@ class AsyncWebCrawler:
|
|||||||
Returns:
|
Returns:
|
||||||
CrawlResult: Processed result containing extracted and formatted content
|
CrawlResult: Processed result containing extracted and formatted content
|
||||||
"""
|
"""
|
||||||
|
cleaned_html = ""
|
||||||
try:
|
try:
|
||||||
_url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
|
_url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
|
||||||
t1 = time.perf_counter()
|
t1 = time.perf_counter()
|
||||||
@@ -592,6 +559,10 @@ class AsyncWebCrawler:
|
|||||||
# add keys from kwargs to params that doesn't exist in params
|
# add keys from kwargs to params that doesn't exist in params
|
||||||
params.update({k: v for k, v in kwargs.items() if k not in params.keys()})
|
params.update({k: v for k, v in kwargs.items() if k not in params.keys()})
|
||||||
|
|
||||||
|
|
||||||
|
################################
|
||||||
|
# Scraping Strategy Execution #
|
||||||
|
################################
|
||||||
result = scraping_strategy.scrap(url, html, **params)
|
result = scraping_strategy.scrap(url, html, **params)
|
||||||
|
|
||||||
if result is None:
|
if result is None:
|
||||||
@@ -618,7 +589,9 @@ class AsyncWebCrawler:
|
|||||||
links = result.links.model_dump()
|
links = result.links.model_dump()
|
||||||
metadata = result.metadata
|
metadata = result.metadata
|
||||||
|
|
||||||
# Markdown Generation
|
################################
|
||||||
|
# Generate Markdown #
|
||||||
|
################################
|
||||||
markdown_generator: Optional[MarkdownGenerationStrategy] = (
|
markdown_generator: Optional[MarkdownGenerationStrategy] = (
|
||||||
config.markdown_generator or DefaultMarkdownGenerator()
|
config.markdown_generator or DefaultMarkdownGenerator()
|
||||||
)
|
)
|
||||||
@@ -644,14 +617,15 @@ class AsyncWebCrawler:
|
|||||||
params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000)},
|
params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000)},
|
||||||
)
|
)
|
||||||
|
|
||||||
# Handle content extraction if needed
|
################################
|
||||||
|
# Structured Content Extraction #
|
||||||
|
################################
|
||||||
if (
|
if (
|
||||||
not bool(extracted_content)
|
not bool(extracted_content)
|
||||||
and config.extraction_strategy
|
and config.extraction_strategy
|
||||||
and not isinstance(config.extraction_strategy, NoExtractionStrategy)
|
and not isinstance(config.extraction_strategy, NoExtractionStrategy)
|
||||||
):
|
):
|
||||||
t1 = time.perf_counter()
|
t1 = time.perf_counter()
|
||||||
|
|
||||||
# Choose content based on input_format
|
# Choose content based on input_format
|
||||||
content_format = config.extraction_strategy.input_format
|
content_format = config.extraction_strategy.input_format
|
||||||
if content_format == "fit_markdown" and not markdown_result.fit_markdown:
|
if content_format == "fit_markdown" and not markdown_result.fit_markdown:
|
||||||
@@ -665,6 +639,7 @@ class AsyncWebCrawler:
|
|||||||
content = {
|
content = {
|
||||||
"markdown": markdown,
|
"markdown": markdown,
|
||||||
"html": html,
|
"html": html,
|
||||||
|
"cleaned_html": cleaned_html,
|
||||||
"fit_markdown": markdown_result.raw_markdown,
|
"fit_markdown": markdown_result.raw_markdown,
|
||||||
}.get(content_format, markdown)
|
}.get(content_format, markdown)
|
||||||
|
|
||||||
|
|||||||
@@ -529,6 +529,9 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
if normalized_href not in external_links_dict:
|
if normalized_href not in external_links_dict:
|
||||||
external_links_dict[normalized_href] = link_data
|
external_links_dict[normalized_href] = link_data
|
||||||
else:
|
else:
|
||||||
|
if kwargs.get("exclude_internal_links", False):
|
||||||
|
element.decompose()
|
||||||
|
return False
|
||||||
if normalized_href not in internal_links_dict:
|
if normalized_href not in internal_links_dict:
|
||||||
internal_links_dict[normalized_href] = link_data
|
internal_links_dict[normalized_href] = link_data
|
||||||
|
|
||||||
@@ -629,7 +632,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
self.remove_unwanted_attributes(
|
self.remove_unwanted_attributes(
|
||||||
element, IMPORTANT_ATTRS, kwargs.get("keep_data_attributes", False)
|
element, IMPORTANT_ATTRS + kwargs.get("keep_attrs", []) , kwargs.get("keep_data_attributes", False)
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# print('Error removing unwanted attributes:', str(e))
|
# print('Error removing unwanted attributes:', str(e))
|
||||||
|
|||||||
@@ -1098,17 +1098,19 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
|
|||||||
user_message = {
|
user_message = {
|
||||||
"role": "user",
|
"role": "user",
|
||||||
"content": f"""
|
"content": f"""
|
||||||
Instructions:
|
|
||||||
{prompt_template}
|
|
||||||
|
|
||||||
HTML to analyze:
|
HTML to analyze:
|
||||||
```html
|
```html
|
||||||
{html}
|
{html}
|
||||||
```
|
```
|
||||||
|
|
||||||
{"Extract the following data: " + query if query else "Please analyze the HTML structure and create the most appropriate schema for data extraction."}
|
Instructions to extract schema for the above given HTML:
|
||||||
|
{prompt_template}
|
||||||
|
|
||||||
"""
|
"""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if query:
|
||||||
|
user_message["content"] += f"\n\nImportant Notes to Consider:\n{query}"
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Call LLM with backoff handling
|
# Call LLM with backoff handling
|
||||||
|
|||||||
@@ -143,6 +143,7 @@ class AsyncCrawlResponse(BaseModel):
|
|||||||
###############################
|
###############################
|
||||||
class MediaItem(BaseModel):
|
class MediaItem(BaseModel):
|
||||||
src: Optional[str] = ""
|
src: Optional[str] = ""
|
||||||
|
data: Optional[str] = ""
|
||||||
alt: Optional[str] = ""
|
alt: Optional[str] = ""
|
||||||
desc: Optional[str] = ""
|
desc: Optional[str] = ""
|
||||||
score: Optional[int] = 0
|
score: Optional[int] = 0
|
||||||
|
|||||||
164
crawl4ai/processors/pdf/__init__.py
Normal file
164
crawl4ai/processors/pdf/__init__.py
Normal file
@@ -0,0 +1,164 @@
|
|||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, List, Optional
|
||||||
|
from dataclasses import asdict
|
||||||
|
|
||||||
|
from crawl4ai.async_logger import AsyncLogger
|
||||||
|
from crawl4ai.async_crawler_strategy import AsyncCrawlerStrategy
|
||||||
|
from crawl4ai.models import AsyncCrawlResponse, ScrapingResult
|
||||||
|
from crawl4ai.content_scraping_strategy import ContentScrapingStrategy
|
||||||
|
from .processor import NaivePDFProcessorStrategy # Assuming your current PDF code is in pdf_processor.py
|
||||||
|
|
||||||
|
class PDFCrawlerStrategy(AsyncCrawlerStrategy):
|
||||||
|
def __init__(self, logger: AsyncLogger = None):
|
||||||
|
self.logger = logger
|
||||||
|
|
||||||
|
async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
|
||||||
|
# Just pass through with empty HTML - scraper will handle actual processing
|
||||||
|
return AsyncCrawlResponse(
|
||||||
|
html="", # Scraper will handle the real work
|
||||||
|
response_headers={"Content-Type": "application/pdf"},
|
||||||
|
status_code=200
|
||||||
|
)
|
||||||
|
|
||||||
|
async def close(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
async def __aenter__(self):
|
||||||
|
return self
|
||||||
|
|
||||||
|
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
||||||
|
await self.close()
|
||||||
|
|
||||||
|
class PDFContentScrapingStrategy(ContentScrapingStrategy):
|
||||||
|
"""
|
||||||
|
A content scraping strategy for PDF files.
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
save_images_locally (bool): Whether to save images locally.
|
||||||
|
extract_images (bool): Whether to extract images from PDF.
|
||||||
|
image_save_dir (str): Directory to save extracted images.
|
||||||
|
logger (AsyncLogger): Logger instance for recording events and errors.
|
||||||
|
|
||||||
|
Methods:
|
||||||
|
scrap(url: str, html: str, **params) -> ScrapingResult:
|
||||||
|
Scrap content from a PDF file.
|
||||||
|
ascrap(url: str, html: str, **kwargs) -> ScrapingResult:
|
||||||
|
Asynchronous version of scrap.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
strategy = PDFContentScrapingStrategy(
|
||||||
|
save_images_locally=False,
|
||||||
|
extract_images=False,
|
||||||
|
image_save_dir=None,
|
||||||
|
logger=logger
|
||||||
|
)
|
||||||
|
|
||||||
|
"""
|
||||||
|
def __init__(self,
|
||||||
|
save_images_locally=False,
|
||||||
|
extract_images=False,
|
||||||
|
image_save_dir=None,
|
||||||
|
logger: AsyncLogger = None):
|
||||||
|
self.logger = logger
|
||||||
|
self.pdf_processor = NaivePDFProcessorStrategy(
|
||||||
|
save_images_locally=False,
|
||||||
|
extract_images=False,
|
||||||
|
image_save_dir=None
|
||||||
|
)
|
||||||
|
|
||||||
|
def scrap(self, url: str, html: str, **params) -> ScrapingResult:
|
||||||
|
"""
|
||||||
|
Scrap content from a PDF file.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
url (str): The URL of the PDF file.
|
||||||
|
html (str): The HTML content of the page.
|
||||||
|
**params: Additional parameters.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
ScrapingResult: The scraped content.
|
||||||
|
"""
|
||||||
|
# Download if URL or use local path
|
||||||
|
pdf_path = self._get_pdf_path(url)
|
||||||
|
try:
|
||||||
|
# Process PDF
|
||||||
|
result = self.pdf_processor.process(Path(pdf_path))
|
||||||
|
|
||||||
|
# Combine page HTML
|
||||||
|
cleaned_html = f"""
|
||||||
|
<html>
|
||||||
|
<head><meta name="pdf-pages" content="{len(result.pages)}"></head>
|
||||||
|
<body>
|
||||||
|
{''.join(f'<div class="pdf-page" data-page="{i+1}">{page.html}</div>'
|
||||||
|
for i, page in enumerate(result.pages))}
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Accumulate media and links with page numbers
|
||||||
|
media = {"images": []}
|
||||||
|
links = {"urls": []}
|
||||||
|
|
||||||
|
for page in result.pages:
|
||||||
|
# Add page number to each image
|
||||||
|
for img in page.images:
|
||||||
|
img["page"] = page.page_number
|
||||||
|
media["images"].append(img)
|
||||||
|
|
||||||
|
# Add page number to each link
|
||||||
|
for link in page.links:
|
||||||
|
links["urls"].append({
|
||||||
|
"url": link,
|
||||||
|
"page": page.page_number
|
||||||
|
})
|
||||||
|
|
||||||
|
return ScrapingResult(
|
||||||
|
cleaned_html=cleaned_html,
|
||||||
|
success=True,
|
||||||
|
media=media,
|
||||||
|
links=links,
|
||||||
|
metadata=asdict(result.metadata)
|
||||||
|
)
|
||||||
|
finally:
|
||||||
|
# Cleanup temp file if downloaded
|
||||||
|
if url.startswith(("http://", "https://")):
|
||||||
|
Path(pdf_path).unlink(missing_ok=True)
|
||||||
|
|
||||||
|
async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
|
||||||
|
# For simple cases, you can use the sync version
|
||||||
|
return await asyncio.to_thread(self.scrap, url, html, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_pdf_path(self, url: str) -> str:
|
||||||
|
if url.startswith(("http://", "https://")):
|
||||||
|
import tempfile
|
||||||
|
import requests
|
||||||
|
|
||||||
|
# Create temp file with .pdf extension
|
||||||
|
temp_file = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Download PDF with streaming
|
||||||
|
response = requests.get(url, stream=True)
|
||||||
|
response.raise_for_status()
|
||||||
|
|
||||||
|
# Write to temp file
|
||||||
|
with open(temp_file.name, 'wb') as f:
|
||||||
|
for chunk in response.iter_content(chunk_size=8192):
|
||||||
|
f.write(chunk)
|
||||||
|
|
||||||
|
return temp_file.name
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
# Clean up temp file if download fails
|
||||||
|
Path(temp_file.name).unlink(missing_ok=True)
|
||||||
|
raise RuntimeError(f"Failed to download PDF from {url}: {str(e)}")
|
||||||
|
|
||||||
|
elif url.startswith("file://"):
|
||||||
|
return url[7:] # Strip file:// prefix
|
||||||
|
|
||||||
|
return url # Assume local path
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["PDFCrawlerStrategy", "PDFContentScrapingStrategy"]
|
||||||
372
crawl4ai/processors/pdf/processor.py
Normal file
372
crawl4ai/processors/pdf/processor.py
Normal file
@@ -0,0 +1,372 @@
|
|||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from time import time
|
||||||
|
from dataclasses import dataclass, asdict, field
|
||||||
|
from typing import Dict, List, Optional, Tuple
|
||||||
|
import PyPDF2
|
||||||
|
from PIL import Image
|
||||||
|
from PyPDF2 import PdfReader
|
||||||
|
from .utils import *
|
||||||
|
import base64
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import List, Optional, Dict, Any
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class PDFMetadata:
|
||||||
|
title: Optional[str] = None
|
||||||
|
author: Optional[str] = None
|
||||||
|
producer: Optional[str] = None
|
||||||
|
created: Optional[datetime] = None
|
||||||
|
modified: Optional[datetime] = None
|
||||||
|
pages: int = 0
|
||||||
|
encrypted: bool = False
|
||||||
|
file_size: Optional[int] = None
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class PDFPage:
|
||||||
|
page_number: int
|
||||||
|
raw_text: str = ""
|
||||||
|
markdown: str = "" # Added per your request
|
||||||
|
html: str = "" # Added per your request
|
||||||
|
images: List[Dict] = field(default_factory=list)
|
||||||
|
links: List[str] = field(default_factory=list)
|
||||||
|
layout: List[Dict] = field(default_factory=list)
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class PDFProcessResult:
|
||||||
|
metadata: PDFMetadata
|
||||||
|
pages: List[PDFPage]
|
||||||
|
processing_time: float = 0.0
|
||||||
|
version: str = "1.0"
|
||||||
|
|
||||||
|
class PDFProcessorStrategy(ABC):
|
||||||
|
@abstractmethod
|
||||||
|
def process(self, pdf_path: Path) -> PDFProcessResult:
|
||||||
|
pass
|
||||||
|
|
||||||
|
class NaivePDFProcessorStrategy(PDFProcessorStrategy):
|
||||||
|
def __init__(self, image_dpi: int = 144, image_quality: int = 85, extract_images: bool = True,
|
||||||
|
save_images_locally: bool = False, image_save_dir: Optional[Path] = None):
|
||||||
|
self.image_dpi = image_dpi
|
||||||
|
self.image_quality = image_quality
|
||||||
|
self.current_page_number = 0
|
||||||
|
self.extract_images = extract_images
|
||||||
|
self.save_images_locally = save_images_locally
|
||||||
|
self.image_save_dir = image_save_dir
|
||||||
|
self._temp_dir = None
|
||||||
|
|
||||||
|
def process(self, pdf_path: Path) -> PDFProcessResult:
|
||||||
|
start_time = time()
|
||||||
|
result = PDFProcessResult(
|
||||||
|
metadata=PDFMetadata(),
|
||||||
|
pages=[],
|
||||||
|
version="1.1"
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
with pdf_path.open('rb') as file:
|
||||||
|
reader = PdfReader(file)
|
||||||
|
result.metadata = self._extract_metadata(pdf_path, reader)
|
||||||
|
|
||||||
|
# Handle image directory
|
||||||
|
image_dir = None
|
||||||
|
if self.extract_images and self.save_images_locally:
|
||||||
|
if self.image_save_dir:
|
||||||
|
image_dir = Path(self.image_save_dir)
|
||||||
|
image_dir.mkdir(exist_ok=True, parents=True)
|
||||||
|
else:
|
||||||
|
self._temp_dir = tempfile.mkdtemp(prefix='pdf_images_')
|
||||||
|
image_dir = Path(self._temp_dir)
|
||||||
|
|
||||||
|
for page_num, page in enumerate(reader.pages):
|
||||||
|
self.current_page_number = page_num + 1
|
||||||
|
pdf_page = self._process_page(page, image_dir, reader)
|
||||||
|
result.pages.append(pdf_page)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to process PDF: {str(e)}")
|
||||||
|
raise
|
||||||
|
finally:
|
||||||
|
# Cleanup temp directory if it was created
|
||||||
|
if self._temp_dir and not self.image_save_dir:
|
||||||
|
import shutil
|
||||||
|
try:
|
||||||
|
shutil.rmtree(self._temp_dir)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to cleanup temp directory: {str(e)}")
|
||||||
|
|
||||||
|
result.processing_time = time() - start_time
|
||||||
|
return result
|
||||||
|
|
||||||
|
def _process_page(self, page, image_dir: Optional[Path], reader) -> PDFPage:
|
||||||
|
pdf_page = PDFPage(
|
||||||
|
page_number=self.current_page_number,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Text and font extraction
|
||||||
|
def visitor_text(text, cm, tm, font_dict, font_size):
|
||||||
|
pdf_page.raw_text += text
|
||||||
|
pdf_page.layout.append({
|
||||||
|
"type": "text",
|
||||||
|
"text": text,
|
||||||
|
"x": tm[4],
|
||||||
|
"y": tm[5],
|
||||||
|
})
|
||||||
|
|
||||||
|
page.extract_text(visitor_text=visitor_text)
|
||||||
|
|
||||||
|
# Image extraction
|
||||||
|
if self.extract_images:
|
||||||
|
pdf_page.images = self._extract_images(page, image_dir)
|
||||||
|
|
||||||
|
# Link extraction
|
||||||
|
pdf_page.links = self._extract_links(page)
|
||||||
|
|
||||||
|
# Add markdown content
|
||||||
|
pdf_page.markdown = clean_pdf_text(self.current_page_number, pdf_page.raw_text)
|
||||||
|
pdf_page.html = clean_pdf_text_to_html(self.current_page_number, pdf_page.raw_text)
|
||||||
|
|
||||||
|
return pdf_page
|
||||||
|
|
||||||
|
def _extract_images(self, page, image_dir: Optional[Path]) -> List[Dict]:
    """Extract embedded images from a single PDF page.

    Walks the page's /Resources -> /XObject dictionary and handles each
    /Image XObject according to its /Filter entry (Flate, DCT/JPEG,
    CCITT fax, JPX/JPEG2000, with a raw-bytes fallback).  Depending on
    ``self.save_images_locally`` each result dict carries either a
    "path" (file written under *image_dir*) or a base64-encoded "data"
    payload, alongside format/size metadata.

    Args:
        page: A PyPDF2 page object.
        image_dir: Target directory for image files; only used when
            ``self.save_images_locally`` is true.

    Returns:
        List of dicts describing the extracted images.  Per-image errors
        are logged and skipped so one bad image never aborts the page.
    """
    if not self.extract_images:
        return []

    images = []
    try:
        resources = page.get("/Resources")
        if resources:  # Check if resources exist
            resources = resources.get_object()  # Resolve IndirectObject
            if '/XObject' in resources:
                xobjects = resources['/XObject'].get_object()
                img_count = 0
                for obj_name in xobjects:
                    xobj = xobjects[obj_name]
                    # XObject entries may themselves be indirect references.
                    if hasattr(xobj, 'get_object') and callable(xobj.get_object):
                        xobj = xobj.get_object()
                    if xobj.get('/Subtype') == '/Image':
                        try:
                            img_count += 1
                            img_filename = f"page_{self.current_page_number}_img_{img_count}"
                            data = xobj.get_data()
                            # /Filter may be a single name or an array of names.
                            filters = xobj.get('/Filter', [])
                            if not isinstance(filters, list):
                                filters = [filters]

                            # Resolve IndirectObjects in properties
                            width = xobj.get('/Width', 0)
                            height = xobj.get('/Height', 0)
                            color_space = xobj.get('/ColorSpace', '/DeviceRGB')
                            if isinstance(color_space, PyPDF2.generic.IndirectObject):
                                color_space = color_space.get_object()

                            # Handle different image encodings
                            success = False
                            image_format = 'bin'
                            image_data = None

                            if '/FlateDecode' in filters:
                                # zlib-compressed raster data; may additionally use
                                # a PNG-style predictor when /Predictor >= 10.
                                try:
                                    decode_parms = xobj.get('/DecodeParms', {})
                                    if isinstance(decode_parms, PyPDF2.generic.IndirectObject):
                                        decode_parms = decode_parms.get_object()

                                    predictor = decode_parms.get('/Predictor', 1)
                                    bits = xobj.get('/BitsPerComponent', 8)
                                    # NOTE(review): assumes RGB or single-channel gray only;
                                    # CMYK/indexed color spaces are not handled — confirm.
                                    colors = 3 if color_space == '/DeviceRGB' else 1

                                    if predictor >= 10:
                                        data = apply_png_predictor(data, width, bits, colors)

                                    # Create PIL Image
                                    mode = 'RGB' if color_space == '/DeviceRGB' else 'L'
                                    img = Image.frombytes(mode, (width, height), data)

                                    if self.save_images_locally:
                                        final_path = (image_dir / img_filename).with_suffix('.png')
                                        img.save(final_path)
                                        image_data = str(final_path)
                                    else:
                                        import io
                                        img_byte_arr = io.BytesIO()
                                        img.save(img_byte_arr, format='PNG')
                                        image_data = base64.b64encode(img_byte_arr.getvalue()).decode('utf-8')

                                    success = True
                                    image_format = 'png'
                                except Exception as e:
                                    logger.error(f"FlateDecode error: {str(e)}")

                            elif '/DCTDecode' in filters:
                                # JPEG image: the stream already is a complete
                                # JPEG file, so it can be written/encoded as-is.
                                try:
                                    if self.save_images_locally:
                                        final_path = (image_dir / img_filename).with_suffix('.jpg')
                                        with open(final_path, 'wb') as f:
                                            f.write(data)
                                        image_data = str(final_path)
                                    else:
                                        image_data = base64.b64encode(data).decode('utf-8')
                                    success = True
                                    image_format = 'jpeg'
                                except Exception as e:
                                    logger.error(f"JPEG save error: {str(e)}")

                            elif '/CCITTFaxDecode' in filters:
                                # Fax-compressed bitonal image; viewers need a TIFF
                                # wrapper around the bare CCITT stream.
                                try:
                                    if data[:4] != b'II*\x00':
                                        # Add TIFF header if missing (little-endian
                                        # TIFF with width/height tags patched in;
                                        # NOTE(review): compression tag is fixed —
                                        # confirm against the stream's /K parameter).
                                        tiff_header = b'II*\x00\x08\x00\x00\x00\x0e\x00\x00\x01\x03\x00\x01\x00\x00\x00' + \
                                            width.to_bytes(4, 'little') + \
                                            b'\x01\x03\x00\x01\x00\x00\x00' + \
                                            height.to_bytes(4, 'little') + \
                                            b'\x01\x12\x00\x03\x00\x00\x00\x01\x00\x01\x00\x00\x01\x17\x00\x04\x00\x00\x00\x01\x00\x00\x00J\x01\x1B\x00\x05\x00\x00\x00\x01\x00\x00\x00R\x01\x28\x00\x03\x00\x00\x00\x01\x00\x02\x00\x00'
                                        data = tiff_header + data

                                    if self.save_images_locally:
                                        final_path = (image_dir / img_filename).with_suffix('.tiff')
                                        with open(final_path, 'wb') as f:
                                            f.write(data)
                                        image_data = str(final_path)
                                    else:
                                        image_data = base64.b64encode(data).decode('utf-8')
                                    success = True
                                    image_format = 'tiff'
                                except Exception as e:
                                    logger.error(f"CCITT save error: {str(e)}")

                            elif '/JPXDecode' in filters:
                                # JPEG 2000: stream is a self-contained .jp2 file.
                                try:
                                    if self.save_images_locally:
                                        final_path = (image_dir / img_filename).with_suffix('.jp2')
                                        with open(final_path, 'wb') as f:
                                            f.write(data)
                                        image_data = str(final_path)
                                    else:
                                        image_data = base64.b64encode(data).decode('utf-8')
                                    success = True
                                    image_format = 'jpeg2000'
                                except Exception as e:
                                    logger.error(f"JPEG2000 save error: {str(e)}")

                            if success and image_data:
                                image_info = {
                                    "format": image_format,
                                    "width": width,
                                    "height": height,
                                    "color_space": str(color_space),
                                    "bits_per_component": xobj.get('/BitsPerComponent', 1)
                                }

                                # Local saves record the file path; otherwise the
                                # payload is inlined as base64.
                                if self.save_images_locally:
                                    image_info["path"] = image_data
                                else:
                                    image_info["data"] = image_data

                                images.append(image_info)
                            else:
                                # Fallback: Save raw data for unrecognized filters
                                if self.save_images_locally:
                                    final_path = (image_dir / img_filename).with_suffix('.bin')
                                    with open(final_path, 'wb') as f:
                                        f.write(data)
                                    logger.warning(f"Saved raw image data to {final_path}")
                                else:
                                    image_data = base64.b64encode(data).decode('utf-8')
                                    images.append({
                                        "format": "bin",
                                        "width": width,
                                        "height": height,
                                        "color_space": str(color_space),
                                        "bits_per_component": xobj.get('/BitsPerComponent', 1),
                                        "data": image_data
                                    })

                        except Exception as e:
                            # Per-image failure: log and continue with the rest.
                            logger.error(f"Error processing image: {str(e)}")
    except Exception as e:
        # Page-level failure (e.g. malformed resources): log and return what we have.
        logger.error(f"Image extraction error: {str(e)}")

    return images
|
||||||
|
|
||||||
|
def _extract_links(self, page) -> List[str]:
|
||||||
|
links = []
|
||||||
|
if '/Annots' in page:
|
||||||
|
try:
|
||||||
|
for annot in page['/Annots']:
|
||||||
|
a = annot.get_object()
|
||||||
|
if '/A' in a and '/URI' in a['/A']:
|
||||||
|
links.append(a['/A']['/URI'])
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Link error: {str(e)}")
|
||||||
|
return links
|
||||||
|
|
||||||
|
def _extract_metadata(self, pdf_path: Path, reader: PdfReader = None) -> PDFMetadata:
    """Collect document-level metadata for *pdf_path*.

    Args:
        pdf_path: Path to the PDF file on disk (used for file size and,
            when *reader* is not supplied, to open the document).
        reader: Optional already-open PdfReader to reuse.

    Returns:
        A PDFMetadata instance with title/author/producer, parsed
        creation/modification dates, page count, encryption flag and
        file size.
    """
    if not reader:
        reader = PdfReader(pdf_path)

    # reader.metadata can be None for documents without an info dict.
    info = reader.metadata or {}

    return PDFMetadata(
        title=info.get('/Title'),
        author=info.get('/Author'),
        producer=info.get('/Producer'),
        created=self._parse_pdf_date(info.get('/CreationDate', '')),
        modified=self._parse_pdf_date(info.get('/ModDate', '')),
        pages=len(reader.pages),
        encrypted=reader.is_encrypted,
        file_size=pdf_path.stat().st_size
    )
|
||||||
|
|
||||||
|
def _parse_pdf_date(self, date_str: str) -> Optional[datetime]:
|
||||||
|
try:
|
||||||
|
match = re.match(r'D:(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})', date_str)
|
||||||
|
if not match:
|
||||||
|
return None
|
||||||
|
|
||||||
|
return datetime(
|
||||||
|
year=int(match[1]),
|
||||||
|
month=int(match[2]),
|
||||||
|
day=int(match[3]),
|
||||||
|
hour=int(match[4]),
|
||||||
|
minute=int(match[5]),
|
||||||
|
second=int(match[6])
|
||||||
|
)
|
||||||
|
except:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Usage example
if __name__ == "__main__":
    import json
    from pathlib import Path

    current_dir = Path(__file__).resolve().parent
    pdf_path = f'{current_dir}/test.pdf'

    strategy = NaivePDFProcessorStrategy()
    result = strategy.process(Path(pdf_path))

    # Convert to JSON (asdict recursively turns the result dataclasses into dicts)
    json_output = asdict(result)
    print(json.dumps(json_output, indent=2, default=str))

    # result.pages holds PDFPage dataclass instances, so use attribute access —
    # the original subscripted them like dicts (page["page_number"]), which
    # raises TypeError at runtime.
    with open(f'{current_dir}/test.html', 'w') as f:
        for page in result.pages:
            f.write(f'<h1>Page {page.page_number}</h1>')
            f.write(page.html)
    with open(f'{current_dir}/test.md', 'w') as f:
        for page in result.pages:
            f.write(f'# Page {page.page_number}\n\n')
            f.write(clean_pdf_text(page.page_number, page.raw_text))
            f.write('\n\n')
|
||||||
350
crawl4ai/processors/pdf/utils.py
Normal file
350
crawl4ai/processors/pdf/utils.py
Normal file
@@ -0,0 +1,350 @@
|
|||||||
|
import re
|
||||||
|
|
||||||
|
def apply_png_predictor(data, width, bits, color_channels):
    """Decode PNG predictor filters (PDF /Predictor >= 10).

    Each scanline starts with one filter-type byte (0=None, 1=Sub, 2=Up,
    3=Average, 4=Paeth) followed by the filtered bytes for that row.

    Args:
        data: Filtered bytes, a whole number of scanlines.
        width: Image width in pixels.
        bits: Bits per component (/BitsPerComponent).
        color_channels: Number of color components per pixel.

    Returns:
        The defiltered raster bytes (scanlines concatenated, no filter bytes).

    Raises:
        ValueError: If data is not a whole number of scanlines, or a
            scanline uses an unknown filter type.
    """
    # Filter unit ("bpp" in the PNG spec): bytes per pixel rounded up,
    # minimum 1 — used as the left-neighbor offset for Sub/Average/Paeth.
    bytes_per_pixel = (bits * color_channels) // 8
    if (bits * color_channels) % 8 != 0:
        bytes_per_pixel += 1

    # Bytes per scanline: total bits per row rounded up to whole bytes.
    # (The previous `width * bytes_per_pixel` over-counted for sub-byte
    # components, e.g. 1-bit images: 10px × 1bit is 2 bytes, not 10.)
    stride = (width * bits * color_channels + 7) // 8
    scanline_length = stride + 1  # +1 for filter byte

    if len(data) % scanline_length != 0:
        raise ValueError("Invalid scanline structure")

    num_lines = len(data) // scanline_length
    output = bytearray()
    # The row above the first scanline is treated as all zeros.
    prev_line = b'\x00' * stride

    for i in range(num_lines):
        line = data[i*scanline_length:(i+1)*scanline_length]
        filter_type = line[0]
        filtered = line[1:]

        if filter_type == 0:  # None
            decoded = filtered
        elif filter_type == 1:  # Sub: add the byte bpp positions to the left
            decoded = bytearray(filtered)
            for j in range(bytes_per_pixel, len(decoded)):
                decoded[j] = (decoded[j] + decoded[j - bytes_per_pixel]) % 256
        elif filter_type == 2:  # Up: add the byte directly above
            decoded = bytearray([(filtered[j] + prev_line[j]) % 256
                                 for j in range(len(filtered))])
        elif filter_type == 3:  # Average of left and above (floor division)
            decoded = bytearray(filtered)
            for j in range(len(decoded)):
                left = decoded[j - bytes_per_pixel] if j >= bytes_per_pixel else 0
                up = prev_line[j]
                avg = (left + up) // 2
                decoded[j] = (decoded[j] + avg) % 256
        elif filter_type == 4:  # Paeth: best of left/above/upper-left
            decoded = bytearray(filtered)
            for j in range(len(decoded)):
                left = decoded[j - bytes_per_pixel] if j >= bytes_per_pixel else 0
                up = prev_line[j]
                up_left = prev_line[j - bytes_per_pixel] if j >= bytes_per_pixel else 0
                paeth = paeth_predictor(left, up, up_left)
                decoded[j] = (decoded[j] + paeth) % 256
        else:
            raise ValueError(f"Unsupported filter type: {filter_type}")

        output.extend(decoded)
        prev_line = decoded

    return bytes(output)
|
||||||
|
|
||||||
|
def paeth_predictor(a, b, c):
    """Select the PNG Paeth predictor value.

    Given the left (*a*), above (*b*) and upper-left (*c*) neighbor bytes,
    return whichever neighbor is closest to the linear estimate a + b - c,
    with ties broken in the order left, above, upper-left.
    """
    estimate = a + b - c
    dist_left = abs(estimate - a)
    dist_up = abs(estimate - b)
    dist_corner = abs(estimate - c)

    if dist_left <= dist_up and dist_left <= dist_corner:
        return a
    if dist_up <= dist_corner:
        return b
    return c
|
||||||
|
|
||||||
|
import re
|
||||||
|
import html
|
||||||
|
|
||||||
|
def clean_pdf_text_to_html(page_number, text):
    """Convert raw PDF-extracted text into cleaned-up HTML.

    Applies line-based heuristics tuned for academic papers: an article
    title on the first line, numbered section headers ("2.1 Background"),
    author lists (page 1 only), affiliations, e-mail blocks, common
    section keywords, block quotes, and re-joined hyphenated line breaks.

    Args:
        page_number: 1-based page number; author detection only runs on page 1.
        text: Raw text as returned by the PDF text extractor.

    Returns:
        An HTML fragment string (one element per detected unit, joined by
        newlines).
    """
    # Decode Unicode escapes and handle surrogate pairs
    try:
        decoded = text.encode('latin-1').decode('unicode-escape')
        decoded = decoded.encode('utf-16', 'surrogatepass').decode('utf-16')
    except Exception as e:
        decoded = text  # Fallback if decoding fails

    article_title_detected = False
    # decoded = re.sub(r'\.\n', '.\n\n', decoded)
    # decoded = re.sub(r'\.\n', '<|break|>', decoded)
    lines = decoded.split('\n')
    output = []
    current_paragraph = []
    in_header = False
    email_pattern = re.compile(r'\{.*?\}')
    affiliation_pattern = re.compile(r'^†')
    quote_pattern = re.compile(r'^["“]')
    # Heuristic for "Name Surname, Name Surname and Name Surname" author rows,
    # optionally suffixed with affiliation markers (†, *, digits).
    author_pattern = re.compile(
        r'^\s*[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?'
        r'(?:,\s*[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?)*'
        r'(?:,\s*(?:and|&)\s+[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?)?\s*$'
    )

    def flush_paragraph():
        # Emit the accumulated lines as one whitespace-normalized <div>,
        # splitting on sentence-final double newlines into <p> children.
        if current_paragraph:
            para = ' '.join(current_paragraph)
            para = re.sub(r'\s+', ' ', para).strip()
            if para:
                # escaped_para = html.escape(para)
                # NOTE(review): paragraph text is NOT HTML-escaped here
                # (escaping was disabled); only headers/authors/etc. are.
                escaped_para = para
                # escaped_para = re.sub(r'\.\n', '.\n\n', escaped_para)
                # Split escaped_para by '.\n\n' to avoid HTML escaping
                escaped_para = escaped_para.split('.\n\n')
                # Wrap each part in <p> tag
                escaped_para = [f'<p>{part}</p>' for part in escaped_para]
                output.append(f'<div class="paragraph">{"".join(escaped_para)}</div><hr/>')
            current_paragraph.clear()

    for i, line in enumerate(lines):
        line = line.strip()

        # Handle empty lines: paragraph boundary
        if not line:
            flush_paragraph()
            continue

        # Detect article title (first line with reasonable length)
        if not article_title_detected and i == 0 and 3 <= len(line.split()) <= 8 and len(lines) > 1:
            flush_paragraph()
            escaped_line = html.escape(line)
            output.append(f'<h2>{escaped_line}</h2>')
            article_title_detected = True
            continue

        # Detect numbered headers like "2.1 Background" — only when the
        # previous line is blank (i > 0 guards against wrapping to lines[-1]).
        numbered_header = re.match(r'^(\d+(?:\.\d+)*)\s+(.+)$', line)
        if i > 0 and not lines[i-1].strip() and numbered_header:
            flush_paragraph()
            # "2.1" has one dot → level 2; shift by one so top level is <h2>.
            level = numbered_header.group(1).count('.') + 1
            header_text = numbered_header.group(2)
            md_level = min(level + 1, 6)
            escaped_header = html.escape(header_text)
            output.append(f'<h{md_level}>{escaped_header}</h{md_level}>')
            in_header = True
            continue

        # Detect authors (first page only)
        if page_number == 1 and author_pattern.match(line):
            # Strip affiliation daggers; 'â€' looks like mojibake cleanup —
            # NOTE(review): confirm intended byte sequence.
            authors = re.sub(r'[†â€]', '', line)
            authors = re.split(r', | and ', authors)
            formatted_authors = []
            for author in authors:
                if author.strip():
                    parts = [p for p in author.strip().split() if p]
                    formatted = ' '.join(parts)
                    escaped_author = html.escape(formatted)
                    formatted_authors.append(f'<strong>{escaped_author}</strong>')

            # Join with commas and a final "and"
            if len(formatted_authors) > 1:
                joined = ', '.join(formatted_authors[:-1]) + ' and ' + formatted_authors[-1]
            else:
                joined = formatted_authors[0]

            output.append(f'<p>{joined}</p>')
            continue

        # Detect affiliation (lines starting with †)
        if affiliation_pattern.match(line):
            escaped_line = html.escape(line)
            output.append(f'<p><em>{escaped_line}</em></p>')
            continue

        # Detect emails (brace-grouped address blocks like "{a,b}@x.edu")
        if email_pattern.match(line):
            escaped_line = html.escape(line)
            output.append(f'<p><code>{escaped_line}</code></p>')
            continue

        # Detect section headers by keyword / "N Title" shape
        if re.match(r'^(Abstract|\d+\s+[A-Z]|References|Appendix|Figure|Table)', line):
            flush_paragraph()
            escaped_line = html.escape(line)
            output.append(f'<h2 class="section-header"><em>{escaped_line}</em></h2>')
            in_header = True
            continue

        # Handle quotes (lines opening with a double quote)
        if quote_pattern.match(line):
            flush_paragraph()
            escaped_line = html.escape(line)
            output.append(f'<blockquote><p>{escaped_line}</p></blockquote>')
            continue

        # Handle hyphenated words: drop the trailing hyphen so the next
        # line's continuation joins without a break.
        if line.endswith('-'):
            current_paragraph.append(line[:-1].strip())
        else:
            current_paragraph.append(line)

        # Handle paragraph breaks after headers
        if in_header and not line.endswith(('.', '!', '?')):
            flush_paragraph()
            in_header = False

    flush_paragraph()

    # Post-process HTML
    html_output = '\n'.join(output)

    # Fix common citation patterns: "(Smith et al. 2020)" → <cite>
    html_output = re.sub(r'\(([A-Z][a-z]+ et al\. \d{4})\)', r'<cite>\1</cite>', html_output)

    # Fix escaped characters left over from the unicode-escape pass
    html_output = html_output.replace('\\ud835', '').replace('\\u2020', '†')

    # Remove leftover hyphens and fix spacing before punctuation
    html_output = re.sub(r'\s+-\s+', '', html_output)
    html_output = re.sub(r'\s+([.,!?)])', r'\1', html_output)

    return html_output
|
||||||
|
|
||||||
|
def clean_pdf_text(page_number, text):
    """Convert raw PDF-extracted text into cleaned-up Markdown.

    Applies line-based heuristics tuned for academic papers: an article
    title on the first line, numbered section headers ("2.1 Background"),
    author lists (page 1 only), affiliations, e-mail blocks, common
    section keywords, quotes, and re-joined hyphenated line breaks.

    Args:
        page_number: 1-based page number; author detection only runs on page 1.
        text: Raw text as returned by the PDF text extractor.

    Returns:
        A Markdown string with units separated by blank lines.
    """
    # Decode Unicode escapes and handle surrogate pairs
    try:
        decoded = text.encode('latin-1').decode('unicode-escape')
        decoded = decoded.encode('utf-16', 'surrogatepass').decode('utf-16')
    except Exception:
        decoded = text  # Fallback if decoding fails

    article_title_detected = False
    # Treat a sentence-final newline as a paragraph break.
    decoded = re.sub(r'\.\n', '.\n\n', decoded)
    lines = decoded.split('\n')
    output = []
    current_paragraph = []
    in_header = False
    email_pattern = re.compile(r'\{.*?\}')
    affiliation_pattern = re.compile(r'^†')
    quote_pattern = re.compile(r'^["“]')
    # Heuristic for "Name Surname, Name Surname and Name Surname" author rows,
    # optionally suffixed with affiliation markers (†, *, digits).
    author_pattern = re.compile(
        r'^\s*[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?'
        r'(?:,\s*[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?)*'
        r'(?:,\s*(?:and|&)\s+[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?)?\s*$'
    )

    def flush_paragraph():
        # Emit the accumulated lines as one whitespace-normalized paragraph.
        if current_paragraph:
            para = ' '.join(current_paragraph)
            para = re.sub(r'\s+', ' ', para).strip()
            if para:
                output.append(para)
            current_paragraph.clear()

    for i, line in enumerate(lines):
        line = line.strip()

        # Empty line: paragraph boundary
        if not line:
            flush_paragraph()
            continue

        # Detect headline (first line, reasonable length)
        if not article_title_detected and i == 0 and 3 <= len(line.split()) <= 8 and (len(lines) > 1):
            flush_paragraph()
            output.append(f'## {line}')
            article_title_detected = True
            continue

        # (Removed an unreachable `if not line and current_paragraph:` branch:
        # empty lines are already consumed by the first check above.)

        # Detect numbered headers like "2.1 Background" — only when the
        # previous line is blank.  The i > 0 guard fixes a wraparound bug:
        # at i == 0 the old `lines[i-1]` indexed the *last* line.
        numbered_header = re.match(r'^(\d+(?:\.\d+)*)\s+(.+)$', line)
        if i > 0 and not lines[i - 1].strip() and numbered_header:
            flush_paragraph()
            level = numbered_header.group(1).count('.') + 1  # Convert 2.1 → level 2
            header_text = numbered_header.group(2)
            # Shift by one so the top level renders as ## (capped at ######)
            md_level = min(level + 1, 6)  # 1 → ##, 2 → ### etc
            output.append(f'{"#" * md_level} {header_text}')
            in_header = True
            continue

        # Detect authors (first page only)
        if page_number == 1 and author_pattern.match(line):
            # Clean and format author names
            authors = re.sub(r'[†â€]', '', line)  # Remove affiliation markers
            authors = re.split(r', | and ', authors)
            formatted_authors = []
            for author in authors:
                if author.strip():
                    # Handle "First Last" formatting
                    parts = [p for p in author.strip().split() if p]
                    formatted = ' '.join(parts)
                    formatted_authors.append(f'**{formatted}**')

            # Join with commas and "and"
            if len(formatted_authors) > 1:
                joined = ', '.join(formatted_authors[:-1]) + ' and ' + formatted_authors[-1]
            else:
                joined = formatted_authors[0]

            output.append(joined)
            continue

        # Detect affiliation (lines starting with †)
        if affiliation_pattern.match(line):
            output.append(f'*{line}*')
            continue

        # Detect emails (brace-grouped address blocks like "{a,b}@x.edu")
        if email_pattern.match(line):
            output.append(f'`{line}`')
            continue

        # Detect section headers by keyword / "N Title" shape
        if re.match(r'^(Abstract|\d+\s+[A-Z]|References|Appendix|Figure|Table)', line):
            flush_paragraph()
            output.append(f'_[{line}]_')
            in_header = True
            continue

        # Handle hyphenated words: drop the trailing hyphen so the next
        # line's continuation joins without a break.
        if line.endswith('-'):
            current_paragraph.append(line[:-1].strip())
        else:
            current_paragraph.append(line)

        # Handle paragraph breaks after headers
        if in_header and not line.endswith(('.', '!', '?')):
            flush_paragraph()
            in_header = False

    flush_paragraph()

    # Post-processing
    markdown = '\n\n'.join(output)

    # Fix common citation patterns: "(Smith et al. 2020)" → "[Smith et al. 2020]"
    markdown = re.sub(r'\(([A-Z][a-z]+ et al\. \d{4})\)', r'[\1]', markdown)

    # Fix escaped characters left over from the unicode-escape pass
    markdown = markdown.replace('\\ud835', '').replace('\\u2020', '†')

    # Remove leftover hyphens and fix spacing
    markdown = re.sub(r'\s+-\s+', '', markdown)  # Join hyphenated words
    markdown = re.sub(r'\s+([.,!?)])', r'\1', markdown)  # Fix punctuation spacing

    return markdown
|
||||||
Reference in New Issue
Block a user