From f8fd9d9eff2f696edf68e482b393e61b8d2991f7 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Mon, 27 Jan 2025 21:24:15 +0800 Subject: [PATCH] feat(pdf): add PDF processing capabilities Add new PDF processing module with the following features: - PDF text extraction and formatting to HTML/Markdown - Image extraction with multiple format support (JPEG, PNG, TIFF) - Link extraction from PDF documents - Metadata extraction including title, author, dates - Support for both local and remote PDF files Also includes: - New configuration options for HTML attribute handling - Internal/external link filtering improvements - Version bump to 0.4.300b4 --- crawl4ai/__version__.py | 3 +- crawl4ai/async_configs.py | 16 ++ crawl4ai/async_webcrawler.py | 61 ++--- crawl4ai/content_scraping_strategy.py | 5 +- crawl4ai/extraction_strategy.py | 10 +- crawl4ai/models.py | 1 + crawl4ai/processors/pdf/__init__.py | 164 ++++++++++++ crawl4ai/processors/pdf/processor.py | 372 ++++++++++++++++++++++++++ crawl4ai/processors/pdf/utils.py | 350 ++++++++++++++++++++++++ 9 files changed, 933 insertions(+), 49 deletions(-) create mode 100644 crawl4ai/processors/pdf/__init__.py create mode 100644 crawl4ai/processors/pdf/processor.py create mode 100644 crawl4ai/processors/pdf/utils.py diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 3274435a..5955f704 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,3 @@ # crawl4ai/_version.py -__version__ = "0.4.3b3" +# __version__ = "0.4.3b3" +__version__ = "0.4.300b4" diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index 44c83262..35503bd3 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -271,6 +271,8 @@ class CrawlerRunConfig: Default: None. keep_data_attributes (bool): If True, retain `data-*` attributes while removing unwanted attributes. Default: False. + keep_attrs (list of str): List of HTML attributes to keep during processing. + Default: []. remove_forms (bool): If True, remove all `
` elements from the HTML. Default: False. prettiify (bool): If True, apply `fast_format_html` to produce prettified HTML output. @@ -282,6 +284,8 @@ class CrawlerRunConfig: proxy_config (dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}. If None, no additional proxy config. Default: None. + # SSL Parameters + fetch_ssl_certificate: bool = False, # Caching Parameters cache_mode (CacheMode or None): Defines how caching is handled. If None, defaults to CacheMode.ENABLED internally. @@ -363,10 +367,14 @@ class CrawlerRunConfig: Default: SOCIAL_MEDIA_DOMAINS (from config). exclude_external_links (bool): If True, exclude all external links from the results. Default: False. + exclude_internal_links (bool): If True, exclude internal links from the results. + Default: False. exclude_social_media_links (bool): If True, exclude links pointing to social media domains. Default: False. exclude_domains (list of str): List of specific domains to exclude from results. Default: []. + exclude_internal_links (bool): If True, exclude internal links from the results. + Default: False. # Debugging and Logging Parameters verbose (bool): Enable verbose logging. @@ -402,6 +410,7 @@ class CrawlerRunConfig: excluded_tags: list = None, excluded_selector: str = None, keep_data_attributes: bool = False, + keep_attrs: list = None, remove_forms: bool = False, prettiify: bool = False, parser_type: str = "lxml", @@ -451,6 +460,7 @@ class CrawlerRunConfig: exclude_external_links: bool = False, exclude_social_media_links: bool = False, exclude_domains: list = None, + exclude_internal_links: bool = False, # Debugging and Logging Parameters verbose: bool = True, log_console: bool = False, @@ -475,6 +485,7 @@ class CrawlerRunConfig: self.excluded_tags = excluded_tags or [] self.excluded_selector = excluded_selector or "" self.keep_data_attributes = keep_data_attributes + self.keep_attrs = keep_attrs or [] self.remove_forms = remove_forms self.prettiify = prettiify self.parser_type = parser_type @@ -532,6 +543,7 @@ class CrawlerRunConfig: self.exclude_external_links = exclude_external_links self.exclude_social_media_links = exclude_social_media_links self.exclude_domains = exclude_domains or [] + self.exclude_internal_links = exclude_internal_links # Debugging and Logging Parameters self.verbose = verbose @@ -580,6 +592,7 @@ class CrawlerRunConfig: excluded_tags=kwargs.get("excluded_tags", []), excluded_selector=kwargs.get("excluded_selector", ""), keep_data_attributes=kwargs.get("keep_data_attributes", False), + keep_attrs=kwargs.get("keep_attrs", []), remove_forms=kwargs.get("remove_forms", False), prettiify=kwargs.get("prettiify", False), parser_type=kwargs.get("parser_type", "lxml"), @@ -638,6 +651,7 @@ class CrawlerRunConfig: exclude_external_links=kwargs.get("exclude_external_links", False), exclude_social_media_links=kwargs.get("exclude_social_media_links", False), exclude_domains=kwargs.get("exclude_domains", []), + exclude_internal_links=kwargs.get("exclude_internal_links", False), # Debugging and Logging Parameters verbose=kwargs.get("verbose", True), log_console=kwargs.get("log_console", False), @@ -663,6 +677,7 @@ class CrawlerRunConfig: "excluded_tags": self.excluded_tags, "excluded_selector": self.excluded_selector, "keep_data_attributes": self.keep_data_attributes, + "keep_attrs": self.keep_attrs, "remove_forms": self.remove_forms, "prettiify": self.prettiify, "parser_type": self.parser_type, @@ -706,6 +721,7 @@ class CrawlerRunConfig: "exclude_external_links": 
self.exclude_external_links, "exclude_social_media_links": self.exclude_social_media_links, "exclude_domains": self.exclude_domains, + "exclude_internal_links": self.exclude_internal_links, "verbose": self.verbose, "log_console": self.log_console, "stream": self.stream, diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 617b6901..c1b32021 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -319,14 +319,6 @@ class AsyncWebCrawler: try: # Handle configuration if crawler_config is not None: - # if any(param is not None for param in [ - # word_count_threshold, extraction_strategy, chunking_strategy, - # content_filter, cache_mode, css_selector, screenshot, pdf - # ]): - # self.logger.warning( - # message="Both crawler_config and legacy parameters provided. crawler_config will take precedence.", - # tag="WARNING" - # ) config = crawler_config else: # Merge all parameters into a single kwargs dict for config creation @@ -350,14 +342,6 @@ class AsyncWebCrawler: # Handle deprecated cache parameters if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]): - if kwargs.get("warning", True): - warnings.warn( - "Cache control boolean flags are deprecated and will be removed in version 0.5.0. " - "Use 'cache_mode' parameter instead.", - DeprecationWarning, - stacklevel=2, - ) - # Convert legacy parameters if cache_mode not provided if config.cache_mode is None: config.cache_mode = _legacy_to_cache_mode( @@ -430,7 +414,9 @@ class AsyncWebCrawler: response_headers={"X-Robots-Status": "Blocked by robots.txt"} ) - # Pass config to crawl method + ############################## + # Call CrawlerStrategy.crawl # + ############################## async_response = await self.crawler_strategy.crawl( url, config=config, # Pass the entire config object @@ -448,7 +434,9 @@ class AsyncWebCrawler: tag="FETCH", ) - # Process the HTML content + ############################################################### + # Process the HTML content, Call CrawlerStrategy.process_html # + ############################################################### crawl_result : CrawlResult = await self.aprocess_html( url=url, html=html, @@ -469,26 +457,6 @@ class AsyncWebCrawler: async_response.ssl_certificate ) # Add SSL certificate - # # Check and set values from async_response to crawl_result - # try: - # for key in vars(async_response): - # if hasattr(crawl_result, key): - # value = getattr(async_response, key, None) - # current_value = getattr(crawl_result, key, None) - # if value is not None and not current_value: - # try: - # setattr(crawl_result, key, value) - # except Exception as e: - # self.logger.warning( - # message=f"Failed to set attribute {key}: {str(e)}", - # tag="WARNING" - # ) - # except Exception as e: - # self.logger.warning( - # message=f"Error copying response attributes: {str(e)}", - # tag="WARNING" - # ) - crawl_result.success = bool(html) crawl_result.session_id = getattr(config, "session_id", None) @@ -538,8 +506,6 @@ class AsyncWebCrawler: f"Error: {str(e)}\n\n" f"Code context:\n{error_context['code_context']}" ) - # if not hasattr(e, "msg"): - # e.msg = str(e) self.logger.error_status( url=url, @@ -578,6 +544,7 @@ class AsyncWebCrawler: Returns: CrawlResult: Processed result containing extracted and formatted content """ + cleaned_html = "" try: _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML" t1 = time.perf_counter() @@ -592,6 +559,10 @@ class AsyncWebCrawler: # add keys from kwargs to params that doesn't exist in params 
params.update({k: v for k, v in kwargs.items() if k not in params.keys()}) + + ################################ + # Scraping Strategy Execution # + ################################ result = scraping_strategy.scrap(url, html, **params) if result is None: @@ -618,7 +589,9 @@ class AsyncWebCrawler: links = result.links.model_dump() metadata = result.metadata - # Markdown Generation + ################################ + # Generate Markdown # + ################################ markdown_generator: Optional[MarkdownGenerationStrategy] = ( config.markdown_generator or DefaultMarkdownGenerator() ) @@ -644,14 +617,15 @@ class AsyncWebCrawler: params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000)}, ) - # Handle content extraction if needed + ################################ + # Structured Content Extraction # + ################################ if ( not bool(extracted_content) and config.extraction_strategy and not isinstance(config.extraction_strategy, NoExtractionStrategy) ): t1 = time.perf_counter() - # Choose content based on input_format content_format = config.extraction_strategy.input_format if content_format == "fit_markdown" and not markdown_result.fit_markdown: @@ -665,6 +639,7 @@ class AsyncWebCrawler: content = { "markdown": markdown, "html": html, + "cleaned_html": cleaned_html, "fit_markdown": markdown_result.raw_markdown, }.get(content_format, markdown) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 6cb169db..46761013 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -529,6 +529,9 @@ class WebScrapingStrategy(ContentScrapingStrategy): if normalized_href not in external_links_dict: external_links_dict[normalized_href] = link_data else: + if kwargs.get("exclude_internal_links", False): + element.decompose() + return False if normalized_href not in internal_links_dict: internal_links_dict[normalized_href] = link_data @@ -629,7 +632,7 @@ class WebScrapingStrategy(ContentScrapingStrategy): try: self.remove_unwanted_attributes( - element, IMPORTANT_ATTRS, kwargs.get("keep_data_attributes", False) + element, IMPORTANT_ATTRS + kwargs.get("keep_attrs", []) , kwargs.get("keep_data_attributes", False) ) except Exception as e: # print('Error removing unwanted attributes:', str(e)) diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index b2b24751..74ffc555 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -1098,17 +1098,19 @@ class JsonElementExtractionStrategy(ExtractionStrategy): user_message = { "role": "user", "content": f""" - Instructions: - {prompt_template} - HTML to analyze: ```html {html} ``` - {"Extract the following data: " + query if query else "Please analyze the HTML structure and create the most appropriate schema for data extraction."} + Instructions to extract schema for the above given HTML: + {prompt_template} + """ } + + if query: + user_message["content"] += f"\n\nImportant Notes to Consider:\n{query}" try: # Call LLM with backoff handling diff --git a/crawl4ai/models.py b/crawl4ai/models.py index 57edacd7..9c095eed 100644 --- a/crawl4ai/models.py +++ b/crawl4ai/models.py @@ -143,6 +143,7 @@ class AsyncCrawlResponse(BaseModel): ############################### class MediaItem(BaseModel): src: Optional[str] = "" + data: Optional[str] = "" alt: Optional[str] = "" desc: Optional[str] = "" score: Optional[int] = 0 diff --git a/crawl4ai/processors/pdf/__init__.py b/crawl4ai/processors/pdf/__init__.py new file 
mode 100644 index 00000000..9643c6cd --- /dev/null +++ b/crawl4ai/processors/pdf/__init__.py @@ -0,0 +1,164 @@ +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Dict, List, Optional +from dataclasses import asdict + +from crawl4ai.async_logger import AsyncLogger +from crawl4ai.async_crawler_strategy import AsyncCrawlerStrategy +from crawl4ai.models import AsyncCrawlResponse, ScrapingResult +from crawl4ai.content_scraping_strategy import ContentScrapingStrategy +from .processor import NaivePDFProcessorStrategy # Assuming your current PDF code is in pdf_processor.py + +class PDFCrawlerStrategy(AsyncCrawlerStrategy): + def __init__(self, logger: AsyncLogger = None): + self.logger = logger + + async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: + # Just pass through with empty HTML - scraper will handle actual processing + return AsyncCrawlResponse( + html="", # Scraper will handle the real work + response_headers={"Content-Type": "application/pdf"}, + status_code=200 + ) + + async def close(self): + pass + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + await self.close() + +class PDFContentScrapingStrategy(ContentScrapingStrategy): + """ + A content scraping strategy for PDF files. + + Attributes: + save_images_locally (bool): Whether to save images locally. + extract_images (bool): Whether to extract images from PDF. + image_save_dir (str): Directory to save extracted images. + logger (AsyncLogger): Logger instance for recording events and errors. + + Methods: + scrap(url: str, html: str, **params) -> ScrapingResult: + Scrap content from a PDF file. + ascrap(url: str, html: str, **kwargs) -> ScrapingResult: + Asynchronous version of scrap. + + Usage: + strategy = PDFContentScrapingStrategy( + save_images_locally=False, + extract_images=False, + image_save_dir=None, + logger=logger + ) + + """ + def __init__(self, + save_images_locally=False, + extract_images=False, + image_save_dir=None, + logger: AsyncLogger = None): + self.logger = logger + self.pdf_processor = NaivePDFProcessorStrategy( + save_images_locally=False, + extract_images=False, + image_save_dir=None + ) + + def scrap(self, url: str, html: str, **params) -> ScrapingResult: + """ + Scrap content from a PDF file. + + Args: + url (str): The URL of the PDF file. + html (str): The HTML content of the page. + **params: Additional parameters. + + Returns: + ScrapingResult: The scraped content. + """ + # Download if URL or use local path + pdf_path = self._get_pdf_path(url) + try: + # Process PDF + result = self.pdf_processor.process(Path(pdf_path)) + + # Combine page HTML + cleaned_html = f""" + + + + {''.join(f'
<div class="pdf-page" id="page-{i+1}">{page.html}</div>
' + for i, page in enumerate(result.pages))} + + + """ + + # Accumulate media and links with page numbers + media = {"images": []} + links = {"urls": []} + + for page in result.pages: + # Add page number to each image + for img in page.images: + img["page"] = page.page_number + media["images"].append(img) + + # Add page number to each link + for link in page.links: + links["urls"].append({ + "url": link, + "page": page.page_number + }) + + return ScrapingResult( + cleaned_html=cleaned_html, + success=True, + media=media, + links=links, + metadata=asdict(result.metadata) + ) + finally: + # Cleanup temp file if downloaded + if url.startswith(("http://", "https://")): + Path(pdf_path).unlink(missing_ok=True) + + async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult: + # For simple cases, you can use the sync version + return await asyncio.to_thread(self.scrap, url, html, **kwargs) + + + def _get_pdf_path(self, url: str) -> str: + if url.startswith(("http://", "https://")): + import tempfile + import requests + + # Create temp file with .pdf extension + temp_file = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) + + try: + # Download PDF with streaming + response = requests.get(url, stream=True) + response.raise_for_status() + + # Write to temp file + with open(temp_file.name, 'wb') as f: + for chunk in response.iter_content(chunk_size=8192): + f.write(chunk) + + return temp_file.name + + except Exception as e: + # Clean up temp file if download fails + Path(temp_file.name).unlink(missing_ok=True) + raise RuntimeError(f"Failed to download PDF from {url}: {str(e)}") + + elif url.startswith("file://"): + return url[7:] # Strip file:// prefix + + return url # Assume local path + + +__all__ = ["PDFCrawlerStrategy", "PDFContentScrapingStrategy"] \ No newline at end of file diff --git a/crawl4ai/processors/pdf/processor.py b/crawl4ai/processors/pdf/processor.py new file mode 100644 index 00000000..d89ea277 --- /dev/null +++ b/crawl4ai/processors/pdf/processor.py @@ -0,0 +1,372 @@ +import logging +import re +from abc import ABC, abstractmethod +from datetime import datetime +from pathlib import Path +from time import time +from dataclasses import dataclass, asdict, field +from typing import Dict, List, Optional, Tuple +import PyPDF2 +from PIL import Image +from PyPDF2 import PdfReader +from .utils import * +import base64 +import tempfile + +logger = logging.getLogger(__name__) + +from dataclasses import dataclass, field +from datetime import datetime +from typing import List, Optional, Dict, Any +from pathlib import Path + +@dataclass +class PDFMetadata: + title: Optional[str] = None + author: Optional[str] = None + producer: Optional[str] = None + created: Optional[datetime] = None + modified: Optional[datetime] = None + pages: int = 0 + encrypted: bool = False + file_size: Optional[int] = None + +@dataclass +class PDFPage: + page_number: int + raw_text: str = "" + markdown: str = "" # Added per your request + html: str = "" # Added per your request + images: List[Dict] = field(default_factory=list) + links: List[str] = field(default_factory=list) + layout: List[Dict] = field(default_factory=list) + +@dataclass +class PDFProcessResult: + metadata: PDFMetadata + pages: List[PDFPage] + processing_time: float = 0.0 + version: str = "1.0" + +class PDFProcessorStrategy(ABC): + @abstractmethod + def process(self, pdf_path: Path) -> PDFProcessResult: + pass + +class NaivePDFProcessorStrategy(PDFProcessorStrategy): + def __init__(self, image_dpi: int = 144, image_quality: int = 
85, extract_images: bool = True, + save_images_locally: bool = False, image_save_dir: Optional[Path] = None): + self.image_dpi = image_dpi + self.image_quality = image_quality + self.current_page_number = 0 + self.extract_images = extract_images + self.save_images_locally = save_images_locally + self.image_save_dir = image_save_dir + self._temp_dir = None + + def process(self, pdf_path: Path) -> PDFProcessResult: + start_time = time() + result = PDFProcessResult( + metadata=PDFMetadata(), + pages=[], + version="1.1" + ) + + try: + with pdf_path.open('rb') as file: + reader = PdfReader(file) + result.metadata = self._extract_metadata(pdf_path, reader) + + # Handle image directory + image_dir = None + if self.extract_images and self.save_images_locally: + if self.image_save_dir: + image_dir = Path(self.image_save_dir) + image_dir.mkdir(exist_ok=True, parents=True) + else: + self._temp_dir = tempfile.mkdtemp(prefix='pdf_images_') + image_dir = Path(self._temp_dir) + + for page_num, page in enumerate(reader.pages): + self.current_page_number = page_num + 1 + pdf_page = self._process_page(page, image_dir, reader) + result.pages.append(pdf_page) + + except Exception as e: + logger.error(f"Failed to process PDF: {str(e)}") + raise + finally: + # Cleanup temp directory if it was created + if self._temp_dir and not self.image_save_dir: + import shutil + try: + shutil.rmtree(self._temp_dir) + except Exception as e: + logger.error(f"Failed to cleanup temp directory: {str(e)}") + + result.processing_time = time() - start_time + return result + + def _process_page(self, page, image_dir: Optional[Path], reader) -> PDFPage: + pdf_page = PDFPage( + page_number=self.current_page_number, + ) + + # Text and font extraction + def visitor_text(text, cm, tm, font_dict, font_size): + pdf_page.raw_text += text + pdf_page.layout.append({ + "type": "text", + "text": text, + "x": tm[4], + "y": tm[5], + }) + + page.extract_text(visitor_text=visitor_text) + + # Image extraction + if self.extract_images: + pdf_page.images = self._extract_images(page, image_dir) + + # Link extraction + pdf_page.links = self._extract_links(page) + + # Add markdown content + pdf_page.markdown = clean_pdf_text(self.current_page_number, pdf_page.raw_text) + pdf_page.html = clean_pdf_text_to_html(self.current_page_number, pdf_page.raw_text) + + return pdf_page + + def _extract_images(self, page, image_dir: Optional[Path]) -> List[Dict]: + if not self.extract_images: + return [] + + images = [] + try: + resources = page.get("/Resources") + if resources: # Check if resources exist + resources = resources.get_object() # Resolve IndirectObject + if '/XObject' in resources: + xobjects = resources['/XObject'].get_object() + img_count = 0 + for obj_name in xobjects: + xobj = xobjects[obj_name] + if hasattr(xobj, 'get_object') and callable(xobj.get_object): + xobj = xobj.get_object() + if xobj.get('/Subtype') == '/Image': + try: + img_count += 1 + img_filename = f"page_{self.current_page_number}_img_{img_count}" + data = xobj.get_data() + filters = xobj.get('/Filter', []) + if not isinstance(filters, list): + filters = [filters] + + # Resolve IndirectObjects in properties + width = xobj.get('/Width', 0) + height = xobj.get('/Height', 0) + color_space = xobj.get('/ColorSpace', '/DeviceRGB') + if isinstance(color_space, PyPDF2.generic.IndirectObject): + color_space = color_space.get_object() + + # Handle different image encodings + success = False + image_format = 'bin' + image_data = None + + if '/FlateDecode' in filters: + try: + decode_parms = 
xobj.get('/DecodeParms', {}) + if isinstance(decode_parms, PyPDF2.generic.IndirectObject): + decode_parms = decode_parms.get_object() + + predictor = decode_parms.get('/Predictor', 1) + bits = xobj.get('/BitsPerComponent', 8) + colors = 3 if color_space == '/DeviceRGB' else 1 + + if predictor >= 10: + data = apply_png_predictor(data, width, bits, colors) + + # Create PIL Image + mode = 'RGB' if color_space == '/DeviceRGB' else 'L' + img = Image.frombytes(mode, (width, height), data) + + if self.save_images_locally: + final_path = (image_dir / img_filename).with_suffix('.png') + img.save(final_path) + image_data = str(final_path) + else: + import io + img_byte_arr = io.BytesIO() + img.save(img_byte_arr, format='PNG') + image_data = base64.b64encode(img_byte_arr.getvalue()).decode('utf-8') + + success = True + image_format = 'png' + except Exception as e: + logger.error(f"FlateDecode error: {str(e)}") + + elif '/DCTDecode' in filters: + # JPEG image + try: + if self.save_images_locally: + final_path = (image_dir / img_filename).with_suffix('.jpg') + with open(final_path, 'wb') as f: + f.write(data) + image_data = str(final_path) + else: + image_data = base64.b64encode(data).decode('utf-8') + success = True + image_format = 'jpeg' + except Exception as e: + logger.error(f"JPEG save error: {str(e)}") + + elif '/CCITTFaxDecode' in filters: + try: + if data[:4] != b'II*\x00': + # Add TIFF header if missing + tiff_header = b'II*\x00\x08\x00\x00\x00\x0e\x00\x00\x01\x03\x00\x01\x00\x00\x00' + \ + width.to_bytes(4, 'little') + \ + b'\x01\x03\x00\x01\x00\x00\x00' + \ + height.to_bytes(4, 'little') + \ + b'\x01\x12\x00\x03\x00\x00\x00\x01\x00\x01\x00\x00\x01\x17\x00\x04\x00\x00\x00\x01\x00\x00\x00J\x01\x1B\x00\x05\x00\x00\x00\x01\x00\x00\x00R\x01\x28\x00\x03\x00\x00\x00\x01\x00\x02\x00\x00' + data = tiff_header + data + + if self.save_images_locally: + final_path = (image_dir / img_filename).with_suffix('.tiff') + with open(final_path, 'wb') as f: + f.write(data) + image_data = str(final_path) + else: + image_data = base64.b64encode(data).decode('utf-8') + success = True + image_format = 'tiff' + except Exception as e: + logger.error(f"CCITT save error: {str(e)}") + + elif '/JPXDecode' in filters: + # JPEG 2000 + try: + if self.save_images_locally: + final_path = (image_dir / img_filename).with_suffix('.jp2') + with open(final_path, 'wb') as f: + f.write(data) + image_data = str(final_path) + else: + image_data = base64.b64encode(data).decode('utf-8') + success = True + image_format = 'jpeg2000' + except Exception as e: + logger.error(f"JPEG2000 save error: {str(e)}") + + if success and image_data: + image_info = { + "format": image_format, + "width": width, + "height": height, + "color_space": str(color_space), + "bits_per_component": xobj.get('/BitsPerComponent', 1) + } + + if self.save_images_locally: + image_info["path"] = image_data + else: + image_info["data"] = image_data + + images.append(image_info) + else: + # Fallback: Save raw data + if self.save_images_locally: + final_path = (image_dir / img_filename).with_suffix('.bin') + with open(final_path, 'wb') as f: + f.write(data) + logger.warning(f"Saved raw image data to {final_path}") + else: + image_data = base64.b64encode(data).decode('utf-8') + images.append({ + "format": "bin", + "width": width, + "height": height, + "color_space": str(color_space), + "bits_per_component": xobj.get('/BitsPerComponent', 1), + "data": image_data + }) + + except Exception as e: + logger.error(f"Error processing image: {str(e)}") + except Exception as e: + 
logger.error(f"Image extraction error: {str(e)}") + + return images + + def _extract_links(self, page) -> List[str]: + links = [] + if '/Annots' in page: + try: + for annot in page['/Annots']: + a = annot.get_object() + if '/A' in a and '/URI' in a['/A']: + links.append(a['/A']['/URI']) + except Exception as e: + print(f"Link error: {str(e)}") + return links + + def _extract_metadata(self, pdf_path: Path, reader: PdfReader = None) -> PDFMetadata: + if not reader: + reader = PdfReader(pdf_path) + + meta = reader.metadata or {} + created = self._parse_pdf_date(meta.get('/CreationDate', '')) + modified = self._parse_pdf_date(meta.get('/ModDate', '')) + + return PDFMetadata( + title=meta.get('/Title'), + author=meta.get('/Author'), + producer=meta.get('/Producer'), + created=created, + modified=modified, + pages=len(reader.pages), + encrypted=reader.is_encrypted, + file_size=pdf_path.stat().st_size + ) + + def _parse_pdf_date(self, date_str: str) -> Optional[datetime]: + try: + match = re.match(r'D:(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})', date_str) + if not match: + return None + + return datetime( + year=int(match[1]), + month=int(match[2]), + day=int(match[3]), + hour=int(match[4]), + minute=int(match[5]), + second=int(match[6]) + ) + except: + return None + +# Usage example +if __name__ == "__main__": + import json + from pathlib import Path + current_dir = Path(__file__).resolve().parent + pdf_path = f'{current_dir}/test.pdf' + + strategy = NaivePDFProcessorStrategy() + result = strategy.process(Path(pdf_path)) + + # Convert to JSON + json_output = asdict(result) + print(json.dumps(json_output, indent=2, default=str)) + + with open(f'{current_dir}/test.html', 'w') as f: + for page in result.pages: + f.write(f'

<h1>Page {page["page_number"]}</h1>

') + f.write(page['html']) + with open(f'{current_dir}/test.md', 'w') as f: + for page in result.pages: + f.write(f'# Page {page["page_number"]}\n\n') + f.write(clean_pdf_text(page["page_number"], page['raw_text'])) + f.write('\n\n') \ No newline at end of file diff --git a/crawl4ai/processors/pdf/utils.py b/crawl4ai/processors/pdf/utils.py new file mode 100644 index 00000000..3dc0e739 --- /dev/null +++ b/crawl4ai/processors/pdf/utils.py @@ -0,0 +1,350 @@ +import re + +def apply_png_predictor(data, width, bits, color_channels): + """Decode PNG predictor (PDF 1.5+ filter)""" + bytes_per_pixel = (bits * color_channels) // 8 + if (bits * color_channels) % 8 != 0: + bytes_per_pixel += 1 + + stride = width * bytes_per_pixel + scanline_length = stride + 1 # +1 for filter byte + + if len(data) % scanline_length != 0: + raise ValueError("Invalid scanline structure") + + num_lines = len(data) // scanline_length + output = bytearray() + prev_line = b'\x00' * stride + + for i in range(num_lines): + line = data[i*scanline_length:(i+1)*scanline_length] + filter_type = line[0] + filtered = line[1:] + + if filter_type == 0: # None + decoded = filtered + elif filter_type == 1: # Sub + decoded = bytearray(filtered) + for j in range(bytes_per_pixel, len(decoded)): + decoded[j] = (decoded[j] + decoded[j - bytes_per_pixel]) % 256 + elif filter_type == 2: # Up + decoded = bytearray([(filtered[j] + prev_line[j]) % 256 + for j in range(len(filtered))]) + elif filter_type == 3: # Average + decoded = bytearray(filtered) + for j in range(len(decoded)): + left = decoded[j - bytes_per_pixel] if j >= bytes_per_pixel else 0 + up = prev_line[j] + avg = (left + up) // 2 + decoded[j] = (decoded[j] + avg) % 256 + elif filter_type == 4: # Paeth + decoded = bytearray(filtered) + for j in range(len(decoded)): + left = decoded[j - bytes_per_pixel] if j >= bytes_per_pixel else 0 + up = prev_line[j] + up_left = prev_line[j - bytes_per_pixel] if j >= bytes_per_pixel else 0 + paeth = paeth_predictor(left, up, up_left) + decoded[j] = (decoded[j] + paeth) % 256 + else: + raise ValueError(f"Unsupported filter type: {filter_type}") + + output.extend(decoded) + prev_line = decoded + + return bytes(output) + +def paeth_predictor(a, b, c): + p = a + b - c + pa = abs(p - a) + pb = abs(p - b) + pc = abs(p - c) + if pa <= pb and pa <= pc: + return a + elif pb <= pc: + return b + else: + return c + +import re +import html + +def clean_pdf_text_to_html(page_number, text): + # Decode Unicode escapes and handle surrogate pairs + try: + decoded = text.encode('latin-1').decode('unicode-escape') + decoded = decoded.encode('utf-16', 'surrogatepass').decode('utf-16') + except Exception as e: + decoded = text # Fallback if decoding fails + + article_title_detected = False + # decoded = re.sub(r'\.\n', '.\n\n', decoded) + # decoded = re.sub(r'\.\n', '<|break|>', decoded) + lines = decoded.split('\n') + output = [] + current_paragraph = [] + in_header = False + email_pattern = re.compile(r'\{.*?\}') + affiliation_pattern = re.compile(r'^†') + quote_pattern = re.compile(r'^["“]') + author_pattern = re.compile( + r'^\s*[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?' 
+ r'(?:,\s*[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?)*' + r'(?:,\s*(?:and|&)\s+[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?)?\s*$' + ) + + def flush_paragraph(): + if current_paragraph: + para = ' '.join(current_paragraph) + para = re.sub(r'\s+', ' ', para).strip() + if para: + # escaped_para = html.escape(para) + escaped_para = para + # escaped_para = re.sub(r'\.\n', '.\n\n', escaped_para) + # Split escaped_para by <|break|> to avoid HTML escaping + escaped_para = escaped_para.split('.\n\n') + # Wrap each part in

<p> tag + escaped_para = [f'<p>{part}</p>' for part in escaped_para] + output.append(f'<div class="paragraph">{"".join(escaped_para)}</div>
') + current_paragraph.clear() + + for i, line in enumerate(lines): + line = line.strip() + + # Handle empty lines + if not line: + flush_paragraph() + continue + + # Detect article title (first line with reasonable length) + if not article_title_detected and i == 0 and 3 <= len(line.split()) <= 8 and len(lines) > 1: + flush_paragraph() + escaped_line = html.escape(line) + output.append(f'

<h1>{escaped_line}</h1>

') + article_title_detected = True + continue + + # Detect numbered headers like "2.1 Background" + numbered_header = re.match(r'^(\d+(?:\.\d+)*)\s+(.+)$', line) + if i > 0 and not lines[i-1].strip() and numbered_header: + flush_paragraph() + level = numbered_header.group(1).count('.') + 1 + header_text = numbered_header.group(2) + md_level = min(level + 1, 6) + escaped_header = html.escape(header_text) + output.append(f'{escaped_header}') + in_header = True + continue + + # Detect authors + if page_number == 1 and author_pattern.match(line): + authors = re.sub(r'[†â€]', '', line) + authors = re.split(r', | and ', authors) + formatted_authors = [] + for author in authors: + if author.strip(): + parts = [p for p in author.strip().split() if p] + formatted = ' '.join(parts) + escaped_author = html.escape(formatted) + formatted_authors.append(f'{escaped_author}') + + if len(formatted_authors) > 1: + joined = ', '.join(formatted_authors[:-1]) + ' and ' + formatted_authors[-1] + else: + joined = formatted_authors[0] + + output.append(f'

<p class="authors">{joined}</p>

') + continue + + # Detect affiliation + if affiliation_pattern.match(line): + escaped_line = html.escape(line) + output.append(f'

<p class="affiliation">{escaped_line}</p>

') + continue + + # Detect emails + if email_pattern.match(line): + escaped_line = html.escape(line) + output.append(f'

<code>{escaped_line}</code>

') + continue + + # Detect section headers + if re.match(r'^(Abstract|\d+\s+[A-Z]|References|Appendix|Figure|Table)', line): + flush_paragraph() + escaped_line = html.escape(line) + output.append(f'

<h2>{escaped_line}</h2>

') + in_header = True + continue + + # Handle quotes + if quote_pattern.match(line): + flush_paragraph() + escaped_line = html.escape(line) + output.append(f'

<blockquote>{escaped_line}</blockquote>

') + continue + + # Handle hyphenated words + if line.endswith('-'): + current_paragraph.append(line[:-1].strip()) + else: + current_paragraph.append(line) + + # Handle paragraph breaks after headers + if in_header and not line.endswith(('.', '!', '?')): + flush_paragraph() + in_header = False + + flush_paragraph() + + # Post-process HTML + html_output = '\n'.join(output) + + # Fix common citation patterns + html_output = re.sub(r'\(([A-Z][a-z]+ et al\. \d{4})\)', r'\1', html_output) + + # Fix escaped characters + html_output = html_output.replace('\\ud835', '').replace('\\u2020', '†') + + # Remove leftover hyphens and fix spacing + html_output = re.sub(r'\s+-\s+', '', html_output) + html_output = re.sub(r'\s+([.,!?)])', r'\1', html_output) + + return html_output + +def clean_pdf_text(page_number, text): + # Decode Unicode escapes and handle surrogate pairs + try: + decoded = text.encode('latin-1').decode('unicode-escape') + decoded = decoded.encode('utf-16', 'surrogatepass').decode('utf-16') + except Exception as e: + decoded = text # Fallback if decoding fails + + article_title_detected = False + decoded = re.sub(r'\.\n', '.\n\n', decoded) + lines = decoded.split('\n') + output = [] + current_paragraph = [] + in_header = False + email_pattern = re.compile(r'\{.*?\}') + affiliation_pattern = re.compile(r'^†') + quote_pattern = re.compile(r'^["“]') + author_pattern = re.compile( + r'^\s*[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?' + r'(?:,\s*[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?)*' + r'(?:,\s*(?:and|&)\s+[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?)?\s*$' + ) + + def flush_paragraph(): + if current_paragraph: + para = ' '.join(current_paragraph) + para = re.sub(r'\s+', ' ', para).strip() + if para: + output.append(para) + current_paragraph.clear() + + for i, line in enumerate(lines): + line = line.strip() + + # Handle special patterns + if not line: + flush_paragraph() + continue + + # Detect headline (first line, reasonable length, surrounded by empty lines) + if not article_title_detected and i == 0 and 3 <= len(line.split()) <= 8 and (len(lines) > 1): + flush_paragraph() + output.append(f'## {line}') + continue + + # Detect paragraph breaks for ALL paragraphs + if not line and current_paragraph: + flush_paragraph() + output.append('') # Add empty line between paragraphs + continue + + # Detect numbered headers like "2.1 Background" + numbered_header = re.match(r'^(\d+(?:\.\d+)*)\s+(.+)$', line) + if not lines[i-1].strip() and numbered_header: + flush_paragraph() + level = numbered_header.group(1).count('.') + 1 # Convert 2.1 → level 2 + header_text = numbered_header.group(2) + # Never go beyond ### for subsections + md_level = min(level + 1, 6) # 1 → ##, 2 → ###, 3 → #### etc + output.append(f'{"#" * md_level} {header_text}') + in_header = True + continue + + + # Detect authors + if page_number == 1 and author_pattern.match(line): + # Clean and format author names + authors = re.sub(r'[†â€]', '', line) # Remove affiliation markers + authors = re.split(r', | and ', authors) + formatted_authors = [] + for author in authors: + if author.strip(): + # Handle "First Last" formatting + parts = [p for p in author.strip().split() if p] + formatted = ' '.join(parts) + formatted_authors.append(f'**{formatted}**') + + # Join with commas and "and" + if len(formatted_authors) > 1: + joined = ', '.join(formatted_authors[:-1]) + ' and ' + formatted_authors[-1] + else: + joined = formatted_authors[0] + + output.append(joined) + continue + + # Detect affiliation + if 
affiliation_pattern.match(line): + output.append(f'*{line}*') + continue + + # Detect emails + if email_pattern.match(line): + output.append(f'`{line}`') + continue + + # Detect section headers + if re.match(r'^(Abstract|\d+\s+[A-Z]|References|Appendix|Figure|Table)', line): + flush_paragraph() + output.append(f'_[{line}]_') + in_header = True + continue + + + # Handle quotes + if quote_pattern.match(line): + flush_paragraph() + output.append(f'> {line}') + continue + + # Handle hyphenated words + if line.endswith('-'): + current_paragraph.append(line[:-1].strip()) + else: + current_paragraph.append(line) + + # Handle paragraph breaks after headers + if in_header and not line.endswith(('.', '!', '?')): + flush_paragraph() + in_header = False + + flush_paragraph() + + # Post-processing + markdown = '\n\n'.join(output) + + # Fix common citation patterns + markdown = re.sub(r'\(([A-Z][a-z]+ et al\. \d{4})\)', r'[\1]', markdown) + + # Fix escaped characters + markdown = markdown.replace('\\ud835', '').replace('\\u2020', '†') + + # Remove leftover hyphens and fix spacing + markdown = re.sub(r'\s+-\s+', '', markdown) # Join hyphenated words + markdown = re.sub(r'\s+([.,!?)])', r'\1', markdown) # Fix punctuation spacing + + + return markdown
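
Usage sketch (illustrative, not part of the patch): a minimal example of wiring the new PDF strategies into the crawler. It assumes the existing AsyncWebCrawler(crawler_strategy=...) constructor and the CrawlerRunConfig scraping_strategy parameter; the PDF URL is a placeholder.

```python
import asyncio

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy


async def main():
    # PDFCrawlerStrategy only tags the response as a PDF; the scraping strategy
    # downloads the file (if remote) and extracts text, images, links and metadata.
    async with AsyncWebCrawler(crawler_strategy=PDFCrawlerStrategy()) as crawler:
        config = CrawlerRunConfig(
            scraping_strategy=PDFContentScrapingStrategy(
                extract_images=True,        # pull embedded images (JPEG, PNG, TIFF, JPEG 2000)
                save_images_locally=False,  # keep images as base64 in result.media instead of files
            )
        )
        result = await crawler.arun("https://example.com/document.pdf", config=config)
        print(result.metadata)   # title, author, producer, created/modified dates, page count
        print(result.markdown)   # markdown assembled from the per-page extraction


if __name__ == "__main__":
    asyncio.run(main())
```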
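
A second small sketch for the new CrawlerRunConfig flags introduced in this patch (keep_attrs and exclude_internal_links); the attribute names passed are illustrative.

```python
from crawl4ai import CrawlerRunConfig

# Keep id/class attributes in the cleaned HTML and drop same-domain links
# from the results; both options default to off.
config = CrawlerRunConfig(
    keep_attrs=["id", "class"],
    exclude_internal_links=True,
)
```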