feat(pdf): add PDF processing capabilities
Add new PDF processing module with the following features: - PDF text extraction and formatting to HTML/Markdown - Image extraction with multiple format support (JPEG, PNG, TIFF) - Link extraction from PDF documents - Metadata extraction including title, author, dates - Support for both local and remote PDF files Also includes: - New configuration options for HTML attribute handling - Internal/external link filtering improvements - Version bump to 0.4.300b4
This commit is contained in:
@@ -1,2 +1,3 @@
|
|||||||
# crawl4ai/_version.py
|
# crawl4ai/_version.py
|
||||||
__version__ = "0.4.3b3"
|
# __version__ = "0.4.3b3"
|
||||||
|
__version__ = "0.4.300b4"
|
||||||
|
|||||||
@@ -271,6 +271,8 @@ class CrawlerRunConfig:
|
|||||||
Default: None.
|
Default: None.
|
||||||
keep_data_attributes (bool): If True, retain `data-*` attributes while removing unwanted attributes.
|
keep_data_attributes (bool): If True, retain `data-*` attributes while removing unwanted attributes.
|
||||||
Default: False.
|
Default: False.
|
||||||
|
keep_attrs (list of str): List of HTML attributes to keep during processing.
|
||||||
|
Default: [].
|
||||||
remove_forms (bool): If True, remove all `<form>` elements from the HTML.
|
remove_forms (bool): If True, remove all `<form>` elements from the HTML.
|
||||||
Default: False.
|
Default: False.
|
||||||
prettiify (bool): If True, apply `fast_format_html` to produce prettified HTML output.
|
prettiify (bool): If True, apply `fast_format_html` to produce prettified HTML output.
|
||||||
@@ -282,6 +284,8 @@ class CrawlerRunConfig:
|
|||||||
proxy_config (dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
|
proxy_config (dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
|
||||||
If None, no additional proxy config. Default: None.
|
If None, no additional proxy config. Default: None.
|
||||||
|
|
||||||
|
# SSL Parameters
|
||||||
|
fetch_ssl_certificate: bool = False,
|
||||||
# Caching Parameters
|
# Caching Parameters
|
||||||
cache_mode (CacheMode or None): Defines how caching is handled.
|
cache_mode (CacheMode or None): Defines how caching is handled.
|
||||||
If None, defaults to CacheMode.ENABLED internally.
|
If None, defaults to CacheMode.ENABLED internally.
|
||||||
@@ -363,10 +367,14 @@ class CrawlerRunConfig:
|
|||||||
Default: SOCIAL_MEDIA_DOMAINS (from config).
|
Default: SOCIAL_MEDIA_DOMAINS (from config).
|
||||||
exclude_external_links (bool): If True, exclude all external links from the results.
|
exclude_external_links (bool): If True, exclude all external links from the results.
|
||||||
Default: False.
|
Default: False.
|
||||||
|
exclude_internal_links (bool): If True, exclude internal links from the results.
|
||||||
|
Default: False.
|
||||||
exclude_social_media_links (bool): If True, exclude links pointing to social media domains.
|
exclude_social_media_links (bool): If True, exclude links pointing to social media domains.
|
||||||
Default: False.
|
Default: False.
|
||||||
exclude_domains (list of str): List of specific domains to exclude from results.
|
exclude_domains (list of str): List of specific domains to exclude from results.
|
||||||
Default: [].
|
Default: [].
|
||||||
|
exclude_internal_links (bool): If True, exclude internal links from the results.
|
||||||
|
Default: False.
|
||||||
|
|
||||||
# Debugging and Logging Parameters
|
# Debugging and Logging Parameters
|
||||||
verbose (bool): Enable verbose logging.
|
verbose (bool): Enable verbose logging.
|
||||||
@@ -402,6 +410,7 @@ class CrawlerRunConfig:
|
|||||||
excluded_tags: list = None,
|
excluded_tags: list = None,
|
||||||
excluded_selector: str = None,
|
excluded_selector: str = None,
|
||||||
keep_data_attributes: bool = False,
|
keep_data_attributes: bool = False,
|
||||||
|
keep_attrs: list = None,
|
||||||
remove_forms: bool = False,
|
remove_forms: bool = False,
|
||||||
prettiify: bool = False,
|
prettiify: bool = False,
|
||||||
parser_type: str = "lxml",
|
parser_type: str = "lxml",
|
||||||
@@ -451,6 +460,7 @@ class CrawlerRunConfig:
|
|||||||
exclude_external_links: bool = False,
|
exclude_external_links: bool = False,
|
||||||
exclude_social_media_links: bool = False,
|
exclude_social_media_links: bool = False,
|
||||||
exclude_domains: list = None,
|
exclude_domains: list = None,
|
||||||
|
exclude_internal_links: bool = False,
|
||||||
# Debugging and Logging Parameters
|
# Debugging and Logging Parameters
|
||||||
verbose: bool = True,
|
verbose: bool = True,
|
||||||
log_console: bool = False,
|
log_console: bool = False,
|
||||||
@@ -475,6 +485,7 @@ class CrawlerRunConfig:
|
|||||||
self.excluded_tags = excluded_tags or []
|
self.excluded_tags = excluded_tags or []
|
||||||
self.excluded_selector = excluded_selector or ""
|
self.excluded_selector = excluded_selector or ""
|
||||||
self.keep_data_attributes = keep_data_attributes
|
self.keep_data_attributes = keep_data_attributes
|
||||||
|
self.keep_attrs = keep_attrs or []
|
||||||
self.remove_forms = remove_forms
|
self.remove_forms = remove_forms
|
||||||
self.prettiify = prettiify
|
self.prettiify = prettiify
|
||||||
self.parser_type = parser_type
|
self.parser_type = parser_type
|
||||||
@@ -532,6 +543,7 @@ class CrawlerRunConfig:
|
|||||||
self.exclude_external_links = exclude_external_links
|
self.exclude_external_links = exclude_external_links
|
||||||
self.exclude_social_media_links = exclude_social_media_links
|
self.exclude_social_media_links = exclude_social_media_links
|
||||||
self.exclude_domains = exclude_domains or []
|
self.exclude_domains = exclude_domains or []
|
||||||
|
self.exclude_internal_links = exclude_internal_links
|
||||||
|
|
||||||
# Debugging and Logging Parameters
|
# Debugging and Logging Parameters
|
||||||
self.verbose = verbose
|
self.verbose = verbose
|
||||||
@@ -580,6 +592,7 @@ class CrawlerRunConfig:
|
|||||||
excluded_tags=kwargs.get("excluded_tags", []),
|
excluded_tags=kwargs.get("excluded_tags", []),
|
||||||
excluded_selector=kwargs.get("excluded_selector", ""),
|
excluded_selector=kwargs.get("excluded_selector", ""),
|
||||||
keep_data_attributes=kwargs.get("keep_data_attributes", False),
|
keep_data_attributes=kwargs.get("keep_data_attributes", False),
|
||||||
|
keep_attrs=kwargs.get("keep_attrs", []),
|
||||||
remove_forms=kwargs.get("remove_forms", False),
|
remove_forms=kwargs.get("remove_forms", False),
|
||||||
prettiify=kwargs.get("prettiify", False),
|
prettiify=kwargs.get("prettiify", False),
|
||||||
parser_type=kwargs.get("parser_type", "lxml"),
|
parser_type=kwargs.get("parser_type", "lxml"),
|
||||||
@@ -638,6 +651,7 @@ class CrawlerRunConfig:
|
|||||||
exclude_external_links=kwargs.get("exclude_external_links", False),
|
exclude_external_links=kwargs.get("exclude_external_links", False),
|
||||||
exclude_social_media_links=kwargs.get("exclude_social_media_links", False),
|
exclude_social_media_links=kwargs.get("exclude_social_media_links", False),
|
||||||
exclude_domains=kwargs.get("exclude_domains", []),
|
exclude_domains=kwargs.get("exclude_domains", []),
|
||||||
|
exclude_internal_links=kwargs.get("exclude_internal_links", False),
|
||||||
# Debugging and Logging Parameters
|
# Debugging and Logging Parameters
|
||||||
verbose=kwargs.get("verbose", True),
|
verbose=kwargs.get("verbose", True),
|
||||||
log_console=kwargs.get("log_console", False),
|
log_console=kwargs.get("log_console", False),
|
||||||
@@ -663,6 +677,7 @@ class CrawlerRunConfig:
|
|||||||
"excluded_tags": self.excluded_tags,
|
"excluded_tags": self.excluded_tags,
|
||||||
"excluded_selector": self.excluded_selector,
|
"excluded_selector": self.excluded_selector,
|
||||||
"keep_data_attributes": self.keep_data_attributes,
|
"keep_data_attributes": self.keep_data_attributes,
|
||||||
|
"keep_attrs": self.keep_attrs,
|
||||||
"remove_forms": self.remove_forms,
|
"remove_forms": self.remove_forms,
|
||||||
"prettiify": self.prettiify,
|
"prettiify": self.prettiify,
|
||||||
"parser_type": self.parser_type,
|
"parser_type": self.parser_type,
|
||||||
@@ -706,6 +721,7 @@ class CrawlerRunConfig:
|
|||||||
"exclude_external_links": self.exclude_external_links,
|
"exclude_external_links": self.exclude_external_links,
|
||||||
"exclude_social_media_links": self.exclude_social_media_links,
|
"exclude_social_media_links": self.exclude_social_media_links,
|
||||||
"exclude_domains": self.exclude_domains,
|
"exclude_domains": self.exclude_domains,
|
||||||
|
"exclude_internal_links": self.exclude_internal_links,
|
||||||
"verbose": self.verbose,
|
"verbose": self.verbose,
|
||||||
"log_console": self.log_console,
|
"log_console": self.log_console,
|
||||||
"stream": self.stream,
|
"stream": self.stream,
|
||||||
|
|||||||
@@ -319,14 +319,6 @@ class AsyncWebCrawler:
|
|||||||
try:
|
try:
|
||||||
# Handle configuration
|
# Handle configuration
|
||||||
if crawler_config is not None:
|
if crawler_config is not None:
|
||||||
# if any(param is not None for param in [
|
|
||||||
# word_count_threshold, extraction_strategy, chunking_strategy,
|
|
||||||
# content_filter, cache_mode, css_selector, screenshot, pdf
|
|
||||||
# ]):
|
|
||||||
# self.logger.warning(
|
|
||||||
# message="Both crawler_config and legacy parameters provided. crawler_config will take precedence.",
|
|
||||||
# tag="WARNING"
|
|
||||||
# )
|
|
||||||
config = crawler_config
|
config = crawler_config
|
||||||
else:
|
else:
|
||||||
# Merge all parameters into a single kwargs dict for config creation
|
# Merge all parameters into a single kwargs dict for config creation
|
||||||
@@ -350,14 +342,6 @@ class AsyncWebCrawler:
|
|||||||
|
|
||||||
# Handle deprecated cache parameters
|
# Handle deprecated cache parameters
|
||||||
if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]):
|
if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]):
|
||||||
if kwargs.get("warning", True):
|
|
||||||
warnings.warn(
|
|
||||||
"Cache control boolean flags are deprecated and will be removed in version 0.5.0. "
|
|
||||||
"Use 'cache_mode' parameter instead.",
|
|
||||||
DeprecationWarning,
|
|
||||||
stacklevel=2,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Convert legacy parameters if cache_mode not provided
|
# Convert legacy parameters if cache_mode not provided
|
||||||
if config.cache_mode is None:
|
if config.cache_mode is None:
|
||||||
config.cache_mode = _legacy_to_cache_mode(
|
config.cache_mode = _legacy_to_cache_mode(
|
||||||
@@ -430,7 +414,9 @@ class AsyncWebCrawler:
|
|||||||
response_headers={"X-Robots-Status": "Blocked by robots.txt"}
|
response_headers={"X-Robots-Status": "Blocked by robots.txt"}
|
||||||
)
|
)
|
||||||
|
|
||||||
# Pass config to crawl method
|
##############################
|
||||||
|
# Call CrawlerStrategy.crawl #
|
||||||
|
##############################
|
||||||
async_response = await self.crawler_strategy.crawl(
|
async_response = await self.crawler_strategy.crawl(
|
||||||
url,
|
url,
|
||||||
config=config, # Pass the entire config object
|
config=config, # Pass the entire config object
|
||||||
@@ -448,7 +434,9 @@ class AsyncWebCrawler:
|
|||||||
tag="FETCH",
|
tag="FETCH",
|
||||||
)
|
)
|
||||||
|
|
||||||
# Process the HTML content
|
###############################################################
|
||||||
|
# Process the HTML content, Call CrawlerStrategy.process_html #
|
||||||
|
###############################################################
|
||||||
crawl_result : CrawlResult = await self.aprocess_html(
|
crawl_result : CrawlResult = await self.aprocess_html(
|
||||||
url=url,
|
url=url,
|
||||||
html=html,
|
html=html,
|
||||||
@@ -469,26 +457,6 @@ class AsyncWebCrawler:
|
|||||||
async_response.ssl_certificate
|
async_response.ssl_certificate
|
||||||
) # Add SSL certificate
|
) # Add SSL certificate
|
||||||
|
|
||||||
# # Check and set values from async_response to crawl_result
|
|
||||||
# try:
|
|
||||||
# for key in vars(async_response):
|
|
||||||
# if hasattr(crawl_result, key):
|
|
||||||
# value = getattr(async_response, key, None)
|
|
||||||
# current_value = getattr(crawl_result, key, None)
|
|
||||||
# if value is not None and not current_value:
|
|
||||||
# try:
|
|
||||||
# setattr(crawl_result, key, value)
|
|
||||||
# except Exception as e:
|
|
||||||
# self.logger.warning(
|
|
||||||
# message=f"Failed to set attribute {key}: {str(e)}",
|
|
||||||
# tag="WARNING"
|
|
||||||
# )
|
|
||||||
# except Exception as e:
|
|
||||||
# self.logger.warning(
|
|
||||||
# message=f"Error copying response attributes: {str(e)}",
|
|
||||||
# tag="WARNING"
|
|
||||||
# )
|
|
||||||
|
|
||||||
crawl_result.success = bool(html)
|
crawl_result.success = bool(html)
|
||||||
crawl_result.session_id = getattr(config, "session_id", None)
|
crawl_result.session_id = getattr(config, "session_id", None)
|
||||||
|
|
||||||
@@ -538,8 +506,6 @@ class AsyncWebCrawler:
|
|||||||
f"Error: {str(e)}\n\n"
|
f"Error: {str(e)}\n\n"
|
||||||
f"Code context:\n{error_context['code_context']}"
|
f"Code context:\n{error_context['code_context']}"
|
||||||
)
|
)
|
||||||
# if not hasattr(e, "msg"):
|
|
||||||
# e.msg = str(e)
|
|
||||||
|
|
||||||
self.logger.error_status(
|
self.logger.error_status(
|
||||||
url=url,
|
url=url,
|
||||||
@@ -578,6 +544,7 @@ class AsyncWebCrawler:
|
|||||||
Returns:
|
Returns:
|
||||||
CrawlResult: Processed result containing extracted and formatted content
|
CrawlResult: Processed result containing extracted and formatted content
|
||||||
"""
|
"""
|
||||||
|
cleaned_html = ""
|
||||||
try:
|
try:
|
||||||
_url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
|
_url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
|
||||||
t1 = time.perf_counter()
|
t1 = time.perf_counter()
|
||||||
@@ -592,6 +559,10 @@ class AsyncWebCrawler:
|
|||||||
# add keys from kwargs to params that doesn't exist in params
|
# add keys from kwargs to params that doesn't exist in params
|
||||||
params.update({k: v for k, v in kwargs.items() if k not in params.keys()})
|
params.update({k: v for k, v in kwargs.items() if k not in params.keys()})
|
||||||
|
|
||||||
|
|
||||||
|
################################
|
||||||
|
# Scraping Strategy Execution #
|
||||||
|
################################
|
||||||
result = scraping_strategy.scrap(url, html, **params)
|
result = scraping_strategy.scrap(url, html, **params)
|
||||||
|
|
||||||
if result is None:
|
if result is None:
|
||||||
@@ -618,7 +589,9 @@ class AsyncWebCrawler:
|
|||||||
links = result.links.model_dump()
|
links = result.links.model_dump()
|
||||||
metadata = result.metadata
|
metadata = result.metadata
|
||||||
|
|
||||||
# Markdown Generation
|
################################
|
||||||
|
# Generate Markdown #
|
||||||
|
################################
|
||||||
markdown_generator: Optional[MarkdownGenerationStrategy] = (
|
markdown_generator: Optional[MarkdownGenerationStrategy] = (
|
||||||
config.markdown_generator or DefaultMarkdownGenerator()
|
config.markdown_generator or DefaultMarkdownGenerator()
|
||||||
)
|
)
|
||||||
@@ -644,14 +617,15 @@ class AsyncWebCrawler:
|
|||||||
params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000)},
|
params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000)},
|
||||||
)
|
)
|
||||||
|
|
||||||
# Handle content extraction if needed
|
################################
|
||||||
|
# Structured Content Extraction #
|
||||||
|
################################
|
||||||
if (
|
if (
|
||||||
not bool(extracted_content)
|
not bool(extracted_content)
|
||||||
and config.extraction_strategy
|
and config.extraction_strategy
|
||||||
and not isinstance(config.extraction_strategy, NoExtractionStrategy)
|
and not isinstance(config.extraction_strategy, NoExtractionStrategy)
|
||||||
):
|
):
|
||||||
t1 = time.perf_counter()
|
t1 = time.perf_counter()
|
||||||
|
|
||||||
# Choose content based on input_format
|
# Choose content based on input_format
|
||||||
content_format = config.extraction_strategy.input_format
|
content_format = config.extraction_strategy.input_format
|
||||||
if content_format == "fit_markdown" and not markdown_result.fit_markdown:
|
if content_format == "fit_markdown" and not markdown_result.fit_markdown:
|
||||||
@@ -665,6 +639,7 @@ class AsyncWebCrawler:
|
|||||||
content = {
|
content = {
|
||||||
"markdown": markdown,
|
"markdown": markdown,
|
||||||
"html": html,
|
"html": html,
|
||||||
|
"cleaned_html": cleaned_html,
|
||||||
"fit_markdown": markdown_result.raw_markdown,
|
"fit_markdown": markdown_result.raw_markdown,
|
||||||
}.get(content_format, markdown)
|
}.get(content_format, markdown)
|
||||||
|
|
||||||
|
|||||||
@@ -529,6 +529,9 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
if normalized_href not in external_links_dict:
|
if normalized_href not in external_links_dict:
|
||||||
external_links_dict[normalized_href] = link_data
|
external_links_dict[normalized_href] = link_data
|
||||||
else:
|
else:
|
||||||
|
if kwargs.get("exclude_internal_links", False):
|
||||||
|
element.decompose()
|
||||||
|
return False
|
||||||
if normalized_href not in internal_links_dict:
|
if normalized_href not in internal_links_dict:
|
||||||
internal_links_dict[normalized_href] = link_data
|
internal_links_dict[normalized_href] = link_data
|
||||||
|
|
||||||
@@ -629,7 +632,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
self.remove_unwanted_attributes(
|
self.remove_unwanted_attributes(
|
||||||
element, IMPORTANT_ATTRS, kwargs.get("keep_data_attributes", False)
|
element, IMPORTANT_ATTRS + kwargs.get("keep_attrs", []) , kwargs.get("keep_data_attributes", False)
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# print('Error removing unwanted attributes:', str(e))
|
# print('Error removing unwanted attributes:', str(e))
|
||||||
|
|||||||
@@ -1098,17 +1098,19 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
|
|||||||
user_message = {
|
user_message = {
|
||||||
"role": "user",
|
"role": "user",
|
||||||
"content": f"""
|
"content": f"""
|
||||||
Instructions:
|
|
||||||
{prompt_template}
|
|
||||||
|
|
||||||
HTML to analyze:
|
HTML to analyze:
|
||||||
```html
|
```html
|
||||||
{html}
|
{html}
|
||||||
```
|
```
|
||||||
|
|
||||||
{"Extract the following data: " + query if query else "Please analyze the HTML structure and create the most appropriate schema for data extraction."}
|
Instructions to extract schema for the above given HTML:
|
||||||
|
{prompt_template}
|
||||||
|
|
||||||
"""
|
"""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if query:
|
||||||
|
user_message["content"] += f"\n\nImportant Notes to Consider:\n{query}"
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Call LLM with backoff handling
|
# Call LLM with backoff handling
|
||||||
|
|||||||
@@ -143,6 +143,7 @@ class AsyncCrawlResponse(BaseModel):
|
|||||||
###############################
|
###############################
|
||||||
class MediaItem(BaseModel):
|
class MediaItem(BaseModel):
|
||||||
src: Optional[str] = ""
|
src: Optional[str] = ""
|
||||||
|
data: Optional[str] = ""
|
||||||
alt: Optional[str] = ""
|
alt: Optional[str] = ""
|
||||||
desc: Optional[str] = ""
|
desc: Optional[str] = ""
|
||||||
score: Optional[int] = 0
|
score: Optional[int] = 0
|
||||||
|
|||||||
164
crawl4ai/processors/pdf/__init__.py
Normal file
164
crawl4ai/processors/pdf/__init__.py
Normal file
@@ -0,0 +1,164 @@
|
|||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, List, Optional
|
||||||
|
from dataclasses import asdict
|
||||||
|
|
||||||
|
from crawl4ai.async_logger import AsyncLogger
|
||||||
|
from crawl4ai.async_crawler_strategy import AsyncCrawlerStrategy
|
||||||
|
from crawl4ai.models import AsyncCrawlResponse, ScrapingResult
|
||||||
|
from crawl4ai.content_scraping_strategy import ContentScrapingStrategy
|
||||||
|
from .processor import NaivePDFProcessorStrategy # Assuming your current PDF code is in pdf_processor.py
|
||||||
|
|
||||||
|
class PDFCrawlerStrategy(AsyncCrawlerStrategy):
|
||||||
|
def __init__(self, logger: AsyncLogger = None):
|
||||||
|
self.logger = logger
|
||||||
|
|
||||||
|
async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
|
||||||
|
# Just pass through with empty HTML - scraper will handle actual processing
|
||||||
|
return AsyncCrawlResponse(
|
||||||
|
html="", # Scraper will handle the real work
|
||||||
|
response_headers={"Content-Type": "application/pdf"},
|
||||||
|
status_code=200
|
||||||
|
)
|
||||||
|
|
||||||
|
async def close(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
async def __aenter__(self):
|
||||||
|
return self
|
||||||
|
|
||||||
|
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
||||||
|
await self.close()
|
||||||
|
|
||||||
|
class PDFContentScrapingStrategy(ContentScrapingStrategy):
|
||||||
|
"""
|
||||||
|
A content scraping strategy for PDF files.
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
save_images_locally (bool): Whether to save images locally.
|
||||||
|
extract_images (bool): Whether to extract images from PDF.
|
||||||
|
image_save_dir (str): Directory to save extracted images.
|
||||||
|
logger (AsyncLogger): Logger instance for recording events and errors.
|
||||||
|
|
||||||
|
Methods:
|
||||||
|
scrap(url: str, html: str, **params) -> ScrapingResult:
|
||||||
|
Scrap content from a PDF file.
|
||||||
|
ascrap(url: str, html: str, **kwargs) -> ScrapingResult:
|
||||||
|
Asynchronous version of scrap.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
strategy = PDFContentScrapingStrategy(
|
||||||
|
save_images_locally=False,
|
||||||
|
extract_images=False,
|
||||||
|
image_save_dir=None,
|
||||||
|
logger=logger
|
||||||
|
)
|
||||||
|
|
||||||
|
"""
|
||||||
|
def __init__(self,
|
||||||
|
save_images_locally=False,
|
||||||
|
extract_images=False,
|
||||||
|
image_save_dir=None,
|
||||||
|
logger: AsyncLogger = None):
|
||||||
|
self.logger = logger
|
||||||
|
self.pdf_processor = NaivePDFProcessorStrategy(
|
||||||
|
save_images_locally=False,
|
||||||
|
extract_images=False,
|
||||||
|
image_save_dir=None
|
||||||
|
)
|
||||||
|
|
||||||
|
def scrap(self, url: str, html: str, **params) -> ScrapingResult:
|
||||||
|
"""
|
||||||
|
Scrap content from a PDF file.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
url (str): The URL of the PDF file.
|
||||||
|
html (str): The HTML content of the page.
|
||||||
|
**params: Additional parameters.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
ScrapingResult: The scraped content.
|
||||||
|
"""
|
||||||
|
# Download if URL or use local path
|
||||||
|
pdf_path = self._get_pdf_path(url)
|
||||||
|
try:
|
||||||
|
# Process PDF
|
||||||
|
result = self.pdf_processor.process(Path(pdf_path))
|
||||||
|
|
||||||
|
# Combine page HTML
|
||||||
|
cleaned_html = f"""
|
||||||
|
<html>
|
||||||
|
<head><meta name="pdf-pages" content="{len(result.pages)}"></head>
|
||||||
|
<body>
|
||||||
|
{''.join(f'<div class="pdf-page" data-page="{i+1}">{page.html}</div>'
|
||||||
|
for i, page in enumerate(result.pages))}
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Accumulate media and links with page numbers
|
||||||
|
media = {"images": []}
|
||||||
|
links = {"urls": []}
|
||||||
|
|
||||||
|
for page in result.pages:
|
||||||
|
# Add page number to each image
|
||||||
|
for img in page.images:
|
||||||
|
img["page"] = page.page_number
|
||||||
|
media["images"].append(img)
|
||||||
|
|
||||||
|
# Add page number to each link
|
||||||
|
for link in page.links:
|
||||||
|
links["urls"].append({
|
||||||
|
"url": link,
|
||||||
|
"page": page.page_number
|
||||||
|
})
|
||||||
|
|
||||||
|
return ScrapingResult(
|
||||||
|
cleaned_html=cleaned_html,
|
||||||
|
success=True,
|
||||||
|
media=media,
|
||||||
|
links=links,
|
||||||
|
metadata=asdict(result.metadata)
|
||||||
|
)
|
||||||
|
finally:
|
||||||
|
# Cleanup temp file if downloaded
|
||||||
|
if url.startswith(("http://", "https://")):
|
||||||
|
Path(pdf_path).unlink(missing_ok=True)
|
||||||
|
|
||||||
|
async def ascrap(self, url: str, html: str, **kwargs) -> ScrapingResult:
|
||||||
|
# For simple cases, you can use the sync version
|
||||||
|
return await asyncio.to_thread(self.scrap, url, html, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_pdf_path(self, url: str) -> str:
|
||||||
|
if url.startswith(("http://", "https://")):
|
||||||
|
import tempfile
|
||||||
|
import requests
|
||||||
|
|
||||||
|
# Create temp file with .pdf extension
|
||||||
|
temp_file = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Download PDF with streaming
|
||||||
|
response = requests.get(url, stream=True)
|
||||||
|
response.raise_for_status()
|
||||||
|
|
||||||
|
# Write to temp file
|
||||||
|
with open(temp_file.name, 'wb') as f:
|
||||||
|
for chunk in response.iter_content(chunk_size=8192):
|
||||||
|
f.write(chunk)
|
||||||
|
|
||||||
|
return temp_file.name
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
# Clean up temp file if download fails
|
||||||
|
Path(temp_file.name).unlink(missing_ok=True)
|
||||||
|
raise RuntimeError(f"Failed to download PDF from {url}: {str(e)}")
|
||||||
|
|
||||||
|
elif url.startswith("file://"):
|
||||||
|
return url[7:] # Strip file:// prefix
|
||||||
|
|
||||||
|
return url # Assume local path
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["PDFCrawlerStrategy", "PDFContentScrapingStrategy"]
|
||||||
372
crawl4ai/processors/pdf/processor.py
Normal file
372
crawl4ai/processors/pdf/processor.py
Normal file
@@ -0,0 +1,372 @@
|
|||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from time import time
|
||||||
|
from dataclasses import dataclass, asdict, field
|
||||||
|
from typing import Dict, List, Optional, Tuple
|
||||||
|
import PyPDF2
|
||||||
|
from PIL import Image
|
||||||
|
from PyPDF2 import PdfReader
|
||||||
|
from .utils import *
|
||||||
|
import base64
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import List, Optional, Dict, Any
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class PDFMetadata:
|
||||||
|
title: Optional[str] = None
|
||||||
|
author: Optional[str] = None
|
||||||
|
producer: Optional[str] = None
|
||||||
|
created: Optional[datetime] = None
|
||||||
|
modified: Optional[datetime] = None
|
||||||
|
pages: int = 0
|
||||||
|
encrypted: bool = False
|
||||||
|
file_size: Optional[int] = None
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class PDFPage:
|
||||||
|
page_number: int
|
||||||
|
raw_text: str = ""
|
||||||
|
markdown: str = "" # Added per your request
|
||||||
|
html: str = "" # Added per your request
|
||||||
|
images: List[Dict] = field(default_factory=list)
|
||||||
|
links: List[str] = field(default_factory=list)
|
||||||
|
layout: List[Dict] = field(default_factory=list)
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class PDFProcessResult:
|
||||||
|
metadata: PDFMetadata
|
||||||
|
pages: List[PDFPage]
|
||||||
|
processing_time: float = 0.0
|
||||||
|
version: str = "1.0"
|
||||||
|
|
||||||
|
class PDFProcessorStrategy(ABC):
|
||||||
|
@abstractmethod
|
||||||
|
def process(self, pdf_path: Path) -> PDFProcessResult:
|
||||||
|
pass
|
||||||
|
|
||||||
|
class NaivePDFProcessorStrategy(PDFProcessorStrategy):
|
||||||
|
def __init__(self, image_dpi: int = 144, image_quality: int = 85, extract_images: bool = True,
|
||||||
|
save_images_locally: bool = False, image_save_dir: Optional[Path] = None):
|
||||||
|
self.image_dpi = image_dpi
|
||||||
|
self.image_quality = image_quality
|
||||||
|
self.current_page_number = 0
|
||||||
|
self.extract_images = extract_images
|
||||||
|
self.save_images_locally = save_images_locally
|
||||||
|
self.image_save_dir = image_save_dir
|
||||||
|
self._temp_dir = None
|
||||||
|
|
||||||
|
def process(self, pdf_path: Path) -> PDFProcessResult:
|
||||||
|
start_time = time()
|
||||||
|
result = PDFProcessResult(
|
||||||
|
metadata=PDFMetadata(),
|
||||||
|
pages=[],
|
||||||
|
version="1.1"
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
with pdf_path.open('rb') as file:
|
||||||
|
reader = PdfReader(file)
|
||||||
|
result.metadata = self._extract_metadata(pdf_path, reader)
|
||||||
|
|
||||||
|
# Handle image directory
|
||||||
|
image_dir = None
|
||||||
|
if self.extract_images and self.save_images_locally:
|
||||||
|
if self.image_save_dir:
|
||||||
|
image_dir = Path(self.image_save_dir)
|
||||||
|
image_dir.mkdir(exist_ok=True, parents=True)
|
||||||
|
else:
|
||||||
|
self._temp_dir = tempfile.mkdtemp(prefix='pdf_images_')
|
||||||
|
image_dir = Path(self._temp_dir)
|
||||||
|
|
||||||
|
for page_num, page in enumerate(reader.pages):
|
||||||
|
self.current_page_number = page_num + 1
|
||||||
|
pdf_page = self._process_page(page, image_dir, reader)
|
||||||
|
result.pages.append(pdf_page)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to process PDF: {str(e)}")
|
||||||
|
raise
|
||||||
|
finally:
|
||||||
|
# Cleanup temp directory if it was created
|
||||||
|
if self._temp_dir and not self.image_save_dir:
|
||||||
|
import shutil
|
||||||
|
try:
|
||||||
|
shutil.rmtree(self._temp_dir)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to cleanup temp directory: {str(e)}")
|
||||||
|
|
||||||
|
result.processing_time = time() - start_time
|
||||||
|
return result
|
||||||
|
|
||||||
|
def _process_page(self, page, image_dir: Optional[Path], reader) -> PDFPage:
|
||||||
|
pdf_page = PDFPage(
|
||||||
|
page_number=self.current_page_number,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Text and font extraction
|
||||||
|
def visitor_text(text, cm, tm, font_dict, font_size):
|
||||||
|
pdf_page.raw_text += text
|
||||||
|
pdf_page.layout.append({
|
||||||
|
"type": "text",
|
||||||
|
"text": text,
|
||||||
|
"x": tm[4],
|
||||||
|
"y": tm[5],
|
||||||
|
})
|
||||||
|
|
||||||
|
page.extract_text(visitor_text=visitor_text)
|
||||||
|
|
||||||
|
# Image extraction
|
||||||
|
if self.extract_images:
|
||||||
|
pdf_page.images = self._extract_images(page, image_dir)
|
||||||
|
|
||||||
|
# Link extraction
|
||||||
|
pdf_page.links = self._extract_links(page)
|
||||||
|
|
||||||
|
# Add markdown content
|
||||||
|
pdf_page.markdown = clean_pdf_text(self.current_page_number, pdf_page.raw_text)
|
||||||
|
pdf_page.html = clean_pdf_text_to_html(self.current_page_number, pdf_page.raw_text)
|
||||||
|
|
||||||
|
return pdf_page
|
||||||
|
|
||||||
|
def _extract_images(self, page, image_dir: Optional[Path]) -> List[Dict]:
    """Extract embedded images from a single PDF page.

    Walks the page's /Resources -> /XObject dictionary and handles each
    /Image XObject according to its /Filter entry (Flate, DCT/JPEG,
    CCITT fax, JPX/JPEG2000, with a raw-bytes fallback).  Depending on
    ``self.save_images_locally`` each result dict carries either a
    "path" (file written under *image_dir*) or a base64-encoded "data"
    payload, alongside format/size metadata.

    Args:
        page: A PyPDF2 page object.
        image_dir: Target directory for image files; only used when
            ``self.save_images_locally`` is true.

    Returns:
        List of dicts describing the extracted images.  Per-image errors
        are logged and skipped so one bad image never aborts the page.
    """
    if not self.extract_images:
        return []

    images = []
    try:
        resources = page.get("/Resources")
        if resources:  # Check if resources exist
            resources = resources.get_object()  # Resolve IndirectObject
            if '/XObject' in resources:
                xobjects = resources['/XObject'].get_object()
                img_count = 0
                for obj_name in xobjects:
                    xobj = xobjects[obj_name]
                    # XObject entries may themselves be indirect references.
                    if hasattr(xobj, 'get_object') and callable(xobj.get_object):
                        xobj = xobj.get_object()
                    if xobj.get('/Subtype') == '/Image':
                        try:
                            img_count += 1
                            img_filename = f"page_{self.current_page_number}_img_{img_count}"
                            data = xobj.get_data()
                            # /Filter may be a single name or an array of names.
                            filters = xobj.get('/Filter', [])
                            if not isinstance(filters, list):
                                filters = [filters]

                            # Resolve IndirectObjects in properties
                            width = xobj.get('/Width', 0)
                            height = xobj.get('/Height', 0)
                            color_space = xobj.get('/ColorSpace', '/DeviceRGB')
                            if isinstance(color_space, PyPDF2.generic.IndirectObject):
                                color_space = color_space.get_object()

                            # Handle different image encodings
                            success = False
                            image_format = 'bin'
                            image_data = None

                            if '/FlateDecode' in filters:
                                # zlib-compressed raster data; may additionally use
                                # a PNG-style predictor when /Predictor >= 10.
                                try:
                                    decode_parms = xobj.get('/DecodeParms', {})
                                    if isinstance(decode_parms, PyPDF2.generic.IndirectObject):
                                        decode_parms = decode_parms.get_object()

                                    predictor = decode_parms.get('/Predictor', 1)
                                    bits = xobj.get('/BitsPerComponent', 8)
                                    # NOTE(review): assumes RGB or single-channel gray only;
                                    # CMYK/indexed color spaces are not handled — confirm.
                                    colors = 3 if color_space == '/DeviceRGB' else 1

                                    if predictor >= 10:
                                        data = apply_png_predictor(data, width, bits, colors)

                                    # Create PIL Image
                                    mode = 'RGB' if color_space == '/DeviceRGB' else 'L'
                                    img = Image.frombytes(mode, (width, height), data)

                                    if self.save_images_locally:
                                        final_path = (image_dir / img_filename).with_suffix('.png')
                                        img.save(final_path)
                                        image_data = str(final_path)
                                    else:
                                        import io
                                        img_byte_arr = io.BytesIO()
                                        img.save(img_byte_arr, format='PNG')
                                        image_data = base64.b64encode(img_byte_arr.getvalue()).decode('utf-8')

                                    success = True
                                    image_format = 'png'
                                except Exception as e:
                                    logger.error(f"FlateDecode error: {str(e)}")

                            elif '/DCTDecode' in filters:
                                # JPEG image: the stream already is a complete
                                # JPEG file, so it can be written/encoded as-is.
                                try:
                                    if self.save_images_locally:
                                        final_path = (image_dir / img_filename).with_suffix('.jpg')
                                        with open(final_path, 'wb') as f:
                                            f.write(data)
                                        image_data = str(final_path)
                                    else:
                                        image_data = base64.b64encode(data).decode('utf-8')
                                    success = True
                                    image_format = 'jpeg'
                                except Exception as e:
                                    logger.error(f"JPEG save error: {str(e)}")

                            elif '/CCITTFaxDecode' in filters:
                                # Fax-compressed bitonal image; viewers need a TIFF
                                # wrapper around the bare CCITT stream.
                                try:
                                    if data[:4] != b'II*\x00':
                                        # Add TIFF header if missing (little-endian
                                        # TIFF with width/height tags patched in;
                                        # NOTE(review): compression tag is fixed —
                                        # confirm against the stream's /K parameter).
                                        tiff_header = b'II*\x00\x08\x00\x00\x00\x0e\x00\x00\x01\x03\x00\x01\x00\x00\x00' + \
                                            width.to_bytes(4, 'little') + \
                                            b'\x01\x03\x00\x01\x00\x00\x00' + \
                                            height.to_bytes(4, 'little') + \
                                            b'\x01\x12\x00\x03\x00\x00\x00\x01\x00\x01\x00\x00\x01\x17\x00\x04\x00\x00\x00\x01\x00\x00\x00J\x01\x1B\x00\x05\x00\x00\x00\x01\x00\x00\x00R\x01\x28\x00\x03\x00\x00\x00\x01\x00\x02\x00\x00'
                                        data = tiff_header + data

                                    if self.save_images_locally:
                                        final_path = (image_dir / img_filename).with_suffix('.tiff')
                                        with open(final_path, 'wb') as f:
                                            f.write(data)
                                        image_data = str(final_path)
                                    else:
                                        image_data = base64.b64encode(data).decode('utf-8')
                                    success = True
                                    image_format = 'tiff'
                                except Exception as e:
                                    logger.error(f"CCITT save error: {str(e)}")

                            elif '/JPXDecode' in filters:
                                # JPEG 2000: stream is a self-contained .jp2 file.
                                try:
                                    if self.save_images_locally:
                                        final_path = (image_dir / img_filename).with_suffix('.jp2')
                                        with open(final_path, 'wb') as f:
                                            f.write(data)
                                        image_data = str(final_path)
                                    else:
                                        image_data = base64.b64encode(data).decode('utf-8')
                                    success = True
                                    image_format = 'jpeg2000'
                                except Exception as e:
                                    logger.error(f"JPEG2000 save error: {str(e)}")

                            if success and image_data:
                                image_info = {
                                    "format": image_format,
                                    "width": width,
                                    "height": height,
                                    "color_space": str(color_space),
                                    "bits_per_component": xobj.get('/BitsPerComponent', 1)
                                }

                                # Local saves record the file path; otherwise the
                                # payload is inlined as base64.
                                if self.save_images_locally:
                                    image_info["path"] = image_data
                                else:
                                    image_info["data"] = image_data

                                images.append(image_info)
                            else:
                                # Fallback: Save raw data for unrecognized filters
                                if self.save_images_locally:
                                    final_path = (image_dir / img_filename).with_suffix('.bin')
                                    with open(final_path, 'wb') as f:
                                        f.write(data)
                                    logger.warning(f"Saved raw image data to {final_path}")
                                else:
                                    image_data = base64.b64encode(data).decode('utf-8')
                                    images.append({
                                        "format": "bin",
                                        "width": width,
                                        "height": height,
                                        "color_space": str(color_space),
                                        "bits_per_component": xobj.get('/BitsPerComponent', 1),
                                        "data": image_data
                                    })

                        except Exception as e:
                            # Per-image failure: log and continue with the rest.
                            logger.error(f"Error processing image: {str(e)}")
    except Exception as e:
        # Page-level failure (e.g. malformed resources): log and return what we have.
        logger.error(f"Image extraction error: {str(e)}")

    return images
|
||||||
|
|
||||||
|
def _extract_links(self, page) -> List[str]:
|
||||||
|
links = []
|
||||||
|
if '/Annots' in page:
|
||||||
|
try:
|
||||||
|
for annot in page['/Annots']:
|
||||||
|
a = annot.get_object()
|
||||||
|
if '/A' in a and '/URI' in a['/A']:
|
||||||
|
links.append(a['/A']['/URI'])
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Link error: {str(e)}")
|
||||||
|
return links
|
||||||
|
|
||||||
|
def _extract_metadata(self, pdf_path: Path, reader: PdfReader = None) -> PDFMetadata:
    """Collect document-level metadata for *pdf_path*.

    Args:
        pdf_path: Path to the PDF file on disk (used for file size and,
            when *reader* is not supplied, to open the document).
        reader: Optional already-open PdfReader to reuse.

    Returns:
        A PDFMetadata instance with title/author/producer, parsed
        creation/modification dates, page count, encryption flag and
        file size.
    """
    if not reader:
        reader = PdfReader(pdf_path)

    # reader.metadata can be None for documents without an info dict.
    info = reader.metadata or {}

    return PDFMetadata(
        title=info.get('/Title'),
        author=info.get('/Author'),
        producer=info.get('/Producer'),
        created=self._parse_pdf_date(info.get('/CreationDate', '')),
        modified=self._parse_pdf_date(info.get('/ModDate', '')),
        pages=len(reader.pages),
        encrypted=reader.is_encrypted,
        file_size=pdf_path.stat().st_size
    )
|
||||||
|
|
||||||
|
def _parse_pdf_date(self, date_str: str) -> Optional[datetime]:
|
||||||
|
try:
|
||||||
|
match = re.match(r'D:(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})', date_str)
|
||||||
|
if not match:
|
||||||
|
return None
|
||||||
|
|
||||||
|
return datetime(
|
||||||
|
year=int(match[1]),
|
||||||
|
month=int(match[2]),
|
||||||
|
day=int(match[3]),
|
||||||
|
hour=int(match[4]),
|
||||||
|
minute=int(match[5]),
|
||||||
|
second=int(match[6])
|
||||||
|
)
|
||||||
|
except:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Usage example
if __name__ == "__main__":
    import json
    from pathlib import Path

    current_dir = Path(__file__).resolve().parent
    pdf_path = f'{current_dir}/test.pdf'

    strategy = NaivePDFProcessorStrategy()
    result = strategy.process(Path(pdf_path))

    # Convert to JSON (asdict recursively turns the result dataclasses into dicts)
    json_output = asdict(result)
    print(json.dumps(json_output, indent=2, default=str))

    # result.pages holds PDFPage dataclass instances, so use attribute access —
    # the original subscripted them like dicts (page["page_number"]), which
    # raises TypeError at runtime.
    with open(f'{current_dir}/test.html', 'w') as f:
        for page in result.pages:
            f.write(f'<h1>Page {page.page_number}</h1>')
            f.write(page.html)
    with open(f'{current_dir}/test.md', 'w') as f:
        for page in result.pages:
            f.write(f'# Page {page.page_number}\n\n')
            f.write(clean_pdf_text(page.page_number, page.raw_text))
            f.write('\n\n')
|
||||||
350
crawl4ai/processors/pdf/utils.py
Normal file
350
crawl4ai/processors/pdf/utils.py
Normal file
@@ -0,0 +1,350 @@
|
|||||||
|
import re
|
||||||
|
|
||||||
|
def apply_png_predictor(data, width, bits, color_channels):
    """Decode PNG predictor filters (PDF /Predictor >= 10).

    Each scanline starts with one filter-type byte (0=None, 1=Sub, 2=Up,
    3=Average, 4=Paeth) followed by the filtered bytes for that row.

    Args:
        data: Filtered bytes, a whole number of scanlines.
        width: Image width in pixels.
        bits: Bits per component (/BitsPerComponent).
        color_channels: Number of color components per pixel.

    Returns:
        The defiltered raster bytes (scanlines concatenated, no filter bytes).

    Raises:
        ValueError: If data is not a whole number of scanlines, or a
            scanline uses an unknown filter type.
    """
    # Filter unit ("bpp" in the PNG spec): bytes per pixel rounded up,
    # minimum 1 — used as the left-neighbor offset for Sub/Average/Paeth.
    bytes_per_pixel = (bits * color_channels) // 8
    if (bits * color_channels) % 8 != 0:
        bytes_per_pixel += 1

    # Bytes per scanline: total bits per row rounded up to whole bytes.
    # (The previous `width * bytes_per_pixel` over-counted for sub-byte
    # components, e.g. 1-bit images: 10px × 1bit is 2 bytes, not 10.)
    stride = (width * bits * color_channels + 7) // 8
    scanline_length = stride + 1  # +1 for filter byte

    if len(data) % scanline_length != 0:
        raise ValueError("Invalid scanline structure")

    num_lines = len(data) // scanline_length
    output = bytearray()
    # The row above the first scanline is treated as all zeros.
    prev_line = b'\x00' * stride

    for i in range(num_lines):
        line = data[i*scanline_length:(i+1)*scanline_length]
        filter_type = line[0]
        filtered = line[1:]

        if filter_type == 0:  # None
            decoded = filtered
        elif filter_type == 1:  # Sub: add the byte bpp positions to the left
            decoded = bytearray(filtered)
            for j in range(bytes_per_pixel, len(decoded)):
                decoded[j] = (decoded[j] + decoded[j - bytes_per_pixel]) % 256
        elif filter_type == 2:  # Up: add the byte directly above
            decoded = bytearray([(filtered[j] + prev_line[j]) % 256
                                 for j in range(len(filtered))])
        elif filter_type == 3:  # Average of left and above (floor division)
            decoded = bytearray(filtered)
            for j in range(len(decoded)):
                left = decoded[j - bytes_per_pixel] if j >= bytes_per_pixel else 0
                up = prev_line[j]
                avg = (left + up) // 2
                decoded[j] = (decoded[j] + avg) % 256
        elif filter_type == 4:  # Paeth: best of left/above/upper-left
            decoded = bytearray(filtered)
            for j in range(len(decoded)):
                left = decoded[j - bytes_per_pixel] if j >= bytes_per_pixel else 0
                up = prev_line[j]
                up_left = prev_line[j - bytes_per_pixel] if j >= bytes_per_pixel else 0
                paeth = paeth_predictor(left, up, up_left)
                decoded[j] = (decoded[j] + paeth) % 256
        else:
            raise ValueError(f"Unsupported filter type: {filter_type}")

        output.extend(decoded)
        prev_line = decoded

    return bytes(output)
|
||||||
|
|
||||||
|
def paeth_predictor(a, b, c):
    """Select the PNG Paeth predictor value.

    Given the left (*a*), above (*b*) and upper-left (*c*) neighbor bytes,
    return whichever neighbor is closest to the linear estimate a + b - c,
    with ties broken in the order left, above, upper-left.
    """
    estimate = a + b - c
    dist_left = abs(estimate - a)
    dist_up = abs(estimate - b)
    dist_corner = abs(estimate - c)

    if dist_left <= dist_up and dist_left <= dist_corner:
        return a
    if dist_up <= dist_corner:
        return b
    return c
|
||||||
|
|
||||||
|
import re
|
||||||
|
import html
|
||||||
|
|
||||||
|
def clean_pdf_text_to_html(page_number, text):
    """Convert raw PDF-extracted text into cleaned-up HTML.

    Applies line-based heuristics tuned for academic papers: an article
    title on the first line, numbered section headers ("2.1 Background"),
    author lists (page 1 only), affiliations, e-mail blocks, common
    section keywords, block quotes, and re-joined hyphenated line breaks.

    Args:
        page_number: 1-based page number; author detection only runs on page 1.
        text: Raw text as returned by the PDF text extractor.

    Returns:
        An HTML fragment string (one element per detected unit, joined by
        newlines).
    """
    # Decode Unicode escapes and handle surrogate pairs
    try:
        decoded = text.encode('latin-1').decode('unicode-escape')
        decoded = decoded.encode('utf-16', 'surrogatepass').decode('utf-16')
    except Exception as e:
        decoded = text  # Fallback if decoding fails

    article_title_detected = False
    # decoded = re.sub(r'\.\n', '.\n\n', decoded)
    # decoded = re.sub(r'\.\n', '<|break|>', decoded)
    lines = decoded.split('\n')
    output = []
    current_paragraph = []
    in_header = False
    email_pattern = re.compile(r'\{.*?\}')
    affiliation_pattern = re.compile(r'^†')
    quote_pattern = re.compile(r'^["“]')
    # Heuristic for "Name Surname, Name Surname and Name Surname" author rows,
    # optionally suffixed with affiliation markers (†, *, digits).
    author_pattern = re.compile(
        r'^\s*[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?'
        r'(?:,\s*[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?)*'
        r'(?:,\s*(?:and|&)\s+[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?)?\s*$'
    )

    def flush_paragraph():
        # Emit the accumulated lines as one whitespace-normalized <div>,
        # splitting on sentence-final double newlines into <p> children.
        if current_paragraph:
            para = ' '.join(current_paragraph)
            para = re.sub(r'\s+', ' ', para).strip()
            if para:
                # escaped_para = html.escape(para)
                # NOTE(review): paragraph text is NOT HTML-escaped here
                # (escaping was disabled); only headers/authors/etc. are.
                escaped_para = para
                # escaped_para = re.sub(r'\.\n', '.\n\n', escaped_para)
                # Split escaped_para by '.\n\n' to avoid HTML escaping
                escaped_para = escaped_para.split('.\n\n')
                # Wrap each part in <p> tag
                escaped_para = [f'<p>{part}</p>' for part in escaped_para]
                output.append(f'<div class="paragraph">{"".join(escaped_para)}</div><hr/>')
            current_paragraph.clear()

    for i, line in enumerate(lines):
        line = line.strip()

        # Handle empty lines: paragraph boundary
        if not line:
            flush_paragraph()
            continue

        # Detect article title (first line with reasonable length)
        if not article_title_detected and i == 0 and 3 <= len(line.split()) <= 8 and len(lines) > 1:
            flush_paragraph()
            escaped_line = html.escape(line)
            output.append(f'<h2>{escaped_line}</h2>')
            article_title_detected = True
            continue

        # Detect numbered headers like "2.1 Background" — only when the
        # previous line is blank (i > 0 guards against wrapping to lines[-1]).
        numbered_header = re.match(r'^(\d+(?:\.\d+)*)\s+(.+)$', line)
        if i > 0 and not lines[i-1].strip() and numbered_header:
            flush_paragraph()
            # "2.1" has one dot → level 2; shift by one so top level is <h2>.
            level = numbered_header.group(1).count('.') + 1
            header_text = numbered_header.group(2)
            md_level = min(level + 1, 6)
            escaped_header = html.escape(header_text)
            output.append(f'<h{md_level}>{escaped_header}</h{md_level}>')
            in_header = True
            continue

        # Detect authors (first page only)
        if page_number == 1 and author_pattern.match(line):
            # Strip affiliation daggers; 'â€' looks like mojibake cleanup —
            # NOTE(review): confirm intended byte sequence.
            authors = re.sub(r'[†â€]', '', line)
            authors = re.split(r', | and ', authors)
            formatted_authors = []
            for author in authors:
                if author.strip():
                    parts = [p for p in author.strip().split() if p]
                    formatted = ' '.join(parts)
                    escaped_author = html.escape(formatted)
                    formatted_authors.append(f'<strong>{escaped_author}</strong>')

            # Join with commas and a final "and"
            if len(formatted_authors) > 1:
                joined = ', '.join(formatted_authors[:-1]) + ' and ' + formatted_authors[-1]
            else:
                joined = formatted_authors[0]

            output.append(f'<p>{joined}</p>')
            continue

        # Detect affiliation (lines starting with †)
        if affiliation_pattern.match(line):
            escaped_line = html.escape(line)
            output.append(f'<p><em>{escaped_line}</em></p>')
            continue

        # Detect emails (brace-grouped address blocks like "{a,b}@x.edu")
        if email_pattern.match(line):
            escaped_line = html.escape(line)
            output.append(f'<p><code>{escaped_line}</code></p>')
            continue

        # Detect section headers by keyword / "N Title" shape
        if re.match(r'^(Abstract|\d+\s+[A-Z]|References|Appendix|Figure|Table)', line):
            flush_paragraph()
            escaped_line = html.escape(line)
            output.append(f'<h2 class="section-header"><em>{escaped_line}</em></h2>')
            in_header = True
            continue

        # Handle quotes (lines opening with a double quote)
        if quote_pattern.match(line):
            flush_paragraph()
            escaped_line = html.escape(line)
            output.append(f'<blockquote><p>{escaped_line}</p></blockquote>')
            continue

        # Handle hyphenated words: drop the trailing hyphen so the next
        # line's continuation joins without a break.
        if line.endswith('-'):
            current_paragraph.append(line[:-1].strip())
        else:
            current_paragraph.append(line)

        # Handle paragraph breaks after headers
        if in_header and not line.endswith(('.', '!', '?')):
            flush_paragraph()
            in_header = False

    flush_paragraph()

    # Post-process HTML
    html_output = '\n'.join(output)

    # Fix common citation patterns: "(Smith et al. 2020)" → <cite>
    html_output = re.sub(r'\(([A-Z][a-z]+ et al\. \d{4})\)', r'<cite>\1</cite>', html_output)

    # Fix escaped characters left over from the unicode-escape pass
    html_output = html_output.replace('\\ud835', '').replace('\\u2020', '†')

    # Remove leftover hyphens and fix spacing before punctuation
    html_output = re.sub(r'\s+-\s+', '', html_output)
    html_output = re.sub(r'\s+([.,!?)])', r'\1', html_output)

    return html_output
|
||||||
|
|
||||||
|
def clean_pdf_text(page_number, text):
    """Convert raw PDF-extracted text into cleaned-up Markdown.

    Applies line-based heuristics tuned for academic papers: an article
    title on the first line, numbered section headers ("2.1 Background"),
    author lists (page 1 only), affiliations, e-mail blocks, common
    section keywords, quotes, and re-joined hyphenated line breaks.

    Args:
        page_number: 1-based page number; author detection only runs on page 1.
        text: Raw text as returned by the PDF text extractor.

    Returns:
        A Markdown string with units separated by blank lines.
    """
    # Decode Unicode escapes and handle surrogate pairs
    try:
        decoded = text.encode('latin-1').decode('unicode-escape')
        decoded = decoded.encode('utf-16', 'surrogatepass').decode('utf-16')
    except Exception:
        decoded = text  # Fallback if decoding fails

    article_title_detected = False
    # Treat a sentence-final newline as a paragraph break.
    decoded = re.sub(r'\.\n', '.\n\n', decoded)
    lines = decoded.split('\n')
    output = []
    current_paragraph = []
    in_header = False
    email_pattern = re.compile(r'\{.*?\}')
    affiliation_pattern = re.compile(r'^†')
    quote_pattern = re.compile(r'^["“]')
    # Heuristic for "Name Surname, Name Surname and Name Surname" author rows,
    # optionally suffixed with affiliation markers (†, *, digits).
    author_pattern = re.compile(
        r'^\s*[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?'
        r'(?:,\s*[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?)*'
        r'(?:,\s*(?:and|&)\s+[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\s*(?:[†*0-9]+)?)?\s*$'
    )

    def flush_paragraph():
        # Emit the accumulated lines as one whitespace-normalized paragraph.
        if current_paragraph:
            para = ' '.join(current_paragraph)
            para = re.sub(r'\s+', ' ', para).strip()
            if para:
                output.append(para)
            current_paragraph.clear()

    for i, line in enumerate(lines):
        line = line.strip()

        # Empty line: paragraph boundary
        if not line:
            flush_paragraph()
            continue

        # Detect headline (first line, reasonable length)
        if not article_title_detected and i == 0 and 3 <= len(line.split()) <= 8 and (len(lines) > 1):
            flush_paragraph()
            output.append(f'## {line}')
            article_title_detected = True
            continue

        # (Removed an unreachable `if not line and current_paragraph:` branch:
        # empty lines are already consumed by the first check above.)

        # Detect numbered headers like "2.1 Background" — only when the
        # previous line is blank.  The i > 0 guard fixes a wraparound bug:
        # at i == 0 the old `lines[i-1]` indexed the *last* line.
        numbered_header = re.match(r'^(\d+(?:\.\d+)*)\s+(.+)$', line)
        if i > 0 and not lines[i - 1].strip() and numbered_header:
            flush_paragraph()
            level = numbered_header.group(1).count('.') + 1  # Convert 2.1 → level 2
            header_text = numbered_header.group(2)
            # Shift by one so the top level renders as ## (capped at ######)
            md_level = min(level + 1, 6)  # 1 → ##, 2 → ### etc
            output.append(f'{"#" * md_level} {header_text}')
            in_header = True
            continue

        # Detect authors (first page only)
        if page_number == 1 and author_pattern.match(line):
            # Clean and format author names
            authors = re.sub(r'[†â€]', '', line)  # Remove affiliation markers
            authors = re.split(r', | and ', authors)
            formatted_authors = []
            for author in authors:
                if author.strip():
                    # Handle "First Last" formatting
                    parts = [p for p in author.strip().split() if p]
                    formatted = ' '.join(parts)
                    formatted_authors.append(f'**{formatted}**')

            # Join with commas and "and"
            if len(formatted_authors) > 1:
                joined = ', '.join(formatted_authors[:-1]) + ' and ' + formatted_authors[-1]
            else:
                joined = formatted_authors[0]

            output.append(joined)
            continue

        # Detect affiliation (lines starting with †)
        if affiliation_pattern.match(line):
            output.append(f'*{line}*')
            continue

        # Detect emails (brace-grouped address blocks like "{a,b}@x.edu")
        if email_pattern.match(line):
            output.append(f'`{line}`')
            continue

        # Detect section headers by keyword / "N Title" shape
        if re.match(r'^(Abstract|\d+\s+[A-Z]|References|Appendix|Figure|Table)', line):
            flush_paragraph()
            output.append(f'_[{line}]_')
            in_header = True
            continue

        # Handle hyphenated words: drop the trailing hyphen so the next
        # line's continuation joins without a break.
        if line.endswith('-'):
            current_paragraph.append(line[:-1].strip())
        else:
            current_paragraph.append(line)

        # Handle paragraph breaks after headers
        if in_header and not line.endswith(('.', '!', '?')):
            flush_paragraph()
            in_header = False

    flush_paragraph()

    # Post-processing
    markdown = '\n\n'.join(output)

    # Fix common citation patterns: "(Smith et al. 2020)" → "[Smith et al. 2020]"
    markdown = re.sub(r'\(([A-Z][a-z]+ et al\. \d{4})\)', r'[\1]', markdown)

    # Fix escaped characters left over from the unicode-escape pass
    markdown = markdown.replace('\\ud835', '').replace('\\u2020', '†')

    # Remove leftover hyphens and fix spacing
    markdown = re.sub(r'\s+-\s+', '', markdown)  # Join hyphenated words
    markdown = re.sub(r'\s+([.,!?)])', r'\1', markdown)  # Fix punctuation spacing

    return markdown
|
||||||
Reference in New Issue
Block a user