feat(pdf): add PDF processing capabilities

Add a new PDF processing module with the following features (see the usage sketch below):
- PDF text extraction and formatting to HTML/Markdown
- Image extraction with multiple format support (JPEG, PNG, TIFF)
- Link extraction from PDF documents
- Metadata extraction including title, author, dates
- Support for both local and remote PDF files

Also includes:
- New configuration options for HTML attribute handling
- Internal/external link filtering improvements
- Version bump to 0.4.300b4
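
A minimal usage sketch of the new module. The import path `crawl4ai.processors.pdf` and the strategy names `PDFCrawlerStrategy` / `PDFContentScrapingStrategy` are assumptions inferred from this changeset, not confirmed API:

    import asyncio
    from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
    # Assumed module/class names for the new PDF processing feature:
    from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy

    async def main():
        # PDFCrawlerStrategy fetches the PDF (local path or remote URL);
        # PDFContentScrapingStrategy extracts text, images, links, and metadata.
        async with AsyncWebCrawler(crawler_strategy=PDFCrawlerStrategy()) as crawler:
            result = await crawler.arun(
                url="https://example.com/sample.pdf",
                config=CrawlerRunConfig(scraping_strategy=PDFContentScrapingStrategy()),
            )
            print(result.markdown)   # extracted text rendered as Markdown
            print(result.metadata)   # title, author, dates
            print(result.links)      # links found inside the document

    asyncio.run(main())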
UncleCode committed 2025-01-27 21:24:15 +08:00
commit f8fd9d9eff (parent 54c84079c4)
9 changed files with 933 additions and 49 deletions


@@ -319,14 +319,6 @@ class AsyncWebCrawler:
             try:
                 # Handle configuration
                 if crawler_config is not None:
-                    # if any(param is not None for param in [
-                    #     word_count_threshold, extraction_strategy, chunking_strategy,
-                    #     content_filter, cache_mode, css_selector, screenshot, pdf
-                    # ]):
-                    #     self.logger.warning(
-                    #         message="Both crawler_config and legacy parameters provided. crawler_config will take precedence.",
-                    #         tag="WARNING"
-                    #     )
                     config = crawler_config
                 else:
                     # Merge all parameters into a single kwargs dict for config creation
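
The precedence shown above means a full config object wins over legacy keyword arguments. A sketch of the two call styles, using parameter names that appear in this diff (`word_count_threshold`, `screenshot`); `CrawlerRunConfig` is the config class consumed by `arun`:

    from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

    async def demo(crawler: AsyncWebCrawler, url: str):
        # Preferred: one config object; it takes precedence over legacy kwargs.
        config = CrawlerRunConfig(word_count_threshold=10, screenshot=True)
        result = await crawler.arun(url, config=config)

        # Legacy: loose kwargs, merged into a config internally (deprecated path).
        result = await crawler.arun(url, word_count_threshold=10, screenshot=True)
        return result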
@@ -350,14 +342,6 @@ class AsyncWebCrawler:
                 # Handle deprecated cache parameters
                 if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]):
-                    if kwargs.get("warning", True):
-                        warnings.warn(
-                            "Cache control boolean flags are deprecated and will be removed in version 0.5.0. "
-                            "Use 'cache_mode' parameter instead.",
-                            DeprecationWarning,
-                            stacklevel=2,
-                        )
                     # Convert legacy parameters if cache_mode not provided
                     if config.cache_mode is None:
                         config.cache_mode = _legacy_to_cache_mode(
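
Per the removed warning's advice, each boolean flag collapses into a single `cache_mode` value. A migration sketch, assuming the `CacheMode` enum that `_legacy_to_cache_mode` converts to has a `BYPASS` member matching `bypass_cache`:

    from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig

    async def demo(crawler: AsyncWebCrawler, url: str):
        # Deprecated: boolean cache flags, slated for removal in 0.5.0.
        result = await crawler.arun(url, bypass_cache=True)

        # Preferred: a single explicit cache mode.
        config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
        result = await crawler.arun(url, config=config)
        return result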
@@ -430,7 +414,9 @@ class AsyncWebCrawler:
                         response_headers={"X-Robots-Status": "Blocked by robots.txt"}
                     )
-                # Pass config to crawl method
+                ##############################
+                # Call CrawlerStrategy.crawl #
+                ##############################
                 async_response = await self.crawler_strategy.crawl(
                     url,
                     config=config, # Pass the entire config object
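
The fetch itself is delegated to whatever strategy the crawler was constructed with, so swapping strategies swaps the backend. A sketch; `AsyncPlaywrightCrawlerStrategy` is the library's browser-based default, and the import path is an assumption:

    from crawl4ai import AsyncWebCrawler
    from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy  # assumed path

    async def demo(url: str):
        strategy = AsyncPlaywrightCrawlerStrategy()  # browser-based fetching
        async with AsyncWebCrawler(crawler_strategy=strategy) as crawler:
            # arun() ultimately calls strategy.crawl(url, config=config), as above.
            return await crawler.arun(url)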
@@ -448,7 +434,9 @@ class AsyncWebCrawler:
                         tag="FETCH",
                     )
-                # Process the HTML content
+                ###############################################################
+                # Process the HTML content, Call CrawlerStrategy.process_html #
+                ###############################################################
                 crawl_result : CrawlResult = await self.aprocess_html(
                     url=url,
                     html=html,
@@ -469,26 +457,6 @@ class AsyncWebCrawler:
                         async_response.ssl_certificate
                     ) # Add SSL certificate
-                # # Check and set values from async_response to crawl_result
-                # try:
-                #     for key in vars(async_response):
-                #         if hasattr(crawl_result, key):
-                #             value = getattr(async_response, key, None)
-                #             current_value = getattr(crawl_result, key, None)
-                #             if value is not None and not current_value:
-                #                 try:
-                #                     setattr(crawl_result, key, value)
-                #                 except Exception as e:
-                #                     self.logger.warning(
-                #                         message=f"Failed to set attribute {key}: {str(e)}",
-                #                         tag="WARNING"
-                #                     )
-                # except Exception as e:
-                #     self.logger.warning(
-                #         message=f"Error copying response attributes: {str(e)}",
-                #         tag="WARNING"
-                #     )
                 crawl_result.success = bool(html)
                 crawl_result.session_id = getattr(config, "session_id", None)
@@ -538,8 +506,6 @@ class AsyncWebCrawler:
                     f"Error: {str(e)}\n\n"
                     f"Code context:\n{error_context['code_context']}"
                 )
-                # if not hasattr(e, "msg"):
-                #     e.msg = str(e)
                 self.logger.error_status(
                     url=url,
@@ -578,6 +544,7 @@ class AsyncWebCrawler:
        Returns:
            CrawlResult: Processed result containing extracted and formatted content
        """
+        cleaned_html = ""
        try:
            _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
            t1 = time.perf_counter()
@@ -592,6 +559,10 @@ class AsyncWebCrawler:
            # add keys from kwargs to params that doesn't exist in params
            params.update({k: v for k, v in kwargs.items() if k not in params.keys()})
+            ################################
+            # Scraping Strategy Execution  #
+            ################################
            result = scraping_strategy.scrap(url, html, **params)
            if result is None:
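
The `params.update(...)` line above merges kwargs in without clobbering anything already present: existing keys win, missing keys are filled. A standalone illustration of the idiom:

    params = {"only_text": True, "keep_data_attributes": False}
    kwargs = {"only_text": False, "is_raw_html": True}

    # Copy only the kwargs whose keys are absent from params.
    params.update({k: v for k, v in kwargs.items() if k not in params})

    assert params == {
        "only_text": True,             # kept; the conflicting kwarg is ignored
        "keep_data_attributes": False,
        "is_raw_html": True,           # added; it was missing from params
    }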
@@ -618,7 +589,9 @@ class AsyncWebCrawler:
                links = result.links.model_dump()
                metadata = result.metadata
-            # Markdown Generation
+            ################################
+            # Generate Markdown            #
+            ################################
            markdown_generator: Optional[MarkdownGenerationStrategy] = (
                config.markdown_generator or DefaultMarkdownGenerator()
            )
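
`DefaultMarkdownGenerator` is only the fallback; callers can supply their own generator through the config. A sketch of doing so; the `content_filter` argument and `PruningContentFilter` come from the library's filtering module and are assumptions here, since this diff does not show them:

    from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
    from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
    from crawl4ai.content_filter_strategy import PruningContentFilter  # assumed path

    async def demo(crawler: AsyncWebCrawler, url: str):
        config = CrawlerRunConfig(
            markdown_generator=DefaultMarkdownGenerator(
                content_filter=PruningContentFilter()  # also yields fit_markdown
            )
        )
        result = await crawler.arun(url, config=config)
        return result.markdown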
@@ -644,14 +617,15 @@ class AsyncWebCrawler:
                params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000)},
            )
-            # Handle content extraction if needed
+            #################################
+            # Structured Content Extraction #
+            #################################
            if (
                not bool(extracted_content)
                and config.extraction_strategy
                and not isinstance(config.extraction_strategy, NoExtractionStrategy)
            ):
                t1 = time.perf_counter()
                # Choose content based on input_format
                content_format = config.extraction_strategy.input_format
                if content_format == "fit_markdown" and not markdown_result.fit_markdown:
@@ -665,6 +639,7 @@ class AsyncWebCrawler:
                content = {
                    "markdown": markdown,
                    "html": html,
+                    "cleaned_html": cleaned_html,
                    "fit_markdown": markdown_result.raw_markdown,
                }.get(content_format, markdown)
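
With this change an extraction strategy can request `cleaned_html` as its input, alongside the existing `markdown`, `html`, and `fit_markdown` formats; unrecognized values fall back to markdown. A sketch wiring it up; the strategy class and schema are illustrative, and only `input_format` is the point:

    from crawl4ai import CrawlerRunConfig
    from crawl4ai.extraction_strategy import JsonCssExtractionStrategy  # assumed path

    schema = {  # hypothetical extraction schema
        "name": "headings",
        "baseSelector": "h2",
        "fields": [{"name": "text", "type": "text"}],
    }
    strategy = JsonCssExtractionStrategy(schema)
    strategy.input_format = "cleaned_html"  # newly supported by this commit

    config = CrawlerRunConfig(extraction_strategy=strategy)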