feat(pdf): add PDF processing capabilities
Add new PDF processing module with the following features:
- PDF text extraction and formatting to HTML/Markdown
- Image extraction with multiple format support (JPEG, PNG, TIFF)
- Link extraction from PDF documents
- Metadata extraction including title, author, dates
- Support for both local and remote PDF files

Also includes:
- New configuration options for HTML attribute handling
- Internal/external link filtering improvements
- Version bump to 0.4.300b4
This commit is contained in:
@@ -529,6 +529,9 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
if normalized_href not in external_links_dict:
|
||||
external_links_dict[normalized_href] = link_data
|
||||
else:
|
||||
if kwargs.get("exclude_internal_links", False):
|
||||
element.decompose()
|
||||
return False
|
||||
if normalized_href not in internal_links_dict:
|
||||
internal_links_dict[normalized_href] = link_data
|
||||
|
||||
@@ -629,7 +632,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
|
||||
try:
|
||||
self.remove_unwanted_attributes(
|
||||
element, IMPORTANT_ATTRS, kwargs.get("keep_data_attributes", False)
|
||||
element, IMPORTANT_ATTRS + kwargs.get("keep_attrs", []) , kwargs.get("keep_data_attributes", False)
|
||||
)
|
||||
except Exception as e:
|
||||
# print('Error removing unwanted attributes:', str(e))
|
||||
|
||||
Reference in New Issue
Block a user