feat(pdf): add PDF processing capabilities

Add new PDF processing module with the following features:
- PDF text extraction and formatting to HTML/Markdown
- Image extraction with multiple format support (JPEG, PNG, TIFF)
- Link extraction from PDF documents
- Metadata extraction including title, author, dates
- Support for both local and remote PDF files

Also includes:
- New configuration options for HTML attribute handling
- Internal/external link filtering improvements
- Version bump to 0.4.300b4
2025-01-27 21:24:15 +08:00
parent 54c84079c4
commit f8fd9d9eff
9 changed files with 933 additions and 49 deletions
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -529,6 +529,9 @@ class WebScrapingStrategy(ContentScrapingStrategy):
                        if normalized_href not in external_links_dict:
                            external_links_dict[normalized_href] = link_data
                    else:
+                        if kwargs.get("exclude_internal_links", False):
+                            element.decompose()
+                            return False
                        if normalized_href not in internal_links_dict:
                            internal_links_dict[normalized_href] = link_data

@@ -629,7 +632,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):

            try:
                self.remove_unwanted_attributes(
-                    element, IMPORTANT_ATTRS, kwargs.get("keep_data_attributes", False)
+                    element, IMPORTANT_ATTRS + kwargs.get("keep_attrs", []) , kwargs.get("keep_data_attributes", False)
                )
            except Exception as e:
                # print('Error removing unwanted attributes:', str(e))