[v0.3.72] Enhance content extraction and proxy support

- Add ContentCleaningStrategy for improved content extraction
- Implement advanced proxy configuration with authentication
- Enhance image source detection and handling
- Add fit_markdown and fit_html for refined content output
- Improve external link and image handling flexibility
2024-10-22 20:19:22 +08:00
parent 04d16e6d2b
commit 60ba131ac8
6 changed files with 260 additions and 3 deletions
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -212,6 +212,8 @@ class AsyncWebCrawler:

        cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
        markdown = sanitize_input_encode(result.get("markdown", ""))
+        fit_markdown = sanitize_input_encode(result.get("fit_markdown", ""))
+        fit_html = sanitize_input_encode(result.get("fit_html", ""))
        media = result.get("media", [])
        links = result.get("links", [])
        metadata = result.get("metadata", {})
@@ -258,6 +260,8 @@ class AsyncWebCrawler:
            html=html,
            cleaned_html=format_html(cleaned_html),
            markdown=markdown,
+            fit_markdown=fit_markdown,
+            fit_html= fit_html,
            media=media,
            links=links,
            metadata=metadata,