[v0.3.72] Enhance content extraction and proxy support

- Add ContentCleaningStrategy for improved content extraction
- Implement advanced proxy configuration with authentication
- Enhance image source detection and handling
- Add fit_markdown and fit_html for refined content output
- Improve external link and image handling flexibility
This commit is contained in:
UncleCode
2024-10-22 20:19:22 +08:00
parent 04d16e6d2b
commit 60ba131ac8
6 changed files with 260 additions and 3 deletions

View File

@@ -212,6 +212,8 @@ class AsyncWebCrawler:
cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
markdown = sanitize_input_encode(result.get("markdown", ""))
fit_markdown = sanitize_input_encode(result.get("fit_markdown", ""))
fit_html = sanitize_input_encode(result.get("fit_html", ""))
media = result.get("media", [])
links = result.get("links", [])
metadata = result.get("metadata", {})
@@ -258,6 +260,8 @@ class AsyncWebCrawler:
html=html,
cleaned_html=format_html(cleaned_html),
markdown=markdown,
fit_markdown=fit_markdown,
fit_html= fit_html,
media=media,
links=links,
metadata=metadata,