Fix the EXTRACT log output to match the styling of the other methods

This commit is contained in:
medo94my
2025-05-14 16:01:10 +08:00
parent 260e2dc347
commit 137556b3dc

View File

@@ -360,7 +360,7 @@ class AsyncWebCrawler:
pdf_data=pdf_data,
verbose=config.verbose,
is_raw_html=True if url.startswith("raw:") else False,
redirected_url=async_response.redirected_url,
redirected_url=async_response.redirected_url,
**kwargs,
)
@@ -503,7 +503,7 @@ class AsyncWebCrawler:
tables = media.pop("tables", [])
links = result.links.model_dump()
metadata = result.metadata
fit_html = preprocess_html_for_schema(html_content=html, text_threshold= 500, max_size= 300_000)
################################
@@ -585,11 +585,13 @@ class AsyncWebCrawler:
# Choose content based on input_format
content_format = config.extraction_strategy.input_format
if content_format == "fit_markdown" and not markdown_result.fit_markdown:
self.logger.warning(
message="Fit markdown requested but not available. Falling back to raw markdown.",
tag="EXTRACT",
params={"url": _url},
)
self.logger.url_status(
url=_url,
success=bool(html),
timing=time.perf_counter() - t1,
tag="EXTRACT",
)
content_format = "markdown"
content = {