Fix the EXTRACT log call to match the styling of the other methods

This commit is contained in:
medo94my
2025-05-14 16:01:10 +08:00
parent 260e2dc347
commit 137556b3dc

View File

@@ -360,7 +360,7 @@ class AsyncWebCrawler:
pdf_data=pdf_data, pdf_data=pdf_data,
verbose=config.verbose, verbose=config.verbose,
is_raw_html=True if url.startswith("raw:") else False, is_raw_html=True if url.startswith("raw:") else False,
redirected_url=async_response.redirected_url, redirected_url=async_response.redirected_url,
**kwargs, **kwargs,
) )
@@ -503,7 +503,7 @@ class AsyncWebCrawler:
tables = media.pop("tables", []) tables = media.pop("tables", [])
links = result.links.model_dump() links = result.links.model_dump()
metadata = result.metadata metadata = result.metadata
fit_html = preprocess_html_for_schema(html_content=html, text_threshold= 500, max_size= 300_000) fit_html = preprocess_html_for_schema(html_content=html, text_threshold= 500, max_size= 300_000)
################################ ################################
@@ -585,11 +585,13 @@ class AsyncWebCrawler:
# Choose content based on input_format # Choose content based on input_format
content_format = config.extraction_strategy.input_format content_format = config.extraction_strategy.input_format
if content_format == "fit_markdown" and not markdown_result.fit_markdown: if content_format == "fit_markdown" and not markdown_result.fit_markdown:
- self.logger.warning(
- message="Fit markdown requested but not available. Falling back to raw markdown.",
- tag="EXTRACT",
- params={"url": _url},
- )
+
+ self.logger.url_status(
+ url=_url,
+ success=bool(html),
+ timing=time.perf_counter() - t1,
+ tag="EXTRACT",
+ )
content_format = "markdown" content_format = "markdown"
content = { content = {