From 137556b3dce373bfd8af09e8bd5f9da0051ba463 Mon Sep 17 00:00:00 2001 From: medo94my Date: Wed, 14 May 2025 16:01:10 +0800 Subject: [PATCH] fix the EXTRACT to match the styling of the other methods --- crawl4ai/async_webcrawler.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 19b98522..9e42b824 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -360,7 +360,7 @@ class AsyncWebCrawler: pdf_data=pdf_data, verbose=config.verbose, is_raw_html=True if url.startswith("raw:") else False, - redirected_url=async_response.redirected_url, + redirected_url=async_response.redirected_url, **kwargs, ) @@ -503,7 +503,7 @@ class AsyncWebCrawler: tables = media.pop("tables", []) links = result.links.model_dump() metadata = result.metadata - + fit_html = preprocess_html_for_schema(html_content=html, text_threshold= 500, max_size= 300_000) ################################ @@ -585,11 +585,13 @@ class AsyncWebCrawler: # Choose content based on input_format content_format = config.extraction_strategy.input_format if content_format == "fit_markdown" and not markdown_result.fit_markdown: - self.logger.warning( - message="Fit markdown requested but not available. Falling back to raw markdown.", - tag="EXTRACT", - params={"url": _url}, - ) + + self.logger.url_status( + url=_url, + success=bool(html), + timing=time.perf_counter() - t1, + tag="EXTRACT", + ) content_format = "markdown" content = {