Fix the EXTRACT log call to match the styling of the other methods
This commit is contained in:
@@ -360,7 +360,7 @@ class AsyncWebCrawler:
|
|||||||
pdf_data=pdf_data,
|
pdf_data=pdf_data,
|
||||||
verbose=config.verbose,
|
verbose=config.verbose,
|
||||||
is_raw_html=True if url.startswith("raw:") else False,
|
is_raw_html=True if url.startswith("raw:") else False,
|
||||||
redirected_url=async_response.redirected_url,
|
redirected_url=async_response.redirected_url,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -503,7 +503,7 @@ class AsyncWebCrawler:
|
|||||||
tables = media.pop("tables", [])
|
tables = media.pop("tables", [])
|
||||||
links = result.links.model_dump()
|
links = result.links.model_dump()
|
||||||
metadata = result.metadata
|
metadata = result.metadata
|
||||||
|
|
||||||
fit_html = preprocess_html_for_schema(html_content=html, text_threshold= 500, max_size= 300_000)
|
fit_html = preprocess_html_for_schema(html_content=html, text_threshold= 500, max_size= 300_000)
|
||||||
|
|
||||||
################################
|
################################
|
||||||
@@ -585,11 +585,13 @@ class AsyncWebCrawler:
|
|||||||
# Choose content based on input_format
|
# Choose content based on input_format
|
||||||
content_format = config.extraction_strategy.input_format
|
content_format = config.extraction_strategy.input_format
|
||||||
if content_format == "fit_markdown" and not markdown_result.fit_markdown:
|
if content_format == "fit_markdown" and not markdown_result.fit_markdown:
|
||||||
self.logger.warning(
|
|
||||||
message="Fit markdown requested but not available. Falling back to raw markdown.",
|
self.logger.url_status(
|
||||||
tag="EXTRACT",
|
url=_url,
|
||||||
params={"url": _url},
|
success=bool(html),
|
||||||
)
|
timing=time.perf_counter() - t1,
|
||||||
|
tag="EXTRACT",
|
||||||
|
)
|
||||||
content_format = "markdown"
|
content_format = "markdown"
|
||||||
|
|
||||||
content = {
|
content = {
|
||||||
|
|||||||
Reference in New Issue
Block a user