Enhance crawler strategies with new features

- Reimplemented JsonXPathExtractionStrategy for enhanced JSON data extraction.
  - Updated existing extraction strategies for better performance.
  - Improved handling of response status codes during crawls.
This commit is contained in:
UncleCode
2024-12-17 22:40:10 +08:00
parent 4a5f1aebee
commit 393bb911c0
4 changed files with 48 additions and 41 deletions

View File

@@ -274,6 +274,7 @@ class AsyncWebCrawler:
if cached_result:
html = sanitize_input_encode(cached_result.html)
extracted_content = sanitize_input_encode(cached_result.extracted_content or "")
extracted_content = None if not extracted_content or extracted_content == "[]" else extracted_content
# If a screenshot is requested but it's not in the cache, then set cached_result to None
screenshot_data = cached_result.screenshot
pdf_data = cached_result.pdf
@@ -476,7 +477,7 @@ class AsyncWebCrawler:
t1 = time.perf_counter()
# Handle different extraction strategy types
if isinstance(config.extraction_strategy, (JsonCssExtractionStrategy, JsonCssExtractionStrategy)):
if isinstance(config.extraction_strategy, (JsonCssExtractionStrategy, JsonXPathExtractionStrategy)):
config.extraction_strategy.verbose = verbose
extracted_content = config.extraction_strategy.run(url, [html])
extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False)