Enhance crawler strategies with new features

- Reimplemented JsonXPathExtractionStrategy for enhanced JSON data extraction.
- Updated existing extraction strategies for better performance.
- Improved handling of response status codes during crawls.
2024-12-17 22:40:10 +08:00
parent 4a5f1aebee
commit 393bb911c0
4 changed files with 48 additions and 41 deletions
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -274,6 +274,7 @@ class AsyncWebCrawler:
                    if cached_result:
                        html = sanitize_input_encode(cached_result.html)
                        extracted_content = sanitize_input_encode(cached_result.extracted_content or "")
+                        extracted_content = None if not extracted_content or extracted_content == "[]" else extracted_content
                        # If screenshot is requested but its not in cache, then set cache_result to None
                        screenshot_data = cached_result.screenshot
                        pdf_data = cached_result.pdf
@@ -476,7 +477,7 @@ class AsyncWebCrawler:
                t1 = time.perf_counter()
                
                # Handle different extraction strategy types
-                if isinstance(config.extraction_strategy, (JsonCssExtractionStrategy, JsonCssExtractionStrategy)):
+                if isinstance(config.extraction_strategy, (JsonCssExtractionStrategy, JsonXPathExtractionStrategy)):
                    config.extraction_strategy.verbose = verbose
                    extracted_content = config.extraction_strategy.run(url, [html])
                    extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False)