Enhance AsyncWebCrawler with smart waiting and screenshot capabilities

- Implement smart_wait function in AsyncPlaywrightCrawlerStrategy
- Add screenshot support to AsyncCrawlResponse and AsyncWebCrawler
- Improve error handling and timeout management in crawling process
- Fix typo in CrawlResult model (responser_headers -> response_headers)
- Update .gitignore to exclude additional files
- Adjust import path in test_basic_crawling.py
2024-10-02 17:34:56 +08:00
parent e0e0db4247
commit 4750810a67
10 changed files with 281 additions and 21 deletions
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -80,7 +80,7 @@ class AsyncWebCrawler:
            
            word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD)

-            async_response : AsyncCrawlResponse = None
+            async_response: AsyncCrawlResponse = None
            cached = None
            screenshot_data = None
            extracted_content = None
@@ -102,15 +102,14 @@ class AsyncWebCrawler:
                t1 = time.time()
                if user_agent:
                    self.crawler_strategy.update_user_agent(user_agent)
-                async_response : AsyncCrawlResponse = await self.crawler_strategy.crawl(url, **kwargs)
+                async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl(url, screenshot=screenshot, **kwargs)
                html = sanitize_input_encode(async_response.html)
+                screenshot_data = async_response.screenshot
                t2 = time.time()
                if verbose:
                    print(
                        f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds"
                    )
-                if screenshot:
-                    screenshot_data = await self.crawler_strategy.take_screenshot(url)

            crawl_result = await self.aprocess_html(
                url,
@@ -127,7 +126,7 @@ class AsyncWebCrawler:
                **kwargs,
            )
            crawl_result.status_code = async_response.status_code if async_response else 200
-            crawl_result.responser_headers = async_response.response_headers if async_response else {}
+            crawl_result.response_headers = async_response.response_headers if async_response else {}
            crawl_result.success = bool(html)
            crawl_result.session_id = kwargs.get("session_id", None)
            return crawl_result