From 17913f5acf28cfac775085b74496d1ed5aafcae6 Mon Sep 17 00:00:00 2001
From: UncleCode
Date: Wed, 13 Nov 2024 20:00:29 +0800
Subject: [PATCH] feat(crawler): support local files and raw HTML input in
 AsyncWebCrawler

---
 crawl4ai/async_webcrawler.py | 49 +++++++++++++++++++++---------------
 1 file changed, 29 insertions(+), 20 deletions(-)

diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index 9d0340dc..8415f9b9 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -104,6 +104,10 @@ class AsyncWebCrawler:
         extracted_content = None
         is_web_url = url.startswith(('http://', 'https://'))
+        is_local_file = url.startswith("file://")
+        is_raw_html = url.startswith("raw:")
+        _url = url if not is_raw_html else "Raw HTML"
+
         if is_web_url and not bypass_cache and not self.always_by_pass_cache:
             cached = await async_db_manager.aget_cached_url(url)
@@ -131,7 +135,7 @@ class AsyncWebCrawler:
                 t2 = time.time()
                 if verbose:
                     print(
-                        f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds"
+                        f"[LOG] 🚀 Crawling done for {_url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds"
                     )

                 crawl_result = await self.aprocess_html(
@@ -147,6 +151,9 @@ class AsyncWebCrawler:
                 is_cached=bool(cached),
                 async_response=async_response,
                 bypass_cache=bypass_cache,
+                is_web_url = is_web_url,
+                is_local_file = is_local_file,
+                is_raw_html = is_raw_html,
                 **kwargs,
             )

@@ -164,8 +171,8 @@ class AsyncWebCrawler:
         except Exception as e:
             if not hasattr(e, "msg"):
                 e.msg = str(e)
-            print(f"[ERROR] 🚫 arun(): Failed to crawl {url}, error: {e.msg}")
-            return CrawlResult(url=url, html="", markdown = f"[ERROR] 🚫 arun(): Failed to crawl {url}, error: {e.msg}", success=False, error_message=e.msg)
+            print(f"[ERROR] 🚫 arun(): Failed to crawl {_url}, error: {e.msg}")
+            return CrawlResult(url=url, html="", markdown = f"[ERROR] 🚫 arun(): Failed to crawl {_url}, error: {e.msg}", success=False, error_message=e.msg)

     async def arun_many(
         self,
@@ -233,6 +240,7 @@ class AsyncWebCrawler:
         t = time.time()
         # Extract content from HTML
         try:
+            _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
             t1 = time.time()
             scrapping_strategy = WebScrapingStrategy()
             # result = await scrapping_strategy.ascrap(
@@ -249,7 +257,7 @@ class AsyncWebCrawler:
             )
             if verbose:
                 print(
-                    f"[LOG] 🚀 Content extracted for {url}, success: True, time taken: {time.time() - t1:.2f} seconds"
+                    f"[LOG] 🚀 Content extracted for {_url}, success: True, time taken: {time.time() - t1:.2f} seconds"
                 )

             if result is None:
@@ -270,7 +278,7 @@ class AsyncWebCrawler:
         if extracted_content is None and extraction_strategy and chunking_strategy:
             if verbose:
                 print(
-                    f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {self.__class__.__name__}"
+                    f"[LOG] 🔥 Extracting semantic blocks for {_url}, Strategy: {self.__class__.__name__}"
                 )

             # Check if extraction strategy is type of JsonCssExtractionStrategy
@@ -285,7 +293,7 @@ class AsyncWebCrawler:
         if verbose:
             print(
-                f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t:.2f} seconds."
+                f"[LOG] 🚀 Extraction done for {_url}, time taken: {time.time() - t:.2f} seconds."
             )

         screenshot = None if not screenshot else screenshot
@@ -296,20 +304,21 @@ class AsyncWebCrawler:
             response_headers = json.dumps(async_response.response_headers, ensure_ascii=False)

-        if not is_cached or kwargs.get("bypass_cache", False) or self.always_by_pass_cache:
-            await async_db_manager.acache_url(
-                url,
-                html,
-                cleaned_html,
-                markdown,
-                extracted_content,
-                True,
-                json.dumps(media),
-                json.dumps(links),
-                json.dumps(metadata),
-                screenshot=screenshot,
-                response_headers=response_headers,
-            )
+        if not kwargs.get("is_raw_html", False):
+            if not is_cached or kwargs.get("bypass_cache", False) or self.always_by_pass_cache:
+                await async_db_manager.acache_url(
+                    url,
+                    html,
+                    cleaned_html,
+                    markdown,
+                    extracted_content,
+                    True,
+                    json.dumps(media),
+                    json.dumps(links),
+                    json.dumps(metadata),
+                    screenshot=screenshot,
+                    response_headers=response_headers,
+                )

         return CrawlResult(
             url=url,
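Taken together, the patch routes three input shapes through the single arun() entry point: http(s) URLs (cached as before), file:// paths, and inline markup prefixed with "raw:", which is logged as "Raw HTML" and never written to the cache. A minimal usage sketch follows; the three prefixes and the CrawlResult fields (success, markdown) come straight from the diff, while the context-manager call style and the example file path are illustrative assumptions rather than part of this commit.

    import asyncio

    from crawl4ai import AsyncWebCrawler

    async def main():
        async with AsyncWebCrawler() as crawler:
            # Web URL: matches is_web_url, so the cache path behaves as before.
            web = await crawler.arun(url="https://example.com")

            # Local file: picked up by the new url.startswith("file://") check.
            # /tmp/page.html is a placeholder path.
            local = await crawler.arun(url="file:///tmp/page.html")

            # Raw HTML: picked up by url.startswith("raw:"); logs print
            # "Raw HTML" instead of the payload, and the is_raw_html guard
            # skips the cache write entirely.
            raw = await crawler.arun(url="raw:<html><body><h1>Hello</h1></body></html>")

            for result in (web, local, raw):
                print(result.success, (result.markdown or "")[:80])

    asyncio.run(main())

Skipping the cache for "raw:" input looks deliberate: the whole HTML payload doubles as the lookup key, so there is presumably nothing stable or compact to cache against.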