feat(crawler): support local files and raw HTML input in AsyncWebCrawler
@@ -104,6 +104,10 @@ class AsyncWebCrawler:
         extracted_content = None
 
         is_web_url = url.startswith(('http://', 'https://'))
+        is_local_file = url.startswith("file://")
+        is_raw_html = url.startswith("raw:")
+        _url = url if not is_raw_html else "Raw HTML"
+
         if is_web_url and not bypass_cache and not self.always_by_pass_cache:
             cached = await async_db_manager.aget_cached_url(url)
 
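The hunk above classifies input purely by prefix: in addition to http(s):// URLs, arun() now recognizes file:// paths and inline documents marked with raw:. A minimal usage sketch of the three forms, assuming crawl4ai's usual async context-manager entry point (the prefixes come from the diff; the URLs and HTML here are illustrative):

    import asyncio
    from crawl4ai import AsyncWebCrawler

    async def main():
        async with AsyncWebCrawler() as crawler:
            # is_web_url: fetched over the network, eligible for caching
            web = await crawler.arun(url="https://example.com")
            # is_local_file: read from disk via the file:// scheme
            local = await crawler.arun(url="file:///tmp/saved_page.html")
            # is_raw_html: the document itself follows the raw: prefix
            raw = await crawler.arun(url="raw:<html><body>Hello</body></html>")

    asyncio.run(main())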
@@ -131,7 +135,7 @@ class AsyncWebCrawler:
             t2 = time.time()
             if verbose:
                 print(
-                    f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds"
+                    f"[LOG] 🚀 Crawling done for {_url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds"
                 )
 
             crawl_result = await self.aprocess_html(
@@ -147,6 +151,9 @@ class AsyncWebCrawler:
                 is_cached=bool(cached),
                 async_response=async_response,
                 bypass_cache=bypass_cache,
+                is_web_url = is_web_url,
+                is_local_file = is_local_file,
+                is_raw_html = is_raw_html,
                 **kwargs,
             )
 
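The new flags ride along as keyword arguments into aprocess_html(). For is_raw_html, the @@ -233,6 +240,7 @@ hunk below shows it is read back out of **kwargs via kwargs.get(...) rather than being a declared parameter. The hand-off pattern in isolation (a sketch only; the real aprocess_html signature has many more parameters):

    async def aprocess_html_sketch(url: str, **kwargs) -> str:
        # Pull the flag out of **kwargs, defaulting to False when the caller
        # did not pass it -- mirroring kwargs.get("is_raw_html", False) below.
        is_raw_html = kwargs.get("is_raw_html", False)
        return url if not is_raw_html else "Raw HTML"

Awaiting aprocess_html_sketch("raw:<p>hi</p>", is_raw_html=True) yields "Raw HTML".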
@@ -164,8 +171,8 @@ class AsyncWebCrawler:
         except Exception as e:
             if not hasattr(e, "msg"):
                 e.msg = str(e)
-            print(f"[ERROR] 🚫 arun(): Failed to crawl {url}, error: {e.msg}")
-            return CrawlResult(url=url, html="", markdown = f"[ERROR] 🚫 arun(): Failed to crawl {url}, error: {e.msg}", success=False, error_message=e.msg)
+            print(f"[ERROR] 🚫 arun(): Failed to crawl {_url}, error: {e.msg}")
+            return CrawlResult(url=url, html="", markdown = f"[ERROR] 🚫 arun(): Failed to crawl {_url}, error: {e.msg}", success=False, error_message=e.msg)
 
     async def arun_many(
         self,
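This and the later log-string hunks all make the same substitution: {_url} in place of {url}. The likely motivation: for raw: input, url holds the entire HTML payload, so interpolating it would dump the whole document into a log or error message, while _url collapses it to the fixed label "Raw HTML". Illustration (values hypothetical):

    url = "raw:<html><body>" + "x" * 100_000 + "</body></html>"
    _url = url if not url.startswith("raw:") else "Raw HTML"
    print(f"[ERROR] 🚫 arun(): Failed to crawl {_url}, error: timeout")
    # prints: [ERROR] 🚫 arun(): Failed to crawl Raw HTML, error: timeout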
@@ -233,6 +240,7 @@ class AsyncWebCrawler:
         t = time.time()
         # Extract content from HTML
         try:
+            _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
             t1 = time.time()
             scrapping_strategy = WebScrapingStrategy()
             # result = await scrapping_strategy.ascrap(
@@ -249,7 +257,7 @@ class AsyncWebCrawler:
             )
             if verbose:
                 print(
-                    f"[LOG] 🚀 Content extracted for {url}, success: True, time taken: {time.time() - t1:.2f} seconds"
+                    f"[LOG] 🚀 Content extracted for {_url}, success: True, time taken: {time.time() - t1:.2f} seconds"
                 )
 
             if result is None:
@@ -270,7 +278,7 @@ class AsyncWebCrawler:
             if extracted_content is None and extraction_strategy and chunking_strategy:
                 if verbose:
                     print(
-                        f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {self.__class__.__name__}"
+                        f"[LOG] 🔥 Extracting semantic blocks for {_url}, Strategy: {self.__class__.__name__}"
                     )
 
                 # Check if extraction strategy is type of JsonCssExtractionStrategy
@@ -285,7 +293,7 @@ class AsyncWebCrawler:
 
             if verbose:
                 print(
-                    f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t:.2f} seconds."
+                    f"[LOG] 🚀 Extraction done for {_url}, time taken: {time.time() - t:.2f} seconds."
                 )
 
             screenshot = None if not screenshot else screenshot
@@ -296,20 +304,21 @@ class AsyncWebCrawler:
             response_headers = json.dumps(async_response.response_headers, ensure_ascii=False)
 
 
-            if not is_cached or kwargs.get("bypass_cache", False) or self.always_by_pass_cache:
-                await async_db_manager.acache_url(
-                    url,
-                    html,
-                    cleaned_html,
-                    markdown,
-                    extracted_content,
-                    True,
-                    json.dumps(media),
-                    json.dumps(links),
-                    json.dumps(metadata),
-                    screenshot=screenshot,
-                    response_headers=response_headers,
-                )
+            if not kwargs.get("is_raw_html", False):
+                if not is_cached or kwargs.get("bypass_cache", False) or self.always_by_pass_cache:
+                    await async_db_manager.acache_url(
+                        url,
+                        html,
+                        cleaned_html,
+                        markdown,
+                        extracted_content,
+                        True,
+                        json.dumps(media),
+                        json.dumps(links),
+                        json.dumps(metadata),
+                        screenshot=screenshot,
+                        response_headers=response_headers,
+                    )
 
             return CrawlResult(
                 url=url,
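The final hunk wraps the pre-existing cache write in an is_raw_html guard, so raw documents are never cached: the raw: string is the content itself and makes no sense as a cache key. Note the asymmetry with the first hunk, where the cache lookup happens only for web URLs, while the write path now excludes only raw HTML. The resulting decision, distilled into a standalone predicate (the function is illustrative; the condition is lifted from the diff):

    def should_cache(is_raw_html: bool, is_cached: bool,
                     bypass_cache: bool, always_bypass: bool) -> bool:
        if is_raw_html:
            # raw: input has no stable URL to key a cache entry on
            return False
        # the original condition, now nested one level deeper
        return not is_cached or bypass_cache or always_bypass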