New async database manager and migration support
- Introduced AsyncDatabaseManager for async DB management.
- Added migration feature to transition to file-based storage.
- Enhanced web crawler with improved caching logic.
- Updated requirements and setup for async processing.
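For orientation, a minimal usage sketch of the cache controls this commit introduces. The flag names (disable_cache, no_cache_read, no_cache_write) come from the diff below; the async-with usage assumes AsyncWebCrawler's existing context-manager API, and example.com is a placeholder URL.

import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        # Default: read from the cache if the URL is stored, write it back otherwise.
        result = await crawler.arun(url="https://example.com")

        # Force a fresh fetch but still record the result in the cache.
        fresh = await crawler.arun(url="https://example.com", no_cache_read=True)

        # Ignore the cache entirely, for both reads and writes.
        uncached = await crawler.arun(url="https://example.com", disable_cache=True)

        print(result.success, fresh.success, uncached.success)

asyncio.run(main())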
@@ -47,17 +47,17 @@ class AsyncWebCrawler:

     async def awarmup(self):
         # Print a message for crawl4ai and its version
-        print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}")
         if self.verbose:
+            print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}")
             print("[LOG] 🌤️ Warming up the AsyncWebCrawler")
-        # await async_db_manager.ainit_db()
+        await async_db_manager.initialize()
         await self.arun(
             url="https://google.com/",
             word_count_threshold=5,
             bypass_cache=False,
             verbose=False,
         )
+        # # await async_db_manager.initialize()
+        # await self.arun(
+        #     url="https://google.com/",
+        #     word_count_threshold=5,
+        #     bypass_cache=False,
+        #     verbose=False,
+        # )
         self.ready = True
         if self.verbose:
             print("[LOG] 🌞 AsyncWebCrawler is ready to crawl")
@@ -73,6 +73,9 @@ class AsyncWebCrawler:
         screenshot: bool = False,
         user_agent: str = None,
         verbose=True,
+        disable_cache: bool = False,
+        no_cache_read: bool = False,
+        no_cache_write: bool = False,
         **kwargs,
     ) -> CrawlResult:
         """
@@ -89,6 +92,11 @@ class AsyncWebCrawler:
             CrawlResult: The result of the crawling and processing.
         """
         try:
+            if disable_cache:
+                bypass_cache = True
+                no_cache_read = True
+                no_cache_write = True
+
             extraction_strategy = extraction_strategy or NoExtractionStrategy()
             extraction_strategy.verbose = verbose
             if not isinstance(extraction_strategy, ExtractionStrategy):
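The hunk above makes disable_cache a master switch over the finer-grained flags: bypass the lookup, skip reads, and skip writes all at once. A standalone sketch of that precedence rule (the function name is illustrative, not part of the codebase):

def resolve_cache_flags(disable_cache: bool, bypass_cache: bool,
                        no_cache_read: bool, no_cache_write: bool) -> tuple:
    # disable_cache implies all three finer-grained behaviors.
    if disable_cache:
        return True, True, True
    return bypass_cache, no_cache_read, no_cache_write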
@@ -108,36 +116,39 @@ class AsyncWebCrawler:
             is_raw_html = url.startswith("raw:")
             _url = url if not is_raw_html else "Raw HTML"

-            if is_web_url and not bypass_cache and not self.always_by_pass_cache:
-                cached = await async_db_manager.aget_cached_url(url)
+            start_time = time.perf_counter()
+            cached_result = None
+            if is_web_url and (not bypass_cache or not no_cache_read) and not self.always_by_pass_cache:
+                cached_result = await async_db_manager.aget_cached_url(url)

+            # if not bypass_cache and not self.always_by_pass_cache:
+            #     cached = await async_db_manager.aget_cached_url(url)

             if kwargs.get("warmup", True) and not self.ready:
                 return None

-            if cached:
-                html = sanitize_input_encode(cached[1])
-                extracted_content = sanitize_input_encode(cached[4])
+            if cached_result:
+                html = sanitize_input_encode(cached_result.html)
+                extracted_content = sanitize_input_encode(cached_result.extracted_content or "")
                 if screenshot:
-                    screenshot_data = cached[9]
+                    screenshot_data = cached_result.screenshot
                     if not screenshot_data:
-                        cached = None
+                        cached_result = None
                 if verbose:
                     print(
                         f"[LOG] 1️⃣ ✅ Page fetched (cache) for {_url}, success: {bool(html)}, time taken: {time.perf_counter() - start_time:.2f} seconds"
                     )

             if not cached or not html:
-                t1 = time.time()
+                t1 = time.perf_counter()

                 if user_agent:
                     self.crawler_strategy.update_user_agent(user_agent)
                 async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl(url, screenshot=screenshot, **kwargs)
                 html = sanitize_input_encode(async_response.html)
                 screenshot_data = async_response.screenshot
-                t2 = time.time()
+                t2 = time.perf_counter()
                 if verbose:
                     print(
-                        f"[LOG] 🚀 Crawling done for {_url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds"
+                        f"[LOG] 1️⃣ ✅ Page fetched (no-cache) for {_url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds"
                     )

+            t1 = time.perf_counter()
             crawl_result = await self.aprocess_html(
                 url=url,
                 html=html,
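Note the shape change in the cache read path: instead of indexing a raw database row (cached[1], cached[4], cached[9]), the code now reads named attributes off a cached_result object. A hypothetical stand-in for the type aget_cached_url appears to return, inferred only from the attribute accesses above:

from dataclasses import dataclass, field
from typing import Dict, Optional

@dataclass
class CachedResult:
    # Sketch inferred from the hunk above; the real type ships with the
    # new AsyncDatabaseManager and may carry more fields.
    html: str
    extracted_content: Optional[str] = None
    screenshot: Optional[str] = None
    response_headers: Dict[str, str] = field(default_factory=dict)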
@@ -163,30 +174,19 @@ class AsyncWebCrawler:
                 crawl_result.downloaded_files = async_response.downloaded_files
             else:
                 crawl_result.status_code = 200
-                crawl_result.response_headers = cached[10]
-                # crawl_result.downloaded_files = cached[11]
+                crawl_result.response_headers = cached_result.response_headers if cached_result else {}

             crawl_result.success = bool(html)
             crawl_result.session_id = kwargs.get("session_id", None)

             if verbose:
                 print(
                     f"[LOG] 🔥 🚀 Crawling done for {_url}, success: {crawl_result.success}, time taken: {time.perf_counter() - start_time:.2f} seconds"
                 )

-            if not is_raw_html:
-                if not bool(cached) or kwargs.get("bypass_cache", False) or self.always_by_pass_cache:
-                    await async_db_manager.acache_url(
-                        url = url,
-                        html = html,
-                        cleaned_html = crawl_result.cleaned_html,
-                        markdown = crawl_result.markdown,
-                        extracted_content = extracted_content,
-                        success = True,
-                        media = json.dumps(crawl_result.media),
-                        links = json.dumps(crawl_result.links),
-                        metadata = json.dumps(crawl_result.metadata),
-                        screenshot=screenshot,
-                        response_headers=json.dumps(crawl_result.response_headers),
-                        downloaded_files=json.dumps(crawl_result.downloaded_files),
-                    )
+            if not is_raw_html and not no_cache_write:
+                if not bool(cached_result) or kwargs.get("bypass_cache", False) or self.always_by_pass_cache:
+                    await async_db_manager.acache_url(crawl_result)

             return crawl_result
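The write path collapses a dozen individually serialized keyword arguments into a single acache_url(crawl_result) call, which suggests the JSON flattening of media, links, metadata, and headers now lives inside the database manager. A hedged sketch of that design; the class body and in-memory store are assumptions, only the acache_url(crawl_result) call shape comes from the diff:

import json

class AsyncDatabaseManagerSketch:
    """Illustrative only: shows the caller-side simplification, not the
    real file-based storage this commit adds."""

    def __init__(self):
        self._store = {}

    async def acache_url(self, result) -> None:
        # Structured fields are serialized once, in one place, instead of
        # at every call site.
        self._store[result.url] = {
            "html": result.html,
            "markdown": result.markdown,
            "media": json.dumps(result.media),
            "links": json.dumps(result.links),
            "metadata": json.dumps(result.metadata),
            "response_headers": json.dumps(result.response_headers),
        }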
@@ -258,11 +258,11 @@ class AsyncWebCrawler:
         verbose: bool,
         **kwargs,
     ) -> CrawlResult:
-        t = time.time()
+        t = time.perf_counter()
         # Extract content from HTML
         try:
             _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
-            t1 = time.time()
+            t1 = time.perf_counter()
             scrapping_strategy = WebScrapingStrategy()
             # result = await scrapping_strategy.ascrap(
             result = scrapping_strategy.scrap(
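The timing hunks above and below replace time.time() with time.perf_counter(), the appropriate clock for measuring elapsed time: it is monotonic and high-resolution, so logged durations cannot jump or go negative if the system clock is adjusted mid-crawl. For example:

import time

t0 = time.perf_counter()       # monotonic start mark
total = sum(range(1_000_000))  # stand-in workload
print(f"took {time.perf_counter() - t0:.4f} seconds (result={total})")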
@@ -276,10 +276,6 @@ class AsyncWebCrawler:
                 ),
                 **kwargs,
             )
-            if verbose:
-                print(
-                    f"[LOG] 🚀 Content extracted for {_url}, success: True, time taken: {time.time() - t1:.2f} seconds"
-                )

             if result is None:
                 raise ValueError(f"Process HTML, Failed to extract content from the website: {url}")
@@ -295,13 +291,14 @@ class AsyncWebCrawler:
             media = result.get("media", [])
             links = result.get("links", [])
             metadata = result.get("metadata", {})

-            if extracted_content is None and extraction_strategy and chunking_strategy:
-                if verbose:
-                    print(
-                        f"[LOG] 🔥 Extracting semantic blocks for {_url}, Strategy: {self.__class__.__name__}"
-                    )
-
+            if verbose:
+                print(
+                    f"[LOG] 2️⃣ ✅ Scraping done for {_url}, success: True, time taken: {time.perf_counter() - t1:.2f} seconds"
+                )
+
+            if extracted_content is None and extraction_strategy and chunking_strategy and not isinstance(extraction_strategy, NoExtractionStrategy):
+                t1 = time.perf_counter()
                 # Check if extraction strategy is type of JsonCssExtractionStrategy
                 if isinstance(extraction_strategy, JsonCssExtractionStrategy) or isinstance(extraction_strategy, JsonCssExtractionStrategy):
                     extraction_strategy.verbose = verbose
@@ -311,11 +308,10 @@ class AsyncWebCrawler:
                 sections = chunking_strategy.chunk(markdown)
                 extracted_content = extraction_strategy.run(url, sections)
                 extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False)

-                if verbose:
-                    print(
-                        f"[LOG] 🚀 Extraction done for {_url}, time taken: {time.time() - t:.2f} seconds."
-                    )
+                if verbose:
+                    print(
+                        f"[LOG] 3️⃣ ✅ Extraction done for {_url}, time taken: {time.perf_counter() - t1:.2f} seconds"
+                    )

             screenshot = None if not screenshot else screenshot