New async database manager and migration support

- Introduced AsyncDatabaseManager for async DB management.
- Added migration feature to transition to file-based storage.
- Enhanced web crawler with improved caching logic.
- Updated requirements and setup for async processing.
UncleCode committed 2024-11-16 14:54:41 +08:00
parent ae7ebc0bd8
commit d0014c6793
8 changed files with 685 additions and 119 deletions
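Taken together, the changes below add three cache-control parameters to `arun()` and route all cache persistence through the new AsyncDatabaseManager. A minimal usage sketch, assuming the package-level `AsyncWebCrawler` export and its async context manager (which performs the warm-up shown in the first hunk); the URL is illustrative:

```python
import asyncio
from crawl4ai import AsyncWebCrawler  # assumption: package-level export

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        # Default path: read from the cache when an entry exists.
        cached = await crawler.arun(url="https://example.com")
        # New in this commit: skip cache reads and writes entirely.
        fresh = await crawler.arun(url="https://example.com", disable_cache=True)
        print(cached.success, fresh.success)

asyncio.run(main())
```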


@@ -47,17 +47,17 @@ class AsyncWebCrawler:
     async def awarmup(self):
-        # Print a message for crawl4ai and its version
-        print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}")
         if self.verbose:
+            print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}")
             print("[LOG] 🌤️ Warming up the AsyncWebCrawler")
-        # await async_db_manager.ainit_db()
-        await self.arun(
-            url="https://google.com/",
-            word_count_threshold=5,
-            bypass_cache=False,
-            verbose=False,
-        )
+        await async_db_manager.initialize()
+        # # await async_db_manager.initialize()
+        # await self.arun(
+        #     url="https://google.com/",
+        #     word_count_threshold=5,
+        #     bypass_cache=False,
+        #     verbose=False,
+        # )
         self.ready = True
         if self.verbose:
             print("[LOG] 🌞 AsyncWebCrawler is ready to crawl")
@@ -73,6 +73,9 @@ class AsyncWebCrawler:
         screenshot: bool = False,
         user_agent: str = None,
         verbose=True,
+        disable_cache: bool = False,
+        no_cache_read: bool = False,
+        no_cache_write: bool = False,
         **kwargs,
     ) -> CrawlResult:
         """
@@ -89,6 +92,11 @@ class AsyncWebCrawler:
             CrawlResult: The result of the crawling and processing.
         """
         try:
+            if disable_cache:
+                bypass_cache = True
+                no_cache_read = True
+                no_cache_write = True
             extraction_strategy = extraction_strategy or NoExtractionStrategy()
             extraction_strategy.verbose = verbose
             if not isinstance(extraction_strategy, ExtractionStrategy):
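The guard above makes `disable_cache` the master switch: it overrides the other flags regardless of what the caller passed. The rule in isolation, as a stand-alone mirror rather than library code:

```python
def resolve_cache_flags(disable_cache: bool, bypass_cache: bool,
                        no_cache_read: bool, no_cache_write: bool) -> tuple:
    """disable_cache wins over any individual flag the caller set."""
    if disable_cache:
        bypass_cache = no_cache_read = no_cache_write = True
    return bypass_cache, no_cache_read, no_cache_write

assert resolve_cache_flags(True, False, False, False) == (True, True, True)
assert resolve_cache_flags(False, False, True, False) == (False, True, False)
```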
@@ -108,36 +116,39 @@ class AsyncWebCrawler:
             is_raw_html = url.startswith("raw:")
             _url = url if not is_raw_html else "Raw HTML"
-            if is_web_url and not bypass_cache and not self.always_by_pass_cache:
-                cached = await async_db_manager.aget_cached_url(url)
+            start_time = time.perf_counter()
+            cached_result = None
+            if is_web_url and (not bypass_cache or not no_cache_read) and not self.always_by_pass_cache:
+                cached_result = await async_db_manager.aget_cached_url(url)
+            # if not bypass_cache and not self.always_by_pass_cache:
+            #     cached = await async_db_manager.aget_cached_url(url)
             if kwargs.get("warmup", True) and not self.ready:
                 return None
-            if cached:
-                html = sanitize_input_encode(cached[1])
-                extracted_content = sanitize_input_encode(cached[4])
+            if cached_result:
+                html = sanitize_input_encode(cached_result.html)
+                extracted_content = sanitize_input_encode(cached_result.extracted_content or "")
                 if screenshot:
-                    screenshot_data = cached[9]
+                    screenshot_data = cached_result.screenshot
                     if not screenshot_data:
-                        cached = None
+                        cached_result = None
+                if verbose:
+                    print(
+                        f"[LOG] 1⃣ ✅ Page fetched (cache) for {_url}, success: {bool(html)}, time taken: {time.perf_counter() - start_time:.2f} seconds"
+                    )
-            if not cached or not html:
-                t1 = time.time()
+            if not cached_result or not html:
+                t1 = time.perf_counter()
                 if user_agent:
                     self.crawler_strategy.update_user_agent(user_agent)
                 async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl(url, screenshot=screenshot, **kwargs)
                 html = sanitize_input_encode(async_response.html)
                 screenshot_data = async_response.screenshot
-                t2 = time.time()
+                t2 = time.perf_counter()
                 if verbose:
                     print(
-                        f"[LOG] 🚀 Crawling done for {_url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds"
+                        f"[LOG] 1⃣ ✅ Page fetched (no-cache) for {_url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds"
                     )
+            t1 = time.perf_counter()
             crawl_result = await self.aprocess_html(
                 url=url,
                 html=html,
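The read path stops indexing raw database tuples (`cached[1]`, `cached[4]`, `cached[9]`) and instead reads named attributes off the object returned by `aget_cached_url()`. The diff does not show that class itself; a hypothetical shape consistent with the attribute access above:

```python
from dataclasses import dataclass, field
from typing import Dict, Optional

@dataclass
class CachedResult:
    """Hypothetical: field names inferred from the cached_result.* access in the diff."""
    html: str
    extracted_content: Optional[str] = None
    screenshot: Optional[str] = None
    response_headers: Dict[str, str] = field(default_factory=dict)
```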
@@ -163,30 +174,19 @@ class AsyncWebCrawler:
                 crawl_result.downloaded_files = async_response.downloaded_files
             else:
                 crawl_result.status_code = 200
-                crawl_result.response_headers = cached[10]
-                # crawl_result.downloaded_files = cached[11]
+                crawl_result.response_headers = cached_result.response_headers if cached_result else {}
             crawl_result.success = bool(html)
             crawl_result.session_id = kwargs.get("session_id", None)
+            if verbose:
+                print(
+                    f"[LOG] 🔥 🚀 Crawling done for {_url}, success: {crawl_result.success}, time taken: {time.perf_counter() - start_time:.2f} seconds"
+                )
-            if not is_raw_html:
-                if not bool(cached) or kwargs.get("bypass_cache", False) or self.always_by_pass_cache:
-                    await async_db_manager.acache_url(
-                        url=url,
-                        html=html,
-                        cleaned_html=crawl_result.cleaned_html,
-                        markdown=crawl_result.markdown,
-                        extracted_content=extracted_content,
-                        success=True,
-                        media=json.dumps(crawl_result.media),
-                        links=json.dumps(crawl_result.links),
-                        metadata=json.dumps(crawl_result.metadata),
-                        screenshot=screenshot,
-                        response_headers=json.dumps(crawl_result.response_headers),
-                        downloaded_files=json.dumps(crawl_result.downloaded_files),
-                    )
+            if not is_raw_html and not no_cache_write:
+                if not bool(cached_result) or kwargs.get("bypass_cache", False) or self.always_by_pass_cache:
+                    await async_db_manager.acache_url(crawl_result)
             return crawl_result
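On the write path, `acache_url()` now receives the whole `CrawlResult` instead of a dozen pre-serialized keyword arguments, so JSON encoding becomes the manager's responsibility. The new write guard, mirrored as a pure function for clarity (this restates the condition above; it is not the manager's actual code):

```python
def should_write_cache(is_raw_html: bool, no_cache_write: bool,
                       have_cached: bool, bypass_cache: bool,
                       always_bypass: bool) -> bool:
    """Write back unless the input is raw HTML or writing is explicitly disabled."""
    if is_raw_html or no_cache_write:
        return False
    return (not have_cached) or bypass_cache or always_bypass

assert should_write_cache(False, False, False, False, False) is True
assert should_write_cache(False, True, False, False, False) is False
```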
@@ -258,11 +258,11 @@ class AsyncWebCrawler:
         verbose: bool,
         **kwargs,
     ) -> CrawlResult:
-        t = time.time()
+        t = time.perf_counter()
         # Extract content from HTML
         try:
             _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
-            t1 = time.time()
+            t1 = time.perf_counter()
             scrapping_strategy = WebScrapingStrategy()
             # result = await scrapping_strategy.ascrap(
             result = scrapping_strategy.scrap(
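`time.perf_counter()` replaces `time.time()` at every timing site in this commit: it reads a monotonic, high-resolution clock, so measured intervals cannot go negative or jump when the system clock is adjusted (by NTP, for example). The pattern in isolation:

```python
import time

start = time.perf_counter()        # monotonic; safe for measuring intervals
total = sum(range(1_000_000))      # stand-in workload
elapsed = time.perf_counter() - start
print(f"elapsed: {elapsed:.4f} seconds (total={total})")
```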
@@ -276,10 +276,6 @@ class AsyncWebCrawler:
                 ),
                 **kwargs,
             )
-            if verbose:
-                print(
-                    f"[LOG] 🚀 Content extracted for {_url}, success: True, time taken: {time.time() - t1:.2f} seconds"
-                )
             if result is None:
                 raise ValueError(f"Process HTML, Failed to extract content from the website: {url}")
@@ -295,13 +291,14 @@ class AsyncWebCrawler:
             media = result.get("media", [])
             links = result.get("links", [])
             metadata = result.get("metadata", {})
+            if verbose:
+                print(
+                    f"[LOG] 2⃣ ✅ Scraping done for {_url}, success: True, time taken: {time.perf_counter() - t1:.2f} seconds"
+                )
-            if extracted_content is None and extraction_strategy and chunking_strategy:
-                if verbose:
-                    print(
-                        f"[LOG] 🔥 Extracting semantic blocks for {_url}, Strategy: {self.__class__.__name__}"
-                    )
+            if extracted_content is None and extraction_strategy and chunking_strategy and not isinstance(extraction_strategy, NoExtractionStrategy):
+                t1 = time.perf_counter()
                 # Check if extraction strategy is type of JsonCssExtractionStrategy
                 if isinstance(extraction_strategy, JsonCssExtractionStrategy) or isinstance(extraction_strategy, JsonCssExtractionStrategy):
                     extraction_strategy.verbose = verbose
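Extraction is now skipped outright when the strategy is `NoExtractionStrategy`, which is the default filled in earlier in `arun()`. For a strategy that does run, a hedged usage sketch: the module path is an assumption, and the schema follows crawl4ai's JSON-CSS layout with selectors invented for illustration:

```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy  # assumed module path

schema = {
    "name": "headlines",
    "baseSelector": "article",
    "fields": [
        {"name": "title", "selector": "h2", "type": "text"},
        {"name": "link", "selector": "a", "type": "attribute", "attribute": "href"},
    ],
}

async def extract(crawler: "AsyncWebCrawler", url: str):
    # Passing a non-trivial strategy re-enables the extraction stage above.
    return await crawler.arun(url=url, extraction_strategy=JsonCssExtractionStrategy(schema))
```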
@@ -311,11 +308,10 @@ class AsyncWebCrawler:
                 sections = chunking_strategy.chunk(markdown)
                 extracted_content = extraction_strategy.run(url, sections)
                 extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False)
-                if verbose:
-                    print(
-                        f"[LOG] 🚀 Extraction done for {_url}, time taken: {time.time() - t:.2f} seconds."
-                    )
+                if verbose:
+                    print(
+                        f"[LOG] 3⃣ ✅ Extraction done for {_url}, time taken: {time.perf_counter() - t1:.2f} seconds"
+                    )
             screenshot = None if not screenshot else screenshot