New async database manager and migration support

- Introduced AsyncDatabaseManager for async DB management.
- Added migration feature to transition to file-based storage.
- Enhanced web crawler with improved caching logic.
- Updated requirements and setup for async processing.
UncleCode committed 2024-11-16 14:54:41 +08:00
parent ae7ebc0bd8
commit d0014c6793
8 changed files with 685 additions and 119 deletions
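Taken together, the changes below add three cache-control parameters to `arun()` and route all cache persistence through the new AsyncDatabaseManager. A minimal usage sketch, assuming the package-level `AsyncWebCrawler` export and its async context manager (which performs the warm-up shown in the first hunk); the URL is illustrative:

```python
import asyncio
from crawl4ai import AsyncWebCrawler  # assumption: package-level export

async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        # Default path: read from the cache when an entry exists.
        cached = await crawler.arun(url="https://example.com")
        # New in this commit: skip cache reads and writes entirely.
        fresh = await crawler.arun(url="https://example.com", disable_cache=True)
        print(cached.success, fresh.success)

asyncio.run(main())
```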


@@ -47,17 +47,17 @@ class AsyncWebCrawler:
     async def awarmup(self):
-        # Print a message for crawl4ai and its version
-        print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}")
         if self.verbose:
+            print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}")
             print("[LOG] 🌤️ Warming up the AsyncWebCrawler")
-        # await async_db_manager.ainit_db()
-        await self.arun(
-            url="https://google.com/",
-            word_count_threshold=5,
-            bypass_cache=False,
-            verbose=False,
-        )
+        await async_db_manager.initialize()
+        # # await async_db_manager.initialize()
+        # await self.arun(
+        #     url="https://google.com/",
+        #     word_count_threshold=5,
+        #     bypass_cache=False,
+        #     verbose=False,
+        # )
         self.ready = True
         if self.verbose:
             print("[LOG] 🌞 AsyncWebCrawler is ready to crawl")
@@ -73,6 +73,9 @@ class AsyncWebCrawler:
         screenshot: bool = False,
         user_agent: str = None,
         verbose=True,
+        disable_cache: bool = False,
+        no_cache_read: bool = False,
+        no_cache_write: bool = False,
         **kwargs,
     ) -> CrawlResult:
         """
@@ -89,6 +92,11 @@ class AsyncWebCrawler:
             CrawlResult: The result of the crawling and processing.
         """
         try:
+            if disable_cache:
+                bypass_cache = True
+                no_cache_read = True
+                no_cache_write = True
             extraction_strategy = extraction_strategy or NoExtractionStrategy()
             extraction_strategy.verbose = verbose
             if not isinstance(extraction_strategy, ExtractionStrategy):
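The guard above makes `disable_cache` the master switch: it overrides the other flags regardless of what the caller passed. The rule in isolation, as a stand-alone mirror rather than library code:

```python
def resolve_cache_flags(disable_cache: bool, bypass_cache: bool,
                        no_cache_read: bool, no_cache_write: bool) -> tuple:
    """disable_cache wins over any individual flag the caller set."""
    if disable_cache:
        bypass_cache = no_cache_read = no_cache_write = True
    return bypass_cache, no_cache_read, no_cache_write

assert resolve_cache_flags(True, False, False, False) == (True, True, True)
assert resolve_cache_flags(False, False, True, False) == (False, True, False)
```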
@@ -108,36 +116,39 @@ class AsyncWebCrawler:
             is_raw_html = url.startswith("raw:")
             _url = url if not is_raw_html else "Raw HTML"
-            if is_web_url and not bypass_cache and not self.always_by_pass_cache:
-                cached = await async_db_manager.aget_cached_url(url)
+            start_time = time.perf_counter()
+            cached_result = None
+            if is_web_url and (not bypass_cache or not no_cache_read) and not self.always_by_pass_cache:
+                cached_result = await async_db_manager.aget_cached_url(url)
+            # if not bypass_cache and not self.always_by_pass_cache:
+            #     cached = await async_db_manager.aget_cached_url(url)
             if kwargs.get("warmup", True) and not self.ready:
                 return None
-            if cached:
-                html = sanitize_input_encode(cached[1])
-                extracted_content = sanitize_input_encode(cached[4])
+            if cached_result:
+                html = sanitize_input_encode(cached_result.html)
+                extracted_content = sanitize_input_encode(cached_result.extracted_content or "")
                 if screenshot:
-                    screenshot_data = cached[9]
+                    screenshot_data = cached_result.screenshot
                     if not screenshot_data:
-                        cached = None
+                        cached_result = None
+                if verbose:
+                    print(
+                        f"[LOG] 1⃣ ✅ Page fetched (cache) for {_url}, success: {bool(html)}, time taken: {time.perf_counter() - start_time:.2f} seconds"
+                    )
-            if not cached or not html:
-                t1 = time.time()
+            if not cached_result or not html:
+                t1 = time.perf_counter()
                 if user_agent:
                     self.crawler_strategy.update_user_agent(user_agent)
                 async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl(url, screenshot=screenshot, **kwargs)
                 html = sanitize_input_encode(async_response.html)
                 screenshot_data = async_response.screenshot
-                t2 = time.time()
+                t2 = time.perf_counter()
                 if verbose:
                     print(
-                        f"[LOG] 🚀 Crawling done for {_url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds"
+                        f"[LOG] 1⃣ ✅ Page fetched (no-cache) for {_url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds"
                     )
+            t1 = time.perf_counter()
             crawl_result = await self.aprocess_html(
                 url=url,
                 html=html,
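The read path stops indexing raw database tuples (`cached[1]`, `cached[4]`, `cached[9]`) and instead reads named attributes off the object returned by `aget_cached_url()`. The diff does not show that class itself; a hypothetical shape consistent with the attribute access above:

```python
from dataclasses import dataclass, field
from typing import Dict, Optional

@dataclass
class CachedResult:
    """Hypothetical: field names inferred from the cached_result.* access in the diff."""
    html: str
    extracted_content: Optional[str] = None
    screenshot: Optional[str] = None
    response_headers: Dict[str, str] = field(default_factory=dict)
```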
@@ -163,30 +174,19 @@ class AsyncWebCrawler:
                 crawl_result.downloaded_files = async_response.downloaded_files
             else:
                 crawl_result.status_code = 200
-                crawl_result.response_headers = cached[10]
-                # crawl_result.downloaded_files = cached[11]
+                crawl_result.response_headers = cached_result.response_headers if cached_result else {}
             crawl_result.success = bool(html)
             crawl_result.session_id = kwargs.get("session_id", None)
+            if verbose:
+                print(
+                    f"[LOG] 🔥 🚀 Crawling done for {_url}, success: {crawl_result.success}, time taken: {time.perf_counter() - start_time:.2f} seconds"
+                )
-            if not is_raw_html:
-                if not bool(cached) or kwargs.get("bypass_cache", False) or self.always_by_pass_cache:
-                    await async_db_manager.acache_url(
-                        url=url,
-                        html=html,
-                        cleaned_html=crawl_result.cleaned_html,
-                        markdown=crawl_result.markdown,
-                        extracted_content=extracted_content,
-                        success=True,
-                        media=json.dumps(crawl_result.media),
-                        links=json.dumps(crawl_result.links),
-                        metadata=json.dumps(crawl_result.metadata),
-                        screenshot=screenshot,
-                        response_headers=json.dumps(crawl_result.response_headers),
-                        downloaded_files=json.dumps(crawl_result.downloaded_files),
-                    )
+            if not is_raw_html and not no_cache_write:
+                if not bool(cached_result) or kwargs.get("bypass_cache", False) or self.always_by_pass_cache:
+                    await async_db_manager.acache_url(crawl_result)
             return crawl_result
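On the write path, `acache_url()` now receives the whole `CrawlResult` instead of a dozen pre-serialized keyword arguments, so JSON encoding becomes the manager's responsibility. The new write guard, mirrored as a pure function for clarity (this restates the condition above; it is not the manager's actual code):

```python
def should_write_cache(is_raw_html: bool, no_cache_write: bool,
                       have_cached: bool, bypass_cache: bool,
                       always_bypass: bool) -> bool:
    """Write back unless the input is raw HTML or writing is explicitly disabled."""
    if is_raw_html or no_cache_write:
        return False
    return (not have_cached) or bypass_cache or always_bypass

assert should_write_cache(False, False, False, False, False) is True
assert should_write_cache(False, True, False, False, False) is False
```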
@@ -258,11 +258,11 @@ class AsyncWebCrawler:
         verbose: bool,
         **kwargs,
     ) -> CrawlResult:
-        t = time.time()
+        t = time.perf_counter()
         # Extract content from HTML
         try:
             _url = url if not kwargs.get("is_raw_html", False) else "Raw HTML"
-            t1 = time.time()
+            t1 = time.perf_counter()
             scrapping_strategy = WebScrapingStrategy()
             # result = await scrapping_strategy.ascrap(
             result = scrapping_strategy.scrap(
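`time.perf_counter()` replaces `time.time()` at every timing site in this commit: it reads a monotonic, high-resolution clock, so measured intervals cannot go negative or jump when the system clock is adjusted (by NTP, for example). The pattern in isolation:

```python
import time

start = time.perf_counter()        # monotonic; safe for measuring intervals
total = sum(range(1_000_000))      # stand-in workload
elapsed = time.perf_counter() - start
print(f"elapsed: {elapsed:.4f} seconds (total={total})")
```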
@@ -276,10 +276,6 @@ class AsyncWebCrawler:
                 ),
                 **kwargs,
             )
-            if verbose:
-                print(
-                    f"[LOG] 🚀 Content extracted for {_url}, success: True, time taken: {time.time() - t1:.2f} seconds"
-                )
             if result is None:
                 raise ValueError(f"Process HTML, Failed to extract content from the website: {url}")
@@ -295,13 +291,14 @@ class AsyncWebCrawler:
             media = result.get("media", [])
             links = result.get("links", [])
             metadata = result.get("metadata", {})
+            if verbose:
+                print(
+                    f"[LOG] 2⃣ ✅ Scraping done for {_url}, success: True, time taken: {time.perf_counter() - t1:.2f} seconds"
+                )
-            if extracted_content is None and extraction_strategy and chunking_strategy:
-                if verbose:
-                    print(
-                        f"[LOG] 🔥 Extracting semantic blocks for {_url}, Strategy: {self.__class__.__name__}"
-                    )
+            if extracted_content is None and extraction_strategy and chunking_strategy and not isinstance(extraction_strategy, NoExtractionStrategy):
+                t1 = time.perf_counter()
                 # Check if extraction strategy is type of JsonCssExtractionStrategy
                 if isinstance(extraction_strategy, JsonCssExtractionStrategy) or isinstance(extraction_strategy, JsonCssExtractionStrategy):
                     extraction_strategy.verbose = verbose
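Extraction is now skipped outright when the strategy is `NoExtractionStrategy`, which is the default filled in earlier in `arun()`. For a strategy that does run, a hedged usage sketch: the module path is an assumption, and the schema follows crawl4ai's JSON-CSS layout with selectors invented for illustration:

```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy  # assumed module path

schema = {
    "name": "headlines",
    "baseSelector": "article",
    "fields": [
        {"name": "title", "selector": "h2", "type": "text"},
        {"name": "link", "selector": "a", "type": "attribute", "attribute": "href"},
    ],
}

async def extract(crawler: "AsyncWebCrawler", url: str):
    # Passing a non-trivial strategy re-enables the extraction stage above.
    return await crawler.arun(url=url, extraction_strategy=JsonCssExtractionStrategy(schema))
```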
@@ -311,11 +308,10 @@ class AsyncWebCrawler:
                 sections = chunking_strategy.chunk(markdown)
                 extracted_content = extraction_strategy.run(url, sections)
                 extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False)
-                if verbose:
-                    print(
-                        f"[LOG] 🚀 Extraction done for {_url}, time taken: {time.time() - t:.2f} seconds."
-                    )
+                if verbose:
+                    print(
+                        f"[LOG] 3⃣ ✅ Extraction done for {_url}, time taken: {time.perf_counter() - t1:.2f} seconds"
+                    )
             screenshot = None if not screenshot else screenshot