perf(crawler): major performance improvements & raw HTML support

- Switch to lxml parser (~4x speedup)
- Add raw HTML & local file crawling support
- Fix cache headers & async cleanup
- Add browser process monitoring
- Optimize BeautifulSoup operations
- Pre-compile regex patterns

Breaking: Raw HTML handling requires new URL prefixes
Fixes: #256, #253
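
A minimal usage sketch of the new URL prefixes described above, assuming the usual async context-manager entry point and crawl4ai's public AsyncWebCrawler import; the file path and markup are placeholders:

import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler() as crawler:
        # Web URL: unchanged behavior.
        web = await crawler.arun(url="https://example.com")
        # Local file: prefix the absolute path with file://
        local = await crawler.arun(url="file:///tmp/page.html")
        # Raw HTML: prefix the markup with raw:
        raw = await crawler.arun(url="raw:<html><body><h1>Hello</h1></body></html>")
        print(web.success, local.success, raw.success)

asyncio.run(main())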
UncleCode
2024-11-13 19:40:40 +08:00
parent 61b93ebf36
commit c38ac29edb
11 changed files with 2953 additions and 130 deletions


@@ -9,7 +9,7 @@ from .async_database import async_db_manager
from .chunking_strategy import *
from .extraction_strategy import *
from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
from .content_scrapping_strategy import WebScrappingStrategy
from .content_scrapping_strategy import WebScrapingStrategy
from .config import MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
from .utils import (
sanitize_input_encode,
@@ -47,17 +47,17 @@ class AsyncWebCrawler:
async def awarmup(self):
# Print a message for crawl4ai and its version
print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}")
if self.verbose:
print(f"[LOG] 🚀 Crawl4AI {crawl4ai_version}")
print("[LOG] 🌤️ Warming up the AsyncWebCrawler")
# await async_db_manager.ainit_db()
await async_db_manager.initialize()
# await self.arun(
# url="https://google.com/",
# word_count_threshold=5,
# bypass_cache=False,
# verbose=False,
# )
await self.arun(
url="https://google.com/",
word_count_threshold=5,
bypass_cache=False,
verbose=False,
)
self.ready = True
if self.verbose:
print("[LOG] 🌞 AsyncWebCrawler is ready to crawl")
@@ -75,6 +75,19 @@ class AsyncWebCrawler:
verbose=True,
**kwargs,
) -> CrawlResult:
"""
Runs the crawler for a single source: URL (web, local file, or raw HTML).
Args:
url (str): The URL to crawl. Supported prefixes:
- 'http://' or 'https://': Web URL to crawl.
- 'file://': Local file path to process.
- 'raw:': Raw HTML content to process.
... [other existing parameters]
Returns:
CrawlResult: The result of the crawling and processing.
"""
try:
extraction_strategy = extraction_strategy or NoExtractionStrategy()
extraction_strategy.verbose = verbose
@@ -89,8 +102,13 @@ class AsyncWebCrawler:
cached = None
screenshot_data = None
extracted_content = None
if not bypass_cache and not self.always_by_pass_cache:
is_web_url = url.startswith(('http://', 'https://'))
if is_web_url and not bypass_cache and not self.always_by_pass_cache:
cached = await async_db_manager.aget_cached_url(url)
# if not bypass_cache and not self.always_by_pass_cache:
# cached = await async_db_manager.aget_cached_url(url)
if kwargs.get("warmup", True) and not self.ready:
return None
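
As the hunk above shows, the cache lookup is now gated on http(s) URLs, so raw: and file:// inputs always skip the cache; a brief sketch of that behavior (the context-manager usage and inline HTML are assumptions, the bypass_cache flag comes from this diff):

import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler() as crawler:
        # A web URL may be served from cache unless bypass_cache=True is passed.
        page = await crawler.arun(url="https://example.com", bypass_cache=False)
        # Raw HTML is not a web URL, so the cache lookup is skipped entirely.
        snippet = await crawler.arun(url="raw:<h1>Hello</h1>", bypass_cache=False)
        print(page.success, snippet.success)

asyncio.run(main())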
@@ -117,25 +135,32 @@ class AsyncWebCrawler:
)
crawl_result = await self.aprocess_html(
url,
html,
extracted_content,
word_count_threshold,
extraction_strategy,
chunking_strategy,
css_selector,
screenshot_data,
verbose,
bool(cached),
url=url,
html=html,
extracted_content=extracted_content,
word_count_threshold=word_count_threshold,
extraction_strategy=extraction_strategy,
chunking_strategy=chunking_strategy,
css_selector=css_selector,
screenshot=screenshot_data,
verbose=verbose,
is_cached=bool(cached),
async_response=async_response,
bypass_cache=bypass_cache,
**kwargs,
)
crawl_result.status_code = async_response.status_code if async_response else 200
crawl_result.response_headers = async_response.response_headers if async_response else {}
if async_response:
crawl_result.status_code = async_response.status_code
crawl_result.response_headers = async_response.response_headers
else:
crawl_result.status_code = 200
crawl_result.response_headers = cached[10]
crawl_result.success = bool(html)
crawl_result.session_id = kwargs.get("session_id", None)
return crawl_result
except Exception as e:
if not hasattr(e, "msg"):
e.msg = str(e)
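
With the change above, CrawlResult carries the HTTP status code and response headers whether the page came from a live response or from the cache (which reports 200 and the stored headers); a short sketch of reading them, with attribute names taken from this diff and the URL as a placeholder:

import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com")
        # Populated from the live response, or from the cached row on a cache hit.
        print(result.status_code, result.response_headers)

asyncio.run(main())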
@@ -155,22 +180,40 @@ class AsyncWebCrawler:
verbose=True,
**kwargs,
) -> List[CrawlResult]:
tasks = [
self.arun(
url,
word_count_threshold,
extraction_strategy,
chunking_strategy,
bypass_cache,
css_selector,
screenshot,
user_agent,
verbose,
**kwargs
)
for url in urls
]
return await asyncio.gather(*tasks)
"""
Runs the crawler for multiple sources: URLs (web, local files, or raw HTML).
Args:
urls (List[str]): A list of URLs with supported prefixes:
- 'http://' or 'https://': Web URL to crawl.
- 'file://': Local file path to process.
- 'raw:': Raw HTML content to process.
... [other existing parameters]
Returns:
List[CrawlResult]: The results of the crawling and processing.
"""
semaphore_count = kwargs.get('semaphore_count', 5) # Adjust as needed
semaphore = asyncio.Semaphore(semaphore_count)
async def crawl_with_semaphore(url):
async with semaphore:
return await self.arun(
url,
word_count_threshold=word_count_threshold,
extraction_strategy=extraction_strategy,
chunking_strategy=chunking_strategy,
bypass_cache=bypass_cache,
css_selector=css_selector,
screenshot=screenshot,
user_agent=user_agent,
verbose=verbose,
**kwargs,
)
tasks = [crawl_with_semaphore(url) for url in urls]
results = await asyncio.gather(*tasks, return_exceptions=True)
return [result if not isinstance(result, Exception) else str(result) for result in results]
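
The rewritten arun_many above bounds concurrency with a semaphore and returns exceptions as strings instead of raising; a hedged usage sketch (semaphore_count is the kwarg read above, default 5; the URLs are placeholders):

import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    urls = [
        "https://example.com",
        "file:///tmp/saved_page.html",
        "raw:<html><body><p>inline</p></body></html>",
    ]
    async with AsyncWebCrawler() as crawler:
        # At most 3 crawls run at once; any failure comes back as an error string.
        results = await crawler.arun_many(urls=urls, semaphore_count=3)
        for r in results:
            print(r if isinstance(r, str) else r.url)

asyncio.run(main())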
async def aprocess_html(
self,
@@ -184,13 +227,14 @@ class AsyncWebCrawler:
screenshot: str,
verbose: bool,
is_cached: bool,
async_response: Optional[AsyncCrawlResponse],
**kwargs,
) -> CrawlResult:
t = time.time()
# Extract content from HTML
try:
t1 = time.time()
scrapping_strategy = WebScrappingStrategy()
scrapping_strategy = WebScrapingStrategy()
# result = await scrapping_strategy.ascrap(
result = scrapping_strategy.scrap(
url,
@@ -245,6 +289,12 @@ class AsyncWebCrawler:
)
screenshot = None if not screenshot else screenshot
response_headers = "{}" # Default value
if async_response:
# Serialize response_headers dict to JSON string
response_headers = json.dumps(async_response.response_headers, ensure_ascii=False)
if not is_cached or kwargs.get("bypass_cache", False) or self.always_by_pass_cache:
await async_db_manager.acache_url(
@@ -258,6 +308,7 @@ class AsyncWebCrawler:
json.dumps(links),
json.dumps(metadata),
screenshot=screenshot,
response_headers=response_headers,
)
return CrawlResult(