Merge next and resolve conflicts

Aravind Karnam
2025-04-02 12:18:23 +05:30
31 changed files with 4824 additions and 2438 deletions

crawl4ai/async_webcrawler.py

@@ -4,7 +4,7 @@ import sys
 import time
 from colorama import Fore
 from pathlib import Path
-from typing import Optional, List, Generic, TypeVar
+from typing import Optional, List
 import json
 import asyncio
@@ -15,6 +15,8 @@ from .models import (
     MarkdownGenerationResult,
     DispatchResult,
     ScrapingResult,
+    CrawlResultContainer,
+    RunManyReturn
 )
 from .async_database import async_db_manager
 from .chunking_strategy import *  # noqa: F403
@@ -47,47 +49,6 @@ from .utils import (
     RobotsParser,
 )
 from typing import Union, AsyncGenerator
-
-CrawlResultT = TypeVar("CrawlResultT", bound=CrawlResult)
-# RunManyReturn = Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
-
-
-class CrawlResultContainer(Generic[CrawlResultT]):
-    def __init__(self, results: Union[CrawlResultT, List[CrawlResultT]]):
-        # Normalize to a list
-        if isinstance(results, list):
-            self._results = results
-        else:
-            self._results = [results]
-
-    def __iter__(self):
-        return iter(self._results)
-
-    def __getitem__(self, index):
-        return self._results[index]
-
-    def __len__(self):
-        return len(self._results)
-
-    def __getattr__(self, attr):
-        # Delegate attribute access to the first element.
-        if self._results:
-            return getattr(self._results[0], attr)
-        raise AttributeError(
-            f"{self.__class__.__name__} object has no attribute '{attr}'"
-        )
-
-    def __repr__(self):
-        return f"{self.__class__.__name__}({self._results!r})"
-
-
-# Redefine the union type. Now synchronous calls always return a container,
-# while stream mode is handled with an AsyncGenerator.
-RunManyReturn = Union[
-    CrawlResultContainer[CrawlResultT], AsyncGenerator[CrawlResultT, None]
-]
-
 class AsyncWebCrawler:
     """
@@ -206,52 +167,12 @@
         """
         Start the crawler explicitly without using context manager.
         This is equivalent to using 'async with' but gives more control over the lifecycle.
-
-        This method will:
-        1. Check for builtin browser if browser_mode is 'builtin'
-        2. Initialize the browser and context
-        3. Perform warmup sequence
-        4. Return the crawler instance for method chaining
 
         Returns:
             AsyncWebCrawler: The initialized crawler instance
         """
-        # Check for builtin browser if requested
-        if (
-            self.browser_config.browser_mode == "builtin"
-            and not self.browser_config.cdp_url
-        ):
-            # Import here to avoid circular imports
-            from .browser_profiler import BrowserProfiler
-
-            profiler = BrowserProfiler(logger=self.logger)
-
-            # Get builtin browser info or launch if needed
-            browser_info = profiler.get_builtin_browser_info()
-            if not browser_info:
-                self.logger.info(
-                    "Builtin browser not found, launching new instance...",
-                    tag="BROWSER",
-                )
-                cdp_url = await profiler.launch_builtin_browser()
-                if not cdp_url:
-                    self.logger.warning(
-                        "Failed to launch builtin browser, falling back to dedicated browser",
-                        tag="BROWSER",
-                    )
-                else:
-                    self.browser_config.cdp_url = cdp_url
-                    self.browser_config.use_managed_browser = True
-            else:
-                self.logger.info(
-                    f"Using existing builtin browser at {browser_info.get('cdp_url')}",
-                    tag="BROWSER",
-                )
-                self.browser_config.cdp_url = browser_info.get("cdp_url")
-                self.browser_config.use_managed_browser = True
-
         await self.crawler_strategy.__aenter__()
-        await self.awarmup()
+        self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT")
+        self.ready = True
         return self
 
     async def close(self):
@@ -271,18 +192,6 @@
     async def __aexit__(self, exc_type, exc_val, exc_tb):
         await self.close()
 
-    async def awarmup(self):
-        """
-        Initialize the crawler with warm-up sequence.
-
-        This method:
-        1. Logs initialization info
-        2. Sets up browser configuration
-        3. Marks the crawler as ready
-        """
-        self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT")
-        self.ready = True
-
     @asynccontextmanager
     async def nullcontext(self):
         """Async null context manager."""
@@ -607,7 +516,7 @@
                 url=_url,
                 success=True,
                 timing=int((time.perf_counter() - t1) * 1000) / 1000,
-                tag="SCRAPE",
+                tag="SCRAPE"
             )
             # self.logger.info(
             #     message="{url:.50}... | Time: {timing}s",
@@ -783,15 +692,3 @@
         else:
             _results = await dispatcher.run_urls(crawler=self, urls=urls, config=config)
             return [transform_result(res) for res in _results]
-
-    async def aclear_cache(self):
-        """Clear the cache database."""
-        await async_db_manager.cleanup()
-
-    async def aflush_cache(self):
-        """Flush the cache database."""
-        await async_db_manager.aflush_db()
-
-    async def aget_cache_size(self):
-        """Get the total number of cached items."""
-        return await async_db_manager.aget_total_count()
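The deleted convenience wrappers were thin calls into the database manager imported at the top of this file, so the underlying operations remain reachable directly. A sketch under the assumption that async_db_manager keeps these methods and stays importable from crawl4ai.async_database:

import asyncio
from crawl4ai.async_database import async_db_manager

async def cache_maintenance():
    print(await async_db_manager.aget_total_count())  # what aget_cache_size() returned
    await async_db_manager.aflush_db()                # what aflush_cache() called
    await async_db_manager.cleanup()                  # what aclear_cache() called

asyncio.run(cache_maintenance())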