refactor(browser): improve browser strategy architecture and lifecycle management

Major refactoring of browser strategy implementations to improve code organization and reliability:
- Move CrawlResultContainer and RunManyReturn types from async_webcrawler to models.py
- Simplify browser lifecycle management in AsyncWebCrawler
- Standardize browser strategy interface with _generate_page method
- Improve headless mode handling and browser args construction
- Clean up Docker and Playwright strategy implementations
- Fix session management and context handling across strategies

BREAKING CHANGE: The browser strategy interface now requires a _generate_page method (see the sketch below)
UncleCode
2025-03-30 20:58:39 +08:00
parent 3ff7eec8f3
commit bb02398086
11 changed files with 271 additions and 459 deletions
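
The new _generate_page requirement is named in the commit message, but its signature is not shown in this diff. Purely as a hypothetical illustration (the method name comes from the message above; the parameter and return types below are assumptions, not the actual crawl4ai interface), a concrete strategy might look like:

# Hypothetical sketch only: _generate_page is named in the commit message, but its
# real signature is not part of this diff, so the parameter and return type here
# are assumptions for illustration.
from playwright.async_api import Page
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
from crawl4ai.async_configs import CrawlerRunConfig

class CustomPlaywrightStrategy(AsyncPlaywrightCrawlerStrategy):
    async def _generate_page(self, config: CrawlerRunConfig) -> Page:
        # Return the Playwright page this crawl should run in; session reuse and
        # context setup would live here in a real implementation.
        raise NotImplementedError("illustrative stub")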


@@ -4,18 +4,25 @@ import sys
import time
from colorama import Fore
from pathlib import Path
from typing import Optional, List, Generic, TypeVar
from typing import Optional, List
import json
import asyncio
# from contextlib import nullcontext, asynccontextmanager
from contextlib import asynccontextmanager
from .models import CrawlResult, MarkdownGenerationResult, DispatchResult, ScrapingResult
from .models import (
CrawlResult,
MarkdownGenerationResult,
DispatchResult,
ScrapingResult,
CrawlResultContainer,
RunManyReturn
)
from .async_database import async_db_manager
from .chunking_strategy import * # noqa: F403
from .chunking_strategy import IdentityChunking
from .content_filter_strategy import * # noqa: F403
from .extraction_strategy import * # noqa: F403
from .extraction_strategy import NoExtractionStrategy
from .async_crawler_strategy import (
AsyncCrawlerStrategy,
@@ -30,7 +37,7 @@ from .markdown_generation_strategy import (
from .deep_crawling import DeepCrawlDecorator
from .async_logger import AsyncLogger, AsyncLoggerBase
from .async_configs import BrowserConfig, CrawlerRunConfig
from .async_dispatcher import * # noqa: F403
from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter
from .utils import (
@@ -42,45 +49,6 @@ from .utils import (
RobotsParser,
)
from typing import Union, AsyncGenerator
CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult)
# RunManyReturn = Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
class CrawlResultContainer(Generic[CrawlResultT]):
def __init__(self, results: Union[CrawlResultT, List[CrawlResultT]]):
# Normalize to a list
if isinstance(results, list):
self._results = results
else:
self._results = [results]
def __iter__(self):
return iter(self._results)
def __getitem__(self, index):
return self._results[index]
def __len__(self):
return len(self._results)
def __getattr__(self, attr):
# Delegate attribute access to the first element.
if self._results:
return getattr(self._results[0], attr)
raise AttributeError(f"{self.__class__.__name__} object has no attribute '{attr}'")
def __repr__(self):
return f"{self.__class__.__name__}({self._results!r})"
# Redefine the union type. Now synchronous calls always return a container,
# while stream mode is handled with an AsyncGenerator.
RunManyReturn = Union[
CrawlResultContainer[CrawlResultT],
AsyncGenerator[CrawlResultT, None]
]
class AsyncWebCrawler:
"""
@@ -193,45 +161,18 @@ class AsyncWebCrawler:
# Decorate arun method with deep crawling capabilities
self._deep_handler = DeepCrawlDecorator(self)
self.arun = self._deep_handler(self.arun)
async def start(self):
"""
Start the crawler explicitly without using context manager.
This is equivalent to using 'async with' but gives more control over the lifecycle.
This method will:
1. Check for builtin browser if browser_mode is 'builtin'
2. Initialize the browser and context
3. Perform warmup sequence
4. Return the crawler instance for method chaining
Returns:
AsyncWebCrawler: The initialized crawler instance
"""
# Check for builtin browser if requested
if self.browser_config.browser_mode == "builtin" and not self.browser_config.cdp_url:
# Import here to avoid circular imports
from .browser_profiler import BrowserProfiler
profiler = BrowserProfiler(logger=self.logger)
# Get builtin browser info or launch if needed
browser_info = profiler.get_builtin_browser_info()
if not browser_info:
self.logger.info("Builtin browser not found, launching new instance...", tag="BROWSER")
cdp_url = await profiler.launch_builtin_browser()
if not cdp_url:
self.logger.warning("Failed to launch builtin browser, falling back to dedicated browser", tag="BROWSER")
else:
self.browser_config.cdp_url = cdp_url
self.browser_config.use_managed_browser = True
else:
self.logger.info(f"Using existing builtin browser at {browser_info.get('cdp_url')}", tag="BROWSER")
self.browser_config.cdp_url = browser_info.get('cdp_url')
self.browser_config.use_managed_browser = True
await self.crawler_strategy.__aenter__()
await self.awarmup()
self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT")
self.ready = True
return self
async def close(self):
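
The start() docstring above spells out the explicit lifecycle: start() is equivalent to entering the async context manager and returns the crawler for chaining, with close() as the matching teardown. A usage sketch of that explicit lifecycle follows; the config keyword on the constructor is assumed from the library's documented usage rather than shown in this diff.

# Explicit lifecycle sketch, equivalent to `async with AsyncWebCrawler() as crawler:`
# per the start() docstring above; the BrowserConfig constructor keyword is assumed.
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig

async def main():
    crawler = AsyncWebCrawler(config=BrowserConfig(headless=True))
    await crawler.start()          # initializes the browser, warms up, sets ready
    try:
        result = await crawler.arun("https://example.com")
        print(result.success, result.status_code)
    finally:
        await crawler.close()      # tears the browser down explicitly

asyncio.run(main())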
@@ -251,18 +192,6 @@ class AsyncWebCrawler:
async def __aexit__(self, exc_type, exc_val, exc_tb):
await self.close()
async def awarmup(self):
"""
Initialize the crawler with warm-up sequence.
This method:
1. Logs initialization info
2. Sets up browser configuration
3. Marks the crawler as ready
"""
self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT")
self.ready = True
@asynccontextmanager
async def nullcontext(self):
"""异步空上下文管理器"""
@@ -305,7 +234,7 @@ class AsyncWebCrawler:
# Auto-start if not ready
if not self.ready:
await self.start()
config = config or CrawlerRunConfig()
if not isinstance(url, str) or not url:
raise ValueError("Invalid URL, make sure the URL is a non-empty string")
@@ -319,9 +248,7 @@ class AsyncWebCrawler:
config.cache_mode = CacheMode.ENABLED
# Create cache context
cache_context = CacheContext(
url, config.cache_mode, False
)
cache_context = CacheContext(url, config.cache_mode, False)
# Initialize processing variables
async_response: AsyncCrawlResponse = None
@@ -351,7 +278,7 @@ class AsyncWebCrawler:
# if config.screenshot and not screenshot or config.pdf and not pdf:
if config.screenshot and not screenshot_data:
cached_result = None
if config.pdf and not pdf_data:
cached_result = None
@@ -383,14 +310,18 @@ class AsyncWebCrawler:
# Check robots.txt if enabled
if config and config.check_robots_txt:
if not await self.robots_parser.can_fetch(url, self.browser_config.user_agent):
if not await self.robots_parser.can_fetch(
url, self.browser_config.user_agent
):
return CrawlResult(
url=url,
html="",
success=False,
status_code=403,
error_message="Access denied by robots.txt",
response_headers={"X-Robots-Status": "Blocked by robots.txt"}
response_headers={
"X-Robots-Status": "Blocked by robots.txt"
},
)
##############################
@@ -417,7 +348,7 @@ class AsyncWebCrawler:
###############################################################
# Process the HTML content, Call CrawlerStrategy.process_html #
###############################################################
crawl_result : CrawlResult = await self.aprocess_html(
crawl_result: CrawlResult = await self.aprocess_html(
url=url,
html=html,
extracted_content=extracted_content,
@@ -494,7 +425,7 @@ class AsyncWebCrawler:
tag="ERROR",
)
return CrawlResultContainer(
CrawlResult(
url=url, html="", success=False, error_message=error_message
)
@@ -539,15 +470,14 @@ class AsyncWebCrawler:
# Process HTML content
params = config.__dict__.copy()
params.pop("url", None)
# Add keys from kwargs to params that don't already exist in params
params.update({k: v for k, v in kwargs.items() if k not in params.keys()})
################################
# Scraping Strategy Execution #
################################
result : ScrapingResult = scraping_strategy.scrap(url, html, **params)
result: ScrapingResult = scraping_strategy.scrap(url, html, **params)
if result is None:
raise ValueError(
@@ -596,7 +526,10 @@ class AsyncWebCrawler:
self.logger.info(
message="{url:.50}... | Time: {timing}s",
tag="SCRAPE",
params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000) / 1000},
params={
"url": _url,
"timing": int((time.perf_counter() - t1) * 1000) / 1000,
},
)
################################
@@ -671,7 +604,7 @@ class AsyncWebCrawler:
async def arun_many(
self,
urls: List[str],
config: Optional[CrawlerRunConfig] = None,
dispatcher: Optional[BaseDispatcher] = None,
# Legacy parameters maintained for backwards compatibility
# word_count_threshold=MIN_WORD_THRESHOLD,
@@ -685,8 +618,8 @@ class AsyncWebCrawler:
# pdf: bool = False,
# user_agent: str = None,
# verbose=True,
**kwargs
) -> RunManyReturn:
**kwargs,
) -> RunManyReturn:
"""
Runs the crawler for multiple URLs concurrently using a configurable dispatcher strategy.
@@ -742,37 +675,32 @@ class AsyncWebCrawler:
def transform_result(task_result):
return (
setattr(task_result.result, 'dispatch_result',
DispatchResult(
task_id=task_result.task_id,
memory_usage=task_result.memory_usage,
peak_memory=task_result.peak_memory,
start_time=task_result.start_time,
end_time=task_result.end_time,
error_message=task_result.error_message,
)
) or task_result.result
setattr(
task_result.result,
"dispatch_result",
DispatchResult(
task_id=task_result.task_id,
memory_usage=task_result.memory_usage,
peak_memory=task_result.peak_memory,
start_time=task_result.start_time,
end_time=task_result.end_time,
error_message=task_result.error_message,
),
)
or task_result.result
)
stream = config.stream
if stream:
async def result_transformer():
async for task_result in dispatcher.run_urls_stream(crawler=self, urls=urls, config=config):
async for task_result in dispatcher.run_urls_stream(
crawler=self, urls=urls, config=config
):
yield transform_result(task_result)
return result_transformer()
else:
_results = await dispatcher.run_urls(crawler=self, urls=urls, config=config)
return [transform_result(res) for res in _results]
async def aclear_cache(self):
"""Clear the cache database."""
await async_db_manager.cleanup()
async def aflush_cache(self):
"""Flush the cache database."""
await async_db_manager.aflush_db()
async def aget_cache_size(self):
"""Get the total number of cached items."""
return await async_db_manager.aget_total_count()
return [transform_result(res) for res in _results]
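
Per the stream handling above, arun_many either returns the transformed results directly (config.stream false) or an async generator that yields each result as the dispatcher finishes it (config.stream true). A usage sketch of both modes, assuming CrawlerRunConfig accepts stream as a constructor argument:

# Usage sketch for the two arun_many modes shown above; stream as a CrawlerRunConfig
# constructor argument is assumed here.
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async def main():
    urls = ["https://example.com", "https://example.org"]
    async with AsyncWebCrawler() as crawler:
        # Batch mode: all results come back at once, each carrying dispatch_result.
        results = await crawler.arun_many(urls, config=CrawlerRunConfig(stream=False))
        for r in results:
            print(r.url, r.success, r.dispatch_result.memory_usage)

        # Stream mode: results are yielded as the dispatcher completes them.
        async for r in await crawler.arun_many(urls, config=CrawlerRunConfig(stream=True)):
            print(r.url, r.success)

asyncio.run(main())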