refactor(browser): improve browser strategy architecture and lifecycle management
Major refactoring of browser strategy implementations to improve code organization and reliability: - Move CrawlResultContainer and RunManyReturn types from async_webcrawler to models.py - Simplify browser lifecycle management in AsyncWebCrawler - Standardize browser strategy interface with _generate_page method - Improve headless mode handling and browser args construction - Clean up Docker and Playwright strategy implementations - Fix session management and context handling across strategies BREAKING CHANGE: Browser strategy interface has changed with new _generate_page method requirement
This commit is contained in:
@@ -1,5 +1,7 @@
|
||||
from pydantic import BaseModel, HttpUrl, PrivateAttr
|
||||
from typing import List, Dict, Optional, Callable, Awaitable, Union, Any
|
||||
from typing import AsyncGenerator
|
||||
from typing import Generic, TypeVar
|
||||
from enum import Enum
|
||||
from dataclasses import dataclass
|
||||
from .ssl_certificate import SSLCertificate
|
||||
@@ -34,34 +36,12 @@ class CrawlerTaskResult:
|
||||
def success(self) -> bool:
|
||||
return self.result.success
|
||||
|
||||
|
||||
class CrawlStatus(Enum):
|
||||
QUEUED = "QUEUED"
|
||||
IN_PROGRESS = "IN_PROGRESS"
|
||||
COMPLETED = "COMPLETED"
|
||||
FAILED = "FAILED"
|
||||
|
||||
|
||||
# @dataclass
|
||||
# class CrawlStats:
|
||||
# task_id: str
|
||||
# url: str
|
||||
# status: CrawlStatus
|
||||
# start_time: Optional[datetime] = None
|
||||
# end_time: Optional[datetime] = None
|
||||
# memory_usage: float = 0.0
|
||||
# peak_memory: float = 0.0
|
||||
# error_message: str = ""
|
||||
|
||||
# @property
|
||||
# def duration(self) -> str:
|
||||
# if not self.start_time:
|
||||
# return "0:00"
|
||||
# end = self.end_time or datetime.now()
|
||||
# duration = end - self.start_time
|
||||
# return str(timedelta(seconds=int(duration.total_seconds())))
|
||||
|
||||
|
||||
@dataclass
|
||||
class CrawlStats:
|
||||
task_id: str
|
||||
@@ -95,7 +75,6 @@ class CrawlStats:
|
||||
duration = end - start
|
||||
return str(timedelta(seconds=int(duration.total_seconds())))
|
||||
|
||||
|
||||
class DisplayMode(Enum):
|
||||
DETAILED = "DETAILED"
|
||||
AGGREGATED = "AGGREGATED"
|
||||
@@ -112,12 +91,10 @@ class TokenUsage:
|
||||
completion_tokens_details: Optional[dict] = None
|
||||
prompt_tokens_details: Optional[dict] = None
|
||||
|
||||
|
||||
class UrlModel(BaseModel):
|
||||
url: HttpUrl
|
||||
forced: bool = False
|
||||
|
||||
|
||||
class MarkdownGenerationResult(BaseModel):
|
||||
raw_markdown: str
|
||||
markdown_with_citations: str
|
||||
@@ -284,6 +261,40 @@ class StringCompatibleMarkdown(str):
|
||||
def __getattr__(self, name):
|
||||
return getattr(self._markdown_result, name)
|
||||
|
||||
CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult)
|
||||
|
||||
class CrawlResultContainer(Generic[CrawlResultT]):
|
||||
def __init__(self, results: Union[CrawlResultT, List[CrawlResultT]]):
|
||||
# Normalize to a list
|
||||
if isinstance(results, list):
|
||||
self._results = results
|
||||
else:
|
||||
self._results = [results]
|
||||
|
||||
def __iter__(self):
|
||||
return iter(self._results)
|
||||
|
||||
def __getitem__(self, index):
|
||||
return self._results[index]
|
||||
|
||||
def __len__(self):
|
||||
return len(self._results)
|
||||
|
||||
def __getattr__(self, attr):
|
||||
# Delegate attribute access to the first element.
|
||||
if self._results:
|
||||
return getattr(self._results[0], attr)
|
||||
raise AttributeError(f"{self.__class__.__name__} object has no attribute '{attr}'")
|
||||
|
||||
def __repr__(self):
|
||||
return f"{self.__class__.__name__}({self._results!r})"
|
||||
|
||||
RunManyReturn = Union[
|
||||
CrawlResultContainer[CrawlResultT],
|
||||
AsyncGenerator[CrawlResultT, None]
|
||||
]
|
||||
|
||||
|
||||
# END of backward compatibility code for markdown/markdown_v2.
|
||||
# When removing this code in the future, make sure to:
|
||||
# 1. Replace the private attribute and property with a standard field
|
||||
@@ -304,7 +315,6 @@ class AsyncCrawlResponse(BaseModel):
|
||||
class Config:
|
||||
arbitrary_types_allowed = True
|
||||
|
||||
|
||||
###############################
|
||||
# Scraping Models
|
||||
###############################
|
||||
|
||||
Reference in New Issue
Block a user