feat(crawler): add MHTML capture functionality

Add ability to capture web pages as MHTML format, which includes all page resources
in a single file. This enables complete page archival and offline viewing.

- Add capture_mhtml parameter to CrawlerRunConfig
- Implement MHTML capture using CDP in AsyncPlaywrightCrawlerStrategy
- Add mhtml field to CrawlResult and AsyncCrawlResponse models
- Add comprehensive tests for MHTML capture functionality
- Update documentation with MHTML capture details
- Add exclude_all_images option for better memory management

Breaking changes: None
This commit is contained in:
UncleCode
2025-04-09 15:39:04 +08:00
parent 9038e9acbd
commit a2061bf31e
14 changed files with 467 additions and 24 deletions

View File

@@ -772,10 +772,12 @@ class CrawlerRunConfig():
screenshot_wait_for: float = None,
screenshot_height_threshold: int = SCREENSHOT_HEIGHT_TRESHOLD,
pdf: bool = False,
capture_mhtml: bool = False,
image_description_min_word_threshold: int = IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
image_score_threshold: int = IMAGE_SCORE_THRESHOLD,
table_score_threshold: int = 7,
exclude_external_images: bool = False,
exclude_all_images: bool = False,
# Link and Domain Handling Parameters
exclude_social_media_domains: list = None,
exclude_external_links: bool = False,
@@ -860,9 +862,11 @@ class CrawlerRunConfig():
self.screenshot_wait_for = screenshot_wait_for
self.screenshot_height_threshold = screenshot_height_threshold
self.pdf = pdf
self.capture_mhtml = capture_mhtml
self.image_description_min_word_threshold = image_description_min_word_threshold
self.image_score_threshold = image_score_threshold
self.exclude_external_images = exclude_external_images
self.exclude_all_images = exclude_all_images
self.table_score_threshold = table_score_threshold
# Link and Domain Handling Parameters
@@ -991,6 +995,7 @@ class CrawlerRunConfig():
"screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD
),
pdf=kwargs.get("pdf", False),
capture_mhtml=kwargs.get("capture_mhtml", False),
image_description_min_word_threshold=kwargs.get(
"image_description_min_word_threshold",
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
@@ -999,6 +1004,7 @@ class CrawlerRunConfig():
"image_score_threshold", IMAGE_SCORE_THRESHOLD
),
table_score_threshold=kwargs.get("table_score_threshold", 7),
exclude_all_images=kwargs.get("exclude_all_images", False),
exclude_external_images=kwargs.get("exclude_external_images", False),
# Link and Domain Handling Parameters
exclude_social_media_domains=kwargs.get(
@@ -1088,9 +1094,11 @@ class CrawlerRunConfig():
"screenshot_wait_for": self.screenshot_wait_for,
"screenshot_height_threshold": self.screenshot_height_threshold,
"pdf": self.pdf,
"capture_mhtml": self.capture_mhtml,
"image_description_min_word_threshold": self.image_description_min_word_threshold,
"image_score_threshold": self.image_score_threshold,
"table_score_threshold": self.table_score_threshold,
"exclude_all_images": self.exclude_all_images,
"exclude_external_images": self.exclude_external_images,
"exclude_social_media_domains": self.exclude_social_media_domains,
"exclude_external_links": self.exclude_external_links,

View File

@@ -836,14 +836,18 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
"before_return_html", page=page, html=html, context=context, config=config
)
# Handle PDF and screenshot generation
# Handle PDF, MHTML and screenshot generation
start_export_time = time.perf_counter()
pdf_data = None
screenshot_data = None
mhtml_data = None
if config.pdf:
pdf_data = await self.export_pdf(page)
if config.capture_mhtml:
mhtml_data = await self.capture_mhtml(page)
if config.screenshot:
if config.screenshot_wait_for:
await asyncio.sleep(config.screenshot_wait_for)
@@ -851,9 +855,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
page, screenshot_height_threshold=config.screenshot_height_threshold
)
if screenshot_data or pdf_data:
if screenshot_data or pdf_data or mhtml_data:
self.logger.info(
message="Exporting PDF and taking screenshot took {duration:.2f}s",
message="Exporting media (PDF/MHTML/screenshot) took {duration:.2f}s",
tag="EXPORT",
params={"duration": time.perf_counter() - start_export_time},
)
@@ -876,6 +880,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
status_code=status_code,
screenshot=screenshot_data,
pdf_data=pdf_data,
mhtml_data=mhtml_data,
get_delayed_content=get_delayed_content,
ssl_certificate=ssl_cert,
downloaded_files=(
@@ -1052,6 +1057,70 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
"""
pdf_data = await page.pdf(print_background=True)
return pdf_data
async def capture_mhtml(self, page: Page) -> Optional[str]:
"""
Captures the current page as MHTML using CDP.
MHTML (MIME HTML) is a web page archive format that combines the HTML content
with its resources (images, CSS, etc.) into a single MIME-encoded file.
Args:
page (Page): The Playwright page object
Returns:
Optional[str]: The MHTML content as a string, or None if there was an error
"""
try:
# Ensure the page is fully loaded before capturing
try:
# Wait for DOM content and network to be idle
await page.wait_for_load_state("domcontentloaded", timeout=5000)
await page.wait_for_load_state("networkidle", timeout=5000)
# Give a little extra time for JavaScript execution
await page.wait_for_timeout(1000)
# Wait for any animations to complete
await page.evaluate("""
() => new Promise(resolve => {
// First requestAnimationFrame gets scheduled after the next repaint
requestAnimationFrame(() => {
// Second requestAnimationFrame gets called after all animations complete
requestAnimationFrame(resolve);
});
})
""")
except Error as e:
if self.logger:
self.logger.warning(
message="Wait for load state timed out: {error}",
tag="MHTML",
params={"error": str(e)},
)
# Create a new CDP session
cdp_session = await page.context.new_cdp_session(page)
# Call Page.captureSnapshot with format "mhtml"
result = await cdp_session.send("Page.captureSnapshot", {"format": "mhtml"})
# The result contains a 'data' field with the MHTML content
mhtml_content = result.get("data")
# Detach the CDP session to clean up resources
await cdp_session.detach()
return mhtml_content
except Exception as e:
# Log the error but don't raise it - we'll just return None for the MHTML
if self.logger:
self.logger.error(
message="Failed to capture MHTML: {error}",
tag="MHTML",
params={"error": str(e)},
)
return None
async def take_screenshot(self, page, **kwargs) -> str:
"""

View File

@@ -365,6 +365,7 @@ class AsyncWebCrawler:
crawl_result.response_headers = async_response.response_headers
crawl_result.downloaded_files = async_response.downloaded_files
crawl_result.js_execution_result = js_execution_result
crawl_result.mhtml = async_response.mhtml_data
crawl_result.ssl_certificate = (
async_response.ssl_certificate
) # Add SSL certificate

View File

@@ -440,8 +440,7 @@ class BrowserManager:
@classmethod
async def get_playwright(cls):
from playwright.async_api import async_playwright
if cls._playwright_instance is None:
cls._playwright_instance = await async_playwright().start()
cls._playwright_instance = await async_playwright().start()
return cls._playwright_instance
def __init__(self, browser_config: BrowserConfig, logger=None):
@@ -492,7 +491,6 @@ class BrowserManager:
Note: This method should be called in a separate task to avoid blocking the main event loop.
"""
self.playwright = await self.get_playwright()
if self.playwright is None:
from playwright.async_api import async_playwright

View File

@@ -860,6 +860,12 @@ class WebScrapingStrategy(ContentScrapingStrategy):
soup = BeautifulSoup(html, parser_type)
body = soup.body
base_domain = get_base_domain(url)
# Early removal of all images if exclude_all_images is set
# This happens before any processing to minimize memory usage
if kwargs.get("exclude_all_images", False):
for img in body.find_all('img'):
img.decompose()
try:
meta = extract_metadata("", soup)
@@ -1491,6 +1497,13 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
body = doc
base_domain = get_base_domain(url)
# Early removal of all images if exclude_all_images is set
# This is more efficient in lxml as we remove elements before any processing
if kwargs.get("exclude_all_images", False):
for img in body.xpath('//img'):
if img.getparent() is not None:
img.getparent().remove(img)
# Add comment removal
if kwargs.get("remove_comments", False):

View File

@@ -95,15 +95,7 @@ class UrlModel(BaseModel):
url: HttpUrl
forced: bool = False
class MarkdownGenerationResult(BaseModel):
raw_markdown: str
markdown_with_citations: str
references_markdown: str
fit_markdown: Optional[str] = None
fit_html: Optional[str] = None
def __str__(self):
return self.raw_markdown
@dataclass
class TraversalStats:
@@ -124,6 +116,16 @@ class DispatchResult(BaseModel):
end_time: Union[datetime, float]
error_message: str = ""
class MarkdownGenerationResult(BaseModel):
raw_markdown: str
markdown_with_citations: str
references_markdown: str
fit_markdown: Optional[str] = None
fit_html: Optional[str] = None
def __str__(self):
return self.raw_markdown
class CrawlResult(BaseModel):
url: str
html: str
@@ -135,6 +137,7 @@ class CrawlResult(BaseModel):
js_execution_result: Optional[Dict[str, Any]] = None
screenshot: Optional[str] = None
pdf: Optional[bytes] = None
mhtml: Optional[str] = None
_markdown: Optional[MarkdownGenerationResult] = PrivateAttr(default=None)
extracted_content: Optional[str] = None
metadata: Optional[dict] = None
@@ -307,6 +310,7 @@ class AsyncCrawlResponse(BaseModel):
status_code: int
screenshot: Optional[str] = None
pdf_data: Optional[bytes] = None
mhtml_data: Optional[str] = None
get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None
downloaded_files: Optional[List[str]] = None
ssl_certificate: Optional[SSLCertificate] = None