feat(crawler): add MHTML capture functionality
Add the ability to capture web pages in MHTML format, which includes all page resources in a single file. This enables complete page archival and offline viewing.

- Add capture_mhtml parameter to CrawlerRunConfig
- Implement MHTML capture using CDP in AsyncPlaywrightCrawlerStrategy
- Add mhtml field to CrawlResult and AsyncCrawlResponse models
- Add comprehensive tests for MHTML capture functionality
- Update documentation with MHTML capture details
- Add exclude_all_images option for better memory management

Breaking changes: None
This commit is contained in:
@@ -860,6 +860,12 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
soup = BeautifulSoup(html, parser_type)
|
||||
body = soup.body
|
||||
base_domain = get_base_domain(url)
|
||||
|
||||
# Early removal of all images if exclude_all_images is set
|
||||
# This happens before any processing to minimize memory usage
|
||||
if kwargs.get("exclude_all_images", False):
|
||||
for img in body.find_all('img'):
|
||||
img.decompose()
|
||||
|
||||
try:
|
||||
meta = extract_metadata("", soup)
|
||||
@@ -1491,6 +1497,13 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
||||
body = doc
|
||||
|
||||
base_domain = get_base_domain(url)
|
||||
|
||||
# Early removal of all images if exclude_all_images is set
|
||||
# This is more efficient in lxml as we remove elements before any processing
|
||||
if kwargs.get("exclude_all_images", False):
|
||||
for img in body.xpath('//img'):
|
||||
if img.getparent() is not None:
|
||||
img.getparent().remove(img)
|
||||
|
||||
# Add comment removal
|
||||
if kwargs.get("remove_comments", False):
|
||||
|
||||
Reference in New Issue
Block a user