docs + fix: Update the example for full-page screenshot and PDF export. Fix the error "crawl4ai.async_webcrawler.AsyncWebCrawler.aprocess_html() got multiple values for keyword argument" raised for the screenshot parameter, by renaming that parameter to screenshot_data. See https://github.com/unclecode/crawl4ai/issues/822#issuecomment-2732602118

This commit is contained in:
Aravind Karnam
2025-03-18 17:20:24 +05:30
parent 529a79725e
commit 4359b12003
2 changed files with 12 additions and 14 deletions

View File

@@ -398,7 +398,7 @@ class AsyncWebCrawler:
html=html, html=html,
extracted_content=extracted_content, extracted_content=extracted_content,
config=config, # Pass the config object instead of individual parameters config=config, # Pass the config object instead of individual parameters
screenshot=screenshot_data, screenshot_data=screenshot_data,
pdf_data=pdf_data, pdf_data=pdf_data,
verbose=config.verbose, verbose=config.verbose,
is_raw_html=True if url.startswith("raw:") else False, is_raw_html=True if url.startswith("raw:") else False,
@@ -482,7 +482,7 @@ class AsyncWebCrawler:
html: str, html: str,
extracted_content: str, extracted_content: str,
config: CrawlerRunConfig, config: CrawlerRunConfig,
screenshot: str, screenshot_data: str,
pdf_data: str, pdf_data: str,
verbose: bool, verbose: bool,
**kwargs, **kwargs,
@@ -495,7 +495,7 @@ class AsyncWebCrawler:
html: Raw HTML content html: Raw HTML content
extracted_content: Previously extracted content (if any) extracted_content: Previously extracted content (if any)
config: Configuration object controlling processing behavior config: Configuration object controlling processing behavior
screenshot: Screenshot data (if any) screenshot_data: Screenshot data (if any)
pdf_data: PDF data (if any) pdf_data: PDF data (if any)
verbose: Whether to enable verbose logging verbose: Whether to enable verbose logging
**kwargs: Additional parameters for backwards compatibility **kwargs: Additional parameters for backwards compatibility
@@ -620,10 +620,6 @@ class AsyncWebCrawler:
params={"url": _url, "timing": time.perf_counter() - t1}, params={"url": _url, "timing": time.perf_counter() - t1},
) )
# Handle screenshot and PDF data
screenshot_data = None if not screenshot else screenshot
pdf_data = None if not pdf_data else pdf_data
# Apply HTML formatting if requested # Apply HTML formatting if requested
if config.prettiify: if config.prettiify:
cleaned_html = fast_format_html(cleaned_html) cleaned_html = fast_format_html(cleaned_html)

View File

@@ -12,9 +12,10 @@ We've introduced a new feature that effortlessly handles even the biggest page
**Simple Example:** **Simple Example:**
```python ```python
import os, sys import os
import sys
import asyncio import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
# Adjust paths as needed # Adjust paths as needed
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@@ -26,9 +27,11 @@ async def main():
# Request both PDF and screenshot # Request both PDF and screenshot
result = await crawler.arun( result = await crawler.arun(
url='https://en.wikipedia.org/wiki/List_of_common_misconceptions', url='https://en.wikipedia.org/wiki/List_of_common_misconceptions',
cache_mode=CacheMode.BYPASS, config=CrawlerRunConfig(
pdf=True, cache_mode=CacheMode.BYPASS,
screenshot=True pdf=True,
screenshot=True
)
) )
if result.success: if result.success:
@@ -40,9 +43,8 @@ async def main():
# Save PDF # Save PDF
if result.pdf: if result.pdf:
pdf_bytes = b64decode(result.pdf)
with open(os.path.join(__location__, "page.pdf"), "wb") as f: with open(os.path.join(__location__, "page.pdf"), "wb") as f:
f.write(pdf_bytes) f.write(result.pdf)
if __name__ == "__main__": if __name__ == "__main__":
asyncio.run(main()) asyncio.run(main())