docs + fix: Update example for full page screenshot & PDF export. Fix the bug Error: crawl4ai.async_webcrawler.AsyncWebCrawler.aprocess_html() got multiple values for keyword argument - for screenshot param. https://github.com/unclecode/crawl4ai/issues/822#issuecomment-2732602118
This commit is contained in:
@@ -398,7 +398,7 @@ class AsyncWebCrawler:
|
||||
html=html,
|
||||
extracted_content=extracted_content,
|
||||
config=config, # Pass the config object instead of individual parameters
|
||||
screenshot=screenshot_data,
|
||||
screenshot_data=screenshot_data,
|
||||
pdf_data=pdf_data,
|
||||
verbose=config.verbose,
|
||||
is_raw_html=True if url.startswith("raw:") else False,
|
||||
@@ -482,7 +482,7 @@ class AsyncWebCrawler:
|
||||
html: str,
|
||||
extracted_content: str,
|
||||
config: CrawlerRunConfig,
|
||||
screenshot: str,
|
||||
screenshot_data: str,
|
||||
pdf_data: str,
|
||||
verbose: bool,
|
||||
**kwargs,
|
||||
@@ -495,7 +495,7 @@ class AsyncWebCrawler:
|
||||
html: Raw HTML content
|
||||
extracted_content: Previously extracted content (if any)
|
||||
config: Configuration object controlling processing behavior
|
||||
screenshot: Screenshot data (if any)
|
||||
screenshot_data: Screenshot data (if any)
|
||||
pdf_data: PDF data (if any)
|
||||
verbose: Whether to enable verbose logging
|
||||
**kwargs: Additional parameters for backwards compatibility
|
||||
@@ -620,10 +620,6 @@ class AsyncWebCrawler:
|
||||
params={"url": _url, "timing": time.perf_counter() - t1},
|
||||
)
|
||||
|
||||
# Handle screenshot and PDF data
|
||||
screenshot_data = None if not screenshot else screenshot
|
||||
pdf_data = None if not pdf_data else pdf_data
|
||||
|
||||
# Apply HTML formatting if requested
|
||||
if config.prettiify:
|
||||
cleaned_html = fast_format_html(cleaned_html)
|
||||
|
||||
@@ -12,9 +12,10 @@ We’ve introduced a new feature that effortlessly handles even the biggest page
|
||||
|
||||
**Simple Example:**
|
||||
```python
|
||||
import os, sys
|
||||
import os
|
||||
import sys
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
|
||||
|
||||
# Adjust paths as needed
|
||||
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
@@ -26,9 +27,11 @@ async def main():
|
||||
# Request both PDF and screenshot
|
||||
result = await crawler.arun(
|
||||
url='https://en.wikipedia.org/wiki/List_of_common_misconceptions',
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
pdf=True,
|
||||
screenshot=True
|
||||
config=CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
pdf=True,
|
||||
screenshot=True
|
||||
)
|
||||
)
|
||||
|
||||
if result.success:
|
||||
@@ -40,9 +43,8 @@ async def main():
|
||||
|
||||
# Save PDF
|
||||
if result.pdf:
|
||||
pdf_bytes = b64decode(result.pdf)
|
||||
with open(os.path.join(__location__, "page.pdf"), "wb") as f:
|
||||
f.write(pdf_bytes)
|
||||
f.write(result.pdf)
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
|
||||
Reference in New Issue
Block a user