docs + fix: Update the example for full-page screenshot & PDF export. Fix the error "crawl4ai.async_webcrawler.AsyncWebCrawler.aprocess_html() got multiple values for keyword argument" raised for the screenshot parameter. https://github.com/unclecode/crawl4ai/issues/822#issuecomment-2732602118
This commit is contained in:
@@ -398,7 +398,7 @@ class AsyncWebCrawler:
|
|||||||
html=html,
|
html=html,
|
||||||
extracted_content=extracted_content,
|
extracted_content=extracted_content,
|
||||||
config=config, # Pass the config object instead of individual parameters
|
config=config, # Pass the config object instead of individual parameters
|
||||||
screenshot=screenshot_data,
|
screenshot_data=screenshot_data,
|
||||||
pdf_data=pdf_data,
|
pdf_data=pdf_data,
|
||||||
verbose=config.verbose,
|
verbose=config.verbose,
|
||||||
is_raw_html=True if url.startswith("raw:") else False,
|
is_raw_html=True if url.startswith("raw:") else False,
|
||||||
@@ -482,7 +482,7 @@ class AsyncWebCrawler:
|
|||||||
html: str,
|
html: str,
|
||||||
extracted_content: str,
|
extracted_content: str,
|
||||||
config: CrawlerRunConfig,
|
config: CrawlerRunConfig,
|
||||||
screenshot: str,
|
screenshot_data: str,
|
||||||
pdf_data: str,
|
pdf_data: str,
|
||||||
verbose: bool,
|
verbose: bool,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
@@ -495,7 +495,7 @@ class AsyncWebCrawler:
|
|||||||
html: Raw HTML content
|
html: Raw HTML content
|
||||||
extracted_content: Previously extracted content (if any)
|
extracted_content: Previously extracted content (if any)
|
||||||
config: Configuration object controlling processing behavior
|
config: Configuration object controlling processing behavior
|
||||||
screenshot: Screenshot data (if any)
|
screenshot_data: Screenshot data (if any)
|
||||||
pdf_data: PDF data (if any)
|
pdf_data: PDF data (if any)
|
||||||
verbose: Whether to enable verbose logging
|
verbose: Whether to enable verbose logging
|
||||||
**kwargs: Additional parameters for backwards compatibility
|
**kwargs: Additional parameters for backwards compatibility
|
||||||
@@ -620,10 +620,6 @@ class AsyncWebCrawler:
|
|||||||
params={"url": _url, "timing": time.perf_counter() - t1},
|
params={"url": _url, "timing": time.perf_counter() - t1},
|
||||||
)
|
)
|
||||||
|
|
||||||
# Handle screenshot and PDF data
|
|
||||||
screenshot_data = None if not screenshot else screenshot
|
|
||||||
pdf_data = None if not pdf_data else pdf_data
|
|
||||||
|
|
||||||
# Apply HTML formatting if requested
|
# Apply HTML formatting if requested
|
||||||
if config.prettiify:
|
if config.prettiify:
|
||||||
cleaned_html = fast_format_html(cleaned_html)
|
cleaned_html = fast_format_html(cleaned_html)
|
||||||
|
|||||||
@@ -12,9 +12,10 @@ We’ve introduced a new feature that effortlessly handles even the biggest page
|
|||||||
|
|
||||||
**Simple Example:**
|
**Simple Example:**
|
||||||
```python
|
```python
|
||||||
import os, sys
|
import os
|
||||||
|
import sys
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, CacheMode
|
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
|
||||||
|
|
||||||
# Adjust paths as needed
|
# Adjust paths as needed
|
||||||
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||||
@@ -26,9 +27,11 @@ async def main():
|
|||||||
# Request both PDF and screenshot
|
# Request both PDF and screenshot
|
||||||
result = await crawler.arun(
|
result = await crawler.arun(
|
||||||
url='https://en.wikipedia.org/wiki/List_of_common_misconceptions',
|
url='https://en.wikipedia.org/wiki/List_of_common_misconceptions',
|
||||||
cache_mode=CacheMode.BYPASS,
|
config=CrawlerRunConfig(
|
||||||
pdf=True,
|
cache_mode=CacheMode.BYPASS,
|
||||||
screenshot=True
|
pdf=True,
|
||||||
|
screenshot=True
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
if result.success:
|
if result.success:
|
||||||
@@ -40,9 +43,8 @@ async def main():
|
|||||||
|
|
||||||
# Save PDF
|
# Save PDF
|
||||||
if result.pdf:
|
if result.pdf:
|
||||||
pdf_bytes = b64decode(result.pdf)
|
|
||||||
with open(os.path.join(__location__, "page.pdf"), "wb") as f:
|
with open(os.path.join(__location__, "page.pdf"), "wb") as f:
|
||||||
f.write(pdf_bytes)
|
f.write(result.pdf)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
asyncio.run(main())
|
asyncio.run(main())
|
||||||
|
|||||||
Reference in New Issue
Block a user