Add PDF & screenshot functionality, new tutorial

- Added support for exporting pages as PDFs.
- Enhanced screenshot functionality for long pages.
- Created a tutorial on dynamic content loading with 'Load More' buttons.
- Updated web crawler to handle PDF data in responses.
This commit is contained in:
UncleCode
2024-12-10 20:10:39 +08:00
parent e130fd8db9
commit 5431fa2d0c
5 changed files with 271 additions and 9 deletions

View File

@@ -147,6 +147,7 @@ class AsyncWebCrawler:
# Other parameters
css_selector: str = None,
screenshot: bool = False,
pdf: bool = False,
user_agent: str = None,
verbose=True,
**kwargs,
@@ -230,6 +231,7 @@ class AsyncWebCrawler:
async_response: AsyncCrawlResponse = None
cached_result = None
screenshot_data = None
pdf_data = None
extracted_content = None
start_time = time.perf_counter()
@@ -245,6 +247,10 @@ class AsyncWebCrawler:
screenshot_data = cached_result.screenshot
if not screenshot_data:
cached_result = None
if pdf:
pdf_data = cached_result.pdf
if not pdf_data:
cached_result = None
# if verbose:
# print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Cache hit for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {time.perf_counter() - start_time:.2f}s")
self.logger.url_status(
@@ -264,10 +270,12 @@ class AsyncWebCrawler:
async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl(
url,
screenshot=screenshot,
pdf=pdf,
**kwargs
)
html = sanitize_input_encode(async_response.html)
screenshot_data = async_response.screenshot
pdf_data = async_response.pdf_data
t2 = time.perf_counter()
self.logger.url_status(
url=cache_context.display_url,
@@ -289,6 +297,7 @@ class AsyncWebCrawler:
content_filter=content_filter,
css_selector=css_selector,
screenshot=screenshot_data,
pdf_data=pdf_data,
verbose=verbose,
is_cached=bool(cached_result),
async_response=async_response,
@@ -362,6 +371,7 @@ class AsyncWebCrawler:
bypass_cache: bool = False,
css_selector: str = None,
screenshot: bool = False,
pdf: bool = False,
user_agent: str = None,
verbose=True,
**kwargs,
@@ -550,6 +560,7 @@ class AsyncWebCrawler:
)
screenshot = None if not screenshot else screenshot
pdf_data = kwargs.get("pdf_data", None)
if kwargs.get("prettiify", False):
@@ -567,6 +578,7 @@ class AsyncWebCrawler:
links=links,
metadata=metadata,
screenshot=screenshot,
pdf=pdf_data,
extracted_content=extracted_content,
success=True,
error_message="",