Add PDF & screenshot functionality, new tutorial

- Added support for exporting pages as PDFs
  - Enhanced screenshot functionality for long pages
  - Created a tutorial on dynamic content loading with 'Load More' buttons.
  - Updated web crawler to handle PDF data in responses.
This commit is contained in:
UncleCode
2024-12-10 20:10:39 +08:00
parent e130fd8db9
commit 5431fa2d0c
5 changed files with 271 additions and 9 deletions

View File

@@ -19,8 +19,14 @@ from .js_snippet import load_js_script
from .models import AsyncCrawlResponse
from .utils import create_box_message
from .user_agent_generator import UserAgentGenerator
from .config import SCREENSHOT_HEIGHT_TRESHOLD
from playwright_stealth import StealthConfig, stealth_async
from io import BytesIO
import base64
from PIL import Image, ImageDraw, ImageFont
stealth_config = StealthConfig(
webdriver=True,
chrome_app=True,
@@ -481,6 +487,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
self.user_agent = user_agenr_generator.generate(
**kwargs.get("user_agent_generator_config", {})
)
self.pdf = kwargs.get("pdf", False) # New flag
self.screenshot_requested = kwargs.get('screenshot', False)
self.proxy = kwargs.get("proxy")
self.proxy_config = kwargs.get("proxy_config")
self.headless = kwargs.get("headless", True)
@@ -752,7 +761,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
"""
response_headers = {}
status_code = 200 # Default to 200 for local/raw HTML
screenshot_requested = kwargs.get('screenshot', False)
screenshot_requested = kwargs.get("screenshot", self.screenshot_requested)
pdf_requested = kwargs.get("pdf", self.pdf)
screenshot_data = None
if url.startswith(('http://', 'https://')):
@@ -796,6 +806,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
response_headers = {}
status_code = None
screenshot_requested = kwargs.get("screenshot", self.screenshot_requested)
pdf_requested = kwargs.get("pdf", self.pdf)
# Reset downloaded files list for new crawl
self._downloaded_files = []
@@ -1069,17 +1082,28 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
html = await page.content()
await self.execute_hook('before_return_html', page, html, context = context, **kwargs)
start_export_time = time.perf_counter()
pdf_data = None
if pdf_requested:
# Generate PDF once
pdf_data = await self.export_pdf(page)
# Check if kwargs has screenshot=True then take screenshot
screenshot_data = None
if kwargs.get("screenshot"):
if screenshot_requested: #kwargs.get("screenshot"):
# Check we have screenshot_wait_for parameter, if we have simply wait for that time
screenshot_wait_for = kwargs.get("screenshot_wait_for")
if screenshot_wait_for:
await asyncio.sleep(screenshot_wait_for)
screenshot_data = await self.take_screenshot(page)
# if self.verbose:
# print(f"[LOG] ✅ Crawled {url} successfully!")
screenshot_data = await self.take_screenshot(page, **kwargs)
end_export_time = time.perf_counter()
if screenshot_data or pdf_data:
self.logger.info(
message="Exporting PDF and taking screenshot took {duration:.2f}s",
tag="EXPORT",
params={"duration": end_export_time - start_export_time}
)
if self.use_cached_html:
cache_file_path = os.path.join(
@@ -1105,6 +1129,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
response_headers=response_headers,
status_code=status_code,
screenshot=screenshot_data,
pdf_data=pdf_data,
get_delayed_content=get_delayed_content,
downloaded_files=self._downloaded_files if self._downloaded_files else None
)
@@ -1181,7 +1206,112 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
# if self.verbose:
# print(f"Warning: Failed to remove overlay elements: {str(e)}")
async def take_screenshot(self, page: Page) -> str:
async def export_pdf(self, page: Page) -> bytes:
    """
    Render the current page to a PDF document.

    Args:
        page (Page): The Playwright page to render.

    Returns:
        bytes: Raw PDF data, rendered with background graphics included.
    """
    return await page.pdf(print_background=True)
async def take_screenshot(self, page, **kwargs) -> str:
    """
    Capture a screenshot of *page*, choosing a strategy based on page height.

    Short pages are captured in a single shot; pages taller than the
    threshold are captured segment-by-segment and stitched together by
    ``take_screenshot_scroller``.

    Args:
        page: The Playwright page to capture.
        **kwargs: May contain ``screenshot_height_threshold`` (int, pixels)
            to override the default ``SCREENSHOT_HEIGHT_TRESHOLD`` cutoff.

    Returns:
        str: Base64-encoded screenshot data.
    """
    threshold = kwargs.get("screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD)
    page_height = await page.evaluate("document.documentElement.scrollHeight")
    if page_height < threshold:
        # Short page: a single capture is sufficient.
        return await self.take_screenshot_naive(page)
    # Long page: scroll through it in segments and stitch the results.
    return await self.take_screenshot_scroller(page, **kwargs)
async def take_screenshot_from_pdf(self, pdf_data: bytes) -> str:
    """
    Render the first page of a PDF as a base64-encoded JPEG screenshot.

    Requires the optional ``pdf2image`` package (plus poppler); on any
    failure a black image containing the error text is returned instead
    of raising, so callers always receive displayable image data.

    Args:
        pdf_data (bytes): Raw PDF document data.

    Returns:
        str: Base64-encoded JPEG data.
    """
    try:
        from pdf2image import convert_from_bytes

        pages = convert_from_bytes(pdf_data)
        first_page = pages[0].convert('RGB')
        buffer = BytesIO()
        first_page.save(buffer, format="JPEG")
        return base64.b64encode(buffer.getvalue()).decode('utf-8')
    except Exception as exc:
        error_message = f"Failed to take PDF-based screenshot: {str(exc)}"
        self.logger.error(
            message="PDF Screenshot failed: {error}",
            tag="ERROR",
            params={"error": error_message}
        )
        # Fall back to an image that carries the error text.
        fallback = Image.new('RGB', (800, 600), color='black')
        ImageDraw.Draw(fallback).text(
            (10, 10),
            error_message,
            fill=(255, 255, 255),
            font=ImageFont.load_default()
        )
        buffer = BytesIO()
        fallback.save(buffer, format="JPEG")
        return base64.b64encode(buffer.getvalue()).decode('utf-8')
async def take_screenshot_scroller(self, page: Page, **kwargs) -> str:
    """
    Capture a long page by scrolling through it viewport-by-viewport and
    stitching the segments into one tall image.

    The viewport is first enlarged (capped at the screenshot height
    threshold) to minimise the number of segments needed.

    Args:
        page (Page): The Playwright page to capture.
            NOTE(review): the page is always closed before this method
            returns (see ``finally``) — callers must not reuse it.
        **kwargs: May contain ``screenshot_height_threshold`` (int, pixels)
            to override the default viewport-height cap.

    Returns:
        str: Base64-encoded JPEG of the stitched page, or of a generated
        error image if the capture fails.
    """
    try:
        # Measure the full document so we know how far to scroll.
        page_height = await page.evaluate("document.documentElement.scrollHeight")
        page_width = await page.evaluate("document.documentElement.scrollWidth")

        # Use as tall a viewport as allowed to reduce the segment count.
        large_viewport_height = min(
            page_height,
            kwargs.get("screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD)
        )
        await page.set_viewport_size({"width": page_width, "height": large_viewport_height})

        # Scroll one viewport at a time, capturing each segment.
        segments = []
        viewport_height = page.viewport_size["height"]
        # Ceiling division: avoids an extra, fully-redundant segment when
        # page_height is an exact multiple of viewport_height.
        num_segments = -(-page_height // viewport_height)
        for i in range(num_segments):
            y_offset = i * viewport_height
            await page.evaluate(f"window.scrollTo(0, {y_offset})")
            await asyncio.sleep(0.01)  # give the browser a moment to render
            seg_shot = await page.screenshot(full_page=False)
            img = Image.open(BytesIO(seg_shot)).convert('RGB')
            # The browser clamps scrolling at the bottom, so the final
            # segment can overlap the previous one; crop the duplicated
            # top portion so the stitched image matches the real page.
            remaining = page_height - y_offset
            if remaining < img.height:
                img = img.crop((0, img.height - remaining, img.width, img.height))
            segments.append(img)

        # Stitch the segments vertically into a single image.
        total_height = sum(img.height for img in segments)
        stitched = Image.new('RGB', (segments[0].width, total_height))
        offset = 0
        for img in segments:
            stitched.paste(img, (0, offset))
            offset += img.height

        # Encode as JPEG: BMP (used previously) ignores the quality setting
        # and produces huge uncompressed payloads once base64-encoded, and
        # every other screenshot path in this class already emits JPEG.
        buffered = BytesIO()
        stitched.save(buffered, format="JPEG", quality=85)
        return base64.b64encode(buffered.getvalue()).decode('utf-8')
    except Exception as e:
        error_message = f"Failed to take large viewport screenshot: {str(e)}"
        self.logger.error(
            message="Large viewport screenshot failed: {error}",
            tag="ERROR",
            params={"error": error_message}
        )
        # Fall back to an image containing the error text so callers still
        # receive displayable screenshot data.
        img = Image.new('RGB', (800, 600), color='black')
        draw = ImageDraw.Draw(img)
        font = ImageFont.load_default()
        draw.text((10, 10), error_message, fill=(255, 255, 255), font=font)
        buffered = BytesIO()
        img.save(buffered, format="JPEG")
        return base64.b64encode(buffered.getvalue()).decode('utf-8')
    finally:
        # Preserve the existing contract: the page is closed in all cases.
        await page.close()
async def take_screenshot_naive(self, page: Page) -> str:
"""
Takes a screenshot of the current page.
@@ -1193,7 +1323,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
"""
try:
# The page is already loaded, just take the screenshot
screenshot = await page.screenshot(full_page=True)
screenshot = await page.screenshot(full_page=False)
return base64.b64encode(screenshot).decode('utf-8')
except Exception as e:
error_message = f"Failed to take screenshot: {str(e)}"

View File

@@ -147,6 +147,7 @@ class AsyncWebCrawler:
# Other parameters
css_selector: str = None,
screenshot: bool = False,
pdf: bool = False,
user_agent: str = None,
verbose=True,
**kwargs,
@@ -230,6 +231,7 @@ class AsyncWebCrawler:
async_response: AsyncCrawlResponse = None
cached_result = None
screenshot_data = None
pdf_data = None
extracted_content = None
start_time = time.perf_counter()
@@ -245,6 +247,10 @@ class AsyncWebCrawler:
screenshot_data = cached_result.screenshot
if not screenshot_data:
cached_result = None
if pdf:
pdf_data = cached_result.pdf
if not pdf_data:
cached_result = None
# if verbose:
# print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Cache hit for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {time.perf_counter() - start_time:.2f}s")
self.logger.url_status(
@@ -264,10 +270,12 @@ class AsyncWebCrawler:
async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl(
url,
screenshot=screenshot,
pdf=pdf,
**kwargs
)
html = sanitize_input_encode(async_response.html)
screenshot_data = async_response.screenshot
pdf_data = async_response.pdf_data
t2 = time.perf_counter()
self.logger.url_status(
url=cache_context.display_url,
@@ -289,6 +297,7 @@ class AsyncWebCrawler:
content_filter=content_filter,
css_selector=css_selector,
screenshot=screenshot_data,
pdf_data=pdf_data,
verbose=verbose,
is_cached=bool(cached_result),
async_response=async_response,
@@ -362,6 +371,7 @@ class AsyncWebCrawler:
bypass_cache: bool = False,
css_selector: str = None,
screenshot: bool = False,
pdf: bool = False,
user_agent: str = None,
verbose=True,
**kwargs,
@@ -550,6 +560,7 @@ class AsyncWebCrawler:
)
screenshot = None if not screenshot else screenshot
pdf_data = kwargs.get("pdf_data", None)
if kwargs.get("prettiify", False):
@@ -567,6 +578,7 @@ class AsyncWebCrawler:
links=links,
metadata=metadata,
screenshot=screenshot,
pdf=pdf_data,
extracted_content=extracted_content,
success=True,
error_message="",

View File

@@ -57,3 +57,4 @@ MAX_METRICS_HISTORY = 1000
NEED_MIGRATION = True
URL_LOG_SHORTEN_LENGTH = 30
SHOW_DEPRECATION_WARNINGS = True
SCREENSHOT_HEIGHT_TRESHOLD = 10000

View File

@@ -23,6 +23,7 @@ class CrawlResult(BaseModel):
links: Dict[str, List[Dict]] = {}
downloaded_files: Optional[List[str]] = None
screenshot: Optional[str] = None
pdf : Optional[bytes] = None
markdown: Optional[Union[str, MarkdownGenerationResult]] = None
markdown_v2: Optional[MarkdownGenerationResult] = None
fit_markdown: Optional[str] = None
@@ -39,6 +40,7 @@ class AsyncCrawlResponse(BaseModel):
response_headers: Dict[str, str]
status_code: int
screenshot: Optional[str] = None
pdf_data: Optional[bytes] = None
get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None
downloaded_files: Optional[List[str]] = None

View File

@@ -0,0 +1,117 @@
# Tutorial: Clicking Buttons to Load More Content with Crawl4AI
## Introduction
When scraping dynamic websites, it's common to encounter “Load More” or “Next” buttons that must be clicked to reveal new content. Crawl4AI provides a straightforward way to handle these situations using JavaScript execution and waiting conditions. In this tutorial, we'll cover two approaches:
1. **Step-by-step (Session-based) Approach:** Multiple calls to `arun()` to progressively load more content.
2. **Single-call Approach:** Execute a more complex JavaScript snippet inside a single `arun()` call to handle all clicks at once before the extraction.
## Prerequisites
- A working installation of Crawl4AI
- Basic familiarity with Python's `async`/`await` syntax
## Step-by-Step Approach
Use a session ID to maintain state across multiple `arun()` calls:
```python
from crawl4ai import AsyncWebCrawler, CacheMode
js_code = [
# This JS finds the “Next” button and clicks it
"const nextButton = document.querySelector('button.next'); nextButton && nextButton.click();"
]
wait_for_condition = "css:.new-content-class"
async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
# 1. Load the initial page
result_initial = await crawler.arun(
url="https://example.com",
cache_mode=CacheMode.BYPASS,
session_id="my_session"
)
# 2. Click the 'Next' button and wait for new content
result_next = await crawler.arun(
url="https://example.com",
session_id="my_session",
js_code=js_code,
wait_for=wait_for_condition,
js_only=True,
cache_mode=CacheMode.BYPASS
)
# `result_next` now contains the updated HTML after clicking 'Next'
```
**Key Points:**
- **`session_id`**: Keeps the same browser context open.
- **`js_code`**: Executes JavaScript in the context of the already loaded page.
- **`wait_for`**: Ensures the crawler waits until new content is fully loaded.
- **`js_only=True`**: Runs the JS in the current session without reloading the page.
By repeating the `arun()` call multiple times and modifying the `js_code` (e.g., clicking different modules or pages), you can iteratively load all the desired content.
## Single-call Approach
If the page allows it, you can run a single `arun()` call with a more elaborate JavaScript snippet that:
- Iterates over all the modules or "Next" buttons
- Clicks them one by one
- Waits for content updates between each click
- Once done, returns control to Crawl4AI for extraction.
Example snippet:
```python
from crawl4ai import AsyncWebCrawler, CacheMode
js_code = [
# Example JS that clicks multiple modules:
"""
(async () => {
const modules = document.querySelectorAll('.module-item');
for (let i = 0; i < modules.length; i++) {
modules[i].scrollIntoView();
modules[i].click();
// Wait for each module's content to load, adjust 100ms as needed
await new Promise(r => setTimeout(r, 100));
}
})();
"""
]
async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
result = await crawler.arun(
url="https://example.com",
js_code=js_code,
wait_for="css:.final-loaded-content-class",
cache_mode=CacheMode.BYPASS
)
# `result` now contains all content after all modules have been clicked in one go.
```
**Key Points:**
- All interactions (clicks and waits) happen before the extraction.
- Ideal for pages where all steps can be done in a single pass.
## Choosing the Right Approach
- **Step-by-Step (Session-based)**:
- Good when you need fine-grained control or must dynamically check conditions before clicking the next page.
- Useful if the page requires multiple conditions checked at runtime.
- **Single-call**:
- Perfect if the sequence of interactions is known in advance.
  - Cleaner code if the page's structure is consistent and predictable.
## Conclusion
Crawl4AI makes it easy to handle dynamic content:
- Use session IDs and multiple `arun()` calls for stepwise crawling.
- Or pack all actions into one `arun()` call if the interactions are well-defined upfront.
This flexibility ensures you can handle a wide range of dynamic web pages efficiently.