Add PDF & screenshot functionality, new tutorial
- Added support for exporting pages as PDFs - Enhanced screenshot functionality for long pages - Created a tutorial on dynamic content loading with 'Load More' buttons. - Updated web crawler to handle PDF data in responses.
This commit is contained in:
@@ -19,8 +19,14 @@ from .js_snippet import load_js_script
|
||||
from .models import AsyncCrawlResponse
|
||||
from .utils import create_box_message
|
||||
from .user_agent_generator import UserAgentGenerator
|
||||
from .config import SCREENSHOT_HEIGHT_TRESHOLD
|
||||
from playwright_stealth import StealthConfig, stealth_async
|
||||
|
||||
|
||||
from io import BytesIO
|
||||
import base64
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
|
||||
stealth_config = StealthConfig(
|
||||
webdriver=True,
|
||||
chrome_app=True,
|
||||
@@ -481,6 +487,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
self.user_agent = user_agenr_generator.generate(
|
||||
**kwargs.get("user_agent_generator_config", {})
|
||||
)
|
||||
self.pdf = kwargs.get("pdf", False) # New flag
|
||||
self.screenshot_requested = kwargs.get('screenshot', False)
|
||||
|
||||
self.proxy = kwargs.get("proxy")
|
||||
self.proxy_config = kwargs.get("proxy_config")
|
||||
self.headless = kwargs.get("headless", True)
|
||||
@@ -752,7 +761,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
"""
|
||||
response_headers = {}
|
||||
status_code = 200 # Default to 200 for local/raw HTML
|
||||
screenshot_requested = kwargs.get('screenshot', False)
|
||||
screenshot_requested = kwargs.get("screenshot", self.screenshot_requested)
|
||||
pdf_requested = kwargs.get("pdf", self.pdf)
|
||||
screenshot_data = None
|
||||
|
||||
if url.startswith(('http://', 'https://')):
|
||||
@@ -796,6 +806,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
response_headers = {}
|
||||
status_code = None
|
||||
|
||||
screenshot_requested = kwargs.get("screenshot", self.screenshot_requested)
|
||||
pdf_requested = kwargs.get("pdf", self.pdf)
|
||||
|
||||
# Reset downloaded files list for new crawl
|
||||
self._downloaded_files = []
|
||||
|
||||
@@ -1069,17 +1082,28 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
html = await page.content()
|
||||
await self.execute_hook('before_return_html', page, html, context = context, **kwargs)
|
||||
|
||||
start_export_time = time.perf_counter()
|
||||
pdf_data = None
|
||||
if pdf_requested:
|
||||
# Generate PDF once
|
||||
pdf_data = await self.export_pdf(page)
|
||||
|
||||
# Check if kwargs has screenshot=True then take screenshot
|
||||
screenshot_data = None
|
||||
if kwargs.get("screenshot"):
|
||||
if screenshot_requested: #kwargs.get("screenshot"):
|
||||
# Check we have screenshot_wait_for parameter, if we have simply wait for that time
|
||||
screenshot_wait_for = kwargs.get("screenshot_wait_for")
|
||||
if screenshot_wait_for:
|
||||
await asyncio.sleep(screenshot_wait_for)
|
||||
screenshot_data = await self.take_screenshot(page)
|
||||
|
||||
# if self.verbose:
|
||||
# print(f"[LOG] ✅ Crawled {url} successfully!")
|
||||
|
||||
screenshot_data = await self.take_screenshot(page, **kwargs)
|
||||
end_export_time = time.perf_counter()
|
||||
if screenshot_data or pdf_data:
|
||||
self.logger.info(
|
||||
message="Exporting PDF and taking screenshot took {duration:.2f}s",
|
||||
tag="EXPORT",
|
||||
params={"duration": end_export_time - start_export_time}
|
||||
)
|
||||
|
||||
if self.use_cached_html:
|
||||
cache_file_path = os.path.join(
|
||||
@@ -1105,6 +1129,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
response_headers=response_headers,
|
||||
status_code=status_code,
|
||||
screenshot=screenshot_data,
|
||||
pdf_data=pdf_data,
|
||||
get_delayed_content=get_delayed_content,
|
||||
downloaded_files=self._downloaded_files if self._downloaded_files else None
|
||||
)
|
||||
@@ -1181,7 +1206,112 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
# if self.verbose:
|
||||
# print(f"Warning: Failed to remove overlay elements: {str(e)}")
|
||||
|
||||
async def take_screenshot(self, page: Page) -> str:
|
||||
async def export_pdf(self, page: Page) -> bytes:
    """Render the current page to a PDF document.

    Args:
        page: The Playwright page to export.

    Returns:
        The raw PDF bytes, rendered with background graphics included.
    """
    # print_background=True keeps CSS backgrounds in the output.
    return await page.pdf(print_background=True)
|
||||
|
||||
async def take_screenshot(self, page, **kwargs) -> str:
    """Capture a screenshot, choosing a strategy based on page height.

    Pages shorter than the configured threshold get a single-shot
    capture; taller pages fall back to a scroll-and-stitch capture.

    Args:
        page: The Playwright page to capture.
        **kwargs: ``screenshot_height_threshold`` overrides the default
            height cutoff (``SCREENSHOT_HEIGHT_TRESHOLD``).

    Returns:
        Base64-encoded image data.
    """
    threshold = kwargs.get("screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD)
    height = await page.evaluate("document.documentElement.scrollHeight")
    if height >= threshold:
        # Too tall for one viewport: capture in segments and stitch.
        return await self.take_screenshot_scroller(page, **kwargs)
    # Short enough for a single capture.
    return await self.take_screenshot_naive(page)
|
||||
|
||||
async def take_screenshot_from_pdf(self, pdf_data: bytes) -> str:
    """Render the first page of a PDF as a base64-encoded JPEG.

    Requires the optional ``pdf2image`` package (plus poppler). On any
    failure, a black placeholder image carrying the error text is
    returned instead, so callers always receive image data.

    Args:
        pdf_data: Raw PDF bytes to convert.

    Returns:
        Base64-encoded JPEG data as a string.
    """
    try:
        # Imported lazily so the dependency is only needed on this path.
        from pdf2image import convert_from_bytes
        pages = convert_from_bytes(pdf_data)
        first_page = pages[0].convert('RGB')
        buf = BytesIO()
        first_page.save(buf, format="JPEG")
        return base64.b64encode(buf.getvalue()).decode('utf-8')
    except Exception as e:
        error_message = f"Failed to take PDF-based screenshot: {str(e)}"
        self.logger.error(
            message="PDF Screenshot failed: {error}",
            tag="ERROR",
            params={"error": error_message}
        )
        # Fall back to an image that displays the error text.
        fallback = Image.new('RGB', (800, 600), color='black')
        ImageDraw.Draw(fallback).text(
            (10, 10),
            error_message,
            fill=(255, 255, 255),
            font=ImageFont.load_default()
        )
        buf = BytesIO()
        fallback.save(buf, format="JPEG")
        return base64.b64encode(buf.getvalue()).decode('utf-8')
|
||||
|
||||
async def take_screenshot_scroller(self, page: Page, **kwargs) -> str:
    """Capture a very tall page by scrolling, shooting segments, and stitching.

    Sets the viewport as large as allowed (capped by
    ``screenshot_height_threshold``), scrolls through the page one
    viewport at a time, screenshots each segment, and pastes the
    segments into a single image.

    Args:
        page: The Playwright page to capture.
        **kwargs: ``screenshot_height_threshold`` caps the viewport height
            (defaults to ``SCREENSHOT_HEIGHT_TRESHOLD``).

    Returns:
        Base64-encoded JPEG of the stitched page; on failure, a black
        placeholder image carrying the error text.
    """
    try:
        # Measure the full document so we know how many segments we need.
        page_height = await page.evaluate("document.documentElement.scrollHeight")
        page_width = await page.evaluate("document.documentElement.scrollWidth")

        # Use the largest viewport we allow to minimize the segment count.
        large_viewport_height = min(page_height, kwargs.get("screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD))
        await page.set_viewport_size({"width": page_width, "height": large_viewport_height})

        viewport_height = page.viewport_size["height"]
        # Ceiling division: the old `(h // vh) + 1` produced an extra,
        # fully-duplicated segment whenever the page height was an exact
        # multiple of the viewport height.
        num_segments = max(1, (page_height + viewport_height - 1) // viewport_height)

        segments = []
        for i in range(num_segments):
            y_offset = i * viewport_height
            await page.evaluate(f"window.scrollTo(0, {y_offset})")
            await asyncio.sleep(0.01)  # brief pause so the segment can render
            seg_shot = await page.screenshot(full_page=False)
            img = Image.open(BytesIO(seg_shot)).convert('RGB')
            # Browsers clamp scrolling at the bottom, so the last shot
            # re-shows earlier content; crop it so the stitched image has
            # no duplicated strip and its height matches the page.
            remaining = page_height - y_offset
            if remaining < img.height:
                img = img.crop((0, img.height - remaining, img.width, img.height))
            segments.append(img)

        total_height = sum(img.height for img in segments)
        stitched = Image.new('RGB', (segments[0].width, total_height))
        offset = 0
        for img in segments:
            stitched.paste(img, (0, offset))
            offset += img.height

        buffered = BytesIO()
        # JPEG, not BMP: BMP ignores the `quality` option and is
        # uncompressed, ballooning the base64 payload; JPEG also matches
        # the format used by the other screenshot paths.
        stitched.save(buffered, format="JPEG", quality=85)
        return base64.b64encode(buffered.getvalue()).decode('utf-8')
    except Exception as e:
        error_message = f"Failed to take large viewport screenshot: {str(e)}"
        self.logger.error(
            message="Large viewport screenshot failed: {error}",
            tag="ERROR",
            params={"error": error_message}
        )
        # Return a placeholder image that displays the error text.
        img = Image.new('RGB', (800, 600), color='black')
        draw = ImageDraw.Draw(img)
        font = ImageFont.load_default()
        draw.text((10, 10), error_message, fill=(255, 255, 255), font=font)
        buffered = BytesIO()
        img.save(buffered, format="JPEG")
        return base64.b64encode(buffered.getvalue()).decode('utf-8')
    finally:
        # NOTE(review): the page is closed even on success, so callers
        # cannot reuse it after a stitched capture — confirm this is the
        # intended lifecycle (the naive screenshot path does not close it).
        await page.close()
|
||||
|
||||
async def take_screenshot_naive(self, page: Page) -> str:
|
||||
"""
|
||||
Takes a screenshot of the current page.
|
||||
|
||||
@@ -1193,7 +1323,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
"""
|
||||
try:
|
||||
# The page is already loaded, just take the screenshot
|
||||
screenshot = await page.screenshot(full_page=True)
|
||||
screenshot = await page.screenshot(full_page=False)
|
||||
return base64.b64encode(screenshot).decode('utf-8')
|
||||
except Exception as e:
|
||||
error_message = f"Failed to take screenshot: {str(e)}"
|
||||
|
||||
@@ -147,6 +147,7 @@ class AsyncWebCrawler:
|
||||
# Other parameters
|
||||
css_selector: str = None,
|
||||
screenshot: bool = False,
|
||||
pdf: bool = False,
|
||||
user_agent: str = None,
|
||||
verbose=True,
|
||||
**kwargs,
|
||||
@@ -230,6 +231,7 @@ class AsyncWebCrawler:
|
||||
async_response: AsyncCrawlResponse = None
|
||||
cached_result = None
|
||||
screenshot_data = None
|
||||
pdf_data = None
|
||||
extracted_content = None
|
||||
|
||||
start_time = time.perf_counter()
|
||||
@@ -245,6 +247,10 @@ class AsyncWebCrawler:
|
||||
screenshot_data = cached_result.screenshot
|
||||
if not screenshot_data:
|
||||
cached_result = None
|
||||
if pdf:
|
||||
pdf_data = cached_result.pdf
|
||||
if not pdf_data:
|
||||
cached_result = None
|
||||
# if verbose:
|
||||
# print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Cache hit for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {time.perf_counter() - start_time:.2f}s")
|
||||
self.logger.url_status(
|
||||
@@ -264,10 +270,12 @@ class AsyncWebCrawler:
|
||||
async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl(
|
||||
url,
|
||||
screenshot=screenshot,
|
||||
pdf=pdf,
|
||||
**kwargs
|
||||
)
|
||||
html = sanitize_input_encode(async_response.html)
|
||||
screenshot_data = async_response.screenshot
|
||||
pdf_data = async_response.pdf_data
|
||||
t2 = time.perf_counter()
|
||||
self.logger.url_status(
|
||||
url=cache_context.display_url,
|
||||
@@ -289,6 +297,7 @@ class AsyncWebCrawler:
|
||||
content_filter=content_filter,
|
||||
css_selector=css_selector,
|
||||
screenshot=screenshot_data,
|
||||
pdf_data=pdf_data,
|
||||
verbose=verbose,
|
||||
is_cached=bool(cached_result),
|
||||
async_response=async_response,
|
||||
@@ -362,6 +371,7 @@ class AsyncWebCrawler:
|
||||
bypass_cache: bool = False,
|
||||
css_selector: str = None,
|
||||
screenshot: bool = False,
|
||||
pdf: bool = False,
|
||||
user_agent: str = None,
|
||||
verbose=True,
|
||||
**kwargs,
|
||||
@@ -550,6 +560,7 @@ class AsyncWebCrawler:
|
||||
)
|
||||
|
||||
screenshot = None if not screenshot else screenshot
|
||||
pdf_data = kwargs.get("pdf_data", None)
|
||||
|
||||
|
||||
if kwargs.get("prettiify", False):
|
||||
@@ -567,6 +578,7 @@ class AsyncWebCrawler:
|
||||
links=links,
|
||||
metadata=metadata,
|
||||
screenshot=screenshot,
|
||||
pdf=pdf_data,
|
||||
extracted_content=extracted_content,
|
||||
success=True,
|
||||
error_message="",
|
||||
|
||||
@@ -56,4 +56,5 @@ MAX_METRICS_HISTORY = 1000
|
||||
|
||||
NEED_MIGRATION = True
|
||||
URL_LOG_SHORTEN_LENGTH = 30
|
||||
SHOW_DEPRECATION_WARNINGS = True
|
||||
SHOW_DEPRECATION_WARNINGS = True
|
||||
SCREENSHOT_HEIGHT_TRESHOLD = 10000
|
||||
@@ -23,6 +23,7 @@ class CrawlResult(BaseModel):
|
||||
links: Dict[str, List[Dict]] = {}
|
||||
downloaded_files: Optional[List[str]] = None
|
||||
screenshot: Optional[str] = None
|
||||
pdf : Optional[bytes] = None
|
||||
markdown: Optional[Union[str, MarkdownGenerationResult]] = None
|
||||
markdown_v2: Optional[MarkdownGenerationResult] = None
|
||||
fit_markdown: Optional[str] = None
|
||||
@@ -39,6 +40,7 @@ class AsyncCrawlResponse(BaseModel):
|
||||
response_headers: Dict[str, str]
|
||||
status_code: int
|
||||
screenshot: Optional[str] = None
|
||||
pdf_data: Optional[bytes] = None
|
||||
get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None
|
||||
downloaded_files: Optional[List[str]] = None
|
||||
|
||||
|
||||
117
docs/examples/tutorial_dynamic_clicks.md
Normal file
117
docs/examples/tutorial_dynamic_clicks.md
Normal file
@@ -0,0 +1,117 @@
|
||||
# Tutorial: Clicking Buttons to Load More Content with Crawl4AI
|
||||
|
||||
## Introduction
|
||||
|
||||
When scraping dynamic websites, it’s common to encounter “Load More” or “Next” buttons that must be clicked to reveal new content. Crawl4AI provides a straightforward way to handle these situations using JavaScript execution and waiting conditions. In this tutorial, we’ll cover two approaches:
|
||||
|
||||
1. **Step-by-step (Session-based) Approach:** Multiple calls to `arun()` to progressively load more content.
|
||||
2. **Single-call Approach:** Execute a more complex JavaScript snippet inside a single `arun()` call to handle all clicks at once before the extraction.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- A working installation of Crawl4AI
|
||||
- Basic familiarity with Python’s `async`/`await` syntax
|
||||
|
||||
## Step-by-Step Approach
|
||||
|
||||
Use a session ID to maintain state across multiple `arun()` calls:
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||
|
||||
js_code = [
|
||||
# This JS finds the “Next” button and clicks it
|
||||
"const nextButton = document.querySelector('button.next'); nextButton && nextButton.click();"
|
||||
]
|
||||
|
||||
wait_for_condition = "css:.new-content-class"
|
||||
|
||||
async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
|
||||
# 1. Load the initial page
|
||||
result_initial = await crawler.arun(
|
||||
url="https://example.com",
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
session_id="my_session"
|
||||
)
|
||||
|
||||
# 2. Click the 'Next' button and wait for new content
|
||||
result_next = await crawler.arun(
|
||||
url="https://example.com",
|
||||
session_id="my_session",
|
||||
js_code=js_code,
|
||||
wait_for=wait_for_condition,
|
||||
js_only=True,
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
|
||||
# `result_next` now contains the updated HTML after clicking 'Next'
|
||||
```
|
||||
|
||||
**Key Points:**
|
||||
- **`session_id`**: Keeps the same browser context open.
|
||||
- **`js_code`**: Executes JavaScript in the context of the already loaded page.
|
||||
- **`wait_for`**: Ensures the crawler waits until new content is fully loaded.
|
||||
- **`js_only=True`**: Runs the JS in the current session without reloading the page.
|
||||
|
||||
By repeating the `arun()` call multiple times and modifying the `js_code` (e.g., clicking different modules or pages), you can iteratively load all the desired content.
|
||||
|
||||
## Single-call Approach
|
||||
|
||||
If the page allows it, you can run a single `arun()` call with a more elaborate JavaScript snippet that:
|
||||
- Iterates over all the modules or "Next" buttons
|
||||
- Clicks them one by one
|
||||
- Waits for content updates between each click
|
||||
- Returns control to Crawl4AI for extraction once all clicks are done
|
||||
|
||||
Example snippet:
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||
|
||||
js_code = [
|
||||
# Example JS that clicks multiple modules:
|
||||
"""
|
||||
(async () => {
|
||||
const modules = document.querySelectorAll('.module-item');
|
||||
for (let i = 0; i < modules.length; i++) {
|
||||
modules[i].scrollIntoView();
|
||||
modules[i].click();
|
||||
// Wait for each module’s content to load, adjust 100ms as needed
|
||||
await new Promise(r => setTimeout(r, 100));
|
||||
}
|
||||
})();
|
||||
"""
|
||||
]
|
||||
|
||||
async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
js_code=js_code,
|
||||
wait_for="css:.final-loaded-content-class",
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
|
||||
# `result` now contains all content after all modules have been clicked in one go.
|
||||
```
|
||||
|
||||
**Key Points:**
|
||||
- All interactions (clicks and waits) happen before the extraction.
|
||||
- Ideal for pages where all steps can be done in a single pass.
|
||||
|
||||
## Choosing the Right Approach
|
||||
|
||||
- **Step-by-Step (Session-based)**:
|
||||
- Good when you need fine-grained control or must dynamically check conditions before clicking the next page.
|
||||
- Useful if the page requires multiple conditions checked at runtime.
|
||||
|
||||
- **Single-call**:
|
||||
- Perfect if the sequence of interactions is known in advance.
|
||||
- Cleaner code if the page’s structure is consistent and predictable.
|
||||
|
||||
## Conclusion
|
||||
|
||||
Crawl4AI makes it easy to handle dynamic content:
|
||||
- Use session IDs and multiple `arun()` calls for stepwise crawling.
|
||||
- Or pack all actions into one `arun()` call if the interactions are well-defined upfront.
|
||||
|
||||
This flexibility ensures you can handle a wide range of dynamic web pages efficiently.
|
||||
Reference in New Issue
Block a user