Add PDF & screenshot functionality, new tutorial
- Added support for exporting pages as PDFs.
- Enhanced screenshot functionality for long pages.
- Created a tutorial on dynamic content loading with 'Load More' buttons.
- Updated the web crawler to handle PDF data in responses.
This commit is contained in:
@@ -19,8 +19,14 @@ from .js_snippet import load_js_script
|
|||||||
from .models import AsyncCrawlResponse
|
from .models import AsyncCrawlResponse
|
||||||
from .utils import create_box_message
|
from .utils import create_box_message
|
||||||
from .user_agent_generator import UserAgentGenerator
|
from .user_agent_generator import UserAgentGenerator
|
||||||
|
from .config import SCREENSHOT_HEIGHT_TRESHOLD
|
||||||
from playwright_stealth import StealthConfig, stealth_async
|
from playwright_stealth import StealthConfig, stealth_async
|
||||||
|
|
||||||
|
|
||||||
|
from io import BytesIO
|
||||||
|
import base64
|
||||||
|
from PIL import Image, ImageDraw, ImageFont
|
||||||
|
|
||||||
stealth_config = StealthConfig(
|
stealth_config = StealthConfig(
|
||||||
webdriver=True,
|
webdriver=True,
|
||||||
chrome_app=True,
|
chrome_app=True,
|
||||||
@@ -481,6 +487,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
self.user_agent = user_agenr_generator.generate(
|
self.user_agent = user_agenr_generator.generate(
|
||||||
**kwargs.get("user_agent_generator_config", {})
|
**kwargs.get("user_agent_generator_config", {})
|
||||||
)
|
)
|
||||||
|
self.pdf = kwargs.get("pdf", False) # New flag
|
||||||
|
self.screenshot_requested = kwargs.get('screenshot', False)
|
||||||
|
|
||||||
self.proxy = kwargs.get("proxy")
|
self.proxy = kwargs.get("proxy")
|
||||||
self.proxy_config = kwargs.get("proxy_config")
|
self.proxy_config = kwargs.get("proxy_config")
|
||||||
self.headless = kwargs.get("headless", True)
|
self.headless = kwargs.get("headless", True)
|
||||||
@@ -752,7 +761,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
"""
|
"""
|
||||||
response_headers = {}
|
response_headers = {}
|
||||||
status_code = 200 # Default to 200 for local/raw HTML
|
status_code = 200 # Default to 200 for local/raw HTML
|
||||||
screenshot_requested = kwargs.get('screenshot', False)
|
screenshot_requested = kwargs.get("screenshot", self.screenshot_requested)
|
||||||
|
pdf_requested = kwargs.get("pdf", self.pdf)
|
||||||
screenshot_data = None
|
screenshot_data = None
|
||||||
|
|
||||||
if url.startswith(('http://', 'https://')):
|
if url.startswith(('http://', 'https://')):
|
||||||
@@ -796,6 +806,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
response_headers = {}
|
response_headers = {}
|
||||||
status_code = None
|
status_code = None
|
||||||
|
|
||||||
|
screenshot_requested = kwargs.get("screenshot", self.screenshot_requested)
|
||||||
|
pdf_requested = kwargs.get("pdf", self.pdf)
|
||||||
|
|
||||||
# Reset downloaded files list for new crawl
|
# Reset downloaded files list for new crawl
|
||||||
self._downloaded_files = []
|
self._downloaded_files = []
|
||||||
|
|
||||||
@@ -1069,17 +1082,28 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
html = await page.content()
|
html = await page.content()
|
||||||
await self.execute_hook('before_return_html', page, html, context = context, **kwargs)
|
await self.execute_hook('before_return_html', page, html, context = context, **kwargs)
|
||||||
|
|
||||||
|
start_export_time = time.perf_counter()
|
||||||
|
pdf_data = None
|
||||||
|
if pdf_requested:
|
||||||
|
# Generate PDF once
|
||||||
|
pdf_data = await self.export_pdf(page)
|
||||||
|
|
||||||
# Check if kwargs has screenshot=True then take screenshot
|
# Check if kwargs has screenshot=True then take screenshot
|
||||||
screenshot_data = None
|
screenshot_data = None
|
||||||
if kwargs.get("screenshot"):
|
if screenshot_requested: #kwargs.get("screenshot"):
|
||||||
# Check we have screenshot_wait_for parameter, if we have simply wait for that time
|
# Check we have screenshot_wait_for parameter, if we have simply wait for that time
|
||||||
screenshot_wait_for = kwargs.get("screenshot_wait_for")
|
screenshot_wait_for = kwargs.get("screenshot_wait_for")
|
||||||
if screenshot_wait_for:
|
if screenshot_wait_for:
|
||||||
await asyncio.sleep(screenshot_wait_for)
|
await asyncio.sleep(screenshot_wait_for)
|
||||||
screenshot_data = await self.take_screenshot(page)
|
|
||||||
|
screenshot_data = await self.take_screenshot(page, **kwargs)
|
||||||
# if self.verbose:
|
end_export_time = time.perf_counter()
|
||||||
# print(f"[LOG] ✅ Crawled {url} successfully!")
|
if screenshot_data or pdf_data:
|
||||||
|
self.logger.info(
|
||||||
|
message="Exporting PDF and taking screenshot took {duration:.2f}s",
|
||||||
|
tag="EXPORT",
|
||||||
|
params={"duration": end_export_time - start_export_time}
|
||||||
|
)
|
||||||
|
|
||||||
if self.use_cached_html:
|
if self.use_cached_html:
|
||||||
cache_file_path = os.path.join(
|
cache_file_path = os.path.join(
|
||||||
@@ -1105,6 +1129,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
response_headers=response_headers,
|
response_headers=response_headers,
|
||||||
status_code=status_code,
|
status_code=status_code,
|
||||||
screenshot=screenshot_data,
|
screenshot=screenshot_data,
|
||||||
|
pdf_data=pdf_data,
|
||||||
get_delayed_content=get_delayed_content,
|
get_delayed_content=get_delayed_content,
|
||||||
downloaded_files=self._downloaded_files if self._downloaded_files else None
|
downloaded_files=self._downloaded_files if self._downloaded_files else None
|
||||||
)
|
)
|
||||||
@@ -1181,7 +1206,112 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
# if self.verbose:
|
# if self.verbose:
|
||||||
# print(f"Warning: Failed to remove overlay elements: {str(e)}")
|
# print(f"Warning: Failed to remove overlay elements: {str(e)}")
|
||||||
|
|
||||||
async def take_screenshot(self, page: Page) -> str:
|
async def export_pdf(self, page: Page) -> bytes:
    """
    Render the current page to a PDF document.

    Args:
        page: The page to export.

    Returns:
        The PDF contents as returned by ``page.pdf``; background graphics
        are included (``print_background=True``).
    """
    return await page.pdf(print_background=True)
|
||||||
|
|
||||||
|
async def take_screenshot(self, page, **kwargs) -> str:
    """
    Take a screenshot of ``page``, choosing the strategy by page height.

    The document's scroll height is compared against
    ``kwargs["screenshot_height_threshold"]`` (falling back to
    ``SCREENSHOT_HEIGHT_TRESHOLD``). Pages below the threshold go through
    ``take_screenshot_naive``; all others go through
    ``take_screenshot_scroller``, which receives the same ``kwargs``.

    Returns:
        Whatever the selected strategy returns.
    """
    page_height = await page.evaluate("document.documentElement.scrollHeight")
    height_limit = kwargs.get("screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD)

    if page_height >= height_limit:
        # Long page: capture it viewport by viewport.
        # NOTE: a PDF-based alternative exists but is not used here:
        #   self.take_screenshot_from_pdf(await self.export_pdf(page))
        return await self.take_screenshot_scroller(page, **kwargs)

    # Short page: a single plain screenshot is enough.
    return await self.take_screenshot_naive(page)
|
||||||
|
|
||||||
|
async def take_screenshot_from_pdf(self, pdf_data: bytes) -> str:
    """
    Turn the first page of a PDF into a base64-encoded JPEG.

    Requires pdf2image and poppler (imported lazily so the dependency is
    only needed when this method is actually called).

    Args:
        pdf_data: Raw bytes of the PDF document.

    Returns:
        Base64 text of a JPEG of the first PDF page. If anything fails, the
        error is logged and a black 800x600 JPEG carrying the error message
        is returned instead (also base64-encoded); no exception propagates.
    """
    try:
        from pdf2image import convert_from_bytes

        first_page = convert_from_bytes(pdf_data)[0].convert('RGB')
        jpeg_buffer = BytesIO()
        first_page.save(jpeg_buffer, format="JPEG")
        return base64.b64encode(jpeg_buffer.getvalue()).decode('utf-8')
    except Exception as e:
        error_message = f"Failed to take PDF-based screenshot: {str(e)}"
        self.logger.error(
            message="PDF Screenshot failed: {error}",
            tag="ERROR",
            params={"error": error_message}
        )

        # Best-effort fallback: hand back an image that shows the error text.
        placeholder = Image.new('RGB', (800, 600), color='black')
        canvas = ImageDraw.Draw(placeholder)
        default_font = ImageFont.load_default()
        canvas.text((10, 10), error_message, fill=(255, 255, 255), font=default_font)

        fallback_buffer = BytesIO()
        placeholder.save(fallback_buffer, format="JPEG")
        return base64.b64encode(fallback_buffer.getvalue()).decode('utf-8')
|
||||||
|
|
||||||
|
async def take_screenshot_scroller(self, page: Page, **kwargs) -> str:
    """
    Capture a long page by scrolling through it one viewport at a time and
    stitching the segments into a single image.

    The viewport is resized to the document's scroll width and to
    ``min(scroll height, screenshot_height_threshold)``. The page is then
    scrolled in viewport-sized steps; each step is screenshotted and the
    segments are pasted top to bottom.

    Args:
        page: The page to capture. NOTE: the page is closed before this
            method returns, on success and on failure alike.
        **kwargs: ``screenshot_height_threshold`` (optional) overrides
            ``SCREENSHOT_HEIGHT_TRESHOLD`` as the maximum viewport height.

    Returns:
        Base64 text of the stitched image encoded as BMP. If anything
        fails, the error is logged and a black 800x600 JPEG carrying the
        error message is returned instead (also base64-encoded).
    """
    try:
        # Measure the full document.
        page_height = await page.evaluate("document.documentElement.scrollHeight")
        page_width = await page.evaluate("document.documentElement.scrollWidth")

        # Use the tallest viewport allowed so as few segments as possible are needed.
        max_viewport_height = kwargs.get("screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD)
        await page.set_viewport_size(
            {"width": page_width, "height": min(page_height, max_viewport_height)}
        )
        viewport_height = page.viewport_size["height"]

        # Ceiling division: a page that is an exact multiple of the viewport
        # must not get an extra segment (it would duplicate the last viewport).
        num_segments = max(1, -(-page_height // viewport_height))

        segments = []
        for i in range(num_segments):
            y_offset = i * viewport_height
            await page.evaluate(f"window.scrollTo(0, {y_offset})")
            await asyncio.sleep(0.01)  # wait for render

            # The browser may not scroll as far as requested (e.g. at the
            # bottom of the page); anything above the requested offset is
            # already part of the previous segment.
            actual_y = await page.evaluate("window.scrollY")
            seg_shot = await page.screenshot(full_page=False)
            img = Image.open(BytesIO(seg_shot)).convert('RGB')

            overlap_css = y_offset - actual_y
            if overlap_css > 0:
                # Screenshot pixels may differ from CSS pixels (device scale factor).
                scale = img.height / viewport_height
                overlap_px = round(overlap_css * scale)
                if overlap_px >= img.height:
                    # Segment shows nothing new; skip it entirely.
                    continue
                img = img.crop((0, overlap_px, img.width, img.height))

            segments.append(img)

        # Stack the segments vertically.
        total_height = sum(img.height for img in segments)
        stitched = Image.new('RGB', (segments[0].width, total_height))
        offset = 0
        for img in segments:
            stitched.paste(img, (0, offset))
            offset += img.height

        buffered = BytesIO()
        # BMP output is kept for backward compatibility with existing callers.
        stitched.save(buffered, format="BMP")
        return base64.b64encode(buffered.getvalue()).decode('utf-8')
    except Exception as e:
        error_message = f"Failed to take large viewport screenshot: {str(e)}"
        self.logger.error(
            message="Large viewport screenshot failed: {error}",
            tag="ERROR",
            params={"error": error_message}
        )
        # Best-effort fallback: hand back an image that shows the error text.
        img = Image.new('RGB', (800, 600), color='black')
        draw = ImageDraw.Draw(img)
        font = ImageFont.load_default()
        draw.text((10, 10), error_message, fill=(255, 255, 255), font=font)
        buffered = BytesIO()
        img.save(buffered, format="JPEG")
        return base64.b64encode(buffered.getvalue()).decode('utf-8')
    finally:
        await page.close()
|
||||||
|
|
||||||
|
async def take_screenshot_naive(self, page: Page) -> str:
|
||||||
"""
|
"""
|
||||||
Takes a screenshot of the current page.
|
Takes a screenshot of the current page.
|
||||||
|
|
||||||
@@ -1193,7 +1323,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
# The page is already loaded, just take the screenshot
|
# The page is already loaded, just take the screenshot
|
||||||
screenshot = await page.screenshot(full_page=True)
|
screenshot = await page.screenshot(full_page=False)
|
||||||
return base64.b64encode(screenshot).decode('utf-8')
|
return base64.b64encode(screenshot).decode('utf-8')
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_message = f"Failed to take screenshot: {str(e)}"
|
error_message = f"Failed to take screenshot: {str(e)}"
|
||||||
|
|||||||
@@ -147,6 +147,7 @@ class AsyncWebCrawler:
|
|||||||
# Other parameters
|
# Other parameters
|
||||||
css_selector: str = None,
|
css_selector: str = None,
|
||||||
screenshot: bool = False,
|
screenshot: bool = False,
|
||||||
|
pdf: bool = False,
|
||||||
user_agent: str = None,
|
user_agent: str = None,
|
||||||
verbose=True,
|
verbose=True,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
@@ -230,6 +231,7 @@ class AsyncWebCrawler:
|
|||||||
async_response: AsyncCrawlResponse = None
|
async_response: AsyncCrawlResponse = None
|
||||||
cached_result = None
|
cached_result = None
|
||||||
screenshot_data = None
|
screenshot_data = None
|
||||||
|
pdf_data = None
|
||||||
extracted_content = None
|
extracted_content = None
|
||||||
|
|
||||||
start_time = time.perf_counter()
|
start_time = time.perf_counter()
|
||||||
@@ -245,6 +247,10 @@ class AsyncWebCrawler:
|
|||||||
screenshot_data = cached_result.screenshot
|
screenshot_data = cached_result.screenshot
|
||||||
if not screenshot_data:
|
if not screenshot_data:
|
||||||
cached_result = None
|
cached_result = None
|
||||||
|
if pdf:
|
||||||
|
pdf_data = cached_result.pdf
|
||||||
|
if not pdf_data:
|
||||||
|
cached_result = None
|
||||||
# if verbose:
|
# if verbose:
|
||||||
# print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Cache hit for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {time.perf_counter() - start_time:.2f}s")
|
# print(f"{Fore.BLUE}{self.tag_format('FETCH')} {self.log_icons['FETCH']} Cache hit for {cache_context.display_url} | Status: {Fore.GREEN if bool(html) else Fore.RED}{bool(html)}{Style.RESET_ALL} | Time: {time.perf_counter() - start_time:.2f}s")
|
||||||
self.logger.url_status(
|
self.logger.url_status(
|
||||||
@@ -264,10 +270,12 @@ class AsyncWebCrawler:
|
|||||||
async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl(
|
async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl(
|
||||||
url,
|
url,
|
||||||
screenshot=screenshot,
|
screenshot=screenshot,
|
||||||
|
pdf=pdf,
|
||||||
**kwargs
|
**kwargs
|
||||||
)
|
)
|
||||||
html = sanitize_input_encode(async_response.html)
|
html = sanitize_input_encode(async_response.html)
|
||||||
screenshot_data = async_response.screenshot
|
screenshot_data = async_response.screenshot
|
||||||
|
pdf_data = async_response.pdf_data
|
||||||
t2 = time.perf_counter()
|
t2 = time.perf_counter()
|
||||||
self.logger.url_status(
|
self.logger.url_status(
|
||||||
url=cache_context.display_url,
|
url=cache_context.display_url,
|
||||||
@@ -289,6 +297,7 @@ class AsyncWebCrawler:
|
|||||||
content_filter=content_filter,
|
content_filter=content_filter,
|
||||||
css_selector=css_selector,
|
css_selector=css_selector,
|
||||||
screenshot=screenshot_data,
|
screenshot=screenshot_data,
|
||||||
|
pdf_data=pdf_data,
|
||||||
verbose=verbose,
|
verbose=verbose,
|
||||||
is_cached=bool(cached_result),
|
is_cached=bool(cached_result),
|
||||||
async_response=async_response,
|
async_response=async_response,
|
||||||
@@ -362,6 +371,7 @@ class AsyncWebCrawler:
|
|||||||
bypass_cache: bool = False,
|
bypass_cache: bool = False,
|
||||||
css_selector: str = None,
|
css_selector: str = None,
|
||||||
screenshot: bool = False,
|
screenshot: bool = False,
|
||||||
|
pdf: bool = False,
|
||||||
user_agent: str = None,
|
user_agent: str = None,
|
||||||
verbose=True,
|
verbose=True,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
@@ -550,6 +560,7 @@ class AsyncWebCrawler:
|
|||||||
)
|
)
|
||||||
|
|
||||||
screenshot = None if not screenshot else screenshot
|
screenshot = None if not screenshot else screenshot
|
||||||
|
pdf_data = kwargs.get("pdf_data", None)
|
||||||
|
|
||||||
|
|
||||||
if kwargs.get("prettiify", False):
|
if kwargs.get("prettiify", False):
|
||||||
@@ -567,6 +578,7 @@ class AsyncWebCrawler:
|
|||||||
links=links,
|
links=links,
|
||||||
metadata=metadata,
|
metadata=metadata,
|
||||||
screenshot=screenshot,
|
screenshot=screenshot,
|
||||||
|
pdf=pdf_data,
|
||||||
extracted_content=extracted_content,
|
extracted_content=extracted_content,
|
||||||
success=True,
|
success=True,
|
||||||
error_message="",
|
error_message="",
|
||||||
|
|||||||
@@ -56,4 +56,5 @@ MAX_METRICS_HISTORY = 1000
|
|||||||
|
|
||||||
NEED_MIGRATION = True
|
NEED_MIGRATION = True
|
||||||
URL_LOG_SHORTEN_LENGTH = 30
|
URL_LOG_SHORTEN_LENGTH = 30
|
||||||
SHOW_DEPRECATION_WARNINGS = True
|
SHOW_DEPRECATION_WARNINGS = True
|
||||||
|
SCREENSHOT_HEIGHT_TRESHOLD = 10000
|
||||||
@@ -23,6 +23,7 @@ class CrawlResult(BaseModel):
|
|||||||
links: Dict[str, List[Dict]] = {}
|
links: Dict[str, List[Dict]] = {}
|
||||||
downloaded_files: Optional[List[str]] = None
|
downloaded_files: Optional[List[str]] = None
|
||||||
screenshot: Optional[str] = None
|
screenshot: Optional[str] = None
|
||||||
|
pdf : Optional[bytes] = None
|
||||||
markdown: Optional[Union[str, MarkdownGenerationResult]] = None
|
markdown: Optional[Union[str, MarkdownGenerationResult]] = None
|
||||||
markdown_v2: Optional[MarkdownGenerationResult] = None
|
markdown_v2: Optional[MarkdownGenerationResult] = None
|
||||||
fit_markdown: Optional[str] = None
|
fit_markdown: Optional[str] = None
|
||||||
@@ -39,6 +40,7 @@ class AsyncCrawlResponse(BaseModel):
|
|||||||
response_headers: Dict[str, str]
|
response_headers: Dict[str, str]
|
||||||
status_code: int
|
status_code: int
|
||||||
screenshot: Optional[str] = None
|
screenshot: Optional[str] = None
|
||||||
|
pdf_data: Optional[bytes] = None
|
||||||
get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None
|
get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None
|
||||||
downloaded_files: Optional[List[str]] = None
|
downloaded_files: Optional[List[str]] = None
|
||||||
|
|
||||||
|
|||||||
117
docs/examples/tutorial_dynamic_clicks.md
Normal file
117
docs/examples/tutorial_dynamic_clicks.md
Normal file
@@ -0,0 +1,117 @@
|
|||||||
|
# Tutorial: Clicking Buttons to Load More Content with Crawl4AI
|
||||||
|
|
||||||
|
## Introduction
|
||||||
|
|
||||||
|
When scraping dynamic websites, it’s common to encounter “Load More” or “Next” buttons that must be clicked to reveal new content. Crawl4AI provides a straightforward way to handle these situations using JavaScript execution and waiting conditions. In this tutorial, we’ll cover two approaches:
|
||||||
|
|
||||||
|
1. **Step-by-step (Session-based) Approach:** Multiple calls to `arun()` to progressively load more content.
|
||||||
|
2. **Single-call Approach:** Execute a more complex JavaScript snippet inside a single `arun()` call to handle all clicks at once before the extraction.
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
- A working installation of Crawl4AI
|
||||||
|
- Basic familiarity with Python’s `async`/`await` syntax
|
||||||
|
|
||||||
|
## Step-by-Step Approach
|
||||||
|
|
||||||
|
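Use a session ID to maintain state across multiple `arun()` calls. Because the snippet uses `async with` and `await`, run it inside an `async def` function (for example via `asyncio.run(main())`):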
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||||
|
|
||||||
|
js_code = [
|
||||||
|
# This JS finds the “Next” button and clicks it
|
||||||
|
"const nextButton = document.querySelector('button.next'); nextButton && nextButton.click();"
|
||||||
|
]
|
||||||
|
|
||||||
|
wait_for_condition = "css:.new-content-class"
|
||||||
|
|
||||||
|
async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
|
||||||
|
# 1. Load the initial page
|
||||||
|
result_initial = await crawler.arun(
|
||||||
|
url="https://example.com",
|
||||||
|
cache_mode=CacheMode.BYPASS,
|
||||||
|
session_id="my_session"
|
||||||
|
)
|
||||||
|
|
||||||
|
# 2. Click the 'Next' button and wait for new content
|
||||||
|
result_next = await crawler.arun(
|
||||||
|
url="https://example.com",
|
||||||
|
session_id="my_session",
|
||||||
|
js_code=js_code,
|
||||||
|
wait_for=wait_for_condition,
|
||||||
|
js_only=True,
|
||||||
|
cache_mode=CacheMode.BYPASS
|
||||||
|
)
|
||||||
|
|
||||||
|
# `result_next` now contains the updated HTML after clicking 'Next'
|
||||||
|
```
|
||||||
|
|
||||||
|
**Key Points:**
|
||||||
|
- **`session_id`**: Keeps the same browser context open.
|
||||||
|
- **`js_code`**: Executes JavaScript in the context of the already loaded page.
|
||||||
|
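- **`wait_for`**: Makes the crawler wait for the given condition before returning (here, for an element matching the CSS selector `.new-content-class` to appear), so the result includes the newly loaded content.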
|
||||||
|
- **`js_only=True`**: Runs the JS in the current session without reloading the page.
|
||||||
|
|
||||||
|
By repeating the `arun()` call multiple times and modifying the `js_code` (e.g., clicking different modules or pages), you can iteratively load all the desired content.
|
||||||
|
|
||||||
|
## Single-call Approach
|
||||||
|
|
||||||
|
If the page allows it, you can run a single `arun()` call with a more elaborate JavaScript snippet that:
|
||||||
|
- Iterates over all the modules or "Next" buttons
|
||||||
|
- Clicks them one by one
|
||||||
|
- Waits for content updates between each click
|
||||||
|
- Once done, returns control to Crawl4AI for extraction.
|
||||||
|
|
||||||
|
Example snippet:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||||
|
|
||||||
|
js_code = [
|
||||||
|
# Example JS that clicks multiple modules:
|
||||||
|
"""
|
||||||
|
(async () => {
|
||||||
|
const modules = document.querySelectorAll('.module-item');
|
||||||
|
for (let i = 0; i < modules.length; i++) {
|
||||||
|
modules[i].scrollIntoView();
|
||||||
|
modules[i].click();
|
||||||
|
// Wait for each module’s content to load, adjust 100ms as needed
|
||||||
|
await new Promise(r => setTimeout(r, 100));
|
||||||
|
}
|
||||||
|
})();
|
||||||
|
"""
|
||||||
|
]
|
||||||
|
|
||||||
|
async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
|
||||||
|
result = await crawler.arun(
|
||||||
|
url="https://example.com",
|
||||||
|
js_code=js_code,
|
||||||
|
wait_for="css:.final-loaded-content-class",
|
||||||
|
cache_mode=CacheMode.BYPASS
|
||||||
|
)
|
||||||
|
|
||||||
|
# `result` now contains all content after all modules have been clicked in one go.
|
||||||
|
```
|
||||||
|
|
||||||
|
**Key Points:**
|
||||||
|
- All interactions (clicks and waits) happen before the extraction.
|
||||||
|
- Ideal for pages where all steps can be done in a single pass.
|
||||||
|
|
||||||
|
## Choosing the Right Approach
|
||||||
|
|
||||||
|
- **Step-by-Step (Session-based)**:
|
||||||
|
- Good when you need fine-grained control or must dynamically check conditions before clicking the next page.
|
||||||
|
- Useful if the page requires multiple conditions checked at runtime.
|
||||||
|
|
||||||
|
- **Single-call**:
|
||||||
|
- Perfect if the sequence of interactions is known in advance.
|
||||||
|
- Cleaner code if the page’s structure is consistent and predictable.
|
||||||
|
|
||||||
|
## Conclusion
|
||||||
|
|
||||||
|
Crawl4AI makes it easy to handle dynamic content:
|
||||||
|
- Use session IDs and multiple `arun()` calls for stepwise crawling.
|
||||||
|
- Or pack all actions into one `arun()` call if the interactions are well-defined upfront.
|
||||||
|
|
||||||
|
This flexibility ensures you can handle a wide range of dynamic web pages efficiently.
|
||||||
Reference in New Issue
Block a user