- Fix crawler text mode for improved performance; cover missing `srcset` and `data_srcset` attributes in image tags. - Introduced Managed Browsers for enhanced crawling experience. - Updated documentation for clearer navigation on configuration. - Changed 'text_only' to 'text_mode' in configuration and methods. - Improved performance and relevance in content filtering strategies.
5.8 KiB
5.8 KiB
Page Interaction
Crawl4AI provides powerful features for interacting with dynamic webpages, handling JavaScript execution, and managing page events.
JavaScript Execution
Basic Execution
from crawl4ai.async_configs import CrawlerRunConfig
# Single JavaScript command
config = CrawlerRunConfig(
js_code="window.scrollTo(0, document.body.scrollHeight);"
)
result = await crawler.arun(url="https://example.com", config=config)
# Multiple commands
js_commands = [
"window.scrollTo(0, document.body.scrollHeight);",
"document.querySelector('.load-more').click();",
"document.querySelector('#consent-button').click();"
]
config = CrawlerRunConfig(js_code=js_commands)
result = await crawler.arun(url="https://example.com", config=config)
Wait Conditions
CSS-Based Waiting
Wait for elements to appear:
config = CrawlerRunConfig(wait_for="css:.dynamic-content") # Wait for element with class 'dynamic-content'
result = await crawler.arun(url="https://example.com", config=config)
JavaScript-Based Waiting
Wait for custom conditions:
# Wait for number of elements
wait_condition = """() => {
return document.querySelectorAll('.item').length > 10;
}"""
config = CrawlerRunConfig(wait_for=f"js:{wait_condition}")
result = await crawler.arun(url="https://example.com", config=config)
# Wait for dynamic content to load
wait_for_content = """() => {
const content = document.querySelector('.content');
return content && content.innerText.length > 100;
}"""
config = CrawlerRunConfig(wait_for=f"js:{wait_for_content}")
result = await crawler.arun(url="https://example.com", config=config)
Handling Dynamic Content
Load More Content
Handle infinite scroll or load more buttons:
config = CrawlerRunConfig(
js_code=[
"window.scrollTo(0, document.body.scrollHeight);", # Scroll to bottom
"const loadMore = document.querySelector('.load-more'); if(loadMore) loadMore.click();" # Click load more
],
wait_for="js:() => document.querySelectorAll('.item').length > previousCount" # Wait for new content
)
result = await crawler.arun(url="https://example.com", config=config)
Form Interaction
Handle forms and inputs:
js_form_interaction = """
document.querySelector('#search').value = 'search term'; // Fill form fields
document.querySelector('form').submit(); // Submit form
"""
config = CrawlerRunConfig(
js_code=js_form_interaction,
wait_for="css:.results" # Wait for results to load
)
result = await crawler.arun(url="https://example.com", config=config)
Timing Control
Delays and Timeouts
Control timing of interactions:
config = CrawlerRunConfig(
page_timeout=60000, # Page load timeout (ms)
delay_before_return_html=2.0 # Wait before capturing content
)
result = await crawler.arun(url="https://example.com", config=config)
Complex Interactions Example
Here's an example of handling a dynamic page with multiple interactions:
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
async def crawl_dynamic_content():
async with AsyncWebCrawler() as crawler:
# Initial page load
config = CrawlerRunConfig(
js_code="document.querySelector('.cookie-accept')?.click();", # Handle cookie consent
wait_for="css:.main-content"
)
result = await crawler.arun(url="https://example.com", config=config)
# Load more content
session_id = "dynamic_session" # Keep session for multiple interactions
for page in range(3): # Load 3 pages of content
config = CrawlerRunConfig(
session_id=session_id,
js_code=[
"window.scrollTo(0, document.body.scrollHeight);", # Scroll to bottom
"window.previousCount = document.querySelectorAll('.item').length;", # Store item count
"document.querySelector('.load-more')?.click();" # Click load more
],
wait_for="""() => {
const currentCount = document.querySelectorAll('.item').length;
return currentCount > window.previousCount;
}""",
js_only=(page > 0) # Execute JS without reloading page for subsequent interactions
)
result = await crawler.arun(url="https://example.com", config=config)
print(f"Page {page + 1} items:", len(result.cleaned_html))
# Clean up session
await crawler.crawler_strategy.kill_session(session_id)
Using with Extraction Strategies
Combine page interaction with structured extraction:
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy
from crawl4ai.async_configs import CrawlerRunConfig
# Pattern-based extraction after interaction
schema = {
"name": "Dynamic Items",
"baseSelector": ".item",
"fields": [
{"name": "title", "selector": "h2", "type": "text"},
{"name": "description", "selector": ".desc", "type": "text"}
]
}
config = CrawlerRunConfig(
js_code="window.scrollTo(0, document.body.scrollHeight);",
wait_for="css:.item:nth-child(10)", # Wait for 10 items
extraction_strategy=JsonCssExtractionStrategy(schema)
)
result = await crawler.arun(url="https://example.com", config=config)
# Or use LLM to analyze dynamic content
class ContentAnalysis(BaseModel):
topics: List[str]
summary: str
config = CrawlerRunConfig(
js_code="document.querySelector('.show-more').click();",
wait_for="css:.full-content",
extraction_strategy=LLMExtractionStrategy(
provider="ollama/nemotron",
schema=ContentAnalysis.schema(),
instruction="Analyze the full content"
)
)
result = await crawler.arun(url="https://example.com", config=config)