This commit introduces a comprehensive set of new scripts and examples that extend the scripting capabilities of the crawl4ai project. The changes add several Python modules for compiling and executing scripts, together with a variety of example scripts demonstrating functionality such as login flows, data extraction, and multi-step workflows. Detailed documentation has also been written to guide users in using these new features effectively. The significant modifications are: - Added three core scripting files. - Created a new documentation file that gives an overview of the new features. - Introduced multiple example scripts in a dedicated examples directory to showcase various use cases. - Updated two existing files to integrate the new functionality. - Added font assets for improved documentation presentation. Together, these changes significantly expand the functionality of the crawl4ai project, allowing users to create more complex and varied scripts with ease.
285 lines
7.6 KiB
Python
285 lines
7.6 KiB
Python
"""
|
|
Demonstration of C4A-Script integration with Crawl4AI.
Shows various use cases and features.
|
|
"""
|
|
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
|
from crawl4ai import c4a_compile, CompilationResult
|
|
|
|
async def example_basic_usage():
    """Example 1: attach a small C4A-Script to a ``CrawlerRunConfig``.

    Prints a banner, builds a config whose ``c4a_script`` waits for the page
    body, dismisses a cookie banner, scrolls, and clicks a "load more"
    button, then reports how many entries ``config.js_code`` holds.
    No crawl is actually performed.
    """
    rule = "=" * 60
    print("\n" + rule, "Example 1: Basic C4A-Script Usage", rule, sep="\n")

    # The automation script, written in the C4A-Script DSL.
    script_text = """
# Wait for page to load
WAIT `body` 2

# Handle cookie banner if present
IF (EXISTS `.cookie-banner`) THEN CLICK `.accept-btn`

# Scroll down to load more content
SCROLL DOWN 500
WAIT 1

# Click load more button if exists
IF (EXISTS `.load-more`) THEN CLICK `.load-more`
"""

    # Hand the script to the crawler configuration; js_code is read back
    # below, so the config is presumably populated from the script here.
    run_config = CrawlerRunConfig(
        url="https://example.com",
        c4a_script=script_text,
        wait_for="css:.content",
        verbose=False,
    )

    print("✅ C4A Script compiled successfully!")
    command_count = len(run_config.js_code)
    print(f"Generated {command_count} JavaScript commands")

    # In production, you would run:
    # async with AsyncWebCrawler() as crawler:
    #     result = await crawler.arun(config=run_config)
    # NOTE(review): confirm whether arun() also needs the URL passed explicitly.
|
|
|
|
|
|
async def example_form_filling():
    """Example 2: describe a contact-form submission in C4A-Script.

    Prints a banner, builds a ``CrawlerRunConfig`` for the contact page with
    a script that sets two variables, types them into the email and message
    fields, submits the form and waits for a success message, then prints a
    summary of the steps. No crawl is actually performed.
    """
    rule = "=" * 60
    print("\n" + rule, "Example 2: Form Filling with C4A-Script", rule, sep="\n")

    # Script that fills in and submits the form.
    form_script = """
# Set form values
SET email = "test@example.com"
SET message = "This is a test message"

# Fill the form
CLICK `#email-input`
TYPE $email

CLICK `#message-textarea`
TYPE $message

# Submit the form
CLICK `button[type="submit"]`

# Wait for success message
WAIT `.success-message` 10
"""

    # Built only to show how the script is attached; the value is not used
    # further in this demo.
    form_config = CrawlerRunConfig(
        url="https://example.com/contact",
        c4a_script=form_script,
    )

    summary_lines = (
        "✅ Form filling script ready",
        "Script will:",
        " - Fill email field",
        " - Fill message textarea",
        " - Submit form",
        " - Wait for confirmation",
    )
    for line in summary_lines:
        print(line)
|
|
|
|
|
|
async def example_dynamic_loading():
    """Example 3: load dynamically paginated content with C4A-Script.

    Prints a banner and builds a ``CrawlerRunConfig`` for a product listing
    whose script keeps clicking ``.load-more`` while that element exists and
    then logs the number of ``.product`` elements. A screenshot is requested
    in the config. No crawl is actually performed.
    """
    rule = "=" * 60
    print("\n" + rule, "Example 3: Dynamic Content Loading", rule, sep="\n")

    # Script for "Load More" pagination; an infinite-scroll alternative is
    # left commented out inside the script itself.
    pagination_script = """
# Initial wait
WAIT `.product-list` 5

# Load all products by clicking "Load More" repeatedly
REPEAT (CLICK `.load-more`, `document.querySelector('.load-more') !== null`)

# Alternative: Scroll to load (infinite scroll)
# REPEAT (SCROLL DOWN 1000, `document.querySelectorAll('.product').length < 100`)

# Extract count
EVAL `console.log('Products loaded: ' + document.querySelectorAll('.product').length)`
"""

    listing_config = CrawlerRunConfig(
        url="https://example.com/products",
        c4a_script=pagination_script,
        screenshot=True,  # capture the final page state
    )

    print("✅ Dynamic loading script ready")
    print("Script will load all products by repeatedly clicking 'Load More'")
|
|
|
|
|
|
async def example_multi_step_workflow():
    """Example 4: a multi-step workflow built from reusable procedures.

    Prints a banner, compiles a script that defines ``login`` and
    ``search_product`` procedures and then runs them, and checks the result
    of ``c4a_compile``. On success it prints a summary and builds a
    ``CrawlerRunConfig`` with the same script; on failure it prints the line
    and message of ``result.first_error``.
    """
    rule = "=" * 60
    print(
        "\n" + rule,
        "Example 4: Multi-Step Workflow with Procedures",
        rule,
        sep="\n",
    )

    # Workflow script: two procedures followed by the main flow.
    workflow_script = """
# Define login procedure
PROC login
CLICK `#username`
TYPE "demo_user"
CLICK `#password`
TYPE "demo_pass"
CLICK `#login-btn`
WAIT `.dashboard` 10
ENDPROC

# Define search procedure
PROC search_product
CLICK `.search-box`
TYPE "laptop"
PRESS Enter
WAIT `.search-results` 5
ENDPROC

# Main workflow
GO https://example.com
login
search_product

# Process results
IF (EXISTS `.no-results`) THEN EVAL `console.log('No products found')`
ELSE REPEAT (CLICK `.add-to-cart`, 3)
"""

    # Compile up front so that script errors can be reported with details.
    compiled = c4a_compile(workflow_script)

    if not compiled.success:
        print("❌ Compilation error:")
        failure = compiled.first_error
        print(f" Line {failure.line}: {failure.message}")
        return

    for line in (
        "✅ Complex workflow compiled successfully!",
        "Workflow includes:",
        " - Login procedure",
        " - Product search",
        " - Conditional cart additions",
    ):
        print(line)

    # Built only to show how the checked script is attached to a config.
    workflow_config = CrawlerRunConfig(
        url="https://example.com",
        c4a_script=workflow_script,
    )
|
|
|
|
|
|
async def example_error_handling():
    """Example 5: show how an invalid C4A-Script is reported.

    Prints a banner, tries to build a ``CrawlerRunConfig`` from a script
    written without backticks and without ``THEN``, and prints the
    ``ValueError`` if one is raised. It then builds a config from the
    corrected script and prints a success message.
    """
    rule = "=" * 60
    print("\n" + rule, "Example 5: Error Handling", rule, sep="\n")

    # Intentionally malformed: selectors are not quoted and THEN is missing.
    broken_script = """
WAIT body 2
CLICK button
IF (EXISTS .modal) CLICK .close
"""

    try:
        broken_config = CrawlerRunConfig(
            url="https://example.com",
            c4a_script=broken_script,
        )
    except ValueError as exc:
        print("✅ Error caught as expected:", f" {exc}", sep="\n")

    # Same steps, this time written with valid syntax.
    fixed_script = """
WAIT `body` 2
CLICK `button`
IF (EXISTS `.modal`) THEN CLICK `.close`
"""

    fixed_config = CrawlerRunConfig(
        url="https://example.com",
        c4a_script=fixed_script,
    )

    print("\n✅ Fixed script compiled successfully!")
|
|
|
|
|
|
async def example_combining_with_extraction():
    """Example 6: pair a C4A-Script with a CSS-based extraction strategy.

    Prints a banner and builds a ``CrawlerRunConfig`` that combines a
    page-preparation script (expand sections, load comments, close popups)
    with a ``JsonCssExtractionStrategy`` describing an article and its
    comments, then prints the workflow steps. No crawl is actually performed.
    """
    rule = "=" * 60
    print(
        "\n" + rule,
        "Example 6: C4A-Script + Extraction Strategies",
        rule,
        sep="\n",
    )

    from crawl4ai import JsonCssExtractionStrategy

    # Script that gets the page into an extractable state.
    prep_script = """
# Expand all collapsed sections
REPEAT (CLICK `.expand-btn`, `document.querySelectorAll('.expand-btn:not(.expanded)').length > 0`)

# Load all comments
IF (EXISTS `.load-comments`) THEN CLICK `.load-comments`
WAIT `.comments-section` 5

# Close any popups
IF (EXISTS `.popup-close`) THEN CLICK `.popup-close`
"""

    # Extraction schema, assembled inside-out: comment fields first, then
    # the article fields that embed them.
    comment_fields = {
        "author": {"selector": ".author", "type": "text"},
        "text": {"selector": ".text", "type": "text"},
    }
    article_fields = {
        "title": {"selector": "h1", "type": "text"},
        "content": {"selector": ".content", "type": "text"},
        "comments": {
            "selector": ".comment",
            "type": "list",
            "fields": comment_fields,
        },
    }
    schema = {
        "name": "article",
        "selector": "article.main",
        "fields": article_fields,
    }

    strategy = JsonCssExtractionStrategy(schema)
    article_config = CrawlerRunConfig(
        url="https://example.com/article",
        c4a_script=prep_script,
        extraction_strategy=strategy,
        wait_for="css:.comments-section",
    )

    for line in (
        "✅ Combined C4A + Extraction ready",
        "Workflow:",
        " 1. Expand collapsed sections",
        " 2. Load comments",
        " 3. Extract structured data",
    ):
        print(line)
|
|
|
|
|
|
async def main():
    """Run every example in order and print opening and closing messages."""
    print("\n🚀 C4A-Script + Crawl4AI Integration Demo\n")

    # The examples are awaited one after another, in this order.
    examples = (
        example_basic_usage,
        example_form_filling,
        example_dynamic_loading,
        example_multi_step_workflow,
        example_error_handling,
        example_combining_with_extraction,
    )
    for run_example in examples:
        await run_example()

    rule = "=" * 60
    print("\n" + rule, "✅ All examples completed successfully!", rule, sep="\n")

    print("\nTo run actual crawls, uncomment the AsyncWebCrawler sections")
    print("or create your own scripts using these examples as templates.")
|
|
|
|
|
|
# Script entry point: run the demo only when this file is executed directly.
if __name__ == "__main__":
    asyncio.run(main())