## Browser, Crawler & LLM Configuration

Core configuration classes for controlling browser behavior, crawl operations, LLM providers, and understanding crawl results.

### BrowserConfig - Browser Environment Setup

```python
from crawl4ai import BrowserConfig, AsyncWebCrawler

# Basic browser configuration
browser_config = BrowserConfig(
    browser_type="chromium",        # "chromium", "firefox", "webkit"
    headless=True,                  # False for visible browser (debugging)
    viewport_width=1280,
    viewport_height=720,
    verbose=True
)

# Advanced browser setup with proxy and persistence
browser_config = BrowserConfig(
    headless=False,
    proxy="http://user:pass@proxy:8080",
    use_persistent_context=True,
    user_data_dir="./browser_data",
    cookies=[
        {"name": "session", "value": "abc123", "domain": "example.com"}
    ],
    headers={"Accept-Language": "en-US,en;q=0.9"},
    user_agent="Mozilla/5.0 (X11; Linux x86_64) Chrome/116.0.0.0 Safari/537.36",
    text_mode=True,                 # Disable images for faster crawling
    extra_args=["--disable-extensions", "--no-sandbox"]
)

async with AsyncWebCrawler(config=browser_config) as crawler:
    result = await crawler.arun("https://example.com")
```

### CrawlerRunConfig - Crawl Operation Control

```python
from crawl4ai import CrawlerRunConfig, CacheMode
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import PruningContentFilter

# Basic crawl configuration
run_config = CrawlerRunConfig(
    cache_mode=CacheMode.BYPASS,
    word_count_threshold=10,
    excluded_tags=["nav", "footer", "script"],
    exclude_external_links=True,
    screenshot=True,
    pdf=True
)

# Advanced content processing
md_generator = DefaultMarkdownGenerator(
    content_filter=PruningContentFilter(threshold=0.6),
    options={"citations": True, "ignore_links": False}
)

run_config = CrawlerRunConfig(
    # Content processing
    markdown_generator=md_generator,
    css_selector="main.content",            # Focus on specific content
    target_elements=[".article", ".post"],  # Multiple target selectors
    process_iframes=True,
    remove_overlay_elements=True,

    # Page interaction
    js_code=[
        "window.scrollTo(0, document.body.scrollHeight);",
        "document.querySelector('.load-more')?.click();"
    ],
    wait_for="css:.content-loaded",
    wait_for_timeout=10000,
    scan_full_page=True,

    # Session management
    session_id="persistent_session",

    # Media handling
    screenshot=True,
    pdf=True,
    capture_mhtml=True,
    image_score_threshold=5,

    # Advanced options
    simulate_user=True,
    magic=True,                             # Auto-handle popups
    verbose=True
)
```
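Because the advanced `run_config` above attaches a `PruningContentFilter`, the filtered output lands in `result.markdown.fit_markdown` alongside the unfiltered `raw_markdown` (both fields are covered in the CrawlResult section below). A minimal sketch comparing the two; the URL is a placeholder:

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import PruningContentFilter

async def compare_markdown_outputs():
    # Same idea as the advanced run_config above, reduced to the filter bits
    config = CrawlerRunConfig(
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(threshold=0.6)
        )
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://example.com/article", config=config)
        if result.success:
            # raw_markdown: full page; fit_markdown: what survived the pruning filter
            print(f"Raw markdown: {len(result.markdown.raw_markdown)} chars")
            if result.markdown.fit_markdown:
                print(f"Fit markdown: {len(result.markdown.fit_markdown)} chars")

asyncio.run(compare_markdown_outputs())
```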
### CrawlerRunConfig Parameters by Category

```python
# Content Processing
config = CrawlerRunConfig(
    word_count_threshold=10,                  # Min words per content block
    css_selector="main.article",              # Focus on specific content
    target_elements=[".post", ".content"],    # Multiple target selectors
    excluded_tags=["nav", "footer"],          # Remove these tags
    excluded_selector="#ads, .tracker",       # Remove by selector
    only_text=True,                           # Text-only extraction
    keep_data_attributes=True,                # Preserve data-* attributes
    remove_forms=True,                        # Remove all forms
    process_iframes=True                      # Include iframe content
)

# Page Navigation & Timing
config = CrawlerRunConfig(
    wait_until="networkidle",                 # Wait condition
    page_timeout=60000,                       # 60 second timeout
    wait_for="css:.loaded",                   # Wait for specific element
    wait_for_images=True,                     # Wait for images to load
    delay_before_return_html=0.5,             # Final delay before capture
    semaphore_count=10                        # Max concurrent operations
)

# Page Interaction
config = CrawlerRunConfig(
    js_code="document.querySelector('button').click();",
    scan_full_page=True,                      # Auto-scroll page
    scroll_delay=0.3,                         # Delay between scrolls
    remove_overlay_elements=True,             # Remove popups/modals
    simulate_user=True,                       # Simulate human behavior
    override_navigator=True,                  # Override navigator properties
    magic=True                                # Auto-handle common patterns
)

# Caching & Session
config = CrawlerRunConfig(
    cache_mode=CacheMode.BYPASS,              # Cache behavior
    session_id="my_session",                  # Persistent session
    shared_data={"context": "value"}          # Share data between hooks
)

# Media & Output
config = CrawlerRunConfig(
    screenshot=True,                          # Capture screenshot
    pdf=True,                                 # Generate PDF
    capture_mhtml=True,                       # Capture MHTML archive
    image_score_threshold=3,                  # Filter low-quality images
    exclude_external_images=True              # Remove external images
)

# Link & Domain Filtering
config = CrawlerRunConfig(
    exclude_external_links=True,                # Remove external links
    exclude_social_media_links=True,            # Remove social media links
    exclude_domains=["ads.com", "tracker.io"],  # Custom domain filter
    exclude_internal_links=False                # Keep internal links
)
```

### LLMConfig - Language Model Setup

```python
import os

from crawl4ai import LLMConfig

# OpenAI configuration
llm_config = LLMConfig(
    provider="openai/gpt-4o-mini",
    api_token=os.getenv("OPENAI_API_KEY"),    # or "env:OPENAI_API_KEY"
    temperature=0.1,
    max_tokens=2000
)

# Local model with Ollama
llm_config = LLMConfig(
    provider="ollama/llama3.3",
    api_token=None,                           # Not needed for Ollama
    base_url="http://localhost:11434"         # Custom endpoint
)

# Anthropic Claude
llm_config = LLMConfig(
    provider="anthropic/claude-3-5-sonnet-20240620",
    api_token="env:ANTHROPIC_API_KEY",
    max_tokens=4000
)

# Google Gemini
llm_config = LLMConfig(
    provider="gemini/gemini-1.5-pro",
    api_token="env:GEMINI_API_KEY"
)

# Groq (fast inference)
llm_config = LLMConfig(
    provider="groq/llama3-70b-8192",
    api_token="env:GROQ_API_KEY"
)
```
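An `LLMConfig` does nothing on its own; it is handed to an LLM-backed strategy. A minimal sketch pairing it with `LLMExtractionStrategy` (imported from `crawl4ai`, as in the import pattern later in this document); the schema and instruction are illustrative placeholders:

```python
import asyncio
import json
import os

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig, LLMExtractionStrategy

async def llm_extraction_example():
    llm_config = LLMConfig(
        provider="openai/gpt-4o-mini",
        api_token=os.getenv("OPENAI_API_KEY")
    )

    # The strategy consumes the LLMConfig; schema and instruction are placeholders
    strategy = LLMExtractionStrategy(
        llm_config=llm_config,
        schema={"type": "object", "properties": {"title": {"type": "string"}}},
        extraction_type="schema",
        instruction="Extract the article title."
    )

    config = CrawlerRunConfig(extraction_strategy=strategy)

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://example.com/article", config=config)
        if result.success and result.extracted_content:
            # extracted_content is a JSON string
            print(json.dumps(json.loads(result.extracted_content), indent=2))

asyncio.run(llm_extraction_example())
```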
### CrawlResult - Understanding Output

```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async with AsyncWebCrawler() as crawler:
    result = await crawler.arun("https://example.com", config=run_config)

    # Basic status information
    print(f"Success: {result.success}")
    print(f"Status: {result.status_code}")
    print(f"URL: {result.url}")

    if not result.success:
        print(f"Error: {result.error_message}")
        return

    # HTML content variants
    print(f"Original HTML: {len(result.html)} chars")
    print(f"Cleaned HTML: {len(result.cleaned_html or '')} chars")

    # Markdown output (MarkdownGenerationResult)
    if result.markdown:
        print(f"Raw markdown: {len(result.markdown.raw_markdown)} chars")
        print(f"With citations: {len(result.markdown.markdown_with_citations)} chars")

        # Filtered content (if content filter was used)
        if result.markdown.fit_markdown:
            print(f"Fit markdown: {len(result.markdown.fit_markdown)} chars")
            print(f"Fit HTML: {len(result.markdown.fit_html)} chars")

    # Extracted structured data
    if result.extracted_content:
        import json
        data = json.loads(result.extracted_content)
        print(f"Extracted {len(data)} items")

    # Media and links
    images = result.media.get("images", [])
    print(f"Found {len(images)} images")
    for img in images[:3]:  # First 3 images
        print(f"  {img.get('src')} (score: {img.get('score', 0)})")

    internal_links = result.links.get("internal", [])
    external_links = result.links.get("external", [])
    print(f"Links: {len(internal_links)} internal, {len(external_links)} external")

    # Generated files
    if result.screenshot:
        print(f"Screenshot captured: {len(result.screenshot)} chars (base64)")
        # Save screenshot
        import base64
        with open("page.png", "wb") as f:
            f.write(base64.b64decode(result.screenshot))

    if result.pdf:
        print(f"PDF generated: {len(result.pdf)} bytes")
        with open("page.pdf", "wb") as f:
            f.write(result.pdf)

    if result.mhtml:
        print(f"MHTML captured: {len(result.mhtml)} chars")
        with open("page.mhtml", "w", encoding="utf-8") as f:
            f.write(result.mhtml)

    # SSL certificate information
    if result.ssl_certificate:
        print(f"SSL Issuer: {result.ssl_certificate.issuer}")
        print(f"Valid until: {result.ssl_certificate.valid_until}")

    # Network and console data (if captured)
    if result.network_requests:
        requests = [r for r in result.network_requests if r.get("event_type") == "request"]
        print(f"Network requests captured: {len(requests)}")

    if result.console_messages:
        errors = [m for m in result.console_messages if m.get("type") == "error"]
        print(f"Console messages: {len(result.console_messages)} ({len(errors)} errors)")

    # Session and metadata
    if result.session_id:
        print(f"Session ID: {result.session_id}")

    if result.metadata:
        print(f"Metadata: {result.metadata.get('title', 'No title')}")
```

### Configuration Helpers and Best Practices

```python
# Clone configurations for variations
base_config = CrawlerRunConfig(
    cache_mode=CacheMode.ENABLED,
    word_count_threshold=200,
    verbose=True
)

# Create streaming version
stream_config = base_config.clone(
    stream=True,
    cache_mode=CacheMode.BYPASS
)

# Create debug version
debug_config = base_config.clone(
    headless=False,
    page_timeout=120000,
    verbose=True
)

# Serialize/deserialize configurations
config_dict = base_config.dump()                      # Convert to dict
restored_config = CrawlerRunConfig.load(config_dict)  # Restore from dict

# Browser configuration management
browser_config = BrowserConfig(headless=True, text_mode=True)
browser_dict = browser_config.to_dict()
cloned_browser = browser_config.clone(headless=False, verbose=True)
```

### Common Configuration Patterns

```python
# Fast text-only crawling
fast_config = CrawlerRunConfig(
    cache_mode=CacheMode.ENABLED,
    text_mode=True,
    exclude_external_links=True,
    exclude_external_images=True,
    word_count_threshold=50
)

# Comprehensive data extraction
comprehensive_config = CrawlerRunConfig(
    process_iframes=True,
    scan_full_page=True,
    wait_for_images=True,
    screenshot=True,
    capture_network_requests=True,
    capture_console_messages=True,
    magic=True
)

# Stealth crawling
stealth_config = CrawlerRunConfig(
    simulate_user=True,
    override_navigator=True,
    mean_delay=2.0,
    max_range=1.0,
    user_agent_mode="random"
)
```
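These pattern configs plug directly into `arun()` or `arun_many()`. A minimal sketch applying the fast text-only pattern to a small batch; non-streaming `arun_many()` returns a list of `CrawlResult` objects, and the URLs are placeholders:

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode

async def batch_with_pattern():
    # "Fast text-only" pattern from above; text_mode itself is a BrowserConfig
    # option (see the browser examples), so it is omitted here
    fast_config = CrawlerRunConfig(
        cache_mode=CacheMode.ENABLED,
        exclude_external_links=True,
        exclude_external_images=True,
        word_count_threshold=50
    )

    urls = ["https://example.com/a", "https://example.com/b"]

    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun_many(urls, config=fast_config)
        for result in results:
            status = "ok" if result.success else result.error_message
            print(f"{result.url}: {status}")

asyncio.run(batch_with_pattern())
```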
### Advanced Configuration Features

#### User Agent Management & Bot Detection Avoidance

```python
from crawl4ai import CrawlerRunConfig

# Random user agent generation
config = CrawlerRunConfig(
    user_agent_mode="random",
    user_agent_generator_config={
        "platform": "windows",      # "windows", "macos", "linux", "android", "ios"
        "browser": "chrome",        # "chrome", "firefox", "safari", "edge"
        "device_type": "desktop"    # "desktop", "mobile", "tablet"
    }
)

# Custom user agent with stealth features
config = CrawlerRunConfig(
    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    simulate_user=True,             # Simulate human mouse movements
    override_navigator=True,        # Override navigator properties
    mean_delay=1.5,                 # Random delays between actions
    max_range=2.0
)

# Combined anti-detection approach
stealth_config = CrawlerRunConfig(
    user_agent_mode="random",
    simulate_user=True,
    override_navigator=True,
    magic=True,                     # Auto-handle common bot detection patterns
    delay_before_return_html=2.0
)
```

#### Proxy Configuration with ProxyConfig

```python
from crawl4ai import CrawlerRunConfig, ProxyConfig, ProxyRotationStrategy

# Single proxy configuration
proxy_config = ProxyConfig(
    server="http://proxy.example.com:8080",
    username="proxy_user",
    password="proxy_pass"
)

# From proxy string format
proxy_config = ProxyConfig.from_string("192.168.1.100:8080:username:password")

# Multiple proxies with rotation
proxies = [
    ProxyConfig(server="http://proxy1.com:8080", username="user1", password="pass1"),
    ProxyConfig(server="http://proxy2.com:8080", username="user2", password="pass2"),
    ProxyConfig(server="http://proxy3.com:8080", username="user3", password="pass3")
]

rotation_strategy = ProxyRotationStrategy(
    proxies=proxies,
    rotation_method="round_robin"   # or "random", "least_used"
)

config = CrawlerRunConfig(
    proxy_config=proxy_config,
    proxy_rotation_strategy=rotation_strategy
)

# Load proxies from environment variable
proxies_from_env = ProxyConfig.from_env("MY_PROXIES")  # comma-separated proxy strings
```

#### Content Selection: css_selector vs target_elements

```python
from crawl4ai import CrawlerRunConfig

# css_selector: Extracts HTML at top level, affects entire processing
config = CrawlerRunConfig(
    css_selector="main.article, .content-area",  # Can be a list of selectors
    # Everything else (markdown, extraction, links) works only on this HTML subset
)

# target_elements: Focuses extraction within already processed HTML
config = CrawlerRunConfig(
    css_selector="body",            # First extract entire body
    target_elements=[               # Then focus extraction on these elements
        ".article-content",
        ".post-body",
        ".main-text"
    ],
    # Links and media come from the entire body, but markdown/extraction only from target_elements
)

# Hierarchical content selection
config = CrawlerRunConfig(
    css_selector=["#main-content", ".article-wrapper"],  # Top-level extraction
    target_elements=[               # Subset for processing
        ".article-title",
        ".article-body",
        ".article-metadata"
    ],
    excluded_selector="#sidebar, .ads, .comments"        # Remove these from selection
)
```
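To see the difference in practice: with `target_elements`, link and media extraction still covers the whole selected page while the markdown output is narrowed to the targets. A minimal sketch; the selector and URL are placeholders:

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async def targets_vs_selector_demo():
    config = CrawlerRunConfig(
        target_elements=[".article-content"]   # markdown/extraction focus only
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://example.com/article", config=config)
        if result.success:
            # Links are still collected from the whole page...
            print(f"Internal links found: {len(result.links.get('internal', []))}")
            # ...while markdown reflects only the targeted elements
            print(f"Markdown length (targets only): {len(result.markdown.raw_markdown)}")

asyncio.run(targets_vs_selector_demo())
```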
#### Advanced wait_for Conditions

```python
from crawl4ai import CrawlerRunConfig

# CSS selector waiting
config = CrawlerRunConfig(
    wait_for="css:.content-loaded",     # Wait for element to appear
    wait_for_timeout=15000
)

# JavaScript boolean expression waiting
config = CrawlerRunConfig(
    wait_for="js:() => window.dataLoaded === true",  # Custom JS condition
    wait_for_timeout=20000
)

# Complex JavaScript conditions
config = CrawlerRunConfig(
    wait_for="js:() => document.querySelectorAll('.item').length >= 10",
    js_code=[
        "document.querySelector('.load-more')?.click();",
        "window.scrollTo(0, document.body.scrollHeight);"
    ]
)

# Multiple conditions with JavaScript
config = CrawlerRunConfig(
    wait_for="js:() => !document.querySelector('.loading') && document.querySelector('.results')",
    page_timeout=30000
)
```

#### Session Management for Multi-Step Crawling

```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode

# Persistent session across multiple arun() calls
async def multi_step_crawling():
    async with AsyncWebCrawler() as crawler:
        # Step 1: Login page
        login_config = CrawlerRunConfig(
            session_id="user_session",      # Create persistent session
            js_code="document.querySelector('#username').value = 'user'; document.querySelector('#password').value = 'pass'; document.querySelector('#login').click();",
            wait_for="css:.dashboard",
            cache_mode=CacheMode.BYPASS
        )
        result1 = await crawler.arun("https://example.com/login", config=login_config)

        # Step 2: Navigate to protected area (reuses same browser page)
        nav_config = CrawlerRunConfig(
            session_id="user_session",      # Same session = same browser page
            js_only=True,                   # No page reload, just JS navigation
            js_code="window.location.href = '/dashboard/data';",
            wait_for="css:.data-table"
        )
        result2 = await crawler.arun("https://example.com/dashboard/data", config=nav_config)

        # Step 3: Extract data from multiple pages
        for page in range(1, 6):
            page_config = CrawlerRunConfig(
                session_id="user_session",
                js_only=True,
                js_code=f"document.querySelector('.page-{page}').click();",
                wait_for=f"js:() => document.querySelector('.page-{page}').classList.contains('active')"
            )
            result = await crawler.arun(f"https://example.com/data/page/{page}", config=page_config)
            print(f"Page {page} data extracted: {len(result.extracted_content)}")

        # Important: Kill session when done
        await crawler.crawler_strategy.kill_session("user_session")

# Session with shared data between steps
async def session_with_shared_data():
    shared_context = {"user_id": "12345", "preferences": {"theme": "dark"}}

    config = CrawlerRunConfig(
        session_id="persistent_session",
        shared_data=shared_context,         # Available across all session calls
        js_code="console.log('User ID:', window.sharedData.user_id);"
    )
```

#### Identity-Based Crawling Parameters

```python
from crawl4ai import CrawlerRunConfig, GeolocationConfig

# Locale and timezone simulation
config = CrawlerRunConfig(
    locale="en-US",                         # Browser language preference
    timezone_id="America/New_York",         # Timezone setting
    user_agent_mode="random",
    user_agent_generator_config={
        "platform": "windows",
        "locale": "en-US"
    }
)

# Geolocation simulation
geo_config = GeolocationConfig(
    latitude=40.7128,                       # New York coordinates
    longitude=-74.0060,
    accuracy=100.0
)

config = CrawlerRunConfig(
    geolocation=geo_config,
    locale="en-US",
    timezone_id="America/New_York"
)

# Complete identity simulation
identity_config = CrawlerRunConfig(
    # Location identity
    locale="fr-FR",
    timezone_id="Europe/Paris",
    geolocation=GeolocationConfig(latitude=48.8566, longitude=2.3522),

    # Browser identity
    user_agent_mode="random",
    user_agent_generator_config={
        "platform": "windows",
        "locale": "fr-FR",
        "browser": "chrome"
    },

    # Behavioral identity
    simulate_user=True,
    override_navigator=True,
    mean_delay=2.0,
    max_range=1.5
)
```

#### Simplified Import Pattern

```python
# Almost everything from crawl4ai main package
from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CrawlerRunConfig,
    LLMConfig,
    CacheMode,
    ProxyConfig,
    GeolocationConfig
)

# Specialized strategies (still from crawl4ai)
from crawl4ai import (
    JsonCssExtractionStrategy,
    LLMExtractionStrategy,
    DefaultMarkdownGenerator,
    PruningContentFilter,
    RegexChunking
)

# Complete example with simplified imports
async def example_crawl():
    browser_config = BrowserConfig(headless=True)

    run_config = CrawlerRunConfig(
        user_agent_mode="random",
        proxy_config=ProxyConfig.from_string("192.168.1.1:8080:user:pass"),
        css_selector="main.content",
        target_elements=[".article", ".post"],
        wait_for="js:() => document.querySelector('.loaded')",
        session_id="my_session",
        simulate_user=True
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun("https://example.com", config=run_config)
        return result
```

## Advanced Features

Comprehensive guide to advanced crawling capabilities including file handling, authentication, dynamic content, monitoring, and session management.
### File Download Handling

```python
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
import os

# Enable downloads with custom path
downloads_path = os.path.join(os.getcwd(), "my_downloads")
os.makedirs(downloads_path, exist_ok=True)

browser_config = BrowserConfig(
    accept_downloads=True,
    downloads_path=downloads_path
)

# Trigger downloads with JavaScript
async def download_files():
    async with AsyncWebCrawler(config=browser_config) as crawler:
        config = CrawlerRunConfig(
            js_code="""
                // Click download links
                const downloadLinks = document.querySelectorAll('a[href$=".pdf"]');
                for (const link of downloadLinks) {
                    link.click();
                    await new Promise(r => setTimeout(r, 2000));  // Delay between downloads
                }
            """,
            wait_for=5  # Wait for downloads to start
        )

        result = await crawler.arun("https://example.com/downloads", config=config)

        if result.downloaded_files:
            print("Downloaded files:")
            for file_path in result.downloaded_files:
                print(f"- {file_path} ({os.path.getsize(file_path)} bytes)")
```

### Hooks & Authentication

```python
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from playwright.async_api import Page, BrowserContext

async def advanced_crawler_with_hooks():
    browser_config = BrowserConfig(headless=True, verbose=True)
    crawler = AsyncWebCrawler(config=browser_config)

    # Hook functions for different stages
    async def on_browser_created(browser, **kwargs):
        print("[HOOK] Browser created successfully")
        return browser

    async def on_page_context_created(page: Page, context: BrowserContext, **kwargs):
        print("[HOOK] Setting up page & context")

        # Block images for faster crawling
        async def route_filter(route):
            if route.request.resource_type == "image":
                await route.abort()
            else:
                await route.continue_()

        await context.route("**", route_filter)

        # Simulate login if needed
        # await page.goto("https://example.com/login")
        # await page.fill("input[name='username']", "testuser")
        # await page.fill("input[name='password']", "password123")
        # await page.click("button[type='submit']")

        await page.set_viewport_size({"width": 1080, "height": 600})
        return page

    async def before_goto(page: Page, context: BrowserContext, url: str, **kwargs):
        print(f"[HOOK] About to navigate to: {url}")
        await page.set_extra_http_headers({"Custom-Header": "my-value"})
        return page

    async def after_goto(page: Page, context: BrowserContext, url: str, response, **kwargs):
        print(f"[HOOK] Successfully loaded: {url}")
        try:
            await page.wait_for_selector('.content', timeout=1000)
            print("[HOOK] Content found!")
        except Exception:
            print("[HOOK] Content not found, continuing")
        return page

    async def before_retrieve_html(page: Page, context: BrowserContext, **kwargs):
        print("[HOOK] Final actions before HTML retrieval")
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
        return page

    # Attach hooks
    crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created)
    crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created)
    crawler.crawler_strategy.set_hook("before_goto", before_goto)
    crawler.crawler_strategy.set_hook("after_goto", after_goto)
    crawler.crawler_strategy.set_hook("before_retrieve_html", before_retrieve_html)

    await crawler.start()

    config = CrawlerRunConfig()
    result = await crawler.arun("https://example.com", config=config)

    if result.success:
        print(f"Crawled successfully: {len(result.html)} chars")

    await crawler.close()
```
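For authenticated crawls it is often easier to log in once and reuse the saved browser state instead of re-running the login hook on every crawl. A minimal sketch, under the assumption that your crawl4ai version exposes a Playwright-style `storage_state` option on `BrowserConfig`; the state file path is a placeholder:

```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig

# Assumption: BrowserConfig accepts a Playwright storage_state file
# (cookies + localStorage exported from a previous logged-in session).
authed_browser = BrowserConfig(
    headless=True,
    storage_state="./auth_state.json"   # hypothetical path to exported state
)

async def crawl_as_logged_in_user():
    async with AsyncWebCrawler(config=authed_browser) as crawler:
        result = await crawler.arun("https://example.com/account")
        if result.success:
            print(f"Fetched account page: {len(result.html)} chars")

asyncio.run(crawl_as_logged_in_user())
```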
### Lazy Loading & Dynamic Content

```python
# Handle lazy-loaded images and infinite scroll
async def handle_lazy_loading():
    config = CrawlerRunConfig(
        # Wait for images to fully load
        wait_for_images=True,

        # Automatically scroll entire page to trigger lazy loading
        scan_full_page=True,
        scroll_delay=0.5,               # Delay between scroll steps

        # JavaScript for custom lazy loading
        js_code="""
            // Scroll and wait for content to load
            window.scrollTo(0, document.body.scrollHeight);

            // Click "Load More" if available
            const loadMoreBtn = document.querySelector('.load-more');
            if (loadMoreBtn) {
                loadMoreBtn.click();
            }
        """,

        # Wait for specific content to appear
        wait_for="css:.lazy-content:nth-child(20)",  # Wait for 20 items

        # Exclude external images to focus on main content
        exclude_external_images=True
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://example.com/gallery", config=config)

        if result.success:
            images = result.media.get("images", [])
            print(f"Loaded {len(images)} images after lazy loading")
            for img in images[:3]:
                print(f"- {img.get('src')} (score: {img.get('score', 'N/A')})")
```

### Network & Console Monitoring

```python
# Capture all network requests and console messages for debugging
async def monitor_network_and_console():
    config = CrawlerRunConfig(
        capture_network_requests=True,
        capture_console_messages=True
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://example.com", config=config)

        if result.success:
            # Analyze network requests
            if result.network_requests:
                requests = [r for r in result.network_requests if r.get("event_type") == "request"]
                responses = [r for r in result.network_requests if r.get("event_type") == "response"]
                failures = [r for r in result.network_requests if r.get("event_type") == "request_failed"]

                print(f"Network activity: {len(requests)} requests, {len(responses)} responses, {len(failures)} failures")

                # Find API calls
                api_calls = [r for r in requests if "api" in r.get("url", "")]
                print(f"API calls detected: {len(api_calls)}")

                # Show failed requests
                for failure in failures[:3]:
                    print(f"Failed: {failure.get('url')} - {failure.get('failure_text')}")

            # Analyze console messages
            if result.console_messages:
                message_types = {}
                for msg in result.console_messages:
                    msg_type = msg.get("type", "unknown")
                    message_types[msg_type] = message_types.get(msg_type, 0) + 1

                print(f"Console messages: {message_types}")

                # Show errors
                errors = [msg for msg in result.console_messages if msg.get("type") == "error"]
                for error in errors[:2]:
                    print(f"JS Error: {error.get('text', '')[:100]}")
```
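The captured requests and console messages are plain lists of dicts (as the filtering above shows), so they can be dumped to disk for offline analysis. A minimal sketch; the output filenames are arbitrary:

```python
import asyncio
import json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async def save_capture_to_disk():
    config = CrawlerRunConfig(
        capture_network_requests=True,
        capture_console_messages=True
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://example.com", config=config)
        if result.success:
            # Both fields are lists of dicts, so they serialize directly to JSON
            with open("network_capture.json", "w", encoding="utf-8") as f:
                json.dump(result.network_requests or [], f, indent=2, default=str)
            with open("console_capture.json", "w", encoding="utf-8") as f:
                json.dump(result.console_messages or [], f, indent=2, default=str)

asyncio.run(save_capture_to_disk())
```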
### Session Management for Multi-Step Workflows

```python
# Maintain state across multiple requests for complex workflows
async def multi_step_session_workflow():
    session_id = "workflow_session"

    async with AsyncWebCrawler() as crawler:
        # Step 1: Initial page load
        config1 = CrawlerRunConfig(
            session_id=session_id,
            wait_for="css:.content-loaded"
        )
        result1 = await crawler.arun("https://example.com/step1", config=config1)
        print("Step 1 completed")

        # Step 2: Navigate and interact (same browser tab)
        config2 = CrawlerRunConfig(
            session_id=session_id,
            js_only=True,               # Don't reload page, just run JS
            js_code="""
                document.querySelector('#next-button').click();
            """,
            wait_for="css:.step2-content"
        )
        result2 = await crawler.arun("https://example.com/step2", config=config2)
        print("Step 2 completed")

        # Step 3: Form submission
        config3 = CrawlerRunConfig(
            session_id=session_id,
            js_only=True,
            js_code="""
                document.querySelector('#form-field').value = 'test data';
                document.querySelector('#submit-btn').click();
            """,
            wait_for="css:.results"
        )
        result3 = await crawler.arun("https://example.com/submit", config=config3)
        print("Step 3 completed")

        # Clean up session
        await crawler.crawler_strategy.kill_session(session_id)

# Advanced GitHub commits pagination example
async def github_commits_pagination():
    session_id = "github_session"
    all_commits = []

    async with AsyncWebCrawler() as crawler:
        for page in range(3):
            if page == 0:
                # Initial load
                config = CrawlerRunConfig(
                    session_id=session_id,
                    wait_for="js:() => document.querySelectorAll('li.Box-sc-g0xbh4-0').length > 0"
                )
            else:
                # Navigate to next page
                config = CrawlerRunConfig(
                    session_id=session_id,
                    js_only=True,
                    js_code='document.querySelector(\'a[data-testid="pagination-next-button"]\').click();',
                    wait_for="js:() => document.querySelectorAll('li.Box-sc-g0xbh4-0').length > 0"
                )

            result = await crawler.arun(
                "https://github.com/microsoft/TypeScript/commits/main",
                config=config
            )

            if result.success:
                # Rough count based on the commit list's class name
                commit_count = result.cleaned_html.count('Box-sc-g0xbh4-0')
                print(f"Page {page + 1}: Found {commit_count} commits")

        await crawler.crawler_strategy.kill_session(session_id)
```

### SSL Certificate Analysis

```python
# Fetch and analyze SSL certificates
async def analyze_ssl_certificates():
    config = CrawlerRunConfig(
        fetch_ssl_certificate=True
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://example.com", config=config)

        if result.success and result.ssl_certificate:
            cert = result.ssl_certificate

            # Basic certificate info
            print(f"Issuer: {cert.issuer.get('CN', 'Unknown')}")
            print(f"Subject: {cert.subject.get('CN', 'Unknown')}")
            print(f"Valid from: {cert.valid_from}")
            print(f"Valid until: {cert.valid_until}")
            print(f"Fingerprint: {cert.fingerprint}")

            # Export certificate in different formats
            import os
            os.makedirs("certificates", exist_ok=True)
            cert.to_json("certificates/cert.json")
            cert.to_pem("certificates/cert.pem")
            cert.to_der("certificates/cert.der")
            print("Certificate exported in multiple formats")
```

### Advanced Page Interaction

```python
# Complex page interactions with dynamic content
async def advanced_page_interaction():
    async with AsyncWebCrawler() as crawler:
        # Multi-step interaction with waiting
        config = CrawlerRunConfig(
            js_code=[
                # Step 1: Scroll to load content
                "window.scrollTo(0, document.body.scrollHeight);",

                # Step 2: Wait and click load more
                """
                (async () => {
                    await new Promise(resolve => setTimeout(resolve, 2000));
                    const loadMore = document.querySelector('.load-more');
                    if (loadMore) loadMore.click();
                })();
                """
            ],

            # Wait for new content to appear
            wait_for="js:() => document.querySelectorAll('.item').length > 20",

            # Additional timing controls
            page_timeout=60000,             # 60 second timeout
            delay_before_return_html=2.0,   # Wait before final capture

            # Handle overlays automatically
            remove_overlay_elements=True,
            magic=True,                     # Auto-handle common popup patterns

            # Simulate human behavior
            simulate_user=True,
            override_navigator=True
        )

        result = await crawler.arun("https://example.com/dynamic", config=config)

        if result.success:
            print(f"Interactive crawl completed: {len(result.cleaned_html)} chars")

# Form interaction example
async def form_interaction_example():
    config = CrawlerRunConfig(
        js_code="""
            // Fill search form
            document.querySelector('#search-input').value = 'machine learning';
            document.querySelector('#category-select').value = 'technology';
            document.querySelector('#search-form').submit();
        """,
        wait_for="css:.search-results",
        session_id="search_session"
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://example.com/search", config=config)
        print("Search completed, results loaded")
```
### Local File & Raw HTML Processing

```python
# Handle different input types: URLs, local files, raw HTML
async def handle_different_inputs():
    async with AsyncWebCrawler() as crawler:
        # 1. Regular web URL
        result1 = await crawler.arun("https://example.com")

        # 2. Local HTML file
        local_file_path = "/path/to/file.html"
        result2 = await crawler.arun(f"file://{local_file_path}")

        # 3. Raw HTML content
        raw_html = "<html><body><h1>Test Content</h1><p>Sample text</p></body></html>"
        result3 = await crawler.arun(f"raw:{raw_html}")

        # All return the same CrawlResult structure
        for i, result in enumerate([result1, result2, result3], 1):
            if result.success:
                print(f"Input {i}: {len(result.markdown)} chars of markdown")

# Save and re-process HTML example
async def save_and_reprocess():
    async with AsyncWebCrawler() as crawler:
        # Original crawl
        result = await crawler.arun("https://example.com")

        if result.success:
            # Save HTML to file
            with open("saved_page.html", "w", encoding="utf-8") as f:
                f.write(result.html)

            # Re-process from file
            file_result = await crawler.arun("file://./saved_page.html")

            # Process as raw HTML
            raw_result = await crawler.arun(f"raw:{result.html}")

            # Verify consistency
            assert len(result.markdown) == len(file_result.markdown) == len(raw_result.markdown)
            print("✅ All processing methods produced identical results")
```

### Advanced Link & Media Handling

```python
# Comprehensive link and media extraction with filtering
async def advanced_link_media_handling():
    config = CrawlerRunConfig(
        # Link filtering
        exclude_external_links=False,       # Keep external links for analysis
        exclude_social_media_links=True,
        exclude_domains=["ads.com", "tracker.io", "spammy.net"],

        # Media handling
        exclude_external_images=True,
        image_score_threshold=5,            # Only high-quality images
        table_score_threshold=7,            # Only well-structured tables
        wait_for_images=True,

        # Capture additional formats
        screenshot=True,
        pdf=True,
        capture_mhtml=True                  # Full page archive
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://example.com", config=config)

        if result.success:
            # Analyze links
            internal_links = result.links.get("internal", [])
            external_links = result.links.get("external", [])
            print(f"Links: {len(internal_links)} internal, {len(external_links)} external")

            # Analyze media
            images = result.media.get("images", [])
            tables = result.media.get("tables", [])
            print(f"Media: {len(images)} images, {len(tables)} tables")

            # High-quality images only
            quality_images = [img for img in images if img.get("score", 0) >= 5]
            print(f"High-quality images: {len(quality_images)}")

            # Table analysis
            for i, table in enumerate(tables[:2]):
                print(f"Table {i+1}: {len(table.get('headers', []))} columns, {len(table.get('rows', []))} rows")

            # Save captured files
            if result.screenshot:
                import base64
                with open("page_screenshot.png", "wb") as f:
                    f.write(base64.b64decode(result.screenshot))

            if result.pdf:
                with open("page.pdf", "wb") as f:
                    f.write(result.pdf)

            if result.mhtml:
                with open("page_archive.mhtml", "w", encoding="utf-8") as f:
                    f.write(result.mhtml)

            print("Additional formats saved: screenshot, PDF, MHTML archive")
```
### Performance & Resource Management

```python
# Optimize performance for large-scale crawling
async def performance_optimized_crawling():
    # Lightweight browser config
    browser_config = BrowserConfig(
        headless=True,
        text_mode=True,                     # Disable images for speed
        light_mode=True,                    # Reduce background features
        extra_args=["--disable-extensions", "--no-sandbox"]
    )

    # Efficient crawl config
    config = CrawlerRunConfig(
        # Content filtering for speed
        excluded_tags=["script", "style", "nav", "footer"],
        exclude_external_links=True,
        exclude_all_images=True,            # Remove all images for max speed
        word_count_threshold=50,

        # Timing optimizations
        page_timeout=30000,                 # Faster timeout
        delay_before_return_html=0.1,

        # Resource monitoring
        capture_network_requests=False,     # Disable unless needed
        capture_console_messages=False,

        # Cache for repeated URLs
        cache_mode=CacheMode.ENABLED
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        urls = [
            "https://example.com/page1",
            "https://example.com/page2",
            "https://example.com/page3"
        ]

        # Efficient batch processing
        batch_config = config.clone(
            stream=True,                    # Stream results as they complete
            semaphore_count=3               # Control concurrency
        )

        async for result in await crawler.arun_many(urls, config=batch_config):
            if result.success:
                print(f"✅ {result.url}: {len(result.markdown)} chars")
            else:
                print(f"❌ {result.url}: {result.error_message}")
```

**📖 Learn more:** [Complete Parameter Reference](https://docs.crawl4ai.com/api/parameters/), [Content Filtering](https://docs.crawl4ai.com/core/markdown-generation/), [Session Management](https://docs.crawl4ai.com/advanced/session-management/), [Network Capture](https://docs.crawl4ai.com/advanced/network-console-capture/)

**📖 Learn more:** [Hooks & Authentication](https://docs.crawl4ai.com/advanced/hooks-auth/), [Session Management](https://docs.crawl4ai.com/advanced/session-management/), [Network Monitoring](https://docs.crawl4ai.com/advanced/network-console-capture/), [Page Interaction](https://docs.crawl4ai.com/core/page-interaction/), [File Downloads](https://docs.crawl4ai.com/advanced/file-downloading/)