Clean content...
### AsyncWebCrawler Constructor Parameters

```python
AsyncWebCrawler(
    # Core Browser Settings
    browser_type: str = "chromium",  # Options: "chromium", "firefox", "webkit"
    headless: bool = True,  # Whether to run browser in headless mode
    verbose: bool = False,  # Enable verbose logging

    # Cache Settings
    always_by_pass_cache: bool = False,  # Always bypass cache regardless of run settings
    base_directory: str = str(Path.home()),  # Base directory for cache storage

    # Network Settings
    proxy: str = None,  # Simple proxy URL (e.g., "http://proxy.example.com:8080")
    proxy_config: Dict = None,  # Advanced proxy settings with auth: {"server": str, "username": str, "password": str}

    # Browser Behavior
    sleep_on_close: bool = False,  # Wait before closing browser

    # Other Settings passed to AsyncPlaywrightCrawlerStrategy
    user_agent: str = None,  # Custom user agent string
    headers: Dict[str, str] = {},  # Custom HTTP headers
    js_code: Union[str, List[str]] = None,  # Default JavaScript to execute
)
```

### arun() Method Parameters

```python
arun(
    # Core Parameters
    url: str,  # Required: URL to crawl

    # Content Selection
    css_selector: str = None,  # CSS selector to extract specific content
    word_count_threshold: int = MIN_WORD_THRESHOLD,  # Minimum words for content blocks

    # Cache Control
    bypass_cache: bool = False,  # Bypass cache for this request

    # Session Management
    session_id: str = None,  # Session identifier for persistent browsing

    # Screenshot Options
    screenshot: bool = False,  # Take page screenshot
    screenshot_wait_for: float = None,  # Wait time before screenshot

    # Content Processing
    process_iframes: bool = False,  # Process iframe content
    remove_overlay_elements: bool = False,  # Remove popups/modals

    # Anti-Bot/Detection
    simulate_user: bool = False,  # Simulate human-like behavior
    override_navigator: bool = False,  # Override navigator properties
    magic: bool = False,  # Enable all anti-detection features

    # Content Filtering
    excluded_tags: List[str] = None,  # HTML tags to exclude
    exclude_external_links: bool = False,  # Remove external links
    exclude_social_media_links: bool = False,  # Remove social media links
    exclude_external_images: bool = False,  # Remove external images
    exclude_social_media_domains: List[str] = None,  # Additional social media domains to exclude
    remove_forms: bool = False,  # Remove all form elements

    # JavaScript Handling
    js_code: Union[str, List[str]] = None,  # JavaScript to execute
    js_only: bool = False,  # Only execute JavaScript without reloading page
    wait_for: str = None,  # Wait condition (CSS selector or JS function)

    # Page Loading
    page_timeout: int = 60000,  # Page load timeout in milliseconds
    delay_before_return_html: float = None,  # Wait before returning HTML

    # Debug Options
    log_console: bool = False,  # Log browser console messages

    # Content Format Control
    only_text: bool = False,  # Extract only text content
    keep_data_attributes: bool = False,  # Keep data-* attributes in HTML

    # Markdown Options
    include_links_on_markdown: bool = False,  # Include links in markdown output
    html2text: Dict = {},  # HTML to text conversion options

    # Extraction Strategy
    extraction_strategy: ExtractionStrategy = None,  # Strategy for structured data extraction

    # Advanced Browser Control
    user_agent: str = None,  # Override user agent for this request
)
```

### Extraction Strategy Parameters

```python
# JsonCssExtractionStrategy
{
    "name": str,  # Name of extraction schema
    "baseSelector": str,  # Base CSS selector
    "fields": [
        {
            "name": str,  # Field name
            "selector": str,  # CSS selector
            "type": str,  # Data type ("text", etc.)
            "transform": str = None  # Optional transformation
        }
    ]
}

# LLMExtractionStrategy
{
    "provider": str,  # LLM provider (e.g., "openai/gpt-4", "huggingface/...", "ollama/...")
    "api_token": str,  # API token
    "schema": dict,  # Pydantic model schema
    "extraction_type": str,  # Type of extraction ("schema", etc.)
    "instruction": str,  # Extraction instruction
    "extra_args": dict = None,  # Additional provider-specific arguments
    "extra_headers": dict = None  # Additional HTTP headers
}
```

### HTML to Text Conversion Options (html2text parameter)

```python
{
    "escape_dot": bool = True,  # Escape dots in text
    # Other html2text library options
}
```

### CrawlResult Fields

```python
class CrawlResult(BaseModel):
    # Basic Information
    url: str  # The crawled URL
              # Example: "https://example.com"
    success: bool  # Whether the crawl was successful
                   # Example: True/False
    status_code: Optional[int]  # HTTP status code
                                # Example: 200, 404, 500

    # Content Fields
    html: str  # Raw HTML content
               # Example: "
..." cleaned_html: Optional[str] # HTML after cleaning and processing # Example: "Clean content...
Most relevant content...