feat(browser): add standalone CDP browser launch and lxml extraction strategy

Add new features to enhance browser automation and HTML extraction:

- Add CDP browser launch capability with customizable ports and profiles
- Implement JsonLxmlExtractionStrategy for faster HTML parsing
- Add CLI command 'crwl cdp' for launching standalone CDP browsers
- Support connecting to external CDP browsers via URL
- Optimize selector caching and context-sensitive queries

BREAKING CHANGE: LLMConfig import path changed from crawl4ai.types to crawl4ai
2025-03-07 20:55:56 +08:00
parent f78c46446b
commit a68cbb232b
22 changed files with 745 additions and 29 deletions
--- a/crawl4ai/__init__.py
+++ b/crawl4ai/__init__.py
@@ -23,6 +23,7 @@ from .extraction_strategy import (
    CosineStrategy,
    JsonCssExtractionStrategy,
    JsonXPathExtractionStrategy,
    JsonLxmlExtractionStrategy
 )
 from .chunking_strategy import ChunkingStrategy, RegexChunking
 from .markdown_generation_strategy import DefaultMarkdownGenerator
@@ -103,6 +104,7 @@ __all__ = [
    "CosineStrategy",
    "JsonCssExtractionStrategy",
    "JsonXPathExtractionStrategy",
    "JsonLxmlExtractionStrategy",
    "ChunkingStrategy",
    "RegexChunking",
    "DefaultMarkdownGenerator",
--- a/crawl4ai/browser_manager.py
+++ b/crawl4ai/browser_manager.py
@@ -434,8 +434,9 @@ class BrowserManager:
            self.playwright = await async_playwright().start()
-        if self.config.use_managed_browser:
+        if self.config.cdp_url or self.config.use_managed_browser:
-            cdp_url = await self.managed_browser.start()
+            self.config.use_managed_browser = True
            cdp_url = await self.managed_browser.start() if not self.config.cdp_url else self.config.cdp_url
            self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url)
            contexts = self.browser.contexts
            if contexts:
@@ -790,7 +791,10 @@ class BrowserManager:
        # If using a managed browser, just grab the shared default_context
        if self.config.use_managed_browser:
            context = self.default_context
-            page = await context.new_page()
+            pages = context.pages
            page = next((p for p in pages if p.url == crawlerRunConfig.url), None)
            if not page:
                page = await context.new_page()
        else:
            # Otherwise, check if we have an existing context for this config
            config_signature = self._make_config_signature(crawlerRunConfig)
@@ -840,6 +844,9 @@ class BrowserManager:
    async def close(self):
        """Close all browser resources and clean up."""
        if self.config.cdp_url:
            return
        if self.config.sleep_on_close:
            await asyncio.sleep(0.5)
--- a/crawl4ai/browser_profiler.py
+++ b/crawl4ai/browser_profiler.py
@@ -342,7 +342,11 @@ class BrowserProfiler:
        # Check if path exists and is a valid profile
        if not os.path.isdir(profile_path):
-            return None
+            # Check whether profile_name is itself an absolute path
            if os.path.isabs(profile_name):
                profile_path = profile_name
            else:
                return None
        # Look for profile indicators
        is_profile = (
@@ -541,4 +545,225 @@ class BrowserProfiler:
                break
            else:
-                self.logger.error(f"Invalid choice. Please enter a number between 1 and {exit_option}.", tag="MENU")
+                self.logger.error(f"Invalid choice. Please enter a number between 1 and {exit_option}.", tag="MENU")
    async def launch_standalone_browser(self,
                                  browser_type: str = "chromium",
                                  user_data_dir: Optional[str] = None,
                                  debugging_port: int = 9222,
                                  headless: bool = False) -> Optional[str]:
        """
        Launch a standalone browser with CDP debugging enabled and keep it running
        until the user presses 'q' or the browser process exits.
        The CDP URL is logged while the browser is running and returned once it
        has been shut down.
        Args:
            browser_type (str): Type of browser to launch ('chromium' or 'firefox')
            user_data_dir (str, optional): Path to user profile directory. It is
                created if missing; when omitted, a fresh "temp_*" profile is
                created under ``self.profiles_dir``.
            debugging_port (int): Port to use for CDP debugging
            headless (bool): Whether to run in headless mode
        Returns:
            str: CDP URL for the browser, or None if launch failed
        Example:
            ```python
            profiler = BrowserProfiler()
            cdp_url = await profiler.launch_standalone_browser(
                user_data_dir="/path/to/profile",
                debugging_port=9222
            )
            # Use cdp_url to connect to the browser
            ```
        """
        # Use the provided directory if specified, otherwise create a temporary profile
        if user_data_dir:
            profile_path = user_data_dir
        else:
            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            profile_name = f"temp_{timestamp}_{uuid.uuid4().hex[:6]}"
            profile_path = os.path.join(self.profiles_dir, profile_name)
        os.makedirs(profile_path, exist_ok=True)
        # Print initial information
        border = f"{Fore.CYAN}{'='*80}{Style.RESET_ALL}"
        self.logger.info(f"\n{border}", tag="CDP")
        self.logger.info("Launching standalone browser with CDP debugging", tag="CDP")
        self.logger.info(f"Browser type: {Fore.GREEN}{browser_type}{Style.RESET_ALL}", tag="CDP")
        self.logger.info(f"Profile path: {Fore.YELLOW}{profile_path}{Style.RESET_ALL}", tag="CDP")
        self.logger.info(f"Debugging port: {Fore.CYAN}{debugging_port}{Style.RESET_ALL}", tag="CDP")
        self.logger.info(f"Headless mode: {Fore.CYAN}{headless}{Style.RESET_ALL}", tag="CDP")
        # Create managed browser instance
        managed_browser = ManagedBrowser(
            browser_type=browser_type,
            user_data_dir=profile_path,
            headless=headless,
            logger=self.logger,
            debugging_port=debugging_port
        )
        # Remember the current handlers so they can be restored on exit
        original_sigint = signal.getsignal(signal.SIGINT)
        original_sigterm = signal.getsignal(signal.SIGTERM)
        # Async part of the signal handling: clean up the browser, restore handlers
        async def cleanup_handler(sig, frame):
            self.logger.warning("\nCleaning up browser process...", tag="CDP")
            await managed_browser.cleanup()
            signal.signal(signal.SIGINT, original_sigint)
            signal.signal(signal.SIGTERM, original_sigterm)
            if sig == signal.SIGINT:
                self.logger.error("Browser terminated by user.", tag="CDP")
                sys.exit(1)
        # The event loop keeps only weak references to tasks, so hold a strong
        # reference until the cleanup task finishes to prevent it from being
        # garbage-collected mid-run.
        pending_cleanups = set()
        def sigint_handler(sig, frame):
            task = asyncio.create_task(cleanup_handler(sig, frame))
            pending_cleanups.add(task)
            task.add_done_callback(pending_cleanups.discard)
        signal.signal(signal.SIGINT, sigint_handler)
        signal.signal(signal.SIGTERM, sigint_handler)
        # Event to signal when user wants to exit
        user_done_event = asyncio.Event()
        # Keyboard listener: sets user_done_event when 'q' is pressed or the browser exits
        async def listen_for_quit_command():
            try:
                import termios
                import tty
                import select
            except ImportError:
                # termios/tty are Unix-only; without them the caller simply
                # waits for the browser process to exit on its own.
                self.logger.warning("Keyboard listener is not available on this platform. Close the browser window to exit.", tag="CDP")
                return
            try:
                # Save original terminal settings (fails when stdin is not a terminal)
                fd = sys.stdin.fileno()
                old_settings = termios.tcgetattr(fd)
            except (termios.error, OSError, ValueError) as e:
                self.logger.warning(f"Keyboard listener disabled, stdin is not a terminal: {str(e)}", tag="CDP")
                return
            self.logger.info(f"{Fore.CYAN}Press '{Fore.WHITE}q{Fore.CYAN}' to stop the browser and exit...{Style.RESET_ALL}", tag="CDP")
            try:
                # Switch to non-canonical mode (no line buffering)
                tty.setcbreak(fd)
                while True:
                    # Zero timeout: poll stdin without blocking the event loop;
                    # the sleep at the bottom of the loop provides the pacing.
                    readable, _, _ = select.select([sys.stdin], [], [], 0)
                    if readable:
                        key = sys.stdin.read(1)
                        if key.lower() == 'q':
                            self.logger.info(f"{Fore.GREEN}Closing browser...{Style.RESET_ALL}", tag="CDP")
                            user_done_event.set()
                            return
                    # Check if the browser process has already exited
                    if managed_browser.browser_process and managed_browser.browser_process.poll() is not None:
                        self.logger.info("Browser already closed. Ending input listener.", tag="CDP")
                        user_done_event.set()
                        return
                    await asyncio.sleep(0.1)
            finally:
                # Restore terminal settings
                termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
        # Fetch /json/version from the debugging port; returns (cdp_url, data or None)
        async def get_cdp_json(port):
            import aiohttp
            cdp_url = f"http://localhost:{port}"
            json_url = f"{cdp_url}/json/version"
            try:
                async with aiohttp.ClientSession() as session:
                    # Try multiple times in case the browser is still starting up
                    for _ in range(10):
                        try:
                            async with session.get(json_url) as response:
                                if response.status == 200:
                                    data = await response.json()
                                    return cdp_url, data
                        except Exception:
                            # Deliberate best-effort: the endpoint may not be up yet, retry
                            pass
                        await asyncio.sleep(0.5)
                    return cdp_url, None
            except Exception as e:
                self.logger.error(f"Error fetching CDP JSON: {str(e)}", tag="CDP")
                return cdp_url, None
        cdp_url = None
        try:
            # Start the browser
            await managed_browser.start()
            # Check if browser started successfully
            browser_process = managed_browser.browser_process
            if not browser_process:
                self.logger.error("Failed to start browser process.", tag="CDP")
                return None
            self.logger.info("Browser launched successfully. Retrieving CDP information...", tag="CDP")
            # get_cdp_json always returns the URL; only the JSON payload may be missing
            cdp_url, config_json = await get_cdp_json(debugging_port)
            self.logger.success(f"CDP URL: {Fore.GREEN}{cdp_url}{Style.RESET_ALL}", tag="CDP")
            if config_json:
                # Display relevant CDP information
                self.logger.info(f"Browser: {Fore.CYAN}{config_json.get('Browser', 'Unknown')}{Style.RESET_ALL}", tag="CDP")
                self.logger.info(f"Protocol Version: {config_json.get('Protocol-Version', 'Unknown')}", tag="CDP")
                if 'webSocketDebuggerUrl' in config_json:
                    self.logger.info(f"WebSocket URL: {Fore.GREEN}{config_json['webSocketDebuggerUrl']}{Style.RESET_ALL}", tag="CDP")
            else:
                self.logger.warning("Could not retrieve CDP configuration JSON", tag="CDP")
            # Start listening for keyboard input
            listener_task = asyncio.create_task(listen_for_quit_command())
            # Wait for the user to press 'q' or for the browser process to exit naturally
            while not user_done_event.is_set() and browser_process.poll() is None:
                await asyncio.sleep(0.5)
            # Cancel the listener task if it's still running
            if not listener_task.done():
                listener_task.cancel()
                try:
                    await listener_task
                except asyncio.CancelledError:
                    pass
            # If the browser is still running and the user pressed 'q', terminate it
            if browser_process.poll() is None and user_done_event.is_set():
                self.logger.info("Terminating browser process...", tag="CDP")
                await managed_browser.cleanup()
            self.logger.success("Browser closed.", tag="CDP")
        except Exception as e:
            # Browser cleanup happens in the finally block below
            self.logger.error(f"Error launching standalone browser: {str(e)}", tag="CDP")
            return None
        finally:
            # Restore original signal handlers
            signal.signal(signal.SIGINT, original_sigint)
            signal.signal(signal.SIGTERM, original_sigterm)
            # Make sure browser is fully cleaned up on every exit path
            await managed_browser.cleanup()
        # Return the CDP URL
        return cdp_url
--- a/crawl4ai/cli.py
+++ b/crawl4ai/cli.py
@@ -1,5 +1,6 @@
 import click
 import os
 import sys
 import time
 import humanize
@@ -198,7 +199,24 @@ def show_examples():
    # 2. Then use that profile to crawl the authenticated site:
    crwl https://site-requiring-login.com/dashboard -p my-profile-name
-5️⃣  Sample Config Files:
+5️⃣  CDP Mode for Browser Automation:
    # Launch browser with CDP debugging on default port 9222
    crwl cdp
    # Use a specific browser data directory and custom port
    crwl cdp -d ~/my-browser-data -P 9223
    # Launch headless browser with CDP enabled
    crwl cdp --headless
    # Launch in incognito mode (ignores profile)
    crwl cdp --incognito
    # Use the CDP URL with other tools (Puppeteer, Playwright, etc.)
    # The URL will be displayed in the terminal when the browser starts
 6️⃣  Sample Config Files:
 browser.yml:
    headless: true
@@ -256,7 +274,7 @@ llm_schema.json:
      }
    }
-6️⃣  Advanced Usage:
+7️⃣  Advanced Usage:
    # Combine configs with direct parameters
    crwl https://example.com -B browser.yml -b "headless=false,viewport_width=1920"
@@ -282,7 +300,7 @@ llm_schema.json:
 For more documentation visit: https://github.com/unclecode/crawl4ai
-7️⃣  Q&A with LLM:
+8️⃣  Q&A with LLM:
    # Ask a question about the content
    crwl https://example.com -q "What is the main topic discussed?"
@@ -310,7 +328,7 @@ For more documentation visit: https://github.com/unclecode/crawl4ai
    See full list of providers: https://docs.litellm.ai/docs/providers
-8️⃣ Profile Management:
+9️⃣ Profile Management:
    # Launch interactive profile manager
    crwl profiles
@@ -549,11 +567,89 @@ async def manage_profiles():
        # Add a separator between operations
        console.print("\n")
# Root click command group for the CLI; "-h" is registered as an alias for "--help".
# The docstring below is part of the runtime output (click prints it as help text).
@click.group(context_settings={"help_option_names": ["-h", "--help"]})
 def cli():
    """Crawl4AI CLI - Web content extraction and browser profile management tool"""
    pass
@cli.command("cdp")
@click.option("--user-data-dir", "-d", help="Directory to use for browser data (will be created if it doesn't exist)")
@click.option("--port", "-P", type=int, default=9222, help="Debugging port (default: 9222)")
@click.option("--browser-type", "-b", type=click.Choice(["chromium", "firefox"]), default="chromium", 
              help="Browser type (default: chromium)")
@click.option("--headless", is_flag=True, help="Run browser in headless mode")
@click.option("--incognito", is_flag=True, help="Run in incognito/private mode (ignores user-data-dir)")
def cdp_cmd(user_data_dir: Optional[str], port: int, browser_type: str, headless: bool, incognito: bool):
    """Launch a standalone browser with CDP debugging enabled
    This command launches a browser with Chrome DevTools Protocol (CDP) debugging enabled,
    prints the CDP URL, and keeps the browser running until you press 'q'.
    The CDP URL can be used for various automation and debugging tasks.
    Examples:
        # Launch Chromium with CDP on default port 9222
        crwl cdp
        # Use a specific directory for browser data and custom port
        crwl cdp --user-data-dir ~/browser-data --port 9223
        # Launch in headless mode
        crwl cdp --headless
        # Launch in incognito mode (ignores user-data-dir)
        crwl cdp --incognito
    """
    profiler = BrowserProfiler()
    try:
        # Resolve the data directory; incognito mode deliberately ignores it,
        # in which case the profiler falls back to a temporary profile.
        data_dir = None
        wants_custom_dir = bool(user_data_dir) and not incognito
        if wants_custom_dir:
            # "~" is expanded so that shell-style home paths work
            data_dir_candidate = os.path.expanduser(user_data_dir)
            if not os.path.exists(data_dir_candidate):
                console.print(f"[yellow]Directory '{data_dir_candidate}' doesn't exist. Creating it.[/yellow]")
                os.makedirs(data_dir_candidate, exist_ok=True)
            data_dir = data_dir_candidate
        # Summarise the launch parameters for the user
        headless_label = 'Yes' if headless else 'No'
        incognito_label = 'Yes' if incognito else 'No'
        launch_summary = (
            f"[cyan]Launching browser with CDP debugging[/cyan]\n\n"
            f"Browser type: [green]{browser_type}[/green]\n"
            f"Debugging port: [yellow]{port}[/yellow]\n"
            f"User data directory: [cyan]{data_dir or 'Temporary directory'}[/cyan]\n"
            f"Headless: [cyan]{headless_label}[/cyan]\n"
            f"Incognito: [cyan]{incognito_label}[/cyan]\n\n"
            f"[yellow]Press 'q' to quit when done[/yellow]"
        )
        console.print(Panel(launch_summary, title="CDP Browser", border_style="cyan"))
        # Blocks until the browser is closed; the result is the CDP URL or None
        launch_args = (browser_type, data_dir, port, headless)
        cdp_url = anyio.run(profiler.launch_standalone_browser, *launch_args)
        if not cdp_url:
            console.print("[red]Failed to launch browser or get CDP URL[/red]")
            sys.exit(1)
    except Exception as e:
        console.print(f"[red]Error launching CDP browser: {str(e)}[/red]")
        sys.exit(1)
@cli.command("crawl")
@click.argument("url", required=True)
@click.option("--browser-config", "-B", type=click.Path(exists=True), help="Browser config file (YAML/JSON)")
@@ -737,6 +833,7 @@ def default(url: str, example: bool, browser_config: str, crawler_config: str, f
    Other commands:
        crwl profiles   - Manage browser profiles for identity-based crawling
        crwl crawl      - Crawl a website with advanced options
        crwl cdp        - Launch browser with CDP debugging enabled
        crwl examples   - Show more usage examples
    """
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -1168,7 +1168,8 @@ class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
        super().__init__(schema, **kwargs)
    def _parse_html(self, html_content: str):
-        return BeautifulSoup(html_content, "html.parser")
+        # Parse with the lxml backend instead of the built-in "html.parser"
        return BeautifulSoup(html_content, "lxml")
    def _get_base_elements(self, parsed_html, selector: str):
        """Return the elements of the parsed document that match the CSS selector."""
        matches = parsed_html.select(selector)
        return matches
@@ -1187,6 +1188,373 @@ class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
    def _get_element_attribute(self, element, attribute: str):
        """Return the value of the named attribute via ``element.get``."""
        value = element.get(attribute)
        return value
 class JsonLxmlExtractionStrategy(JsonElementExtractionStrategy):
    """
    Schema-driven extraction strategy that evaluates CSS selectors with lxml.

    Selectors are compiled once per strategy instance (``_selector_cache``).
    Selector results are cached per (element, selector, context mode) and the
    result cache is reset every time a new document is parsed, so results
    from one document can never be returned for another.

    Keyword Args:
        use_caching (bool): Cache selector results per element. Default True.
        optimize_common_patterns (bool): Shorten very long descendant
            selectors (see ``_optimize_selector``). This is a lossy
            heuristic: ancestor constraints are dropped. Default True.
    """
    def __init__(self, schema: Dict[str, Any], **kwargs):
        kwargs["input_format"] = "html"
        super().__init__(schema, **kwargs)
        self._selector_cache = {}
        self._xpath_cache = {}
        self._result_cache = {}
        # Control selector optimization strategy
        self.use_caching = kwargs.get("use_caching", True)
        self.optimize_common_patterns = kwargs.get("optimize_common_patterns", True)
        # Load lxml dependencies once
        from lxml import etree, html
        from lxml.cssselect import CSSSelector
        self.etree = etree
        self.html_parser = html
        self.CSSSelector = CSSSelector
    def _parse_html(self, html_content: str):
        """Parse HTML content with error recovery.

        Returns the root element; on total failure an empty ``<html>`` element.
        """
        # Cached results hold elements of the previously parsed document;
        # drop them so they cannot leak into this one (and to free memory).
        self._result_cache.clear()
        try:
            parser = self.etree.HTMLParser(recover=True, remove_blank_text=True)
            return self.etree.fromstring(html_content, parser)
        except Exception as e:
            if self.verbose:
                print(f"Error parsing HTML, falling back to alternative method: {e}")
            try:
                return self.html_parser.fromstring(html_content)
            except Exception as e2:
                if self.verbose:
                    print(f"Critical error parsing HTML: {e2}")
                # Create minimal document as fallback
                return self.etree.Element("html")
    def _optimize_selector(self, selector_str):
        """Shorten long descendant selectors for better performance.

        For selectors with more than three whitespace-separated parts, keep
        only the portion starting at the last part that begins with '.' or
        '#'. Everything after that part is preserved so the selector still
        targets the same element; only ancestor constraints are dropped.
        """
        if not self.optimize_common_patterns:
            return selector_str
        import re
        # td:nth-child(N) is very common in table scraping and has its own
        # fallback (_handle_nth_child_selector); leave it untouched.
        if re.search(r'td:nth-child\(\d+\)', selector_str):
            return selector_str
        parts = selector_str.split()
        if len(parts) <= 3:
            return selector_str
        # Walk backwards to find the last standalone class/id part
        for idx in range(len(parts) - 1, -1, -1):
            if parts[idx].startswith(('.', '#')):
                return " ".join(parts[idx:])
        return selector_str
    def _create_selector_function(self, selector_str):
        """Build ``selector_func(element, context_sensitive=True)`` for a CSS selector.

        The returned function yields a list of matching elements and never
        raises; selectors that fail to compile produce a function that
        always returns an empty list.
        """
        import re  # used by the tag-name fallback inside selector_func
        original_selector = selector_str
        # Try to optimize the selector if appropriate
        if self.optimize_common_patterns:
            selector_str = self._optimize_selector(selector_str)
        try:
            # Attempt to compile the CSS selector
            compiled = self.CSSSelector(selector_str)
        except Exception as e:
            if self.verbose:
                print(f"Error compiling selector '{selector_str}': {e}")
            # Fallback function for invalid selectors
            return lambda element, context_sensitive=True: []
        xpath = compiled.path
        # Store XPath for later use
        self._xpath_cache[selector_str] = xpath
        def selector_func(element, context_sensitive=True):
            cache_key = None
            if self.use_caching:
                # Key on the element object itself: the dict keeps it alive,
                # so its identity cannot be reused by another node. The
                # context mode is part of the key because it changes which
                # fallback strategies may run.
                cache_key = (element, selector_str, context_sensitive)
                if cache_key in self._result_cache:
                    return self._result_cache[cache_key]
            results = []
            try:
                # Strategy 1: Direct CSS selector application (fastest)
                results = compiled(element)
                # If that fails and we need context sensitivity
                if not results and context_sensitive:
                    # Strategy 2: Try XPath with context adjustment
                    context_xpath = self._make_context_sensitive_xpath(xpath, element)
                    if context_xpath:
                        results = element.xpath(context_xpath)
                    # Strategy 3: Handle special case - nth-child
                    if not results and 'nth-child' in original_selector:
                        results = self._handle_nth_child_selector(element, original_selector)
                    # Strategy 4: Direct descendant search for class/ID selectors
                    if not results:
                        results = self._fallback_class_id_search(element, original_selector)
                    # Strategy 5: Last resort - tag name search for the final part
                    if not results:
                        parts = original_selector.split()
                        if parts:
                            # Extract tag name from the last selector part
                            tag_match = re.match(r'^(\w+)', parts[-1])
                            if tag_match:
                                results = element.xpath(f".//{tag_match.group(1)}")
                # Cache only successful evaluations
                if cache_key is not None:
                    self._result_cache[cache_key] = results
            except Exception as e:
                if self.verbose:
                    print(f"Error applying selector '{selector_str}': {e}")
            return results
        return selector_func
    def _make_context_sensitive_xpath(self, xpath, element):
        """Convert an XPath to one evaluated relative to ``element``.

        Returns the adjusted XPath string, or None if it cannot be built.
        """
        try:
            # If starts with descendant-or-self, it's already context-sensitive
            if xpath.startswith('descendant-or-self::'):
                return xpath
            # Anchor absolute paths at the element, search descendants otherwise
            if xpath.startswith('/'):
                context_xpath = f".{xpath}"
            else:
                context_xpath = f".//{xpath}"
            # Validate the XPath by trying it
            try:
                element.xpath(context_xpath)
                return context_xpath
            except Exception:
                # If that fails, try a simpler descendant search
                return f".//{xpath.split('/')[-1]}"
        except Exception:
            return None
    def _handle_nth_child_selector(self, element, selector_str):
        """Special handling for ``td:nth-child(N)`` selectors in tables.

        Returns the N-th ``td`` descendants, or the elements with the first
        tag name that follows the nth-child part inside those cells.
        """
        import re
        results = []
        try:
            # Extract the column number from td:nth-child(N)
            match = re.search(r'td:nth-child\((\d+)\)', selector_str)
            if match:
                col_num = match.group(1)
                # Check if there's content after the nth-child part
                remaining_selector = selector_str.split(f"td:nth-child({col_num})", 1)[-1].strip()
                if remaining_selector:
                    # Use the first tag name found after the column part
                    tag_match = re.search(r'(\w+)', remaining_selector)
                    tag_name = tag_match.group(1) if tag_match else '*'
                    results = element.xpath(f".//td[{col_num}]//{tag_name}")
                else:
                    # Just get the column cell
                    results = element.xpath(f".//td[{col_num}]")
        except Exception as e:
            if self.verbose:
                print(f"Error handling nth-child selector: {e}")
        return results
    def _fallback_class_id_search(self, element, selector_str):
        """Fallback: collect descendants matching any class or id named in the selector."""
        results = []
        try:
            import re
            # Extract class selectors (.classname)
            class_matches = re.findall(r'\.([a-zA-Z0-9_-]+)', selector_str)
            # Extract ID selectors (#idname)
            id_matches = re.findall(r'#([a-zA-Z0-9_-]+)', selector_str)
            for class_name in class_matches:
                # Match whole class tokens: a plain contains(@class, 'btn')
                # would also match e.g. class="btn-primary".
                class_results = element.xpath(
                    f".//*[contains(concat(' ', normalize-space(@class), ' '), ' {class_name} ')]"
                )
                results.extend(class_results)
            # Try each ID (usually more specific)
            for id_name in id_matches:
                id_results = element.xpath(f".//*[@id='{id_name}']")
                results.extend(id_results)
        except Exception as e:
            if self.verbose:
                print(f"Error in fallback class/id search: {e}")
        return results
    def _get_selector(self, selector_str):
        """Get or create a selector function with caching"""
        if selector_str not in self._selector_cache:
            self._selector_cache[selector_str] = self._create_selector_function(selector_str)
        return self._selector_cache[selector_str]
    def _get_base_elements(self, parsed_html, selector: str):
        """Get all base elements using the selector"""
        selector_func = self._get_selector(selector)
        # For base elements, we don't need context sensitivity
        return selector_func(parsed_html, context_sensitive=False)
    def _get_elements(self, element, selector: str):
        """Get child elements using the selector with context sensitivity"""
        selector_func = self._get_selector(selector)
        return selector_func(element, context_sensitive=True)
    def _get_element_text(self, element) -> str:
        """Extract text from the element: stripped text nodes joined by single spaces."""
        try:
            # Get all text nodes and normalize
            text = " ".join(t.strip() for t in element.xpath(".//text()") if t.strip())
            return text
        except Exception as e:
            if self.verbose:
                print(f"Error extracting text: {e}")
            # Fallback
            try:
                return element.text_content().strip()
            except Exception:
                return ""
    def _get_element_html(self, element) -> str:
        """Get HTML string representation of element ("" on failure)"""
        try:
            return self.etree.tostring(element, encoding='unicode', method='html')
        except Exception as e:
            if self.verbose:
                print(f"Error serializing HTML: {e}")
            return ""
    def _get_element_attribute(self, element, attribute: str):
        """Get attribute value safely (None on failure or when absent)"""
        try:
            return element.get(attribute)
        except Exception as e:
            if self.verbose:
                print(f"Error getting attribute '{attribute}': {e}")
            return None
    def _clear_caches(self):
        """Clear the result cache to free memory"""
        if self.use_caching:
            self._result_cache.clear()
 class JsonLxmlExtractionStrategy_naive(JsonElementExtractionStrategy):
    """
    Minimal lxml-based variant of the JSON extraction strategy.

    CSS selectors are compiled with ``lxml.cssselect`` and memoised per
    selector string; when the compiled selector matches nothing, a few
    XPath-based fallbacks are tried in order.
    """
    def __init__(self, schema: Dict[str, Any], **kwargs):
        # This strategy only understands HTML input
        kwargs["input_format"] = "html"
        super().__init__(schema, **kwargs)
        self._selector_cache = {}
    def _parse_html(self, html_content: str):
        """Parse the markup with a recovering lxml HTML parser and return the root."""
        from lxml import etree
        return etree.fromstring(html_content, etree.HTMLParser(recover=True))
    def _get_selector(self, selector_str):
        """Return (building it on first use) the callable that applies ``selector_str`` to an element."""
        cached = self._selector_cache.get(selector_str)
        if cached is not None:
            return cached
        from lxml.cssselect import CSSSelector
        try:
            compiled = CSSSelector(selector_str)
        except Exception as e:
            if self.verbose:
                print(f"Error compiling selector '{selector_str}': {e}")
            # Invalid selectors resolve to a function that matches nothing
            selector_callable = lambda element: []
        else:
            def selector_callable(element):
                try:
                    # 1) Apply the compiled CSS selector directly
                    found = compiled(element)
                    if found:
                        return found
                    # 2) Re-run its XPath translation relative to this element
                    path = compiled.path
                    relative_path = (
                        path
                        if path.startswith('descendant-or-self::')
                        else f"./{path.lstrip('/')}"
                    )
                    found = element.xpath(relative_path)
                    if found:
                        return found
                    # 3) td:nth-child(N) pattern, common in table layouts
                    if 'nth-child' in selector_str:
                        import re
                        nth = re.search(r'td:nth-child\((\d+)\)', selector_str)
                        if nth:
                            column = nth.group(1)
                            trailing = selector_str.split(')', 1)[-1].strip()
                            cell_path = f".//td[{column}]"
                            if trailing:
                                return element.xpath(f"{cell_path}//{trailing}")
                            return element.xpath(cell_path)
                    # 4) Last resort: descendant search on the final selector part
                    pieces = selector_str.split()
                    if len(pieces) > 1:
                        return element.xpath(f".//{pieces[-1]}")
                    return []
                except Exception as e:
                    if self.verbose:
                        print(f"Error applying selector '{selector_str}': {e}")
                    return []
        self._selector_cache[selector_str] = selector_callable
        return selector_callable
    def _get_base_elements(self, parsed_html, selector: str):
        """Apply the selector to the parsed document root."""
        return self._get_selector(selector)(parsed_html)
    def _get_elements(self, element, selector: str):
        """Apply the selector within the given element."""
        return self._get_selector(selector)(element)
    def _get_element_text(self, element) -> str:
        """Concatenate all descendant text nodes and strip the ends."""
        text_nodes = element.xpath(".//text()")
        return "".join(text_nodes).strip()
    def _get_element_html(self, element) -> str:
        """Serialise the element to a unicode string."""
        from lxml import etree
        serialized = etree.tostring(element, encoding='unicode')
        return serialized
    def _get_element_attribute(self, element, attribute: str):
        """Return the attribute value via ``element.get``."""
        return element.get(attribute)
 class JsonXPathExtractionStrategy(JsonElementExtractionStrategy):
    """
--- a/docs/examples/extraction_strategies_examples.py
+++ b/docs/examples/extraction_strategies_examples.py
@@ -11,7 +11,7 @@ import asyncio
 import os
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
-from crawl4ai.types import LLMConfig
+from crawl4ai import LLMConfig
 from crawl4ai.extraction_strategy import (
    LLMExtractionStrategy,
    JsonCssExtractionStrategy,
--- a/docs/examples/llm_extraction_openai_pricing.py
+++ b/docs/examples/llm_extraction_openai_pricing.py
@@ -1,4 +1,4 @@
-from crawl4ai.types import LLMConfig
+from crawl4ai import LLMConfig
 from crawl4ai import AsyncWebCrawler, LLMExtractionStrategy
 import asyncio
 import os
--- a/docs/examples/llm_markdown_generator.py
+++ b/docs/examples/llm_markdown_generator.py
@@ -1,7 +1,7 @@
 import os
 import asyncio
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
-from crawl4ai.types import LLMConfig
+from crawl4ai import LLMConfig
 from crawl4ai.content_filter_strategy import LLMContentFilter
 async def test_llm_filter():
--- a/docs/examples/quickstart_async.config.py
+++ b/docs/examples/quickstart_async.config.py
@@ -1,6 +1,6 @@
 import os, sys
-from crawl4ai.types import LLMConfig
+from crawl4ai import LLMConfig
 sys.path.append(
    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
--- a/docs/examples/quickstart_async.py
+++ b/docs/examples/quickstart_async.py
@@ -1,6 +1,6 @@
 import os, sys
-from crawl4ai.types import LLMConfig
+from crawl4ai import LLMConfig
 # append parent directory to system path
 sys.path.append(
--- a/docs/examples/quickstart_sync.py
+++ b/docs/examples/quickstart_sync.py
@@ -1,6 +1,6 @@
 import os
 import time
-from crawl4ai.types import LLMConfig
+from crawl4ai import LLMConfig
 from crawl4ai.web_crawler import WebCrawler
 from crawl4ai.chunking_strategy import *
 from crawl4ai.extraction_strategy import *
--- a/docs/examples/tutorial_v0.5.py
+++ b/docs/examples/tutorial_v0.5.py
@@ -17,7 +17,7 @@ from crawl4ai.configs import ProxyConfig
 from crawl4ai import RoundRobinProxyStrategy
 from crawl4ai.content_filter_strategy import LLMContentFilter
 from crawl4ai import DefaultMarkdownGenerator
-from crawl4ai.types import LLMConfig
+from crawl4ai import LLMConfig
 from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
 from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
 from pprint import pprint
--- a/docs/md_v2/api/strategies.md
+++ b/docs/md_v2/api/strategies.md
@@ -131,7 +131,7 @@ OverlappingWindowChunking(
 ```python
 from pydantic import BaseModel
 from crawl4ai.extraction_strategy import LLMExtractionStrategy
-from crawl4ai.types import LLMConfig
+from crawl4ai import LLMConfig
 # Define schema
 class Article(BaseModel):
@@ -198,7 +198,7 @@ result = await crawler.arun(
 ```python
 from crawl4ai.chunking_strategy import OverlappingWindowChunking
-from crawl4ai.types import LLMConfig
+from crawl4ai import LLMConfig
 # Create chunking strategy
 chunker = OverlappingWindowChunking(
--- a/docs/md_v2/blog/releases/0.5.0.md
+++ b/docs/md_v2/blog/releases/0.5.0.md
@@ -305,7 +305,7 @@ asyncio.run(main())
 ```python
 from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator
 from crawl4ai.content_filter_strategy import LLMContentFilter
-from crawl4ai.types import LLMConfig
+from crawl4ai import LLMConfig
 import asyncio
 llm_config = LLMConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY")
@@ -335,7 +335,7 @@ asyncio.run(main())
 ```python
 from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
-from crawl4ai.types import LLMConfig
+from crawl4ai import LLMConfig
 llm_config = LLMConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY")
@@ -401,7 +401,7 @@ print(schema)
  experimentation between different LLM configurations.
  ```python
-  from crawl4ai.types import LLMConfig
+  from crawl4ai import LLMConfig
  from crawl4ai.extraction_strategy import LLMExtractionStrategy
  from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
--- a/docs/md_v2/core/quickstart.md
+++ b/docs/md_v2/core/quickstart.md
@@ -128,7 +128,7 @@ Crawl4AI can also extract structured data (JSON) using CSS or XPath selectors. B
 ```python
 from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
-from crawl4ai.types import LLMConfig
+from crawl4ai import LLMConfig
 # Generate a schema (one-time cost)
 html = "<div class='product'><h2>Gaming Laptop</h2><span class='price'>$999.99</span></div>"
--- a/docs/md_v2/extraction/no-llm-strategies.md
+++ b/docs/md_v2/extraction/no-llm-strategies.md
@@ -415,7 +415,7 @@ The schema generator is available as a static method on both `JsonCssExtractionS
 ```python
 from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
-from crawl4ai.types import LLMConfig
+from crawl4ai import LLMConfig
 # Sample HTML with product information
 html = """
--- a/tests/20241401/test_llm_filter.py
+++ b/tests/20241401/test_llm_filter.py
@@ -1,7 +1,7 @@
 import os
 import asyncio
 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
-from crawl4ai.types import LLMConfig
+from crawl4ai import LLMConfig
 from crawl4ai.content_filter_strategy import LLMContentFilter
 async def test_llm_filter():
--- a/tests/async/test_chunking_and_extraction_strategies.py
+++ b/tests/async/test_chunking_and_extraction_strategies.py
@@ -7,7 +7,7 @@ import json
 parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 sys.path.append(parent_dir)
-from crawl4ai.types import LLMConfig
+from crawl4ai import LLMConfig
 from crawl4ai.async_webcrawler import AsyncWebCrawler
 from crawl4ai.chunking_strategy import RegexChunking
 from crawl4ai.extraction_strategy import LLMExtractionStrategy
--- a/tests/browser/test_launch_standalone.py
+++ b/tests/browser/test_launch_standalone.py
@@ -0,0 +1,17 @@
 from crawl4ai.browser_profiler import BrowserProfiler
 import asyncio
 if __name__ == "__main__":
    # Manual check: launch a standalone browser and print the CDP URL it reports.
    async def test_standalone_browser():
        """Launch a standalone Chromium through BrowserProfiler and print its CDP URL."""
        launch_options = {
            "browser_type": "chromium",
            "user_data_dir": "~/.crawl4ai/browser_profile/test-browser-data",
            "debugging_port": 9222,
            "headless": False,
        }
        endpoint = await BrowserProfiler().launch_standalone_browser(**launch_options)
        print(f"CDP URL: {endpoint}")
    asyncio.run(test_standalone_browser())
--- a/tests/docker/test_docker.py
+++ b/tests/docker/test_docker.py
@@ -7,7 +7,7 @@ from crawl4ai import (
    BrowserConfig, CrawlerRunConfig, DefaultMarkdownGenerator,
    PruningContentFilter, JsonCssExtractionStrategy, LLMContentFilter, CacheMode
 )
-from crawl4ai.types import LLMConfig
+from crawl4ai import LLMConfig
 from crawl4ai.docker_client import Crawl4aiDockerClient
 class Crawl4AiTester:
--- a/tests/docker/test_serialization.py
+++ b/tests/docker/test_serialization.py
@@ -2,7 +2,7 @@ import inspect
 from typing import Any, Dict
 from enum import Enum
-from crawl4ai.types import LLMConfig
+from crawl4ai import LLMConfig
 def to_serializable_dict(obj: Any) -> Dict:
    """
--- a/tests/test_web_crawler.py
+++ b/tests/test_web_crawler.py
@@ -1,5 +1,5 @@
 import unittest, os
-from crawl4ai.types import LLMConfig
+from crawl4ai import LLMConfig
 from crawl4ai.web_crawler import WebCrawler
 from crawl4ai.chunking_strategy import (
    RegexChunking,