From b2f3cb0dfa3247afad098695dc5896642a7901e4 Mon Sep 17 00:00:00 2001 From: wakaka6 <48764488+wakaka6@users.noreply.github.com> Date: Thu, 10 Apr 2025 23:02:19 +0800 Subject: [PATCH 1/8] WIP: logger migrate to rich --- crawl4ai/async_database.py | 10 ++- crawl4ai/async_logger.py | 95 +++++++++++--------------- crawl4ai/async_webcrawler.py | 9 ++- crawl4ai/browser_profiler.py | 102 +++++++++++++++------------- crawl4ai/content_filter_strategy.py | 12 ++-- crawl4ai/utils.py | 19 +++--- 6 files changed, 121 insertions(+), 126 deletions(-) diff --git a/crawl4ai/async_database.py b/crawl4ai/async_database.py index 870350e9..a41ca97f 100644 --- a/crawl4ai/async_database.py +++ b/crawl4ai/async_database.py @@ -171,7 +171,10 @@ class AsyncDatabaseManager: f"Code context:\n{error_context['code_context']}" ) self.logger.error( - message=create_box_message(error_message, type="error"), + message="{error}", + tag="ERROR", + params={"error": str(error_message)}, + boxes=["error"], ) raise @@ -189,7 +192,10 @@ class AsyncDatabaseManager: f"Code context:\n{error_context['code_context']}" ) self.logger.error( - message=create_box_message(error_message, type="error"), + message="{error}", + tag="ERROR", + params={"error": str(error_message)}, + boxes=["error"], ) raise finally: diff --git a/crawl4ai/async_logger.py b/crawl4ai/async_logger.py index 273ef53b..7866e36f 100644 --- a/crawl4ai/async_logger.py +++ b/crawl4ai/async_logger.py @@ -1,9 +1,11 @@ from abc import ABC, abstractmethod from enum import Enum -from typing import Optional, Dict, Any -from colorama import Fore, Style, init +from typing import Optional, Dict, Any, List import os from datetime import datetime +from rich.console import Console +from rich.text import Text +from .utils import create_box_message class LogLevel(Enum): @@ -13,6 +15,8 @@ class LogLevel(Enum): WARNING = 4 ERROR = 5 + def __str__(self): + return self.name.lower() class AsyncLoggerBase(ABC): @@ -64,11 +68,11 @@ class 
AsyncLogger(AsyncLoggerBase): } DEFAULT_COLORS = { - LogLevel.DEBUG: Fore.LIGHTBLACK_EX, - LogLevel.INFO: Fore.CYAN, - LogLevel.SUCCESS: Fore.GREEN, - LogLevel.WARNING: Fore.YELLOW, - LogLevel.ERROR: Fore.RED, + LogLevel.DEBUG: "lightblack", + LogLevel.INFO: "cyan", + LogLevel.SUCCESS: "green", + LogLevel.WARNING: "yellow", + LogLevel.ERROR: "red", } def __init__( @@ -91,13 +95,13 @@ class AsyncLogger(AsyncLoggerBase): colors: Custom colors for different log levels verbose: Whether to output to console """ - init() # Initialize colorama self.log_file = log_file self.log_level = log_level self.tag_width = tag_width self.icons = icons or self.DEFAULT_ICONS self.colors = colors or self.DEFAULT_COLORS self.verbose = verbose + self.console = Console() # Create log file directory if needed if log_file: @@ -114,16 +118,11 @@ class AsyncLogger(AsyncLoggerBase): def _write_to_file(self, message: str): """Write a message to the log file if configured.""" if self.log_file: + text = Text.from_markup(message) + plain_text = text.plain timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] with open(self.log_file, "a", encoding="utf-8") as f: - # Strip ANSI color codes for file output - clean_message = message.replace(Fore.RESET, "").replace( - Style.RESET_ALL, "" - ) - for color in vars(Fore).values(): - if isinstance(color, str): - clean_message = clean_message.replace(color, "") - f.write(f"[{timestamp}] {clean_message}\n") + f.write(f"[{timestamp}] {plain_text}\n") def _log( self, @@ -132,6 +131,7 @@ class AsyncLogger(AsyncLoggerBase): tag: str, params: Optional[Dict[str, Any]] = None, colors: Optional[Dict[str, str]] = None, + boxes: Optional[List[str]] = None, base_color: Optional[str] = None, **kwargs, ): @@ -144,55 +144,41 @@ class AsyncLogger(AsyncLoggerBase): tag: Tag for the message params: Parameters to format into the message colors: Color overrides for specific parameters + boxes: Box overrides for specific parameters base_color: Base color for the entire 
message """ if level.value < self.log_level.value: return - # Format the message with parameters if provided + # avoid conflict with rich formatting + parsed_message = message.replace("[", "[[").replace("]", "]]") + raw_message = message.format(**params) if params else message if params: - try: - # First format the message with raw parameters - formatted_message = message.format(**params) - - # Then apply colors if specified - color_map = { - "green": Fore.GREEN, - "red": Fore.RED, - "yellow": Fore.YELLOW, - "blue": Fore.BLUE, - "cyan": Fore.CYAN, - "magenta": Fore.MAGENTA, - "white": Fore.WHITE, - "black": Fore.BLACK, - "reset": Style.RESET_ALL, - } - if colors: - for key, color in colors.items(): - # Find the formatted value in the message and wrap it with color - if color in color_map: - color = color_map[color] - if key in params: - value_str = str(params[key]) - formatted_message = formatted_message.replace( - value_str, f"{color}{value_str}{Style.RESET_ALL}" - ) - - except KeyError as e: - formatted_message = ( - f"LOGGING ERROR: Missing parameter {e} in message template" - ) - level = LogLevel.ERROR + formatted_message = parsed_message.format(**params) + for key, value in params.items(): + # value_str may discard `[` and `]`, so we need to replace it. 
+ value_str = str(value).replace("[", "[[").replace("]", "]]") + # check if we need to apply a color + if colors and key in colors: + color_str = f"[{colors[key]}]{value_str}[/{colors[key]}]" + formatted_message = formatted_message.replace(value_str, color_str) + value_str = color_str + + # check if we need to apply a box + if boxes and key in boxes: + formatted_message = formatted_message.replace(value_str, + create_box_message(value_str, type=str(level))) + else: - formatted_message = message + formatted_message = parsed_message # Construct the full log line color = base_color or self.colors[level] - log_line = f"{color}{self._format_tag(tag)} {self._get_icon(tag)} {formatted_message}{Style.RESET_ALL}" + log_line = f"[{color}]{self._format_tag(tag)} {self._get_icon(tag)} {formatted_message} [/{color}]" # Output to console if verbose if self.verbose or kwargs.get("force_verbose", False): - print(log_line) + self.console.print(log_line) # Write to file if configured self._write_to_file(log_line) @@ -246,8 +232,8 @@ class AsyncLogger(AsyncLoggerBase): "timing": timing, }, colors={ - "status": Fore.GREEN if success else Fore.RED, - "timing": Fore.YELLOW, + "status": "green" if success else "red", + "timing": "yellow", }, ) @@ -268,6 +254,7 @@ class AsyncLogger(AsyncLoggerBase): message="{url:.{url_length}}... 
| Error: {error}", tag=tag, params={"url": url, "url_length": url_length, "error": error}, + boxes=["error"], ) class AsyncFileLogger(AsyncLoggerBase): diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 16bd5f57..963c2d05 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -2,7 +2,6 @@ from .__version__ import __version__ as crawl4ai_version import os import sys import time -from colorama import Fore from pathlib import Path from typing import Optional, List import json @@ -382,8 +381,8 @@ class AsyncWebCrawler: "timing": f"{time.perf_counter() - start_time:.2f}s", }, colors={ - "status": Fore.GREEN if crawl_result.success else Fore.RED, - "timing": Fore.YELLOW, + "status": "green" if crawl_result.success else "red", + "timing": "yellow", }, ) @@ -402,7 +401,7 @@ class AsyncWebCrawler: "status": True, "timing": f"{time.perf_counter() - start_time:.2f}s", }, - colors={"status": Fore.GREEN, "timing": Fore.YELLOW}, + colors={"status": "green", "timing": "yellow"}, ) cached_result.success = bool(html) @@ -422,7 +421,7 @@ class AsyncWebCrawler: self.logger.error_status( url=url, - error=create_box_message(error_message, type="error"), + error=error_message, tag="ERROR", ) diff --git a/crawl4ai/browser_profiler.py b/crawl4ai/browser_profiler.py index 2291faa2..f8b9e2b0 100644 --- a/crawl4ai/browser_profiler.py +++ b/crawl4ai/browser_profiler.py @@ -15,8 +15,8 @@ import shutil import json import subprocess import time -from typing import List, Dict, Optional, Any, Tuple -from colorama import Fore, Style, init +from typing import List, Dict, Optional, Any +from rich.console import Console from .async_configs import BrowserConfig from .browser_manager import ManagedBrowser @@ -45,8 +45,8 @@ class BrowserProfiler: logger (AsyncLoggerBase, optional): Logger for outputting messages. If None, a default AsyncLogger will be created. 
""" - # Initialize colorama for colorful terminal output - init() + # Initialize rich console for colorful input prompts + self.console = Console() # Create a logger if not provided if logger is None: @@ -127,18 +127,18 @@ class BrowserProfiler: profile_path = os.path.join(self.profiles_dir, profile_name) os.makedirs(profile_path, exist_ok=True) - # Print instructions for the user with colorama formatting - border = f"{Fore.CYAN}{'='*80}{Style.RESET_ALL}" - self.logger.info(f"\n{border}", tag="PROFILE") - self.logger.info(f"Creating browser profile: {Fore.GREEN}{profile_name}{Style.RESET_ALL}", tag="PROFILE") - self.logger.info(f"Profile directory: {Fore.YELLOW}{profile_path}{Style.RESET_ALL}", tag="PROFILE") + # Print instructions for the user with rich formatting + border = "{'='*80}" + self.logger.info("{border}", tag="PROFILE", params={"border": f"\n{border}"}, colors={"border": "cyan"}) + self.logger.info("Creating browser profile: {profile_name}", tag="PROFILE", params={"profile_name": profile_name}, colors={"profile_name": "green"}) + self.logger.info("Profile directory: {profile_path}", tag="PROFILE", params={"profile_path": profile_path}, colors={"profile_path": "yellow"}) self.logger.info("\nInstructions:", tag="PROFILE") self.logger.info("1. A browser window will open for you to set up your profile.", tag="PROFILE") - self.logger.info(f"2. {Fore.CYAN}Log in to websites{Style.RESET_ALL}, configure settings, etc. as needed.", tag="PROFILE") - self.logger.info(f"3. When you're done, {Fore.YELLOW}press 'q' in this terminal{Style.RESET_ALL} to close the browser.", tag="PROFILE") + self.logger.info("{segment}, configure settings, etc. as needed.", tag="PROFILE", params={"segment": "2. Log in to websites"}, colors={"segment": "cyan"}) + self.logger.info("3. When you're done, {segment} to close the browser.", tag="PROFILE", params={"segment": "press 'q' in this terminal"}, colors={"segment": "yellow"}) self.logger.info("4. 
The profile will be saved and ready to use with Crawl4AI.", tag="PROFILE") - self.logger.info(f"{border}\n", tag="PROFILE") + self.logger.info("{border}", tag="PROFILE", params={"border": f"{border}\n"}, colors={"border": "cyan"}) # Create managed browser instance managed_browser = ManagedBrowser( @@ -181,7 +181,7 @@ class BrowserProfiler: import select # First output the prompt - self.logger.info(f"{Fore.CYAN}Press '{Fore.WHITE}q{Fore.CYAN}' when you've finished using the browser...{Style.RESET_ALL}", tag="PROFILE") + self.logger.info("Press 'q' when you've finished using the browser...", tag="PROFILE") # Save original terminal settings fd = sys.stdin.fileno() @@ -197,7 +197,7 @@ class BrowserProfiler: if readable: key = sys.stdin.read(1) if key.lower() == 'q': - self.logger.info(f"{Fore.GREEN}Closing browser and saving profile...{Style.RESET_ALL}", tag="PROFILE") + self.logger.info("Closing browser and saving profile...", tag="PROFILE", base_color="green") user_done_event.set() return @@ -223,7 +223,7 @@ class BrowserProfiler: self.logger.error("Failed to start browser process.", tag="PROFILE") return None - self.logger.info(f"Browser launched. {Fore.CYAN}Waiting for you to finish...{Style.RESET_ALL}", tag="PROFILE") + self.logger.info(f"Browser launched. Waiting for you to finish...", tag="PROFILE") # Start listening for keyboard input listener_task = asyncio.create_task(listen_for_quit_command()) @@ -245,10 +245,10 @@ class BrowserProfiler: self.logger.info("Terminating browser process...", tag="PROFILE") await managed_browser.cleanup() - self.logger.success(f"Browser closed. Profile saved at: {Fore.GREEN}{profile_path}{Style.RESET_ALL}", tag="PROFILE") + self.logger.success(f"Browser closed. 
Profile saved at: {profile_path}", tag="PROFILE") except Exception as e: - self.logger.error(f"Error creating profile: {str(e)}", tag="PROFILE") + self.logger.error(f"Error creating profile: {e!s}", tag="PROFILE") await managed_browser.cleanup() return None finally: @@ -440,25 +440,27 @@ class BrowserProfiler: ``` """ while True: - self.logger.info(f"\n{Fore.CYAN}Profile Management Options:{Style.RESET_ALL}", tag="MENU") - self.logger.info(f"1. {Fore.GREEN}Create a new profile{Style.RESET_ALL}", tag="MENU") - self.logger.info(f"2. {Fore.YELLOW}List available profiles{Style.RESET_ALL}", tag="MENU") - self.logger.info(f"3. {Fore.RED}Delete a profile{Style.RESET_ALL}", tag="MENU") + self.logger.info(f"\nProfile Management Options:", tag="MENU") + self.logger.info(f"1. Create a new profile", tag="MENU", base_color="green") + self.logger.info(f"2. List available profiles", tag="MENU", base_color="yellow") + self.logger.info(f"3. Delete a profile", tag="MENU", base_color="red") # Only show crawl option if callback provided if crawl_callback: - self.logger.info(f"4. {Fore.CYAN}Use a profile to crawl a website{Style.RESET_ALL}", tag="MENU") - self.logger.info(f"5. {Fore.MAGENTA}Exit{Style.RESET_ALL}", tag="MENU") + self.logger.info(f"4. Use a profile to crawl a website", tag="MENU", base_color="cyan") + self.logger.info(f"5. Exit", tag="MENU", base_color="magenta") exit_option = "5" else: - self.logger.info(f"4. {Fore.MAGENTA}Exit{Style.RESET_ALL}", tag="MENU") + self.logger.info(f"4. 
Exit", tag="MENU", base_color="magenta") exit_option = "4" - choice = input(f"\n{Fore.CYAN}Enter your choice (1-{exit_option}): {Style.RESET_ALL}") + self.logger.print(f"\n[cyan]Enter your choice (1-{exit_option}): [/cyan]", end="") + choice = input() if choice == "1": # Create new profile - name = input(f"{Fore.GREEN}Enter a name for the new profile (or press Enter for auto-generated name): {Style.RESET_ALL}") + self.console.print("[green]Enter a name for the new profile (or press Enter for auto-generated name): [/green]", end="") + name = input() await self.create_profile(name or None) elif choice == "2": @@ -472,8 +474,8 @@ class BrowserProfiler: # Print profile information with colorama formatting self.logger.info("\nAvailable profiles:", tag="PROFILES") for i, profile in enumerate(profiles): - self.logger.info(f"[{i+1}] {Fore.CYAN}{profile['name']}{Style.RESET_ALL}", tag="PROFILES") - self.logger.info(f" Path: {Fore.YELLOW}{profile['path']}{Style.RESET_ALL}", tag="PROFILES") + self.logger.info(f"[{i+1}] {profile['name']}", tag="PROFILES") + self.logger.info(f" Path: {profile['path']}", tag="PROFILES", base_color="yellow") self.logger.info(f" Created: {profile['created'].strftime('%Y-%m-%d %H:%M:%S')}", tag="PROFILES") self.logger.info(f" Browser type: {profile['type']}", tag="PROFILES") self.logger.info("", tag="PROFILES") # Empty line for spacing @@ -486,12 +488,13 @@ class BrowserProfiler: continue # Display numbered list - self.logger.info(f"\n{Fore.YELLOW}Available profiles:{Style.RESET_ALL}", tag="PROFILES") + self.logger.info(f"\nAvailable profiles:", tag="PROFILES", base_color="yellow") for i, profile in enumerate(profiles): self.logger.info(f"[{i+1}] {profile['name']}", tag="PROFILES") # Get profile to delete - profile_idx = input(f"{Fore.RED}Enter the number of the profile to delete (or 'c' to cancel): {Style.RESET_ALL}") + self.console.print("[red]Enter the number of the profile to delete (or 'c' to cancel): [/red]", end="") + profile_idx = input() 
if profile_idx.lower() == 'c': continue @@ -499,17 +502,18 @@ class BrowserProfiler: idx = int(profile_idx) - 1 if 0 <= idx < len(profiles): profile_name = profiles[idx]["name"] - self.logger.info(f"Deleting profile: {Fore.YELLOW}{profile_name}{Style.RESET_ALL}", tag="PROFILES") + self.logger.info(f"Deleting profile: [yellow]{profile_name}[/yellow]", tag="PROFILES") # Confirm deletion - confirm = input(f"{Fore.RED}Are you sure you want to delete this profile? (y/n): {Style.RESET_ALL}") + self.console.print("[red]Are you sure you want to delete this profile? (y/n): [/red]", end="") + confirm = input() if confirm.lower() == 'y': success = self.delete_profile(profiles[idx]["path"]) if success: - self.logger.success(f"Profile {Fore.GREEN}{profile_name}{Style.RESET_ALL} deleted successfully", tag="PROFILES") + self.logger.success(f"Profile {profile_name} deleted successfully", tag="PROFILES") else: - self.logger.error(f"Failed to delete profile {Fore.RED}{profile_name}{Style.RESET_ALL}", tag="PROFILES") + self.logger.error(f"Failed to delete profile {profile_name}", tag="PROFILES") else: self.logger.error("Invalid profile number", tag="PROFILES") except ValueError: @@ -523,12 +527,13 @@ class BrowserProfiler: continue # Display numbered list - self.logger.info(f"\n{Fore.YELLOW}Available profiles:{Style.RESET_ALL}", tag="PROFILES") + self.logger.info(f"\nAvailable profiles:", tag="PROFILES", base_color="yellow") for i, profile in enumerate(profiles): self.logger.info(f"[{i+1}] {profile['name']}", tag="PROFILES") # Get profile to use - profile_idx = input(f"{Fore.CYAN}Enter the number of the profile to use (or 'c' to cancel): {Style.RESET_ALL}") + self.console.print("[cyan]Enter the number of the profile to use (or 'c' to cancel): [/cyan]", end="") + profile_idx = input() if profile_idx.lower() == 'c': continue @@ -536,7 +541,8 @@ class BrowserProfiler: idx = int(profile_idx) - 1 if 0 <= idx < len(profiles): profile_path = profiles[idx]["path"] - url = 
input(f"{Fore.CYAN}Enter the URL to crawl: {Style.RESET_ALL}") + self.console.print("[cyan]Enter the URL to crawl: [/cyan]", end="") + url = input() if url: # Call the provided crawl callback await crawl_callback(profile_path, url) @@ -600,10 +606,10 @@ class BrowserProfiler: border = f"{Fore.CYAN}{'='*80}{Style.RESET_ALL}" self.logger.info(f"\n{border}", tag="CDP") self.logger.info(f"Launching standalone browser with CDP debugging", tag="CDP") - self.logger.info(f"Browser type: {Fore.GREEN}{browser_type}{Style.RESET_ALL}", tag="CDP") - self.logger.info(f"Profile path: {Fore.YELLOW}{profile_path}{Style.RESET_ALL}", tag="CDP") - self.logger.info(f"Debugging port: {Fore.CYAN}{debugging_port}{Style.RESET_ALL}", tag="CDP") - self.logger.info(f"Headless mode: {Fore.CYAN}{headless}{Style.RESET_ALL}", tag="CDP") + self.logger.info("Browser type: {browser_type}", tag="CDP", params={"browser_type": browser_type}, colors={"browser_type": "cyan"}) + self.logger.info("Profile path: {profile_path}", tag="CDP", params={"profile_path": profile_path}, colors={"profile_path": "yellow"}) + self.logger.info(f"Debugging port: {debugging_port}", tag="CDP") + self.logger.info(f"Headless mode: {headless}", tag="CDP") # Create managed browser instance managed_browser = ManagedBrowser( @@ -646,7 +652,7 @@ class BrowserProfiler: import select # First output the prompt - self.logger.info(f"{Fore.CYAN}Press '{Fore.WHITE}q{Fore.CYAN}' to stop the browser and exit...{Style.RESET_ALL}", tag="CDP") + self.logger.info("Press 'q' to stop the browser and exit...", tag="CDP") # Save original terminal settings fd = sys.stdin.fileno() @@ -662,7 +668,7 @@ class BrowserProfiler: if readable: key = sys.stdin.read(1) if key.lower() == 'q': - self.logger.info(f"{Fore.GREEN}Closing browser...{Style.RESET_ALL}", tag="CDP") + self.logger.info("Closing browser...", tag="CDP") user_done_event.set() return @@ -722,14 +728,14 @@ class BrowserProfiler: cdp_url, config_json = await get_cdp_json(debugging_port) if 
cdp_url: - self.logger.success(f"CDP URL: {Fore.GREEN}{cdp_url}{Style.RESET_ALL}", tag="CDP") + self.logger.success(f"CDP URL: {cdp_url}", tag="CDP") if config_json: # Display relevant CDP information - self.logger.info(f"Browser: {Fore.CYAN}{config_json.get('Browser', 'Unknown')}{Style.RESET_ALL}", tag="CDP") - self.logger.info(f"Protocol Version: {config_json.get('Protocol-Version', 'Unknown')}", tag="CDP") + self.logger.info(f"Browser: {config_json.get('Browser', 'Unknown')}", tag="CDP", colors={"Browser": "cyan"}) + self.logger.info(f"Protocol Version: {config_json.get('Protocol-Version', 'Unknown')}", tag="CDP", colors={"Protocol-Version": "cyan"}) if 'webSocketDebuggerUrl' in config_json: - self.logger.info(f"WebSocket URL: {Fore.GREEN}{config_json['webSocketDebuggerUrl']}{Style.RESET_ALL}", tag="CDP") + self.logger.info("WebSocket URL: {webSocketDebuggerUrl}", tag="CDP", params={"webSocketDebuggerUrl": config_json['webSocketDebuggerUrl']}, colors={"webSocketDebuggerUrl": "green"}) else: self.logger.warning("Could not retrieve CDP configuration JSON", tag="CDP") else: @@ -757,7 +763,7 @@ class BrowserProfiler: self.logger.info("Terminating browser process...", tag="CDP") await managed_browser.cleanup() - self.logger.success(f"Browser closed.", tag="CDP") + self.logger.success("Browser closed.", tag="CDP") except Exception as e: self.logger.error(f"Error launching standalone browser: {str(e)}", tag="CDP") diff --git a/crawl4ai/content_filter_strategy.py b/crawl4ai/content_filter_strategy.py index 8d7a51b4..35c6ce8c 100644 --- a/crawl4ai/content_filter_strategy.py +++ b/crawl4ai/content_filter_strategy.py @@ -28,7 +28,8 @@ import hashlib from pathlib import Path from concurrent.futures import ThreadPoolExecutor from .async_logger import AsyncLogger, LogLevel -from colorama import Fore, Style +from rich.console import Console +from rich.text import Text class RelevantContentFilter(ABC): @@ -846,8 +847,7 @@ class LLMContentFilter(RelevantContentFilter): }, 
colors={ **AsyncLogger.DEFAULT_COLORS, - LogLevel.INFO: Fore.MAGENTA - + Style.DIM, # Dimmed purple for LLM ops + LogLevel.INFO: "dim magenta" # Dimmed purple for LLM ops }, ) else: @@ -892,7 +892,7 @@ class LLMContentFilter(RelevantContentFilter): "Starting LLM markdown content filtering process", tag="LLM", params={"provider": self.llm_config.provider}, - colors={"provider": Fore.CYAN}, + colors={"provider": "cyan"}, ) # Cache handling @@ -929,7 +929,7 @@ class LLMContentFilter(RelevantContentFilter): "LLM markdown: Split content into {chunk_count} chunks", tag="CHUNK", params={"chunk_count": len(html_chunks)}, - colors={"chunk_count": Fore.YELLOW}, + colors={"chunk_count": "yellow"}, ) start_time = time.time() @@ -1038,7 +1038,7 @@ class LLMContentFilter(RelevantContentFilter): "LLM markdown: Completed processing in {time:.2f}s", tag="LLM", params={"time": end_time - start_time}, - colors={"time": Fore.YELLOW}, + colors={"time": "yellow"}, ) result = ordered_results if ordered_results else [] diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 02d105a9..7b3057b3 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -20,7 +20,6 @@ from urllib.parse import urljoin import requests from requests.exceptions import InvalidSchema import xxhash -from colorama import Fore, Style, init import textwrap import cProfile import pstats @@ -441,14 +440,13 @@ def create_box_message( str: A formatted string containing the styled message box. 
""" - init() - # Define border and text colors for different types styles = { - "warning": (Fore.YELLOW, Fore.LIGHTYELLOW_EX, "⚠"), - "info": (Fore.BLUE, Fore.LIGHTBLUE_EX, "ℹ"), - "success": (Fore.GREEN, Fore.LIGHTGREEN_EX, "✓"), - "error": (Fore.RED, Fore.LIGHTRED_EX, "×"), + "warning": ("yellow", "bright_yellow", "⚠"), + "info": ("blue", "bright_blue", "ℹ"), + "debug": ("lightblack", "bright_black", "⋯"), + "success": ("green", "bright_green", "✓"), + "error": ("red", "bright_red", "×"), } border_color, text_color, prefix = styles.get(type.lower(), styles["info"]) @@ -480,12 +478,12 @@ def create_box_message( # Create the box with colored borders and lighter text horizontal_line = h_line * (width - 1) box = [ - f"{border_color}{tl}{horizontal_line}{tr}", + f"[{border_color}]{tl}{horizontal_line}{tr}[/{border_color}]", *[ - f"{border_color}{v_line}{text_color} {line:<{width-2}}{border_color}{v_line}" + f"[{border_color}]{v_line}[{text_color}] {line:<{width-2}}[/{text_color}][{border_color}]{v_line}[/{border_color}]" for line in formatted_lines ], - f"{border_color}{bl}{horizontal_line}{br}{Style.RESET_ALL}", + f"[{border_color}]{bl}{horizontal_line}{br}[/{border_color}]", ] result = "\n".join(box) @@ -2774,4 +2772,3 @@ def preprocess_html_for_schema(html_content, text_threshold=100, attr_value_thre # Fallback for parsing errors return html_content[:max_size] if len(html_content) > max_size else html_content - From ccec40ed174df45e6b4f0b00efae331fa4a3ba3a Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 24 Apr 2025 18:36:25 +0800 Subject: [PATCH 2/8] feat(models): add dedicated tables field to CrawlResult - Add tables field to CrawlResult model while maintaining backward compatibility - Update async_webcrawler.py to extract tables from media and pass to tables field - Update crypto_analysis_example.py to use the new tables field - Add /config/dump examples to demo_docker_api.py - Bump version to 0.6.1 --- CHANGELOG.md | 9 ++ crawl4ai/__version__.py | 2 +- 
crawl4ai/async_webcrawler.py | 3 + crawl4ai/models.py | 3 +- deploy/docker/static/playground/index.html | 176 ++++++++++++++++++--- docs/examples/crypto_analysis_example.py | 8 +- docs/examples/docker/demo_docker_api.py | 112 ++++++++++++- 7 files changed, 287 insertions(+), 26 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9205c0b0..16f96f47 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,15 @@ All notable changes to Crawl4AI will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.6.1] - 2025-04-24 + +### Added +- New dedicated `tables` field in `CrawlResult` model for better table extraction handling +- Updated crypto_analysis_example.py to use the new tables field with backward compatibility + +### Changed +- Improved playground UI in Docker deployment with better endpoint handling and UI feedback + ## [0.6.0] ‑ 2025‑04‑22 ### Added diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index ee78de23..fe6f9b8a 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,3 +1,3 @@ # crawl4ai/_version.py -__version__ = "0.6.0" +__version__ = "0.6.1" diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 98acfd12..bb3765c2 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -496,11 +496,13 @@ class AsyncWebCrawler: cleaned_html = sanitize_input_encode( result.get("cleaned_html", "")) media = result.get("media", {}) + tables = media.pop("tables", []) if isinstance(media, dict) else [] links = result.get("links", {}) metadata = result.get("metadata", {}) else: cleaned_html = sanitize_input_encode(result.cleaned_html) media = result.media.model_dump() + tables = media.pop("tables", []) links = result.links.model_dump() metadata = result.metadata @@ -627,6 +629,7 @@ class AsyncWebCrawler: cleaned_html=cleaned_html, 
markdown=markdown_result, media=media, + tables=tables, # NEW links=links, metadata=metadata, screenshot=screenshot_data, diff --git a/crawl4ai/models.py b/crawl4ai/models.py index 32cca3ed..64270b77 100644 --- a/crawl4ai/models.py +++ b/crawl4ai/models.py @@ -1,4 +1,4 @@ -from pydantic import BaseModel, HttpUrl, PrivateAttr +from pydantic import BaseModel, HttpUrl, PrivateAttr, Field from typing import List, Dict, Optional, Callable, Awaitable, Union, Any from typing import AsyncGenerator from typing import Generic, TypeVar @@ -150,6 +150,7 @@ class CrawlResult(BaseModel): redirected_url: Optional[str] = None network_requests: Optional[List[Dict[str, Any]]] = None console_messages: Optional[List[Dict[str, Any]]] = None + tables: List[Dict] = Field(default_factory=list) # NEW – [{headers,rows,caption,summary}] class Config: arbitrary_types_allowed = True diff --git a/deploy/docker/static/playground/index.html b/deploy/docker/static/playground/index.html index 8f0e2bdd..7af96f1f 100644 --- a/deploy/docker/static/playground/index.html +++ b/deploy/docker/static/playground/index.html @@ -193,7 +193,48 @@ -
+ + + + + + + +
Advanced Config (Python → auto‑JSON) @@ -437,6 +478,33 @@ cm.setValue(TEMPLATES[e.target.value]); document.getElementById('cfg-status').textContent = ''; }); + + // Handle endpoint selection change to show appropriate options + document.getElementById('endpoint').addEventListener('change', function(e) { + const endpoint = e.target.value; + const mdOptions = document.getElementById('md-options'); + const llmOptions = document.getElementById('llm-options'); + const advConfig = document.getElementById('adv-config'); + + // Hide all option sections first + mdOptions.classList.add('hidden'); + llmOptions.classList.add('hidden'); + advConfig.classList.add('hidden'); + + // Show the appropriate section based on endpoint + if (endpoint === 'md') { + mdOptions.classList.remove('hidden'); + // Auto-open the /md options + mdOptions.setAttribute('open', ''); + } else if (endpoint === 'llm') { + llmOptions.classList.remove('hidden'); + // Auto-open the /llm options + llmOptions.setAttribute('open', ''); + } else { + // For /crawl endpoints, show the advanced config + advConfig.classList.remove('hidden'); + } + }); async function pyConfigToJson() { const code = cm.getValue().trim(); @@ -494,10 +562,18 @@ } // Generate code snippets - function generateSnippets(api, payload) { + function generateSnippets(api, payload, method = 'POST') { // Python snippet const pyCodeEl = document.querySelector('#python-content code'); - const pySnippet = `import httpx\n\nasync def crawl():\n async with httpx.AsyncClient() as client:\n response = await client.post(\n "${window.location.origin}${api}",\n json=${JSON.stringify(payload, null, 4).replace(/\n/g, '\n ')}\n )\n return response.json()`; + let pySnippet; + + if (method === 'GET') { + // GET request (for /llm endpoint) + pySnippet = `import httpx\n\nasync def crawl():\n async with httpx.AsyncClient() as client:\n response = await client.get(\n "${window.location.origin}${api}"\n )\n return response.json()`; + } else { + // POST request (for 
/crawl and /md endpoints) + pySnippet = `import httpx\n\nasync def crawl():\n async with httpx.AsyncClient() as client:\n response = await client.post(\n "${window.location.origin}${api}",\n json=${JSON.stringify(payload, null, 4).replace(/\n/g, '\n ')}\n )\n return response.json()`; + } pyCodeEl.textContent = pySnippet; pyCodeEl.className = 'python hljs'; // Reset classes @@ -505,7 +581,15 @@ // cURL snippet const curlCodeEl = document.querySelector('#curl-content code'); - const curlSnippet = `curl -X POST ${window.location.origin}${api} \\\n -H "Content-Type: application/json" \\\n -d '${JSON.stringify(payload)}'`; + let curlSnippet; + + if (method === 'GET') { + // GET request (for /llm endpoint) + curlSnippet = `curl -X GET "${window.location.origin}${api}"`; + } else { + // POST request (for /crawl and /md endpoints) + curlSnippet = `curl -X POST ${window.location.origin}${api} \\\n -H "Content-Type: application/json" \\\n -d '${JSON.stringify(payload)}'`; + } curlCodeEl.textContent = curlSnippet; curlCodeEl.className = 'bash hljs'; // Reset classes @@ -536,20 +620,39 @@ const endpointMap = { crawl: '/crawl', - }; - - /*const endpointMap = { - crawl: '/crawl', - crawl_stream: '/crawl/stream', + // crawl_stream: '/crawl/stream', md: '/md', llm: '/llm' - };*/ + }; const api = endpointMap[endpoint]; - const payload = { - urls, - ...advConfig - }; + let payload; + + // Create appropriate payload based on endpoint type + if (endpoint === 'md') { + // Get values from the /md specific inputs + const filterType = document.getElementById('md-filter').value; + const query = document.getElementById('md-query').value.trim(); + const cache = document.getElementById('md-cache').value; + + // MD endpoint expects: { url, f, q, c } + payload = { + url: urls[0], // Take first URL + f: filterType, // Lowercase filter type as required by server + q: query || null, // Use the query if provided, otherwise null + c: cache + }; + } else if (endpoint === 'llm') { + // LLM endpoint 
has a different URL pattern and uses query params + // This will be handled directly in the fetch below + payload = null; + } else { + // Default payload for /crawl and /crawl/stream + payload = { + urls, + ...advConfig + }; + } updateStatus('processing'); @@ -557,7 +660,18 @@ const startTime = performance.now(); let response, responseData; - if (endpoint === 'crawl_stream') { + if (endpoint === 'llm') { + // Special handling for LLM endpoint which uses URL pattern: /llm/{encoded_url}?q={query} + const url = urls[0]; + const encodedUrl = encodeURIComponent(url); + // Get the question from the LLM-specific input + const question = document.getElementById('llm-question').value.trim() || "What is this page about?"; + + response = await fetch(`${api}/${encodedUrl}?q=${encodeURIComponent(question)}`, { + method: 'GET', + headers: { 'Accept': 'application/json' } + }); + } else if (endpoint === 'crawl_stream') { // Stream processing response = await fetch(api, { method: 'POST', @@ -597,7 +711,7 @@ document.querySelector('#response-content code').className = 'json hljs'; // Reset classes forceHighlightElement(document.querySelector('#response-content code')); } else { - // Regular request + // Regular request (handles /crawl and /md) response = await fetch(api, { method: 'POST', headers: { 'Content-Type': 'application/json' }, @@ -625,7 +739,16 @@ } forceHighlightElement(document.querySelector('#response-content code')); - generateSnippets(api, payload); + + // For generateSnippets, handle the LLM case specially + if (endpoint === 'llm') { + const url = urls[0]; + const encodedUrl = encodeURIComponent(url); + const question = document.getElementById('llm-question').value.trim() || "What is this page about?"; + generateSnippets(`${api}/${encodedUrl}?q=${encodeURIComponent(question)}`, null, 'GET'); + } else { + generateSnippets(api, payload); + } } catch (error) { console.error('Error:', error); updateStatus('error'); @@ -807,9 +930,24 @@ }); }); } + + // Function to 
initialize UI based on selected endpoint + function initUI() { + // Trigger the endpoint change handler to set initial UI state + const endpointSelect = document.getElementById('endpoint'); + const event = new Event('change'); + endpointSelect.dispatchEvent(event); + + // Initialize copy buttons + initCopyButtons(); + } - // Call this in your DOMContentLoaded or initialization - initCopyButtons(); + // Initialize on page load + document.addEventListener('DOMContentLoaded', initUI); + // Also call it immediately in case the script runs after DOM is already loaded + if (document.readyState !== 'loading') { + initUI(); + } diff --git a/docs/examples/crypto_analysis_example.py b/docs/examples/crypto_analysis_example.py index 10b9e7ab..c5537a93 100644 --- a/docs/examples/crypto_analysis_example.py +++ b/docs/examples/crypto_analysis_example.py @@ -391,12 +391,14 @@ async def main(): # Process results raw_df = pd.DataFrame() for result in results: - if result.success and result.media["tables"]: + # Use the new tables field, falling back to media["tables"] for backward compatibility + tables = result.tables if hasattr(result, "tables") and result.tables else result.media.get("tables", []) + if result.success and tables: # Extract primary market table # DataFrame raw_df = pd.DataFrame( - result.media["tables"][0]["rows"], - columns=result.media["tables"][0]["headers"], + tables[0]["rows"], + columns=tables[0]["headers"], ) break diff --git a/docs/examples/docker/demo_docker_api.py b/docs/examples/docker/demo_docker_api.py index 09625248..d989e030 100644 --- a/docs/examples/docker/demo_docker_api.py +++ b/docs/examples/docker/demo_docker_api.py @@ -4,6 +4,8 @@ import json import os import time from typing import List, Dict, Any, AsyncGenerator, Optional +import textwrap # ← new: for pretty code literals +import urllib.parse # ← needed for URL-safe /llm calls from dotenv import load_dotenv from rich.console import Console from rich.syntax import Syntax @@ -969,13 +971,111 @@ 
async def demo_deep_with_ssl(client: httpx.AsyncClient): else: console.print(f" [red]✘[/] URL: [link={result['url']}]{result['url']}[/link] | Crawl failed.") +# 7. Markdown helper endpoint +async def demo_markdown_endpoint(client: httpx.AsyncClient): + """ + One-shot helper around /md. + Fetches PYTHON_URL with FIT filter and prints the first 500 chars of Markdown. + """ + target_url = PYTHON_URL + payload = {"url": target_url, "f": "fit", "q": None, "c": "0"} + + console.rule("[bold blue]Demo 7a: /md Endpoint[/]", style="blue") + print_payload(payload) + + try: + t0 = time.time() + resp = await client.post("/md", json=payload) + dt = time.time() - t0 + console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)") + resp.raise_for_status() + md = resp.json().get("markdown", "") + snippet = (md[:500] + "...") if len(md) > 500 else md + console.print(Panel(snippet, title="Markdown snippet", border_style="cyan", expand=False)) + except Exception as e: + console.print(f"[bold red]Error hitting /md:[/] {e}") + +# 8. LLM QA helper endpoint +async def demo_llm_endpoint(client: httpx.AsyncClient): + """ + Quick QA round-trip with /llm. + Asks a trivial question against SIMPLE_URL just to show wiring. + """ + page_url = SIMPLE_URL + question = "What is the title of this page?" 
+ + console.rule("[bold magenta]Demo 7b: /llm Endpoint[/]", style="magenta") + enc = urllib.parse.quote_plus(page_url, safe="") + console.print(f"GET /llm/{enc}?q={question}") + + try: + t0 = time.time() + resp = await client.get(f"/llm/{enc}", params={"q": question}) + dt = time.time() - t0 + console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)") + resp.raise_for_status() + answer = resp.json().get("answer", "") + console.print(Panel(answer or "No answer returned", title="LLM answer", border_style="magenta", expand=False)) + except Exception as e: + console.print(f"[bold red]Error hitting /llm:[/] {e}") + + +# 9. /config/dump helpers -------------------------------------------------- + +async def demo_config_dump_valid(client: httpx.AsyncClient): + """ + Send a single top-level CrawlerRunConfig(...) expression and show the dump. + """ + code_snippet = "CrawlerRunConfig(cache_mode='BYPASS', screenshot=True)" + payload = {"code": code_snippet} + + console.rule("[bold blue]Demo 8a: /config/dump (valid)[/]", style="blue") + print_payload(payload) + + try: + t0 = time.time() + resp = await client.post("/config/dump", json=payload) + dt = time.time() - t0 + console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)") + resp.raise_for_status() + dump_json = resp.json() + console.print(Panel(Syntax(json.dumps(dump_json, indent=2), "json", theme="monokai"), title="Dump()", border_style="cyan")) + except Exception as e: + console.print(f"[bold red]Error in valid /config/dump call:[/] {e}") + + +async def demo_config_dump_invalid(client: httpx.AsyncClient): + """ + Purposely break the rule (nested call) to show the 400 parse error. 
+ """ + bad_code = textwrap.dedent(""" + BrowserConfig(headless=True); CrawlerRunConfig() + """).strip() + payload = {"code": bad_code} + + console.rule("[bold magenta]Demo 8b: /config/dump (invalid)[/]", style="magenta") + print_payload(payload) + + try: + resp = await client.post("/config/dump", json=payload) + console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/]") + resp.raise_for_status() # should throw -> except + except httpx.HTTPStatusError as e: + console.print("[cyan]Expected parse/validation failure captured:[/]") + try: + console.print(Panel(Syntax(json.dumps(e.response.json(), indent=2), "json", theme="fruity"), title="Error payload")) + except Exception: + console.print(e.response.text) + except Exception as e: + console.print(f"[bold red]Unexpected error during invalid test:[/] {e}") + # --- Update Main Runner to include new demo --- async def main_demo(): async with httpx.AsyncClient(base_url=BASE_URL, timeout=300.0) as client: if not await check_server_health(client): return - + # --- Run Demos --- await demo_basic_single_url(client) await demo_basic_multi_url(client) @@ -1001,7 +1101,15 @@ async def main_demo(): await demo_deep_with_css_extraction(client) await demo_deep_with_llm_extraction(client) # Skips if no common LLM key env var await demo_deep_with_proxy(client) # Skips if no PROXIES env var - await demo_deep_with_ssl(client) # Added the new demo + await demo_deep_with_ssl(client) # Added the new demo + + # --- Helper endpoints --- + await demo_markdown_endpoint(client) + await demo_llm_endpoint(client) + + # --- /config/dump sanity checks --- + await demo_config_dump_valid(client) + await demo_config_dump_invalid(client) console.rule("[bold green]Demo Complete[/]", style="green") From 2140d9aca499580328886b0edfb0d6c4e502d4fd Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 26 Apr 2025 21:09:50 +0800 Subject: [PATCH 3/8] fix(browser): correct headless mode default behavior Modify 
BrowserConfig to respect explicit headless parameter setting instead of forcing True. Update version to 0.6.2 and clean up code formatting in examples. BREAKING CHANGE: BrowserConfig no longer defaults to headless=True when explicitly set to False --- crawl4ai/__version__.py | 2 +- crawl4ai/async_configs.py | 2 +- docs/examples/docker/demo_docker_api.py | 609 ++++++++++++++++-------- docs/examples/hello_world.py | 2 +- 4 files changed, 404 insertions(+), 211 deletions(-) diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index fe6f9b8a..8a5cb2c4 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,3 +1,3 @@ # crawl4ai/_version.py -__version__ = "0.6.1" +__version__ = "0.6.2" diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index dd5c584a..c93516bd 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -427,7 +427,7 @@ class BrowserConfig: host: str = "localhost", ): self.browser_type = browser_type - self.headless = headless or True + self.headless = headless self.browser_mode = browser_mode self.use_managed_browser = use_managed_browser self.cdp_url = cdp_url diff --git a/docs/examples/docker/demo_docker_api.py b/docs/examples/docker/demo_docker_api.py index d989e030..0a3d51af 100644 --- a/docs/examples/docker/demo_docker_api.py +++ b/docs/examples/docker/demo_docker_api.py @@ -21,17 +21,20 @@ console = Console() BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020") BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235") # Target URLs +SIMPLE_URL = "https://example.com" # For demo purposes SIMPLE_URL = "https://httpbin.org/html" LINKS_URL = "https://httpbin.org/links/10/0" -FORMS_URL = "https://httpbin.org/forms/post" # For JS demo -BOOKS_URL = "http://books.toscrape.com/" # For CSS extraction -PYTHON_URL = "https://python.org" # For deeper crawl +FORMS_URL = "https://httpbin.org/forms/post" # For JS demo +BOOKS_URL = "http://books.toscrape.com/" # For CSS extraction 
+PYTHON_URL = "https://python.org" # For deeper crawl # Use the same sample site as deep crawl tests for consistency -DEEP_CRAWL_BASE_URL = os.getenv("DEEP_CRAWL_TEST_SITE", "https://docs.crawl4ai.com/samples/deepcrawl/") +DEEP_CRAWL_BASE_URL = os.getenv( + "DEEP_CRAWL_TEST_SITE", "https://docs.crawl4ai.com/samples/deepcrawl/") DEEP_CRAWL_DOMAIN = "docs.crawl4ai.com" # --- Helper Functions --- + async def check_server_health(client: httpx.AsyncClient): """Check if the server is healthy before running tests.""" console.print("[bold cyan]Checking server health...[/]", end="") @@ -39,7 +42,8 @@ async def check_server_health(client: httpx.AsyncClient): response = await client.get("/health", timeout=10.0) response.raise_for_status() health_data = response.json() - console.print(f"[bold green] Server OK! Version: {health_data.get('version', 'N/A')}[/]") + console.print( + f"[bold green] Server OK! Version: {health_data.get('version', 'N/A')}[/]") return True except (httpx.RequestError, httpx.HTTPStatusError) as e: console.print(f"\n[bold red]Server health check FAILED:[/]") @@ -47,10 +51,12 @@ async def check_server_health(client: httpx.AsyncClient): console.print(f"Is the server running at {BASE_URL}?") return False except Exception as e: - console.print(f"\n[bold red]An unexpected error occurred during health check:[/]") + console.print( + f"\n[bold red]An unexpected error occurred during health check:[/]") console.print(e) return False + def print_payload(payload: Dict[str, Any]): """Prints the JSON payload nicely with a dark theme.""" syntax = Syntax( @@ -60,7 +66,9 @@ def print_payload(payload: Dict[str, Any]): line_numbers=False, word_wrap=True # Added word wrap for potentially long payloads ) - console.print(Panel(syntax, title="Request Payload", border_style="blue", expand=False)) + console.print(Panel(syntax, title="Request Payload", + border_style="blue", expand=False)) + def print_result_summary(results: List[Dict[str, Any]], title: str = "Crawl Results 
Summary", max_items: int = 3): """Prints a concise summary of crawl results.""" @@ -68,11 +76,13 @@ def print_result_summary(results: List[Dict[str, Any]], title: str = "Crawl Resu console.print(f"[yellow]{title}: No results received.[/]") return - console.print(Panel(f"[bold]{title}[/]", border_style="green", expand=False)) + console.print(Panel(f"[bold]{title}[/]", + border_style="green", expand=False)) count = 0 for result in results: if count >= max_items: - console.print(f"... (showing first {max_items} of {len(results)} results)") + console.print( + f"... (showing first {max_items} of {len(results)} results)") break count += 1 success_icon = "[green]✔[/]" if result.get('success') else "[red]✘[/]" @@ -81,14 +91,16 @@ def print_result_summary(results: List[Dict[str, Any]], title: str = "Crawl Resu content_info = "" if result.get('extracted_content'): content_str = json.dumps(result['extracted_content']) - snippet = (content_str[:70] + '...') if len(content_str) > 70 else content_str + snippet = ( + content_str[:70] + '...') if len(content_str) > 70 else content_str content_info = f" | Extracted: [cyan]{snippet}[/]" elif result.get('markdown'): - content_info = f" | Markdown: [cyan]Present[/]" + content_info = f" | Markdown: [cyan]Present[/]" elif result.get('html'): content_info = f" | HTML Size: [cyan]{len(result['html'])}[/]" - console.print(f"{success_icon} URL: [link={url}]{url}[/link] (Status: {status}){content_info}") + console.print( + f"{success_icon} URL: [link={url}]{url}[/link] (Status: {status}){content_info}") if "metadata" in result and "depth" in result["metadata"]: console.print(f" Depth: {result['metadata']['depth']}") if not result.get('success') and result.get('error_message'): @@ -104,7 +116,8 @@ async def make_request(client: httpx.AsyncClient, endpoint: str, payload: Dict[s start_time = time.time() response = await client.post(endpoint, json=payload) duration = time.time() - start_time - console.print(f"Response Status: [bold {'green' if 
response.is_success else 'red'}]{response.status_code}[/] (took {duration:.2f}s)") + console.print( + f"Response Status: [bold {'green' if response.is_success else 'red'}]{response.status_code}[/] (took {duration:.2f}s)") response.raise_for_status() data = response.json() if data.get("success"): @@ -119,7 +132,8 @@ async def make_request(client: httpx.AsyncClient, endpoint: str, payload: Dict[s console.print(f"[bold red]HTTP Error:[/]") console.print(f"Status: {e.response.status_code}") try: - console.print(Panel(Syntax(json.dumps(e.response.json(), indent=2), "json", theme="default"), title="Error Response")) + console.print(Panel(Syntax(json.dumps( + e.response.json(), indent=2), "json", theme="default"), title="Error Response")) except json.JSONDecodeError: console.print(f"Response Body: {e.response.text}") except httpx.RequestError as e: @@ -128,21 +142,24 @@ async def make_request(client: httpx.AsyncClient, endpoint: str, payload: Dict[s console.print(f"[bold red]Unexpected Error: {e}[/]") return None + async def stream_request(client: httpx.AsyncClient, endpoint: str, payload: Dict[str, Any], title: str): """Handles streaming POST requests.""" console.rule(f"[bold magenta]{title}[/]", style="magenta") print_payload(payload) - console.print(f"Sending POST stream request to {client.base_url}{endpoint}...") + console.print( + f"Sending POST stream request to {client.base_url}{endpoint}...") all_results = [] - initial_status_code = None # Store initial status code + initial_status_code = None # Store initial status code try: start_time = time.time() async with client.stream("POST", endpoint, json=payload) as response: - initial_status_code = response.status_code # Capture initial status - duration = time.time() - start_time # Time to first byte potentially - console.print(f"Initial Response Status: [bold {'green' if response.is_success else 'red'}]{initial_status_code}[/] (first byte ~{duration:.2f}s)") - response.raise_for_status() # Raise exception for bad 
*initial* status codes + initial_status_code = response.status_code # Capture initial status + duration = time.time() - start_time # Time to first byte potentially + console.print( + f"Initial Response Status: [bold {'green' if response.is_success else 'red'}]{initial_status_code}[/] (first byte ~{duration:.2f}s)") + response.raise_for_status() # Raise exception for bad *initial* status codes console.print("[magenta]--- Streaming Results ---[/]") completed = False @@ -152,24 +169,31 @@ async def stream_request(client: httpx.AsyncClient, endpoint: str, payload: Dict data = json.loads(line) if data.get("status") == "completed": completed = True - console.print("[bold green]--- Stream Completed ---[/]") + console.print( + "[bold green]--- Stream Completed ---[/]") break - elif data.get("url"): # Looks like a result dictionary + elif data.get("url"): # Looks like a result dictionary all_results.append(data) # Display summary info as it arrives - success_icon = "[green]✔[/]" if data.get('success') else "[red]✘[/]" + success_icon = "[green]✔[/]" if data.get( + 'success') else "[red]✘[/]" url = data.get('url', 'N/A') # Display status code FROM THE RESULT DATA if available result_status = data.get('status_code', 'N/A') - console.print(f" {success_icon} Received: [link={url}]{url}[/link] (Status: {result_status})") + console.print( + f" {success_icon} Received: [link={url}]{url}[/link] (Status: {result_status})") if not data.get('success') and data.get('error_message'): - console.print(f" [red]Error: {data['error_message']}[/]") + console.print( + f" [red]Error: {data['error_message']}[/]") else: - console.print(f" [yellow]Stream meta-data:[/yellow] {data}") + console.print( + f" [yellow]Stream meta-data:[/yellow] {data}") except json.JSONDecodeError: - console.print(f" [red]Stream decode error for line:[/red] {line}") + console.print( + f" [red]Stream decode error for line:[/red] {line}") if not completed: - console.print("[bold yellow]Warning: Stream ended without 
'completed' marker.[/]") + console.print( + "[bold yellow]Warning: Stream ended without 'completed' marker.[/]") except httpx.HTTPStatusError as e: # Use the captured initial status code if available, otherwise from the exception @@ -177,18 +201,21 @@ async def stream_request(client: httpx.AsyncClient, endpoint: str, payload: Dict console.print(f"[bold red]HTTP Error (Initial Request):[/]") console.print(f"Status: {status}") try: - console.print(Panel(Syntax(json.dumps(e.response.json(), indent=2), "json", theme="default"), title="Error Response")) + console.print(Panel(Syntax(json.dumps( + e.response.json(), indent=2), "json", theme="default"), title="Error Response")) except json.JSONDecodeError: console.print(f"Response Body: {e.response.text}") except httpx.RequestError as e: console.print(f"[bold red]Request Error: {e}[/]") except Exception as e: console.print(f"[bold red]Unexpected Error during streaming: {e}[/]") - console.print_exception(show_locals=False) # Print stack trace for unexpected errors + # Print stack trace for unexpected errors + console.print_exception(show_locals=False) # Call print_result_summary with the *collected* results AFTER the stream is done print_result_summary(all_results, title=f"{title} Collected Results") + def load_proxies_from_env() -> List[Dict]: """ Load proxies from the PROXIES environment variable. 
@@ -199,7 +226,7 @@ def load_proxies_from_env() -> List[Dict]: proxies_str = os.getenv("PROXIES", "") if not proxies_str: # console.print("[yellow]PROXIES environment variable not set or empty.[/]") - return proxies_params_list # Return empty list if not set + return proxies_params_list # Return empty list if not set try: proxy_entries = proxies_str.split(",") @@ -211,38 +238,40 @@ def load_proxies_from_env() -> List[Dict]: parts = entry.split(":") proxy_dict = {} - if len(parts) == 4: # Format: IP:PORT:USER:PASS + if len(parts) == 4: # Format: IP:PORT:USER:PASS ip, port, username, password = parts proxy_dict = { - "server": f"http://{ip}:{port}", # Assuming http protocol + "server": f"http://{ip}:{port}", # Assuming http protocol "username": username, "password": password, # "ip": ip # 'ip' is not a standard ProxyConfig param, 'server' contains it } - elif len(parts) == 2: # Format: IP:PORT + elif len(parts) == 2: # Format: IP:PORT ip, port = parts proxy_dict = { "server": f"http://{ip}:{port}", # "ip": ip } else: - console.print(f"[yellow]Skipping invalid proxy string format:[/yellow] {entry}") - continue + console.print( + f"[yellow]Skipping invalid proxy string format:[/yellow] {entry}") + continue proxies_params_list.append(proxy_dict) except Exception as e: - console.print(f"[red]Error loading proxies from environment:[/red] {e}") + console.print( + f"[red]Error loading proxies from environment:[/red] {e}") if proxies_params_list: - console.print(f"[cyan]Loaded {len(proxies_params_list)} proxies from environment.[/]") + console.print( + f"[cyan]Loaded {len(proxies_params_list)} proxies from environment.[/]") # else: # console.print("[yellow]No valid proxies loaded from environment.[/]") return proxies_params_list - # --- Demo Functions --- # 1. 
Basic Crawling @@ -250,11 +279,17 @@ async def demo_basic_single_url(client: httpx.AsyncClient): payload = { "urls": [SIMPLE_URL], "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, - "crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "BYPASS"}} + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "cache_mode": "BYPASS" + } + } } result = await make_request(client, "/crawl", payload, "Demo 1a: Basic Single URL Crawl") return result + async def demo_basic_multi_url(client: httpx.AsyncClient): payload = { "urls": [SIMPLE_URL, LINKS_URL], @@ -264,16 +299,31 @@ async def demo_basic_multi_url(client: httpx.AsyncClient): result = await make_request(client, "/crawl", payload, "Demo 1b: Basic Multi URL Crawl") return result + async def demo_streaming_multi_url(client: httpx.AsyncClient): payload = { - "urls": [SIMPLE_URL, LINKS_URL, FORMS_URL], # Add another URL + # "urls": [SIMPLE_URL, LINKS_URL, FORMS_URL, SIMPLE_URL, LINKS_URL, FORMS_URL], # Add another URL + "urls": [ + "https://example.com/page1", + "https://example.com/page2", + "https://example.com/page3", + "https://example.com/page4", + "https://example.com/page5" + ], # Add another URL "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, - "crawler_config": {"type": "CrawlerRunConfig", "params": {"stream": True, "cache_mode": "BYPASS"}} + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "stream": True, + } + } } - result = stream_request(client, "/crawl/stream", payload, "Demo 1c: Streaming Multi URL Crawl") + result = await stream_request(client, "/crawl/stream", payload, "Demo 1c: Streaming Multi URL Crawl") return result # 2. 
Markdown Generation & Content Filtering + + async def demo_markdown_default(client: httpx.AsyncClient): payload = { "urls": [SIMPLE_URL], @@ -281,17 +331,28 @@ async def demo_markdown_default(client: httpx.AsyncClient): "crawler_config": { "type": "CrawlerRunConfig", "params": { - "cache_mode": "BYPASS", - "markdown_generator": {"type": "DefaultMarkdownGenerator", "params": {}} # Explicitly default + "markdown_generator": { + "type": "DefaultMarkdownGenerator", + "params": { + "content_source": "fit_html", + "options": { + "type": "dict", + "value": { + "ignore_links": True + } + } + } + } # Explicitly default } } } result = await make_request(client, "/crawl", payload, "Demo 2a: Default Markdown Generation") return result + async def demo_markdown_pruning(client: httpx.AsyncClient): payload = { - "urls": [PYTHON_URL], # Use a more complex page + "urls": [PYTHON_URL], # Use a more complex page "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, "crawler_config": { "type": "CrawlerRunConfig", @@ -302,7 +363,10 @@ async def demo_markdown_pruning(client: httpx.AsyncClient): "params": { "content_filter": { "type": "PruningContentFilter", - "params": {"threshold": 0.6, "threshold_type": "relative"} + "params": { + "threshold": 0.6, + "threshold_type": "relative" + } } } } @@ -312,6 +376,7 @@ async def demo_markdown_pruning(client: httpx.AsyncClient): result = await make_request(client, "/crawl", payload, "Demo 2b: Markdown with Pruning Filter") return result + async def demo_markdown_bm25(client: httpx.AsyncClient): payload = { "urls": [PYTHON_URL], @@ -325,7 +390,9 @@ async def demo_markdown_bm25(client: httpx.AsyncClient): "params": { "content_filter": { "type": "BM25ContentFilter", - "params": {"user_query": "Python documentation language reference"} + "params": { + "user_query": "Python documentation language reference" + } } } } @@ -337,21 +404,22 @@ async def demo_markdown_bm25(client: httpx.AsyncClient): # 3. 
Specific Parameters # Corrected Demo Function: demo_param_css_selector + + async def demo_param_css_selector(client: httpx.AsyncClient): - target_selector = ".main-content" # Using the suggested correct selector + css_selector = ".main-content" # Using the suggested correct selector payload = { "urls": [PYTHON_URL], "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, "crawler_config": { "type": "CrawlerRunConfig", "params": { - "cache_mode": "BYPASS", - "css_selector": target_selector # Target specific div + "css_selector": css_selector # Target specific div # No extraction strategy is needed to demo this parameter's effect on input HTML } } } - results = await make_request(client, "/crawl", payload, f"Demo 3a: Using css_selector ('{target_selector}')") + results = await make_request(client, "/crawl", payload, f"Demo 3a: Using css_selector ('{css_selector}')") if results: result = results[0] @@ -360,45 +428,55 @@ async def demo_param_css_selector(client: httpx.AsyncClient): # A simple check: does it contain expected content from within the selector, # and does it LACK content known to be outside (like footer links)? html_content = result['html'] - content_present = 'Python Software Foundation' in html_content # Text likely within .main-content somewhere - footer_absent = 'Legal Statements' not in html_content # Text likely in the footer, outside .main-content + # Text likely within .main-content somewhere + content_present = 'Python Software Foundation' in html_content + # Text likely in the footer, outside .main-content + footer_absent = 'Legal Statements' not in html_content - console.print(f" Content Check: Text inside '{target_selector}' likely present? {'[green]Yes[/]' if content_present else '[red]No[/]'}") - console.print(f" Content Check: Text outside '{target_selector}' (footer) likely absent? {'[green]Yes[/]' if footer_absent else '[red]No[/]'}") + console.print( + f" Content Check: Text inside '{css_selector}' likely present? 
{'[green]Yes[/]' if content_present else '[red]No[/]'}") + console.print( + f" Content Check: Text outside '{css_selector}' (footer) likely absent? {'[green]Yes[/]' if footer_absent else '[red]No[/]'}") if not content_present or not footer_absent: - console.print(f" [yellow]Note:[/yellow] HTML filtering might not be precise or page structure changed. Result HTML length: {len(html_content)}") + console.print( + f" [yellow]Note:[/yellow] HTML filtering might not be precise or page structure changed. Result HTML length: {len(html_content)}") else: - console.print(f" [green]Verified:[/green] Returned HTML appears limited by css_selector. Result HTML length: {len(html_content)}") + console.print( + f" [green]Verified:[/green] Returned HTML appears limited by css_selector. Result HTML length: {len(html_content)}") elif result['success']: - console.print("[yellow]HTML content was empty in the successful result.[/]") + console.print( + "[yellow]HTML content was empty in the successful result.[/]") # Error message is handled by print_result_summary called by make_request + async def demo_param_js_execution(client: httpx.AsyncClient): payload = { - "urls": [FORMS_URL], # Use a page with a form + "urls": ["https://example.com"], # Use a page with a form "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, "crawler_config": { "type": "CrawlerRunConfig", "params": { "cache_mode": "BYPASS", - # Simple JS to fill and maybe click (won't submit without more complex setup) + # Simple JS to fill and maybe click (won't submit without more complex setup) "js_code": """ - () => { - document.querySelector('[name="custname"]').value = 'Crawl4AI Demo'; - return { filled_name: document.querySelector('[name="custname"]').value }; - } + (() => { + document.querySelector('h1').innerText = 'Crawl4AI Demo'; + return { filled_name: document.querySelector('h1').innerText }; + })(); """, - "delay_before_return_html": 0.5 # Give JS time to potentially run + 
"delay_before_return_html": 0.5 # Give JS time to potentially run } } } results = await make_request(client, "/crawl", payload, "Demo 3b: Using js_code Parameter") if results and results[0].get("js_execution_result"): - console.print("[cyan]JS Execution Result:[/]", results[0]["js_execution_result"]) + console.print("[cyan]JS Execution Result:[/]", + results[0]["js_execution_result"]) elif results: - console.print("[yellow]JS Execution Result not found in response.[/]") + console.print("[yellow]JS Execution Result not found in response.[/]") + async def demo_param_screenshot(client: httpx.AsyncClient): payload = { @@ -411,13 +489,15 @@ async def demo_param_screenshot(client: httpx.AsyncClient): } results = await make_request(client, "/crawl", payload, "Demo 3c: Taking a Screenshot") if results and results[0].get("screenshot"): - console.print(f"[cyan]Screenshot data received (length):[/] {len(results[0]['screenshot'])}") + console.print( + f"[cyan]Screenshot data received (length):[/] {len(results[0]['screenshot'])}") elif results: - console.print("[yellow]Screenshot data not found in response.[/]") + console.print("[yellow]Screenshot data not found in response.[/]") + async def demo_param_ssl_fetch(client: httpx.AsyncClient): payload = { - "urls": [PYTHON_URL], # Needs HTTPS + "urls": [PYTHON_URL], # Needs HTTPS "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, "crawler_config": { "type": "CrawlerRunConfig", @@ -429,18 +509,20 @@ async def demo_param_ssl_fetch(client: httpx.AsyncClient): console.print("[cyan]SSL Certificate Info:[/]") console.print(results[0]["ssl_certificate"]) elif results: - console.print("[yellow]SSL Certificate data not found in response.[/]") + console.print("[yellow]SSL Certificate data not found in response.[/]") + async def demo_param_proxy(client: httpx.AsyncClient): - proxy_params_list = load_proxies_from_env() # Get the list of parameter dicts + proxy_params_list = load_proxies_from_env() # Get the list of 
parameter dicts if not proxy_params_list: - console.rule("[bold yellow]Demo 3e: Using Proxies (SKIPPED)[/]", style="yellow") + console.rule( + "[bold yellow]Demo 3e: Using Proxies (SKIPPED)[/]", style="yellow") console.print("Set the PROXIES environment variable to run this demo.") console.print("Format: IP:PORT:USR:PWD,IP:PORT,...") return payload = { - "urls": ["https://httpbin.org/ip"], # URL that shows originating IP + "urls": ["https://httpbin.org/ip"], # URL that shows originating IP "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, "crawler_config": { "type": "CrawlerRunConfig", @@ -450,8 +532,21 @@ async def demo_param_proxy(client: httpx.AsyncClient): "type": "RoundRobinProxyStrategy", "params": { "proxies": [ + # [ + # { + # "type": "ProxyConfig", + # "params": { + # server:"...", + # "username": "...", + # "password": "..." + # } + # }, + # ... + # ] + # Filter out the 'ip' key when sending to server, as it's not part of ProxyConfig - {"type": "ProxyConfig", "params": {k: v for k, v in p.items() if k != 'ip'}} + {"type": "ProxyConfig", "params": { + k: v for k, v in p.items() if k != 'ip'}} for p in proxy_params_list ] } @@ -474,43 +569,55 @@ async def demo_param_proxy(client: httpx.AsyncClient): end = html_content.rfind('}') if start != -1 and end != -1: json_str = html_content[start:end+1] - elif html_content.strip().startswith('{'): # Maybe it's just JSON - json_str = html_content.strip() + elif html_content.strip().startswith('{'): # Maybe it's just JSON + json_str = html_content.strip() if json_str: ip_data = json.loads(json_str) origin_ip = ip_data.get("origin") - console.print(f" Origin IP reported by httpbin: [bold yellow]{origin_ip}[/]") + console.print( + f" Origin IP reported by httpbin: [bold yellow]{origin_ip}[/]") # Extract the IPs from the proxy list for comparison - proxy_ips = {p.get("server").split(":")[1][2:] for p in proxy_params_list} + proxy_ips = {p.get("server").split( + ":")[1][2:] for p in 
proxy_params_list} if origin_ip and origin_ip in proxy_ips: - console.print("[bold green] Verification SUCCESS: Origin IP matches one of the provided proxies![/]") + console.print( + "[bold green] Verification SUCCESS: Origin IP matches one of the provided proxies![/]") elif origin_ip: - console.print("[bold red] Verification FAILED: Origin IP does not match any provided proxy IPs.[/]") + console.print( + "[bold red] Verification FAILED: Origin IP does not match any provided proxy IPs.[/]") console.print(f" Provided Proxy IPs: {proxy_ips}") else: - console.print("[yellow] Verification SKIPPED: Could not extract origin IP from response.[/]") + console.print( + "[yellow] Verification SKIPPED: Could not extract origin IP from response.[/]") else: - console.print("[yellow] Verification SKIPPED: Could not find JSON in httpbin response HTML.[/]") - # console.print(f"HTML Received:\n{html_content[:500]}...") # Uncomment for debugging + console.print( + "[yellow] Verification SKIPPED: Could not find JSON in httpbin response HTML.[/]") + # console.print(f"HTML Received:\n{html_content[:500]}...") # Uncomment for debugging except json.JSONDecodeError: - console.print("[red] Verification FAILED: Could not parse JSON from httpbin response HTML.[/]") + console.print( + "[red] Verification FAILED: Could not parse JSON from httpbin response HTML.[/]") except Exception as e: - console.print(f"[red] Verification Error: An unexpected error occurred during IP check: {e}[/]") + console.print( + f"[red] Verification Error: An unexpected error occurred during IP check: {e}[/]") elif results: - console.print("[yellow] Verification SKIPPED: Crawl for IP check was not successful.[/]") + console.print( + "[yellow] Verification SKIPPED: Crawl for IP check was not successful.[/]") + +# 4. Extraction Strategies + -# 4. 
Extraction Strategies (Non-Deep) async def demo_extract_css(client: httpx.AsyncClient): # Schema to extract book titles and prices book_schema = { "name": "BookList", "baseSelector": "ol.row li.col-xs-6", "fields": [ - {"name": "title", "selector": "article.product_pod h3 a", "type": "attribute", "attribute": "title"}, + {"name": "title", "selector": "article.product_pod h3 a", + "type": "attribute", "attribute": "title"}, {"name": "price", "selector": "article.product_pod .price_color", "type": "text"}, ] } @@ -523,7 +630,12 @@ async def demo_extract_css(client: httpx.AsyncClient): "cache_mode": "BYPASS", "extraction_strategy": { "type": "JsonCssExtractionStrategy", - "params": {"schema": {"type": "dict", "value": book_schema}} + "params": { + "schema": { + "type": "dict", + "value": book_schema + } + } } } } @@ -534,27 +646,34 @@ async def demo_extract_css(client: httpx.AsyncClient): try: extracted_data = json.loads(results[0]["extracted_content"]) if isinstance(extracted_data, list) and extracted_data: - console.print("[cyan]Sample Extracted Books (CSS):[/]") - table = Table(show_header=True, header_style="bold magenta") - table.add_column("Title", style="dim") - table.add_column("Price") - for item in extracted_data[:5]: # Show first 5 - table.add_row(item.get('title', 'N/A'), item.get('price', 'N/A')) - console.print(table) + console.print("[cyan]Sample Extracted Books (CSS):[/]") + table = Table(show_header=True, header_style="bold magenta") + table.add_column("Title", style="dim") + table.add_column("Price") + for item in extracted_data[:5]: # Show first 5 + table.add_row(item.get('title', 'N/A'), + item.get('price', 'N/A')) + console.print(table) else: - console.print("[yellow]CSS extraction did not return a list of results.[/]") - console.print(extracted_data) + console.print( + "[yellow]CSS extraction did not return a list of results.[/]") + console.print(extracted_data) except json.JSONDecodeError: - console.print("[red]Failed to parse extracted_content 
as JSON.[/]") + console.print("[red]Failed to parse extracted_content as JSON.[/]") except Exception as e: - console.print(f"[red]Error processing extracted CSS content: {e}[/]") + console.print( + f"[red]Error processing extracted CSS content: {e}[/]") # 5. LLM Extraction + + async def demo_extract_llm(client: httpx.AsyncClient): - if not os.getenv("OPENAI_API_KEY"): # Basic check for a common key - console.rule("[bold yellow]Demo 4b: LLM Extraction (SKIPPED)[/]", style="yellow") - console.print("Set an LLM API key (e.g., OPENAI_API_KEY) in your .env file or environment.") - return + if not os.getenv("OPENAI_API_KEY"): # Basic check for a common key + console.rule( + "[bold yellow]Demo 4b: LLM Extraction (SKIPPED)[/]", style="yellow") + console.print( + "Set an LLM API key (e.g., OPENAI_API_KEY) in your .env file or environment.") + return payload = { "urls": [SIMPLE_URL], @@ -567,13 +686,16 @@ async def demo_extract_llm(client: httpx.AsyncClient): "type": "LLMExtractionStrategy", "params": { "instruction": "Extract title and author into JSON.", - "llm_config": { # Optional: Specify provider if not default + "llm_config": { # Optional: Specify provider if not default "type": "LLMConfig", "params": {} # Relies on server's default provider from config.yml & keys from .llm.env - # "params": {"provider": "openai/gpt-4o-mini"} + # "params": { + # "provider": "openai/gpt-4o-mini", + # "api_key": os.getenv("OPENAI_API_KEY") # Optional: Override key + # } }, - "schema": { # Request structured output + "schema": { # Request structured output "type": "dict", "value": { "title": "BookInfo", "type": "object", @@ -598,18 +720,24 @@ async def demo_extract_llm(client: httpx.AsyncClient): extracted_data = extracted_data[0] if isinstance(extracted_data, dict): - console.print("[cyan]Extracted Data (LLM):[/]") - syntax = Syntax(json.dumps(extracted_data, indent=2), "json", theme="monokai", line_numbers=False) - console.print(Panel(syntax, border_style="cyan", expand=False)) + 
console.print("[cyan]Extracted Data (LLM):[/]") + syntax = Syntax(json.dumps(extracted_data, indent=2), + "json", theme="monokai", line_numbers=False) + console.print(Panel(syntax, border_style="cyan", expand=False)) else: - console.print("[yellow]LLM extraction did not return expected dictionary.[/]") - console.print(extracted_data) + console.print( + "[yellow]LLM extraction did not return expected dictionary.[/]") + console.print(extracted_data) except json.JSONDecodeError: - console.print("[red]Failed to parse LLM extracted_content as JSON.[/]") + console.print( + "[red]Failed to parse LLM extracted_content as JSON.[/]") except Exception as e: - console.print(f"[red]Error processing extracted LLM content: {e}[/]") + console.print( + f"[red]Error processing extracted LLM content: {e}[/]") # 6. Deep Crawling + + async def demo_deep_basic(client: httpx.AsyncClient): payload = { "urls": [DEEP_CRAWL_BASE_URL], @@ -625,7 +753,17 @@ async def demo_deep_basic(client: httpx.AsyncClient): "max_pages": 4, "filter_chain": { "type": "FilterChain", - "params": {"filters": [{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}]} + "params": { + "filters": [ + { + "type": "DomainFilter", + "params": + { + "allowed_domains": [DEEP_CRAWL_DOMAIN] + } + } + ] + } } } } @@ -642,6 +780,8 @@ async def demo_deep_basic(client: httpx.AsyncClient): console.print(f" [red]Error: {result['error_message']}[/]") # 5. 
Streaming Deep Crawl + + async def demo_deep_streaming(client: httpx.AsyncClient): payload = { "urls": [DEEP_CRAWL_BASE_URL], @@ -649,7 +789,7 @@ async def demo_deep_streaming(client: httpx.AsyncClient): "crawler_config": { "type": "CrawlerRunConfig", "params": { - "stream": True, # Enable streaming + "stream": True, # Enable streaming "cache_mode": "BYPASS", "deep_crawl_strategy": { "type": "BFSDeepCrawlStrategy", @@ -669,11 +809,13 @@ async def demo_deep_streaming(client: httpx.AsyncClient): await stream_request(client, "/crawl/stream", payload, "Demo 5b: Streaming Deep Crawl") # 5a. Deep Crawl with Filtering & Scoring + + async def demo_deep_filtering_scoring(client: httpx.AsyncClient): """Demonstrates deep crawl with advanced URL filtering and scoring.""" - max_depth = 2 # Go a bit deeper to see scoring/filtering effects + max_depth = 2 # Go a bit deeper to see scoring/filtering effects max_pages = 6 - excluded_pattern = "*/category-1/*" # Example pattern to exclude + excluded_pattern = "*/category-1/*" # Example pattern to exclude keyword_to_score = "product" # Example keyword to prioritize payload = { @@ -705,7 +847,7 @@ async def demo_deep_filtering_scoring(client: httpx.AsyncClient): "type": "URLPatternFilter", "params": { "patterns": [excluded_pattern], - "reverse": True # Block if match + "reverse": True # Block if match } } ] @@ -717,7 +859,8 @@ async def demo_deep_filtering_scoring(client: httpx.AsyncClient): "scorers": [ { # Boost score for URLs containing the keyword "type": "KeywordRelevanceScorer", - "params": {"keywords": [keyword_to_score], "weight": 1.5} # Higher weight + # Higher weight + "params": {"keywords": [keyword_to_score], "weight": 1.5} }, { # Slightly penalize deeper pages "type": "PathDepthScorer", @@ -747,39 +890,51 @@ async def demo_deep_filtering_scoring(client: httpx.AsyncClient): depth = result.get("metadata", {}).get("depth", -1) # Check Filtering - if excluded_pattern.strip('*') in url: # Check if the excluded part is present - 
console.print(f" [bold red]Filter FAILED:[/bold red] Excluded pattern part '{excluded_pattern.strip('*')}' found in URL: {url}") + # Check if the excluded part is present + if excluded_pattern.strip('*') in url: + console.print( + f" [bold red]Filter FAILED:[/bold red] Excluded pattern part '{excluded_pattern.strip('*')}' found in URL: {url}") excluded_found = True # Check Scoring (Observation) if keyword_to_score in url: - prioritized_found_overall = True - if depth == 1: # Check if prioritized keywords appeared early (depth 1) - prioritized_found_at_depth1 = True + prioritized_found_overall = True + # Check if prioritized keywords appeared early (depth 1) + if depth == 1: + prioritized_found_at_depth1 = True if not excluded_found: - console.print(f" [green]Filter Check:[/green] No URLs matching excluded pattern '{excluded_pattern}' found.") + console.print( + f" [green]Filter Check:[/green] No URLs matching excluded pattern '{excluded_pattern}' found.") else: - console.print(f" [red]Filter Check:[/red] URLs matching excluded pattern '{excluded_pattern}' were found (unexpected).") + console.print( + f" [red]Filter Check:[/red] URLs matching excluded pattern '{excluded_pattern}' were found (unexpected).") if prioritized_found_at_depth1: - console.print(f" [green]Scoring Check:[/green] URLs with keyword '{keyword_to_score}' were found at depth 1 (scoring likely influenced).") + console.print( + f" [green]Scoring Check:[/green] URLs with keyword '{keyword_to_score}' were found at depth 1 (scoring likely influenced).") elif prioritized_found_overall: - console.print(f" [yellow]Scoring Check:[/yellow] URLs with keyword '{keyword_to_score}' found, but not necessarily prioritized early (check max_pages/depth limits).") + console.print( + f" [yellow]Scoring Check:[/yellow] URLs with keyword '{keyword_to_score}' found, but not necessarily prioritized early (check max_pages/depth limits).") else: - console.print(f" [yellow]Scoring Check:[/yellow] No URLs with keyword 
'{keyword_to_score}' found within crawl limits.") + console.print( + f" [yellow]Scoring Check:[/yellow] No URLs with keyword '{keyword_to_score}' found within crawl limits.") # print_result_summary called by make_request already shows URLs and depths # 6. Deep Crawl with Extraction + + async def demo_deep_with_css_extraction(client: httpx.AsyncClient): # Schema to extract H1 and first paragraph from any page general_schema = { "name": "PageContent", - "baseSelector": "body", # Apply to whole body + "baseSelector": "body", # Apply to whole body "fields": [ - {"name": "page_title", "selector": "h1", "type": "text", "default": "N/A"}, - {"name": "first_p", "selector": "p", "type": "text", "default": "N/A"}, # Gets first p tag + {"name": "page_title", "selector": "h1", + "type": "text", "default": "N/A"}, + {"name": "first_p", "selector": "p", "type": "text", + "default": "N/A"}, # Gets first p tag ] } payload = { @@ -789,7 +944,7 @@ async def demo_deep_with_css_extraction(client: httpx.AsyncClient): "type": "CrawlerRunConfig", "params": { "cache_mode": "BYPASS", - "extraction_strategy": { # Apply CSS extraction to each page + "extraction_strategy": { # Apply CSS extraction to each page "type": "JsonCssExtractionStrategy", "params": {"schema": {"type": "dict", "value": general_schema}} }, @@ -801,9 +956,11 @@ async def demo_deep_with_css_extraction(client: httpx.AsyncClient): "filter_chain": { "type": "FilterChain", "params": {"filters": [ - {"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}, - {"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}} - ]} + {"type": "DomainFilter", "params": { + "allowed_domains": [DEEP_CRAWL_DOMAIN]}}, + {"type": "ContentTypeFilter", "params": { + "allowed_types": ["text/html"]}} + ]} } } } @@ -818,22 +975,32 @@ async def demo_deep_with_css_extraction(client: httpx.AsyncClient): if result.get("success") and result.get("extracted_content"): try: extracted = 
json.loads(result["extracted_content"]) - if isinstance(extracted, list) and extracted: extracted = extracted[0] # Use first item - title = extracted.get('page_title', 'N/A') if isinstance(extracted, dict) else 'Parse Error' - console.print(f" [green]✔[/] URL: [link={result['url']}]{result['url']}[/link] | Title: {title}") + if isinstance(extracted, list) and extracted: + extracted = extracted[0] # Use first item + title = extracted.get( + 'page_title', 'N/A') if isinstance(extracted, dict) else 'Parse Error' + console.print( + f" [green]✔[/] URL: [link={result['url']}]{result['url']}[/link] | Title: {title}") except Exception: - console.print(f" [yellow]![/] URL: [link={result['url']}]{result['url']}[/link] | Failed to parse extracted content") + console.print( + f" [yellow]![/] URL: [link={result['url']}]{result['url']}[/link] | Failed to parse extracted content") elif result.get("success"): - console.print(f" [yellow]-[/] URL: [link={result['url']}]{result['url']}[/link] | No content extracted.") + console.print( + f" [yellow]-[/] URL: [link={result['url']}]{result['url']}[/link] | No content extracted.") else: - console.print(f" [red]✘[/] URL: [link={result['url']}]{result['url']}[/link] | Crawl failed.") + console.print( + f" [red]✘[/] URL: [link={result['url']}]{result['url']}[/link] | Crawl failed.") # 6b. 
Deep Crawl with LLM Extraction + + async def demo_deep_with_llm_extraction(client: httpx.AsyncClient): - if not os.getenv("OPENAI_API_KEY"): # Basic check - console.rule("[bold yellow]Demo 6b: Deep Crawl + LLM Extraction (SKIPPED)[/]", style="yellow") - console.print("Set an LLM API key (e.g., OPENAI_API_KEY) in your .env file or environment.") - return + if not os.getenv("OPENAI_API_KEY"): # Basic check + console.rule( + "[bold yellow]Demo 6b: Deep Crawl + LLM Extraction (SKIPPED)[/]", style="yellow") + console.print( + "Set an LLM API key (e.g., OPENAI_API_KEY) in your .env file or environment.") + return payload = { "urls": [DEEP_CRAWL_BASE_URL], @@ -842,7 +1009,7 @@ async def demo_deep_with_llm_extraction(client: httpx.AsyncClient): "type": "CrawlerRunConfig", "params": { "cache_mode": "BYPASS", - "extraction_strategy": { # Apply LLM extraction to each page + "extraction_strategy": { # Apply LLM extraction to each page "type": "LLMExtractionStrategy", "params": { "instruction": "What is the main topic of this page based on the H1 and first paragraph? 
Respond with just the topic.", @@ -853,13 +1020,15 @@ async def demo_deep_with_llm_extraction(client: httpx.AsyncClient): "type": "BFSDeepCrawlStrategy", "params": { "max_depth": 1, - "max_pages": 2, # Reduce pages for LLM cost/time - "filter_chain": { + "max_pages": 2, # Reduce pages for LLM cost/time + "filter_chain": { "type": "FilterChain", "params": {"filters": [ - {"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}, - {"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}} - ]} + {"type": "DomainFilter", "params": { + "allowed_domains": [DEEP_CRAWL_DOMAIN]}}, + {"type": "ContentTypeFilter", "params": { + "allowed_types": ["text/html"]}} + ]} } } } @@ -872,34 +1041,40 @@ async def demo_deep_with_llm_extraction(client: httpx.AsyncClient): console.print("[cyan]LLM Extraction Summary from Deep Crawl:[/]") for result in results: if result.get("success") and result.get("extracted_content"): - console.print(f" [green]✔[/] URL: [link={result['url']}]{result['url']}[/link] | Topic: {result['extracted_content']}") + console.print( + f" [green]✔[/] URL: [link={result['url']}]{result['url']}[/link] | Topic: {result['extracted_content']}") elif result.get("success"): - console.print(f" [yellow]-[/] URL: [link={result['url']}]{result['url']}[/link] | No content extracted.") + console.print( + f" [yellow]-[/] URL: [link={result['url']}]{result['url']}[/link] | No content extracted.") else: - console.print(f" [red]✘[/] URL: [link={result['url']}]{result['url']}[/link] | Crawl failed.") + console.print( + f" [red]✘[/] URL: [link={result['url']}]{result['url']}[/link] | Crawl failed.") # 6c. 
Deep Crawl with Proxies async def demo_deep_with_proxy(client: httpx.AsyncClient): - proxy_params_list = load_proxies_from_env() # Get the list of parameter dicts + proxy_params_list = load_proxies_from_env() # Get the list of parameter dicts if not proxy_params_list: - console.rule("[bold yellow]Demo 6c: Deep Crawl + Proxies (SKIPPED)[/]", style="yellow") + console.rule( + "[bold yellow]Demo 6c: Deep Crawl + Proxies (SKIPPED)[/]", style="yellow") console.print("Set the PROXIES environment variable to run this demo.") return payload = { - "urls": [DEEP_CRAWL_BASE_URL], # Use a site likely accessible via proxies + # Use a site likely accessible via proxies + "urls": [DEEP_CRAWL_BASE_URL], "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, "crawler_config": { "type": "CrawlerRunConfig", "params": { "cache_mode": "BYPASS", - "proxy_rotation_strategy": { + "proxy_rotation_strategy": { "type": "RoundRobinProxyStrategy", - "params": { + "params": { # Correctly create the list of {"type": ..., "params": ...} structures, excluding the demo 'ip' key "proxies": [ - {"type": "ProxyConfig", "params": {k: v for k, v in p.items() if k != 'ip'}} + {"type": "ProxyConfig", "params": { + k: v for k, v in p.items() if k != 'ip'}} for p in proxy_params_list ] } @@ -907,7 +1082,7 @@ async def demo_deep_with_proxy(client: httpx.AsyncClient): "deep_crawl_strategy": { "type": "BFSDeepCrawlStrategy", "params": { - "max_depth": 1, # Just crawl start URL via proxy + "max_depth": 1, # Just crawl start URL via proxy "max_pages": 5, } } @@ -933,18 +1108,18 @@ async def demo_deep_with_proxy(client: httpx.AsyncClient): async def demo_deep_with_ssl(client: httpx.AsyncClient): """Test BFS deep crawl with fetch_ssl_certificate enabled.""" payload = { - "urls": [DEEP_CRAWL_BASE_URL], # Needs HTTPS + "urls": [DEEP_CRAWL_BASE_URL], # Needs HTTPS "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, "crawler_config": { "type": "CrawlerRunConfig", "params": { 
"stream": False, "cache_mode": "BYPASS", - "fetch_ssl_certificate": True, # <-- Enable SSL fetching + "fetch_ssl_certificate": True, # <-- Enable SSL fetching "deep_crawl_strategy": { "type": "BFSDeepCrawlStrategy", "params": { - "max_depth": 1, # Crawl a bit deeper + "max_depth": 1, # Crawl a bit deeper "max_pages": 3, "filter_chain": { "type": "FilterChain", @@ -961,17 +1136,22 @@ async def demo_deep_with_ssl(client: httpx.AsyncClient): console.print("[cyan]SSL Certificate Summary from Deep Crawl:[/]") for result in results: if result.get("success") and result.get("ssl_certificate"): - cert = result["ssl_certificate"] - issuer_org = cert.get('issuer', {}).get('O', 'N/A') - valid_from = cert.get('not_before', 'N/A') - valid_to = cert.get('not_after', 'N/A') - console.print(f" [green]✔[/] URL: [link={result['url']}]{result['url']}[/link] | Issuer: {issuer_org} | Valid: {valid_from} - {valid_to}") + cert = result["ssl_certificate"] + issuer_org = cert.get('issuer', {}).get('O', 'N/A') + valid_from = cert.get('not_before', 'N/A') + valid_to = cert.get('not_after', 'N/A') + console.print( + f" [green]✔[/] URL: [link={result['url']}]{result['url']}[/link] | Issuer: {issuer_org} | Valid: {valid_from} - {valid_to}") elif result.get("success"): - console.print(f" [yellow]-[/] URL: [link={result['url']}]{result['url']}[/link] | SSL cert not fetched or N/A.") + console.print( + f" [yellow]-[/] URL: [link={result['url']}]{result['url']}[/link] | SSL cert not fetched or N/A.") else: - console.print(f" [red]✘[/] URL: [link={result['url']}]{result['url']}[/link] | Crawl failed.") + console.print( + f" [red]✘[/] URL: [link={result['url']}]{result['url']}[/link] | Crawl failed.") # 7. Markdown helper endpoint + + async def demo_markdown_endpoint(client: httpx.AsyncClient): """ One-shot helper around /md. 
@@ -987,15 +1167,19 @@ async def demo_markdown_endpoint(client: httpx.AsyncClient): t0 = time.time() resp = await client.post("/md", json=payload) dt = time.time() - t0 - console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)") + console.print( + f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)") resp.raise_for_status() md = resp.json().get("markdown", "") snippet = (md[:500] + "...") if len(md) > 500 else md - console.print(Panel(snippet, title="Markdown snippet", border_style="cyan", expand=False)) + console.print(Panel(snippet, title="Markdown snippet", + border_style="cyan", expand=False)) except Exception as e: console.print(f"[bold red]Error hitting /md:[/] {e}") # 8. LLM QA helper endpoint + + async def demo_llm_endpoint(client: httpx.AsyncClient): """ Quick QA round-trip with /llm. @@ -1012,10 +1196,12 @@ async def demo_llm_endpoint(client: httpx.AsyncClient): t0 = time.time() resp = await client.get(f"/llm/{enc}", params={"q": question}) dt = time.time() - t0 - console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)") + console.print( + f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)") resp.raise_for_status() answer = resp.json().get("answer", "") - console.print(Panel(answer or "No answer returned", title="LLM answer", border_style="magenta", expand=False)) + console.print(Panel(answer or "No answer returned", + title="LLM answer", border_style="magenta", expand=False)) except Exception as e: console.print(f"[bold red]Error hitting /llm:[/] {e}") @@ -1036,10 +1222,12 @@ async def demo_config_dump_valid(client: httpx.AsyncClient): t0 = time.time() resp = await client.post("/config/dump", json=payload) dt = time.time() - t0 - console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] 
(took {dt:.2f}s)") + console.print( + f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)") resp.raise_for_status() dump_json = resp.json() - console.print(Panel(Syntax(json.dumps(dump_json, indent=2), "json", theme="monokai"), title="Dump()", border_style="cyan")) + console.print(Panel(Syntax(json.dumps(dump_json, indent=2), + "json", theme="monokai"), title="Dump()", border_style="cyan")) except Exception as e: console.print(f"[bold red]Error in valid /config/dump call:[/] {e}") @@ -1053,21 +1241,25 @@ async def demo_config_dump_invalid(client: httpx.AsyncClient): """).strip() payload = {"code": bad_code} - console.rule("[bold magenta]Demo 8b: /config/dump (invalid)[/]", style="magenta") + console.rule( + "[bold magenta]Demo 8b: /config/dump (invalid)[/]", style="magenta") print_payload(payload) try: resp = await client.post("/config/dump", json=payload) - console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/]") + console.print( + f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/]") resp.raise_for_status() # should throw -> except except httpx.HTTPStatusError as e: console.print("[cyan]Expected parse/validation failure captured:[/]") try: - console.print(Panel(Syntax(json.dumps(e.response.json(), indent=2), "json", theme="fruity"), title="Error payload")) + console.print(Panel(Syntax(json.dumps( + e.response.json(), indent=2), "json", theme="fruity"), title="Error payload")) except Exception: console.print(e.response.text) except Exception as e: - console.print(f"[bold red]Unexpected error during invalid test:[/] {e}") + console.print( + f"[bold red]Unexpected error during invalid test:[/] {e}") # --- Update Main Runner to include new demo --- @@ -1075,33 +1267,33 @@ async def main_demo(): async with httpx.AsyncClient(base_url=BASE_URL, timeout=300.0) as client: if not await check_server_health(client): return - + # --- Run Demos 
--- - await demo_basic_single_url(client) - await demo_basic_multi_url(client) - await demo_streaming_multi_url(client) + # await demo_basic_single_url(client) + # await demo_basic_multi_url(client) + # await demo_streaming_multi_url(client) - await demo_markdown_default(client) - await demo_markdown_pruning(client) - await demo_markdown_bm25(client) + # await demo_markdown_default(client) + # await demo_markdown_pruning(client) + # await demo_markdown_bm25(client) - await demo_param_css_selector(client) - await demo_param_js_execution(client) - await demo_param_screenshot(client) - await demo_param_ssl_fetch(client) - await demo_param_proxy(client) # Skips if no PROXIES env var + # await demo_param_css_selector(client) + # await demo_param_js_execution(client) + # await demo_param_screenshot(client) + # await demo_param_ssl_fetch(client) + # await demo_param_proxy(client) # Skips if no PROXIES env var - await demo_extract_css(client) - await demo_extract_llm(client) # Skips if no common LLM key env var + # await demo_extract_css(client) + # await demo_extract_llm(client) # Skips if no common LLM key env var - await demo_deep_basic(client) - await demo_deep_streaming(client) # This need extra work - + # await demo_deep_basic(client) + # await demo_deep_streaming(client) # This need extra work - await demo_deep_with_css_extraction(client) - await demo_deep_with_llm_extraction(client) # Skips if no common LLM key env var - await demo_deep_with_proxy(client) # Skips if no PROXIES env var - await demo_deep_with_ssl(client) # Added the new demo + # await demo_deep_with_css_extraction(client) + # # Skips if no common LLM key env var + # await demo_deep_with_llm_extraction(client) + # await demo_deep_with_proxy(client) # Skips if no PROXIES env var + # await demo_deep_with_ssl(client) # Added the new demo # --- Helper endpoints --- await demo_markdown_endpoint(client) @@ -1120,5 +1312,6 @@ if __name__ == "__main__": except KeyboardInterrupt: console.print("\n[yellow]Demo 
interrupted by user.[/]") except Exception as e: - console.print(f"\n[bold red]An error occurred during demo execution:[/]") - console.print_exception(show_locals=False) \ No newline at end of file + console.print( + f"\n[bold red]An error occurred during demo execution:[/]") + console.print_exception(show_locals=False) diff --git a/docs/examples/hello_world.py b/docs/examples/hello_world.py index fbdd5283..0d351fa8 100644 --- a/docs/examples/hello_world.py +++ b/docs/examples/hello_world.py @@ -31,7 +31,7 @@ async def example_cdp(): async def main(): - browser_config = BrowserConfig(headless=True, verbose=True) + browser_config = BrowserConfig(headless=False, verbose=True) async with AsyncWebCrawler(config=browser_config) as crawler: crawler_config = CrawlerRunConfig( cache_mode=CacheMode.BYPASS, From 9499164d3c0d9912316c9876f32843360865aa57 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Tue, 29 Apr 2025 23:04:32 +0800 Subject: [PATCH 4/8] feat(browser): improve browser profile management and cleanup Enhance browser profile handling with better process cleanup and documentation: - Add process cleanup for existing Chromium instances on Windows/Unix - Fix profile creation by passing complete browser config - Add comprehensive documentation for browser and CLI components - Add initial profile creation test - Bump version to 0.6.3 This change improves reliability when managing browser profiles and provides better documentation for developers. 
--- crawl4ai/__version__.py | 2 +- crawl4ai/browser_manager.py | 44 +++++++++++++++++++++- crawl4ai/browser_profiler.py | 39 ++++++++++++++++++-- docs/codebase/browser.md | 51 ++++++++++++++++++++++++++ docs/codebase/cli.md | 40 ++++++++++++++++++++ tests/profiler/test_crteate_profile.py | 32 ++++++++++++++++ 6 files changed, 202 insertions(+), 6 deletions(-) create mode 100644 docs/codebase/browser.md create mode 100644 docs/codebase/cli.md create mode 100644 tests/profiler/test_crteate_profile.py diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 8a5cb2c4..1be2ccd8 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,3 +1,3 @@ # crawl4ai/_version.py -__version__ = "0.6.2" +__version__ = "0.6.3" diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index 4be5f938..d4e074cf 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -5,7 +5,10 @@ import os import sys import shutil import tempfile +import psutil +import signal import subprocess +import shlex from playwright.async_api import BrowserContext import hashlib from .js_snippet import load_js_script @@ -193,6 +196,45 @@ class ManagedBrowser: if self.browser_config.extra_args: args.extend(self.browser_config.extra_args) + + + # ── make sure no old Chromium instance is owning the same port/profile ── + try: + if sys.platform == "win32": + if psutil is None: + raise RuntimeError("psutil not available, cannot clean old browser") + for p in psutil.process_iter(["pid", "name", "cmdline"]): + cl = " ".join(p.info.get("cmdline") or []) + if ( + f"--remote-debugging-port={self.debugging_port}" in cl + and f"--user-data-dir={self.user_data_dir}" in cl + ): + p.kill() + p.wait(timeout=5) + else: # macOS / Linux + # kill any process listening on the same debugging port + pids = ( + subprocess.check_output(shlex.split(f"lsof -t -i:{self.debugging_port}")) + .decode() + .strip() + .splitlines() + ) + for pid in pids: + try: + os.kill(int(pid), 
signal.SIGTERM) + except ProcessLookupError: + pass + + # remove Chromium singleton locks, or new launch exits with + # “Opening in existing browser session.” + for f in ("SingletonLock", "SingletonSocket", "SingletonCookie"): + fp = os.path.join(self.user_data_dir, f) + if os.path.exists(fp): + os.remove(fp) + except Exception as _e: + # non-fatal — we'll try to start anyway, but log what happened + self.logger.warning(f"pre-launch cleanup failed: {_e}", tag="BROWSER") + # Start browser process try: @@ -922,7 +964,7 @@ class BrowserManager: pages = context.pages page = next((p for p in pages if p.url == crawlerRunConfig.url), None) if not page: - page = await context.new_page() + page = context.pages[0] # await context.new_page() else: # Otherwise, check if we have an existing context for this config config_signature = self._make_config_signature(crawlerRunConfig) diff --git a/crawl4ai/browser_profiler.py b/crawl4ai/browser_profiler.py index 2291faa2..09267bae 100644 --- a/crawl4ai/browser_profiler.py +++ b/crawl4ai/browser_profiler.py @@ -140,13 +140,17 @@ class BrowserProfiler: self.logger.info("4. 
The profile will be saved and ready to use with Crawl4AI.", tag="PROFILE") self.logger.info(f"{border}\n", tag="PROFILE") + browser_config.headless = False + browser_config.user_data_dir = profile_path + + # Create managed browser instance managed_browser = ManagedBrowser( - browser_type=browser_config.browser_type, - user_data_dir=profile_path, - headless=False, # Must be visible + browser_config=browser_config, + # user_data_dir=profile_path, + # headless=False, # Must be visible logger=self.logger, - debugging_port=browser_config.debugging_port + # debugging_port=browser_config.debugging_port ) # Set up signal handlers to ensure cleanup on interrupt @@ -972,3 +976,30 @@ class BrowserProfiler: 'info': browser_info } + +if __name__ == "__main__": + # Example usage + profiler = BrowserProfiler() + + # Create a new profile + import os + from pathlib import Path + home_dir = Path.home() + profile_path = asyncio.run(profiler.create_profile( str(home_dir / ".crawl4ai/profiles/test-profile"))) + + + + # Launch a standalone browser + asyncio.run(profiler.launch_standalone_browser()) + + # List profiles + profiles = profiler.list_profiles() + for profile in profiles: + print(f"Profile: {profile['name']}, Path: {profile['path']}") + + # Delete a profile + success = profiler.delete_profile("my-profile") + if success: + print("Profile deleted successfully") + else: + print("Failed to delete profile") \ No newline at end of file diff --git a/docs/codebase/browser.md b/docs/codebase/browser.md new file mode 100644 index 00000000..17645c75 --- /dev/null +++ b/docs/codebase/browser.md @@ -0,0 +1,51 @@ +### browser_manager.py + +| Function | What it does | +|---|---| +| `ManagedBrowser.build_browser_flags` | Returns baseline Chromium CLI flags, disables GPU and sandbox, plugs locale, timezone, stealth tweaks, and any extras from `BrowserConfig`. | +| `ManagedBrowser.__init__` | Stores config and logger, creates temp dir, preps internal state. 
| +| `ManagedBrowser.start` | Spawns or connects to the Chromium process, returns its CDP endpoint plus the `subprocess.Popen` handle. | +| `ManagedBrowser._initial_startup_check` | Pings the CDP endpoint once to be sure the browser is alive, raises if not. | +| `ManagedBrowser._monitor_browser_process` | Async-loops on the subprocess, logs exits or crashes, restarts if policy allows. | +| `ManagedBrowser._get_browser_path_WIP` | Old helper that maps OS + browser type to an executable path. | +| `ManagedBrowser._get_browser_path` | Current helper, checks env vars, Playwright cache, and OS defaults for the real executable. | +| `ManagedBrowser._get_browser_args` | Builds the final CLI arg list by merging user flags, stealth flags, and defaults. | +| `ManagedBrowser.cleanup` | Terminates the browser, stops monitors, deletes the temp dir. | +| `ManagedBrowser.create_profile` | Opens a visible browser so a human can log in, then zips the resulting user-data-dir to `~/.crawl4ai/profiles/`. | +| `ManagedBrowser.list_profiles` | Thin wrapper, now forwarded to `BrowserProfiler.list_profiles()`. | +| `ManagedBrowser.delete_profile` | Thin wrapper, now forwarded to `BrowserProfiler.delete_profile()`. | +| `BrowserManager.__init__` | Holds the global Playwright instance, browser handle, config signature cache, session map, and logger. | +| `BrowserManager.start` | Boots the underlying `ManagedBrowser`, then spins up the default Playwright browser context with stealth patches. | +| `BrowserManager._build_browser_args` | Translates `CrawlerRunConfig` (proxy, UA, timezone, headless flag, etc.) into Playwright `launch_args`. | +| `BrowserManager.setup_context` | Applies locale, geolocation, permissions, cookies, and UA overrides on a fresh context. | +| `BrowserManager.create_browser_context` | Internal helper that actually calls `browser.new_context(**options)` after running `setup_context`. 
| +| `BrowserManager._make_config_signature` | Hashes the non-ephemeral parts of `CrawlerRunConfig` so contexts can be reused safely. | +| `BrowserManager.get_page` | Returns a ready `Page` for a given session id, reusing an existing one or creating a new context/page, injects helper scripts, updates `last_used`. | +| `BrowserManager.kill_session` | Force-closes a context/page for a session and removes it from the session map. | +| `BrowserManager._cleanup_expired_sessions` | Periodic sweep that drops sessions idle longer than `ttl_seconds`. | +| `BrowserManager.close` | Gracefully shuts down all contexts, the browser, Playwright, and background tasks. | + +--- + +### browser_profiler.py + +| Function | What it does | +|---|---| +| `BrowserProfiler.__init__` | Sets up profile folder paths, async logger, and signal handlers. | +| `BrowserProfiler.create_profile` | Launches a visible browser with a new user-data-dir for manual login, on exit compresses and stores it as a named profile. | +| `BrowserProfiler.cleanup_handler` | General SIGTERM/SIGINT cleanup wrapper that kills child processes. | +| `BrowserProfiler.sigint_handler` | Handles Ctrl-C during an interactive session, makes sure the browser shuts down cleanly. | +| `BrowserProfiler.listen_for_quit_command` | Async REPL that exits when the user types `q`. | +| `BrowserProfiler.list_profiles` | Enumerates `~/.crawl4ai/profiles`, prints profile name, browser type, size, and last modified. | +| `BrowserProfiler.get_profile_path` | Returns the absolute path of a profile given its name, or `None` if missing. | +| `BrowserProfiler.delete_profile` | Removes a profile folder or a direct path from disk, with optional confirmation prompt. | +| `BrowserProfiler.interactive_manager` | Text UI loop for listing, creating, deleting, or launching profiles. | +| `BrowserProfiler.launch_standalone_browser` | Starts a non-headless Chromium with remote debugging enabled and keeps it alive for manual tests. 
| +| `BrowserProfiler.get_cdp_json` | Pulls `/json/version` from a CDP endpoint and returns the parsed JSON. | +| `BrowserProfiler.launch_builtin_browser` | Spawns a headless Chromium in the background, saves `{wsEndpoint, pid, started_at}` to `~/.crawl4ai/builtin_browser.json`. | +| `BrowserProfiler.get_builtin_browser_info` | Reads that JSON file, verifies the PID, and returns browser status info. | +| `BrowserProfiler._is_browser_running` | Cross-platform helper that checks if a PID is still alive. | +| `BrowserProfiler.kill_builtin_browser` | Terminates the background builtin browser and removes its status file. | +| `BrowserProfiler.get_builtin_browser_status` | Returns `{running: bool, wsEndpoint, pid, started_at}` for quick health checks. | + +Let me know what you want to tweak or dive into next. \ No newline at end of file diff --git a/docs/codebase/cli.md b/docs/codebase/cli.md new file mode 100644 index 00000000..6f5a348b --- /dev/null +++ b/docs/codebase/cli.md @@ -0,0 +1,40 @@ +### `cli.py` command surface + +| Command | Inputs / flags | What it does | +|---|---|---| +| **profiles** | *(none)* | Opens the interactive profile manager, lets you list, create, delete saved browser profiles that live in `~/.crawl4ai/profiles`. | +| **browser status** | – | Prints whether the always-on *builtin* browser is running, shows its CDP URL, PID, start time. | +| **browser stop** | – | Kills the builtin browser and deletes its status file. | +| **browser view** | `--url, -u` URL *(optional)* | Pops a visible window of the builtin browser, navigates to `URL` or `about:blank`. | +| **config list** | – | Dumps every global setting, showing current value, default, and description. | +| **config get** | `key` | Prints the value of a single setting, falls back to default if unset. | +| **config set** | `key value` | Persists a new value in the global config (stored under `~/.crawl4ai/config.yml`). | +| **examples** | – | Just spits out real-world CLI usage samples. 
| +| **crawl** | `url` *(positional)*
`--browser-config,-B` path
`--crawler-config,-C` path
`--filter-config,-f` path
`--extraction-config,-e` path
`--json-extract,-j` [desc]\*
`--schema,-s` path
`--browser,-b` k=v list
`--crawler,-c` k=v list
`--output,-o` all,json,markdown,md,markdown-fit,md-fit *(default all)*
`--output-file,-O` path
`--bypass-cache,-b` *(flag, default true — short form `-b` collides with `--browser,-b` above)*
`--question,-q` str
`--verbose,-v` *(flag)*
`--profile,-p` profile-name | One-shot crawl + extraction. Builds `BrowserConfig` and `CrawlerRunConfig` from inline flags or separate YAML/JSON files, runs `AsyncWebCrawler.run()`, can route through a named saved profile and pipe the result to stdout or a file. | +| **(default)** | Same flags as **crawl**, plus `--example` | Shortcut so you can type just `crwl https://site.com`. When first arg is not a known sub-command, it falls through to *crawl*. | + +\* `--json-extract/-j` with no value turns on LLM-based JSON extraction using an auto schema, supplying a string lets you prompt-engineer the field descriptions. + +> Quick mental model +> `profiles` = manage identities, +> `browser ...` = control long-running headless Chrome that all crawls can piggy-back on, +> `crawl` = do the actual work, +> `config` = tweak global defaults, +> everything else is sugar. + +### Quick-fire “profile” usage cheatsheet + +| Scenario | Command (copy-paste ready) | Notes | +|---|---|---| +| **Launch interactive Profile Manager UI** | `crwl profiles` | Opens TUI with options: 1 List, 2 Create, 3 Delete, 4 Use-to-crawl, 5 Exit. | +| **Create a fresh profile** | `crwl profiles` → choose **2** → name it → browser opens → log in → press **q** in terminal | Saves to `~/.crawl4ai/profiles/`. | +| **List saved profiles** | `crwl profiles` → choose **1** | Shows name, browser type, size, last-modified. | +| **Delete a profile** | `crwl profiles` → choose **3** → pick the profile index → confirm | Removes the folder. | +| **Crawl with a profile (default alias)** | `crwl https://site.com/dashboard -p my-profile` | Keeps login cookies, sets `use_managed_browser=true` under the hood. | +| **Crawl + verbose JSON output** | `crwl https://site.com -p my-profile -o json -v` | Any other `crawl` flags work the same. | +| **Crawl with extra browser tweaks** | `crwl https://site.com -p my-profile -b "headless=true,viewport_width=1680"` | CLI overrides go on top of the profile. 
| +| **Same but via explicit sub-command** | `crwl crawl https://site.com -p my-profile` | Identical to default alias. | +| **Use profile from inside Profile Manager** | `crwl profiles` → choose **4** → pick profile → enter URL → follow prompts | Handy when demo-ing to non-CLI folks. | +| **One-off crawl with a profile folder path (no name lookup)** | `crwl https://site.com -b "user_data_dir=$HOME/.crawl4ai/profiles/my-profile,use_managed_browser=true"` | Bypasses registry, useful for CI scripts. | +| **Launch a dev browser on CDP port with the same identity** | `crwl cdp -d $HOME/.crawl4ai/profiles/my-profile -P 9223` | Lets Puppeteer/Playwright attach for debugging. | + diff --git a/tests/profiler/test_crteate_profile.py b/tests/profiler/test_crteate_profile.py new file mode 100644 index 00000000..e441ea4a --- /dev/null +++ b/tests/profiler/test_crteate_profile.py @@ -0,0 +1,32 @@ +from crawl4ai import BrowserProfiler +import asyncio + + +if __name__ == "__main__": + # Example usage + profiler = BrowserProfiler() + + # Create a new profile + import os + from pathlib import Path + home_dir = Path.home() + profile_path = asyncio.run(profiler.create_profile( str(home_dir / ".crawl4ai/profiles/test-profile"))) + + print(f"Profile created at: {profile_path}") + + + + # # Launch a standalone browser + # asyncio.run(profiler.launch_standalone_browser()) + + # # List profiles + # profiles = profiler.list_profiles() + # for profile in profiles: + # print(f"Profile: {profile['name']}, Path: {profile['path']}") + + # # Delete a profile + # success = profiler.delete_profile("my-profile") + # if success: + # print("Profile deleted successfully") + # else: + # print("Failed to delete profile") \ No newline at end of file From 50f0b83fcd4e951b7109b653d14bc3a04ca604a8 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 30 Apr 2025 19:38:25 +0800 Subject: [PATCH 5/8] feat(linkedin): add prospect-wizard app with scraping and visualization Add new LinkedIn prospect discovery tool 
with three main components: - c4ai_discover.py for company and people scraping - c4ai_insights.py for org chart and decision maker analysis - Interactive graph visualization with company/people exploration Features include: - Configurable LinkedIn search and scraping - Org chart generation with decision maker scoring - Interactive network graph visualization - Company similarity analysis - Chat interface for data exploration Requires: crawl4ai, openai, sentence-transformers, networkx --- docs/apps/linkdin/README.md | 126 ++ docs/apps/linkdin/c4ai_discover.py | 440 +++++++ docs/apps/linkdin/c4ai_insights.py | 372 ++++++ docs/apps/linkdin/schemas/company_card.json | 39 + docs/apps/linkdin/schemas/people_card.json | 38 + docs/apps/linkdin/snippets/company.html | 143 ++ docs/apps/linkdin/snippets/people.html | 94 ++ docs/apps/linkdin/templates/ai.js | 50 + .../templates/graph_view_template.html | 1171 +++++++++++++++++ 9 files changed, 2473 insertions(+) create mode 100644 docs/apps/linkdin/README.md create mode 100644 docs/apps/linkdin/c4ai_discover.py create mode 100644 docs/apps/linkdin/c4ai_insights.py create mode 100644 docs/apps/linkdin/schemas/company_card.json create mode 100644 docs/apps/linkdin/schemas/people_card.json create mode 100644 docs/apps/linkdin/snippets/company.html create mode 100644 docs/apps/linkdin/snippets/people.html create mode 100644 docs/apps/linkdin/templates/ai.js create mode 100644 docs/apps/linkdin/templates/graph_view_template.html diff --git a/docs/apps/linkdin/README.md b/docs/apps/linkdin/README.md new file mode 100644 index 00000000..cce244ac --- /dev/null +++ b/docs/apps/linkdin/README.md @@ -0,0 +1,126 @@ +# Crawl4AI Prospect‑Wizard – step‑by‑step guide + +A three‑stage demo that goes from **LinkedIn scraping** ➜ **LLM reasoning** ➜ **graph visualisation**. 
+ +``` +prospect‑wizard/ +├─ c4ai_discover.py # Stage 1 – scrape companies + people +├─ c4ai_insights.py # Stage 2 – embeddings, org‑charts, scores +├─ graph_view_template.html # Stage 3 – graph viewer (static HTML) +└─ data/ # output lands here (*.jsonl / *.json) +``` + +--- + +## 1  Install & boot a LinkedIn profile (one‑time) + +### 1.1  Install dependencies +```bash +pip install crawl4ai openai sentence-transformers networkx pandas vis-network rich +``` + +### 1.2  Create / warm a LinkedIn browser profile +```bash +crwl profiler +``` +1. The interactive shell shows **New profile** – hit **enter**. +2. Choose a name, e.g. `profile_linkedin_uc`. +3. A Chromium window opens – log in to LinkedIn, solve whatever CAPTCHA, then close. + +> Remember the **profile name**. All future runs take `--profile-name `. + +--- + +## 2  Discovery – scrape companies & people + +```bash +python c4ai_discover.py full \ + --query "health insurance management" \ + --geo 102713980 \ # Malaysia geoUrn + --title_filters "" \ # or "Product,Engineering" + --max_companies 10 \ # default set small for workshops + --max_people 20 \ # \^ same + --profile-name profile_linkedin_uc \ + --outdir ./data \ + --concurrency 2 \ + --log_level debug +``` +**Outputs** in `./data/`: +* `companies.jsonl` – one JSON per company +* `people.jsonl` – one JSON per employee + +🛠️ **Dry‑run:** `C4AI_DEMO_DEBUG=1 python c4ai_discover.py full --query coffee` uses bundled HTML snippets, no network. 
+ +### Handy geoUrn cheatsheet +| Location | geoUrn | +|----------|--------| +| Singapore | **103644278** | +| Malaysia | **102713980** | +| United States | **103644922** | +| United Kingdom | **102221843** | +| Australia | **101452733** | +_See more: – the number after `geoUrn=` is what you need._ + +--- + +## 3  Insights – embeddings, org‑charts, decision makers + +```bash +python c4ai_insights.py \ + --in ./data \ + --out ./data \ + --embed_model all-MiniLM-L6-v2 \ + --top_k 10 \ + --openai_model gpt-4.1 \ + --max_llm_tokens 8024 \ + --llm_temperature 1.0 \ + --workers 4 +``` +Emits next to the Stage‑1 files: +* `company_graph.json` – inter‑company similarity graph +* `org_chart_.json` – one per company +* `decision_makers.csv` – hand‑picked ‘who to pitch’ list + +Flags reference (straight from `build_arg_parser()`): +| Flag | Default | Purpose | +|------|---------|---------| +| `--in` | `.` | Stage‑1 output dir | +| `--out` | `.` | Destination dir | +| `--embed_model` | `all-MiniLM-L6-v2` | Sentence‑Transformer model | +| `--top_k` | `10` | Neighbours per company in graph | +| `--openai_model` | `gpt-4.1` | LLM for scoring decision makers | +| `--max_llm_tokens` | `8024` | Token budget per LLM call | +| `--llm_temperature` | `1.0` | Creativity knob | +| `--stub` | off | Skip OpenAI and fabricate tiny charts | +| `--workers` | `4` | Parallel LLM workers | + +--- + +## 4  Visualise – interactive graph + +After Stage 2 completes, simply open the HTML viewer from the project root: +```bash +open graph_view_template.html # or Live Server / Python -http +``` +The page fetches `data/company_graph.json` and the `org_chart_*.json` files automatically; keep the `data/` folder beside the HTML file. + +* Left pane → list of companies (clans). +* Click a node to load its org‑chart on the right. +* Chat drawer lets you ask follow‑up questions; context is pulled from `people.jsonl`. 
+ +--- + +## 5  Common snags + +| Symptom | Fix | +|---------|-----| +| Infinite CAPTCHA | Use a residential proxy: `--proxy http://user:pass@ip:port` | +| 429 Too Many Requests | Lower `--concurrency`, rotate profile, add delay | +| Blank graph | Check JSON paths, clear `localStorage` in browser | + +--- + +### TL;DR +`crwl profiler` → `c4ai_discover.py` → `c4ai_insights.py` → open `graph_view_template.html`. +Live long and `import crawl4ai`. + diff --git a/docs/apps/linkdin/c4ai_discover.py b/docs/apps/linkdin/c4ai_discover.py new file mode 100644 index 00000000..82874568 --- /dev/null +++ b/docs/apps/linkdin/c4ai_discover.py @@ -0,0 +1,440 @@ +#!/usr/bin/env python3 +""" +c4ai-discover — Stage‑1 Discovery CLI + +Scrapes LinkedIn company search + their people pages and dumps two newline‑delimited +JSON files: companies.jsonl and people.jsonl. + +Key design rules +---------------- +* No BeautifulSoup — Crawl4AI only for network + HTML fetch. +* JsonCssExtractionStrategy for structured scraping; schema auto‑generated once + from sample HTML provided by user and then cached under ./schemas/. +* Defaults are embedded so the file runs inside VS Code debugger without CLI args. +* If executed as a console script (argv > 1), CLI flags win. +* Lightweight deps: argparse + Crawl4AI stack. 
+ +Author: Tom @ Kidocode 2025‑04‑26 +""" +from __future__ import annotations + +import warnings, re +warnings.filterwarnings( + "ignore", + message=r"The pseudo class ':contains' is deprecated, ':-soup-contains' should be used.*", + category=FutureWarning, + module=r"soupsieve" +) + + +# ─────────────────────────────────────────────────────────────────────────────── +# Imports +# ─────────────────────────────────────────────────────────────────────────────── +import argparse +import random +import asyncio +import json +import logging +import os +import pathlib +import sys +# 3rd-party rich for pretty logging +from rich.console import Console +from rich.logging import RichHandler + +from datetime import datetime, UTC +from itertools import cycle +from textwrap import dedent +from types import SimpleNamespace +from typing import Dict, List, Optional +from urllib.parse import quote +from pathlib import Path +from glob import glob + +from crawl4ai import ( + AsyncWebCrawler, + BrowserConfig, + CacheMode, + CrawlerRunConfig, + JsonCssExtractionStrategy, + BrowserProfiler, + LLMConfig, +) + +# ─────────────────────────────────────────────────────────────────────────────── +# Constants / paths +# ─────────────────────────────────────────────────────────────────────────────── +BASE_DIR = pathlib.Path(__file__).resolve().parent +SCHEMA_DIR = BASE_DIR / "schemas" +SCHEMA_DIR.mkdir(parents=True, exist_ok=True) +COMPANY_SCHEMA_PATH = SCHEMA_DIR / "company_card.json" +PEOPLE_SCHEMA_PATH = SCHEMA_DIR / "people_card.json" + +# ---------- deterministic target JSON examples ---------- +_COMPANY_SCHEMA_EXAMPLE = { + "handle": "/company/posify/", + "profile_image": "https://media.licdn.com/dms/image/v2/.../logo.jpg", + "name": "Management Research Services, Inc. 
(MRS, Inc)", + "descriptor": "Insurance • Milwaukee, Wisconsin", + "about": "Insurance • Milwaukee, Wisconsin", + "followers": 1000 +} + +_PEOPLE_SCHEMA_EXAMPLE = { + "profile_url": "https://www.linkedin.com/in/lily-ng/", + "name": "Lily Ng", + "headline": "VP Product @ Posify", + "followers": 890, + "connection_degree": "2nd", + "avatar_url": "https://media.licdn.com/dms/image/v2/.../lily.jpg" +} + +# Provided sample HTML snippets (trimmed) — used exactly once to cold‑generate schema. +_SAMPLE_COMPANY_HTML = (Path(__file__).resolve().parent / "snippets/company.html").read_text() +_SAMPLE_PEOPLE_HTML = (Path(__file__).resolve().parent / "snippets/people.html").read_text() + +# --------- tighter schema prompts ---------- +_COMPANY_SCHEMA_QUERY = dedent( + """ + Using the supplied
  • company-card HTML, build a JsonCssExtractionStrategy schema that, + for every card, outputs *exactly* the keys shown in the example JSON below. + JSON spec: + • handle – href of the outermost that wraps the logo/title, e.g. "/company/posify/" + • profile_image – absolute URL of the inside that link + • name – text of the inside the + • descriptor – text line with industry • location + • about – text of the
    below the name (industry + geo) + • followers – integer parsed from the
    containing 'followers' + + IMPORTANT: Do not use the base64 kind of classes to target element. It's not reliable. + The main div parent contains these li element is "div.search-results-container" you can use this. + The
      parent has "role" equal to "list". Using these two should be enough to target the
    • elements." + """ +) + +_PEOPLE_SCHEMA_QUERY = dedent( + """ + Using the supplied
    • people-card HTML, build a JsonCssExtractionStrategy schema that + outputs exactly the keys in the example JSON below. + Fields: + • profile_url – href of the outermost profile link + • name – text inside artdeco-entity-lockup__title + • headline – inner text of artdeco-entity-lockup__subtitle + • followers – integer parsed from the span inside lt-line-clamp--multi-line + • connection_degree – '1st', '2nd', etc. from artdeco-entity-lockup__badge + • avatar_url – src of the within artdeco-entity-lockup__image + + IMPORTANT: Do not use the base64 kind of classes to target element. It's not reliable. + The main div parent contains these li element is a "div" has these classes "artdeco-card org-people-profile-card__card-spacing org-people__card-margin-bottom". + """ +) + +# --------------------------------------------------------------------------- +# Utility helpers +# --------------------------------------------------------------------------- + +def _load_or_build_schema( + path: pathlib.Path, + sample_html: str, + query: str, + example_json: Dict, + force = False +) -> Dict: + """Load schema from path, else call generate_schema once and persist.""" + if path.exists() and not force: + return json.loads(path.read_text()) + + logging.info("[SCHEMA] Generating schema %s", path.name) + schema = JsonCssExtractionStrategy.generate_schema( + html=sample_html, + llm_config=LLMConfig( + provider=os.getenv("C4AI_SCHEMA_PROVIDER", "openai/gpt-4o"), + api_token=os.getenv("OPENAI_API_KEY", "env:OPENAI_API_KEY"), + ), + query=query, + target_json_example=json.dumps(example_json, indent=2), + ) + path.write_text(json.dumps(schema, indent=2)) + return schema + + +def _openai_friendly_number(text: str) -> Optional[int]: + """Extract first int from text like '1K followers' (returns 1000).""" + import re + + m = re.search(r"(\d[\d,]*)", text.replace(",", "")) + if not m: + return None + val = int(m.group(1)) + if "k" in text.lower(): + val *= 1000 + if "m" in text.lower(): + val 
*= 1_000_000 + return val + +# --------------------------------------------------------------------------- +# Core async workers +# --------------------------------------------------------------------------- +async def crawl_company_search(crawler: AsyncWebCrawler, url: str, schema: Dict, limit: int) -> List[Dict]: + """Paginate 10-item company search pages until `limit` reached.""" + extraction = JsonCssExtractionStrategy(schema) + cfg = CrawlerRunConfig( + extraction_strategy=extraction, + cache_mode=CacheMode.BYPASS, + wait_for = ".search-marvel-srp", + session_id="company_search", + delay_before_return_html=1, + magic = True, + verbose= False, + ) + companies, page = [], 1 + while len(companies) < max(limit, 10): + paged_url = f"{url}&page={page}" + res = await crawler.arun(paged_url, config=cfg) + batch = json.loads(res[0].extracted_content) + if not batch: + break + for item in batch: + name = item.get("name", "").strip() + handle = item.get("handle", "").strip() + if not handle or not name: + continue + descriptor = item.get("descriptor") + about = item.get("about") + followers = _openai_friendly_number(str(item.get("followers", ""))) + companies.append( + { + "handle": handle, + "name": name, + "descriptor": descriptor, + "about": about, + "followers": followers, + "people_url": f"{handle}people/", + "captured_at": datetime.now(UTC).isoformat(timespec="seconds") + "Z", + } + ) + page += 1 + logging.info( + f"[dim]Page {page}[/] — running total: {len(companies)}/{limit} companies" + ) + + return companies[:max(limit, 10)] + + +async def crawl_people_page( + crawler: AsyncWebCrawler, + people_url: str, + schema: Dict, + limit: int, + title_kw: str, +) -> List[Dict]: + people_u = f"{people_url}?keywords={quote(title_kw)}" + extraction = JsonCssExtractionStrategy(schema) + cfg = CrawlerRunConfig( + extraction_strategy=extraction, + # scan_full_page=True, + cache_mode=CacheMode.BYPASS, + magic=True, + wait_for=".org-people-profile-card__card-spacing", + 
delay_before_return_html=1, + session_id="people_search", + ) + res = await crawler.arun(people_u, config=cfg) + if not res[0].success: + return [] + raw = json.loads(res[0].extracted_content) + people = [] + for p in raw[:limit]: + followers = _openai_friendly_number(str(p.get("followers", ""))) + people.append( + { + "profile_url": p.get("profile_url"), + "name": p.get("name"), + "headline": p.get("headline"), + "followers": followers, + "connection_degree": p.get("connection_degree"), + "avatar_url": p.get("avatar_url"), + } + ) + return people + +# --------------------------------------------------------------------------- +# CLI + main +# --------------------------------------------------------------------------- + +def build_arg_parser() -> argparse.ArgumentParser: + ap = argparse.ArgumentParser("c4ai-discover — Crawl4AI LinkedIn discovery") + sub = ap.add_subparsers(dest="cmd", required=False, help="run scope") + + def add_flags(parser: argparse.ArgumentParser): + parser.add_argument("--query", required=False, help="query keyword(s)") + parser.add_argument("--geo", required=False, type=int, help="LinkedIn geoUrn") + parser.add_argument("--title-filters", default="Product,Engineering", help="comma list of job keywords") + parser.add_argument("--max-companies", type=int, default=1000) + parser.add_argument("--max-people", type=int, default=500) + parser.add_argument("--profile-path", default=str(pathlib.Path.home() / ".crawl4ai/profiles/profile_linkedin_uc")) + parser.add_argument("--outdir", default="./output") + parser.add_argument("--concurrency", type=int, default=4) + parser.add_argument("--log-level", default="info", choices=["debug", "info", "warn", "error"]) + + add_flags(sub.add_parser("full")) + add_flags(sub.add_parser("companies")) + add_flags(sub.add_parser("people")) + + # global flags + ap.add_argument( + "--debug", + action="store_true", + help="Use built-in demo defaults (same as C4AI_DEMO_DEBUG=1)", + ) + return ap + + +def 
detect_debug_defaults(force = False) -> SimpleNamespace: + if not force and sys.gettrace() is None and not os.getenv("C4AI_DEMO_DEBUG"): + return SimpleNamespace() + # ----- debug‑friendly defaults ----- + return SimpleNamespace( + cmd="full", + query="health insurance management", + geo=102713980, + # title_filters="Product,Engineering", + title_filters="", + max_companies=10, + max_people=5, + profile_name="profile_linkedin_uc", + outdir="./debug_out", + concurrency=2, + log_level="debug", + ) + + +async def async_main(opts): + # ─────────── logging setup ─────────── + console = Console() + logging.basicConfig( + level=opts.log_level.upper(), + format="%(message)s", + handlers=[RichHandler(console=console, markup=True, rich_tracebacks=True)], + ) + + # ------------------------------------------------------------------- + # Load or build schemas (one‑time LLM call each) + # ------------------------------------------------------------------- + company_schema = _load_or_build_schema( + COMPANY_SCHEMA_PATH, + _SAMPLE_COMPANY_HTML, + _COMPANY_SCHEMA_QUERY, + _COMPANY_SCHEMA_EXAMPLE, + # True + ) + people_schema = _load_or_build_schema( + PEOPLE_SCHEMA_PATH, + _SAMPLE_PEOPLE_HTML, + _PEOPLE_SCHEMA_QUERY, + _PEOPLE_SCHEMA_EXAMPLE, + # True + ) + + outdir = BASE_DIR / pathlib.Path(opts.outdir) + outdir.mkdir(parents=True, exist_ok=True) + f_companies = (BASE_DIR / outdir / "companies.jsonl").open("a", encoding="utf-8") + f_people = (BASE_DIR / outdir / "people.jsonl").open("a", encoding="utf-8") + + # ------------------------------------------------------------------- + # Prepare crawler with cookie pool rotation + # ------------------------------------------------------------------- + profiler = BrowserProfiler() + path = profiler.get_profile_path(opts.profile_name) + bc = BrowserConfig( + headless=False, + verbose=False, + user_data_dir=path, + use_managed_browser=True, + user_agent_mode = "random", + user_agent_generator_config= { + "platforms": "mobile", + "os": 
"Android" + }, + verbose=False, + ) + crawler = AsyncWebCrawler(config=bc) + + await crawler.start() + + # Single worker for simplicity; concurrency can be scaled by arun_many if needed. + # crawler = await next_crawler().start() + try: + # Build LinkedIn search URL + search_url = f"https://www.linkedin.com/search/results/companies/?keywords={quote(opts.query)}&geoUrn={opts.geo}" + logging.info("Seed URL => %s", search_url) + + companies: List[Dict] = [] + if opts.cmd in ("companies", "full"): + companies = await crawl_company_search( + crawler, search_url, company_schema, opts.max_companies + ) + for c in companies: + f_companies.write(json.dumps(c, ensure_ascii=False) + "\n") + logging.info(f"[bold green]✓[/] Companies scraped so far: {len(companies)}") + + if opts.cmd in ("people", "full"): + if not companies: + # load from previous run + src = outdir / "companies.jsonl" + if not src.exists(): + logging.error("companies.jsonl missing — run companies/full first") + return 10 + companies = [json.loads(l) for l in src.read_text().splitlines()] + total_people = 0 + title_kw = " ".join([t.strip() for t in opts.title_filters.split(",") if t.strip()]) if opts.title_filters else "" + for comp in companies: + people = await crawl_people_page( + crawler, + comp["people_url"], + people_schema, + opts.max_people, + title_kw, + ) + for p in people: + rec = p | { + "company_handle": comp["handle"], + # "captured_at": datetime.now(UTC).isoformat(timespec="seconds") + "Z", + "captured_at": datetime.now(UTC).isoformat(timespec="seconds") + "Z", + } + f_people.write(json.dumps(rec, ensure_ascii=False) + "\n") + total_people += len(people) + logging.info( + f"{comp['name']} — [cyan]{len(people)}[/] people extracted" + ) + await asyncio.sleep(random.uniform(0.5, 1)) + logging.info("Total people scraped: %d", total_people) + finally: + await crawler.close() + f_companies.close() + f_people.close() + + return 0 + + +def main(): + parser = build_arg_parser() + cli_opts = 
parser.parse_args() + + # decide on debug defaults + if cli_opts.debug: + opts = detect_debug_defaults(force=True) + else: + env_defaults = detect_debug_defaults() + env_defaults = detect_debug_defaults() + opts = env_defaults if env_defaults else cli_opts + + if not getattr(opts, "cmd", None): + opts.cmd = "full" + + exit_code = asyncio.run(async_main(opts)) + sys.exit(exit_code) + + +if __name__ == "__main__": + main() diff --git a/docs/apps/linkdin/c4ai_insights.py b/docs/apps/linkdin/c4ai_insights.py new file mode 100644 index 00000000..8307c30d --- /dev/null +++ b/docs/apps/linkdin/c4ai_insights.py @@ -0,0 +1,372 @@ +#!/usr/bin/env python3 +""" +Stage-2 Insights builder +------------------------ +Reads companies.jsonl & people.jsonl (Stage-1 output) and produces: + • company_graph.json + • org_chart_.json (one per company) + • decision_makers.csv + • graph_view.html (interactive visualisation) + +Run: + python c4ai_insights.py --in ./stage1_out --out ./stage2_out + +Author : Tom @ Kidocode, 2025-04-28 +""" + +from __future__ import annotations + +# ─────────────────────────────────────────────────────────────────────────────── +# Imports & Third-party +# ─────────────────────────────────────────────────────────────────────────────── + +import argparse, asyncio, json, os, sys, pathlib, random, time, csv +from datetime import datetime, UTC +from types import SimpleNamespace +from pathlib import Path +from typing import List, Dict, Any +# Pretty CLI UX +from rich.console import Console +from rich.logging import RichHandler +from rich.progress import Progress, SpinnerColumn, BarColumn, TextColumn, TimeElapsedColumn +import logging +from jinja2 import Environment, FileSystemLoader, select_autoescape + +BASE_DIR = pathlib.Path(__file__).resolve().parent + +# ─────────────────────────────────────────────────────────────────────────────── +# 3rd-party deps +# ─────────────────────────────────────────────────────────────────────────────── +import numpy as np +# from 
sentence_transformers import SentenceTransformer +# from sklearn.metrics.pairwise import cosine_similarity +import pandas as pd +import hashlib + +from openai import OpenAI # same SDK you pre-loaded + +# ─────────────────────────────────────────────────────────────────────────────── +# Utils +# ─────────────────────────────────────────────────────────────────────────────── +def load_jsonl(path: Path) -> List[Dict[str, Any]]: + with open(path, "r", encoding="utf-8") as f: + return [json.loads(l) for l in f] + +def dump_json(obj, path: Path): + with open(path, "w", encoding="utf-8") as f: + json.dump(obj, f, ensure_ascii=False, indent=2) + +# ─────────────────────────────────────────────────────────────────────────────── +# Constants +# ─────────────────────────────────────────────────────────────────────────────── +BASE_DIR = pathlib.Path(__file__).resolve().parent + +# ─────────────────────────────────────────────────────────────────────────────── +# Debug defaults (mirrors Stage-1 trick) +# ─────────────────────────────────────────────────────────────────────────────── +def dev_defaults() -> SimpleNamespace: + return SimpleNamespace( + in_dir="./debug_out", + out_dir="./insights_debug", + embed_model="all-MiniLM-L6-v2", + top_k=10, + openai_model="gpt-4.1", + max_llm_tokens=8000, + llm_temperature=1.0, + workers=4, # parallel processing + stub=False, # manual + ) + +# ─────────────────────────────────────────────────────────────────────────────── +# Graph builders +# ─────────────────────────────────────────────────────────────────────────────── +def embed_descriptions(companies, model_name:str, opts) -> np.ndarray: + from sentence_transformers import SentenceTransformer + + logging.debug(f"Using embedding model: {model_name}") + cache_path = BASE_DIR / Path(opts.out_dir) / "embeds_cache.json" + cache = {} + if cache_path.exists(): + with open(cache_path) as f: + cache = json.load(f) + # flush cache if model differs + if cache.get("_model") != model_name: + cache 
= {} + + model = SentenceTransformer(model_name) + new_texts, new_indices = [], [] + vectors = np.zeros((len(companies), 384), dtype=np.float32) + + for idx, comp in enumerate(companies): + text = comp.get("about") or comp.get("descriptor","") + h = hashlib.sha1(text.encode("utf-8")).hexdigest() + cached = cache.get(comp["handle"]) + if cached and cached["hash"] == h: + vectors[idx] = np.array(cached["vector"], dtype=np.float32) + else: + new_texts.append(text) + new_indices.append((idx, comp["handle"], h)) + + if new_texts: + embeds = model.encode(new_texts, show_progress_bar=False, convert_to_numpy=True) + for vec, (idx, handle, h) in zip(embeds, new_indices): + vectors[idx] = vec + cache[handle] = {"hash": h, "vector": vec.tolist()} + cache["_model"] = model_name + with open(cache_path, "w") as f: + json.dump(cache, f) + + return vectors + +def build_company_graph(companies, embeds:np.ndarray, top_k:int) -> Dict[str,Any]: + from sklearn.metrics.pairwise import cosine_similarity + sims = cosine_similarity(embeds) + nodes, edges = [], [] + idx_of = {c["handle"]: i for i,c in enumerate(companies)} + for i,c in enumerate(companies): + node = dict( + id=c["handle"].strip("/"), + name=c["name"], + handle=c["handle"], + about=c.get("about",""), + people_url=c.get("people_url",""), + industry=c.get("descriptor","").split("•")[0].strip(), + geoUrn=c.get("geoUrn"), + followers=c.get("followers",0), + # desc_embed=embeds[i].tolist(), + desc_embed=[], + ) + nodes.append(node) + # pick top-k most similar except itself + top_idx = np.argsort(sims[i])[::-1][1:top_k+1] + for j in top_idx: + tgt = companies[j] + weight = float(sims[i,j]) + if node["industry"] == tgt.get("descriptor","").split("•")[0].strip(): + weight += 0.10 + if node["geoUrn"] == tgt.get("geoUrn"): + weight += 0.05 + tgt['followers'] = tgt.get("followers", None) or 1 + node["followers"] = node.get("followers", None) or 1 + follower_ratio = min(node["followers"], tgt.get("followers",1)) / max(node["followers"] 
or 1, tgt.get("followers",1)) + weight += 0.05 * follower_ratio + edges.append(dict( + source=node["id"], + target=tgt["handle"].strip("/"), + weight=round(weight,4), + drivers=dict( + embed_sim=round(float(sims[i,j]),4), + industry_match=0.10 if node["industry"] == tgt.get("descriptor","").split("•")[0].strip() else 0, + geo_overlap=0.05 if node["geoUrn"] == tgt.get("geoUrn") else 0, + ) + )) + # return {"nodes":nodes,"edges":edges,"meta":{"generated_at":datetime.now(UTC).isoformat()}} + return {"nodes":nodes,"edges":edges,"meta":{"generated_at":datetime.now(UTC).isoformat()}} + +# ─────────────────────────────────────────────────────────────────────────────── +# Org-chart via LLM +# ─────────────────────────────────────────────────────────────────────────────── +async def infer_org_chart_llm(company, people, client:OpenAI, model_name:str, max_tokens:int, temperature:float, stub:bool): + if stub: + # Tiny fake org-chart when debugging offline + chief = random.choice(people) + nodes = [{ + "id": chief["profile_url"], + "name": chief["name"], + "title": chief["headline"], + "dept": chief["headline"].split()[:1][0], + "yoe_total": 8, + "yoe_current": 2, + "seniority_score": 0.8, + "decision_score": 0.9, + "avatar_url": chief.get("avatar_url") + }] + return {"nodes":nodes,"edges":[],"meta":{"debug_stub":True,"generated_at":datetime.now(UTC).isoformat()}} + + prompt = [ + {"role":"system","content":"You are an expert B2B org-chart reasoner."}, + {"role":"user","content":f"""Here is the company description: + + +{json.dumps(company, ensure_ascii=False)} + + +Here is a JSON list of employees: + +{json.dumps(people, ensure_ascii=False)} + + +1) Build a reporting tree (manager -> direct reports) +2) For each person output a decision_score 0-1 for buying new software + +Return JSON: {{ "nodes":[{{id,name,title,dept,yoe_total,yoe_current,seniority_score,decision_score,avatar_url,profile_url}}], "edges":[{{source,target,type,confidence}}] }} +"""} + ] + resp = 
client.chat.completions.create( + model=model_name, + messages=prompt, + max_tokens=max_tokens, + temperature=temperature, + response_format={"type":"json_object"} + ) + chart = json.loads(resp.choices[0].message.content) + chart["meta"] = dict(model=model_name, generated_at=datetime.now(UTC).isoformat()) + return chart + +# ─────────────────────────────────────────────────────────────────────────────── +# CSV flatten +# ─────────────────────────────────────────────────────────────────────────────── +def export_decision_makers(charts_dir:Path, csv_path:Path, threshold:float=0.5): + rows=[] + for p in charts_dir.glob("org_chart_*.json"): + data=json.loads(p.read_text()) + comp = p.stem.split("org_chart_")[1] + for n in data.get("nodes",[]): + if n.get("decision_score",0)>=threshold: + rows.append(dict( + company=comp, + person=n["name"], + title=n["title"], + decision_score=n["decision_score"], + profile_url=n["id"] + )) + pd.DataFrame(rows).to_csv(csv_path,index=False) + +# ─────────────────────────────────────────────────────────────────────────────── +# HTML rendering +# ─────────────────────────────────────────────────────────────────────────────── +def render_html(out:Path, template_dir:Path): + # From template folder cp graph_view.html and ai.js in out folder + import shutil + shutil.copy(template_dir/"graph_view_template.html", out / "graph_view.html") + shutil.copy(template_dir/"ai.js", out) + + +# ─────────────────────────────────────────────────────────────────────────────── +# Main async pipeline +# ─────────────────────────────────────────────────────────────────────────────── +async def run(opts): + # ── silence SDK noise ────────────────────────────────────────────────────── + for noisy in ("openai", "httpx", "httpcore"): + lg = logging.getLogger(noisy) + lg.setLevel(logging.WARNING) # or ERROR if you want total silence + lg.propagate = False # optional: stop them reaching root + + # ────────────── logging bootstrap ────────────── + console = Console() 
+ logging.basicConfig( + level="INFO", + format="%(message)s", + handlers=[RichHandler(console=console, markup=True, rich_tracebacks=True)], + ) + + in_dir = BASE_DIR / Path(opts.in_dir) + out_dir = BASE_DIR / Path(opts.out_dir) + out_dir.mkdir(parents=True, exist_ok=True) + + companies = load_jsonl(in_dir/"companies.jsonl") + people = load_jsonl(in_dir/"people.jsonl") + + logging.info(f"[bold cyan]Loaded[/] {len(companies)} companies, {len(people)} people") + + logging.info("[bold]⇢[/] Embedding company descriptions…") + # embeds = embed_descriptions(companies, opts.embed_model, opts) + + logging.info("[bold]⇢[/] Building similarity graph") + # company_graph = build_company_graph(companies, embeds, opts.top_k) + # dump_json(company_graph, out_dir/"company_graph.json") + + # OpenAI client (only built if not debugging) + stub = bool(opts.stub) + client = OpenAI() if not stub else None + + # Filter companies that need processing + to_process = [] + for comp in companies: + handle = comp["handle"].strip("/").replace("/","_") + out_file = out_dir/f"org_chart_{handle}.json" + if out_file.exists() and False: + logging.info(f"[green]✓[/] Skipping existing {comp['name']}") + continue + to_process.append(comp) + + + if not to_process: + logging.info("[yellow]All companies already processed[/]") + else: + workers = getattr(opts, 'workers', 1) + parallel = workers > 1 + + logging.info(f"[bold]⇢[/] Inferring org-charts via LLM {f'(parallel={workers} workers)' if parallel else ''}") + + with Progress( + SpinnerColumn(), + BarColumn(), + TextColumn("[progress.description]{task.description}"), + TimeElapsedColumn(), + console=console, + ) as progress: + task = progress.add_task("Org charts", total=len(to_process)) + + async def process_one(comp): + handle = comp["handle"].strip("/").replace("/","_") + persons = [p for p in people if p["company_handle"].strip("/") == comp["handle"].strip("/")] + + chart = await infer_org_chart_llm( + comp, persons, + client=client if client else 
OpenAI(api_key="sk-debug"), + model_name=opts.openai_model, + max_tokens=opts.max_llm_tokens, + temperature=opts.llm_temperature, + stub=stub, + ) + chart["meta"]["company"] = comp["name"] + + # Save the result immediately + dump_json(chart, out_dir/f"org_chart_{handle}.json") + + progress.update(task, advance=1, description=f"{comp['name']} ({len(persons)} ppl)") + + # Create tasks for all companies + tasks = [process_one(comp) for comp in to_process] + + # Process in batches based on worker count + semaphore = asyncio.Semaphore(workers) + + async def bounded_process(coro): + async with semaphore: + return await coro + + # Run with concurrency control + await asyncio.gather(*(bounded_process(task) for task in tasks)) + + logging.info("[bold]⇢[/] Flattening decision-makers CSV") + export_decision_makers(out_dir, out_dir/"decision_makers.csv") + + render_html(out_dir, template_dir=BASE_DIR/"templates") + logging.success = lambda msg, **k: console.print(f"[bold green]✓[/] {msg}", **k) + logging.success(f"Stage-2 artefacts written to {out_dir}") + +# ─────────────────────────────────────────────────────────────────────────────── +# CLI +# ─────────────────────────────────────────────────────────────────────────────── +def build_arg_parser(): + p = argparse.ArgumentParser(description="Build graphs & visualisation from Stage-1 output") + p.add_argument("--in", dest="in_dir", required=False, help="Stage-1 output dir", default=".") + p.add_argument("--out", dest="out_dir", required=False, help="Destination dir", default=".") + p.add_argument("--embed_model", default="all-MiniLM-L6-v2") + p.add_argument("--top_k", type=int, default=10, help="Top-k neighbours per company") + p.add_argument("--openai_model", default="gpt-4.1") + p.add_argument("--max_llm_tokens", type=int, default=8024) + p.add_argument("--llm_temperature", type=float, default=1.0) + p.add_argument("--stub", action="store_true", help="Skip OpenAI call and generate tiny fake org charts") + 
p.add_argument("--workers", type=int, default=4, help="Number of parallel workers for LLM inference") + return p + +def main(): + dbg = dev_defaults() + opts = dbg if True else build_arg_parser().parse_args() + asyncio.run(run(opts)) + +if __name__ == "__main__": + main() diff --git a/docs/apps/linkdin/schemas/company_card.json b/docs/apps/linkdin/schemas/company_card.json new file mode 100644 index 00000000..80ee8e2f --- /dev/null +++ b/docs/apps/linkdin/schemas/company_card.json @@ -0,0 +1,39 @@ +{ + "name": "LinkedIn Company Card", + "baseSelector": "div.search-results-container ul[role='list'] > li", + "fields": [ + { + "name": "handle", + "selector": "a[href*='/company/']", + "type": "attribute", + "attribute": "href" + }, + { + "name": "profile_image", + "selector": "a[href*='/company/'] img", + "type": "attribute", + "attribute": "src" + }, + { + "name": "name", + "selector": "span[class*='t-16'] a", + "type": "text" + }, + { + "name": "descriptor", + "selector": "div[class*='t-black t-normal']", + "type": "text" + }, + { + "name": "about", + "selector": "p[class*='entity-result__summary--2-lines']", + "type": "text" + }, + { + "name": "followers", + "selector": "div:contains('followers')", + "type": "regex", + "pattern": "(\\d+)\\s*followers" + } + ] +} \ No newline at end of file diff --git a/docs/apps/linkdin/schemas/people_card.json b/docs/apps/linkdin/schemas/people_card.json new file mode 100644 index 00000000..5514b981 --- /dev/null +++ b/docs/apps/linkdin/schemas/people_card.json @@ -0,0 +1,38 @@ +{ + "name": "LinkedIn People Card", + "baseSelector": "li.org-people-profile-card__profile-card-spacing", + "fields": [ + { + "name": "profile_url", + "selector": "a.eETATgYTipaVsmrBChiBJJvFsdPhNpulhPZUVLHLo", + "type": "attribute", + "attribute": "href" + }, + { + "name": "name", + "selector": ".artdeco-entity-lockup__title .lt-line-clamp--single-line", + "type": "text" + }, + { + "name": "headline", + "selector": ".artdeco-entity-lockup__subtitle 
.lt-line-clamp--multi-line", + "type": "text" + }, + { + "name": "followers", + "selector": ".lt-line-clamp--multi-line.t-12", + "type": "text" + }, + { + "name": "connection_degree", + "selector": ".artdeco-entity-lockup__badge .artdeco-entity-lockup__degree", + "type": "text" + }, + { + "name": "avatar_url", + "selector": ".artdeco-entity-lockup__image img", + "type": "attribute", + "attribute": "src" + } + ] +} \ No newline at end of file diff --git a/docs/apps/linkdin/snippets/company.html b/docs/apps/linkdin/snippets/company.html new file mode 100644 index 00000000..8df4ea5f --- /dev/null +++ b/docs/apps/linkdin/snippets/company.html @@ -0,0 +1,143 @@ +
    • + + + + +
      + + + + +
      + +
      + +
      +
      + + + + + +
      + Insurance • Milwaukee, Wisconsin +
      + +
      + 1K followers +
      + + + + + +
      + + +

      + MRS combines 30 years of experience supporting the Life, + Health and + Annuities Insurance Industry with customized + insurance + underwriting solutions that efficiently support clients’ workflows. Supported by the + Agenium Platform (www.agenium.ai) our innovative underwriting solutions are guaranteed to + optimize requirements... +

      + + +
      +
      + + + +
      + + + + + + + + + + + + +
      + + + +
      +
      + +
      + + + + +
      + + + +
    • \ No newline at end of file diff --git a/docs/apps/linkdin/snippets/people.html b/docs/apps/linkdin/snippets/people.html new file mode 100644 index 00000000..9faa9cda --- /dev/null +++ b/docs/apps/linkdin/snippets/people.html @@ -0,0 +1,94 @@ +
    • +
      + + +
      + + + + +
      +
      +
      + + + Dr. Rayna S. + + + +
      +
      + +
      3rd+ + degree connection + + +
      +
      +
      +
      + Leadership and Talent Development Consultant and Professional Speaker + + +
      + +
      +
      +
      +
      + +
      + + + 727 followers + + + + +
      + +
      + +
      + +
      + + +
      + +
    • \ No newline at end of file diff --git a/docs/apps/linkdin/templates/ai.js b/docs/apps/linkdin/templates/ai.js new file mode 100644 index 00000000..f67e1108 --- /dev/null +++ b/docs/apps/linkdin/templates/ai.js @@ -0,0 +1,50 @@ +// ==== File: ai.js ==== + +class ApiHandler { + constructor(apiKey = null) { + this.apiKey = apiKey || localStorage.getItem("openai_api_key") || ""; + console.log("ApiHandler ready"); + } + + setApiKey(k) { + this.apiKey = k.trim(); + if (this.apiKey) localStorage.setItem("openai_api_key", this.apiKey); + } + + async *chatStream(messages, {model = "gpt-4o", temperature = 0.7} = {}) { + if (!this.apiKey) throw new Error("OpenAI API key missing"); + const payload = {model, messages, stream: true, max_tokens: 1024}; + const controller = new AbortController(); + + const res = await fetch("https://api.openai.com/v1/chat/completions", { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${this.apiKey}`, + }, + body: JSON.stringify(payload), + signal: controller.signal, + }); + if (!res.ok) throw new Error(`OpenAI: ${res.statusText}`); + const reader = res.body.getReader(); + const dec = new TextDecoder(); + + let buf = ""; + while (true) { + const {done, value} = await reader.read(); + if (done) break; + buf += dec.decode(value, {stream: true}); + for (const line of buf.split("\n")) { + if (!line.startsWith("data: ")) continue; + if (line.includes("[DONE]")) return; + const json = JSON.parse(line.slice(6)); + const delta = json.choices?.[0]?.delta?.content; + if (delta) yield delta; + } + buf = buf.endsWith("\n") ? 
"" : buf; // keep partial line + } + } + } + + window.API = new ApiHandler(); + \ No newline at end of file diff --git a/docs/apps/linkdin/templates/graph_view_template.html b/docs/apps/linkdin/templates/graph_view_template.html new file mode 100644 index 00000000..68b8ce59 --- /dev/null +++ b/docs/apps/linkdin/templates/graph_view_template.html @@ -0,0 +1,1171 @@ + + + + + + C4AI Insights + + + + + + + + + + + + + + + + +
      + + + +
      + +
      +
      + + + +
      +
      + + + +
      +
      + +
      +
      + +
      +
      + 🔮 Chat with C4AI Assistant + +
      +
      +
      + +
      +
      + +
      + +
      + + + + + + + + + + \ No newline at end of file From cd2b490b4030bcbd2c5f58ce87c6d020ca2ddf1e Mon Sep 17 00:00:00 2001 From: wakaka6 <48764488+wakaka6@users.noreply.github.com> Date: Thu, 1 May 2025 16:59:33 +0800 Subject: [PATCH 6/8] refactor(logger): Apply the Enumeration for color --- crawl4ai/async_logger.py | 56 ++++++++++++++++++++--------- crawl4ai/async_webcrawler.py | 12 ++++--- crawl4ai/browser_profiler.py | 52 +++++++++++++-------------- crawl4ai/content_filter_strategy.py | 12 +++---- 4 files changed, 77 insertions(+), 55 deletions(-) diff --git a/crawl4ai/async_logger.py b/crawl4ai/async_logger.py index 7866e36f..067e7a19 100644 --- a/crawl4ai/async_logger.py +++ b/crawl4ai/async_logger.py @@ -18,6 +18,24 @@ class LogLevel(Enum): def __str__(self): return self.name.lower() +class LogColor(str, Enum): + """Enum for log colors.""" + + DEBUG = "lightblack" + INFO = "cyan" + SUCCESS = "green" + WARNING = "yellow" + ERROR = "red" + CYAN = "cyan" + GREEN = "green" + YELLOW = "yellow" + MAGENTA = "magenta" + DIM_MAGENTA = "dim magenta" + + def __str__(self): + """Automatically convert rich color to string.""" + return self.value + class AsyncLoggerBase(ABC): @abstractmethod @@ -48,6 +66,7 @@ class AsyncLoggerBase(ABC): def error_status(self, url: str, error: str, tag: str = "ERROR", url_length: int = 50): pass + class AsyncLogger(AsyncLoggerBase): """ Asynchronous logger with support for colored console output and file logging. 
@@ -68,11 +87,11 @@ class AsyncLogger(AsyncLoggerBase): } DEFAULT_COLORS = { - LogLevel.DEBUG: "lightblack", - LogLevel.INFO: "cyan", - LogLevel.SUCCESS: "green", - LogLevel.WARNING: "yellow", - LogLevel.ERROR: "red", + LogLevel.DEBUG: LogColor.DEBUG, + LogLevel.INFO: LogColor.INFO, + LogLevel.SUCCESS: LogColor.SUCCESS, + LogLevel.WARNING: LogColor.WARNING, + LogLevel.ERROR: LogColor.ERROR, } def __init__( @@ -81,7 +100,7 @@ class AsyncLogger(AsyncLoggerBase): log_level: LogLevel = LogLevel.DEBUG, tag_width: int = 10, icons: Optional[Dict[str, str]] = None, - colors: Optional[Dict[LogLevel, str]] = None, + colors: Optional[Dict[LogLevel, LogColor]] = None, verbose: bool = True, ): """ @@ -130,9 +149,9 @@ class AsyncLogger(AsyncLoggerBase): message: str, tag: str, params: Optional[Dict[str, Any]] = None, - colors: Optional[Dict[str, str]] = None, + colors: Optional[Dict[str, LogColor]] = None, boxes: Optional[List[str]] = None, - base_color: Optional[str] = None, + base_color: Optional[LogColor] = None, **kwargs, ): """ @@ -152,8 +171,11 @@ class AsyncLogger(AsyncLoggerBase): # avoid conflict with rich formatting parsed_message = message.replace("[", "[[").replace("]", "]]") - raw_message = message.format(**params) if params else message if params: + # FIXME: If there are formatting strings in floating point format, + # this may result in colors and boxes not being applied properly. + # such as {value:.2f}, the value is 0.23333 format it to 0.23, + # but we replace("0.23333", "[color]0.23333[/color]") formatted_message = parsed_message.format(**params) for key, value in params.items(): # value_str may discard `[` and `]`, so we need to replace it. 
@@ -163,17 +185,17 @@ class AsyncLogger(AsyncLoggerBase): color_str = f"[{colors[key]}]{value_str}[/{colors[key]}]" formatted_message = formatted_message.replace(value_str, color_str) value_str = color_str - + # check if we need to apply a box to this value if boxes and key in boxes: - formatted_message = formatted_message.replace(value_str, + formatted_message = formatted_message.replace(value_str, create_box_message(value_str, type=str(level))) - + else: formatted_message = parsed_message # Construct the full log line - color = base_color or self.colors[level] + color: LogColor = base_color or self.colors[level] log_line = f"[{color}]{self._format_tag(tag)} {self._get_icon(tag)} {formatted_message} [/{color}]" # Output to console if verbose @@ -223,17 +245,17 @@ class AsyncLogger(AsyncLoggerBase): """ self._log( level=LogLevel.SUCCESS if success else LogLevel.ERROR, - message="{url:.{url_length}}... | Status: {status} | Time: {timing:.2f}s", + message="{url:.{url_length}}... | Status: {status} | Time: {timing}s", tag=tag, params={ "url": url, "url_length": url_length, "status": success, - "timing": timing, + "timing": f"{timing:.2f}", # avoid a format spec in the message template }, colors={ - "status": "green" if success else "red", - "timing": "yellow", + "status": LogColor.SUCCESS if success else LogColor.ERROR, + "timing": LogColor.WARNING, }, ) diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 963c2d05..afaeeb24 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -34,7 +34,7 @@ from .markdown_generation_strategy import ( MarkdownGenerationStrategy, ) from .deep_crawling import DeepCrawlDecorator -from .async_logger import AsyncLogger, AsyncLoggerBase +from .async_logger import AsyncLogger, AsyncLoggerBase, LogColor from .async_configs import BrowserConfig, CrawlerRunConfig from .async_dispatcher import * # noqa: F403 from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter @@ -43,7 +43,6 @@ from .utils import (
sanitize_input_encode, InvalidCSSSelectorError, fast_format_html, - create_box_message, get_error_context, RobotsParser, ) @@ -381,8 +380,8 @@ class AsyncWebCrawler: "timing": f"{time.perf_counter() - start_time:.2f}s", }, colors={ - "status": "green" if crawl_result.success else "red", - "timing": "yellow", + "status": LogColor.SUCCESS if crawl_result.success else LogColor.ERROR, + "timing": LogColor.WARNING, }, ) @@ -401,7 +400,10 @@ class AsyncWebCrawler: "status": True, "timing": f"{time.perf_counter() - start_time:.2f}s", }, - colors={"status": "green", "timing": "yellow"}, + colors={ + "status": LogColor.SUCCESS if crawl_result.success else LogColor.ERROR, + "timing": LogColor.WARNING, + }, ) cached_result.success = bool(html) diff --git a/crawl4ai/browser_profiler.py b/crawl4ai/browser_profiler.py index f8b9e2b0..c9fd17c4 100644 --- a/crawl4ai/browser_profiler.py +++ b/crawl4ai/browser_profiler.py @@ -20,7 +20,7 @@ from rich.console import Console from .async_configs import BrowserConfig from .browser_manager import ManagedBrowser -from .async_logger import AsyncLogger, AsyncLoggerBase +from .async_logger import AsyncLogger, AsyncLoggerBase, LogColor from .utils import get_home_folder @@ -129,16 +129,16 @@ class BrowserProfiler: # Print instructions for the user with rich formatting border = "{'='*80}" - self.logger.info("{border}", tag="PROFILE", params={"border": f"\n{border}"}, colors={"border": "cyan"}) - self.logger.info("Creating browser profile: {profile_name}", tag="PROFILE", params={"profile_name": profile_name}, colors={"profile_name": "green"}) - self.logger.info("Profile directory: {profile_path}", tag="PROFILE", params={"profile_path": profile_path}, colors={"profile_path": "yellow"}) + self.logger.info("{border}", tag="PROFILE", params={"border": f"\n{border}"}, colors={"border": LogColor.CYAN}) + self.logger.info("Creating browser profile: {profile_name}", tag="PROFILE", params={"profile_name": profile_name}, colors={"profile_name": 
LogColor.GREEN}) + self.logger.info("Profile directory: {profile_path}", tag="PROFILE", params={"profile_path": profile_path}, colors={"profile_path": LogColor.YELLOW}) self.logger.info("\nInstructions:", tag="PROFILE") self.logger.info("1. A browser window will open for you to set up your profile.", tag="PROFILE") - self.logger.info("{segment}, configure settings, etc. as needed.", tag="PROFILE", params={"segment": "2. Log in to websites"}, colors={"segment": "cyan"}) - self.logger.info("3. When you're done, {segment} to close the browser.", tag="PROFILE", params={"segment": "press 'q' in this terminal"}, colors={"segment": "yellow"}) + self.logger.info("{segment}, configure settings, etc. as needed.", tag="PROFILE", params={"segment": "2. Log in to websites"}, colors={"segment": LogColor.CYAN}) + self.logger.info("3. When you're done, {segment} to close the browser.", tag="PROFILE", params={"segment": "press 'q' in this terminal"}, colors={"segment": LogColor.YELLOW}) self.logger.info("4. The profile will be saved and ready to use with Crawl4AI.", tag="PROFILE") - self.logger.info("{border}", tag="PROFILE", params={"border": f"{border}\n"}, colors={"border": "cyan"}) + self.logger.info("{border}", tag="PROFILE", params={"border": f"{border}\n"}, colors={"border": LogColor.CYAN}) # Create managed browser instance managed_browser = ManagedBrowser( @@ -197,7 +197,7 @@ class BrowserProfiler: if readable: key = sys.stdin.read(1) if key.lower() == 'q': - self.logger.info("Closing browser and saving profile...", tag="PROFILE", base_color="green") + self.logger.info("Closing browser and saving profile...", tag="PROFILE", base_color=LogColor.GREEN) user_done_event.set() return @@ -223,7 +223,7 @@ class BrowserProfiler: self.logger.error("Failed to start browser process.", tag="PROFILE") return None - self.logger.info(f"Browser launched. Waiting for you to finish...", tag="PROFILE") + self.logger.info("Browser launched. 
Waiting for you to finish...", tag="PROFILE") # Start listening for keyboard input listener_task = asyncio.create_task(listen_for_quit_command()) @@ -440,18 +440,18 @@ class BrowserProfiler: ``` """ while True: - self.logger.info(f"\nProfile Management Options:", tag="MENU") - self.logger.info(f"1. Create a new profile", tag="MENU", base_color="green") - self.logger.info(f"2. List available profiles", tag="MENU", base_color="yellow") - self.logger.info(f"3. Delete a profile", tag="MENU", base_color="red") + self.logger.info("\nProfile Management Options:", tag="MENU") + self.logger.info("1. Create a new profile", tag="MENU", base_color=LogColor.GREEN) + self.logger.info("2. List available profiles", tag="MENU", base_color=LogColor.YELLOW) + self.logger.info("3. Delete a profile", tag="MENU", base_color=LogColor.RED) # Only show crawl option if callback provided if crawl_callback: - self.logger.info(f"4. Use a profile to crawl a website", tag="MENU", base_color="cyan") - self.logger.info(f"5. Exit", tag="MENU", base_color="magenta") + self.logger.info("4. Use a profile to crawl a website", tag="MENU", base_color=LogColor.CYAN) + self.logger.info("5. Exit", tag="MENU", base_color=LogColor.MAGENTA) exit_option = "5" else: - self.logger.info(f"4. Exit", tag="MENU", base_color="magenta") + self.logger.info("4. 
Exit", tag="MENU", base_color=LogColor.MAGENTA) exit_option = "4" self.logger.print(f"\n[cyan]Enter your choice (1-{exit_option}): [/cyan]", end="") @@ -475,7 +475,7 @@ class BrowserProfiler: self.logger.info("\nAvailable profiles:", tag="PROFILES") for i, profile in enumerate(profiles): self.logger.info(f"[{i+1}] {profile['name']}", tag="PROFILES") - self.logger.info(f" Path: {profile['path']}", tag="PROFILES", base_color="yellow") + self.logger.info(f" Path: {profile['path']}", tag="PROFILES", base_color=LogColor.YELLOW) self.logger.info(f" Created: {profile['created'].strftime('%Y-%m-%d %H:%M:%S')}", tag="PROFILES") self.logger.info(f" Browser type: {profile['type']}", tag="PROFILES") self.logger.info("", tag="PROFILES") # Empty line for spacing @@ -488,7 +488,7 @@ class BrowserProfiler: continue # Display numbered list - self.logger.info(f"\nAvailable profiles:", tag="PROFILES", base_color="yellow") + self.logger.info("\nAvailable profiles:", tag="PROFILES", base_color=LogColor.YELLOW) for i, profile in enumerate(profiles): self.logger.info(f"[{i+1}] {profile['name']}", tag="PROFILES") @@ -527,7 +527,7 @@ class BrowserProfiler: continue # Display numbered list - self.logger.info(f"\nAvailable profiles:", tag="PROFILES", base_color="yellow") + self.logger.info("\nAvailable profiles:", tag="PROFILES", base_color=LogColor.YELLOW) for i, profile in enumerate(profiles): self.logger.info(f"[{i+1}] {profile['name']}", tag="PROFILES") @@ -605,9 +605,9 @@ class BrowserProfiler: # Print initial information border = f"{Fore.CYAN}{'='*80}{Style.RESET_ALL}" self.logger.info(f"\n{border}", tag="CDP") - self.logger.info(f"Launching standalone browser with CDP debugging", tag="CDP") - self.logger.info("Browser type: {browser_type}", tag="CDP", params={"browser_type": browser_type}, colors={"browser_type": "cyan"}) - self.logger.info("Profile path: {profile_path}", tag="CDP", params={"profile_path": profile_path}, colors={"profile_path": "yellow"}) + self.logger.info("Launching 
standalone browser with CDP debugging", tag="CDP") + self.logger.info("Browser type: {browser_type}", tag="CDP", params={"browser_type": browser_type}, colors={"browser_type": LogColor.CYAN}) + self.logger.info("Profile path: {profile_path}", tag="CDP", params={"profile_path": profile_path}, colors={"profile_path": LogColor.YELLOW}) self.logger.info(f"Debugging port: {debugging_port}", tag="CDP") self.logger.info(f"Headless mode: {headless}", tag="CDP") @@ -722,7 +722,7 @@ class BrowserProfiler: self.logger.error("Failed to start browser process.", tag="CDP") return None - self.logger.info(f"Browser launched successfully. Retrieving CDP information...", tag="CDP") + self.logger.info("Browser launched successfully. Retrieving CDP information...", tag="CDP") # Get CDP URL and JSON config cdp_url, config_json = await get_cdp_json(debugging_port) @@ -732,10 +732,10 @@ class BrowserProfiler: if config_json: # Display relevant CDP information - self.logger.info(f"Browser: {config_json.get('Browser', 'Unknown')}", tag="CDP", colors={"Browser": "cyan"}) - self.logger.info(f"Protocol Version: {config_json.get('Protocol-Version', 'Unknown')}", tag="CDP", colors={"Protocol-Version": "cyan"}) + self.logger.info(f"Browser: {config_json.get('Browser', 'Unknown')}", tag="CDP", colors={"Browser": LogColor.CYAN}) + self.logger.info(f"Protocol Version: {config_json.get('Protocol-Version', 'Unknown')}", tag="CDP", colors={"Protocol-Version": LogColor.CYAN}) if 'webSocketDebuggerUrl' in config_json: - self.logger.info("WebSocket URL: {webSocketDebuggerUrl}", tag="CDP", params={"webSocketDebuggerUrl": config_json['webSocketDebuggerUrl']}, colors={"webSocketDebuggerUrl": "green"}) + self.logger.info("WebSocket URL: {webSocketDebuggerUrl}", tag="CDP", params={"webSocketDebuggerUrl": config_json['webSocketDebuggerUrl']}, colors={"webSocketDebuggerUrl": LogColor.GREEN}) else: self.logger.warning("Could not retrieve CDP configuration JSON", tag="CDP") else: diff --git 
a/crawl4ai/content_filter_strategy.py b/crawl4ai/content_filter_strategy.py index 35c6ce8c..4102cbad 100644 --- a/crawl4ai/content_filter_strategy.py +++ b/crawl4ai/content_filter_strategy.py @@ -27,9 +27,7 @@ import json import hashlib from pathlib import Path from concurrent.futures import ThreadPoolExecutor -from .async_logger import AsyncLogger, LogLevel -from rich.console import Console -from rich.text import Text +from .async_logger import AsyncLogger, LogLevel, LogColor class RelevantContentFilter(ABC): @@ -847,7 +845,7 @@ class LLMContentFilter(RelevantContentFilter): }, colors={ **AsyncLogger.DEFAULT_COLORS, - LogLevel.INFO: "dim magenta" # Dimmed purple for LLM ops + LogLevel.INFO: LogColor.DIM_MAGENTA # Dimmed purple for LLM ops }, ) else: @@ -892,7 +890,7 @@ class LLMContentFilter(RelevantContentFilter): "Starting LLM markdown content filtering process", tag="LLM", params={"provider": self.llm_config.provider}, - colors={"provider": "cyan"}, + colors={"provider": LogColor.CYAN}, ) # Cache handling @@ -929,7 +927,7 @@ class LLMContentFilter(RelevantContentFilter): "LLM markdown: Split content into {chunk_count} chunks", tag="CHUNK", params={"chunk_count": len(html_chunks)}, - colors={"chunk_count": "yellow"}, + colors={"chunk_count": LogColor.YELLOW}, ) start_time = time.time() @@ -1038,7 +1036,7 @@ class LLMContentFilter(RelevantContentFilter): "LLM markdown: Completed processing in {time:.2f}s", tag="LLM", params={"time": end_time - start_time}, - colors={"time": "yellow"}, + colors={"time": LogColor.YELLOW}, ) result = ordered_results if ordered_results else [] From 94e9959fe09f966ede32f1718a87791acc32f84c Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 1 May 2025 21:24:52 +0800 Subject: [PATCH 7/8] feat(docker-api): add job-based polling endpoints for crawl and LLM tasks Implements new asynchronous endpoints for handling long-running crawl and LLM tasks: - POST /crawl/job and GET /crawl/job/{task_id} for crawl operations - POST /llm/job and GET 
/llm/job/{task_id} for LLM operations - Added Redis-based task management with configurable TTL - Moved schema definitions to dedicated schemas.py - Added example polling client demo_docker_polling.py This change allows clients to handle long-running operations asynchronously through a polling pattern rather than holding connections open. --- .gitignore | 4 +- deploy/docker/api.py | 56 +++++++- deploy/docker/config.yml | 2 +- deploy/docker/job.py | 99 +++++++++++++ deploy/docker/schemas.py | 42 ++++++ deploy/docker/server.py | 104 +++++--------- deploy/docker/utils.py | 4 +- docs/examples/docker/demo_docker_polling.py | 149 ++++++++++++++++++++ 8 files changed, 385 insertions(+), 75 deletions(-) create mode 100644 deploy/docker/job.py create mode 100644 deploy/docker/schemas.py create mode 100644 docs/examples/docker/demo_docker_polling.py diff --git a/.gitignore b/.gitignore index 1658a987..7e29c6eb 100644 --- a/.gitignore +++ b/.gitignore @@ -261,4 +261,6 @@ CLAUDE.md tests/**/test_site tests/**/reports -tests/**/benchmark_reports \ No newline at end of file +tests/**/benchmark_reports + +.codecat/ \ No newline at end of file diff --git a/deploy/docker/api.py b/deploy/docker/api.py index 032ea45c..732371f7 100644 --- a/deploy/docker/api.py +++ b/deploy/docker/api.py @@ -1,8 +1,10 @@ import os import json import asyncio -from typing import List, Tuple +from typing import List, Tuple, Dict from functools import partial +from uuid import uuid4 +from datetime import datetime import logging from typing import Optional, AsyncGenerator @@ -272,7 +274,9 @@ async def handle_llm_request( async def handle_task_status( redis: aioredis.Redis, task_id: str, - base_url: str + base_url: str, + *, + keep: bool = False ) -> JSONResponse: """Handle task status check requests.""" task = await redis.hgetall(f"task:{task_id}") @@ -286,7 +290,7 @@ async def handle_task_status( response = create_task_response(task, task_id, base_url) if task["status"] in [TaskStatus.COMPLETED, 
TaskStatus.FAILED]: - if should_cleanup_task(task["created_at"]): + if not keep and should_cleanup_task(task["created_at"]): await redis.delete(f"task:{task_id}") return JSONResponse(response) @@ -520,4 +524,48 @@ async def handle_stream_crawl_request( raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e) - ) \ No newline at end of file + ) + +async def handle_crawl_job( + redis, + background_tasks: BackgroundTasks, + urls: List[str], + browser_config: Dict, + crawler_config: Dict, + config: Dict, +) -> Dict: + """ + Fire-and-forget version of handle_crawl_request. + Creates a task in Redis, runs the heavy work in a background task, + lets /crawl/job/{task_id} polling fetch the result. + """ + task_id = f"crawl_{uuid4().hex[:8]}" + await redis.hset(f"task:{task_id}", mapping={ + "status": TaskStatus.PROCESSING, # <-- keep enum values consistent + "created_at": datetime.utcnow().isoformat(), + "url": json.dumps(urls), # store list as JSON string + "result": "", + "error": "", + }) + + async def _runner(): + try: + result = await handle_crawl_request( + urls=urls, + browser_config=browser_config, + crawler_config=crawler_config, + config=config, + ) + await redis.hset(f"task:{task_id}", mapping={ + "status": TaskStatus.COMPLETED, + "result": json.dumps(result), + }) + await asyncio.sleep(5) # Give Redis time to process the update + except Exception as exc: + await redis.hset(f"task:{task_id}", mapping={ + "status": TaskStatus.FAILED, + "error": str(exc), + }) + + background_tasks.add_task(_runner) + return {"task_id": task_id} \ No newline at end of file diff --git a/deploy/docker/config.yml b/deploy/docker/config.yml index 680765a3..c81badc4 100644 --- a/deploy/docker/config.yml +++ b/deploy/docker/config.yml @@ -3,7 +3,7 @@ app: title: "Crawl4AI API" version: "1.0.0" host: "0.0.0.0" - port: 11235 + port: 11234 reload: False workers: 1 timeout_keep_alive: 300 diff --git a/deploy/docker/job.py b/deploy/docker/job.py new file mode 
100644 index 00000000..4cd63009 --- /dev/null +++ b/deploy/docker/job.py @@ -0,0 +1,99 @@ +""" +Job endpoints (enqueue + poll) for long-running LLM extraction and raw crawl. +Relies on the existing Redis task helpers in api.py +""" + +from typing import Dict, Optional, Callable +from fastapi import APIRouter, BackgroundTasks, Depends, Request +from pydantic import BaseModel, HttpUrl + +from api import ( + handle_llm_request, + handle_crawl_job, + handle_task_status, +) + +# ------------- dependency placeholders ------------- +_redis = None # will be injected from server.py +_config = None +_token_dep: Callable = lambda: None # dummy until injected + +# public router +router = APIRouter() + + +# === init hook called by server.py ========================================= +def init_job_router(redis, config, token_dep) -> APIRouter: + """Inject shared singletons and return the router for mounting.""" + global _redis, _config, _token_dep + _redis, _config, _token_dep = redis, config, token_dep + return router + + +# ---------- payload models -------------------------------------------------- +class LlmJobPayload(BaseModel): + url: HttpUrl + q: str + schema: Optional[str] = None + cache: bool = False + + +class CrawlJobPayload(BaseModel): + urls: list[HttpUrl] + browser_config: Dict = {} + crawler_config: Dict = {} + + +# ---------- LLM job --------------------------------------------------------- +@router.post("/llm/job", status_code=202) +async def llm_job_enqueue( + payload: LlmJobPayload, + background_tasks: BackgroundTasks, + request: Request, + _td: Dict = Depends(lambda: _token_dep()), # late-bound dep +): + return await handle_llm_request( + _redis, + background_tasks, + request, + str(payload.url), + query=payload.q, + schema=payload.schema, + cache=payload.cache, + config=_config, + ) + + +@router.get("/llm/job/{task_id}") +async def llm_job_status( + request: Request, + task_id: str, + _td: Dict = Depends(lambda: _token_dep()) +): + return await
handle_task_status(_redis, task_id) + + +# ---------- CRAWL job ------------------------------------------------------- +@router.post("/crawl/job", status_code=202) +async def crawl_job_enqueue( + payload: CrawlJobPayload, + background_tasks: BackgroundTasks, + _td: Dict = Depends(lambda: _token_dep()), +): + return await handle_crawl_job( + _redis, + background_tasks, + [str(u) for u in payload.urls], + payload.browser_config, + payload.crawler_config, + config=_config, + ) + + +@router.get("/crawl/job/{task_id}") +async def crawl_job_status( + request: Request, + task_id: str, + _td: Dict = Depends(lambda: _token_dep()) +): + return await handle_task_status(_redis, task_id, base_url=str(request.base_url)) diff --git a/deploy/docker/schemas.py b/deploy/docker/schemas.py new file mode 100644 index 00000000..ea32b6c6 --- /dev/null +++ b/deploy/docker/schemas.py @@ -0,0 +1,42 @@ +from typing import List, Optional, Dict +from enum import Enum +from pydantic import BaseModel, Field +from utils import FilterType + + +class CrawlRequest(BaseModel): + urls: List[str] = Field(min_length=1, max_length=100) + browser_config: Optional[Dict] = Field(default_factory=dict) + crawler_config: Optional[Dict] = Field(default_factory=dict) + +class MarkdownRequest(BaseModel): + """Request body for the /md endpoint.""" + url: str = Field(..., description="Absolute http/https URL to fetch") + f: FilterType = Field(FilterType.FIT, + description="Content‑filter strategy: FIT, RAW, BM25, or LLM") + q: Optional[str] = Field(None, description="Query string used by BM25/LLM filters") + c: Optional[str] = Field("0", description="Cache‑bust / revision counter") + + +class RawCode(BaseModel): + code: str + +class HTMLRequest(BaseModel): + url: str + +class ScreenshotRequest(BaseModel): + url: str + screenshot_wait_for: Optional[float] = 2 + output_path: Optional[str] = None + +class PDFRequest(BaseModel): + url: str + output_path: Optional[str] = None + + +class JSEndpointRequest(BaseModel): + 
url: str + scripts: List[str] = Field( + ..., + description="List of separated JavaScript snippets to execute" + ) \ No newline at end of file diff --git a/deploy/docker/server.py b/deploy/docker/server.py index bda9d891..0bd6ac2d 100644 --- a/deploy/docker/server.py +++ b/deploy/docker/server.py @@ -12,7 +12,7 @@ from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig from auth import create_access_token, get_token_dependency, TokenRequest from pydantic import BaseModel from typing import Optional, List, Dict -from fastapi import Request, Depends +from fastapi import Request, Depends from fastapi.responses import FileResponse import base64 import re @@ -22,6 +22,16 @@ from api import ( handle_stream_crawl_request, handle_crawl_request, stream_results ) +from schemas import ( + CrawlRequest, + MarkdownRequest, + RawCode, + HTMLRequest, + ScreenshotRequest, + PDFRequest, + JSEndpointRequest, +) + from utils import ( FilterType, load_config, setup_logging, verify_email_domain ) @@ -37,23 +47,13 @@ from fastapi import ( FastAPI, HTTPException, Request, Path, Query, Depends ) from rank_bm25 import BM25Okapi - -def chunk_code_functions(code: str) -> List[str]: - tree = ast.parse(code) - lines = code.splitlines() - chunks = [] - for node in tree.body: - if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)): - start = node.lineno - 1 - end = getattr(node, 'end_lineno', start + 1) - chunks.append("\n".join(lines[start:end])) - return chunks from fastapi.responses import ( StreamingResponse, RedirectResponse, PlainTextResponse, JSONResponse ) from fastapi.middleware.httpsredirect import HTTPSRedirectMiddleware from fastapi.middleware.trustedhost import TrustedHostMiddleware from fastapi.staticfiles import StaticFiles +from job import init_job_router from mcp_bridge import attach_mcp, mcp_resource, mcp_template, mcp_tool @@ -129,8 +129,6 @@ app.mount( name="play", ) -# Optional nice‑to‑have: opening the root shows the playground - 
@app.get("/") async def root(): @@ -211,48 +209,10 @@ def _safe_eval_config(expr: str) -> dict: return obj.dump() -# ───────────────────────── Schemas ─────────────────────────── -class CrawlRequest(BaseModel): - urls: List[str] = Field(min_length=1, max_length=100) - browser_config: Optional[Dict] = Field(default_factory=dict) - crawler_config: Optional[Dict] = Field(default_factory=dict) - -# ────────────── Schemas ────────────── -class MarkdownRequest(BaseModel): - """Request body for the /md endpoint.""" - url: str = Field(..., description="Absolute http/https URL to fetch") - f: FilterType = Field(FilterType.FIT, - description="Content‑filter strategy: FIT, RAW, BM25, or LLM") - q: Optional[str] = Field(None, description="Query string used by BM25/LLM filters") - c: Optional[str] = Field("0", description="Cache‑bust / revision counter") - - -class RawCode(BaseModel): - code: str - -class HTMLRequest(BaseModel): - url: str - -class ScreenshotRequest(BaseModel): - url: str - screenshot_wait_for: Optional[float] = 2 - output_path: Optional[str] = None - -class PDFRequest(BaseModel): - url: str - output_path: Optional[str] = None - - -class JSEndpointRequest(BaseModel): - url: str - scripts: List[str] = Field( - ..., - description="List of separated JavaScript snippets to execute" - ) +# ── job router ────────────────────────────────────────────── +app.include_router(init_job_router(redis, config, token_dep)) # ──────────────────────── Endpoints ────────────────────────── - - @app.post("/token") async def get_token(req: TokenRequest): if not verify_email_domain(req.email): @@ -278,7 +238,8 @@ async def get_markdown( _td: Dict = Depends(token_dep), ): if not body.url.startswith(("http://", "https://")): - raise HTTPException(400, "URL must be absolute and start with http/https") + raise HTTPException( + 400, "URL must be absolute and start with http/https") markdown = await handle_markdown_request( body.url, body.f, body.q, body.c, config ) @@ -314,12 +275,13 @@ 
async def generate_html( # Screenshot endpoint + @app.post("/screenshot") @limiter.limit(config["rate_limiting"]["default_limit"]) @mcp_tool("screenshot") async def generate_screenshot( request: Request, - body: ScreenshotRequest, + body: ScreenshotRequest, _td: Dict = Depends(token_dep), ): """ @@ -327,7 +289,8 @@ async def generate_screenshot( Use when you need an image snapshot of the rendered page. Its recommened to provide an output path to save the screenshot. Then in result instead of the screenshot you will get a path to the saved file. """ - cfg = CrawlerRunConfig(screenshot=True, screenshot_wait_for=body.screenshot_wait_for) + cfg = CrawlerRunConfig( + screenshot=True, screenshot_wait_for=body.screenshot_wait_for) async with AsyncWebCrawler(config=BrowserConfig()) as crawler: results = await crawler.arun(url=body.url, config=cfg) screenshot_data = results[0].screenshot @@ -341,12 +304,13 @@ async def generate_screenshot( # PDF endpoint + @app.post("/pdf") @limiter.limit(config["rate_limiting"]["default_limit"]) @mcp_tool("pdf") async def generate_pdf( request: Request, - body: PDFRequest, + body: PDFRequest, _td: Dict = Depends(token_dep), ): """ @@ -384,7 +348,7 @@ async def execute_js( Your script will replace '{script}' and execute in the browser context. So provide either an IIFE or a sync/async function that returns a value. Return Format: - The return result is an instance of CrawlResult, so you have access to markdown, links, and other stuff. If this is enough, you don't need to call again for other endpoints. 
- + ```python class CrawlResult(BaseModel): url: str @@ -418,7 +382,7 @@ async def execute_js( fit_markdown: Optional[str] = None fit_html: Optional[str] = None ``` - + """ cfg = CrawlerRunConfig(js_code=body.scripts) async with AsyncWebCrawler(config=BrowserConfig()) as crawler: @@ -507,6 +471,7 @@ async def crawl_stream( }, ) + def chunk_code_functions(code_md: str) -> List[str]: """Extract each function/class from markdown code blocks per file.""" pattern = re.compile( @@ -530,6 +495,7 @@ def chunk_code_functions(code_md: str) -> List[str]: chunks.append(f"# File: {file_path}\n{snippet}") return chunks + def chunk_doc_sections(doc: str) -> List[str]: lines = doc.splitlines(keepends=True) sections = [] @@ -545,6 +511,7 @@ def chunk_doc_sections(doc: str) -> List[str]: sections.append("".join(current)) return sections + @app.get("/ask") @limiter.limit(config["rate_limiting"]["default_limit"]) @mcp_tool("ask") @@ -552,21 +519,24 @@ async def get_context( request: Request, _td: Dict = Depends(token_dep), context_type: str = Query("all", regex="^(code|doc|all)$"), - query: Optional[str] = Query(None, description="search query to filter chunks"), - score_ratio: float = Query(0.5, ge=0.0, le=1.0, description="min score as fraction of max_score"), - max_results: int = Query(20, ge=1, description="absolute cap on returned chunks"), + query: Optional[str] = Query( + None, description="search query to filter chunks"), + score_ratio: float = Query( + 0.5, ge=0.0, le=1.0, description="min score as fraction of max_score"), + max_results: int = Query( + 20, ge=1, description="absolute cap on returned chunks"), ): """ This end point is design for any questions about Crawl4ai library. It returns a plain text markdown with extensive information about Crawl4ai. You can use this as a context for any AI assistant. Use this endpoint for AI assistants to retrieve library context for decision making or code generation tasks. 
Alway is BEST practice you provide a query to filter the context. Otherwise the lenght of the response will be very long. - + Parameters: - context_type: Specify "code" for code context, "doc" for documentation context, or "all" for both. - query: RECOMMENDED search query to filter paragraphs using BM25. You can leave this empty to get all the context. - score_ratio: Minimum score as a fraction of the maximum score for filtering results. - max_results: Maximum number of results to return. Default is 20. - + Returns: - JSON response with the requested context. - If "code" is specified, returns the code context. @@ -576,7 +546,7 @@ async def get_context( # load contexts base = os.path.dirname(__file__) code_path = os.path.join(base, "c4ai-code-context.md") - doc_path = os.path.join(base, "c4ai-doc-context.md") + doc_path = os.path.join(base, "c4ai-doc-context.md") if not os.path.exists(code_path) or not os.path.exists(doc_path): raise HTTPException(404, "Context files not found") @@ -626,7 +596,7 @@ async def get_context( ] return JSONResponse(results) - + # attach MCP layer (adds /mcp/ws, /mcp/sse, /mcp/schema) print(f"MCP server running on {config['app']['host']}:{config['app']['port']}") diff --git a/deploy/docker/utils.py b/deploy/docker/utils.py index ff0aa2df..05af2139 100644 --- a/deploy/docker/utils.py +++ b/deploy/docker/utils.py @@ -45,10 +45,10 @@ def datetime_handler(obj: any) -> Optional[str]: return obj.isoformat() raise TypeError(f"Object of type {type(obj)} is not JSON serializable") -def should_cleanup_task(created_at: str) -> bool: +def should_cleanup_task(created_at: str, ttl_seconds: int = 3600) -> bool: """Check if task should be cleaned up based on creation time.""" created = datetime.fromisoformat(created_at) - return (datetime.now() - created).total_seconds() > 3600 + return (datetime.now() - created).total_seconds() > ttl_seconds def decode_redis_hash(hash_data: Dict[bytes, bytes]) -> Dict[str, str]: """Decode Redis hash data from bytes to 
strings.""" diff --git a/docs/examples/docker/demo_docker_polling.py b/docs/examples/docker/demo_docker_polling.py new file mode 100644 index 00000000..ee895723 --- /dev/null +++ b/docs/examples/docker/demo_docker_polling.py @@ -0,0 +1,149 @@ + +#!/usr/bin/env python3 +""" +demo_docker_polling.py +Quick sanity-check for the asynchronous crawl job endpoints: + + • POST /crawl/job – enqueue work, get task_id + • GET /crawl/job/{id} – poll status / fetch result + +The style matches demo_docker_api.py (console.rule banners, helper +functions, coloured status lines). Adjust BASE_URL as needed. + +Run: python demo_docker_polling.py +""" + +import asyncio, json, os, time, urllib.parse +from typing import Dict, List + +import httpx +from rich.console import Console +from rich.panel import Panel +from rich.syntax import Syntax + +console = Console() +BASE_URL = os.getenv("BASE_URL", "http://localhost:11234") +SIMPLE_URL = "https://example.org" +LINKS_URL = "https://httpbin.org/links/10/1" + +# --- helpers -------------------------------------------------------------- + + +def print_payload(payload: Dict): + console.print(Panel(Syntax(json.dumps(payload, indent=2), + "json", theme="monokai", line_numbers=False), + title="Payload", border_style="cyan", expand=False)) + + +async def check_server_health(client: httpx.AsyncClient) -> bool: + try: + resp = await client.get("/health") + if resp.is_success: + console.print("[green]Server healthy[/]") + return True + except Exception: + pass + console.print("[bold red]Server is not responding on /health[/]") + return False + + +async def poll_for_result(client: httpx.AsyncClient, task_id: str, + poll_interval: float = 1.5, timeout: float = 90.0): + """Hit /crawl/job/{id} until COMPLETED/FAILED or timeout.""" + start = time.time() + while True: + resp = await client.get(f"/crawl/job/{task_id}") + resp.raise_for_status() + data = resp.json() + status = data.get("status") + if status.upper() in ("COMPLETED", "FAILED"): + return data + 
if time.time() - start > timeout: + raise TimeoutError(f"Task {task_id} did not finish in {timeout}s") + await asyncio.sleep(poll_interval) + + +# --- demo functions ------------------------------------------------------- + + +async def demo_poll_single_url(client: httpx.AsyncClient): + payload = { + "urls": [SIMPLE_URL], + "browser_config": {"type": "BrowserConfig", + "params": {"headless": True}}, + "crawler_config": {"type": "CrawlerRunConfig", + "params": {"cache_mode": "BYPASS"}} + } + + console.rule("[bold blue]Demo A: /crawl/job Single URL[/]", style="blue") + print_payload(payload) + + # enqueue + resp = await client.post("/crawl/job", json=payload) + console.print(f"Enqueue status: [bold]{resp.status_code}[/]") + resp.raise_for_status() + task_id = resp.json()["task_id"] + console.print(f"Task ID: [yellow]{task_id}[/]") + + # poll + console.print("Polling…") + result = await poll_for_result(client, task_id) + console.print(Panel(Syntax(json.dumps(result, indent=2), + "json", theme="fruity"), + title="Final result", border_style="green")) + if result["status"] == "COMPLETED": + console.print("[green]✅ Crawl succeeded[/]") + else: + console.print("[red]❌ Crawl failed[/]") + + +async def demo_poll_multi_url(client: httpx.AsyncClient): + payload = { + "urls": [SIMPLE_URL, LINKS_URL], + "browser_config": {"type": "BrowserConfig", + "params": {"headless": True}}, + "crawler_config": {"type": "CrawlerRunConfig", + "params": {"cache_mode": "BYPASS"}} + } + + console.rule("[bold magenta]Demo B: /crawl/job Multi-URL[/]", + style="magenta") + print_payload(payload) + + resp = await client.post("/crawl/job", json=payload) + console.print(f"Enqueue status: [bold]{resp.status_code}[/]") + resp.raise_for_status() + task_id = resp.json()["task_id"] + console.print(f"Task ID: [yellow]{task_id}[/]") + + console.print("Polling…") + result = await poll_for_result(client, task_id) + console.print(Panel(Syntax(json.dumps(result, indent=2), + "json", theme="fruity"), + 
title="Final result", border_style="green")) + if result["status"] == "COMPLETED": + console.print( + f"[green]✅ {len(json.loads(result['result'])['results'])} URLs crawled[/]") + else: + console.print("[red]❌ Crawl failed[/]") + + +# --- main runner ---------------------------------------------------------- + + +async def main_demo(): + async with httpx.AsyncClient(base_url=BASE_URL, timeout=300.0) as client: + if not await check_server_health(client): + return + await demo_poll_single_url(client) + await demo_poll_multi_url(client) + console.rule("[bold green]Polling demos complete[/]", style="green") + + +if __name__ == "__main__": + try: + asyncio.run(main_demo()) + except KeyboardInterrupt: + console.print("\n[yellow]Interrupted by user[/]") + except Exception: + console.print_exception(show_locals=False) From 9b5ccac76eab917e844bbe012dc03ef3fcda46a5 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 2 May 2025 21:15:24 +0800 Subject: [PATCH 8/8] feat(extraction): add RegexExtractionStrategy for pattern-based extraction Add new RegexExtractionStrategy for fast, zero-LLM extraction of common data types: - Built-in patterns for emails, URLs, phones, dates, and more - Support for custom regex patterns - LLM-assisted pattern generation utility - Optimized HTML preprocessing with fit_html field - Enhanced network response body capture Breaking changes: None --- CHANGELOG.md | 15 + crawl4ai/__init__.py | 4 +- crawl4ai/async_crawler_strategy.py | 15 +- crawl4ai/async_webcrawler.py | 8 +- crawl4ai/browser_profiler.py | 2 +- crawl4ai/extraction_strategy.py | 303 ++++++++++++++++- crawl4ai/models.py | 1 + crawl4ai/utils.py | 78 +++-- docs/examples/hello_world.py | 29 +- docs/examples/regex_extraction_quickstart.py | 143 ++++++++ docs/md_v2/api/crawl-result.md | 23 +- docs/md_v2/api/strategies.md | 155 ++++++++- docs/md_v2/extraction/no-llm-strategies.md | 332 ++++++++++++++++--- 13 files changed, 984 insertions(+), 124 deletions(-) create mode 100644 
docs/examples/regex_extraction_quickstart.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 16f96f47..4475e12e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,21 @@ All notable changes to Crawl4AI will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.6.2] - 2025-05-02 + +### Added +- New `RegexExtractionStrategy` for fast pattern-based extraction without requiring LLM + - Built-in patterns for emails, URLs, phone numbers, dates, and more + - Support for custom regex patterns + - `generate_pattern` utility for LLM-assisted pattern creation (one-time use) +- Added `fit_html` as a top-level field in `CrawlResult` for optimized HTML extraction +- Added support for network response body capture in network request tracking + +### Changed +- Updated documentation for no-LLM extraction strategies +- Enhanced API reference to include RegexExtractionStrategy examples and usage +- Improved HTML preprocessing with optimized performance for extraction strategies + ## [0.6.1] - 2025-04-24 ### Added diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 9dff4453..3ba22ece 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -23,7 +23,8 @@ from .extraction_strategy import ( CosineStrategy, JsonCssExtractionStrategy, JsonXPathExtractionStrategy, - JsonLxmlExtractionStrategy + JsonLxmlExtractionStrategy, + RegexExtractionStrategy ) from .chunking_strategy import ChunkingStrategy, RegexChunking from .markdown_generation_strategy import DefaultMarkdownGenerator @@ -105,6 +106,7 @@ __all__ = [ "JsonCssExtractionStrategy", "JsonXPathExtractionStrategy", "JsonLxmlExtractionStrategy", + "RegexExtractionStrategy", "ChunkingStrategy", "RegexChunking", "DefaultMarkdownGenerator", diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index ffc7626f..0d62c7b5 100644 --- 
a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -571,6 +571,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): async def handle_response_capture(response): try: + try: + # body = await response.body() + # json_body = await response.json() + text_body = await response.text() + except Exception as e: + body = None + # json_body = None + # text_body = None captured_requests.append({ "event_type": "response", "url": response.url, @@ -579,7 +587,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): "headers": dict(response.headers), # Convert Header dict "from_service_worker": response.from_service_worker, "request_timing": response.request.timing, # Detailed timing info - "timestamp": time.time() + "timestamp": time.time(), + "body" : { + # "raw": body, + # "json": json_body, + "text": text_body + } }) except Exception as e: if self.logger: diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index b7ded49d..19b98522 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -503,6 +503,8 @@ class AsyncWebCrawler: tables = media.pop("tables", []) links = result.links.model_dump() metadata = result.metadata + + fit_html = preprocess_html_for_schema(html_content=html, text_threshold= 500, max_size= 300_000) ################################ # Generate Markdown # @@ -519,7 +521,7 @@ class AsyncWebCrawler: html_source_selector = { "raw_html": lambda: html, # The original raw HTML "cleaned_html": lambda: cleaned_html, # The HTML after scraping strategy - "fit_html": lambda: preprocess_html_for_schema(html_content=html), # Preprocessed raw HTML + "fit_html": lambda: fit_html, # The HTML after preprocessing for schema } markdown_input_html = cleaned_html # Default to cleaned_html @@ -593,6 +595,7 @@ class AsyncWebCrawler: content = { "markdown": markdown_result.raw_markdown, "html": html, + "fit_html": fit_html, "cleaned_html": cleaned_html, "fit_markdown": 
markdown_result.fit_markdown, }.get(content_format, markdown_result.raw_markdown) @@ -600,7 +603,7 @@ class AsyncWebCrawler: # Use IdentityChunking for HTML input, otherwise use provided chunking strategy chunking = ( IdentityChunking() - if content_format in ["html", "cleaned_html"] + if content_format in ["html", "cleaned_html", "fit_html"] else config.chunking_strategy ) sections = chunking.chunk(content) @@ -624,6 +627,7 @@ class AsyncWebCrawler: return CrawlResult( url=url, html=html, + fit_html=fit_html, cleaned_html=cleaned_html, markdown=markdown_result, media=media, diff --git a/crawl4ai/browser_profiler.py b/crawl4ai/browser_profiler.py index 0ebbb91e..5a95b25d 100644 --- a/crawl4ai/browser_profiler.py +++ b/crawl4ai/browser_profiler.py @@ -475,7 +475,7 @@ class BrowserProfiler: self.logger.warning(" No profiles found. Create one first with option 1.", tag="PROFILES") continue - # Print profile information with colorama formatting + # Print profile information self.logger.info("\nAvailable profiles:", tag="PROFILES") for i, profile in enumerate(profiles): self.logger.info(f"[{i+1}] {profile['name']}", tag="PROFILES") diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index 954fe37e..245abc54 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -1,9 +1,10 @@ from abc import ABC, abstractmethod import inspect -from typing import Any, List, Dict, Optional +from typing import Any, List, Dict, Optional, Tuple, Pattern, Union from concurrent.futures import ThreadPoolExecutor, as_completed import json import time +from enum import IntFlag, auto from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION, JSON_SCHEMA_BUILDER_XPATH, PROMPT_EXTRACT_INFERRED_SCHEMA from .config import ( @@ -1668,3 +1669,303 @@ class JsonXPathExtractionStrategy(JsonElementExtractionStrategy): def _get_element_attribute(self, element, attribute: str): return 
element.get(attribute) +""" +RegexExtractionStrategy +Fast, zero-LLM extraction of common entities via regular expressions. +""" + +_CTRL = {c: rf"\x{ord(c):02x}" for c in map(chr, range(32)) if c not in "\t\n\r"} + +_WB_FIX = re.compile(r"\x08") # stray back-space → word-boundary +_NEEDS_ESCAPE = re.compile(r"(? Dict[str, str]: + """Fix common JSON-escape goofs coming from LLMs or manual edits.""" + safe = {} + for label, pat in schema.items(): + # 1️⃣ replace accidental control chars (inc. the infamous back-space) + pat = _WB_FIX.sub(r"\\b", pat).translate(_CTRL) + + # 2️⃣ double any single backslash that JSON kept single + pat = _NEEDS_ESCAPE.sub(r"\\\\", pat) + + # 3️⃣ quick sanity compile + try: + re.compile(pat) + except re.error as e: + raise ValueError(f"Regex for '{label}' won’t compile after fix: {e}") from None + + safe[label] = pat + return safe + + +class RegexExtractionStrategy(ExtractionStrategy): + """ + A lean strategy that finds e-mails, phones, URLs, dates, money, etc., + using nothing but pre-compiled regular expressions. + + Extraction returns:: + + { + "url": "", + "label": "", + "value": "", + "span": [start, end] + } + + Only `generate_schema()` touches an LLM, extraction itself is pure Python. 
+ """ + + # -------------------------------------------------------------- # + # Built-in patterns exposed as IntFlag so callers can bit-OR them + # -------------------------------------------------------------- # + class _B(IntFlag): + EMAIL = auto() + PHONE_INTL = auto() + PHONE_US = auto() + URL = auto() + IPV4 = auto() + IPV6 = auto() + UUID = auto() + CURRENCY = auto() + PERCENTAGE = auto() + NUMBER = auto() + DATE_ISO = auto() + DATE_US = auto() + TIME_24H = auto() + POSTAL_US = auto() + POSTAL_UK = auto() + HTML_COLOR_HEX = auto() + TWITTER_HANDLE = auto() + HASHTAG = auto() + MAC_ADDR = auto() + IBAN = auto() + CREDIT_CARD = auto() + NOTHING = auto() + ALL = ( + EMAIL | PHONE_INTL | PHONE_US | URL | IPV4 | IPV6 | UUID + | CURRENCY | PERCENTAGE | NUMBER | DATE_ISO | DATE_US | TIME_24H + | POSTAL_US | POSTAL_UK | HTML_COLOR_HEX | TWITTER_HANDLE + | HASHTAG | MAC_ADDR | IBAN | CREDIT_CARD + ) + + # user-friendly aliases (RegexExtractionStrategy.Email, .IPv4, …) + Email = _B.EMAIL + PhoneIntl = _B.PHONE_INTL + PhoneUS = _B.PHONE_US + Url = _B.URL + IPv4 = _B.IPV4 + IPv6 = _B.IPV6 + Uuid = _B.UUID + Currency = _B.CURRENCY + Percentage = _B.PERCENTAGE + Number = _B.NUMBER + DateIso = _B.DATE_ISO + DateUS = _B.DATE_US + Time24h = _B.TIME_24H + PostalUS = _B.POSTAL_US + PostalUK = _B.POSTAL_UK + HexColor = _B.HTML_COLOR_HEX + TwitterHandle = _B.TWITTER_HANDLE + Hashtag = _B.HASHTAG + MacAddr = _B.MAC_ADDR + Iban = _B.IBAN + CreditCard = _B.CREDIT_CARD + All = _B.ALL + Nothing = _B(0) # no patterns + + # ------------------------------------------------------------------ # + # Built-in pattern catalog + # ------------------------------------------------------------------ # + DEFAULT_PATTERNS: Dict[str, str] = { + # Communication + "email": r"[\w.+-]+@[\w-]+\.[\w.-]+", + "phone_intl": r"\+?\d[\d .()-]{7,}\d", + "phone_us": r"\(?\d{3}\)?[ -. ]?\d{3}[ -. 
]?\d{4}", + # Web + "url": r"https?://[^\s\"'<>]+", + "ipv4": r"(?:\d{1,3}\.){3}\d{1,3}", + "ipv6": r"[A-F0-9]{1,4}(?::[A-F0-9]{1,4}){7}", + # IDs + "uuid": r"[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}", + # Money / numbers + "currency": r"(?:USD|EUR|RM|\$|€|£)\s?\d+(?:[.,]\d{2})?", + "percentage": r"\d+(?:\.\d+)?%", + "number": r"\b\d{1,3}(?:[,.\s]\d{3})*(?:\.\d+)?\b", + # Dates / Times + "date_iso": r"\d{4}-\d{2}-\d{2}", + "date_us": r"\d{1,2}/\d{1,2}/\d{2,4}", + "time_24h": r"\b(?:[01]?\d|2[0-3]):[0-5]\d(?:[:.][0-5]\d)?\b", + # Misc + "postal_us": r"\b\d{5}(?:-\d{4})?\b", + "postal_uk": r"\b[A-Z]{1,2}\d[A-Z\d]? ?\d[A-Z]{2}\b", + "html_color_hex": r"#[0-9A-Fa-f]{6}\b", + "twitter_handle": r"@[\w]{1,15}", + "hashtag": r"#[\w-]+", + "mac_addr": r"(?:[0-9A-Fa-f]{2}:){5}[0-9A-Fa-f]{2}", + "iban": r"[A-Z]{2}\d{2}[A-Z0-9]{11,30}", + "credit_card": r"\b(?:4\d{12}(?:\d{3})?|5[1-5]\d{14}|3[47]\d{13}|6(?:011|5\d{2})\d{12})\b", + } + + _FLAGS = re.IGNORECASE | re.MULTILINE + _UNWANTED_PROPS = { + "provider": "Use llm_config instead", + "api_token": "Use llm_config instead", + } + + # ------------------------------------------------------------------ # + # Construction + # ------------------------------------------------------------------ # + def __init__( + self, + pattern: "_B" = _B.NOTHING, + *, + custom: Optional[Union[Dict[str, str], List[Tuple[str, str]]]] = None, + input_format: str = "fit_html", + **kwargs, + ) -> None: + """ + Args: + patterns: Custom patterns overriding or extending defaults. + Dict[label, regex] or list[tuple(label, regex)]. + input_format: "html", "markdown" or "text". + **kwargs: Forwarded to ExtractionStrategy. 
+ """ + super().__init__(input_format=input_format, **kwargs) + + # 1️⃣ take only the requested built-ins + merged: Dict[str, str] = { + key: rx + for key, rx in self.DEFAULT_PATTERNS.items() + if getattr(self._B, key.upper()).value & pattern + } + + # 2️⃣ apply user overrides / additions + if custom: + if isinstance(custom, dict): + merged.update(custom) + else: # iterable of (label, regex) + merged.update({lbl: rx for lbl, rx in custom}) + + self._compiled: Dict[str, Pattern] = { + lbl: re.compile(rx, self._FLAGS) for lbl, rx in merged.items() + } + + # ------------------------------------------------------------------ # + # Extraction + # ------------------------------------------------------------------ # + def extract(self, url: str, content: str, *q, **kw) -> List[Dict[str, Any]]: + # text = self._plain_text(html) + out: List[Dict[str, Any]] = [] + + for label, cre in self._compiled.items(): + for m in cre.finditer(content): + out.append( + { + "url": url, + "label": label, + "value": m.group(0), + "span": [m.start(), m.end()], + } + ) + return out + + # ------------------------------------------------------------------ # + # Helpers + # ------------------------------------------------------------------ # + def _plain_text(self, content: str) -> str: + if self.input_format == "text": + return content + return BeautifulSoup(content, "lxml").get_text(" ", strip=True) + + # ------------------------------------------------------------------ # + # LLM-assisted pattern generator + # ------------------------------------------------------------------ # + # ------------------------------------------------------------------ # + # LLM-assisted one-off pattern builder + # ------------------------------------------------------------------ # + @staticmethod + def generate_pattern( + label: str, + html: str, + *, + query: Optional[str] = None, + examples: Optional[List[str]] = None, + llm_config: Optional[LLMConfig] = None, + **kwargs, + ) -> Dict[str, str]: + """ + Ask an 
LLM for a single page-specific regex and return + {label: pattern} ── ready for RegexExtractionStrategy(custom=…) + """ + + # ── guard deprecated kwargs + for k in RegexExtractionStrategy._UNWANTED_PROPS: + if k in kwargs: + raise AttributeError( + f"{k} is deprecated, {RegexExtractionStrategy._UNWANTED_PROPS[k]}" + ) + + # ── default LLM config + if llm_config is None: + llm_config = create_llm_config() + + # ── system prompt – hardened + system_msg = ( + "You are an expert Python-regex engineer.\n" + f"Return **one** JSON object whose single key is exactly \"{label}\", " + "and whose value is a raw-string regex pattern that works with " + "the standard `re` module in Python.\n\n" + "Strict rules (obey every bullet):\n" + "• If a *user query* is supplied, treat it as the precise semantic target and optimise the " + " pattern to capture ONLY text that answers that query. If the query conflicts with the " + " sample HTML, the HTML wins.\n" + "• Tailor the pattern to the *sample HTML* – reproduce its exact punctuation, spacing, " + " symbols, capitalisation, etc. Do **NOT** invent a generic form.\n" + "• Keep it minimal and fast: avoid unnecessary capturing, prefer non-capturing `(?: … )`, " + " and guard against catastrophic backtracking.\n" + "• Anchor with `^`, `$`, or `\\b` only when it genuinely improves precision.\n" + "• Use inline flags like `(?i)` when needed; no verbose flag comments.\n" + "• Output must be valid JSON – no markdown, code fences, comments, or extra keys.\n" + "• The regex value must be a Python string literal: **double every backslash** " + "(e.g. 
`\\\\b`, `\\\\d`, `\\\\\\\\`).\n\n" + "Example valid output:\n" + f"{{\"{label}\": \"(?:RM|rm)\\\\s?\\\\d{{1,3}}(?:,\\\\d{{3}})*(?:\\\\.\\\\d{{2}})?\"}}" + ) + + # ── user message: cropped HTML + optional hints + user_parts = ["```html", html[:5000], "```"] # protect token budget + if query: + user_parts.append(f"\n\n## Query\n{query.strip()}") + if examples: + user_parts.append("## Examples\n" + "\n".join(examples[:20])) + user_msg = "\n\n".join(user_parts) + + # ── LLM call (with retry/backoff) + resp = perform_completion_with_backoff( + provider=llm_config.provider, + prompt_with_variables="\n\n".join([system_msg, user_msg]), + json_response=True, + api_token=llm_config.api_token, + base_url=llm_config.base_url, + extra_args=kwargs, + ) + + # ── clean & load JSON (fix common escape mistakes *before* json.loads) + raw = resp.choices[0].message.content + raw = raw.replace("\x08", "\\b") # stray back-space → \b + raw = re.sub(r'(? text_threshold: element.tail = element.tail.strip()[:text_threshold] + '...' - - # 4. Find repeated patterns and keep only a few examples - # This is a simplistic approach - more sophisticated pattern detection could be implemented - pattern_elements = {} - for element in tree.xpath('//*[contains(@class, "")]'): - parent = element.getparent() + + # 4. 
Detect duplicates and drop them in a single pass + seen: dict[tuple, None] = {} + for el in list(tree.xpath('//*[@class]')): # snapshot once, XPath is fast + parent = el.getparent() if parent is None: continue - - # Create a signature based on tag and classes - classes = element.get('class', '') - if not classes: + + cls = el.get('class') + if not cls: continue - signature = f"{element.tag}.{classes}" - - if signature in pattern_elements: - pattern_elements[signature].append(element) + + # ── build signature ─────────────────────────────────────────── + h = xxhash.xxh64() # stream, no big join() + for txt in el.itertext(): + h.update(txt) + sig = (el.tag, cls, h.intdigest()) # tuple cheaper & hashable + + # ── first seen? keep – else drop ───────────── + if sig in seen and parent is not None: + parent.remove(el) # duplicate else: - pattern_elements[signature] = [element] + seen[sig] = None - # Keep only 3 examples of each repeating pattern - for signature, elements in pattern_elements.items(): - if len(elements) > 3: - # Keep the first 2 and last elements - for element in elements[2:-1]: - if element.getparent() is not None: - element.getparent().remove(element) + # # 4. 
Find repeated patterns and keep only a few examples + # # This is a simplistic approach - more sophisticated pattern detection could be implemented + # pattern_elements = {} + # for element in tree.xpath('//*[contains(@class, "")]'): + # parent = element.getparent() + # if parent is None: + # continue + + # # Create a signature based on tag and classes + # classes = element.get('class', '') + # if not classes: + # continue + # innert_text = ''.join(element.xpath('.//text()')) + # innert_text_hash = xxhash.xxh64(innert_text.encode()).hexdigest() + # signature = f"{element.tag}.{classes}.{innert_text_hash}" + + # if signature in pattern_elements: + # pattern_elements[signature].append(element) + # else: + # pattern_elements[signature] = [element] + + # # Keep only first examples of each repeating pattern + # for signature, elements in pattern_elements.items(): + # if len(elements) > 1: + # # Keep the first element and remove the rest + # for element in elements[1:]: + # if element.getparent() is not None: + # element.getparent().remove(element) + + + # # Keep only 3 examples of each repeating pattern + # for signature, elements in pattern_elements.items(): + # if len(elements) > 3: + # # Keep the first 2 and last elements + # for element in elements[2:-1]: + # if element.getparent() is not None: + # element.getparent().remove(element) # 5. 
Convert back to string result = etree.tostring(tree, encoding='unicode', method='html') diff --git a/docs/examples/hello_world.py b/docs/examples/hello_world.py index 0d351fa8..b9f1b328 100644 --- a/docs/examples/hello_world.py +++ b/docs/examples/hello_world.py @@ -3,42 +3,19 @@ from crawl4ai import ( AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, - CacheMode, DefaultMarkdownGenerator, PruningContentFilter, CrawlResult ) -async def example_cdp(): - browser_conf = BrowserConfig( - headless=False, - cdp_url="http://localhost:9223" - ) - crawler_config = CrawlerRunConfig( - session_id="test", - js_code = """(() => { return {"result": "Hello World!"} })()""", - js_only=True - ) - async with AsyncWebCrawler( - config=browser_conf, - verbose=True, - ) as crawler: - result : CrawlResult = await crawler.arun( - url="https://www.helloworld.org", - config=crawler_config, - ) - print(result.js_execution_result) -async def main(): - browser_config = BrowserConfig(headless=False, verbose=True) +async def main(): + browser_config = BrowserConfig(headless=True, verbose=True) async with AsyncWebCrawler(config=browser_config) as crawler: crawler_config = CrawlerRunConfig( - cache_mode=CacheMode.BYPASS, markdown_generator=DefaultMarkdownGenerator( - content_filter=PruningContentFilter( - threshold=0.48, threshold_type="fixed", min_word_threshold=0 - ) + content_filter=PruningContentFilter() ), ) result : CrawlResult = await crawler.arun( diff --git a/docs/examples/regex_extraction_quickstart.py b/docs/examples/regex_extraction_quickstart.py new file mode 100644 index 00000000..54b9c384 --- /dev/null +++ b/docs/examples/regex_extraction_quickstart.py @@ -0,0 +1,143 @@ +# == File: regex_extraction_quickstart.py == +""" +Mini–quick-start for RegexExtractionStrategy +──────────────────────────────────────────── +3 bite-sized demos that parallel the style of *quickstart_examples_set_1.py*: + +1. **Default catalog** – scrape a page and pull out e-mails / phones / URLs, etc. +2. 
**Custom pattern** – add your own regex at instantiation time. +3. **LLM-assisted schema** – ask the model to write a pattern, cache it, then + run extraction _without_ further LLM calls. + +Run the whole thing with:: + + python regex_extraction_quickstart.py +""" + +import os, json, asyncio +from pathlib import Path +from typing import List + +from crawl4ai import ( + AsyncWebCrawler, + CrawlerRunConfig, + CrawlResult, + RegexExtractionStrategy, + LLMConfig, +) + +# ──────────────────────────────────────────────────────────────────────────── +# 1. Default-catalog extraction +# ──────────────────────────────────────────────────────────────────────────── +async def demo_regex_default() -> None: + print("\n=== 1. Regex extraction – default patterns ===") + + url = "https://www.iana.org/domains/example" # has e-mail + URLs + strategy = RegexExtractionStrategy( + pattern = RegexExtractionStrategy.Url | RegexExtractionStrategy.Currency + ) + config = CrawlerRunConfig(extraction_strategy=strategy) + + async with AsyncWebCrawler() as crawler: + result: CrawlResult = await crawler.arun(url, config=config) + + print(f"Fetched {url} - success={result.success}") + if result.success: + data = json.loads(result.extracted_content) + for d in data[:10]: + print(f" {d['label']:<12} {d['value']}") + print(f"... total matches: {len(data)}") + else: + print(" !!! crawl failed") + + +# ──────────────────────────────────────────────────────────────────────────── +# 2. Custom pattern override / extension +# ──────────────────────────────────────────────────────────────────────────── +async def demo_regex_custom() -> None: + print("\n=== 2. 
Regex extraction – custom price pattern ===") + + url = "https://www.apple.com/shop/buy-mac/macbook-pro" + price_pattern = {"usd_price": r"\$\s?\d{1,3}(?:,\d{3})*(?:\.\d{2})?"} + + strategy = RegexExtractionStrategy(custom = price_pattern) + config = CrawlerRunConfig(extraction_strategy=strategy) + + async with AsyncWebCrawler() as crawler: + result: CrawlResult = await crawler.arun(url, config=config) + + if result.success: + data = json.loads(result.extracted_content) + for d in data: + print(f" {d['value']}") + if not data: + print(" (No prices found - page layout may have changed)") + else: + print(" !!! crawl failed") + + +# ──────────────────────────────────────────────────────────────────────────── +# 3. One-shot LLM pattern generation, then fast extraction +# ──────────────────────────────────────────────────────────────────────────── +async def demo_regex_generate_pattern() -> None: + print("\n=== 3. generate_pattern → regex extraction ===") + + cache_dir = Path(__file__).parent / "tmp" + cache_dir.mkdir(exist_ok=True) + pattern_file = cache_dir / "price_pattern.json" + + url = "https://www.lazada.sg/tag/smartphone/" + + # ── 3-A. build or load the cached pattern + if pattern_file.exists(): + pattern = json.load(pattern_file.open(encoding="utf-8")) + print("Loaded cached pattern:", pattern) + else: + print("Generating pattern via LLM…") + + llm_cfg = LLMConfig( + provider="openai/gpt-4o-mini", + api_token="env:OPENAI_API_KEY", + ) + + # pull one sample page as HTML context + async with AsyncWebCrawler() as crawler: + html = (await crawler.arun(url)).fit_html + + pattern = RegexExtractionStrategy.generate_pattern( + label="price", + html=html, + query="Prices in Malaysian Ringgit (e.g. RM1,299.00 or RM200)", + llm_config=llm_cfg, + ) + + json.dump(pattern, pattern_file.open("w", encoding="utf-8"), indent=2) + print("Saved pattern:", pattern_file) + + # ── 3-B. 
extraction pass – zero LLM calls + strategy = RegexExtractionStrategy(custom=pattern) + config = CrawlerRunConfig(extraction_strategy=strategy, delay_before_return_html=3) + + async with AsyncWebCrawler() as crawler: + result: CrawlResult = await crawler.arun(url, config=config) + + if result.success: + data = json.loads(result.extracted_content) + for d in data[:15]: + print(f" {d['value']}") + print(f"... total matches: {len(data)}") + else: + print(" !!! crawl failed") + + +# ──────────────────────────────────────────────────────────────────────────── +# Entrypoint +# ──────────────────────────────────────────────────────────────────────────── +async def main() -> None: + # await demo_regex_default() + # await demo_regex_custom() + await demo_regex_generate_pattern() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/docs/md_v2/api/crawl-result.md b/docs/md_v2/api/crawl-result.md index 52cf6ace..a27a87d2 100644 --- a/docs/md_v2/api/crawl-result.md +++ b/docs/md_v2/api/crawl-result.md @@ -10,6 +10,7 @@ class CrawlResult(BaseModel): html: str success: bool cleaned_html: Optional[str] = None + fit_html: Optional[str] = None # Preprocessed HTML optimized for extraction media: Dict[str, List[Dict]] = {} links: Dict[str, List[Dict]] = {} downloaded_files: Optional[List[str]] = None @@ -50,7 +51,7 @@ if not result.success: ``` ### 1.3 **`status_code`** *(Optional[int])* -**What**: The page’s HTTP status code (e.g., 200, 404). +**What**: The page's HTTP status code (e.g., 200, 404). **Usage**: ```python if result.status_code == 404: @@ -82,7 +83,7 @@ if result.response_headers: ``` ### 1.7 **`ssl_certificate`** *(Optional[SSLCertificate])* -**What**: If `fetch_ssl_certificate=True` in your CrawlerRunConfig, **`result.ssl_certificate`** contains a [**`SSLCertificate`**](../advanced/ssl-certificate.md) object describing the site’s certificate. 
You can export the cert in multiple formats (PEM/DER/JSON) or access its properties like `issuer`, +**What**: If `fetch_ssl_certificate=True` in your CrawlerRunConfig, **`result.ssl_certificate`** contains a [**`SSLCertificate`**](../advanced/ssl-certificate.md) object describing the site's certificate. You can export the cert in multiple formats (PEM/DER/JSON) or access its properties like `issuer`, `subject`, `valid_from`, `valid_until`, etc. **Usage**: ```python @@ -109,14 +110,6 @@ print(len(result.html)) print(result.cleaned_html[:500]) # Show a snippet ``` -### 2.3 **`fit_html`** *(Optional[str])* -**What**: If a **content filter** or heuristic (e.g., Pruning/BM25) modifies the HTML, the “fit” or post-filter version. -**When**: This is **only** present if your `markdown_generator` or `content_filter` produces it. -**Usage**: -```python -if result.markdown.fit_html: - print("High-value HTML content:", result.markdown.fit_html[:300]) -``` --- @@ -135,7 +128,7 @@ Crawl4AI can convert HTML→Markdown, optionally including: - **`raw_markdown`** *(str)*: The full HTML→Markdown conversion. - **`markdown_with_citations`** *(str)*: Same markdown, but with link references as academic-style citations. - **`references_markdown`** *(str)*: The reference list or footnotes at the end. -- **`fit_markdown`** *(Optional[str])*: If content filtering (Pruning/BM25) was applied, the filtered “fit” text. +- **`fit_markdown`** *(Optional[str])*: If content filtering (Pruning/BM25) was applied, the filtered "fit" text. - **`fit_html`** *(Optional[str])*: The HTML that led to `fit_markdown`. **Usage**: @@ -157,7 +150,7 @@ print(result.markdown.raw_markdown[:200]) print(result.markdown.fit_markdown) print(result.markdown.fit_html) ``` -**Important**: “Fit” content (in `fit_markdown`/`fit_html`) exists in result.markdown, only if you used a **filter** (like **PruningContentFilter** or **BM25ContentFilter**) within a `MarkdownGenerationStrategy`. 
+**Important**: "Fit" content (in `fit_markdown`/`fit_html`) exists in result.markdown, only if you used a **filter** (like **PruningContentFilter** or **BM25ContentFilter**) within a `MarkdownGenerationStrategy`. --- @@ -169,7 +162,7 @@ print(result.markdown.fit_html) - `src` *(str)*: Media URL - `alt` or `title` *(str)*: Descriptive text -- `score` *(float)*: Relevance score if the crawler’s heuristic found it “important” +- `score` *(float)*: Relevance score if the crawler's heuristic found it "important" - `desc` or `description` *(Optional[str])*: Additional context extracted from surrounding text **Usage**: @@ -263,7 +256,7 @@ A `DispatchResult` object providing additional concurrency and resource usage in - **`task_id`**: A unique identifier for the parallel task. - **`memory_usage`** (float): The memory (in MB) used at the time of completion. -- **`peak_memory`** (float): The peak memory usage (in MB) recorded during the task’s execution. +- **`peak_memory`** (float): The peak memory usage (in MB) recorded during the task's execution. - **`start_time`** / **`end_time`** (datetime): Time range for this crawling task. - **`error_message`** (str): Any dispatcher- or concurrency-related error encountered. @@ -358,7 +351,7 @@ async def handle_result(result: CrawlResult): # HTML print("Original HTML size:", len(result.html)) print("Cleaned HTML size:", len(result.cleaned_html or "")) - + # Markdown output if result.markdown: print("Raw Markdown:", result.markdown.raw_markdown[:300]) diff --git a/docs/md_v2/api/strategies.md b/docs/md_v2/api/strategies.md index 45d44950..a44d0fcd 100644 --- a/docs/md_v2/api/strategies.md +++ b/docs/md_v2/api/strategies.md @@ -36,6 +36,45 @@ LLMExtractionStrategy( ) ``` +### RegexExtractionStrategy + +Used for fast pattern-based extraction of common entities using regular expressions. 
+ +```python +RegexExtractionStrategy( + # Pattern Configuration + pattern: IntFlag = RegexExtractionStrategy.Nothing, # Bit flags of built-in patterns to use + custom: Optional[Dict[str, str]] = None, # Custom pattern dictionary {label: regex} + + # Input Format + input_format: str = "fit_html", # "html", "markdown", "text" or "fit_html" +) + +# Built-in Patterns as Bit Flags +RegexExtractionStrategy.Email # Email addresses +RegexExtractionStrategy.PhoneIntl # International phone numbers +RegexExtractionStrategy.PhoneUS # US-format phone numbers +RegexExtractionStrategy.Url # HTTP/HTTPS URLs +RegexExtractionStrategy.IPv4 # IPv4 addresses +RegexExtractionStrategy.IPv6 # IPv6 addresses +RegexExtractionStrategy.Uuid # UUIDs +RegexExtractionStrategy.Currency # Currency values (USD, EUR, etc) +RegexExtractionStrategy.Percentage # Percentage values +RegexExtractionStrategy.Number # Numeric values +RegexExtractionStrategy.DateIso # ISO format dates +RegexExtractionStrategy.DateUS # US format dates +RegexExtractionStrategy.Time24h # 24-hour format times +RegexExtractionStrategy.PostalUS # US postal codes +RegexExtractionStrategy.PostalUK # UK postal codes +RegexExtractionStrategy.HexColor # HTML hex color codes +RegexExtractionStrategy.TwitterHandle # Twitter handles +RegexExtractionStrategy.Hashtag # Hashtags +RegexExtractionStrategy.MacAddr # MAC addresses +RegexExtractionStrategy.Iban # International bank account numbers +RegexExtractionStrategy.CreditCard # Credit card numbers +RegexExtractionStrategy.All # All available patterns +``` + ### CosineStrategy Used for content similarity-based extraction and clustering. 
@@ -156,6 +195,55 @@ result = await crawler.arun( data = json.loads(result.extracted_content) ``` +### Regex Extraction + +```python +import json +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, RegexExtractionStrategy + +# Method 1: Use built-in patterns +strategy = RegexExtractionStrategy( + pattern = RegexExtractionStrategy.Email | RegexExtractionStrategy.Url +) + +# Method 2: Use custom patterns +price_pattern = {"usd_price": r"\$\s?\d{1,3}(?:,\d{3})*(?:\.\d{2})?"} +strategy = RegexExtractionStrategy(custom=price_pattern) + +# Method 3: Generate pattern with LLM assistance (one-time) +from crawl4ai import LLMConfig + +async with AsyncWebCrawler() as crawler: + # Get sample HTML first + sample_result = await crawler.arun("https://example.com/products") + html = sample_result.fit_html + + # Generate regex pattern once + pattern = RegexExtractionStrategy.generate_pattern( + label="price", + html=html, + query="Product prices in USD format", + llm_config=LLMConfig(provider="openai/gpt-4o-mini") + ) + + # Save pattern for reuse + import json + with open("price_pattern.json", "w") as f: + json.dump(pattern, f) + + # Use pattern for extraction (no LLM calls) + strategy = RegexExtractionStrategy(custom=pattern) + result = await crawler.arun( + url="https://example.com/products", + config=CrawlerRunConfig(extraction_strategy=strategy) + ) + + # Process results + data = json.loads(result.extracted_content) + for item in data: + print(f"{item['label']}: {item['value']}") +``` + ### CSS Extraction ```python @@ -220,12 +308,28 @@ result = await crawler.arun( ## Best Practices -1. **Choose the Right Strategy** - - Use `LLMExtractionStrategy` for complex, unstructured content - - Use `JsonCssExtractionStrategy` for well-structured HTML +1. 
**Choose the Right Strategy** + - Use `RegexExtractionStrategy` for common data types like emails, phones, URLs, dates + - Use `JsonCssExtractionStrategy` for well-structured HTML with consistent patterns + - Use `LLMExtractionStrategy` for complex, unstructured content requiring reasoning - Use `CosineStrategy` for content similarity and clustering -2. **Optimize Chunking** +2. **Strategy Selection Guide** + ``` + Is the target data a common type (email/phone/date/URL)? + → RegexExtractionStrategy + + Does the page have consistent HTML structure? + → JsonCssExtractionStrategy or JsonXPathExtractionStrategy + + Is the data semantically complex or unstructured? + → LLMExtractionStrategy + + Need to find content similar to a specific topic? + → CosineStrategy + ``` + +3. **Optimize Chunking** ```python # For long documents strategy = LLMExtractionStrategy( @@ -234,7 +338,26 @@ result = await crawler.arun( ) ``` -3. **Handle Errors** +4. **Combine Strategies for Best Performance** + ```python + # First pass: Extract structure with CSS + css_strategy = JsonCssExtractionStrategy(product_schema) + css_result = await crawler.arun(url, config=CrawlerRunConfig(extraction_strategy=css_strategy)) + product_data = json.loads(css_result.extracted_content) + + # Second pass: Extract specific fields with regex + descriptions = [product["description"] for product in product_data] + regex_strategy = RegexExtractionStrategy( + pattern=RegexExtractionStrategy.Email | RegexExtractionStrategy.PhoneUS, + custom={"dimension": r"\d+x\d+x\d+ (?:cm|in)"} + ) + + # Process descriptions with regex + for text in descriptions: + matches = regex_strategy.extract("", text) # Direct extraction + ``` + +5. **Handle Errors** ```python try: result = await crawler.arun( @@ -247,11 +370,31 @@ result = await crawler.arun( print(f"Extraction failed: {e}") ``` -4. **Monitor Performance** +6. 
**Monitor Performance** ```python strategy = CosineStrategy( verbose=True, # Enable logging word_count_threshold=20, # Filter short content top_k=5 # Limit results ) + ``` + +7. **Cache Generated Patterns** + ```python + # For RegexExtractionStrategy pattern generation + import json + from pathlib import Path + + cache_dir = Path("./pattern_cache") + cache_dir.mkdir(exist_ok=True) + pattern_file = cache_dir / "product_pattern.json" + + if pattern_file.exists(): + with open(pattern_file) as f: + pattern = json.load(f) + else: + # Generate once with LLM + pattern = RegexExtractionStrategy.generate_pattern(...) + with open(pattern_file, "w") as f: + json.dump(pattern, f) ``` \ No newline at end of file diff --git a/docs/md_v2/extraction/no-llm-strategies.md b/docs/md_v2/extraction/no-llm-strategies.md index b216c0ed..23fa7ad2 100644 --- a/docs/md_v2/extraction/no-llm-strategies.md +++ b/docs/md_v2/extraction/no-llm-strategies.md @@ -1,15 +1,20 @@ # Extracting JSON (No LLM) -One of Crawl4AI’s **most powerful** features is extracting **structured JSON** from websites **without** relying on large language models. By defining a **schema** with CSS or XPath selectors, you can extract data instantly—even from complex or nested HTML structures—without the cost, latency, or environmental impact of an LLM. +One of Crawl4AI's **most powerful** features is extracting **structured JSON** from websites **without** relying on large language models. Crawl4AI offers several strategies for LLM-free extraction: + +1. **Schema-based extraction** with CSS or XPath selectors via `JsonCssExtractionStrategy` and `JsonXPathExtractionStrategy` +2. **Regular expression extraction** with `RegexExtractionStrategy` for fast pattern matching + +These approaches let you extract data instantly—even from complex or nested HTML structures—without the cost, latency, or environmental impact of an LLM. **Why avoid LLM for basic extractions?** -1. **Faster & Cheaper**: No API calls or GPU overhead. -2. 
**Lower Carbon Footprint**: LLM inference can be energy-intensive. A well-defined schema is practically carbon-free. -3. **Precise & Repeatable**: CSS/XPath selectors do exactly what you specify. LLM outputs can vary or hallucinate. -4. **Scales Readily**: For thousands of pages, schema-based extraction runs quickly and in parallel. +1. **Faster & Cheaper**: No API calls or GPU overhead. +2. **Lower Carbon Footprint**: LLM inference can be energy-intensive. Pattern-based extraction is practically carbon-free. +3. **Precise & Repeatable**: CSS/XPath selectors and regex patterns do exactly what you specify. LLM outputs can vary or hallucinate. +4. **Scales Readily**: For thousands of pages, pattern-based extraction runs quickly and in parallel. -Below, we’ll explore how to craft these schemas and use them with **JsonCssExtractionStrategy** (or **JsonXPathExtractionStrategy** if you prefer XPath). We’ll also highlight advanced features like **nested fields** and **base element attributes**. +Below, we'll explore how to craft these schemas and use them with **JsonCssExtractionStrategy** (or **JsonXPathExtractionStrategy** if you prefer XPath). We'll also highlight advanced features like **nested fields** and **base element attributes**. --- @@ -17,17 +22,17 @@ Below, we’ll explore how to craft these schemas and use them with **JsonCssExt A schema defines: -1. A **base selector** that identifies each “container” element on the page (e.g., a product row, a blog post card). -2. **Fields** describing which CSS/XPath selectors to use for each piece of data you want to capture (text, attribute, HTML block, etc.). -3. **Nested** or **list** types for repeated or hierarchical structures. +1. A **base selector** that identifies each "container" element on the page (e.g., a product row, a blog post card). +2. **Fields** describing which CSS/XPath selectors to use for each piece of data you want to capture (text, attribute, HTML block, etc.). +3. 
**Nested** or **list** types for repeated or hierarchical structures. -For example, if you have a list of products, each one might have a name, price, reviews, and “related products.” This approach is faster and more reliable than an LLM for consistent, structured pages. +For example, if you have a list of products, each one might have a name, price, reviews, and "related products." This approach is faster and more reliable than an LLM for consistent, structured pages. --- ## 2. Simple Example: Crypto Prices -Let’s begin with a **simple** schema-based extraction using the `JsonCssExtractionStrategy`. Below is a snippet that extracts cryptocurrency prices from a site (similar to the legacy Coinbase example). Notice we **don’t** call any LLM: +Let's begin with a **simple** schema-based extraction using the `JsonCssExtractionStrategy`. Below is a snippet that extracts cryptocurrency prices from a site (similar to the legacy Coinbase example). Notice we **don't** call any LLM: ```python import json @@ -87,7 +92,7 @@ asyncio.run(extract_crypto_prices()) **Highlights**: -- **`baseSelector`**: Tells us where each “item” (crypto row) is. +- **`baseSelector`**: Tells us where each "item" (crypto row) is. - **`fields`**: Two fields (`coin_name`, `price`) using simple CSS selectors. - Each field defines a **`type`** (e.g., `text`, `attribute`, `html`, `regex`, etc.). @@ -97,7 +102,7 @@ No LLM is needed, and the performance is **near-instant** for hundreds or thousa ### **XPath Example with `raw://` HTML** -Below is a short example demonstrating **XPath** extraction plus the **`raw://`** scheme. We’ll pass a **dummy HTML** directly (no network request) and define the extraction strategy in `CrawlerRunConfig`. +Below is a short example demonstrating **XPath** extraction plus the **`raw://`** scheme. We'll pass a **dummy HTML** directly (no network request) and define the extraction strategy in `CrawlerRunConfig`. 
```python import json @@ -168,12 +173,12 @@ asyncio.run(extract_crypto_prices_xpath()) **Key Points**: -1. **`JsonXPathExtractionStrategy`** is used instead of `JsonCssExtractionStrategy`. -2. **`baseSelector`** and each field’s `"selector"` use **XPath** instead of CSS. -3. **`raw://`** lets us pass `dummy_html` with no real network request—handy for local testing. +1. **`JsonXPathExtractionStrategy`** is used instead of `JsonCssExtractionStrategy`. +2. **`baseSelector`** and each field's `"selector"` use **XPath** instead of CSS. +3. **`raw://`** lets us pass `dummy_html` with no real network request—handy for local testing. 4. Everything (including the extraction strategy) is in **`CrawlerRunConfig`**. -That’s how you keep the config self-contained, illustrate **XPath** usage, and demonstrate the **raw** scheme for direct HTML input—all while avoiding the old approach of passing `extraction_strategy` directly to `arun()`. +That's how you keep the config self-contained, illustrate **XPath** usage, and demonstrate the **raw** scheme for direct HTML input—all while avoiding the old approach of passing `extraction_strategy` directly to `arun()`. --- @@ -187,7 +192,7 @@ We have a **sample e-commerce** HTML file on GitHub (example): ``` https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html ``` -This snippet includes categories, products, features, reviews, and related items. Let’s see how to define a schema that fully captures that structure **without LLM**. +This snippet includes categories, products, features, reviews, and related items. Let's see how to define a schema that fully captures that structure **without LLM**. ```python schema = { @@ -333,24 +338,253 @@ async def extract_ecommerce_data(): asyncio.run(extract_ecommerce_data()) ``` -If all goes well, you get a **structured** JSON array with each “category,” containing an array of `products`. 
Each product includes `details`, `features`, `reviews`, etc. All of that **without** an LLM. +If all goes well, you get a **structured** JSON array with each "category," containing an array of `products`. Each product includes `details`, `features`, `reviews`, etc. All of that **without** an LLM. --- -## 4. Why “No LLM” Is Often Better +## 4. RegexExtractionStrategy - Fast Pattern-Based Extraction -1. **Zero Hallucination**: Schema-based extraction doesn’t guess text. It either finds it or not. -2. **Guaranteed Structure**: The same schema yields consistent JSON across many pages, so your downstream pipeline can rely on stable keys. -3. **Speed**: LLM-based extraction can be 10–1000x slower for large-scale crawling. -4. **Scalable**: Adding or updating a field is a matter of adjusting the schema, not re-tuning a model. +Crawl4AI now offers a powerful new zero-LLM extraction strategy: `RegexExtractionStrategy`. This strategy provides lightning-fast extraction of common data types like emails, phone numbers, URLs, dates, and more using pre-compiled regular expressions. -**When might you consider an LLM?** Possibly if the site is extremely unstructured or you want AI summarization. But always try a schema approach first for repeated or consistent data patterns. 
+### Key Features + +- **Zero LLM Dependency**: Extracts data without any AI model calls +- **Blazing Fast**: Uses pre-compiled regex patterns for maximum performance +- **Built-in Patterns**: Includes ready-to-use patterns for common data types +- **Custom Patterns**: Add your own regex patterns for domain-specific extraction +- **LLM-Assisted Pattern Generation**: Optionally use an LLM once to generate optimized patterns, then reuse them without further LLM calls + +### Simple Example: Extracting Common Entities + +The easiest way to start is by using the built-in pattern catalog: + +```python +import json +import asyncio +from crawl4ai import ( + AsyncWebCrawler, + CrawlerRunConfig, + RegexExtractionStrategy +) + +async def extract_with_regex(): + # Create a strategy using built-in patterns for URLs and currencies + strategy = RegexExtractionStrategy( + pattern = RegexExtractionStrategy.Url | RegexExtractionStrategy.Currency + ) + + config = CrawlerRunConfig(extraction_strategy=strategy) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com", + config=config + ) + + if result.success: + data = json.loads(result.extracted_content) + for item in data[:5]: # Show first 5 matches + print(f"{item['label']}: {item['value']}") + print(f"Total matches: {len(data)}") + +asyncio.run(extract_with_regex()) +``` + +### Available Built-in Patterns + +`RegexExtractionStrategy` provides these common patterns as IntFlag attributes for easy combining: + +```python +# Use individual patterns +strategy = RegexExtractionStrategy(pattern=RegexExtractionStrategy.Email) + +# Combine multiple patterns +strategy = RegexExtractionStrategy( + pattern = ( + RegexExtractionStrategy.Email | + RegexExtractionStrategy.PhoneUS | + RegexExtractionStrategy.Url + ) +) + +# Use all available patterns +strategy = RegexExtractionStrategy(pattern=RegexExtractionStrategy.All) +``` + +Available patterns include: +- `Email` - Email addresses +- `PhoneIntl` 
- International phone numbers +- `PhoneUS` - US-format phone numbers +- `Url` - HTTP/HTTPS URLs +- `IPv4` - IPv4 addresses +- `IPv6` - IPv6 addresses +- `Uuid` - UUIDs +- `Currency` - Currency values (USD, EUR, etc.) +- `Percentage` - Percentage values +- `Number` - Numeric values +- `DateIso` - ISO format dates +- `DateUS` - US format dates +- `Time24h` - 24-hour format times +- `PostalUS` - US postal codes +- `PostalUK` - UK postal codes +- `HexColor` - HTML hex color codes +- `TwitterHandle` - Twitter handles +- `Hashtag` - Hashtags +- `MacAddr` - MAC addresses +- `Iban` - International bank account numbers +- `CreditCard` - Credit card numbers + +### Custom Pattern Example + +For more targeted extraction, you can provide custom patterns: + +```python +import json +import asyncio +from crawl4ai import ( + AsyncWebCrawler, + CrawlerRunConfig, + RegexExtractionStrategy +) + +async def extract_prices(): + # Define a custom pattern for US Dollar prices + price_pattern = {"usd_price": r"\$\s?\d{1,3}(?:,\d{3})*(?:\.\d{2})?"} + + # Create strategy with custom pattern + strategy = RegexExtractionStrategy(custom=price_pattern) + config = CrawlerRunConfig(extraction_strategy=strategy) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://www.example.com/products", + config=config + ) + + if result.success: + data = json.loads(result.extracted_content) + for item in data: + print(f"Found price: {item['value']}") + +asyncio.run(extract_prices()) +``` + +### LLM-Assisted Pattern Generation + +For complex or site-specific patterns, you can use an LLM once to generate an optimized pattern, then save and reuse it without further LLM calls: + +```python +import json +import asyncio +from pathlib import Path +from crawl4ai import ( + AsyncWebCrawler, + CrawlerRunConfig, + RegexExtractionStrategy, + LLMConfig +) + +async def extract_with_generated_pattern(): + cache_dir = Path("./pattern_cache") + cache_dir.mkdir(exist_ok=True) + pattern_file = 
cache_dir / "price_pattern.json"
+
+    # 1. Generate or load pattern
+    if pattern_file.exists():
+        pattern = json.loads(pattern_file.read_text())
+        print(f"Using cached pattern: {pattern}")
+    else:
+        print("Generating pattern via LLM...")
+
+        # Configure LLM
+        llm_config = LLMConfig(
+            provider="openai/gpt-4o-mini",
+            api_token="env:OPENAI_API_KEY",
+        )
+
+        # Get sample HTML for context
+        async with AsyncWebCrawler() as crawler:
+            result = await crawler.arun("https://example.com/products")
+            html = result.fit_html
+
+        # Generate pattern (one-time LLM usage)
+        pattern = RegexExtractionStrategy.generate_pattern(
+            label="price",
+            html=html,
+            query="Product prices in USD format",
+            llm_config=llm_config,
+        )
+
+        # Cache pattern for future use
+        pattern_file.write_text(json.dumps(pattern, indent=2))
+
+    # 2. Use pattern for extraction (no LLM calls)
+    strategy = RegexExtractionStrategy(custom=pattern)
+    config = CrawlerRunConfig(extraction_strategy=strategy)
+
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            url="https://example.com/products",
+            config=config
+        )
+
+        if result.success:
+            data = json.loads(result.extracted_content)
+            for item in data[:10]:
+                print(f"Extracted: {item['value']}")
+            print(f"Total matches: {len(data)}")
+
+asyncio.run(extract_with_generated_pattern())
+```
+
+This pattern allows you to:
+1. Use an LLM once to generate a highly optimized regex for your specific site
+2. Save the pattern to disk for reuse
+3. 
Extract data using only regex (no further LLM calls) in production + +### Extraction Results Format + +The `RegexExtractionStrategy` returns results in a consistent format: + +```json +[ + { + "url": "https://example.com", + "label": "email", + "value": "contact@example.com", + "span": [145, 163] + }, + { + "url": "https://example.com", + "label": "url", + "value": "https://support.example.com", + "span": [210, 235] + } +] +``` + +Each match includes: +- `url`: The source URL +- `label`: The pattern name that matched (e.g., "email", "phone_us") +- `value`: The extracted text +- `span`: The start and end positions in the source content --- -## 5. Base Element Attributes & Additional Fields +## 5. Why "No LLM" Is Often Better -It’s easy to **extract attributes** (like `href`, `src`, or `data-xxx`) from your base or nested elements using: +1. **Zero Hallucination**: Pattern-based extraction doesn't guess text. It either finds it or not. +2. **Guaranteed Structure**: The same schema or regex yields consistent JSON across many pages, so your downstream pipeline can rely on stable keys. +3. **Speed**: LLM-based extraction can be 10–1000x slower for large-scale crawling. +4. **Scalable**: Adding or updating a field is a matter of adjusting the schema or regex, not re-tuning a model. + +**When might you consider an LLM?** Possibly if the site is extremely unstructured or you want AI summarization. But always try a schema or regex approach first for repeated or consistent data patterns. + +--- + +## 6. Base Element Attributes & Additional Fields + +It's easy to **extract attributes** (like `href`, `src`, or `data-xxx`) from your base or nested elements using: ```json { @@ -361,11 +595,11 @@ It’s easy to **extract attributes** (like `href`, `src`, or `data-xxx`) from y } ``` -You can define them in **`baseFields`** (extracted from the main container element) or in each field’s sub-lists. This is especially helpful if you need an item’s link or ID stored in the parent `
      `. +You can define them in **`baseFields`** (extracted from the main container element) or in each field's sub-lists. This is especially helpful if you need an item's link or ID stored in the parent `
      `. --- -## 6. Putting It All Together: Larger Example +## 7. Putting It All Together: Larger Example Consider a blog site. We have a schema that extracts the **URL** from each post card (via `baseFields` with an `"attribute": "href"`), plus the title, date, summary, and author: @@ -389,19 +623,20 @@ Then run with `JsonCssExtractionStrategy(schema)` to get an array of blog post o --- -## 7. Tips & Best Practices +## 8. Tips & Best Practices -1. **Inspect the DOM** in Chrome DevTools or Firefox’s Inspector to find stable selectors. -2. **Start Simple**: Verify you can extract a single field. Then add complexity like nested objects or lists. -3. **Test** your schema on partial HTML or a test page before a big crawl. -4. **Combine with JS Execution** if the site loads content dynamically. You can pass `js_code` or `wait_for` in `CrawlerRunConfig`. -5. **Look at Logs** when `verbose=True`: if your selectors are off or your schema is malformed, it’ll often show warnings. -6. **Use baseFields** if you need attributes from the container element (e.g., `href`, `data-id`), especially for the “parent” item. -7. **Performance**: For large pages, make sure your selectors are as narrow as possible. +1. **Inspect the DOM** in Chrome DevTools or Firefox's Inspector to find stable selectors. +2. **Start Simple**: Verify you can extract a single field. Then add complexity like nested objects or lists. +3. **Test** your schema on partial HTML or a test page before a big crawl. +4. **Combine with JS Execution** if the site loads content dynamically. You can pass `js_code` or `wait_for` in `CrawlerRunConfig`. +5. **Look at Logs** when `verbose=True`: if your selectors are off or your schema is malformed, it'll often show warnings. +6. **Use baseFields** if you need attributes from the container element (e.g., `href`, `data-id`), especially for the "parent" item. +7. **Performance**: For large pages, make sure your selectors are as narrow as possible. +8. 
**Consider Using Regex First**: For simple data types like emails, URLs, and dates, `RegexExtractionStrategy` is often the fastest approach. --- -## 8. Schema Generation Utility +## 9. Schema Generation Utility While manually crafting schemas is powerful and precise, Crawl4AI now offers a convenient utility to **automatically generate** extraction schemas using LLM. This is particularly useful when: @@ -481,27 +716,26 @@ strategy = JsonCssExtractionStrategy(css_schema) - Use OpenAI for production-quality schemas - Use Ollama for development, testing, or when you need a self-hosted solution -That's it for **Extracting JSON (No LLM)**! You've seen how schema-based approaches (either CSS or XPath) can handle everything from simple lists to deeply nested product catalogs—instantly, with minimal overhead. Enjoy building robust scrapers that produce consistent, structured JSON for your data pipelines! - --- -## 9. Conclusion +## 10. Conclusion -With **JsonCssExtractionStrategy** (or **JsonXPathExtractionStrategy**), you can build powerful, **LLM-free** pipelines that: +With Crawl4AI's LLM-free extraction strategies - `JsonCssExtractionStrategy`, `JsonXPathExtractionStrategy`, and now `RegexExtractionStrategy` - you can build powerful pipelines that: - Scrape any consistent site for structured data. -- Support nested objects, repeating lists, or advanced transformations. +- Support nested objects, repeating lists, or pattern-based extraction. - Scale to thousands of pages quickly and reliably. -**Next Steps**: +**Choosing the Right Strategy**: -- Combine your extracted JSON with advanced filtering or summarization in a second pass if needed. -- For dynamic pages, combine strategies with `js_code` or infinite scroll hooking to ensure all content is loaded. +- Use **`RegexExtractionStrategy`** for fast extraction of common data types like emails, phones, URLs, dates, etc. 
+- Use **`JsonCssExtractionStrategy`** or **`JsonXPathExtractionStrategy`** for structured data with clear HTML patterns +- If you need both: first extract structured data with JSON strategies, then use regex on specific fields -**Remember**: For repeated, structured data, you don’t need to pay for or wait on an LLM. A well-crafted schema plus CSS or XPath gets you the data faster, cleaner, and cheaper—**the real power** of Crawl4AI. +**Remember**: For repeated, structured data, you don't need to pay for or wait on an LLM. Well-crafted schemas and regex patterns get you the data faster, cleaner, and cheaper—**the real power** of Crawl4AI. -**Last Updated**: 2025-01-01 +**Last Updated**: 2025-05-02 --- -That’s it for **Extracting JSON (No LLM)**! You’ve seen how schema-based approaches (either CSS or XPath) can handle everything from simple lists to deeply nested product catalogs—instantly, with minimal overhead. Enjoy building robust scrapers that produce consistent, structured JSON for your data pipelines! \ No newline at end of file +That's it for **Extracting JSON (No LLM)**! You've seen how schema-based approaches (either CSS or XPath) and regex patterns can handle everything from simple lists to deeply nested product catalogs—instantly, with minimal overhead. Enjoy building robust scrapers that produce consistent, structured JSON for your data pipelines! \ No newline at end of file