Merge branch 'next' into 2025-MAR-ALPHA-1

2025-03-27 12:29:32 +05:30
parent 585e5e5973 40d4dd36c9
commit 7be5427283
7 changed files with 476 additions and 81 deletions
--- a/crawl4ai/version.py
+++ b/crawl4ai/version.py
@@ -1,2 +1,2 @@
 # crawl4ai/_version.py
-__version__ = "0.5.0.post4"
+__version__ = "0.5.0.post8"
--- a/crawl4ai/cli.py
+++ b/crawl4ai/cli.py
@@ -20,13 +20,16 @@ from crawl4ai import (
    BrowserConfig, 
    CrawlerRunConfig,
    LLMExtractionStrategy, 
+    LXMLWebScrapingStrategy,
    JsonCssExtractionStrategy,
    JsonXPathExtractionStrategy,
    BM25ContentFilter, 
    PruningContentFilter,
    BrowserProfiler,
+    DefaultMarkdownGenerator,
    LLMConfig
 )
+from crawl4ai.config import USER_SETTINGS
 from litellm import completion
 from pathlib import Path

@@ -175,8 +178,12 @@ def show_examples():
    # CSS-based extraction
    crwl https://example.com -e extract_css.yml -s css_schema.json -o json

-    # LLM-based extraction
+    # LLM-based extraction with config file
    crwl https://example.com -e extract_llm.yml -s llm_schema.json -o json
+    
+    # Quick LLM-based JSON extraction (prompts for LLM provider first time)
+    crwl https://example.com -j  # Auto-extracts structured data
+    crwl https://example.com -j "Extract product details including name, price, and features"  # With specific instructions

 3️⃣  Direct Parameters:
    # Browser settings
@@ -278,7 +285,7 @@ llm_schema.json:
    # Combine configs with direct parameters
    crwl https://example.com -B browser.yml -b "headless=false,viewport_width=1920"

-    # Full extraction pipeline
+    # Full extraction pipeline with config files
    crwl https://example.com \\
        -B browser.yml \\
        -C crawler.yml \\
@@ -286,6 +293,12 @@ llm_schema.json:
        -s llm_schema.json \\
        -o json \\
        -v
+        
+    # Quick LLM-based extraction with specific instructions
+    crwl https://amazon.com/dp/B01DFKC2SO \\
+        -j "Extract product title, current price, original price, rating, and all product specifications" \\
+        -b "headless=true,viewport_width=1280" \\
+        -v

    # Content filtering with BM25
    crwl https://example.com \\
@@ -327,6 +340,14 @@ For more documentation visit: https://github.com/unclecode/crawl4ai
      - google/gemini-pro
    
    See full list of providers: https://docs.litellm.ai/docs/providers
+    
+    # Set default LLM provider and token in advance
+    crwl config set DEFAULT_LLM_PROVIDER "anthropic/claude-3-sonnet"
+    crwl config set DEFAULT_LLM_PROVIDER_TOKEN "your-api-token-here"
+    
+    # Set default browser behavior
+    crwl config set BROWSER_HEADLESS false  # Always show browser window
+    crwl config set USER_AGENT_MODE random  # Use random user agent

 9️⃣ Profile Management:
    # Launch interactive profile manager
@@ -983,17 +1004,19 @@ def cdp_cmd(user_data_dir: Optional[str], port: int, browser_type: str, headless
@click.option("--crawler-config", "-C", type=click.Path(exists=True), help="Crawler config file (YAML/JSON)")
@click.option("--filter-config", "-f", type=click.Path(exists=True), help="Content filter config file")
@click.option("--extraction-config", "-e", type=click.Path(exists=True), help="Extraction strategy config file")
+@click.option("--json-extract", "-j", is_flag=False, flag_value="", default=None, help="Extract structured data using LLM with optional description")
@click.option("--schema", "-s", type=click.Path(exists=True), help="JSON schema for extraction")
@click.option("--browser", "-b", type=str, callback=parse_key_values, help="Browser parameters as key1=value1,key2=value2")
@click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2")
@click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "md", "markdown-fit", "md-fit"]), default="all")
-@click.option("--bypass-cache", is_flag=True, default=True, help="Bypass cache when crawling")
+@click.option("--output-file", "-O", type=click.Path(), help="Output file path (default: stdout)")
+@click.option("--bypass-cache", is_flag=True, default=True, help="Bypass cache when crawling")  # no short alias: -b already belongs to --browser
@click.option("--question", "-q", help="Ask a question about the crawled content")
@click.option("--verbose", "-v", is_flag=True)
@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
 def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config: str, 
-           extraction_config: str, schema: str, browser: Dict, crawler: Dict,
-           output: str, bypass_cache: bool, question: str, verbose: bool, profile: str):
+           extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict,
+           output: str, output_file: str, bypass_cache: bool, question: str, verbose: bool, profile: str):
    """Crawl a website and extract content
    
    Simple Usage:
@@ -1037,21 +1060,65 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config:
            crawler_cfg = crawler_cfg.clone(**crawler)
            
        # Handle content filter config
-        if filter_config:
-            filter_conf = load_config_file(filter_config)
+        if filter_config or output in ["markdown-fit", "md-fit"]:
+            if filter_config:
+                filter_conf = load_config_file(filter_config)
+            elif not filter_config and output in ["markdown-fit", "md-fit"]:
+                filter_conf = {
+                    "type": "pruning",
+                    "query": "",
+                    "threshold": 0.48
+                }
            if filter_conf["type"] == "bm25":
-                crawler_cfg.content_filter = BM25ContentFilter(
-                    user_query=filter_conf.get("query"),
-                    bm25_threshold=filter_conf.get("threshold", 1.0)
+                crawler_cfg.markdown_generator = DefaultMarkdownGenerator(
+                    content_filter = BM25ContentFilter(
+                        user_query=filter_conf.get("query"),
+                        bm25_threshold=filter_conf.get("threshold", 1.0)
+                    )
                )
            elif filter_conf["type"] == "pruning":
-                crawler_cfg.content_filter = PruningContentFilter(
-                    user_query=filter_conf.get("query"),
-                    threshold=filter_conf.get("threshold", 0.48)
+                crawler_cfg.markdown_generator = DefaultMarkdownGenerator(
+                    content_filter = PruningContentFilter(
+                        user_query=filter_conf.get("query"),
+                        threshold=filter_conf.get("threshold", 0.48)
+                    )
                )
+        
+        # Handle json-extract option (takes precedence over extraction-config)
+        if json_extract is not None:
+            # Get LLM provider and token
+            provider, token = setup_llm_config()
+            
+            # Default sophisticated instruction for structured data extraction
+            default_instruction = """Analyze the web page content and extract structured data as JSON. 
+If the page contains a list of items with repeated patterns, extract all items in an array. 
+If the page is an article or contains unique content, extract a comprehensive JSON object with all relevant information.
+Look at the content, intention of content, what it offers and find the data item(s) in the page.
+Always return valid, properly formatted JSON."""
+            
+            
+            default_instruction_with_user_query = """Analyze the web page content and extract structured data as JSON, following the below instruction and explanation of schema and always return valid, properly formatted JSON. \n\nInstruction:\n\n""" + json_extract
+            
+            # Determine instruction based on whether json_extract is empty or has content
+            instruction = default_instruction_with_user_query if json_extract else default_instruction
+            
+            # Create LLM extraction strategy
+            crawler_cfg.extraction_strategy = LLMExtractionStrategy(
+                llm_config=LLMConfig(provider=provider, api_token=token),
+                instruction=instruction,
+                schema=load_schema_file(schema),  # Will be None if no schema is provided
+                extraction_type="schema", #if schema else "block",
+                apply_chunking=False,
+                force_json_response=True,
+                verbose=verbose,
+            )
+            
+            # Set output to JSON if not explicitly specified
+            if output == "all":
+                output = "json"
                
-        # Handle extraction strategy
-        if extraction_config:
+        # Handle extraction strategy from config file (only if json-extract wasn't used)
+        elif extraction_config:
            extract_conf = load_config_file(extraction_config)
            schema_data = load_schema_file(schema)
            
@@ -1085,6 +1152,13 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config:
        # No cache
        if bypass_cache:
            crawler_cfg.cache_mode = CacheMode.BYPASS
+
+        crawler_cfg.scraping_strategy = LXMLWebScrapingStrategy()    
+
+        config = get_global_config()
+        
+        browser_cfg.verbose = config.get("VERBOSE", False)
+        crawler_cfg.verbose = config.get("VERBOSE", False)
        
        # Run crawler
        result : CrawlResult = anyio.run(
@@ -1103,14 +1177,31 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config:
            return
        
        # Handle output
-        if output == "all":
-            click.echo(json.dumps(result.model_dump(), indent=2))
-        elif output == "json":
-            click.echo(json.dumps(json.loads(result.extracted_content), indent=2))
-        elif output in ["markdown", "md"]:
-            click.echo(result.markdown.raw_markdown)
-        elif output in ["markdown-fit", "md-fit"]:
-            click.echo(result.markdown.fit_markdown)
+        if not output_file:
+            if output == "all":
+                click.echo(json.dumps(result.model_dump(), indent=2))
+            elif output == "json":
+                # Emit only the pretty-printed document; also printing the raw payload put two JSON documents on stdout.
+                extracted_items = json.loads(result.extracted_content)
+                click.echo(json.dumps(extracted_items, indent=2))
+                
+            elif output in ["markdown", "md"]:
+                click.echo(result.markdown.raw_markdown)
+            elif output in ["markdown-fit", "md-fit"]:
+                click.echo(result.markdown.fit_markdown)
+        else:
+            if output == "all":
+                with open(output_file, "w") as f:
+                    f.write(json.dumps(result.model_dump(), indent=2))
+            elif output == "json":
+                with open(output_file, "w") as f:
+                    f.write(result.extracted_content)
+            elif output in ["markdown", "md"]:
+                with open(output_file, "w") as f:
+                    f.write(result.markdown.raw_markdown)
+            elif output in ["markdown-fit", "md-fit"]:
+                with open(output_file, "w") as f:
+                    f.write(result.markdown.fit_markdown)
            
    except Exception as e:
        raise click.ClickException(str(e))
@@ -1120,6 +1211,120 @@ def examples_cmd():
    """Show usage examples"""
    show_examples()

+@cli.group("config")
+def config_cmd():
+    """Manage global configuration settings
+    
+    Commands to view and update global configuration settings:
+    - list: Display all current configuration settings
+    - get: Get the value of a specific setting
+    - set: Set the value of a specific setting
+    """
+    # Body intentionally empty: subcommands attach via @config_cmd.command(...).
+
+@config_cmd.command("list")
+def config_list_cmd():
+    """List all configuration settings"""
+    current = get_global_config()
+
+    table = Table(
+        title="Crawl4AI Configuration",
+        show_header=True,
+        header_style="bold cyan",
+        border_style="blue",
+    )
+    for heading, colour in (
+        ("Setting", "cyan"),
+        ("Value", "green"),
+        ("Default", "yellow"),
+        ("Description", "white"),
+    ):
+        table.add_column(heading, style=colour)
+
+    for name, spec in USER_SETTINGS.items():
+        default = spec["default"]
+        value = current.get(name, default)
+        is_bool = spec["type"] == "boolean"
+        if is_bool:
+            # Booleans render lowercase; this branch wins over secret masking.
+            shown = str(value).lower()
+        elif spec.get("secret", False) and value:
+            # Never echo a non-empty secret back to the terminal.
+            shown = "********"
+        else:
+            shown = str(value)
+        default_shown = str(default).lower() if is_bool else str(default)
+        table.add_row(name, shown, default_shown, spec["description"])
+    console.print(table)
+
+@config_cmd.command("get")
+@click.argument("key", required=True)
+def config_get_cmd(key: str):
+    """Get a specific configuration setting"""
+    stored = get_global_config()
+
+    # Setting names are declared upper-case; accept any casing from the user.
+    key = key.upper()
+    spec = USER_SETTINGS.get(key)
+    if spec is None:
+        console.print(f"[red]Error: Unknown setting '{key}'[/red]")
+        return
+
+    # Fall back to the declared default when nothing has been saved.
+    value = stored.get(key, spec["default"])
+
+    # Secrets are masked, but only when the value is non-empty.
+    masked = spec.get("secret", False) and value
+    display_value = "********" if masked else value
+
+    console.print(f"[cyan]{key}[/cyan] = [green]{display_value}[/green]")
+    console.print(f"[dim]Description: {spec['description']}[/dim]")
+
+@config_cmd.command("set")
+@click.argument("key", required=True)
+@click.argument("value", required=True)
+def config_set_cmd(key: str, value: str):
+    """Set a configuration setting"""
+    config = get_global_config()
+
+    # Normalize key to uppercase
+    key = key.upper()
+
+    if key not in USER_SETTINGS:
+        console.print(f"[red]Error: Unknown setting '{key}'[/red]")
+        console.print(f"[yellow]Available settings: {', '.join(USER_SETTINGS.keys())}[/yellow]")
+        return
+
+    setting = USER_SETTINGS[key]
+
+    # Convert the raw CLI string to the declared type, rejecting invalid input before anything is saved
+    if setting["type"] == "boolean":
+        if value.lower() in ["true", "yes", "1", "y"]:
+            typed_value = True
+        elif value.lower() in ["false", "no", "0", "n"]:
+            typed_value = False
+        else:
+            console.print("[red]Error: Invalid boolean value. Use 'true' or 'false'.[/red]")
+            return
+    elif setting["type"] == "string":
+        typed_value = value
+        # Check if the value should be one of the allowed options
+        if "options" in setting and value not in setting["options"]:
+            console.print(f"[red]Error: Value must be one of: {', '.join(setting['options'])}[/red]")
+            return
+    else:
+        # Without this branch an unknown type left typed_value unbound (UnboundLocalError on save).
+        console.print(f"[red]Error: Unsupported setting type '{setting['type']}' for '{key}'[/red]")
+        return
+
+    # Update config
+    config[key] = typed_value
+    save_global_config(config)
+
+    # Mask non-empty secrets so the confirmation never echoes them back
+    display_value = "********" if setting.get("secret", False) and typed_value else typed_value
+    console.print(f"[green]Successfully set[/green] [cyan]{key}[/cyan] = [green]{display_value}[/green]")
+
@cli.command("profiles")
 def profiles_cmd():
    """Manage browser profiles interactively
@@ -1139,6 +1344,7 @@ def profiles_cmd():
@click.option("--crawler-config", "-C", type=click.Path(exists=True), help="Crawler config file (YAML/JSON)")
@click.option("--filter-config", "-f", type=click.Path(exists=True), help="Content filter config file")
@click.option("--extraction-config", "-e", type=click.Path(exists=True), help="Extraction strategy config file")
+@click.option("--json-extract", "-j", is_flag=False, flag_value="", default=None, help="Extract structured data using LLM with optional description")
@click.option("--schema", "-s", type=click.Path(exists=True), help="JSON schema for extraction")
@click.option("--browser", "-b", type=str, callback=parse_key_values, help="Browser parameters as key1=value1,key2=value2")
@click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2")
@@ -1148,7 +1354,7 @@ def profiles_cmd():
@click.option("--verbose", "-v", is_flag=True)
@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
 def default(url: str, example: bool, browser_config: str, crawler_config: str, filter_config: str, 
-        extraction_config: str, schema: str, browser: Dict, crawler: Dict,
+        extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict,
        output: str, bypass_cache: bool, question: str, verbose: bool, profile: str):
    """Crawl4AI CLI - Web content extraction tool

@@ -1162,7 +1368,14 @@ def default(url: str, example: bool, browser_config: str, crawler_config: str, f
        crwl crawl      - Crawl a website with advanced options
        crwl cdp        - Launch browser with CDP debugging enabled
        crwl browser    - Manage builtin browser (start, stop, status, restart)
+        crwl config     - Manage global configuration settings
        crwl examples   - Show more usage examples
+        
+    Configuration Examples:
+        crwl config list                         - List all configuration settings
+        crwl config get DEFAULT_LLM_PROVIDER     - Show current LLM provider
+        crwl config set VERBOSE true             - Enable verbose mode globally
+        crwl config set BROWSER_HEADLESS false   - Default to visible browser
    """

    if example:
@@ -1183,7 +1396,8 @@ def default(url: str, example: bool, browser_config: str, crawler_config: str, f
        browser_config=browser_config,
        crawler_config=crawler_config,
        filter_config=filter_config,
-        extraction_config=extraction_config, 
+        extraction_config=extraction_config,
+        json_extract=json_extract,
        schema=schema,
        browser=browser,
        crawler=crawler,
--- a/crawl4ai/config.py
+++ b/crawl4ai/config.py
@@ -93,3 +93,46 @@ SHOW_DEPRECATION_WARNINGS = True
 SCREENSHOT_HEIGHT_TRESHOLD = 10000
 PAGE_TIMEOUT = 60000
 DOWNLOAD_PAGE_TIMEOUT = 60000
+
+# Global user settings with descriptions and default values
+USER_SETTINGS = {
+    "DEFAULT_LLM_PROVIDER": {
+        "default": "openai/gpt-4o",
+        "description": "Default LLM provider in 'company/model' format (e.g., 'openai/gpt-4o', 'anthropic/claude-3-sonnet')",
+        "type": "string"
+    },
+    "DEFAULT_LLM_PROVIDER_TOKEN": {
+        "default": "",
+        "description": "API token for the default LLM provider",
+        "type": "string",
+        "secret": True
+    },
+    "VERBOSE": {
+        "default": False,
+        "description": "Enable verbose output for all commands",
+        "type": "boolean"
+    },
+    "BROWSER_HEADLESS": {
+        "default": True,
+        "description": "Run browser in headless mode by default",
+        "type": "boolean"
+    },
+    "BROWSER_TYPE": {
+        "default": "chromium",
+        "description": "Default browser type (chromium or firefox)",
+        "type": "string",
+        "options": ["chromium", "firefox"]
+    },
+    "CACHE_MODE": {
+        "default": "bypass",
+        "description": "Default cache mode (bypass, use, or refresh)",
+        "type": "string",
+        "options": ["bypass", "use", "refresh"]
+    },
+    "USER_AGENT_MODE": {
+        "default": "default",
+        "description": "Default user agent mode (default, random, or mobile)",
+        "type": "string",
+        "options": ["default", "random", "mobile"]
+    }
+}
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -5,7 +5,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 import json
 import time

-from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION, JSON_SCHEMA_BUILDER_XPATH
+from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION, JSON_SCHEMA_BUILDER_XPATH, PROMPT_EXTRACT_INFERRED_SCHEMA
 from .config import (
    DEFAULT_PROVIDER, CHUNK_TOKEN_THRESHOLD,
    OVERLAP_RATE,
@@ -507,6 +507,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
        word_token_rate=WORD_TOKEN_RATE,
        apply_chunking=True,
        input_format: str = "markdown",
+        force_json_response=False,
        verbose=False,
        # Deprecated arguments
        provider: str = DEFAULT_PROVIDER,
@@ -527,9 +528,10 @@ class LLMExtractionStrategy(ExtractionStrategy):
            overlap_rate: Overlap between chunks.
            word_token_rate: Word to token conversion rate.
            apply_chunking: Whether to apply chunking.
+            input_format: Content format to use for extraction.
+                            Options: "markdown" (default), "html", "fit_markdown"
+            force_json_response: Whether to force a JSON response from the LLM.
            verbose: Whether to print verbose output.
-            usages: List of individual token usages.
-            total_usage: Accumulated token usage.

            # Deprecated arguments, will be removed very soon
            provider: The provider to use for extraction. It follows the format <provider_name>/<model_name>, e.g., "ollama/llama3.3".
@@ -545,6 +547,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
        self.schema = schema
        if schema:
            self.extract_type = "schema"
+        self.force_json_response = force_json_response
        self.chunk_token_threshold = chunk_token_threshold or CHUNK_TOKEN_THRESHOLD
        self.overlap_rate = overlap_rate
        self.word_token_rate = word_token_rate
@@ -608,64 +611,97 @@ class LLMExtractionStrategy(ExtractionStrategy):
            variable_values["SCHEMA"] = json.dumps(self.schema, indent=2) # if type of self.schema is dict else self.schema
            prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION

+        if self.extract_type == "schema" and not self.schema:
+            prompt_with_variables = PROMPT_EXTRACT_INFERRED_SCHEMA
+
        for variable in variable_values:
            prompt_with_variables = prompt_with_variables.replace(
                "{" + variable + "}", variable_values[variable]
            )

-        response = perform_completion_with_backoff(
-            self.llm_config.provider,
-            prompt_with_variables,
-            self.llm_config.api_token,
-            base_url=self.llm_config.base_url,
-            extra_args=self.extra_args,
-        )  # , json_response=self.extract_type == "schema")
-        # Track usage
-        usage = TokenUsage(
-            completion_tokens=response.usage.completion_tokens,
-            prompt_tokens=response.usage.prompt_tokens,
-            total_tokens=response.usage.total_tokens,
-            completion_tokens_details=response.usage.completion_tokens_details.__dict__
-            if response.usage.completion_tokens_details
-            else {},
-            prompt_tokens_details=response.usage.prompt_tokens_details.__dict__
-            if response.usage.prompt_tokens_details
-            else {},
-        )
-        self.usages.append(usage)
-
-        # Update totals
-        self.total_usage.completion_tokens += usage.completion_tokens
-        self.total_usage.prompt_tokens += usage.prompt_tokens
-        self.total_usage.total_tokens += usage.total_tokens
-
        try:
-            blocks = extract_xml_data(["blocks"], response.choices[0].message.content)[
-                "blocks"
-            ]
-            blocks = json.loads(blocks)
-            for block in blocks:
-                block["error"] = False
-        except Exception:
-            parsed, unparsed = split_and_parse_json_objects(
-                response.choices[0].message.content
+            response = perform_completion_with_backoff(
+                self.llm_config.provider,
+                prompt_with_variables,
+                self.llm_config.api_token,
+                base_url=self.llm_config.base_url,
+                json_response=self.force_json_response,
+                extra_args=self.extra_args,
+            )  # , json_response=self.extract_type == "schema")
+            # Track usage
+            usage = TokenUsage(
+                completion_tokens=response.usage.completion_tokens,
+                prompt_tokens=response.usage.prompt_tokens,
+                total_tokens=response.usage.total_tokens,
+                completion_tokens_details=response.usage.completion_tokens_details.__dict__
+                if response.usage.completion_tokens_details
+                else {},
+                prompt_tokens_details=response.usage.prompt_tokens_details.__dict__
+                if response.usage.prompt_tokens_details
+                else {},
            )
-            blocks = parsed
-            if unparsed:
-                blocks.append(
-                    {"index": 0, "error": True, "tags": ["error"], "content": unparsed}
-                )
+            self.usages.append(usage)

-        if self.verbose:
-            print(
-                "[LOG] Extracted",
-                len(blocks),
-                "blocks from URL:",
-                url,
-                "block index:",
-                ix,
-            )
-        return blocks
+            # Update totals
+            self.total_usage.completion_tokens += usage.completion_tokens
+            self.total_usage.prompt_tokens += usage.prompt_tokens
+            self.total_usage.total_tokens += usage.total_tokens
+
+            # Keep the raw completion text under its own name. Rebinding
+            # `response` to this string made the fallback handler below call
+            # `.choices` on a str, so every parse failure surfaced as an
+            # AttributeError instead of reaching the salvage parser.
+            content = response.choices[0].message.content
+            try:
+                if self.force_json_response:
+                    blocks = json.loads(content)
+                    if isinstance(blocks, dict):
+                        # A single key whose value is a list is a wrapper, e.g. {"news": [..]}: unwrap it.
+                        if len(blocks) == 1 and isinstance(list(blocks.values())[0], list):
+                            blocks = list(blocks.values())[0]
+                        else:
+                            # Any other dict is one extracted item, e.g. { "article_id": "1234", ... }
+                            blocks = [blocks]
+                    # A top-level list is already in the expected shape.
+                else:
+                    # Without forced JSON, the JSON list is read from inside <blocks>...</blocks> tags.
+                    blocks = extract_xml_data(["blocks"], content)["blocks"]
+                    blocks = json.loads(blocks)
+
+                for block in blocks:
+                    block["error"] = False
+            except Exception:
+                # Fallback: split the text into the JSON objects that parse and
+                # an unparsed remainder, which is reported as an error block.
+                parsed, unparsed = split_and_parse_json_objects(content)
+                blocks = parsed
+                if unparsed:
+                    blocks.append(
+                        {"index": 0, "error": True, "tags": ["error"], "content": unparsed}
+                    )
+
+            if self.verbose:
+                print(
+                    "[LOG] Extracted",
+                    len(blocks),
+                    "blocks from URL:",
+                    url,
+                    "block index:",
+                    ix,
+                )
+            return blocks
+        except Exception as e:
+            if self.verbose:
+                print(f"[LOG] Error in LLM extraction: {e}")
+            # Add error information to extracted_content
+            return [
+                {
+                    "index": ix,
+                    "error": True,
+                    "tags": ["error"],
+                    "content": str(e),
+                }
+            ]

    def _merge(self, documents, chunk_token_threshold, overlap) -> List[str]:
        """
--- a/crawl4ai/install.py
+++ b/crawl4ai/install.py
@@ -45,7 +45,8 @@ def post_install():
    setup_home_directory()
    install_playwright()
    run_migration()
-    setup_builtin_browser()
+    # TODO: Will be added in the future
+    # setup_builtin_browser()
    logger.success("Post-installation setup completed!", tag="COMPLETE")
    
 def setup_builtin_browser():
--- a/crawl4ai/prompts.py
+++ b/crawl4ai/prompts.py
@@ -203,6 +203,62 @@ Avoid Common Mistakes:
 Result
 Output the final list of JSON objects, wrapped in <blocks>...</blocks> XML tags. Make sure to close the tag properly."""

+PROMPT_EXTRACT_INFERRED_SCHEMA = """Here is the content from the URL:
+<url>{URL}</url>
+
+<url_content>
+{HTML}
+</url_content>
+
+Please carefully read the URL content and the user's request. Analyze the page structure and infer the most appropriate JSON schema based on the content and request.
+
+Extraction Strategy:
+1. First, determine if the page contains repetitive items (like multiple products, articles, etc.) or a single content item (like a single article or page).
+2. For repetitive items: Identify the common pattern and extract each instance as a separate JSON object in an array.
+3. For single content: Extract the key information into a comprehensive JSON object that captures the essential details.
+
+Extraction instructions:
+Return the extracted information as a list of JSON objects. For repetitive content, each object in the list should correspond to a distinct item. For single content, you may return just one detailed JSON object. Wrap the entire JSON list in <blocks>...</blocks> XML tags.
+
+Schema Design Guidelines:
+- Create meaningful property names that clearly describe the data they contain
+- Use nested objects for hierarchical information
+- Use arrays for lists of related items
+- Include all information requested by the user
+- Maintain consistency in property names and data structures
+- Only include properties that are actually present in the content
+- For dates, prefer ISO format (YYYY-MM-DD)
+- For prices or numeric values, extract them without currency symbols when possible
+
+Quality Reflection:
+Before outputting your final answer, double check that:
+1. The inferred schema makes logical sense for the type of content
+2. All requested information is included
+3. The JSON is valid and could be parsed without errors
+4. Property names are consistent and descriptive
+5. The structure is optimal for the type of data being represented
+
+Avoid Common Mistakes:
+- Do NOT add any comments using "//" or "#" in the JSON output. It causes parsing errors.
+- Make sure the JSON is properly formatted with curly braces, square brackets, and commas in the right places.
+- Do not miss closing </blocks> tag at the end of the JSON output.
+- Do not generate Python code showing how to do the task; this is your task to extract the information and return it in JSON format.
+- Ensure consistency in property names across all objects
+- Don't include empty properties or null values unless they're meaningful
+- For repetitive content, ensure all objects follow the same schema
+
+Important: If user specific instruction is provided, then stress significantly on what user is requesting and describing about the schema of end result (if any). If user is requesting to extract specific information, then focus on that and ignore the rest of the content.
+<user_request>
+{REQUEST}
+</user_request>
+
+Result:
+Output the final list of JSON objects, wrapped in <blocks>...</blocks> XML tags. Make sure to close the tag properly.
+
+DO NOT ADD ANY PRE OR POST COMMENTS. JUST RETURN THE JSON OBJECTS INSIDE <blocks>...</blocks> TAGS.
+
+CRITICAL: The content inside the <blocks> tags MUST be a direct array of JSON objects (starting with '[' and ending with ']'), not a dictionary/object containing an array. For example, use <blocks>[{...}, {...}]</blocks> instead of <blocks>{"items": [{...}, {...}]}</blocks>. This is essential for proper parsing.
+"""

 PROMPT_FILTER_CONTENT = """Your task is to filter and convert HTML content into clean, focused markdown that's optimized for use with LLMs and information retrieval systems.

--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -1551,7 +1551,7 @@ def extract_xml_tags(string):
    return list(set(tags))


-def extract_xml_data(tags, string):
+def extract_xml_data_legacy(tags, string):
    """
    Extract data for specified XML tags from a string.

@@ -1580,6 +1580,38 @@ def extract_xml_data(tags, string):

    return data

+def extract_xml_data(tags, string):
+    """
+    Extract the content of the given XML-style tags from a string.
+
+    Every `<tag>...</tag>` occurrence is located with a non-greedy, DOTALL
+    regex, so content may span lines. When a tag occurs more than once, the
+    occurrence with the longest raw content wins; the result is then stripped
+    of surrounding whitespace. Only attribute-less opening tags are matched.
+
+    Tag names are matched literally (they are escaped before being placed in
+    the pattern), so names containing regex metacharacters such as "." do not
+    match unrelated tags.
+
+    Args:
+        tags (List[str]): The list of XML tags to extract.
+        string (str): The input string containing XML data.
+
+    Returns:
+        Dict[str, str]: One entry per requested tag; "" when the tag is absent.
+    """
+
+    data = {}
+
+    for tag in tags:
+        # Escape the name so it is treated as literal text, not as a pattern.
+        literal_tag = re.escape(tag)
+        matches = re.findall(f"<{literal_tag}>(.*?)</{literal_tag}>", string, re.DOTALL)
+        # max() with default="" covers the no-match case without a branch.
+        data[tag] = max(matches, key=len, default="").strip()
+
+    return data
+

 def perform_completion_with_backoff(
    provider,
@@ -1648,6 +1680,19 @@ def perform_completion_with_backoff(
                        "content": ["Rate limit error. Please try again later."],
                    }
                ]
+        except Exception as e:
+            raise e  # Raise any other exceptions immediately
+            # print("Error during completion request:", str(e))
+            # error_message = e.message
+            # return [
+            #     {
+            #         "index": 0,
+            #         "tags": ["error"],
+            #         "content": [
+            #             f"Error during LLM completion request. {error_message}"
+            #         ],
+            #     }
+            # ]


 def extract_blocks(url, html, provider=DEFAULT_PROVIDER, api_token=None, base_url=None):