From 4a20d7f7c2054bb735a31d13b23942a7512b07a1 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Tue, 25 Mar 2025 20:30:25 +0800 Subject: [PATCH] feat(cli): add quick JSON extraction and global config management Adds new features to improve user experience and configuration: - Quick JSON extraction with -j flag for direct LLM-based structured data extraction - Global configuration management with 'crwl config' commands - Enhanced LLM extraction with better JSON handling and error management - New user settings for default behaviors (LLM provider, browser settings, etc.) Breaking changes: None --- crawl4ai/cli.py | 198 ++++++++++++++++++++++++++++++-- crawl4ai/config.py | 43 +++++++ crawl4ai/extraction_strategy.py | 142 ++++++++++++++--------- crawl4ai/prompts.py | 56 +++++++++ crawl4ai/utils.py | 47 +++++++- 5 files changed, 424 insertions(+), 62 deletions(-) diff --git a/crawl4ai/cli.py b/crawl4ai/cli.py index 1805ab46..cbd90b0c 100644 --- a/crawl4ai/cli.py +++ b/crawl4ai/cli.py @@ -28,6 +28,7 @@ from crawl4ai import ( DefaultMarkdownGenerator, LLMConfig ) +from crawl4ai.config import USER_SETTINGS from litellm import completion from pathlib import Path @@ -176,8 +177,12 @@ def show_examples(): # CSS-based extraction crwl https://example.com -e extract_css.yml -s css_schema.json -o json - # LLM-based extraction + # LLM-based extraction with config file crwl https://example.com -e extract_llm.yml -s llm_schema.json -o json + + # Quick LLM-based JSON extraction (prompts for LLM provider first time) + crwl https://example.com -j # Auto-extracts structured data + crwl https://example.com -j "Extract product details including name, price, and features" # With specific instructions 3️⃣ Direct Parameters: # Browser settings @@ -279,7 +284,7 @@ llm_schema.json: # Combine configs with direct parameters crwl https://example.com -B browser.yml -b "headless=false,viewport_width=1920" - # Full extraction pipeline + # Full extraction pipeline with config files crwl 
https://example.com \\ -B browser.yml \\ -C crawler.yml \\ @@ -287,6 +292,12 @@ llm_schema.json: -s llm_schema.json \\ -o json \\ -v + + # Quick LLM-based extraction with specific instructions + crwl https://amazon.com/dp/B01DFKC2SO \\ + -j "Extract product title, current price, original price, rating, and all product specifications" \\ + -b "headless=true,viewport_width=1280" \\ + -v # Content filtering with BM25 crwl https://example.com \\ @@ -328,6 +339,14 @@ For more documentation visit: https://github.com/unclecode/crawl4ai - google/gemini-pro See full list of providers: https://docs.litellm.ai/docs/providers + + # Set default LLM provider and token in advance + crwl config set DEFAULT_LLM_PROVIDER "anthropic/claude-3-sonnet" + crwl config set DEFAULT_LLM_PROVIDER_TOKEN "your-api-token-here" + + # Set default browser behavior + crwl config set BROWSER_HEADLESS false # Always show browser window + crwl config set USER_AGENT_MODE random # Use random user agent 9️⃣ Profile Management: # Launch interactive profile manager @@ -984,6 +1003,7 @@ def cdp_cmd(user_data_dir: Optional[str], port: int, browser_type: str, headless @click.option("--crawler-config", "-C", type=click.Path(exists=True), help="Crawler config file (YAML/JSON)") @click.option("--filter-config", "-f", type=click.Path(exists=True), help="Content filter config file") @click.option("--extraction-config", "-e", type=click.Path(exists=True), help="Extraction strategy config file") +@click.option("--json-extract", "-j", is_flag=False, flag_value="", default=None, help="Extract structured data using LLM with optional description") @click.option("--schema", "-s", type=click.Path(exists=True), help="JSON schema for extraction") @click.option("--browser", "-b", type=str, callback=parse_key_values, help="Browser parameters as key1=value1,key2=value2") @click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2") @@ -993,7 +1013,7 @@ def 
cdp_cmd(user_data_dir: Optional[str], port: int, browser_type: str, headless @click.option("--verbose", "-v", is_flag=True) @click.option("--profile", "-p", help="Use a specific browser profile (by name)") def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config: str, - extraction_config: str, schema: str, browser: Dict, crawler: Dict, + extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict, output: str, bypass_cache: bool, question: str, verbose: bool, profile: str): """Crawl a website and extract content @@ -1061,9 +1081,42 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config: threshold=filter_conf.get("threshold", 0.48) ) ) + + # Handle json-extract option (takes precedence over extraction-config) + if json_extract is not None: + # Get LLM provider and token + provider, token = setup_llm_config() + + # Default sophisticated instruction for structured data extraction + default_instruction = """Analyze the web page content and extract structured data as JSON. +If the page contains a list of items with repeated patterns, extract all items in an array. +If the page is an article or contains unique content, extract a comprehensive JSON object with all relevant information. +Look at the content, intention of content, what it offers and find the data item(s) in the page. +Always return valid, properly formatted JSON.""" + + + default_instruction_with_user_query = """Analyze the web page content and extract structured data as JSON, following the below instruction and explanation of schema and always return valid, properly formatted JSON. 
\n\nInstruction:\n\n""" + json_extract + + # Determine instruction based on whether json_extract is empty or has content + instruction = default_instruction_with_user_query if json_extract else default_instruction + + # Create LLM extraction strategy + crawler_cfg.extraction_strategy = LLMExtractionStrategy( + llm_config=LLMConfig(provider=provider, api_token=token), + instruction=instruction, + schema=load_schema_file(schema), # Will be None if no schema is provided + extraction_type="schema", #if schema else "block", + apply_chunking=False, + force_json_response=True, + verbose=verbose, + ) + + # Set output to JSON if not explicitly specified + if output == "all": + output = "json" - # Handle extraction strategy - if extraction_config: + # Handle extraction strategy from config file (only if json-extract wasn't used) + elif extraction_config: extract_conf = load_config_file(extraction_config) schema_data = load_schema_file(schema) @@ -1097,6 +1150,9 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config: # No cache if bypass_cache: crawler_cfg.cache_mode = CacheMode.BYPASS + + browser_cfg.verbose = verbose + crawler_cfg.verbose = verbose # Run crawler result : CrawlResult = anyio.run( @@ -1118,7 +1174,10 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config: if output == "all": click.echo(json.dumps(result.model_dump(), indent=2)) elif output == "json": - click.echo(json.dumps(json.loads(result.extracted_content), indent=2)) + print(result.extracted_content) + extracted_items = json.loads(result.extracted_content) + click.echo(json.dumps(extracted_items, indent=2)) + elif output in ["markdown", "md"]: click.echo(result.markdown.raw_markdown) elif output in ["markdown-fit", "md-fit"]: @@ -1132,6 +1191,120 @@ def examples_cmd(): """Show usage examples""" show_examples() +@cli.group("config") +def config_cmd(): + """Manage global configuration settings + + Commands to view and update global configuration 
settings: + - list: Display all current configuration settings + - get: Get the value of a specific setting + - set: Set the value of a specific setting + """ + pass + +@config_cmd.command("list") +def config_list_cmd(): + """List all configuration settings""" + config = get_global_config() + + table = Table(title="Crawl4AI Configuration", show_header=True, header_style="bold cyan", border_style="blue") + table.add_column("Setting", style="cyan") + table.add_column("Value", style="green") + table.add_column("Default", style="yellow") + table.add_column("Description", style="white") + + for key, setting in USER_SETTINGS.items(): + value = config.get(key, setting["default"]) + + # Handle secret values + display_value = value + if setting.get("secret", False) and value: + display_value = "********" + + # Handle boolean values + if setting["type"] == "boolean": + display_value = str(value).lower() + default_value = str(setting["default"]).lower() + else: + default_value = str(setting["default"]) + + table.add_row( + key, + str(display_value), + default_value, + setting["description"] + ) + + console.print(table) + +@config_cmd.command("get") +@click.argument("key", required=True) +def config_get_cmd(key: str): + """Get a specific configuration setting""" + config = get_global_config() + + # Normalize key to uppercase + key = key.upper() + + if key not in USER_SETTINGS: + console.print(f"[red]Error: Unknown setting '{key}'[/red]") + return + + value = config.get(key, USER_SETTINGS[key]["default"]) + + # Handle secret values + display_value = value + if USER_SETTINGS[key].get("secret", False) and value: + display_value = "********" + + console.print(f"[cyan]{key}[/cyan] = [green]{display_value}[/green]") + console.print(f"[dim]Description: {USER_SETTINGS[key]['description']}[/dim]") + +@config_cmd.command("set") +@click.argument("key", required=True) +@click.argument("value", required=True) +def config_set_cmd(key: str, value: str): + """Set a configuration setting""" + 
config = get_global_config() + + # Normalize key to uppercase + key = key.upper() + + if key not in USER_SETTINGS: + console.print(f"[red]Error: Unknown setting '{key}'[/red]") + console.print(f"[yellow]Available settings: {', '.join(USER_SETTINGS.keys())}[/yellow]") + return + + setting = USER_SETTINGS[key] + + # Type conversion and validation + if setting["type"] == "boolean": + if value.lower() in ["true", "yes", "1", "y"]: + typed_value = True + elif value.lower() in ["false", "no", "0", "n"]: + typed_value = False + else: + console.print(f"[red]Error: Invalid boolean value. Use 'true' or 'false'.[/red]") + return + elif setting["type"] == "string": + typed_value = value + + # Check if the value should be one of the allowed options + if "options" in setting and value not in setting["options"]: + console.print(f"[red]Error: Value must be one of: {', '.join(setting['options'])}[/red]") + return + + # Update config + config[key] = typed_value + save_global_config(config) + + # Handle secret values for display + display_value = typed_value + if setting.get("secret", False) and typed_value: + display_value = "********" + + console.print(f"[green]Successfully set[/green] [cyan]{key}[/cyan] = [green]{display_value}[/green]") + @cli.command("profiles") def profiles_cmd(): """Manage browser profiles interactively @@ -1151,6 +1324,7 @@ def profiles_cmd(): @click.option("--crawler-config", "-C", type=click.Path(exists=True), help="Crawler config file (YAML/JSON)") @click.option("--filter-config", "-f", type=click.Path(exists=True), help="Content filter config file") @click.option("--extraction-config", "-e", type=click.Path(exists=True), help="Extraction strategy config file") +@click.option("--json-extract", "-j", is_flag=False, flag_value="", default=None, help="Extract structured data using LLM with optional description") @click.option("--schema", "-s", type=click.Path(exists=True), help="JSON schema for extraction") @click.option("--browser", "-b", type=str, 
callback=parse_key_values, help="Browser parameters as key1=value1,key2=value2") @click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2") @@ -1160,7 +1334,7 @@ def profiles_cmd(): @click.option("--verbose", "-v", is_flag=True) @click.option("--profile", "-p", help="Use a specific browser profile (by name)") def default(url: str, example: bool, browser_config: str, crawler_config: str, filter_config: str, - extraction_config: str, schema: str, browser: Dict, crawler: Dict, + extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict, output: str, bypass_cache: bool, question: str, verbose: bool, profile: str): """Crawl4AI CLI - Web content extraction tool @@ -1174,7 +1348,14 @@ def default(url: str, example: bool, browser_config: str, crawler_config: str, f crwl crawl - Crawl a website with advanced options crwl cdp - Launch browser with CDP debugging enabled crwl browser - Manage builtin browser (start, stop, status, restart) + crwl config - Manage global configuration settings crwl examples - Show more usage examples + + Configuration Examples: + crwl config list - List all configuration settings + crwl config get DEFAULT_LLM_PROVIDER - Show current LLM provider + crwl config set VERBOSE true - Enable verbose mode globally + crwl config set BROWSER_HEADLESS false - Default to visible browser """ if example: @@ -1195,7 +1376,8 @@ def default(url: str, example: bool, browser_config: str, crawler_config: str, f browser_config=browser_config, crawler_config=crawler_config, filter_config=filter_config, - extraction_config=extraction_config, + extraction_config=extraction_config, + json_extract=json_extract, schema=schema, browser=browser, crawler=crawler, diff --git a/crawl4ai/config.py b/crawl4ai/config.py index 866c7dc0..103dc1b7 100644 --- a/crawl4ai/config.py +++ b/crawl4ai/config.py @@ -93,3 +93,46 @@ SHOW_DEPRECATION_WARNINGS = True SCREENSHOT_HEIGHT_TRESHOLD = 10000 
PAGE_TIMEOUT = 60000 DOWNLOAD_PAGE_TIMEOUT = 60000 + +# Global user settings with descriptions and default values +USER_SETTINGS = { + "DEFAULT_LLM_PROVIDER": { + "default": "openai/gpt-4o", + "description": "Default LLM provider in 'company/model' format (e.g., 'openai/gpt-4o', 'anthropic/claude-3-sonnet')", + "type": "string" + }, + "DEFAULT_LLM_PROVIDER_TOKEN": { + "default": "", + "description": "API token for the default LLM provider", + "type": "string", + "secret": True + }, + "VERBOSE": { + "default": False, + "description": "Enable verbose output for all commands", + "type": "boolean" + }, + "BROWSER_HEADLESS": { + "default": True, + "description": "Run browser in headless mode by default", + "type": "boolean" + }, + "BROWSER_TYPE": { + "default": "chromium", + "description": "Default browser type (chromium or firefox)", + "type": "string", + "options": ["chromium", "firefox"] + }, + "CACHE_MODE": { + "default": "bypass", + "description": "Default cache mode (bypass, use, or refresh)", + "type": "string", + "options": ["bypass", "use", "refresh"] + }, + "USER_AGENT_MODE": { + "default": "default", + "description": "Default user agent mode (default, random, or mobile)", + "type": "string", + "options": ["default", "random", "mobile"] + } +} diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index 0790c8d5..bf4825cc 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -5,7 +5,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed import json import time -from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION, JSON_SCHEMA_BUILDER_XPATH +from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION, JSON_SCHEMA_BUILDER_XPATH, PROMPT_EXTRACT_INFERRED_SCHEMA from .config import ( DEFAULT_PROVIDER, CHUNK_TOKEN_THRESHOLD, OVERLAP_RATE, @@ -507,6 +507,7 @@ class 
LLMExtractionStrategy(ExtractionStrategy): word_token_rate=WORD_TOKEN_RATE, apply_chunking=True, input_format: str = "markdown", + force_json_response=False, verbose=False, # Deprecated arguments provider: str = DEFAULT_PROVIDER, @@ -527,9 +528,10 @@ class LLMExtractionStrategy(ExtractionStrategy): overlap_rate: Overlap between chunks. word_token_rate: Word to token conversion rate. apply_chunking: Whether to apply chunking. + input_format: Content format to use for extraction. + Options: "markdown" (default), "html", "fit_markdown" + force_json_response: Whether to force a JSON response from the LLM. verbose: Whether to print verbose output. - usages: List of individual token usages. - total_usage: Accumulated token usage. # Deprecated arguments, will be removed very soon provider: The provider to use for extraction. It follows the format /, e.g., "ollama/llama3.3". @@ -545,6 +547,7 @@ class LLMExtractionStrategy(ExtractionStrategy): self.schema = schema if schema: self.extract_type = "schema" + self.force_json_response = force_json_response self.chunk_token_threshold = chunk_token_threshold or CHUNK_TOKEN_THRESHOLD self.overlap_rate = overlap_rate self.word_token_rate = word_token_rate @@ -608,64 +611,97 @@ class LLMExtractionStrategy(ExtractionStrategy): variable_values["SCHEMA"] = json.dumps(self.schema, indent=2) # if type of self.schema is dict else self.schema prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION + if self.extract_type == "schema" and not self.schema: + prompt_with_variables = PROMPT_EXTRACT_INFERRED_SCHEMA + for variable in variable_values: prompt_with_variables = prompt_with_variables.replace( "{" + variable + "}", variable_values[variable] ) - response = perform_completion_with_backoff( - self.llm_config.provider, - prompt_with_variables, - self.llm_config.api_token, - base_url=self.llm_config.base_url, - extra_args=self.extra_args, - ) # , json_response=self.extract_type == "schema") - # Track usage - usage = TokenUsage( - 
completion_tokens=response.usage.completion_tokens, - prompt_tokens=response.usage.prompt_tokens, - total_tokens=response.usage.total_tokens, - completion_tokens_details=response.usage.completion_tokens_details.__dict__ - if response.usage.completion_tokens_details - else {}, - prompt_tokens_details=response.usage.prompt_tokens_details.__dict__ - if response.usage.prompt_tokens_details - else {}, - ) - self.usages.append(usage) - - # Update totals - self.total_usage.completion_tokens += usage.completion_tokens - self.total_usage.prompt_tokens += usage.prompt_tokens - self.total_usage.total_tokens += usage.total_tokens - try: - blocks = extract_xml_data(["blocks"], response.choices[0].message.content)[ - "blocks" - ] - blocks = json.loads(blocks) - for block in blocks: - block["error"] = False - except Exception: - parsed, unparsed = split_and_parse_json_objects( - response.choices[0].message.content + response = perform_completion_with_backoff( + self.llm_config.provider, + prompt_with_variables, + self.llm_config.api_token, + base_url=self.llm_config.base_url, + json_response=self.force_json_response, + extra_args=self.extra_args, + ) # , json_response=self.extract_type == "schema") + # Track usage + usage = TokenUsage( + completion_tokens=response.usage.completion_tokens, + prompt_tokens=response.usage.prompt_tokens, + total_tokens=response.usage.total_tokens, + completion_tokens_details=response.usage.completion_tokens_details.__dict__ + if response.usage.completion_tokens_details + else {}, + prompt_tokens_details=response.usage.prompt_tokens_details.__dict__ + if response.usage.prompt_tokens_details + else {}, ) - blocks = parsed - if unparsed: - blocks.append( - {"index": 0, "error": True, "tags": ["error"], "content": unparsed} - ) + self.usages.append(usage) - if self.verbose: - print( - "[LOG] Extracted", - len(blocks), - "blocks from URL:", - url, - "block index:", - ix, - ) - return blocks + # Update totals + self.total_usage.completion_tokens += 
usage.completion_tokens + self.total_usage.prompt_tokens += usage.prompt_tokens + self.total_usage.total_tokens += usage.total_tokens + + try: + response = response.choices[0].message.content + blocks = None + + if self.force_json_response: + blocks = json.loads(response) + if isinstance(blocks, dict): + # If it has only one key which value is list then assign that to blocks, example: {"news": [..]} + if len(blocks) == 1 and isinstance(list(blocks.values())[0], list): + blocks = list(blocks.values())[0] + else: + # If it has only one key which value is not list then assign that to blocks, example: { "article_id": "1234", ... } + blocks = [blocks] + elif isinstance(blocks, list): + # If it is a list then assign that to blocks + blocks = blocks + else: + # blocks = extract_xml_data(["blocks"], response.choices[0].message.content)["blocks"] + blocks = extract_xml_data(["blocks"], response)["blocks"] + blocks = json.loads(blocks) + + for block in blocks: + block["error"] = False + except Exception: + parsed, unparsed = split_and_parse_json_objects( + response.choices[0].message.content + ) + blocks = parsed + if unparsed: + blocks.append( + {"index": 0, "error": True, "tags": ["error"], "content": unparsed} + ) + + if self.verbose: + print( + "[LOG] Extracted", + len(blocks), + "blocks from URL:", + url, + "block index:", + ix, + ) + return blocks + except Exception as e: + if self.verbose: + print(f"[LOG] Error in LLM extraction: {e}") + # Add error information to extracted_content + return [ + { + "index": ix, + "error": True, + "tags": ["error"], + "content": str(e), + } + ] def _merge(self, documents, chunk_token_threshold, overlap) -> List[str]: """ diff --git a/crawl4ai/prompts.py b/crawl4ai/prompts.py index 59e9baf0..84ffea88 100644 --- a/crawl4ai/prompts.py +++ b/crawl4ai/prompts.py @@ -203,6 +203,62 @@ Avoid Common Mistakes: Result Output the final list of JSON objects, wrapped in ... XML tags. 
Make sure to close the tag properly.""" +PROMPT_EXTRACT_INFERRED_SCHEMA = """Here is the content from the URL: +{URL} + + +{HTML} + + +Please carefully read the URL content and the user's request. Analyze the page structure and infer the most appropriate JSON schema based on the content and request. + +Extraction Strategy: +1. First, determine if the page contains repetitive items (like multiple products, articles, etc.) or a single content item (like a single article or page). +2. For repetitive items: Identify the common pattern and extract each instance as a separate JSON object in an array. +3. For single content: Extract the key information into a comprehensive JSON object that captures the essential details. + +Extraction instructions: +Return the extracted information as a list of JSON objects. For repetitive content, each object in the list should correspond to a distinct item. For single content, you may return just one detailed JSON object. Wrap the entire JSON list in <blocks>...</blocks> XML tags. + +Schema Design Guidelines: +- Create meaningful property names that clearly describe the data they contain +- Use nested objects for hierarchical information +- Use arrays for lists of related items +- Include all information requested by the user +- Maintain consistency in property names and data structures +- Only include properties that are actually present in the content +- For dates, prefer ISO format (YYYY-MM-DD) +- For prices or numeric values, extract them without currency symbols when possible + +Quality Reflection: +Before outputting your final answer, double check that: +1. The inferred schema makes logical sense for the type of content +2. All requested information is included +3. The JSON is valid and could be parsed without errors +4. Property names are consistent and descriptive +5. The structure is optimal for the type of data being represented + +Avoid Common Mistakes: +- Do NOT add any comments using "//" or "#" in the JSON output. It causes parsing errors. 
+- Make sure the JSON is properly formatted with curly braces, square brackets, and commas in the right places. +- Do not miss closing </blocks> tag at the end of the JSON output. +- Do not generate Python code showing how to do the task; this is your task to extract the information and return it in JSON format. +- Ensure consistency in property names across all objects +- Don't include empty properties or null values unless they're meaningful +- For repetitive content, ensure all objects follow the same schema + +Important: If user specific instruction is provided, then stress significantly on what user is requesting and describing about the schema of end result (if any). If user is requesting to extract specific information, then focus on that and ignore the rest of the content. + +{REQUEST} + + +Result: +Output the final list of JSON objects, wrapped in <blocks>...</blocks> XML tags. Make sure to close the tag properly. + +DO NOT ADD ANY PRE OR POST COMMENTS. JUST RETURN THE JSON OBJECTS INSIDE <blocks>...</blocks> TAGS. + +CRITICAL: The content inside the <blocks> tags MUST be a direct array of JSON objects (starting with '[' and ending with ']'), not a dictionary/object containing an array. For example, use [{...}, {...}] instead of {"items": [{...}, {...}]}. This is essential for proper parsing. +""" PROMPT_FILTER_CONTENT = """Your task is to filter and convert HTML content into clean, focused markdown that's optimized for use with LLMs and information retrieval systems. diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index acaf7933..02d105a9 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -1551,7 +1551,7 @@ def extract_xml_tags(string): return list(set(tags)) -def extract_xml_data(tags, string): +def extract_xml_data_legacy(tags, string): """ Extract data for specified XML tags from a string. 
@@ -1580,6 +1580,38 @@ def extract_xml_data(tags, string): return data +def extract_xml_data(tags, string): + """ + Extract data for specified XML tags from a string, returning the longest content for each tag. + + How it works: + 1. Finds all occurrences of each tag in the string using regex. + 2. For each tag, selects the occurrence with the longest content. + 3. Returns a dictionary of tag-content pairs. + + Args: + tags (List[str]): The list of XML tags to extract. + string (str): The input string containing XML data. + + Returns: + Dict[str, str]: A dictionary with tag names as keys and longest extracted content as values. + """ + + data = {} + + for tag in tags: + pattern = f"<{tag}>(.*?)</{tag}>" + matches = re.findall(pattern, string, re.DOTALL) + + if matches: + # Find the longest content for this tag + longest_content = max(matches, key=len).strip() + data[tag] = longest_content + else: + data[tag] = "" + + return data + def perform_completion_with_backoff( provider, @@ -1648,6 +1680,19 @@ def perform_completion_with_backoff( "content": ["Rate limit error. Please try again later."], } ] + except Exception as e: + raise e # Raise any other exceptions immediately + # print("Error during completion request:", str(e)) + # error_message = e.message + # return [ + # { + # "index": 0, + # "tags": ["error"], + # "content": [ + # f"Error during LLM completion request. {error_message}" + # ], + # } + # ] def extract_blocks(url, html, provider=DEFAULT_PROVIDER, api_token=None, base_url=None):