feat(cli): add quick JSON extraction and global config management

Adds new features to improve user experience and configuration:
- Quick JSON extraction with -j flag for direct LLM-based structured data extraction
- Global configuration management with 'crwl config' commands
- Enhanced LLM extraction with better JSON handling and error management
- New user settings for default behaviors (LLM provider, browser settings, etc.)

Breaking changes: None
This commit is contained in:
UncleCode
2025-03-25 20:30:25 +08:00
parent 6405cf0a6f
commit 4a20d7f7c2
5 changed files with 424 additions and 62 deletions

View File

@@ -5,7 +5,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
import json
import time
from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION, JSON_SCHEMA_BUILDER_XPATH
from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION, JSON_SCHEMA_BUILDER_XPATH, PROMPT_EXTRACT_INFERRED_SCHEMA
from .config import (
DEFAULT_PROVIDER, CHUNK_TOKEN_THRESHOLD,
OVERLAP_RATE,
@@ -507,6 +507,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
word_token_rate=WORD_TOKEN_RATE,
apply_chunking=True,
input_format: str = "markdown",
force_json_response=False,
verbose=False,
# Deprecated arguments
provider: str = DEFAULT_PROVIDER,
@@ -527,9 +528,10 @@ class LLMExtractionStrategy(ExtractionStrategy):
overlap_rate: Overlap between chunks.
word_token_rate: Word to token conversion rate.
apply_chunking: Whether to apply chunking.
input_format: Content format to use for extraction.
Options: "markdown" (default), "html", "fit_markdown"
force_json_response: Whether to force a JSON response from the LLM.
verbose: Whether to print verbose output.
usages: List of individual token usages.
total_usage: Accumulated token usage.
# Deprecated arguments, will be removed very soon
provider: The provider to use for extraction. It follows the format <provider_name>/<model_name>, e.g., "ollama/llama3.3".
@@ -545,6 +547,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
self.schema = schema
if schema:
self.extract_type = "schema"
self.force_json_response = force_json_response
self.chunk_token_threshold = chunk_token_threshold or CHUNK_TOKEN_THRESHOLD
self.overlap_rate = overlap_rate
self.word_token_rate = word_token_rate
@@ -608,64 +611,97 @@ class LLMExtractionStrategy(ExtractionStrategy):
variable_values["SCHEMA"] = json.dumps(self.schema, indent=2) # if type of self.schema is dict else self.schema
prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION
if self.extract_type == "schema" and not self.schema:
prompt_with_variables = PROMPT_EXTRACT_INFERRED_SCHEMA
for variable in variable_values:
prompt_with_variables = prompt_with_variables.replace(
"{" + variable + "}", variable_values[variable]
)
response = perform_completion_with_backoff(
self.llm_config.provider,
prompt_with_variables,
self.llm_config.api_token,
base_url=self.llm_config.base_url,
extra_args=self.extra_args,
) # , json_response=self.extract_type == "schema")
# Track usage
usage = TokenUsage(
completion_tokens=response.usage.completion_tokens,
prompt_tokens=response.usage.prompt_tokens,
total_tokens=response.usage.total_tokens,
completion_tokens_details=response.usage.completion_tokens_details.__dict__
if response.usage.completion_tokens_details
else {},
prompt_tokens_details=response.usage.prompt_tokens_details.__dict__
if response.usage.prompt_tokens_details
else {},
)
self.usages.append(usage)
# Update totals
self.total_usage.completion_tokens += usage.completion_tokens
self.total_usage.prompt_tokens += usage.prompt_tokens
self.total_usage.total_tokens += usage.total_tokens
try:
blocks = extract_xml_data(["blocks"], response.choices[0].message.content)[
"blocks"
]
blocks = json.loads(blocks)
for block in blocks:
block["error"] = False
except Exception:
parsed, unparsed = split_and_parse_json_objects(
response.choices[0].message.content
response = perform_completion_with_backoff(
self.llm_config.provider,
prompt_with_variables,
self.llm_config.api_token,
base_url=self.llm_config.base_url,
json_response=self.force_json_response,
extra_args=self.extra_args,
) # , json_response=self.extract_type == "schema")
# Track usage
usage = TokenUsage(
completion_tokens=response.usage.completion_tokens,
prompt_tokens=response.usage.prompt_tokens,
total_tokens=response.usage.total_tokens,
completion_tokens_details=response.usage.completion_tokens_details.__dict__
if response.usage.completion_tokens_details
else {},
prompt_tokens_details=response.usage.prompt_tokens_details.__dict__
if response.usage.prompt_tokens_details
else {},
)
blocks = parsed
if unparsed:
blocks.append(
{"index": 0, "error": True, "tags": ["error"], "content": unparsed}
)
self.usages.append(usage)
if self.verbose:
print(
"[LOG] Extracted",
len(blocks),
"blocks from URL:",
url,
"block index:",
ix,
)
return blocks
# Update totals
self.total_usage.completion_tokens += usage.completion_tokens
self.total_usage.prompt_tokens += usage.prompt_tokens
self.total_usage.total_tokens += usage.total_tokens
try:
response = response.choices[0].message.content
blocks = None
if self.force_json_response:
blocks = json.loads(response)
if isinstance(blocks, dict):
# If it has only one key which calue is list then assign that to blocks, exampled: {"news": [..]}
if len(blocks) == 1 and isinstance(list(blocks.values())[0], list):
blocks = list(blocks.values())[0]
else:
# If it has only one key which value is not list then assign that to blocks, exampled: { "article_id": "1234", ... }
blocks = [blocks]
elif isinstance(blocks, list):
# If it is a list then assign that to blocks
blocks = blocks
else:
# blocks = extract_xml_data(["blocks"], response.choices[0].message.content)["blocks"]
blocks = extract_xml_data(["blocks"], response)["blocks"]
blocks = json.loads(blocks)
for block in blocks:
block["error"] = False
except Exception:
parsed, unparsed = split_and_parse_json_objects(
response.choices[0].message.content
)
blocks = parsed
if unparsed:
blocks.append(
{"index": 0, "error": True, "tags": ["error"], "content": unparsed}
)
if self.verbose:
print(
"[LOG] Extracted",
len(blocks),
"blocks from URL:",
url,
"block index:",
ix,
)
return blocks
except Exception as e:
if self.verbose:
print(f"[LOG] Error in LLM extraction: {e}")
# Add error information to extracted_content
return [
{
"index": ix,
"error": True,
"tags": ["error"],
"content": str(e),
}
]
def _merge(self, documents, chunk_token_threshold, overlap) -> List[str]:
"""