Merge branch 'next' into 2025-MAR-ALPHA-1

This commit is contained in:
Aravind Karnam
2025-03-13 10:42:22 +05:30
67 changed files with 4194 additions and 729 deletions

View File

@@ -76,7 +76,7 @@ Below is an overview of important LLM extraction parameters. All are typically s
```python
extraction_strategy = LLMExtractionStrategy(
llmConfig = LlmConfig(provider="openai/gpt-4", api_token="YOUR_OPENAI_KEY"),
llm_config = LLMConfig(provider="openai/gpt-4", api_token="YOUR_OPENAI_KEY"),
schema=MyModel.model_json_schema(),
extraction_type="schema",
instruction="Extract a list of items from the text with 'name' and 'price' fields.",
@@ -101,7 +101,7 @@ import asyncio
import json
from pydantic import BaseModel, Field
from typing import List
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LlmConfig
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy
class Product(BaseModel):
@@ -111,7 +111,7 @@ class Product(BaseModel):
async def main():
# 1. Define the LLM extraction strategy
llm_strategy = LLMExtractionStrategy(
llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv('OPENAI_API_KEY')),
llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv('OPENAI_API_KEY')),
schema=Product.schema_json(), # Or use model_json_schema()
extraction_type="schema",
instruction="Extract all product objects with 'name' and 'price' from the content.",

View File

@@ -415,7 +415,7 @@ The schema generator is available as a static method on both `JsonCssExtractionS
```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
from crawl4ai.async_configs import LlmConfig
from crawl4ai import LLMConfig
# Sample HTML with product information
html = """
@@ -435,14 +435,14 @@ html = """
css_schema = JsonCssExtractionStrategy.generate_schema(
html,
schema_type="css",
llmConfig = LlmConfig(provider="openai/gpt-4o",api_token="your-openai-token")
llm_config = LLMConfig(provider="openai/gpt-4o",api_token="your-openai-token")
)
# Option 2: Using Ollama (open source, no token needed)
xpath_schema = JsonXPathExtractionStrategy.generate_schema(
html,
schema_type="xpath",
llmConfig = LlmConfig(provider="ollama/llama3.3", api_token=None) # Not needed for Ollama
llm_config = LLMConfig(provider="ollama/llama3.3", api_token=None) # Not needed for Ollama
)
# Use the generated schema for fast, repeated extractions