Update all documentation to import extraction strategies directly from crawl4ai.

This commit is contained in:
UncleCode
2025-06-10 18:08:27 +08:00
parent cab457e9c7
commit c0fd36982d
43 changed files with 7811 additions and 7803 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -5,7 +5,7 @@ prices, ratings, and other details using CSS selectors.
"""
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
import json

View File

@@ -5,7 +5,7 @@ prices, ratings, and other details using CSS selectors.
"""
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
import json
from playwright.async_api import Page, BrowserContext

View File

@@ -5,7 +5,7 @@ prices, ratings, and other details using CSS selectors.
"""
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
import json

View File

@@ -20,7 +20,7 @@ from pathlib import Path
from typing import List, Dict, Any
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
from crawl4ai.script.c4a_compile import C4ACompiler

View File

@@ -20,7 +20,7 @@ from pathlib import Path
from typing import List, Dict, Any
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
from crawl4ai.script.c4a_compile import C4ACompiler

View File

@@ -12,7 +12,7 @@ import os
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai import LLMConfig
from crawl4ai.extraction_strategy import (
from crawl4ai import (
LLMExtractionStrategy,
JsonCssExtractionStrategy,
JsonXPathExtractionStrategy,

View File

@@ -518,7 +518,7 @@
}
],
"source": [
"from crawl4ai.extraction_strategy import LLMExtractionStrategy\n",
"from crawl4ai import LLMExtractionStrategy\n",
"from pydantic import BaseModel, Field\n",
"import os, json\n",
"\n",
@@ -594,7 +594,7 @@
}
],
"source": [
"from crawl4ai.extraction_strategy import CosineStrategy\n",
"from crawl4ai import CosineStrategy\n",
"\n",
"async def cosine_similarity_extraction():\n",
" async with AsyncWebCrawler() as crawler:\n",

View File

@@ -16,7 +16,7 @@ from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.extraction_strategy import (
from crawl4ai import (
JsonCssExtractionStrategy,
LLMExtractionStrategy,
)
@@ -416,7 +416,7 @@ async def crawl_dynamic_content_pages_method_2():
async def cosine_similarity_extraction():
from crawl4ai.extraction_strategy import CosineStrategy
from crawl4ai import CosineStrategy
crawl_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
extraction_strategy=CosineStrategy(

View File

@@ -16,7 +16,7 @@ from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.extraction_strategy import (
from crawl4ai import (
JsonCssExtractionStrategy,
LLMExtractionStrategy,
)
@@ -416,7 +416,7 @@ async def crawl_dynamic_content_pages_method_2():
async def cosine_similarity_extraction():
from crawl4ai.extraction_strategy import CosineStrategy
from crawl4ai import CosineStrategy
crawl_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
extraction_strategy=CosineStrategy(

View File

@@ -2,7 +2,7 @@ import os
import json
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import *
from crawl4ai.extraction_strategy import *
from crawl4ai import *
from crawl4ai.crawler_strategy import *
url = r"https://marketplace.visualstudio.com/items?itemName=Unclecode.groqopilot"

View File

@@ -18,7 +18,7 @@ from crawl4ai import RoundRobinProxyStrategy
from crawl4ai.content_filter_strategy import LLMContentFilter
from crawl4ai import DefaultMarkdownGenerator
from crawl4ai import LLMConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
from pprint import pprint

View File

@@ -45,7 +45,7 @@ Here's an example of crawling GitHub commits across multiple pages while preserv
```python
from crawl4ai.async_configs import CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
from crawl4ai.cache_context import CacheMode
async def crawl_dynamic_content():

View File

@@ -215,7 +215,7 @@ Below is a snippet combining many parameters:
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
async def main():
# Example schema

View File

@@ -217,7 +217,7 @@ Below is an example hooking it all together:
```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
import json
async def main():

View File

@@ -169,7 +169,7 @@ OverlappingWindowChunking(
```python
from pydantic import BaseModel
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from crawl4ai import LLMExtractionStrategy
from crawl4ai import LLMConfig
# Define schema
@@ -247,7 +247,7 @@ async with AsyncWebCrawler() as crawler:
### CSS Extraction
```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
# Define schema
schema = {

View File

@@ -1701,7 +1701,7 @@ Generated: ${new Date().toISOString()}
import asyncio
import json
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
# The extraction schema generated from your selections
EXTRACTION_SCHEMA = ${schemaJson}
@@ -1782,7 +1782,7 @@ import asyncio
import json
from pathlib import Path
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
# HTML snippet of the selected container element
HTML_SNIPPET = """

View File

@@ -2437,7 +2437,7 @@ Generated: ${timestamp}
import asyncio
from pathlib import Path
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
# C4A Script commands
C4A_SCRIPT = """

View File

@@ -476,7 +476,7 @@ services:
```python
# Method 1: Create config objects and dump to see expected JSON structure
from crawl4ai import BrowserConfig, CrawlerRunConfig, LLMConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy
import json
# Create browser config and see JSON structure

View File

@@ -37,7 +37,7 @@ import json
from pydantic import BaseModel, Field
from typing import List
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from crawl4ai import LLMExtractionStrategy
class SentimentAnalysis(BaseModel):
"""Use LLM when you need semantic understanding"""

View File

@@ -39,7 +39,7 @@ import json
import asyncio
from pathlib import Path
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
async def smart_extraction_workflow():
"""
@@ -176,7 +176,7 @@ social_schema = JsonCssExtractionStrategy.generate_schema(
import json
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
# Manual schema for consistent product pages
simple_schema = {
@@ -342,7 +342,7 @@ asyncio.run(extract_complex_ecommerce())
### XPath Alternative (When CSS Isn't Enough)
```python
from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy
from crawl4ai import JsonXPathExtractionStrategy
# XPath for more complex selections
xpath_schema = {
@@ -387,7 +387,7 @@ strategy = JsonXPathExtractionStrategy(xpath_schema, verbose=True)
import json
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.extraction_strategy import RegexExtractionStrategy
from crawl4ai import RegexExtractionStrategy
async def extract_common_patterns():
# Use built-in patterns for common data types

View File

@@ -1835,7 +1835,7 @@ import json
from pydantic import BaseModel, Field
from typing import List
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from crawl4ai import LLMExtractionStrategy
class SentimentAnalysis(BaseModel):
"""Use LLM when you need semantic understanding"""
@@ -2743,7 +2743,7 @@ import json
import asyncio
from pathlib import Path
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
async def smart_extraction_workflow():
"""
@@ -2880,7 +2880,7 @@ social_schema = JsonCssExtractionStrategy.generate_schema(
import json
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
# Manual schema for consistent product pages
simple_schema = {
@@ -3046,7 +3046,7 @@ asyncio.run(extract_complex_ecommerce())
### XPath Alternative (When CSS Isn't Enough)
```python
from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy
from crawl4ai import JsonXPathExtractionStrategy
# XPath for more complex selections
xpath_schema = {
@@ -3091,7 +3091,7 @@ strategy = JsonXPathExtractionStrategy(xpath_schema, verbose=True)
import json
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.extraction_strategy import RegexExtractionStrategy
from crawl4ai import RegexExtractionStrategy
async def extract_common_patterns():
# Use built-in patterns for common data types
@@ -4711,7 +4711,7 @@ services:
```python
# Method 1: Create config objects and dump to see expected JSON structure
from crawl4ai import BrowserConfig, CrawlerRunConfig, LLMConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy
import json
# Create browser config and see JSON structure
@@ -5792,7 +5792,7 @@ import json
import asyncio
from pathlib import Path
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
async def smart_extraction_workflow():
"""
@@ -5929,7 +5929,7 @@ social_schema = JsonCssExtractionStrategy.generate_schema(
import json
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
# Manual schema for consistent product pages
simple_schema = {
@@ -6095,7 +6095,7 @@ asyncio.run(extract_complex_ecommerce())
### XPath Alternative (When CSS Isn't Enough)
```python
from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy
from crawl4ai import JsonXPathExtractionStrategy
# XPath for more complex selections
xpath_schema = {
@@ -6140,7 +6140,7 @@ strategy = JsonXPathExtractionStrategy(xpath_schema, verbose=True)
import json
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.extraction_strategy import RegexExtractionStrategy
from crawl4ai import RegexExtractionStrategy
async def extract_common_patterns():
# Use built-in patterns for common data types

View File

@@ -1835,7 +1835,7 @@ import json
from pydantic import BaseModel, Field
from typing import List
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from crawl4ai import LLMExtractionStrategy
class SentimentAnalysis(BaseModel):
"""Use LLM when you need semantic understanding"""
@@ -2743,7 +2743,7 @@ import json
import asyncio
from pathlib import Path
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
async def smart_extraction_workflow():
"""
@@ -2880,7 +2880,7 @@ social_schema = JsonCssExtractionStrategy.generate_schema(
import json
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
# Manual schema for consistent product pages
simple_schema = {
@@ -3046,7 +3046,7 @@ asyncio.run(extract_complex_ecommerce())
### XPath Alternative (When CSS Isn't Enough)
```python
from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy
from crawl4ai import JsonXPathExtractionStrategy
# XPath for more complex selections
xpath_schema = {
@@ -3091,7 +3091,7 @@ strategy = JsonXPathExtractionStrategy(xpath_schema, verbose=True)
import json
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.extraction_strategy import RegexExtractionStrategy
from crawl4ai import RegexExtractionStrategy
async def extract_common_patterns():
# Use built-in patterns for common data types
@@ -4711,7 +4711,7 @@ services:
```python
# Method 1: Create config objects and dump to see expected JSON structure
from crawl4ai import BrowserConfig, CrawlerRunConfig, LLMConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy
import json
# Create browser config and see JSON structure
@@ -5792,7 +5792,7 @@ import json
import asyncio
from pathlib import Path
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
async def smart_extraction_workflow():
"""
@@ -5929,7 +5929,7 @@ social_schema = JsonCssExtractionStrategy.generate_schema(
import json
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
# Manual schema for consistent product pages
simple_schema = {
@@ -6095,7 +6095,7 @@ asyncio.run(extract_complex_ecommerce())
### XPath Alternative (When CSS Isn't Enough)
```python
from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy
from crawl4ai import JsonXPathExtractionStrategy
# XPath for more complex selections
xpath_schema = {
@@ -6140,7 +6140,7 @@ strategy = JsonXPathExtractionStrategy(xpath_schema, verbose=True)
import json
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.extraction_strategy import RegexExtractionStrategy
from crawl4ai import RegexExtractionStrategy
async def extract_common_patterns():
# Use built-in patterns for common data types

File diff suppressed because it is too large Load Diff

View File

@@ -334,7 +334,7 @@ asyncio.run(main())
schemas.
```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
from crawl4ai import LLMConfig
llm_config = LLMConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY")
@@ -402,7 +402,7 @@ print(schema)
```python
from crawl4ai import LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from crawl4ai import LLMExtractionStrategy
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
# Example of using LLMConfig with LLMExtractionStrategy

View File

@@ -274,7 +274,7 @@ In a typical scenario, you define **one** `BrowserConfig` for your crawler sessi
```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
async def main():
# 1) Browser config: headless, bigger viewport, no proxy

View File

@@ -191,7 +191,7 @@ You can combine content selection with a more advanced extraction strategy. For
import asyncio
import json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
async def main():
# Minimal schema for repeated items
@@ -243,7 +243,7 @@ import asyncio
import json
from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from crawl4ai import LLMExtractionStrategy
class ArticleData(BaseModel):
headline: str
@@ -288,7 +288,7 @@ Below is a short function that unifies **CSS selection**, **exclusion** logic, a
import asyncio
import json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
async def extract_main_articles(url: str):
schema = {

View File

@@ -138,7 +138,7 @@ If you run a JSON-based extraction strategy (CSS, XPath, LLM, etc.), the structu
import asyncio
import json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
async def main():
schema = {

View File

@@ -296,7 +296,7 @@ if __name__ == "__main__":
Once dynamic content is loaded, you can attach an **`extraction_strategy`** (like `JsonCssExtractionStrategy` or `LLMExtractionStrategy`). For example:
```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
schema = {
"name": "Commits",

View File

@@ -127,7 +127,7 @@ Crawl4AI can also extract structured data (JSON) using CSS or XPath selectors. B
> **New!** Crawl4AI now provides a powerful utility to automatically generate extraction schemas using LLM. This is a one-time cost that gives you a reusable schema for fast, LLM-free extractions:
```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
from crawl4ai import LLMConfig
# Generate a schema (one-time cost)
@@ -157,7 +157,7 @@ Here's a basic extraction example:
import asyncio
import json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
async def main():
schema = {
@@ -212,7 +212,7 @@ import json
import asyncio
from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from crawl4ai import LLMExtractionStrategy
class OpenAIModelFee(BaseModel):
model_name: str = Field(..., description="Name of the OpenAI model.")
@@ -328,7 +328,7 @@ Some sites require multiple “page clicks” or dynamic JavaScript updates. Below
```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
async def extract_structured_data_using_css_extractor():
print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")

View File

@@ -14,7 +14,7 @@ The Cosine Strategy:
## Basic Usage
```python
from crawl4ai.extraction_strategy import CosineStrategy
from crawl4ai import CosineStrategy
strategy = CosineStrategy(
semantic_filter="product reviews", # Target content type

View File

@@ -102,7 +102,7 @@ import json
from pydantic import BaseModel, Field
from typing import List
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from crawl4ai import LLMExtractionStrategy
class Product(BaseModel):
name: str
@@ -219,7 +219,7 @@ import asyncio
from typing import List
from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from crawl4ai import LLMExtractionStrategy
class Entity(BaseModel):
name: str

View File

@@ -38,7 +38,7 @@ Let's begin with a **simple** schema-based extraction using the `JsonCssExtracti
import json
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
async def extract_crypto_prices():
# 1. Define a simple extraction schema
@@ -108,7 +108,7 @@ Below is a short example demonstrating **XPath** extraction plus the **`raw://`*
import json
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy
from crawl4ai import JsonXPathExtractionStrategy
async def extract_crypto_prices_xpath():
# 1. Minimal dummy HTML with some repeating rows
@@ -309,7 +309,7 @@ Key Takeaways:
import json
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy
ecommerce_schema = {
# ... the advanced schema from above ...
@@ -649,7 +649,7 @@ While manually crafting schemas is powerful and precise, Crawl4AI now offers a c
The schema generator is available as a static method on both `JsonCssExtractionStrategy` and `JsonXPathExtractionStrategy`. You can choose between OpenAI's GPT-4 or the open-source Ollama for schema generation:
```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
from crawl4ai import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
from crawl4ai import LLMConfig
# Sample HTML with product information

View File

@@ -149,7 +149,7 @@
"metadata": {},
"outputs": [],
"source": [
"from crawl4ai.extraction_strategy import LLMExtractionStrategy\n",
"from crawl4ai import LLMExtractionStrategy\n",
"from pydantic import BaseModel\n",
"import json, os\n",
"from typing import List\n",