Update all documentation to import extraction strategies directly from crawl4ai.

This commit is contained in:
UncleCode
2025-06-10 18:08:27 +08:00
parent cab457e9c7
commit c0fd36982d
43 changed files with 7811 additions and 7803 deletions

View File

@@ -352,7 +352,7 @@ if __name__ == "__main__":
```python ```python
import asyncio import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
import json import json
async def main(): async def main():
@@ -426,7 +426,7 @@ if __name__ == "__main__":
import os import os
import asyncio import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy from crawl4ai import LLMExtractionStrategy
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
class OpenAIModelFee(BaseModel): class OpenAIModelFee(BaseModel):

View File

@@ -1,7 +1,7 @@
from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig, CacheMode from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.hub import BaseCrawler from crawl4ai.hub import BaseCrawler
from crawl4ai.utils import optimize_html, get_home_folder, preprocess_html_for_schema from crawl4ai.utils import optimize_html, get_home_folder, preprocess_html_for_schema
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
from pathlib import Path from pathlib import Path
import json import json
import os import os

View File

@@ -7901,7 +7901,7 @@ from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import PruningContentFilter from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.extraction_strategy import ( from crawl4ai import (
JsonCssExtractionStrategy, JsonCssExtractionStrategy,
LLMExtractionStrategy, LLMExtractionStrategy,
) )
@@ -8301,7 +8301,7 @@ async def crawl_dynamic_content_pages_method_2():
async def cosine_similarity_extraction(): async def cosine_similarity_extraction():
from crawl4ai.extraction_strategy import CosineStrategy from crawl4ai import CosineStrategy
crawl_config = CrawlerRunConfig( crawl_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS, cache_mode=CacheMode.BYPASS,
extraction_strategy=CosineStrategy( extraction_strategy=CosineStrategy(

View File

@@ -354,7 +354,7 @@ In a typical scenario, you define **one** `BrowserConfig` for your crawler sessi
```python ```python
import asyncio import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
async def main(): async def main():
# 1) Browser config: headless, bigger viewport, no proxy # 1) Browser config: headless, bigger viewport, no proxy
@@ -1042,7 +1042,7 @@ You can combine content selection with a more advanced extraction strategy. For
import asyncio import asyncio
import json import json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
async def main(): async def main():
# Minimal schema for repeated items # Minimal schema for repeated items
@@ -1094,7 +1094,7 @@ import asyncio
import json import json
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy from crawl4ai import LLMExtractionStrategy
class ArticleData(BaseModel): class ArticleData(BaseModel):
headline: str headline: str
@@ -1139,7 +1139,7 @@ Below is a short function that unifies **CSS selection**, **exclusion** logic, a
import asyncio import asyncio
import json import json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
async def extract_main_articles(url: str): async def extract_main_articles(url: str):
schema = { schema = {
@@ -1488,7 +1488,7 @@ If you run a JSON-based extraction strategy (CSS, XPath, LLM, etc.), the structu
import asyncio import asyncio
import json import json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
async def main(): async def main():
schema = { schema = {
@@ -4722,7 +4722,7 @@ if __name__ == "__main__":
Once dynamic content is loaded, you can attach an **`extraction_strategy`** (like `JsonCssExtractionStrategy` or `LLMExtractionStrategy`). For example: Once dynamic content is loaded, you can attach an **`extraction_strategy`** (like `JsonCssExtractionStrategy` or `LLMExtractionStrategy`). For example:
```python ```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
schema = { schema = {
"name": "Commits", "name": "Commits",
@@ -4902,7 +4902,7 @@ Crawl4AI can also extract structured data (JSON) using CSS or XPath selectors. B
> **New!** Crawl4AI now provides a powerful utility to automatically generate extraction schemas using LLM. This is a one-time cost that gives you a reusable schema for fast, LLM-free extractions: > **New!** Crawl4AI now provides a powerful utility to automatically generate extraction schemas using LLM. This is a one-time cost that gives you a reusable schema for fast, LLM-free extractions:
```python ```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
from crawl4ai import LLMConfig from crawl4ai import LLMConfig
# Generate a schema (one-time cost) # Generate a schema (one-time cost)
@@ -4932,7 +4932,7 @@ Here's a basic extraction example:
import asyncio import asyncio
import json import json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
async def main(): async def main():
schema = { schema = {
@@ -4987,7 +4987,7 @@ import json
import asyncio import asyncio
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy from crawl4ai import LLMExtractionStrategy
class OpenAIModelFee(BaseModel): class OpenAIModelFee(BaseModel):
model_name: str = Field(..., description="Name of the OpenAI model.") model_name: str = Field(..., description="Name of the OpenAI model.")
@@ -5103,7 +5103,7 @@ Some sites require multiple “page clicks” or dynamic JavaScript updates. Bel
```python ```python
import asyncio import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
async def extract_structured_data_using_css_extractor(): async def extract_structured_data_using_css_extractor():
print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---") print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
@@ -7300,7 +7300,7 @@ Here's an example of crawling GitHub commits across multiple pages while preserv
```python ```python
from crawl4ai.async_configs import CrawlerRunConfig from crawl4ai.async_configs import CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
from crawl4ai.cache_context import CacheMode from crawl4ai.cache_context import CacheMode
async def crawl_dynamic_content(): async def crawl_dynamic_content():
@@ -7850,7 +7850,7 @@ The Cosine Strategy:
## Basic Usage ## Basic Usage
```python ```python
from crawl4ai.extraction_strategy import CosineStrategy from crawl4ai import CosineStrategy
strategy = CosineStrategy( strategy = CosineStrategy(
semantic_filter="product reviews", # Target content type semantic_filter="product reviews", # Target content type
@@ -8161,7 +8161,7 @@ import json
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from typing import List from typing import List
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy from crawl4ai import LLMExtractionStrategy
class Product(BaseModel): class Product(BaseModel):
name: str name: str
@@ -8278,7 +8278,7 @@ import asyncio
from typing import List from typing import List
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import LLMExtractionStrategy from crawl4ai import LLMExtractionStrategy
class Entity(BaseModel): class Entity(BaseModel):
name: str name: str
@@ -8423,7 +8423,7 @@ Let's begin with a **simple** schema-based extraction using the `JsonCssExtrac
import json import json
import asyncio import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
async def extract_crypto_prices(): async def extract_crypto_prices():
# 1. Define a simple extraction schema # 1. Define a simple extraction schema
@@ -8493,7 +8493,7 @@ Below is a short example demonstrating **XPath** extraction plus the **`raw://`*
import json import json
import asyncio import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy from crawl4ai import JsonXPathExtractionStrategy
async def extract_crypto_prices_xpath(): async def extract_crypto_prices_xpath():
# 1. Minimal dummy HTML with some repeating rows # 1. Minimal dummy HTML with some repeating rows
@@ -8694,7 +8694,7 @@ Key Takeaways:
import json import json
import asyncio import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
ecommerce_schema = { ecommerce_schema = {
# ... the advanced schema from above ... # ... the advanced schema from above ...
@@ -8804,7 +8804,7 @@ While manually crafting schemas is powerful and precise, Crawl4AI now offers a c
The schema generator is available as a static method on both `JsonCssExtractionStrategy` and `JsonXPathExtractionStrategy`. You can choose between OpenAI's GPT-4 or the open-source Ollama for schema generation: The schema generator is available as a static method on both `JsonCssExtractionStrategy` and `JsonXPathExtractionStrategy`. You can choose between OpenAI's GPT-4 or the open-source Ollama for schema generation:
```python ```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy from crawl4ai import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
from crawl4ai import LLMConfig from crawl4ai import LLMConfig
# Sample HTML with product information # Sample HTML with product information

File diff suppressed because it is too large Load Diff

View File

@@ -5,7 +5,7 @@ prices, ratings, and other details using CSS selectors.
""" """
from crawl4ai import AsyncWebCrawler from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
import json import json

View File

@@ -5,7 +5,7 @@ prices, ratings, and other details using CSS selectors.
""" """
from crawl4ai import AsyncWebCrawler, CacheMode from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
import json import json
from playwright.async_api import Page, BrowserContext from playwright.async_api import Page, BrowserContext

View File

@@ -5,7 +5,7 @@ prices, ratings, and other details using CSS selectors.
""" """
from crawl4ai import AsyncWebCrawler, CacheMode from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
import json import json

View File

@@ -20,7 +20,7 @@ from pathlib import Path
from typing import List, Dict, Any from typing import List, Dict, Any
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
from crawl4ai.script.c4a_compile import C4ACompiler from crawl4ai.script.c4a_compile import C4ACompiler

View File

@@ -20,7 +20,7 @@ from pathlib import Path
from typing import List, Dict, Any from typing import List, Dict, Any
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
from crawl4ai.script.c4a_compile import C4ACompiler from crawl4ai.script.c4a_compile import C4ACompiler

View File

@@ -12,7 +12,7 @@ import os
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai import LLMConfig from crawl4ai import LLMConfig
from crawl4ai.extraction_strategy import ( from crawl4ai import (
LLMExtractionStrategy, LLMExtractionStrategy,
JsonCssExtractionStrategy, JsonCssExtractionStrategy,
JsonXPathExtractionStrategy, JsonXPathExtractionStrategy,

View File

@@ -518,7 +518,7 @@
} }
], ],
"source": [ "source": [
"from crawl4ai.extraction_strategy import LLMExtractionStrategy\n", "from crawl4ai import LLMExtractionStrategy\n",
"from pydantic import BaseModel, Field\n", "from pydantic import BaseModel, Field\n",
"import os, json\n", "import os, json\n",
"\n", "\n",
@@ -594,7 +594,7 @@
} }
], ],
"source": [ "source": [
"from crawl4ai.extraction_strategy import CosineStrategy\n", "from crawl4ai import CosineStrategy\n",
"\n", "\n",
"async def cosine_similarity_extraction():\n", "async def cosine_similarity_extraction():\n",
" async with AsyncWebCrawler() as crawler:\n", " async with AsyncWebCrawler() as crawler:\n",

View File

@@ -16,7 +16,7 @@ from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import PruningContentFilter from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.extraction_strategy import ( from crawl4ai import (
JsonCssExtractionStrategy, JsonCssExtractionStrategy,
LLMExtractionStrategy, LLMExtractionStrategy,
) )
@@ -416,7 +416,7 @@ async def crawl_dynamic_content_pages_method_2():
async def cosine_similarity_extraction(): async def cosine_similarity_extraction():
from crawl4ai.extraction_strategy import CosineStrategy from crawl4ai import CosineStrategy
crawl_config = CrawlerRunConfig( crawl_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS, cache_mode=CacheMode.BYPASS,
extraction_strategy=CosineStrategy( extraction_strategy=CosineStrategy(

View File

@@ -16,7 +16,7 @@ from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import PruningContentFilter from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.extraction_strategy import ( from crawl4ai import (
JsonCssExtractionStrategy, JsonCssExtractionStrategy,
LLMExtractionStrategy, LLMExtractionStrategy,
) )
@@ -416,7 +416,7 @@ async def crawl_dynamic_content_pages_method_2():
async def cosine_similarity_extraction(): async def cosine_similarity_extraction():
from crawl4ai.extraction_strategy import CosineStrategy from crawl4ai import CosineStrategy
crawl_config = CrawlerRunConfig( crawl_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS, cache_mode=CacheMode.BYPASS,
extraction_strategy=CosineStrategy( extraction_strategy=CosineStrategy(

View File

@@ -2,7 +2,7 @@ import os
import json import json
from crawl4ai.web_crawler import WebCrawler from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import * from crawl4ai.chunking_strategy import *
from crawl4ai.extraction_strategy import * from crawl4ai import *
from crawl4ai.crawler_strategy import * from crawl4ai.crawler_strategy import *
url = r"https://marketplace.visualstudio.com/items?itemName=Unclecode.groqopilot" url = r"https://marketplace.visualstudio.com/items?itemName=Unclecode.groqopilot"

View File

@@ -18,7 +18,7 @@ from crawl4ai import RoundRobinProxyStrategy
from crawl4ai.content_filter_strategy import LLMContentFilter from crawl4ai.content_filter_strategy import LLMContentFilter
from crawl4ai import DefaultMarkdownGenerator from crawl4ai import DefaultMarkdownGenerator
from crawl4ai import LLMConfig from crawl4ai import LLMConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
from pprint import pprint from pprint import pprint

View File

@@ -45,7 +45,7 @@ Here's an example of crawling GitHub commits across multiple pages while preserv
```python ```python
from crawl4ai.async_configs import CrawlerRunConfig from crawl4ai.async_configs import CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
from crawl4ai.cache_context import CacheMode from crawl4ai.cache_context import CacheMode
async def crawl_dynamic_content(): async def crawl_dynamic_content():

View File

@@ -215,7 +215,7 @@ Below is a snippet combining many parameters:
```python ```python
import asyncio import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
async def main(): async def main():
# Example schema # Example schema

View File

@@ -217,7 +217,7 @@ Below is an example hooking it all together:
```python ```python
import asyncio import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
import json import json
async def main(): async def main():

View File

@@ -169,7 +169,7 @@ OverlappingWindowChunking(
```python ```python
from pydantic import BaseModel from pydantic import BaseModel
from crawl4ai.extraction_strategy import LLMExtractionStrategy from crawl4ai import LLMExtractionStrategy
from crawl4ai import LLMConfig from crawl4ai import LLMConfig
# Define schema # Define schema
@@ -247,7 +247,7 @@ async with AsyncWebCrawler() as crawler:
### CSS Extraction ### CSS Extraction
```python ```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
# Define schema # Define schema
schema = { schema = {

View File

@@ -1701,7 +1701,7 @@ Generated: ${new Date().toISOString()}
import asyncio import asyncio
import json import json
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
# The extraction schema generated from your selections # The extraction schema generated from your selections
EXTRACTION_SCHEMA = ${schemaJson} EXTRACTION_SCHEMA = ${schemaJson}
@@ -1782,7 +1782,7 @@ import asyncio
import json import json
from pathlib import Path from pathlib import Path
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
# HTML snippet of the selected container element # HTML snippet of the selected container element
HTML_SNIPPET = """ HTML_SNIPPET = """

View File

@@ -2437,7 +2437,7 @@ Generated: ${timestamp}
import asyncio import asyncio
from pathlib import Path from pathlib import Path
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
# C4A Script commands # C4A Script commands
C4A_SCRIPT = """ C4A_SCRIPT = """

View File

@@ -476,7 +476,7 @@ services:
```python ```python
# Method 1: Create config objects and dump to see expected JSON structure # Method 1: Create config objects and dump to see expected JSON structure
from crawl4ai import BrowserConfig, CrawlerRunConfig, LLMConfig, CacheMode from crawl4ai import BrowserConfig, CrawlerRunConfig, LLMConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy
import json import json
# Create browser config and see JSON structure # Create browser config and see JSON structure

View File

@@ -37,7 +37,7 @@ import json
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from typing import List from typing import List
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy from crawl4ai import LLMExtractionStrategy
class SentimentAnalysis(BaseModel): class SentimentAnalysis(BaseModel):
"""Use LLM when you need semantic understanding""" """Use LLM when you need semantic understanding"""

View File

@@ -39,7 +39,7 @@ import json
import asyncio import asyncio
from pathlib import Path from pathlib import Path
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
async def smart_extraction_workflow(): async def smart_extraction_workflow():
""" """
@@ -176,7 +176,7 @@ social_schema = JsonCssExtractionStrategy.generate_schema(
import json import json
import asyncio import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
# Manual schema for consistent product pages # Manual schema for consistent product pages
simple_schema = { simple_schema = {
@@ -342,7 +342,7 @@ asyncio.run(extract_complex_ecommerce())
### XPath Alternative (When CSS Isn't Enough) ### XPath Alternative (When CSS Isn't Enough)
```python ```python
from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy from crawl4ai import JsonXPathExtractionStrategy
# XPath for more complex selections # XPath for more complex selections
xpath_schema = { xpath_schema = {
@@ -387,7 +387,7 @@ strategy = JsonXPathExtractionStrategy(xpath_schema, verbose=True)
import json import json
import asyncio import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.extraction_strategy import RegexExtractionStrategy from crawl4ai import RegexExtractionStrategy
async def extract_common_patterns(): async def extract_common_patterns():
# Use built-in patterns for common data types # Use built-in patterns for common data types

View File

@@ -1835,7 +1835,7 @@ import json
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from typing import List from typing import List
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy from crawl4ai import LLMExtractionStrategy
class SentimentAnalysis(BaseModel): class SentimentAnalysis(BaseModel):
"""Use LLM when you need semantic understanding""" """Use LLM when you need semantic understanding"""
@@ -2743,7 +2743,7 @@ import json
import asyncio import asyncio
from pathlib import Path from pathlib import Path
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
async def smart_extraction_workflow(): async def smart_extraction_workflow():
""" """
@@ -2880,7 +2880,7 @@ social_schema = JsonCssExtractionStrategy.generate_schema(
import json import json
import asyncio import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
# Manual schema for consistent product pages # Manual schema for consistent product pages
simple_schema = { simple_schema = {
@@ -3046,7 +3046,7 @@ asyncio.run(extract_complex_ecommerce())
### XPath Alternative (When CSS Isn't Enough) ### XPath Alternative (When CSS Isn't Enough)
```python ```python
from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy from crawl4ai import JsonXPathExtractionStrategy
# XPath for more complex selections # XPath for more complex selections
xpath_schema = { xpath_schema = {
@@ -3091,7 +3091,7 @@ strategy = JsonXPathExtractionStrategy(xpath_schema, verbose=True)
import json import json
import asyncio import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.extraction_strategy import RegexExtractionStrategy from crawl4ai import RegexExtractionStrategy
async def extract_common_patterns(): async def extract_common_patterns():
# Use built-in patterns for common data types # Use built-in patterns for common data types
@@ -4711,7 +4711,7 @@ services:
```python ```python
# Method 1: Create config objects and dump to see expected JSON structure # Method 1: Create config objects and dump to see expected JSON structure
from crawl4ai import BrowserConfig, CrawlerRunConfig, LLMConfig, CacheMode from crawl4ai import BrowserConfig, CrawlerRunConfig, LLMConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy
import json import json
# Create browser config and see JSON structure # Create browser config and see JSON structure
@@ -5792,7 +5792,7 @@ import json
import asyncio import asyncio
from pathlib import Path from pathlib import Path
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
async def smart_extraction_workflow(): async def smart_extraction_workflow():
""" """
@@ -5929,7 +5929,7 @@ social_schema = JsonCssExtractionStrategy.generate_schema(
import json import json
import asyncio import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
# Manual schema for consistent product pages # Manual schema for consistent product pages
simple_schema = { simple_schema = {
@@ -6095,7 +6095,7 @@ asyncio.run(extract_complex_ecommerce())
### XPath Alternative (When CSS Isn't Enough) ### XPath Alternative (When CSS Isn't Enough)
```python ```python
from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy from crawl4ai import JsonXPathExtractionStrategy
# XPath for more complex selections # XPath for more complex selections
xpath_schema = { xpath_schema = {
@@ -6140,7 +6140,7 @@ strategy = JsonXPathExtractionStrategy(xpath_schema, verbose=True)
import json import json
import asyncio import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.extraction_strategy import RegexExtractionStrategy from crawl4ai import RegexExtractionStrategy
async def extract_common_patterns(): async def extract_common_patterns():
# Use built-in patterns for common data types # Use built-in patterns for common data types

View File

@@ -1835,7 +1835,7 @@ import json
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from typing import List from typing import List
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy from crawl4ai import LLMExtractionStrategy
class SentimentAnalysis(BaseModel): class SentimentAnalysis(BaseModel):
"""Use LLM when you need semantic understanding""" """Use LLM when you need semantic understanding"""
@@ -2743,7 +2743,7 @@ import json
import asyncio import asyncio
from pathlib import Path from pathlib import Path
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
async def smart_extraction_workflow(): async def smart_extraction_workflow():
""" """
@@ -2880,7 +2880,7 @@ social_schema = JsonCssExtractionStrategy.generate_schema(
import json import json
import asyncio import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
# Manual schema for consistent product pages # Manual schema for consistent product pages
simple_schema = { simple_schema = {
@@ -3046,7 +3046,7 @@ asyncio.run(extract_complex_ecommerce())
### XPath Alternative (When CSS Isn't Enough) ### XPath Alternative (When CSS Isn't Enough)
```python ```python
from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy from crawl4ai import JsonXPathExtractionStrategy
# XPath for more complex selections # XPath for more complex selections
xpath_schema = { xpath_schema = {
@@ -3091,7 +3091,7 @@ strategy = JsonXPathExtractionStrategy(xpath_schema, verbose=True)
import json import json
import asyncio import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.extraction_strategy import RegexExtractionStrategy from crawl4ai import RegexExtractionStrategy
async def extract_common_patterns(): async def extract_common_patterns():
# Use built-in patterns for common data types # Use built-in patterns for common data types
@@ -4711,7 +4711,7 @@ services:
```python ```python
# Method 1: Create config objects and dump to see expected JSON structure # Method 1: Create config objects and dump to see expected JSON structure
from crawl4ai import BrowserConfig, CrawlerRunConfig, LLMConfig, CacheMode from crawl4ai import BrowserConfig, CrawlerRunConfig, LLMConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy
import json import json
# Create browser config and see JSON structure # Create browser config and see JSON structure
@@ -5792,7 +5792,7 @@ import json
import asyncio import asyncio
from pathlib import Path from pathlib import Path
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
async def smart_extraction_workflow(): async def smart_extraction_workflow():
""" """
@@ -5929,7 +5929,7 @@ social_schema = JsonCssExtractionStrategy.generate_schema(
import json import json
import asyncio import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
# Manual schema for consistent product pages # Manual schema for consistent product pages
simple_schema = { simple_schema = {
@@ -6095,7 +6095,7 @@ asyncio.run(extract_complex_ecommerce())
### XPath Alternative (When CSS Isn't Enough) ### XPath Alternative (When CSS Isn't Enough)
```python ```python
from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy from crawl4ai import JsonXPathExtractionStrategy
# XPath for more complex selections # XPath for more complex selections
xpath_schema = { xpath_schema = {
@@ -6140,7 +6140,7 @@ strategy = JsonXPathExtractionStrategy(xpath_schema, verbose=True)
import json import json
import asyncio import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.extraction_strategy import RegexExtractionStrategy from crawl4ai import RegexExtractionStrategy
async def extract_common_patterns(): async def extract_common_patterns():
# Use built-in patterns for common data types # Use built-in patterns for common data types

File diff suppressed because it is too large Load Diff

View File

@@ -334,7 +334,7 @@ asyncio.run(main())
schemas. schemas.
```python ```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
from crawl4ai import LLMConfig from crawl4ai import LLMConfig
llm_config = LLMConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY") llm_config = LLMConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY")
@@ -402,7 +402,7 @@ print(schema)
```python ```python
from crawl4ai import LLMConfig from crawl4ai import LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy from crawl4ai import LLMExtractionStrategy
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
# Example of using LLMConfig with LLMExtractionStrategy # Example of using LLMConfig with LLMExtractionStrategy

View File

@@ -274,7 +274,7 @@ In a typical scenario, you define **one** `BrowserConfig` for your crawler sessi
```python ```python
import asyncio import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
async def main(): async def main():
# 1) Browser config: headless, bigger viewport, no proxy # 1) Browser config: headless, bigger viewport, no proxy

View File

@@ -191,7 +191,7 @@ You can combine content selection with a more advanced extraction strategy. For
import asyncio import asyncio
import json import json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
async def main(): async def main():
# Minimal schema for repeated items # Minimal schema for repeated items
@@ -243,7 +243,7 @@ import asyncio
import json import json
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy from crawl4ai import LLMExtractionStrategy
class ArticleData(BaseModel): class ArticleData(BaseModel):
headline: str headline: str
@@ -288,7 +288,7 @@ Below is a short function that unifies **CSS selection**, **exclusion** logic, a
import asyncio import asyncio
import json import json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
async def extract_main_articles(url: str): async def extract_main_articles(url: str):
schema = { schema = {

View File

@@ -138,7 +138,7 @@ If you run a JSON-based extraction strategy (CSS, XPath, LLM, etc.), the structu
import asyncio import asyncio
import json import json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
async def main(): async def main():
schema = { schema = {

View File

@@ -296,7 +296,7 @@ if __name__ == "__main__":
Once dynamic content is loaded, you can attach an **`extraction_strategy`** (like `JsonCssExtractionStrategy` or `LLMExtractionStrategy`). For example: Once dynamic content is loaded, you can attach an **`extraction_strategy`** (like `JsonCssExtractionStrategy` or `LLMExtractionStrategy`). For example:
```python ```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
schema = { schema = {
"name": "Commits", "name": "Commits",

View File

@@ -127,7 +127,7 @@ Crawl4AI can also extract structured data (JSON) using CSS or XPath selectors. B
> **New!** Crawl4AI now provides a powerful utility to automatically generate extraction schemas using LLM. This is a one-time cost that gives you a reusable schema for fast, LLM-free extractions: > **New!** Crawl4AI now provides a powerful utility to automatically generate extraction schemas using LLM. This is a one-time cost that gives you a reusable schema for fast, LLM-free extractions:
```python ```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
from crawl4ai import LLMConfig from crawl4ai import LLMConfig
# Generate a schema (one-time cost) # Generate a schema (one-time cost)
@@ -157,7 +157,7 @@ Here's a basic extraction example:
import asyncio import asyncio
import json import json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
async def main(): async def main():
schema = { schema = {
@@ -212,7 +212,7 @@ import json
import asyncio import asyncio
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy from crawl4ai import LLMExtractionStrategy
class OpenAIModelFee(BaseModel): class OpenAIModelFee(BaseModel):
model_name: str = Field(..., description="Name of the OpenAI model.") model_name: str = Field(..., description="Name of the OpenAI model.")
@@ -328,7 +328,7 @@ Some sites require multiple “page clicks” or dynamic JavaScript updates. Bel
```python ```python
import asyncio import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
async def extract_structured_data_using_css_extractor(): async def extract_structured_data_using_css_extractor():
print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---") print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")

View File

@@ -14,7 +14,7 @@ The Cosine Strategy:
## Basic Usage ## Basic Usage
```python ```python
from crawl4ai.extraction_strategy import CosineStrategy from crawl4ai import CosineStrategy
strategy = CosineStrategy( strategy = CosineStrategy(
semantic_filter="product reviews", # Target content type semantic_filter="product reviews", # Target content type

View File

@@ -102,7 +102,7 @@ import json
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from typing import List from typing import List
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy from crawl4ai import LLMExtractionStrategy
class Product(BaseModel): class Product(BaseModel):
name: str name: str
@@ -219,7 +219,7 @@ import asyncio
from typing import List from typing import List
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import LLMExtractionStrategy from crawl4ai import LLMExtractionStrategy
class Entity(BaseModel): class Entity(BaseModel):
name: str name: str

View File

@@ -38,7 +38,7 @@ Let's begin with a **simple** schema-based extraction using the `JsonCssExtracti
import json import json
import asyncio import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
async def extract_crypto_prices(): async def extract_crypto_prices():
# 1. Define a simple extraction schema # 1. Define a simple extraction schema
@@ -108,7 +108,7 @@ Below is a short example demonstrating **XPath** extraction plus the **`raw://`*
import json import json
import asyncio import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy from crawl4ai import JsonXPathExtractionStrategy
async def extract_crypto_prices_xpath(): async def extract_crypto_prices_xpath():
# 1. Minimal dummy HTML with some repeating rows # 1. Minimal dummy HTML with some repeating rows
@@ -309,7 +309,7 @@ Key Takeaways:
import json import json
import asyncio import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
ecommerce_schema = { ecommerce_schema = {
# ... the advanced schema from above ... # ... the advanced schema from above ...
@@ -649,7 +649,7 @@ While manually crafting schemas is powerful and precise, Crawl4AI now offers a c
The schema generator is available as a static method on both `JsonCssExtractionStrategy` and `JsonXPathExtractionStrategy`. You can choose between OpenAI's GPT-4 or the open-source Ollama for schema generation: The schema generator is available as a static method on both `JsonCssExtractionStrategy` and `JsonXPathExtractionStrategy`. You can choose between OpenAI's GPT-4 or the open-source Ollama for schema generation:
```python ```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy from crawl4ai import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
from crawl4ai import LLMConfig from crawl4ai import LLMConfig
# Sample HTML with product information # Sample HTML with product information

View File

@@ -149,7 +149,7 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"from crawl4ai.extraction_strategy import LLMExtractionStrategy\n", "from crawl4ai import LLMExtractionStrategy\n",
"from pydantic import BaseModel\n", "from pydantic import BaseModel\n",
"import json, os\n", "import json, os\n",
"from typing import List\n", "from typing import List\n",

View File

@@ -44,6 +44,7 @@ dependencies = [
"aiohttp>=3.11.11", "aiohttp>=3.11.11",
"brotli>=1.1.0", "brotli>=1.1.0",
"humanize>=4.10.0", "humanize>=4.10.0",
"lark>=1.2.2"
] ]
classifiers = [ classifiers = [
"Development Status :: 4 - Beta", "Development Status :: 4 - Beta",

View File

@@ -8,7 +8,7 @@ import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.content_filter_strategy import PruningContentFilter from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy from crawl4ai import JsonCssExtractionStrategy
from crawl4ai.chunking_strategy import RegexChunking from crawl4ai.chunking_strategy import RegexChunking

View File

@@ -10,7 +10,7 @@ sys.path.append(parent_dir)
from crawl4ai import LLMConfig from crawl4ai import LLMConfig
from crawl4ai.async_webcrawler import AsyncWebCrawler from crawl4ai.async_webcrawler import AsyncWebCrawler
from crawl4ai.chunking_strategy import RegexChunking from crawl4ai.chunking_strategy import RegexChunking
from crawl4ai.extraction_strategy import LLMExtractionStrategy from crawl4ai import LLMExtractionStrategy
@pytest.mark.asyncio @pytest.mark.asyncio

View File

@@ -9,7 +9,7 @@ __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file
import asyncio import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy from crawl4ai import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
from crawl4ai.utils import preprocess_html_for_schema, JsonXPathExtractionStrategy from crawl4ai.utils import preprocess_html_for_schema, JsonXPathExtractionStrategy
import json import json

View File

@@ -6,7 +6,7 @@ from crawl4ai.chunking_strategy import (
FixedLengthWordChunking, FixedLengthWordChunking,
SlidingWindowChunking, SlidingWindowChunking,
) )
from crawl4ai.extraction_strategy import ( from crawl4ai import (
CosineStrategy, CosineStrategy,
LLMExtractionStrategy, LLMExtractionStrategy,
TopicExtractionStrategy, TopicExtractionStrategy,