Merge branch 'vr0.5.0.post1' into next
This commit is contained in:
@@ -11,7 +11,7 @@ import asyncio
|
||||
import os
|
||||
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai.types import LLMConfig
|
||||
from crawl4ai.extraction_strategy import (
|
||||
LLMExtractionStrategy,
|
||||
JsonCssExtractionStrategy,
|
||||
@@ -61,19 +61,19 @@ async def main():
|
||||
|
||||
# 1. LLM Extraction with different input formats
|
||||
markdown_strategy = LLMExtractionStrategy(
|
||||
llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
|
||||
llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
|
||||
instruction="Extract product information including name, price, and description",
|
||||
)
|
||||
|
||||
html_strategy = LLMExtractionStrategy(
|
||||
input_format="html",
|
||||
llmConfig=LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
|
||||
llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
|
||||
instruction="Extract product information from HTML including structured data",
|
||||
)
|
||||
|
||||
fit_markdown_strategy = LLMExtractionStrategy(
|
||||
input_format="fit_markdown",
|
||||
llmConfig=LlmConfig(provider="openai/gpt-4o-mini",api_token=os.getenv("OPENAI_API_KEY")),
|
||||
llm_config=LLMConfig(provider="openai/gpt-4o-mini",api_token=os.getenv("OPENAI_API_KEY")),
|
||||
instruction="Extract product information from cleaned markdown",
|
||||
)
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai.types import LLMConfig
|
||||
from crawl4ai import AsyncWebCrawler, LLMExtractionStrategy
|
||||
import asyncio
|
||||
import os
|
||||
@@ -23,7 +23,7 @@ async def main():
|
||||
word_count_threshold=1,
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
# provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'),
|
||||
llmConfig=LlmConfig(provider="groq/llama-3.1-70b-versatile", api_token=os.getenv("GROQ_API_KEY")),
|
||||
llm_config=LLMConfig(provider="groq/llama-3.1-70b-versatile", api_token=os.getenv("GROQ_API_KEY")),
|
||||
schema=OpenAIModelFee.model_json_schema(),
|
||||
extraction_type="schema",
|
||||
instruction="From the crawled content, extract all mentioned model names along with their "
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import os
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai.types import LLMConfig
|
||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||
|
||||
async def test_llm_filter():
|
||||
@@ -23,7 +23,7 @@ async def test_llm_filter():
|
||||
|
||||
# Initialize LLM filter with focused instruction
|
||||
filter = LLMContentFilter(
|
||||
llmConfig=LlmConfig(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY')),
|
||||
llm_config=LLMConfig(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY')),
|
||||
instruction="""
|
||||
Focus on extracting the core educational content about Python classes.
|
||||
Include:
|
||||
@@ -43,7 +43,7 @@ async def test_llm_filter():
|
||||
)
|
||||
|
||||
filter = LLMContentFilter(
|
||||
llmConfig=LlmConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')),
|
||||
llm_config=LLMConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')),
|
||||
chunk_token_threshold=2 ** 12 * 2, # 4096 * 2
|
||||
ignore_cache = True,
|
||||
instruction="""
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import os, sys
|
||||
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai.types import LLMConfig
|
||||
|
||||
sys.path.append(
|
||||
os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
@@ -211,7 +211,7 @@ async def extract_structured_data_using_llm(
|
||||
word_count_threshold=1,
|
||||
page_timeout=80000,
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
llmConfig=LlmConfig(provider=provider,api_token=api_token),
|
||||
llm_config=LLMConfig(provider=provider,api_token=api_token),
|
||||
schema=OpenAIModelFee.model_json_schema(),
|
||||
extraction_type="schema",
|
||||
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import os, sys
|
||||
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai.types import LLMConfig
|
||||
|
||||
# append parent directory to system path
|
||||
sys.path.append(
|
||||
@@ -147,7 +147,7 @@ async def extract_structured_data_using_llm(
|
||||
url="https://openai.com/api/pricing/",
|
||||
word_count_threshold=1,
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
llmConfig=LlmConfig(provider=provider,api_token=api_token),
|
||||
llm_config=LLMConfig(provider=provider,api_token=api_token),
|
||||
schema=OpenAIModelFee.model_json_schema(),
|
||||
extraction_type="schema",
|
||||
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
|
||||
@@ -570,7 +570,7 @@ async def generate_knowledge_graph():
|
||||
relationships: List[Relationship]
|
||||
|
||||
extraction_strategy = LLMExtractionStrategy(
|
||||
llmConfig=LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")), # In case of Ollama just pass "no-token"
|
||||
llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")), # In case of Ollama just pass "no-token"
|
||||
schema=KnowledgeGraph.model_json_schema(),
|
||||
extraction_type="schema",
|
||||
instruction="""Extract entities and relationships from the given text.""",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import os
|
||||
import time
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai.types import LLMConfig
|
||||
from crawl4ai.web_crawler import WebCrawler
|
||||
from crawl4ai.chunking_strategy import *
|
||||
from crawl4ai.extraction_strategy import *
|
||||
@@ -179,7 +179,7 @@ def add_llm_extraction_strategy(crawler):
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
llmConfig = LlmConfig(provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY"))
|
||||
llm_config = LLMConfig(provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY"))
|
||||
),
|
||||
)
|
||||
cprint(
|
||||
@@ -198,7 +198,7 @@ def add_llm_extraction_strategy(crawler):
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
llmConfig=LlmConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")),
|
||||
llm_config=LLMConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")),
|
||||
instruction="I am interested in only financial news",
|
||||
),
|
||||
)
|
||||
@@ -210,7 +210,7 @@ def add_llm_extraction_strategy(crawler):
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
llmConfig=LlmConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")),
|
||||
llm_config=LLMConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")),
|
||||
instruction="Extract only content related to technology",
|
||||
),
|
||||
)
|
||||
|
||||
@@ -17,7 +17,7 @@ from crawl4ai.configs import ProxyConfig
|
||||
from crawl4ai import RoundRobinProxyStrategy
|
||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||
from crawl4ai import DefaultMarkdownGenerator
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai.types import LLMConfig
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
|
||||
from pprint import pprint
|
||||
@@ -284,9 +284,9 @@ async def llm_content_filter():
|
||||
PART 5: LLM Content Filter
|
||||
|
||||
This function demonstrates:
|
||||
- Configuring LLM providers via LlmConfig
|
||||
- Configuring LLM providers via LLMConfig
|
||||
- Using LLM to generate focused markdown
|
||||
- LlmConfig for configuration
|
||||
- LLMConfig for configuration
|
||||
|
||||
Note: Requires a valid API key for the chosen LLM provider
|
||||
"""
|
||||
@@ -296,7 +296,7 @@ async def llm_content_filter():
|
||||
|
||||
# Create LLM configuration
|
||||
# Replace with your actual API key or set as environment variable
|
||||
llm_config = LlmConfig(
|
||||
llm_config = LLMConfig(
|
||||
provider="gemini/gemini-1.5-pro",
|
||||
api_token="env:GEMINI_API_KEY" # Will read from GEMINI_API_KEY environment variable
|
||||
)
|
||||
@@ -309,7 +309,7 @@ async def llm_content_filter():
|
||||
# Create markdown generator with LLM filter
|
||||
markdown_generator = DefaultMarkdownGenerator(
|
||||
content_filter=LLMContentFilter(
|
||||
llmConfig=llm_config,
|
||||
llm_config=llm_config,
|
||||
instruction="Extract key concepts and summaries"
|
||||
)
|
||||
)
|
||||
@@ -381,7 +381,7 @@ async def llm_schema_generation():
|
||||
PART 7: LLM Schema Generation
|
||||
|
||||
This function demonstrates:
|
||||
- Configuring LLM providers via LlmConfig
|
||||
- Configuring LLM providers via LLMConfig
|
||||
- Using LLM to generate extraction schemas
|
||||
- JsonCssExtractionStrategy
|
||||
|
||||
@@ -406,9 +406,9 @@ async def llm_schema_generation():
|
||||
<div class="rating">4.7/5</div>
|
||||
</div>
|
||||
"""
|
||||
print("\n📊 Setting up LlmConfig...")
|
||||
print("\n📊 Setting up LLMConfig...")
|
||||
# Create LLM configuration
|
||||
llm_config = LlmConfig(
|
||||
llm_config = LLMConfig(
|
||||
provider="gemini/gemini-1.5-pro",
|
||||
api_token="env:GEMINI_API_KEY"
|
||||
)
|
||||
@@ -416,7 +416,7 @@ async def llm_schema_generation():
|
||||
print(" This would use the LLM to analyze HTML and create an extraction schema")
|
||||
schema = JsonCssExtractionStrategy.generate_schema(
|
||||
html=sample_html,
|
||||
llmConfig = llm_config,
|
||||
llm_config = llm_config,
|
||||
query="Extract product name and price"
|
||||
)
|
||||
print("\n✅ Generated Schema:")
|
||||
|
||||
@@ -245,8 +245,8 @@ run_config = CrawlerRunConfig(
|
||||
)
|
||||
```
|
||||
|
||||
# 3. **LlmConfig** - Setting up LLM providers
|
||||
LlmConfig is useful to pass LLM provider config to strategies and functions that rely on LLMs to do extraction, filtering, schema generation etc. Currently it can be used in the following -
|
||||
# 3. **LLMConfig** - Setting up LLM providers
|
||||
`LLMConfig` passes LLM provider settings to the strategies and functions that rely on LLMs for extraction, filtering, schema generation, etc. It can currently be used with the following:
|
||||
|
||||
1. LLMExtractionStrategy
|
||||
2. LLMContentFilter
|
||||
@@ -262,7 +262,7 @@ LlmConfig is useful to pass LLM provider config to strategies and functions that
|
||||
|
||||
## 3.2 Example Usage
|
||||
```python
|
||||
llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
|
||||
llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
|
||||
```
|
||||
|
||||
## 4. Putting It All Together
|
||||
@@ -270,7 +270,7 @@ llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI
|
||||
- **Use** `BrowserConfig` for **global** browser settings: engine, headless, proxy, user agent.
|
||||
- **Use** `CrawlerRunConfig` for each crawl’s **context**: how to filter content, handle caching, wait for dynamic elements, or run JS.
|
||||
- **Pass** both configs to `AsyncWebCrawler` (the `BrowserConfig`) and then to `arun()` (the `CrawlerRunConfig`).
|
||||
- **Use** `LlmConfig` for LLM provider configurations that can be used across all extraction, filtering, schema generation tasks. Can be used in - `LLMExtractionStrategy`, `LLMContentFilter`, `JsonCssExtractionStrategy.generate_schema` & `JsonXPathExtractionStrategy.generate_schema`
|
||||
- **Use** `LLMConfig` for LLM provider configurations that can be shared across all extraction, filtering, and schema generation tasks. It can be used in `LLMExtractionStrategy`, `LLMContentFilter`, `JsonCssExtractionStrategy.generate_schema`, and `JsonXPathExtractionStrategy.generate_schema`.
|
||||
|
||||
```python
|
||||
# Create a modified copy with the clone() method
|
||||
|
||||
@@ -131,7 +131,7 @@ OverlappingWindowChunking(
|
||||
```python
|
||||
from pydantic import BaseModel
|
||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai.types import LLMConfig
|
||||
|
||||
# Define schema
|
||||
class Article(BaseModel):
|
||||
@@ -141,7 +141,7 @@ class Article(BaseModel):
|
||||
|
||||
# Create strategy
|
||||
strategy = LLMExtractionStrategy(
|
||||
llmConfig = LlmConfig(provider="ollama/llama2"),
|
||||
llm_config = LLMConfig(provider="ollama/llama2"),
|
||||
schema=Article.schema(),
|
||||
instruction="Extract article details"
|
||||
)
|
||||
@@ -198,7 +198,7 @@ result = await crawler.arun(
|
||||
|
||||
```python
|
||||
from crawl4ai.chunking_strategy import OverlappingWindowChunking
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai.types import LLMConfig
|
||||
|
||||
# Create chunking strategy
|
||||
chunker = OverlappingWindowChunking(
|
||||
@@ -208,7 +208,7 @@ chunker = OverlappingWindowChunking(
|
||||
|
||||
# Use with extraction strategy
|
||||
strategy = LLMExtractionStrategy(
|
||||
llmConfig = LlmConfig(provider="ollama/llama2"),
|
||||
llm_config = LLMConfig(provider="ollama/llama2"),
|
||||
chunking_strategy=chunker
|
||||
)
|
||||
|
||||
|
||||
@@ -16,7 +16,7 @@ My dear friends and crawlers, there you go, this is the release of Crawl4AI v0.5
|
||||
* **Multiple Crawler Strategies:** Choose between the full-featured Playwright browser-based crawler or a new, *much* faster HTTP-only crawler for simpler tasks.
|
||||
* **Docker Deployment:** Deploy Crawl4AI as a scalable, self-contained service with built-in API endpoints and optional JWT authentication.
|
||||
* **Command-Line Interface (CLI):** Interact with Crawl4AI directly from your terminal. Crawl, configure, and extract data with simple commands.
|
||||
* **LLM Configuration (`LlmConfig`):** A new, unified way to configure LLM providers (OpenAI, Anthropic, Ollama, etc.) for extraction, filtering, and schema generation. Simplifies API key management and switching between models.
|
||||
* **LLM Configuration (`LLMConfig`):** A new, unified way to configure LLM providers (OpenAI, Anthropic, Ollama, etc.) for extraction, filtering, and schema generation. Simplifies API key management and switching between models.
|
||||
|
||||
**Minor Updates & Improvements:**
|
||||
|
||||
@@ -47,7 +47,7 @@ This release includes several breaking changes to improve the library's structur
|
||||
* **Config**: FastFilterChain has been replaced with FilterChain
|
||||
* **Deep-Crawl**: DeepCrawlStrategy.arun now returns Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
|
||||
* **Proxy**: Removed synchronous WebCrawler support and related rate limiting configurations
|
||||
* **LLM Parameters:** Use the new `LlmConfig` object instead of passing `provider`, `api_token`, `base_url`, and `api_base` directly to `LLMExtractionStrategy` and `LLMContentFilter`.
|
||||
* **LLM Parameters:** Use the new `LLMConfig` object instead of passing `provider`, `api_token`, `base_url`, and `api_base` directly to `LLMExtractionStrategy` and `LLMContentFilter`.
|
||||
|
||||
**In short:** Update imports, adjust `arun_many()` usage, check for optional fields, and review the Docker deployment guide.
|
||||
|
||||
|
||||
@@ -305,13 +305,13 @@ asyncio.run(main())
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator
|
||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai.types import LLMConfig
|
||||
import asyncio
|
||||
|
||||
llm_config = LlmConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY")
|
||||
llm_config = LLMConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY")
|
||||
|
||||
markdown_generator = DefaultMarkdownGenerator(
|
||||
content_filter=LLMContentFilter(llmConfig=llm_config, instruction="Extract key concepts and summaries")
|
||||
content_filter=LLMContentFilter(llm_config=llm_config, instruction="Extract key concepts and summaries")
|
||||
)
|
||||
|
||||
config = CrawlerRunConfig(markdown_generator=markdown_generator)
|
||||
@@ -335,13 +335,13 @@ asyncio.run(main())
|
||||
|
||||
```python
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai.types import LLMConfig
|
||||
|
||||
llm_config = LlmConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY")
|
||||
llm_config = LLMConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY")
|
||||
|
||||
schema = JsonCssExtractionStrategy.generate_schema(
|
||||
html="<div class='product'><h2>Product Name</h2><span class='price'>$99</span></div>",
|
||||
llmConfig = llm_config,
|
||||
llm_config = llm_config,
|
||||
query="Extract product name and price"
|
||||
)
|
||||
print(schema)
|
||||
@@ -394,20 +394,20 @@ print(schema)
|
||||
serialization, especially for sets of allowed/blocked domains. No code changes
|
||||
required.
|
||||
|
||||
- **Added: New `LlmConfig` parameter.** This new parameter can be passed for
|
||||
- **Added: New `LLMConfig` parameter.** This new parameter can be passed for
|
||||
extraction, filtering, and schema generation tasks. It simplifies passing
|
||||
provider strings, API tokens, and base URLs across all sections where LLM
|
||||
configuration is necessary. It also enables reuse and allows for quick
|
||||
experimentation between different LLM configurations.
|
||||
|
||||
```python
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai.types import LLMConfig
|
||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
|
||||
# Example of using LlmConfig with LLMExtractionStrategy
|
||||
llm_config = LlmConfig(provider="openai/gpt-4o", api_token="YOUR_API_KEY")
|
||||
strategy = LLMExtractionStrategy(llmConfig=llm_config, schema=...)
|
||||
# Example of using LLMConfig with LLMExtractionStrategy
|
||||
llm_config = LLMConfig(provider="openai/gpt-4o", api_token="YOUR_API_KEY")
|
||||
strategy = LLMExtractionStrategy(llm_config=llm_config, schema=...)
|
||||
|
||||
# Example usage within a crawler
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
@@ -418,7 +418,7 @@ print(schema)
|
||||
```
|
||||
**Breaking Change:** Removed old parameters like `provider`, `api_token`,
|
||||
`base_url`, and `api_base` from `LLMExtractionStrategy` and
|
||||
`LLMContentFilter`. Users should migrate to using the `LlmConfig` object.
|
||||
`LLMContentFilter`. Users should migrate to using the `LLMConfig` object.
|
||||
|
||||
- **Changed:** Improved browser context management and added shared data support.
|
||||
(**Breaking Change:** `BrowserContext` API updated). Browser contexts are now
|
||||
|
||||
@@ -4,7 +4,7 @@ Crawl4AI’s flexibility stems from two key classes:
|
||||
|
||||
1. **`BrowserConfig`** – Dictates **how** the browser is launched and behaves (e.g., headless or visible, proxy, user agent).
|
||||
2. **`CrawlerRunConfig`** – Dictates **how** each **crawl** operates (e.g., caching, extraction, timeouts, JavaScript code to run, etc.).
|
||||
3. **`LlmConfig`** - Dictates **how** LLM providers are configured. (model, api token, base url, temperature etc.)
|
||||
3. **`LLMConfig`** – Dictates **how** LLM providers are configured (model, API token, base URL, temperature, etc.).
|
||||
|
||||
In most examples, you create **one** `BrowserConfig` for the entire crawler session, then pass a **fresh** or re-used `CrawlerRunConfig` whenever you call `arun()`. This tutorial shows the most commonly used parameters. If you need advanced or rarely used fields, see the [Configuration Parameters](../api/parameters.md).
|
||||
|
||||
@@ -239,7 +239,7 @@ The `clone()` method:
|
||||
|
||||
|
||||
|
||||
## 3. LlmConfig Essentials
|
||||
## 3. LLMConfig Essentials
|
||||
|
||||
### Key fields to note
|
||||
|
||||
@@ -256,16 +256,16 @@ The `clone()` method:
|
||||
- If your provider has a custom endpoint
|
||||
|
||||
```python
|
||||
llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
|
||||
llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
|
||||
```
|
||||
|
||||
## 4. Putting It All Together
|
||||
|
||||
In a typical scenario, you define **one** `BrowserConfig` for your crawler session, then create **one or more** `CrawlerRunConfig` & `LlmConfig` depending on each call’s needs:
|
||||
In a typical scenario, you define **one** `BrowserConfig` for your crawler session, then create **one or more** `CrawlerRunConfig` & `LLMConfig` depending on each call’s needs:
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LlmConfig
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
|
||||
async def main():
|
||||
@@ -289,14 +289,14 @@ async def main():
|
||||
|
||||
# 3) Example LLM content filtering
|
||||
|
||||
gemini_config = LlmConfig(
|
||||
gemini_config = LLMConfig(
|
||||
provider="gemini/gemini-1.5-pro",
|
||||
api_token = "env:GEMINI_API_TOKEN"
|
||||
)
|
||||
|
||||
# Initialize LLM filter with specific instruction
|
||||
filter = LLMContentFilter(
|
||||
llmConfig=gemini_config, # or your preferred provider
|
||||
llm_config=gemini_config, # or your preferred provider
|
||||
instruction="""
|
||||
Focus on extracting the core educational content.
|
||||
Include:
|
||||
@@ -343,7 +343,7 @@ if __name__ == "__main__":
|
||||
|
||||
For a **detailed list** of available parameters (including advanced ones), see:
|
||||
|
||||
- [BrowserConfig, CrawlerRunConfig & LlmConfig Reference](../api/parameters.md)
|
||||
- [BrowserConfig, CrawlerRunConfig & LLMConfig Reference](../api/parameters.md)
|
||||
|
||||
You can explore topics like:
|
||||
|
||||
@@ -356,7 +356,7 @@ You can explore topics like:
|
||||
|
||||
## 6. Conclusion
|
||||
|
||||
**BrowserConfig**, **CrawlerRunConfig** and **LlmConfig** give you straightforward ways to define:
|
||||
**BrowserConfig**, **CrawlerRunConfig** and **LLMConfig** give you straightforward ways to define:
|
||||
|
||||
- **Which** browser to launch, how it should run, and any proxy or user agent needs.
|
||||
- **How** each crawl should behave—caching, timeouts, JavaScript code, extraction strategies, etc.
|
||||
|
||||
@@ -211,7 +211,7 @@ if __name__ == "__main__":
|
||||
import asyncio
|
||||
import json
|
||||
from pydantic import BaseModel, Field
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LlmConfig
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
|
||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||
|
||||
class ArticleData(BaseModel):
|
||||
@@ -220,7 +220,7 @@ class ArticleData(BaseModel):
|
||||
|
||||
async def main():
|
||||
llm_strategy = LLMExtractionStrategy(
|
||||
llmConfig = LlmConfig(provider="openai/gpt-4",api_token="sk-YOUR_API_KEY")
|
||||
llm_config = LLMConfig(provider="openai/gpt-4",api_token="sk-YOUR_API_KEY"),
|
||||
schema=ArticleData.schema(),
|
||||
extraction_type="schema",
|
||||
instruction="Extract 'headline' and a short 'summary' from the content."
|
||||
|
||||
@@ -175,13 +175,13 @@ prune_filter = PruningContentFilter(
|
||||
For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LlmConfig
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
|
||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||
|
||||
async def main():
|
||||
# Initialize LLM filter with specific instruction
|
||||
filter = LLMContentFilter(
|
||||
llmConfig = LlmConfig(provider="openai/gpt-4o",api_token="your-api-token"), #or use environment variable
|
||||
llm_config = LLMConfig(provider="openai/gpt-4o",api_token="your-api-token"), #or use environment variable
|
||||
instruction="""
|
||||
Focus on extracting the core educational content.
|
||||
Include:
|
||||
|
||||
@@ -128,7 +128,7 @@ Crawl4AI can also extract structured data (JSON) using CSS or XPath selectors. B
|
||||
|
||||
```python
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai.types import LLMConfig
|
||||
|
||||
# Generate a schema (one-time cost)
|
||||
html = "<div class='product'><h2>Gaming Laptop</h2><span class='price'>$999.99</span></div>"
|
||||
@@ -136,13 +136,13 @@ html = "<div class='product'><h2>Gaming Laptop</h2><span class='price'>$999.99</
|
||||
# Using OpenAI (requires API token)
|
||||
schema = JsonCssExtractionStrategy.generate_schema(
|
||||
html,
|
||||
llmConfig = LlmConfig(provider="openai/gpt-4o",api_token="your-openai-token") # Required for OpenAI
|
||||
llm_config = LLMConfig(provider="openai/gpt-4o",api_token="your-openai-token") # Required for OpenAI
|
||||
)
|
||||
|
||||
# Or using Ollama (open source, no token needed)
|
||||
schema = JsonCssExtractionStrategy.generate_schema(
|
||||
html,
|
||||
llmConfig = LlmConfig(provider="ollama/llama3.3", api_token=None) # Not needed for Ollama
|
||||
llm_config = LLMConfig(provider="ollama/llama3.3", api_token=None) # Not needed for Ollama
|
||||
)
|
||||
|
||||
# Use the schema for fast, repeated extractions
|
||||
@@ -211,7 +211,7 @@ import os
|
||||
import json
|
||||
import asyncio
|
||||
from pydantic import BaseModel, Field
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LlmConfig
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
|
||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||
|
||||
class OpenAIModelFee(BaseModel):
|
||||
@@ -241,7 +241,7 @@ async def extract_structured_data_using_llm(
|
||||
word_count_threshold=1,
|
||||
page_timeout=80000,
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
llmConfig = LlmConfig(provider=provider,api_token=api_token),
|
||||
llm_config = LLMConfig(provider=provider,api_token=api_token),
|
||||
schema=OpenAIModelFee.model_json_schema(),
|
||||
extraction_type="schema",
|
||||
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
|
||||
|
||||
@@ -71,7 +71,7 @@ Below is an overview of important LLM extraction parameters. All are typically s
|
||||
|
||||
```python
|
||||
extraction_strategy = LLMExtractionStrategy(
|
||||
llmConfig = LlmConfig(provider="openai/gpt-4", api_token="YOUR_OPENAI_KEY"),
|
||||
llm_config = LLMConfig(provider="openai/gpt-4", api_token="YOUR_OPENAI_KEY"),
|
||||
schema=MyModel.model_json_schema(),
|
||||
extraction_type="schema",
|
||||
instruction="Extract a list of items from the text with 'name' and 'price' fields.",
|
||||
@@ -96,7 +96,7 @@ import asyncio
|
||||
import json
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import List
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LlmConfig
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
|
||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||
|
||||
class Product(BaseModel):
|
||||
@@ -106,7 +106,7 @@ class Product(BaseModel):
|
||||
async def main():
|
||||
# 1. Define the LLM extraction strategy
|
||||
llm_strategy = LLMExtractionStrategy(
|
||||
llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv('OPENAI_API_KEY')),
|
||||
llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv('OPENAI_API_KEY')),
|
||||
schema=Product.schema_json(), # Or use model_json_schema()
|
||||
extraction_type="schema",
|
||||
instruction="Extract all product objects with 'name' and 'price' from the content.",
|
||||
|
||||
@@ -415,7 +415,7 @@ The schema generator is available as a static method on both `JsonCssExtractionS
|
||||
|
||||
```python
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai.types import LLMConfig
|
||||
|
||||
# Sample HTML with product information
|
||||
html = """
|
||||
@@ -435,14 +435,14 @@ html = """
|
||||
css_schema = JsonCssExtractionStrategy.generate_schema(
|
||||
html,
|
||||
schema_type="css",
|
||||
llmConfig = LlmConfig(provider="openai/gpt-4o",api_token="your-openai-token")
|
||||
llm_config = LLMConfig(provider="openai/gpt-4o",api_token="your-openai-token")
|
||||
)
|
||||
|
||||
# Option 2: Using Ollama (open source, no token needed)
|
||||
xpath_schema = JsonXPathExtractionStrategy.generate_schema(
|
||||
html,
|
||||
schema_type="xpath",
|
||||
llmConfig = LlmConfig(provider="ollama/llama3.3", api_token=None) # Not needed for Ollama
|
||||
llm_config = LLMConfig(provider="ollama/llama3.3", api_token=None) # Not needed for Ollama
|
||||
)
|
||||
|
||||
# Use the generated schema for fast, repeated extractions
|
||||
|
||||
Reference in New Issue
Block a user