Merge branch 'next' into 2025-MAR-ALPHA-1
@@ -167,13 +167,114 @@ async with AsyncWebCrawler() as crawler:

---

## 6. Using the BrowserProfiler Class

Crawl4AI provides a dedicated `BrowserProfiler` class for managing browser profiles, making it easy to create, list, and delete profiles for identity-based browsing.

### Creating and Managing Profiles with BrowserProfiler

The `BrowserProfiler` class offers a comprehensive API for browser profile management:

```python
import asyncio
from crawl4ai import BrowserProfiler

async def manage_profiles():
    # Create a profiler instance
    profiler = BrowserProfiler()

    # Create a profile interactively - opens a browser window
    profile_path = await profiler.create_profile(
        profile_name="my-login-profile"  # Optional: name your profile
    )

    print(f"Profile saved at: {profile_path}")

    # List all available profiles
    profiles = profiler.list_profiles()

    for profile in profiles:
        print(f"Profile: {profile['name']}")
        print(f"  Path: {profile['path']}")
        print(f"  Created: {profile['created']}")
        print(f"  Browser type: {profile['type']}")

    # Get a specific profile path by name
    specific_profile = profiler.get_profile_path("my-login-profile")

    # Delete a profile when no longer needed
    success = profiler.delete_profile("old-profile-name")

asyncio.run(manage_profiles())
```

**How profile creation works:**

1. A browser window opens for you to interact with
2. You log in to websites, set preferences, etc.
3. When you're done, press 'q' in the terminal to close the browser
4. The profile is saved in the Crawl4AI profiles directory
5. You can use the returned path with `BrowserConfig.user_data_dir`, as sketched just below
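
As a quick illustration of step 5, here is a minimal sketch (the URL is a placeholder) that feeds a saved profile back into a crawl:

```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, BrowserProfiler

async def crawl_with_saved_profile():
    profiler = BrowserProfiler()
    # Look up a profile created earlier
    profile_path = profiler.get_profile_path("my-login-profile")

    browser_config = BrowserConfig(
        headless=True,
        use_managed_browser=True,  # required for persistent profiles
        user_data_dir=profile_path,
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun("https://example.com/dashboard")
        print(result.success)

asyncio.run(crawl_with_saved_profile())
```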

### Interactive Profile Management

The `BrowserProfiler` also offers an interactive management console that guides you through profile creation, listing, and deletion:

```python
import asyncio
from crawl4ai import BrowserProfiler, AsyncWebCrawler, BrowserConfig

# Define a function to use a profile for crawling
async def crawl_with_profile(profile_path, url):
    browser_config = BrowserConfig(
        headless=True,
        use_managed_browser=True,
        user_data_dir=profile_path
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url)
        return result

async def main():
    # Create a profiler instance
    profiler = BrowserProfiler()

    # Launch the interactive profile manager
    # Passing the crawl function as a callback adds a "crawl with profile" option
    await profiler.interactive_manager(crawl_callback=crawl_with_profile)

asyncio.run(main())
```

### Legacy Methods

For backward compatibility, the previous methods on `ManagedBrowser` are still available, but they delegate to the new `BrowserProfiler` class:

```python
from crawl4ai.browser_manager import ManagedBrowser

# These methods still work but use BrowserProfiler internally
profiles = ManagedBrowser.list_profiles()
```

### Complete Example

See the full example in `docs/examples/identity_based_browsing.py` for a complete demonstration of creating and using profiles for authenticated browsing using the new `BrowserProfiler` class.

---

## 7. Summary

- **Create** your user-data directory either:
    - By launching Chrome/Chromium externally with `--user-data-dir=/some/path`
    - Or by using the built-in `BrowserProfiler.create_profile()` method
    - Or through the interactive interface with `profiler.interactive_manager()`
- **Log in** or configure sites as needed, then close the browser
- **Reference** that folder in `BrowserConfig(user_data_dir="...")` + `use_managed_browser=True`
- **List and reuse** profiles with `BrowserProfiler.list_profiles()`
- **Manage** your profiles with the dedicated `BrowserProfiler` class
- Enjoy **persistent** sessions that reflect your real identity
- If you only need quick, ephemeral automation, **Magic Mode** might suffice

**Recommended**: Always prefer a **Managed Browser** for robust, identity-based crawling and simpler interactions with complex sites. Use **Magic Mode** for quick tasks or prototypes where persistent data is unnecessary.
@@ -71,7 +71,8 @@ We group them by category.

| **`word_count_threshold`** | `int` (default: ~200) | Skips text blocks below X words. Helps ignore trivial sections. |
| **`extraction_strategy`** | `ExtractionStrategy` (default: None) | If set, extracts structured data (CSS-based, LLM-based, etc.). |
| **`markdown_generator`** | `MarkdownGenerationStrategy` (None) | If you want specialized markdown output (citations, filtering, chunking, etc.). |
| **`css_selector`** | `str` (None) | Retains only the part of the page matching this selector. Affects the entire extraction process. |
| **`target_elements`** | `List[str]` (None) | List of CSS selectors for elements to focus on for markdown generation and data extraction, while still processing the entire page for links, media, etc. Provides more flexibility than `css_selector`. |
| **`excluded_tags`** | `list` (None) | Removes entire tags (e.g. `["script", "style"]`). |
| **`excluded_selector`** | `str` (None) | Like `css_selector` but to exclude. E.g. `"#ads, .tracker"`. |
| **`only_text`** | `bool` (False) | If `True`, tries to extract text-only content. |
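
To make the table concrete, here is a small, hedged sketch combining several of these fields on `CrawlerRunConfig` (the selector strings and threshold value are illustrative, not recommendations):

```python
from crawl4ai import CrawlerRunConfig

# Illustrative values only: tune selectors and thresholds for your target site
run_config = CrawlerRunConfig(
    word_count_threshold=10,                       # skip very short text blocks
    target_elements=["article.main", ".sidebar"],  # focus markdown/extraction here
    excluded_tags=["script", "style", "nav"],      # drop these tags entirely
    excluded_selector="#ads, .tracker",            # exclude ad/tracker containers
    only_text=False,                               # keep structural HTML in cleanup
)
```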

@@ -246,8 +247,8 @@ run_config = CrawlerRunConfig(
)
```

# 3. **LLMConfig** - Setting up LLM providers

LLMConfig is useful to pass LLM provider config to strategies and functions that rely on LLMs for extraction, filtering, schema generation, etc. Currently it can be used in the following:

1. LLMExtractionStrategy
2. LLMContentFilter

@@ -263,7 +264,7 @@ LlmConfig is useful to pass LLM provider config to strategies and functions that

## 3.2 Example Usage
```python
llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
```

## 4. Putting It All Together

@@ -271,7 +272,7 @@ llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI

- **Use** `BrowserConfig` for **global** browser settings: engine, headless, proxy, user agent.
- **Use** `CrawlerRunConfig` for each crawl’s **context**: how to filter content, handle caching, wait for dynamic elements, or run JS.
- **Pass** both configs to `AsyncWebCrawler` (the `BrowserConfig`) and then to `arun()` (the `CrawlerRunConfig`).
- **Use** `LLMConfig` for LLM provider configurations that can be used across all extraction, filtering, and schema generation tasks. Can be used in `LLMExtractionStrategy`, `LLMContentFilter`, `JsonCssExtractionStrategy.generate_schema` & `JsonXPathExtractionStrategy.generate_schema`.

```python
# Create a modified copy with the clone() method
@@ -131,7 +131,7 @@ OverlappingWindowChunking(

```python
from pydantic import BaseModel
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from crawl4ai import LLMConfig

# Define schema
class Article(BaseModel):
@@ -141,7 +141,7 @@ class Article(BaseModel):

# Create strategy
strategy = LLMExtractionStrategy(
    llm_config=LLMConfig(provider="ollama/llama2"),
    schema=Article.schema(),
    instruction="Extract article details"
)
@@ -198,7 +198,7 @@ result = await crawler.arun(

```python
from crawl4ai.chunking_strategy import OverlappingWindowChunking
from crawl4ai import LLMConfig

# Create chunking strategy
chunker = OverlappingWindowChunking(
@@ -208,7 +208,7 @@ chunker = OverlappingWindowChunking(

# Use with extraction strategy
strategy = LLMExtractionStrategy(
    llm_config=LLMConfig(provider="ollama/llama2"),
    chunking_strategy=chunker
)

@@ -16,7 +16,7 @@ My dear friends and crawlers, there you go, this is the release of Crawl4AI v0.5

* **Multiple Crawler Strategies:** Choose between the full-featured Playwright browser-based crawler or a new, *much* faster HTTP-only crawler for simpler tasks.
* **Docker Deployment:** Deploy Crawl4AI as a scalable, self-contained service with built-in API endpoints and optional JWT authentication.
* **Command-Line Interface (CLI):** Interact with Crawl4AI directly from your terminal. Crawl, configure, and extract data with simple commands.
* **LLM Configuration (`LLMConfig`):** A new, unified way to configure LLM providers (OpenAI, Anthropic, Ollama, etc.) for extraction, filtering, and schema generation. Simplifies API key management and switching between models.

**Minor Updates & Improvements:**

@@ -47,7 +47,7 @@ This release includes several breaking changes to improve the library's structur

* **Config**: `FastFilterChain` has been replaced with `FilterChain`.
* **Deep-Crawl**: `DeepCrawlStrategy.arun` now returns `Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]`.
* **Proxy**: Removed synchronous `WebCrawler` support and related rate-limiting configurations.
* **LLM Parameters:** Use the new `LLMConfig` object instead of passing `provider`, `api_token`, `base_url`, and `api_base` directly to `LLMExtractionStrategy` and `LLMContentFilter` (a rough migration sketch follows).
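
A rough migration sketch (values are placeholders; adapt to your provider) of how the removed keyword arguments map onto the new object:

```python
from crawl4ai import LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy

# Before (pre-0.5, no longer accepted):
# strategy = LLMExtractionStrategy(provider="openai/gpt-4o-mini",
#                                  api_token="sk-...", base_url=None)

# After (0.5+): bundle provider details into one reusable object
llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token="sk-...")
strategy = LLMExtractionStrategy(llm_config=llm_config, instruction="Extract key facts")
```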

**In short:** Update imports, adjust `arun_many()` usage, check for optional fields, and review the Docker deployment guide.

@@ -251,7 +251,7 @@ from crawl4ai import (
    RoundRobinProxyStrategy,
)
import asyncio
from crawl4ai.proxy_strategy import ProxyConfig

async def main():
    # Load proxies and create rotation strategy
    proxies = ProxyConfig.from_env()
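
The rest of this example is elided by the diff; the following hedged sketch shows one way the loaded proxies might be wired into a crawl. The `proxy_rotation_strategy` field name on `CrawlerRunConfig` is an assumption here, not something this diff confirms:

```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, RoundRobinProxyStrategy
from crawl4ai.proxy_strategy import ProxyConfig

async def crawl_through_proxies(url: str):
    proxies = ProxyConfig.from_env()             # load proxy definitions from env vars
    rotation = RoundRobinProxyStrategy(proxies)  # cycle through the pool per request

    # NOTE: the exact CrawlerRunConfig field name is assumed, verify against your version
    run_config = CrawlerRunConfig(proxy_rotation_strategy=rotation)
    async with AsyncWebCrawler() as crawler:
        return await crawler.arun(url, config=run_config)
```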

@@ -305,13 +305,13 @@ asyncio.run(main())

```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import LLMContentFilter
from crawl4ai import LLMConfig
import asyncio

llm_config = LLMConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY")

markdown_generator = DefaultMarkdownGenerator(
    content_filter=LLMContentFilter(llm_config=llm_config, instruction="Extract key concepts and summaries")
)

config = CrawlerRunConfig(markdown_generator=markdown_generator)
@@ -335,13 +335,13 @@ asyncio.run(main())

```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import LLMConfig

llm_config = LLMConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY")

schema = JsonCssExtractionStrategy.generate_schema(
    html="<div class='product'><h2>Product Name</h2><span class='price'>$99</span></div>",
    llm_config=llm_config,
    query="Extract product name and price"
)
print(schema)
@@ -394,20 +394,20 @@ print(schema)
serialization, especially for sets of allowed/blocked domains. No code changes
required.

- **Added: New `LLMConfig` parameter.** This new parameter can be passed for
extraction, filtering, and schema generation tasks. It simplifies passing
provider strings, API tokens, and base URLs across all sections where LLM
configuration is necessary. It also enables reuse and allows for quick
experimentation between different LLM configurations.

```python
from crawl4ai import LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

# Example of using LLMConfig with LLMExtractionStrategy
llm_config = LLMConfig(provider="openai/gpt-4o", api_token="YOUR_API_KEY")
strategy = LLMExtractionStrategy(llm_config=llm_config, schema=...)

# Example usage within a crawler
async with AsyncWebCrawler() as crawler:
@@ -418,7 +418,7 @@ print(schema)
```
**Breaking Change:** Removed old parameters like `provider`, `api_token`,
`base_url`, and `api_base` from `LLMExtractionStrategy` and
`LLMContentFilter`. Users should migrate to using the `LLMConfig` object.

- **Changed:** Improved browser context management and added shared data support
(**Breaking Change:** `BrowserContext` API updated). Browser contexts are now

@@ -4,7 +4,7 @@ Crawl4AI’s flexibility stems from two key classes:

1. **`BrowserConfig`** – Dictates **how** the browser is launched and behaves (e.g., headless or visible, proxy, user agent).
2. **`CrawlerRunConfig`** – Dictates **how** each **crawl** operates (e.g., caching, extraction, timeouts, JavaScript code to run, etc.).
3. **`LLMConfig`** – Dictates **how** LLM providers are configured (model, API token, base URL, temperature, etc.).

In most examples, you create **one** `BrowserConfig` for the entire crawler session, then pass a **fresh** or re-used `CrawlerRunConfig` whenever you call `arun()` (a minimal sketch follows). This tutorial shows the most commonly used parameters. If you need advanced or rarely used fields, see the [Configuration Parameters](../api/parameters.md).
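
A minimal sketch of that pattern (URLs and the threshold value are placeholders):

```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig

async def main():
    browser_cfg = BrowserConfig(headless=True)  # one config for the whole session
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        for url in ["https://example.com/a", "https://example.com/b"]:
            run_cfg = CrawlerRunConfig(word_count_threshold=10)  # fresh config per call
            result = await crawler.arun(url, config=run_cfg)
            print(url, result.success)

asyncio.run(main())
```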

@@ -239,7 +239,7 @@ The `clone()` method:

## 3. LLMConfig Essentials

### Key fields to note
@@ -256,16 +256,16 @@ The `clone()` method:
- If your provider has a custom endpoint

```python
llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
```

## 4. Putting It All Together

In a typical scenario, you define **one** `BrowserConfig` for your crawler session, then create **one or more** `CrawlerRunConfig` & `LLMConfig` depending on each call’s needs:

```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

async def main():
@@ -289,14 +289,14 @@ async def main():

    # 3) Example LLM content filtering

    gemini_config = LLMConfig(
        provider="gemini/gemini-1.5-pro",
        api_token="env:GEMINI_API_TOKEN"
    )

    # Initialize LLM filter with specific instruction
    filter = LLMContentFilter(
        llm_config=gemini_config,  # or your preferred provider
        instruction="""
        Focus on extracting the core educational content.
        Include:
@@ -343,7 +343,7 @@ if __name__ == "__main__":

For a **detailed list** of available parameters (including advanced ones), see:

- [BrowserConfig, CrawlerRunConfig & LLMConfig Reference](../api/parameters.md)

You can explore topics like:

@@ -356,7 +356,7 @@ You can explore topics like:

## 6. Conclusion

**BrowserConfig**, **CrawlerRunConfig** and **LLMConfig** give you straightforward ways to define:

- **Which** browser to launch, how it should run, and any proxy or user agent needs.
- **How** each crawl should behave—caching, timeouts, JavaScript code, extraction strategies, etc.

@@ -8,6 +8,10 @@ Below, we show how to configure these parameters and combine them for precise co

## 1. CSS-Based Selection

There are two ways to select content from a page: using `css_selector` or the more flexible `target_elements`.

### 1.1 Using `css_selector`

A straightforward way to **limit** your crawl results to a certain region of the page is **`css_selector`** in **`CrawlerRunConfig`**:

```python
@@ -32,6 +36,33 @@ if __name__ == "__main__":

**Result**: Only elements matching that selector remain in `result.cleaned_html`.

### 1.2 Using `target_elements`

The `target_elements` parameter provides more flexibility by allowing you to target **multiple elements** for content extraction while preserving the entire page context for other features:

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async def main():
    config = CrawlerRunConfig(
        # Target article body and sidebar, but not other content
        target_elements=["article.main-content", "aside.sidebar"]
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://example.com/blog-post",
            config=config
        )
        print("Markdown focused on target elements")
        print("Links from entire page still available:", len(result.links.get("internal", [])))

if __name__ == "__main__":
    asyncio.run(main())
```

**Key difference**: With `target_elements`, the markdown generation and structural data extraction focus on those elements, but other page elements (like links, images, and tables) are still extracted from the entire page. This gives you fine-grained control over what appears in your markdown content while preserving full page context for link analysis and media collection.

---

## 2. Content Filtering & Exclusions
@@ -211,7 +242,7 @@ if __name__ == "__main__":
import asyncio
import json
from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy

class ArticleData(BaseModel):
@@ -220,7 +251,7 @@ class ArticleData(BaseModel):

async def main():
    llm_strategy = LLMExtractionStrategy(
        llm_config=LLMConfig(provider="openai/gpt-4", api_token="sk-YOUR_API_KEY"),
        schema=ArticleData.schema(),
        extraction_type="schema",
        instruction="Extract 'headline' and a short 'summary' from the content."
@@ -404,15 +435,59 @@ Stick to BeautifulSoup strategy (default) when:

---

## 7. Combining CSS Selection Methods

You can combine `css_selector` and `target_elements` in powerful ways to achieve fine-grained control over your output:

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode

async def main():
    # Target specific content but preserve page context
    config = CrawlerRunConfig(
        # Focus markdown on main content and sidebar
        target_elements=["#main-content", ".sidebar"],

        # Global filters applied to entire page
        excluded_tags=["nav", "footer", "header"],
        exclude_external_links=True,

        # Use basic content thresholds
        word_count_threshold=15,

        cache_mode=CacheMode.BYPASS
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://example.com/article",
            config=config
        )

        print("Content focuses on specific elements, but all links are still analyzed")
        print(f"Internal links: {len(result.links.get('internal', []))}")
        print(f"External links: {len(result.links.get('external', []))}")

if __name__ == "__main__":
    asyncio.run(main())
```

This approach gives you the best of both worlds:

- Markdown generation and content extraction focus on the elements you care about
- Links, images, and other page data still give you the full context of the page
- Content filtering still applies globally

## 8. Conclusion

By mixing **target_elements** or **css_selector** scoping, **content filtering** parameters, and advanced **extraction strategies**, you can precisely **choose** which data to keep. Key parameters in **`CrawlerRunConfig`** for content selection include:

1. **`target_elements`** – Array of CSS selectors to focus markdown generation and data extraction on, while preserving full page context for links and media.
2. **`css_selector`** – Basic scoping to an element or region for all extraction processes.
3. **`word_count_threshold`** – Skip short blocks.
4. **`excluded_tags`** – Remove entire HTML tags.
5. **`exclude_external_links`**, **`exclude_social_media_links`**, **`exclude_domains`** – Filter out unwanted links or domains.
6. **`exclude_external_images`** – Remove images from external sources.
7. **`process_iframes`** – Merge iframe content if needed.

Combine these with structured extraction (CSS, LLM-based, or others) to build powerful crawls that yield exactly the content you want, from raw or cleaned HTML up to sophisticated JSON structures. For more detail, see the [Configuration Reference](../api/parameters.md). Enjoy curating your data to the max!
@@ -73,12 +73,18 @@ from crawl4ai.deep_crawling import BFSDeepCrawlStrategy

strategy = BFSDeepCrawlStrategy(
    max_depth=2,             # Crawl initial page + 2 levels deep
    include_external=False,  # Stay within the same domain
    max_pages=50,            # Maximum number of pages to crawl (optional)
    score_threshold=0.3,     # Minimum score for URLs to be crawled (optional)
)
```

**Key parameters:**
- **`max_depth`**: Number of levels to crawl beyond the starting page
- **`include_external`**: Whether to follow links to other domains
- **`max_pages`**: Maximum number of pages to crawl (default: infinite)
- **`score_threshold`**: Minimum score for URLs to be crawled (default: -inf)
- **`filter_chain`**: FilterChain instance for URL filtering
- **`url_scorer`**: Scorer instance for evaluating URLs (see the sketch after this list)
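
A hedged sketch of those last two parameters in use; the filter and scorer import paths and the `URLPatternFilter` class are taken from the broader Crawl4AI docs rather than this diff, so verify them against your installed version:

```python
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
# NOTE: these import paths and filter names are assumptions; check your version
from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer

strategy = BFSDeepCrawlStrategy(
    max_depth=2,
    include_external=False,
    max_pages=50,
    # Only follow documentation-like URLs
    filter_chain=FilterChain([URLPatternFilter(patterns=["*docs*", "*guide*"])]),
    # Prefer pages whose URLs mention these keywords
    url_scorer=KeywordRelevanceScorer(keywords=["api", "reference"], weight=1.0),
)
```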

### 2.2 DFSDeepCrawlStrategy (Depth-First Search)

@@ -91,12 +97,18 @@ from crawl4ai.deep_crawling import DFSDeepCrawlStrategy

strategy = DFSDeepCrawlStrategy(
    max_depth=2,             # Crawl initial page + 2 levels deep
    include_external=False,  # Stay within the same domain
    max_pages=30,            # Maximum number of pages to crawl (optional)
    score_threshold=0.5,     # Minimum score for URLs to be crawled (optional)
)
```

**Key parameters:**
- **`max_depth`**: Number of levels to crawl beyond the starting page
- **`include_external`**: Whether to follow links to other domains
- **`max_pages`**: Maximum number of pages to crawl (default: infinite)
- **`score_threshold`**: Minimum score for URLs to be crawled (default: -inf)
- **`filter_chain`**: FilterChain instance for URL filtering
- **`url_scorer`**: Scorer instance for evaluating URLs

### 2.3 BestFirstCrawlingStrategy (⭐️ - Recommended Deep crawl strategy)

@@ -116,7 +128,8 @@ scorer = KeywordRelevanceScorer(

strategy = BestFirstCrawlingStrategy(
    max_depth=2,
    include_external=False,
    url_scorer=scorer,
    max_pages=25,  # Maximum number of pages to crawl (optional)
)
```

@@ -124,6 +137,8 @@ This crawling approach:
- Evaluates each discovered URL based on scorer criteria
- Visits higher-scoring pages first
- Helps focus crawl resources on the most relevant content
- Can limit total pages crawled with `max_pages`
- Does not need `score_threshold`, as it naturally prioritizes by score (a usage sketch follows below)
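
Below is a short, hedged usage sketch that reuses the `strategy` object defined above; the `deep_crawl_strategy` field on `CrawlerRunConfig` and the list-shaped return value are assumptions drawn from the wider Crawl4AI docs, not from this diff:

```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async def run_deep_crawl(strategy):
    # NOTE: the deep_crawl_strategy field name is assumed from the wider docs
    config = CrawlerRunConfig(deep_crawl_strategy=strategy)
    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun("https://example.com/docs", config=config)
        for result in results:
            print(result.url, result.success)

asyncio.run(run_deep_crawl(strategy))
```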

---

@@ -410,27 +425,64 @@ if __name__ == "__main__":

---

## 8. Limiting and Controlling Crawl Size

### 8.1 Using max_pages

You can limit the total number of pages crawled with the `max_pages` parameter:

```python
# Limit to exactly 20 pages regardless of depth
strategy = BFSDeepCrawlStrategy(
    max_depth=3,
    max_pages=20
)
```

This feature is useful for:
- Controlling API costs
- Setting predictable execution times
- Focusing on the most important content
- Testing crawl configurations before full execution

### 8.2 Using score_threshold

For BFS and DFS strategies, you can set a minimum score threshold to only crawl high-quality pages:

```python
# Only follow links with scores above 0.4
strategy = DFSDeepCrawlStrategy(
    max_depth=2,
    url_scorer=KeywordRelevanceScorer(keywords=["api", "guide", "reference"]),
    score_threshold=0.4  # Skip URLs with scores below this value
)
```

Note that for BestFirstCrawlingStrategy, `score_threshold` is not needed, since pages are already processed in order of highest score first.

## 9. Common Pitfalls & Tips

1. **Set realistic limits.** Be cautious with `max_depth` values > 3, which can exponentially increase crawl size. Use `max_pages` to set hard limits.

2. **Don't neglect the scoring component.** BestFirstCrawling works best with well-tuned scorers. Experiment with keyword weights for optimal prioritization.

3. **Be a good web citizen.** Respect `robots.txt` (checking is disabled by default).

4. **Handle page errors gracefully.** Not all pages will be accessible. Check `result.success` and `result.error_message` when processing results (see the sketch after this list).

5. **Balance breadth vs. depth.** Choose your strategy wisely: BFS for comprehensive coverage, DFS for deep exploration, BestFirst for focused relevance-based crawling.
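
For tip 4, a minimal sketch of that check (assuming `results` is a list of `CrawlResult` objects from a non-streaming deep crawl):

```python
# results: list of CrawlResult objects from a deep crawl
for result in results:
    if result.success:
        print(f"OK    {result.url}")
    else:
        print(f"FAIL  {result.url}: {result.error_message}")
```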

---

## 10. Summary & Next Steps

In this **Deep Crawling with Crawl4AI** tutorial, you learned to:

- Configure **BFSDeepCrawlStrategy**, **DFSDeepCrawlStrategy**, and **BestFirstCrawlingStrategy**
- Process results in streaming or non-streaming mode
- Apply filters to target specific content
- Use scorers to prioritize the most relevant pages
- Limit crawls with `max_pages` and `score_threshold` parameters
- Build a complete advanced crawler with combined techniques

With these tools, you can efficiently extract structured data from websites at scale, focusing precisely on the content you need for your specific use case.
@@ -133,19 +133,28 @@ This approach is handy when you still want external links but need to block cert

### 3.1 Accessing `result.media`

By default, Crawl4AI collects images, audio, video URLs, and data tables it finds on the page. These are stored in `result.media`, a dictionary keyed by media type (e.g., `images`, `videos`, `audio`, `tables`).

**Basic Example**:

```python
if result.success:
    # Get images
    images_info = result.media.get("images", [])
    print(f"Found {len(images_info)} images in total.")
    for i, img in enumerate(images_info[:3]):  # Inspect just the first 3
        print(f"[Image {i}] URL: {img['src']}")
        print(f"  Alt text: {img.get('alt', '')}")
        print(f"  Score: {img.get('score')}")
        print(f"  Description: {img.get('desc', '')}\n")

    # Get tables
    tables = result.media.get("tables", [])
    print(f"Found {len(tables)} data tables in total.")
    for i, table in enumerate(tables):
        print(f"[Table {i}] Caption: {table.get('caption', 'No caption')}")
        print(f"  Columns: {len(table.get('headers', []))}")
        print(f"  Rows: {len(table.get('rows', []))}")
```

**Structure Example**:
@@ -171,6 +180,19 @@ result.media = {
    ],
    "audio": [
        # Similar structure but with audio-specific fields
    ],
    "tables": [
        {
            "headers": ["Name", "Age", "Location"],
            "rows": [
                ["John Doe", "34", "New York"],
                ["Jane Smith", "28", "San Francisco"],
                ["Alex Johnson", "42", "Chicago"]
            ],
            "caption": "Employee Directory",
            "summary": "Directory of company employees"
        },
        # More tables if present
    ]
}
```
@@ -199,7 +221,53 @@ crawler_cfg = CrawlerRunConfig(

This setting attempts to discard images from outside the primary domain, keeping only those from the site you’re crawling. A minimal sketch of this flag follows.
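
The configuration lines themselves are elided by this hunk; a minimal sketch of the flag in context (other fields omitted) would look like:

```python
from crawl4ai import CrawlerRunConfig

crawler_cfg = CrawlerRunConfig(
    exclude_external_images=True,  # drop images hosted outside the crawled domain
)
```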

### 3.3 Working with Tables

Crawl4AI can detect and extract structured data from HTML tables. Tables are analyzed based on various criteria to determine whether they are actual data tables (as opposed to layout tables), including:

- Presence of `<thead>` and `<tbody>` sections
- Use of `<th>` elements for headers
- Column consistency
- Text density
- And other factors

Tables that score above the threshold (default: 7) are extracted and stored in `result.media["tables"]`.

**Accessing Table Data**:

```python
if result.success:
    tables = result.media.get("tables", [])
    print(f"Found {len(tables)} data tables on the page")

    if tables:
        # Access the first table
        first_table = tables[0]
        print(f"Table caption: {first_table.get('caption', 'No caption')}")
        print(f"Headers: {first_table.get('headers', [])}")

        # Print the first 3 rows
        for i, row in enumerate(first_table.get('rows', [])[:3]):
            print(f"Row {i+1}: {row}")
```

**Configuring Table Extraction**:

You can adjust the sensitivity of the table detection algorithm with:

```python
crawler_cfg = CrawlerRunConfig(
    table_score_threshold=5  # Lower value = more tables detected (default: 7)
)
```

Each extracted table contains the following fields (a small post-processing sketch follows the list):
- `headers`: Column header names
- `rows`: List of rows, each containing cell values
- `caption`: Table caption text (if available)
- `summary`: Table summary attribute (if specified)
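
As a small post-processing sketch (assuming `pandas` is installed; Crawl4AI itself does not require it), the headers/rows pairs convert directly into a DataFrame:

```python
import pandas as pd

tables = result.media.get("tables", [])
if tables:
    first = tables[0]
    # Build a DataFrame from the extracted headers and rows
    df = pd.DataFrame(first["rows"], columns=first["headers"])
    print(df.head())
```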

### 3.4 Additional Media Config

- **`screenshot`**: Set to `True` if you want a full-page screenshot stored as `base64` in `result.screenshot`.
- **`pdf`**: Set to `True` if you want a PDF version of the page in `result.pdf`.
@@ -273,4 +341,11 @@ if __name__ == "__main__":

---

**That’s it for Link & Media Analysis!** You’re now equipped to filter out unwanted sites and zero in on the images and videos that matter for your project.

### Table Extraction Tips

- Not all HTML tables are extracted - only those detected as "data tables" vs. layout tables.
- Tables with inconsistent cell counts, nested tables, or those used purely for layout may be skipped.
- If you're missing tables, try adjusting the `table_score_threshold` to a lower value (default is 7).

The table detection algorithm scores tables based on features like consistent columns, presence of headers, text density, and more. Tables scoring above the threshold are considered data tables worth extracting.

@@ -175,13 +175,13 @@ prune_filter = PruningContentFilter(

For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:

```python
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
from crawl4ai.content_filter_strategy import LLMContentFilter

async def main():
    # Initialize LLM filter with specific instruction
    filter = LLMContentFilter(
        llm_config=LLMConfig(provider="openai/gpt-4o", api_token="your-api-token"),  # or use an environment variable
        instruction="""
        Focus on extracting the core educational content.
        Include:

@@ -128,7 +128,7 @@ Crawl4AI can also extract structured data (JSON) using CSS or XPath selectors. B

```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai import LLMConfig

# Generate a schema (one-time cost)
html = "<div class='product'><h2>Gaming Laptop</h2><span class='price'>$999.99</span></div>"
@@ -136,13 +136,13 @@ html = "<div class='product'><h2>Gaming Laptop</h2><span class='price'>$999.99</

# Using OpenAI (requires API token)
schema = JsonCssExtractionStrategy.generate_schema(
    html,
    llm_config=LLMConfig(provider="openai/gpt-4o", api_token="your-openai-token")  # Required for OpenAI
)

# Or using Ollama (open source, no token needed)
schema = JsonCssExtractionStrategy.generate_schema(
    html,
    llm_config=LLMConfig(provider="ollama/llama3.3", api_token=None)  # Not needed for Ollama
)

# Use the schema for fast, repeated extractions
@@ -211,7 +211,7 @@ import os
import json
import asyncio
from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy

class OpenAIModelFee(BaseModel):
@@ -241,7 +241,7 @@ async def extract_structured_data_using_llm(
        word_count_threshold=1,
        page_timeout=80000,
        extraction_strategy=LLMExtractionStrategy(
            llm_config=LLMConfig(provider=provider, api_token=api_token),
            schema=OpenAIModelFee.model_json_schema(),
            extraction_type="schema",
            instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.

@@ -76,7 +76,7 @@ Below is an overview of important LLM extraction parameters. All are typically s

```python
extraction_strategy = LLMExtractionStrategy(
    llm_config=LLMConfig(provider="openai/gpt-4", api_token="YOUR_OPENAI_KEY"),
    schema=MyModel.model_json_schema(),
    extraction_type="schema",
    instruction="Extract a list of items from the text with 'name' and 'price' fields.",
@@ -101,7 +101,7 @@ import asyncio
import json
from pydantic import BaseModel, Field
from typing import List
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy

class Product(BaseModel):
@@ -111,7 +111,7 @@ class Product(BaseModel):
async def main():
    # 1. Define the LLM extraction strategy
    llm_strategy = LLMExtractionStrategy(
        llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv('OPENAI_API_KEY')),
        schema=Product.schema_json(),  # Or use model_json_schema()
        extraction_type="schema",
        instruction="Extract all product objects with 'name' and 'price' from the content.",
@@ -415,7 +415,7 @@ The schema generator is available as a static method on both `JsonCssExtractionS

```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
from crawl4ai import LLMConfig

# Sample HTML with product information
html = """
@@ -435,14 +435,14 @@ html = """
css_schema = JsonCssExtractionStrategy.generate_schema(
    html,
    schema_type="css",
    llm_config=LLMConfig(provider="openai/gpt-4o", api_token="your-openai-token")
)

# Option 2: Using Ollama (open source, no token needed)
xpath_schema = JsonXPathExtractionStrategy.generate_schema(
    html,
    schema_type="xpath",
    llm_config=LLMConfig(provider="ollama/llama3.3", api_token=None)  # Not needed for Ollama
)

# Use the generated schema for fast, repeated extractions