refactor(docs): reorganize documentation structure and update styles
Reorganize documentation into core/advanced/extraction sections for better navigation. Update terminal theme styles and add rich library for better CLI output. Remove redundant tutorial files and consolidate content into core sections. Add personal story to index page for project context. BREAKING CHANGE: Documentation structure has been significantly reorganized
This commit is contained in:
@@ -1,208 +0,0 @@
|
||||
# Browser Configuration
|
||||
|
||||
Crawl4AI supports multiple browser engines and offers extensive configuration options for browser behavior.
|
||||
|
||||
## Browser Types
|
||||
|
||||
Choose from three browser engines:
|
||||
|
||||
```python
|
||||
# Chromium (default)
|
||||
async with AsyncWebCrawler(browser_type="chromium") as crawler:
|
||||
result = await crawler.arun(url="https://example.com")
|
||||
|
||||
# Firefox
|
||||
async with AsyncWebCrawler(browser_type="firefox") as crawler:
|
||||
result = await crawler.arun(url="https://example.com")
|
||||
|
||||
# WebKit
|
||||
async with AsyncWebCrawler(browser_type="webkit") as crawler:
|
||||
result = await crawler.arun(url="https://example.com")
|
||||
```
|
||||
|
||||
## Basic Configuration
|
||||
|
||||
Common browser settings:
|
||||
|
||||
```python
|
||||
async with AsyncWebCrawler(
|
||||
headless=True, # Run in headless mode (no GUI)
|
||||
verbose=True, # Enable detailed logging
|
||||
sleep_on_close=False # No delay when closing browser
|
||||
) as crawler:
|
||||
result = await crawler.arun(url="https://example.com")
|
||||
```
|
||||
|
||||
## Identity Management
|
||||
|
||||
Control how your crawler appears to websites:
|
||||
|
||||
```python
|
||||
# Custom user agent
|
||||
async with AsyncWebCrawler(
|
||||
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
||||
) as crawler:
|
||||
result = await crawler.arun(url="https://example.com")
|
||||
|
||||
# Custom headers
|
||||
headers = {
|
||||
"Accept-Language": "en-US,en;q=0.9",
|
||||
"Cache-Control": "no-cache"
|
||||
}
|
||||
async with AsyncWebCrawler(headers=headers) as crawler:
|
||||
result = await crawler.arun(url="https://example.com")
|
||||
```
|
||||
|
||||
## Screenshot Capabilities
|
||||
|
||||
Capture page screenshots with enhanced error handling:
|
||||
|
||||
```python
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
screenshot=True, # Enable screenshot
|
||||
screenshot_wait_for=2.0 # Wait 2 seconds before capture
|
||||
)
|
||||
|
||||
if result.screenshot: # Base64 encoded image
|
||||
import base64
|
||||
with open("screenshot.png", "wb") as f:
|
||||
f.write(base64.b64decode(result.screenshot))
|
||||
```
|
||||
|
||||
## Timeouts and Waiting
|
||||
|
||||
Control page loading behavior:
|
||||
|
||||
```python
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
page_timeout=60000, # Page load timeout (ms)
|
||||
delay_before_return_html=2.0, # Wait before content capture
|
||||
wait_for="css:.dynamic-content" # Wait for specific element
|
||||
)
|
||||
```
|
||||
|
||||
## JavaScript Execution
|
||||
|
||||
Execute custom JavaScript before crawling:
|
||||
|
||||
```python
|
||||
# Single JavaScript command
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
js_code="window.scrollTo(0, document.body.scrollHeight);"
|
||||
)
|
||||
|
||||
# Multiple commands
|
||||
js_commands = [
|
||||
"window.scrollTo(0, document.body.scrollHeight);",
|
||||
"document.querySelector('.load-more').click();"
|
||||
]
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
js_code=js_commands
|
||||
)
|
||||
```
|
||||
|
||||
## Proxy Configuration
|
||||
|
||||
Use proxies for enhanced access:
|
||||
|
||||
```python
|
||||
# Simple proxy
|
||||
async with AsyncWebCrawler(
|
||||
proxy="http://proxy.example.com:8080"
|
||||
) as crawler:
|
||||
result = await crawler.arun(url="https://example.com")
|
||||
|
||||
# Proxy with authentication
|
||||
proxy_config = {
|
||||
"server": "http://proxy.example.com:8080",
|
||||
"username": "user",
|
||||
"password": "pass"
|
||||
}
|
||||
async with AsyncWebCrawler(proxy_config=proxy_config) as crawler:
|
||||
result = await crawler.arun(url="https://example.com")
|
||||
```
|
||||
|
||||
## Anti-Detection Features
|
||||
|
||||
Enable stealth features to avoid bot detection:
|
||||
|
||||
```python
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
simulate_user=True, # Simulate human behavior
|
||||
override_navigator=True, # Mask automation signals
|
||||
magic=True # Enable all anti-detection features
|
||||
)
|
||||
```
|
||||
|
||||
## Handling Dynamic Content
|
||||
|
||||
Configure browser to handle dynamic content:
|
||||
|
||||
```python
|
||||
# Wait for dynamic content
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
wait_for="js:() => document.querySelector('.content').children.length > 10",
|
||||
process_iframes=True # Process iframe content
|
||||
)
|
||||
|
||||
# Handle lazy-loaded images
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
js_code="window.scrollTo(0, document.body.scrollHeight);",
|
||||
delay_before_return_html=2.0 # Wait for images to load
|
||||
)
|
||||
```
|
||||
|
||||
## Comprehensive Example
|
||||
|
||||
Here's how to combine various browser configurations:
|
||||
|
||||
```python
|
||||
async def crawl_with_advanced_config(url: str):
|
||||
async with AsyncWebCrawler(
|
||||
# Browser setup
|
||||
browser_type="chromium",
|
||||
headless=True,
|
||||
verbose=True,
|
||||
|
||||
# Identity
|
||||
user_agent="Custom User Agent",
|
||||
headers={"Accept-Language": "en-US"},
|
||||
|
||||
# Proxy setup
|
||||
proxy="http://proxy.example.com:8080"
|
||||
) as crawler:
|
||||
result = await crawler.arun(
|
||||
url=url,
|
||||
# Content handling
|
||||
process_iframes=True,
|
||||
screenshot=True,
|
||||
|
||||
# Timing
|
||||
page_timeout=60000,
|
||||
delay_before_return_html=2.0,
|
||||
|
||||
# Anti-detection
|
||||
magic=True,
|
||||
simulate_user=True,
|
||||
|
||||
# Dynamic content
|
||||
js_code=[
|
||||
"window.scrollTo(0, document.body.scrollHeight);",
|
||||
"document.querySelector('.load-more')?.click();"
|
||||
],
|
||||
wait_for="css:.dynamic-content"
|
||||
)
|
||||
|
||||
return {
|
||||
"content": result.markdown,
|
||||
"screenshot": result.screenshot,
|
||||
"success": result.success
|
||||
}
|
||||
```
|
||||
@@ -1,81 +0,0 @@
|
||||
# Crawl4AI Cache System and Migration Guide
|
||||
|
||||
## Overview
|
||||
Starting from version 0.5.0, Crawl4AI introduces a new caching system that replaces the old boolean flags with a more intuitive `CacheMode` enum. This change simplifies cache control and makes the behavior more predictable.
|
||||
|
||||
## Old vs New Approach
|
||||
|
||||
### Old Way (Deprecated)
|
||||
The old system used multiple boolean flags:
|
||||
- `bypass_cache`: Skip cache entirely
|
||||
- `disable_cache`: Disable all caching
|
||||
- `no_cache_read`: Don't read from cache
|
||||
- `no_cache_write`: Don't write to cache
|
||||
|
||||
### New Way (Recommended)
|
||||
The new system uses a single `CacheMode` enum:
|
||||
- `CacheMode.ENABLED`: Normal caching (read/write)
|
||||
- `CacheMode.DISABLED`: No caching at all
|
||||
- `CacheMode.READ_ONLY`: Only read from cache
|
||||
- `CacheMode.WRITE_ONLY`: Only write to cache
|
||||
- `CacheMode.BYPASS`: Skip cache for this operation
|
||||
|
||||
## Migration Example
|
||||
|
||||
### Old Code (Deprecated)
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
|
||||
async def use_proxy():
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://www.nbcnews.com/business",
|
||||
bypass_cache=True # Old way
|
||||
)
|
||||
print(len(result.markdown))
|
||||
|
||||
async def main():
|
||||
await use_proxy()
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
### New Code (Recommended)
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
|
||||
async def use_proxy():
|
||||
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) # Use CacheMode in CrawlerRunConfig
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://www.nbcnews.com/business",
|
||||
config=config # Pass the configuration object
|
||||
)
|
||||
print(len(result.markdown))
|
||||
|
||||
async def main():
|
||||
await use_proxy()
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
## Common Migration Patterns
|
||||
|
||||
| Old Flag | New Mode |
|
||||
|-----------------------|---------------------------------|
|
||||
| `bypass_cache=True` | `cache_mode=CacheMode.BYPASS` |
|
||||
| `disable_cache=True` | `cache_mode=CacheMode.DISABLED`|
|
||||
| `no_cache_read=True` | `cache_mode=CacheMode.WRITE_ONLY` |
|
||||
| `no_cache_write=True` | `cache_mode=CacheMode.READ_ONLY` |
|
||||
|
||||
## Suppressing Deprecation Warnings
|
||||
If you need time to migrate, you can temporarily suppress deprecation warnings:
|
||||
```python
|
||||
# In your config.py
|
||||
SHOW_DEPRECATION_WARNINGS = False
|
||||
```
|
||||
@@ -1,135 +0,0 @@
|
||||
### Content Selection
|
||||
|
||||
Crawl4AI provides multiple ways to select and filter specific content from webpages. Learn how to precisely target the content you need.
|
||||
|
||||
#### CSS Selectors
|
||||
|
||||
Extract specific content using a `CrawlerRunConfig` with CSS selectors:
|
||||
|
||||
```python
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
|
||||
config = CrawlerRunConfig(css_selector=".main-article") # Target main article content
|
||||
result = await crawler.arun(url="https://crawl4ai.com", config=config)
|
||||
|
||||
config = CrawlerRunConfig(css_selector="article h1, article .content") # Target heading and content
|
||||
result = await crawler.arun(url="https://crawl4ai.com", config=config)
|
||||
```
|
||||
|
||||
#### Content Filtering
|
||||
|
||||
Control content inclusion or exclusion with `CrawlerRunConfig`:
|
||||
|
||||
```python
|
||||
config = CrawlerRunConfig(
|
||||
word_count_threshold=10, # Minimum words per block
|
||||
excluded_tags=['form', 'header', 'footer', 'nav'], # Excluded tags
|
||||
exclude_external_links=True, # Remove external links
|
||||
exclude_social_media_links=True, # Remove social media links
|
||||
exclude_external_images=True # Remove external images
|
||||
)
|
||||
|
||||
result = await crawler.arun(url="https://crawl4ai.com", config=config)
|
||||
```
|
||||
|
||||
#### Iframe Content
|
||||
|
||||
Process iframe content by enabling specific options in `CrawlerRunConfig`:
|
||||
|
||||
```python
|
||||
config = CrawlerRunConfig(
|
||||
process_iframes=True, # Extract iframe content
|
||||
remove_overlay_elements=True # Remove popups/modals that might block iframes
|
||||
)
|
||||
|
||||
result = await crawler.arun(url="https://crawl4ai.com", config=config)
|
||||
```
|
||||
|
||||
#### Structured Content Selection Using LLMs
|
||||
|
||||
Leverage LLMs for intelligent content extraction:
|
||||
|
||||
```python
|
||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||
from pydantic import BaseModel
|
||||
from typing import List
|
||||
|
||||
class ArticleContent(BaseModel):
|
||||
title: str
|
||||
main_points: List[str]
|
||||
conclusion: str
|
||||
|
||||
strategy = LLMExtractionStrategy(
|
||||
provider="ollama/nemotron",
|
||||
schema=ArticleContent.schema(),
|
||||
instruction="Extract the main article title, key points, and conclusion"
|
||||
)
|
||||
|
||||
config = CrawlerRunConfig(extraction_strategy=strategy)
|
||||
|
||||
result = await crawler.arun(url="https://crawl4ai.com", config=config)
|
||||
article = json.loads(result.extracted_content)
|
||||
```
|
||||
|
||||
#### Pattern-Based Selection
|
||||
|
||||
Extract content matching repetitive patterns:
|
||||
|
||||
```python
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
|
||||
schema = {
|
||||
"name": "News Articles",
|
||||
"baseSelector": "article.news-item",
|
||||
"fields": [
|
||||
{"name": "headline", "selector": "h2", "type": "text"},
|
||||
{"name": "summary", "selector": ".summary", "type": "text"},
|
||||
{"name": "category", "selector": ".category", "type": "text"},
|
||||
{
|
||||
"name": "metadata",
|
||||
"type": "nested",
|
||||
"fields": [
|
||||
{"name": "author", "selector": ".author", "type": "text"},
|
||||
{"name": "date", "selector": ".date", "type": "text"}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
strategy = JsonCssExtractionStrategy(schema)
|
||||
config = CrawlerRunConfig(extraction_strategy=strategy)
|
||||
|
||||
result = await crawler.arun(url="https://crawl4ai.com", config=config)
|
||||
articles = json.loads(result.extracted_content)
|
||||
```
|
||||
|
||||
#### Comprehensive Example
|
||||
|
||||
Combine different selection methods using `CrawlerRunConfig`:
|
||||
|
||||
```python
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
|
||||
async def extract_article_content(url: str):
|
||||
# Define structured extraction
|
||||
article_schema = {
|
||||
"name": "Article",
|
||||
"baseSelector": "article.main",
|
||||
"fields": [
|
||||
{"name": "title", "selector": "h1", "type": "text"},
|
||||
{"name": "content", "selector": ".content", "type": "text"}
|
||||
]
|
||||
}
|
||||
|
||||
# Define configuration
|
||||
config = CrawlerRunConfig(
|
||||
extraction_strategy=JsonCssExtractionStrategy(article_schema),
|
||||
word_count_threshold=10,
|
||||
excluded_tags=['nav', 'footer'],
|
||||
exclude_external_links=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(url=url, config=config)
|
||||
return json.loads(result.extracted_content)
|
||||
```
|
||||
@@ -1,83 +0,0 @@
|
||||
# Content Filtering in Crawl4AI
|
||||
|
||||
This guide explains how to use content filtering strategies in Crawl4AI to extract the most relevant information from crawled web pages. You'll learn how to use the built-in `BM25ContentFilter` and how to create your own custom content filtering strategies.
|
||||
|
||||
## Relevance Content Filter
|
||||
|
||||
The `RelevanceContentFilter` is an abstract class providing a common interface for content filtering strategies. Specific algorithms, like `PruningContentFilter` or `BM25ContentFilter`, inherit from this class and implement the `filter_content` method. This method takes the HTML content as input and returns a list of filtered text blocks.
|
||||
|
||||
## Pruning Content Filter
|
||||
|
||||
The `PruningContentFilter` removes less relevant nodes based on metrics like text density, link density, and tag importance. Nodes that fall below a defined threshold are pruned, leaving only high-value content.
|
||||
|
||||
### Usage
|
||||
|
||||
```python
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
content_filter=PruningContentFilter(
|
||||
min_word_threshold=5,
|
||||
threshold_type='dynamic',
|
||||
threshold=0.45
|
||||
),
|
||||
fit_markdown=True # Activates markdown fitting
|
||||
)
|
||||
|
||||
result = await crawler.arun(url="https://example.com", config=config)
|
||||
|
||||
if result.success:
|
||||
print(f"Cleaned Markdown:\n{result.fit_markdown}")
|
||||
```
|
||||
|
||||
### Parameters
|
||||
|
||||
- **`min_word_threshold`**: (Optional) Minimum number of words a node must contain to be considered relevant. Nodes with fewer words are automatically pruned.
|
||||
- **`threshold_type`**: (Optional, default 'fixed') Controls how pruning thresholds are calculated:
|
||||
- `'fixed'`: Uses a constant threshold value for all nodes.
|
||||
- `'dynamic'`: Adjusts thresholds based on node properties (e.g., tag importance, text/link ratios).
|
||||
- **`threshold`**: (Optional, default 0.48) Base threshold for pruning:
|
||||
- Fixed: Nodes scoring below this value are removed.
|
||||
- Dynamic: This value adjusts based on node characteristics.
|
||||
|
||||
### How It Works
|
||||
|
||||
The algorithm evaluates each node using:
|
||||
- **Text density**: Ratio of text to overall content.
|
||||
- **Link density**: Proportion of text within links.
|
||||
- **Tag importance**: Weights based on HTML tag type (e.g., `<article>`, `<p>`, `<div>`).
|
||||
- **Content quality**: Metrics like text length and structural importance.
|
||||
|
||||
## BM25 Algorithm
|
||||
|
||||
The `BM25ContentFilter` uses the BM25 algorithm to rank and extract text chunks based on relevance to a search query or page metadata.
|
||||
|
||||
### Usage
|
||||
|
||||
```python
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
from crawl4ai.content_filter_strategy import BM25ContentFilter
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
content_filter=BM25ContentFilter(user_query="fruit nutrition health"),
|
||||
fit_markdown=True # Activates markdown fitting
|
||||
)
|
||||
|
||||
result = await crawler.arun(url="https://example.com", config=config)
|
||||
|
||||
if result.success:
|
||||
print(f"Filtered Content:\n{result.extracted_content}")
|
||||
print(f"\nFiltered Markdown:\n{result.fit_markdown}")
|
||||
print(f"\nFiltered HTML:\n{result.fit_html}")
|
||||
else:
|
||||
print("Error:", result.error_message)
|
||||
```
|
||||
|
||||
### Parameters
|
||||
|
||||
- **`user_query`**: (Optional) A string representing the search query. If not provided, the filter extracts metadata (title, description, keywords) and uses it as the query.
|
||||
- **`bm25_threshold`**: (Optional, default 1.0) Threshold controlling relevance:
|
||||
- Higher values return stricter, more relevant results.
|
||||
- Lower values include more lenient filtering.
|
||||
|
||||
@@ -1,702 +0,0 @@
|
||||
# Docker Deployment
|
||||
|
||||
Crawl4AI provides official Docker images for easy deployment and scalability. This guide covers installation, configuration, and usage of Crawl4AI in Docker environments.
|
||||
|
||||
## Quick Start 🚀
|
||||
|
||||
Pull and run the basic version:
|
||||
|
||||
```bash
|
||||
# Basic run without security
|
||||
docker pull unclecode/crawl4ai:basic
|
||||
docker run -p 11235:11235 unclecode/crawl4ai:basic
|
||||
|
||||
# Run with API security enabled
|
||||
docker run -p 11235:11235 -e CRAWL4AI_API_TOKEN=your_secret_token unclecode/crawl4ai:basic
|
||||
```
|
||||
|
||||
## Running with Docker Compose 🐳
|
||||
|
||||
### Use Docker Compose (From Local Dockerfile or Docker Hub)
|
||||
|
||||
Crawl4AI provides flexibility to use Docker Compose for managing your containerized services. You can either build the image locally from the provided `Dockerfile` or use the pre-built image from Docker Hub.
|
||||
|
||||
### **Option 1: Using Docker Compose to Build Locally**
|
||||
If you want to build the image locally, use the provided `docker-compose.local.yml` file.
|
||||
|
||||
```bash
|
||||
docker-compose -f docker-compose.local.yml up -d
|
||||
```
|
||||
|
||||
This will:
|
||||
1. Build the Docker image from the provided `Dockerfile`.
|
||||
2. Start the container and expose it on `http://localhost:11235`.
|
||||
|
||||
---
|
||||
|
||||
### **Option 2: Using Docker Compose with Pre-Built Image from Hub**
|
||||
If you prefer using the pre-built image on Docker Hub, use the `docker-compose.hub.yml` file.
|
||||
|
||||
```bash
|
||||
docker-compose -f docker-compose.hub.yml up -d
|
||||
```
|
||||
|
||||
This will:
|
||||
1. Pull the pre-built image `unclecode/crawl4ai:basic` (or `all`, depending on your configuration).
|
||||
2. Start the container and expose it on `http://localhost:11235`.
|
||||
|
||||
---
|
||||
|
||||
### **Stopping the Running Services**
|
||||
|
||||
To stop the services started via Docker Compose, you can use:
|
||||
|
||||
```bash
|
||||
docker-compose -f docker-compose.local.yml down
|
||||
# OR
|
||||
docker-compose -f docker-compose.hub.yml down
|
||||
```
|
||||
|
||||
If the containers don’t stop and the application is still running, check the running containers:
|
||||
|
||||
```bash
|
||||
docker ps
|
||||
```
|
||||
|
||||
Find the `CONTAINER ID` of the running service and stop it forcefully:
|
||||
|
||||
```bash
|
||||
docker stop <CONTAINER_ID>
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### **Debugging with Docker Compose**
|
||||
|
||||
- **Check Logs**: To view the container logs:
|
||||
```bash
|
||||
docker-compose -f docker-compose.local.yml logs -f
|
||||
```
|
||||
|
||||
- **Remove Orphaned Containers**: If the service is still running unexpectedly:
|
||||
```bash
|
||||
docker-compose -f docker-compose.local.yml down --remove-orphans
|
||||
```
|
||||
|
||||
- **Manually Remove Network**: If the network is still in use:
|
||||
```bash
|
||||
docker network ls
|
||||
docker network rm crawl4ai_default
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Why Use Docker Compose?
|
||||
|
||||
Docker Compose is the recommended way to deploy Crawl4AI because:
|
||||
1. It simplifies multi-container setups.
|
||||
2. Allows you to define environment variables, resources, and ports in a single file.
|
||||
3. Makes it easier to switch between local development and production-ready images.
|
||||
|
||||
For example, your `docker-compose.yml` could include API keys, token settings, and memory limits, making deployment quick and consistent.
|
||||
|
||||
|
||||
|
||||
|
||||
## API Security 🔒
|
||||
|
||||
### Understanding CRAWL4AI_API_TOKEN
|
||||
|
||||
The `CRAWL4AI_API_TOKEN` provides optional security for your Crawl4AI instance:
|
||||
|
||||
- If `CRAWL4AI_API_TOKEN` is set: All API endpoints (except `/health`) require authentication
|
||||
- If `CRAWL4AI_API_TOKEN` is not set: The API is publicly accessible
|
||||
|
||||
```bash
|
||||
# Secured Instance
|
||||
docker run -p 11235:11235 -e CRAWL4AI_API_TOKEN=your_secret_token unclecode/crawl4ai:all
|
||||
|
||||
# Unsecured Instance
|
||||
docker run -p 11235:11235 unclecode/crawl4ai:all
|
||||
```
|
||||
|
||||
### Making API Calls
|
||||
|
||||
For secured instances, include the token in all requests:
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
# Setup headers if token is being used
|
||||
api_token = "your_secret_token" # Same token set in CRAWL4AI_API_TOKEN
|
||||
headers = {"Authorization": f"Bearer {api_token}"} if api_token else {}
|
||||
|
||||
# Making authenticated requests
|
||||
response = requests.post(
|
||||
"http://localhost:11235/crawl",
|
||||
headers=headers,
|
||||
json={
|
||||
"urls": "https://example.com",
|
||||
"priority": 10
|
||||
}
|
||||
)
|
||||
|
||||
# Checking task status
|
||||
task_id = response.json()["task_id"]
|
||||
status = requests.get(
|
||||
f"http://localhost:11235/task/{task_id}",
|
||||
headers=headers
|
||||
)
|
||||
```
|
||||
|
||||
### Using with Docker Compose
|
||||
|
||||
In your `docker-compose.yml`:
|
||||
```yaml
|
||||
services:
|
||||
crawl4ai:
|
||||
image: unclecode/crawl4ai:all
|
||||
environment:
|
||||
- CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional
|
||||
# ... other configuration
|
||||
```
|
||||
|
||||
Then either:
|
||||
1. Set in `.env` file:
|
||||
```env
|
||||
CRAWL4AI_API_TOKEN=your_secret_token
|
||||
```
|
||||
|
||||
2. Or set via command line:
|
||||
```bash
|
||||
CRAWL4AI_API_TOKEN=your_secret_token docker-compose up
|
||||
```
|
||||
|
||||
> **Security Note**: If you enable the API token, make sure to keep it secure and never commit it to version control. The token will be required for all API endpoints except the health check endpoint (`/health`).
|
||||
|
||||
## Configuration Options 🔧
|
||||
|
||||
### Environment Variables
|
||||
|
||||
You can configure the service using environment variables:
|
||||
|
||||
```bash
|
||||
# Basic configuration
|
||||
docker run -p 11235:11235 \
|
||||
-e MAX_CONCURRENT_TASKS=5 \
|
||||
unclecode/crawl4ai:all
|
||||
|
||||
# With security and LLM support
|
||||
docker run -p 11235:11235 \
|
||||
-e CRAWL4AI_API_TOKEN=your_secret_token \
|
||||
-e OPENAI_API_KEY=sk-... \
|
||||
-e ANTHROPIC_API_KEY=sk-ant-... \
|
||||
unclecode/crawl4ai:all
|
||||
```
|
||||
|
||||
### Using Docker Compose (Recommended) 🐳
|
||||
|
||||
Create a `docker-compose.yml`:
|
||||
|
||||
```yaml
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
crawl4ai:
|
||||
image: unclecode/crawl4ai:all
|
||||
ports:
|
||||
- "11235:11235"
|
||||
environment:
|
||||
- CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API security
|
||||
- MAX_CONCURRENT_TASKS=5
|
||||
# LLM Provider Keys
|
||||
- OPENAI_API_KEY=${OPENAI_API_KEY:-}
|
||||
- ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
|
||||
volumes:
|
||||
- /dev/shm:/dev/shm
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
memory: 4G
|
||||
reservations:
|
||||
memory: 1G
|
||||
```
|
||||
|
||||
You can run it in two ways:
|
||||
|
||||
1. Using environment variables directly:
|
||||
```bash
|
||||
CRAWL4AI_API_TOKEN=secret123 OPENAI_API_KEY=sk-... docker-compose up
|
||||
```
|
||||
|
||||
2. Using a `.env` file (recommended):
|
||||
Create a `.env` file in the same directory:
|
||||
```env
|
||||
# API Security (optional)
|
||||
CRAWL4AI_API_TOKEN=your_secret_token
|
||||
|
||||
# LLM Provider Keys
|
||||
OPENAI_API_KEY=sk-...
|
||||
ANTHROPIC_API_KEY=sk-ant-...
|
||||
|
||||
# Other Configuration
|
||||
MAX_CONCURRENT_TASKS=5
|
||||
```
|
||||
|
||||
Then simply run:
|
||||
```bash
|
||||
docker-compose up
|
||||
```
|
||||
|
||||
### Testing the Deployment 🧪
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
# For unsecured instances
|
||||
def test_unsecured():
|
||||
# Health check
|
||||
health = requests.get("http://localhost:11235/health")
|
||||
print("Health check:", health.json())
|
||||
|
||||
# Basic crawl
|
||||
response = requests.post(
|
||||
"http://localhost:11235/crawl",
|
||||
json={
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"priority": 10
|
||||
}
|
||||
)
|
||||
task_id = response.json()["task_id"]
|
||||
print("Task ID:", task_id)
|
||||
|
||||
# For secured instances
|
||||
def test_secured(api_token):
|
||||
headers = {"Authorization": f"Bearer {api_token}"}
|
||||
|
||||
# Basic crawl with authentication
|
||||
response = requests.post(
|
||||
"http://localhost:11235/crawl",
|
||||
headers=headers,
|
||||
json={
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"priority": 10
|
||||
}
|
||||
)
|
||||
task_id = response.json()["task_id"]
|
||||
print("Task ID:", task_id)
|
||||
```
|
||||
|
||||
### LLM Extraction Example 🤖
|
||||
|
||||
When you've configured your LLM provider keys (via environment variables or `.env`), you can use LLM extraction:
|
||||
|
||||
```python
|
||||
request = {
|
||||
"urls": "https://example.com",
|
||||
"extraction_config": {
|
||||
"type": "llm",
|
||||
"params": {
|
||||
"provider": "openai/gpt-4",
|
||||
"instruction": "Extract main topics from the page"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Make the request (add headers if using API security)
|
||||
response = requests.post("http://localhost:11235/crawl", json=request)
|
||||
```
|
||||
|
||||
> **Note**: Remember to add `.env` to your `.gitignore` to keep your API keys secure!
|
||||
|
||||
|
||||
## Usage Examples 📝
|
||||
|
||||
### Basic Crawling
|
||||
|
||||
```python
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"priority": 10
|
||||
}
|
||||
|
||||
response = requests.post("http://localhost:11235/crawl", json=request)
|
||||
task_id = response.json()["task_id"]
|
||||
|
||||
# Get results
|
||||
result = requests.get(f"http://localhost:11235/task/{task_id}")
|
||||
```
|
||||
|
||||
### Structured Data Extraction
|
||||
|
||||
```python
|
||||
schema = {
|
||||
"name": "Crypto Prices",
|
||||
"baseSelector": ".cds-tableRow-t45thuk",
|
||||
"fields": [
|
||||
{
|
||||
"name": "crypto",
|
||||
"selector": "td:nth-child(1) h2",
|
||||
"type": "text",
|
||||
},
|
||||
{
|
||||
"name": "price",
|
||||
"selector": "td:nth-child(2)",
|
||||
"type": "text",
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
request = {
|
||||
"urls": "https://www.coinbase.com/explore",
|
||||
"extraction_config": {
|
||||
"type": "json_css",
|
||||
"params": {"schema": schema}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Dynamic Content Handling
|
||||
|
||||
```python
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"js_code": [
|
||||
"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
|
||||
],
|
||||
"wait_for": "article.tease-card:nth-child(10)"
|
||||
}
|
||||
```
|
||||
|
||||
### AI-Powered Extraction (Full Version)
|
||||
|
||||
```python
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"extraction_config": {
|
||||
"type": "cosine",
|
||||
"params": {
|
||||
"semantic_filter": "business finance economy",
|
||||
"word_count_threshold": 10,
|
||||
"max_dist": 0.2,
|
||||
"top_k": 3
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Platform-Specific Instructions 💻
|
||||
|
||||
### macOS
|
||||
```bash
|
||||
docker pull unclecode/crawl4ai:basic
|
||||
docker run -p 11235:11235 unclecode/crawl4ai:basic
|
||||
```
|
||||
|
||||
### Ubuntu
|
||||
```bash
|
||||
# Basic version
|
||||
docker pull unclecode/crawl4ai:basic
|
||||
docker run -p 11235:11235 unclecode/crawl4ai:basic
|
||||
|
||||
# With GPU support
|
||||
docker pull unclecode/crawl4ai:gpu
|
||||
docker run --gpus all -p 11235:11235 unclecode/crawl4ai:gpu
|
||||
```
|
||||
|
||||
### Windows (PowerShell)
|
||||
```powershell
|
||||
docker pull unclecode/crawl4ai:basic
|
||||
docker run -p 11235:11235 unclecode/crawl4ai:basic
|
||||
```
|
||||
|
||||
## Testing 🧪
|
||||
|
||||
Save this as `test_docker.py`:
|
||||
|
||||
```python
|
||||
import requests
|
||||
import json
|
||||
import time
|
||||
import sys
|
||||
|
||||
class Crawl4AiTester:
|
||||
def __init__(self, base_url: str = "http://localhost:11235"):
|
||||
self.base_url = base_url
|
||||
|
||||
def submit_and_wait(self, request_data: dict, timeout: int = 300) -> dict:
|
||||
# Submit crawl job
|
||||
response = requests.post(f"{self.base_url}/crawl", json=request_data)
|
||||
task_id = response.json()["task_id"]
|
||||
print(f"Task ID: {task_id}")
|
||||
|
||||
# Poll for result
|
||||
start_time = time.time()
|
||||
while True:
|
||||
if time.time() - start_time > timeout:
|
||||
raise TimeoutError(f"Task {task_id} timeout")
|
||||
|
||||
result = requests.get(f"{self.base_url}/task/{task_id}")
|
||||
status = result.json()
|
||||
|
||||
if status["status"] == "completed":
|
||||
return status
|
||||
|
||||
time.sleep(2)
|
||||
|
||||
def test_deployment():
|
||||
tester = Crawl4AiTester()
|
||||
|
||||
# Test basic crawl
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"priority": 10
|
||||
}
|
||||
|
||||
result = tester.submit_and_wait(request)
|
||||
print("Basic crawl successful!")
|
||||
print(f"Content length: {len(result['result']['markdown'])}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_deployment()
|
||||
```
|
||||
|
||||
## Advanced Configuration ⚙️
|
||||
|
||||
### Crawler Parameters
|
||||
|
||||
The `crawler_params` field allows you to configure the browser instance and crawling behavior. Here are key parameters you can use:
|
||||
|
||||
```python
|
||||
request = {
|
||||
"urls": "https://example.com",
|
||||
"crawler_params": {
|
||||
# Browser Configuration
|
||||
"headless": True, # Run in headless mode
|
||||
"browser_type": "chromium", # chromium/firefox/webkit
|
||||
"user_agent": "custom-agent", # Custom user agent
|
||||
"proxy": "http://proxy:8080", # Proxy configuration
|
||||
|
||||
# Performance & Behavior
|
||||
"page_timeout": 30000, # Page load timeout (ms)
|
||||
"verbose": True, # Enable detailed logging
|
||||
"semaphore_count": 5, # Concurrent request limit
|
||||
|
||||
# Anti-Detection Features
|
||||
"simulate_user": True, # Simulate human behavior
|
||||
"magic": True, # Advanced anti-detection
|
||||
"override_navigator": True, # Override navigator properties
|
||||
|
||||
# Session Management
|
||||
"user_data_dir": "./browser-data", # Browser profile location
|
||||
"use_managed_browser": True, # Use persistent browser
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Extra Parameters
|
||||
|
||||
The `extra` field allows passing additional parameters directly to the crawler's `arun` function:
|
||||
|
||||
```python
|
||||
request = {
|
||||
"urls": "https://example.com",
|
||||
"extra": {
|
||||
"word_count_threshold": 10, # Min words per block
|
||||
"only_text": True, # Extract only text
|
||||
"bypass_cache": True, # Force fresh crawl
|
||||
"process_iframes": True, # Include iframe content
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Complete Examples
|
||||
|
||||
1. **Advanced News Crawling**
|
||||
```python
|
||||
request = {
|
||||
"urls": "https://www.nbcnews.com/business",
|
||||
"crawler_params": {
|
||||
"headless": True,
|
||||
"page_timeout": 30000,
|
||||
"remove_overlay_elements": True # Remove popups
|
||||
},
|
||||
"extra": {
|
||||
"word_count_threshold": 50, # Longer content blocks
|
||||
"bypass_cache": True # Fresh content
|
||||
},
|
||||
"css_selector": ".article-body"
|
||||
}
|
||||
```
|
||||
|
||||
2. **Anti-Detection Configuration**
|
||||
```python
|
||||
request = {
|
||||
"urls": "https://example.com",
|
||||
"crawler_params": {
|
||||
"simulate_user": True,
|
||||
"magic": True,
|
||||
"override_navigator": True,
|
||||
"user_agent": "Mozilla/5.0 ...",
|
||||
"headers": {
|
||||
"Accept-Language": "en-US,en;q=0.9"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
3. **LLM Extraction with Custom Parameters**
|
||||
```python
|
||||
request = {
|
||||
"urls": "https://openai.com/pricing",
|
||||
"extraction_config": {
|
||||
"type": "llm",
|
||||
"params": {
|
||||
"provider": "openai/gpt-4",
|
||||
"schema": pricing_schema
|
||||
}
|
||||
},
|
||||
"crawler_params": {
|
||||
"verbose": True,
|
||||
"page_timeout": 60000
|
||||
},
|
||||
"extra": {
|
||||
"word_count_threshold": 1,
|
||||
"only_text": True
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
4. **Session-Based Dynamic Content**
|
||||
```python
|
||||
request = {
|
||||
"urls": "https://example.com",
|
||||
"crawler_params": {
|
||||
"session_id": "dynamic_session",
|
||||
"headless": False,
|
||||
"page_timeout": 60000
|
||||
},
|
||||
"js_code": ["window.scrollTo(0, document.body.scrollHeight);"],
|
||||
"wait_for": "js:() => document.querySelectorAll('.item').length > 10",
|
||||
"extra": {
|
||||
"delay_before_return_html": 2.0
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
5. **Screenshot with Custom Timing**
|
||||
```python
|
||||
request = {
|
||||
"urls": "https://example.com",
|
||||
"screenshot": True,
|
||||
"crawler_params": {
|
||||
"headless": True,
|
||||
"screenshot_wait_for": ".main-content"
|
||||
},
|
||||
"extra": {
|
||||
"delay_before_return_html": 3.0
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Parameter Reference Table
|
||||
|
||||
| Category | Parameter | Type | Description |
|
||||
|----------|-----------|------|-------------|
|
||||
| Browser | headless | bool | Run browser in headless mode |
|
||||
| Browser | browser_type | str | Browser engine selection |
|
||||
| Browser | user_agent | str | Custom user agent string |
|
||||
| Network | proxy | str | Proxy server URL |
|
||||
| Network | headers | dict | Custom HTTP headers |
|
||||
| Timing | page_timeout | int | Page load timeout (ms) |
|
||||
| Timing | delay_before_return_html | float | Wait before capture |
|
||||
| Anti-Detection | simulate_user | bool | Human behavior simulation |
|
||||
| Anti-Detection | magic | bool | Advanced protection |
|
||||
| Session | session_id | str | Browser session ID |
|
||||
| Session | user_data_dir | str | Profile directory |
|
||||
| Content | word_count_threshold | int | Minimum words per block |
|
||||
| Content | only_text | bool | Text-only extraction |
|
||||
| Content | process_iframes | bool | Include iframe content |
|
||||
| Debug | verbose | bool | Detailed logging |
|
||||
| Debug | log_console | bool | Browser console logs |
|
||||
|
||||
## Troubleshooting 🔍
|
||||
|
||||
### Common Issues
|
||||
|
||||
1. **Connection Refused**
|
||||
```
|
||||
Error: Connection refused at localhost:11235
|
||||
```
|
||||
Solution: Ensure the container is running and ports are properly mapped.
|
||||
|
||||
2. **Resource Limits**
|
||||
```
|
||||
Error: No available slots
|
||||
```
|
||||
Solution: Increase MAX_CONCURRENT_TASKS or container resources.
|
||||
|
||||
3. **GPU Access**
|
||||
```
|
||||
Error: GPU not found
|
||||
```
|
||||
Solution: Ensure proper NVIDIA drivers and use `--gpus all` flag.
|
||||
|
||||
### Debug Mode
|
||||
|
||||
Access container for debugging:
|
||||
```bash
|
||||
docker run -it --entrypoint /bin/bash unclecode/crawl4ai:all
|
||||
```
|
||||
|
||||
View container logs:
|
||||
```bash
|
||||
docker logs [container_id]
|
||||
```
|
||||
|
||||
## Best Practices 🌟
|
||||
|
||||
1. **Resource Management**
|
||||
- Set appropriate memory and CPU limits
|
||||
- Monitor resource usage via health endpoint
|
||||
- Use basic version for simple crawling tasks
|
||||
|
||||
2. **Scaling**
|
||||
- Use multiple containers for high load
|
||||
- Implement proper load balancing
|
||||
- Monitor performance metrics
|
||||
|
||||
3. **Security**
|
||||
- Use environment variables for sensitive data
|
||||
- Implement proper network isolation
|
||||
- Regular security updates
|
||||
|
||||
## API Reference 📚
|
||||
|
||||
### Health Check
|
||||
```http
|
||||
GET /health
|
||||
```
|
||||
|
||||
### Submit Crawl Task
|
||||
```http
|
||||
POST /crawl
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"urls": "string or array",
|
||||
"extraction_config": {
|
||||
"type": "basic|llm|cosine|json_css",
|
||||
"params": {}
|
||||
},
|
||||
"priority": 1-10,
|
||||
"ttl": 3600
|
||||
}
|
||||
```
|
||||
|
||||
### Get Task Status
|
||||
```http
|
||||
GET /task/{task_id}
|
||||
```
|
||||
|
||||
For more details, visit the [official documentation](https://crawl4ai.com/mkdocs/).
|
||||
@@ -1,129 +0,0 @@
|
||||
# Download Handling in Crawl4AI
|
||||
|
||||
This guide explains how to use Crawl4AI to handle file downloads during crawling. You'll learn how to trigger downloads, specify download locations, and access downloaded files.
|
||||
|
||||
## Enabling Downloads
|
||||
|
||||
To enable downloads, set the `accept_downloads` parameter in the `BrowserConfig` object and pass it to the crawler.
|
||||
|
||||
```python
|
||||
from crawl4ai.async_configs import BrowserConfig, AsyncWebCrawler
|
||||
|
||||
async def main():
|
||||
config = BrowserConfig(accept_downloads=True) # Enable downloads globally
|
||||
async with AsyncWebCrawler(config=config) as crawler:
|
||||
# ... your crawling logic ...
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
Or, enable it for a specific crawl by using `CrawlerRunConfig`:
|
||||
|
||||
```python
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
|
||||
async def main():
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
config = CrawlerRunConfig(accept_downloads=True)
|
||||
result = await crawler.arun(url="https://example.com", config=config)
|
||||
# ...
|
||||
```
|
||||
|
||||
## Specifying Download Location
|
||||
|
||||
Specify the download directory using the `downloads_path` attribute in the `BrowserConfig` object. If not provided, Crawl4AI defaults to creating a "downloads" directory inside the `.crawl4ai` folder in your home directory.
|
||||
|
||||
```python
|
||||
from crawl4ai.async_configs import BrowserConfig
|
||||
import os
|
||||
|
||||
downloads_path = os.path.join(os.getcwd(), "my_downloads") # Custom download path
|
||||
os.makedirs(downloads_path, exist_ok=True)
|
||||
|
||||
config = BrowserConfig(accept_downloads=True, downloads_path=downloads_path)
|
||||
|
||||
async def main():
|
||||
async with AsyncWebCrawler(config=config) as crawler:
|
||||
result = await crawler.arun(url="https://example.com")
|
||||
# ...
|
||||
```
|
||||
|
||||
## Triggering Downloads
|
||||
|
||||
Downloads are typically triggered by user interactions on a web page, such as clicking a download button. Use `js_code` in `CrawlerRunConfig` to simulate these actions and `wait_for` to allow sufficient time for downloads to start.
|
||||
|
||||
```python
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
js_code="""
|
||||
const downloadLink = document.querySelector('a[href$=".exe"]');
|
||||
if (downloadLink) {
|
||||
downloadLink.click();
|
||||
}
|
||||
""",
|
||||
wait_for=5 # Wait 5 seconds for the download to start
|
||||
)
|
||||
|
||||
result = await crawler.arun(url="https://www.python.org/downloads/", config=config)
|
||||
```
|
||||
|
||||
## Accessing Downloaded Files
|
||||
|
||||
The `downloaded_files` attribute of the `CrawlResult` object contains paths to downloaded files.
|
||||
|
||||
```python
|
||||
if result.downloaded_files:
|
||||
print("Downloaded files:")
|
||||
for file_path in result.downloaded_files:
|
||||
print(f"- {file_path}")
|
||||
file_size = os.path.getsize(file_path)
|
||||
print(f"- File size: {file_size} bytes")
|
||||
else:
|
||||
print("No files downloaded.")
|
||||
```
|
||||
|
||||
## Example: Downloading Multiple Files
|
||||
|
||||
```python
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
async def download_multiple_files(url: str, download_path: str):
|
||||
config = BrowserConfig(accept_downloads=True, downloads_path=download_path)
|
||||
async with AsyncWebCrawler(config=config) as crawler:
|
||||
run_config = CrawlerRunConfig(
|
||||
js_code="""
|
||||
const downloadLinks = document.querySelectorAll('a[download]');
|
||||
for (const link of downloadLinks) {
|
||||
link.click();
|
||||
await new Promise(r => setTimeout(r, 2000)); // Delay between clicks
|
||||
}
|
||||
""",
|
||||
wait_for=10 # Wait for all downloads to start
|
||||
)
|
||||
result = await crawler.arun(url=url, config=run_config)
|
||||
|
||||
if result.downloaded_files:
|
||||
print("Downloaded files:")
|
||||
for file in result.downloaded_files:
|
||||
print(f"- {file}")
|
||||
else:
|
||||
print("No files downloaded.")
|
||||
|
||||
# Usage
|
||||
download_path = os.path.join(Path.home(), ".crawl4ai", "downloads")
|
||||
os.makedirs(download_path, exist_ok=True)
|
||||
|
||||
asyncio.run(download_multiple_files("https://www.python.org/downloads/windows/", download_path))
|
||||
```
|
||||
|
||||
## Important Considerations
|
||||
|
||||
- **Browser Context:** Downloads are managed within the browser context. Ensure `js_code` correctly targets the download triggers on the webpage.
|
||||
- **Timing:** Use `wait_for` in `CrawlerRunConfig` to manage download timing.
|
||||
- **Error Handling:** Handle errors to manage failed downloads or incorrect paths gracefully.
|
||||
- **Security:** Scan downloaded files for potential security threats before use.
|
||||
|
||||
This revised guide ensures consistency with the `Crawl4AI` codebase by using `BrowserConfig` and `CrawlerRunConfig` for all download-related configurations. Let me know if further adjustments are needed!
|
||||
@@ -1,137 +0,0 @@
|
||||
# Installation 💻
|
||||
|
||||
Crawl4AI offers flexible installation options to suit various use cases. You can install it as a Python package, use it with Docker, or run it as a local server.
|
||||
|
||||
## Option 1: Python Package Installation (Recommended)
|
||||
|
||||
Crawl4AI is now available on PyPI, making installation easier than ever. Choose the option that best fits your needs:
|
||||
|
||||
### Basic Installation
|
||||
|
||||
For basic web crawling and scraping tasks:
|
||||
|
||||
```bash
|
||||
pip install crawl4ai
|
||||
playwright install # Install Playwright dependencies
|
||||
```
|
||||
|
||||
### Installation with PyTorch
|
||||
|
||||
For advanced text clustering (includes CosineSimilarity cluster strategy):
|
||||
|
||||
```bash
|
||||
pip install crawl4ai[torch]
|
||||
```
|
||||
|
||||
### Installation with Transformers
|
||||
|
||||
For text summarization and Hugging Face models:
|
||||
|
||||
```bash
|
||||
pip install crawl4ai[transformer]
|
||||
```
|
||||
|
||||
### Full Installation
|
||||
|
||||
For all features:
|
||||
|
||||
```bash
|
||||
pip install crawl4ai[all]
|
||||
```
|
||||
|
||||
### Development Installation
|
||||
|
||||
For contributors who plan to modify the source code:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/unclecode/crawl4ai.git
|
||||
cd crawl4ai
|
||||
pip install -e ".[all]"
|
||||
playwright install # Install Playwright dependencies
|
||||
```
|
||||
|
||||
💡 After installation with "torch", "transformer", or "all" options, it's recommended to run the following CLI command to load the required models:
|
||||
|
||||
```bash
|
||||
crawl4ai-download-models
|
||||
```
|
||||
|
||||
This is optional but will boost the performance and speed of the crawler. You only need to do this once after installation.
|
||||
|
||||
## Playwright Installation Note for Ubuntu
|
||||
|
||||
If you encounter issues with Playwright installation on Ubuntu, you may need to install additional dependencies:
|
||||
|
||||
```bash
|
||||
sudo apt-get install -y \
|
||||
libwoff1 \
|
||||
libopus0 \
|
||||
libwebp7 \
|
||||
libwebpdemux2 \
|
||||
libenchant-2-2 \
|
||||
libgudev-1.0-0 \
|
||||
libsecret-1-0 \
|
||||
libhyphen0 \
|
||||
libgdk-pixbuf2.0-0 \
|
||||
libegl1 \
|
||||
libnotify4 \
|
||||
libxslt1.1 \
|
||||
libevent-2.1-7 \
|
||||
libgles2 \
|
||||
libxcomposite1 \
|
||||
libatk1.0-0 \
|
||||
libatk-bridge2.0-0 \
|
||||
libepoxy0 \
|
||||
libgtk-3-0 \
|
||||
libharfbuzz-icu0 \
|
||||
libgstreamer-gl1.0-0 \
|
||||
libgstreamer-plugins-bad1.0-0 \
|
||||
gstreamer1.0-plugins-good \
|
||||
gstreamer1.0-plugins-bad \
|
||||
libxt6 \
|
||||
libxaw7 \
|
||||
xvfb \
|
||||
fonts-noto-color-emoji \
|
||||
libfontconfig \
|
||||
libfreetype6 \
|
||||
xfonts-cyrillic \
|
||||
xfonts-scalable \
|
||||
fonts-liberation \
|
||||
fonts-ipafont-gothic \
|
||||
fonts-wqy-zenhei \
|
||||
fonts-tlwg-loma-otf \
|
||||
fonts-freefont-ttf
|
||||
```
|
||||
|
||||
## Option 2: Using Docker (Coming Soon)
|
||||
|
||||
Docker support for Crawl4AI is currently in progress and will be available soon. This will allow you to run Crawl4AI in a containerized environment, ensuring consistency across different systems.
|
||||
|
||||
## Option 3: Local Server Installation
|
||||
|
||||
For those who prefer to run Crawl4AI as a local server, instructions will be provided once the Docker implementation is complete.
|
||||
|
||||
## Verifying Your Installation
|
||||
|
||||
After installation, you can verify that Crawl4AI is working correctly by running a simple Python script:
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
|
||||
async def main():
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
result = await crawler.arun(url="https://www.example.com")
|
||||
print(result.markdown[:500]) # Print first 500 characters
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
This script should successfully crawl the example website and print the first 500 characters of the extracted content.
|
||||
|
||||
## Getting Help
|
||||
|
||||
If you encounter any issues during installation or usage, please check the [documentation](https://crawl4ai.com/mkdocs/) or raise an issue on the [GitHub repository](https://github.com/unclecode/crawl4ai/issues).
|
||||
|
||||
Happy crawling! 🕷️🤖
|
||||
@@ -1,102 +0,0 @@
|
||||
# Output Formats
|
||||
|
||||
Crawl4AI provides multiple output formats to suit different needs, ranging from raw HTML to structured data using LLM or pattern-based extraction, and versatile markdown outputs.
|
||||
|
||||
## Basic Formats
|
||||
|
||||
```python
|
||||
result = await crawler.arun(url="https://example.com")
|
||||
|
||||
# Access different formats
|
||||
raw_html = result.html # Original HTML
|
||||
clean_html = result.cleaned_html # Sanitized HTML
|
||||
markdown_v2 = result.markdown_v2 # Detailed markdown generation results
|
||||
fit_md = result.markdown_v2.fit_markdown # Most relevant content in markdown
|
||||
```
|
||||
|
||||
> **Note**: The `markdown_v2` property will soon be replaced by `markdown`. It is recommended to start transitioning to using `markdown` for new implementations.
|
||||
|
||||
## Raw HTML
|
||||
|
||||
Original, unmodified HTML from the webpage. Useful when you need to:
|
||||
- Preserve the exact page structure.
|
||||
- Process HTML with your own tools.
|
||||
- Debug page issues.
|
||||
|
||||
```python
|
||||
result = await crawler.arun(url="https://example.com")
|
||||
print(result.html) # Complete HTML including headers, scripts, etc.
|
||||
```
|
||||
|
||||
## Cleaned HTML
|
||||
|
||||
Sanitized HTML with unnecessary elements removed. Automatically:
|
||||
- Removes scripts and styles.
|
||||
- Cleans up formatting.
|
||||
- Preserves semantic structure.
|
||||
|
||||
```python
|
||||
config = CrawlerRunConfig(
|
||||
excluded_tags=['form', 'header', 'footer'], # Additional tags to remove
|
||||
keep_data_attributes=False # Remove data-* attributes
|
||||
)
|
||||
result = await crawler.arun(url="https://example.com", config=config)
|
||||
print(result.cleaned_html)
|
||||
```
|
||||
|
||||
## Standard Markdown
|
||||
|
||||
HTML converted to clean markdown format. This output is useful for:
|
||||
- Content analysis.
|
||||
- Documentation.
|
||||
- Readability.
|
||||
|
||||
```python
|
||||
config = CrawlerRunConfig(
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
options={"include_links": True} # Include links in markdown
|
||||
)
|
||||
)
|
||||
result = await crawler.arun(url="https://example.com", config=config)
|
||||
print(result.markdown_v2.raw_markdown) # Standard markdown with links
|
||||
```
|
||||
|
||||
## Fit Markdown
|
||||
|
||||
Extract and convert only the most relevant content into markdown format. Best suited for:
|
||||
- Article extraction.
|
||||
- Focusing on the main content.
|
||||
- Removing boilerplate.
|
||||
|
||||
To generate `fit_markdown`, use a content filter like `PruningContentFilter`:
|
||||
|
||||
```python
|
||||
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
content_filter=PruningContentFilter(
|
||||
threshold=0.7,
|
||||
threshold_type="dynamic",
|
||||
min_word_threshold=100
|
||||
)
|
||||
)
|
||||
result = await crawler.arun(url="https://example.com", config=config)
|
||||
print(result.markdown_v2.fit_markdown) # Extracted main content in markdown
|
||||
```
|
||||
|
||||
## Markdown with Citations
|
||||
|
||||
Generate markdown that includes citations for links. This format is ideal for:
|
||||
- Creating structured documentation.
|
||||
- Including references for extracted content.
|
||||
|
||||
```python
|
||||
config = CrawlerRunConfig(
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
options={"citations": True} # Enable citations
|
||||
)
|
||||
)
|
||||
result = await crawler.arun(url="https://example.com", config=config)
|
||||
print(result.markdown_v2.markdown_with_citations)
|
||||
print(result.markdown_v2.references_markdown) # Citations section
|
||||
```
|
||||
@@ -1,190 +0,0 @@
|
||||
# Page Interaction
|
||||
|
||||
Crawl4AI provides powerful features for interacting with dynamic webpages, handling JavaScript execution, and managing page events.
|
||||
|
||||
## JavaScript Execution
|
||||
|
||||
### Basic Execution
|
||||
|
||||
```python
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
|
||||
# Single JavaScript command
|
||||
config = CrawlerRunConfig(
|
||||
js_code="window.scrollTo(0, document.body.scrollHeight);"
|
||||
)
|
||||
result = await crawler.arun(url="https://example.com", config=config)
|
||||
|
||||
# Multiple commands
|
||||
js_commands = [
|
||||
"window.scrollTo(0, document.body.scrollHeight);",
|
||||
"document.querySelector('.load-more').click();",
|
||||
"document.querySelector('#consent-button').click();"
|
||||
]
|
||||
config = CrawlerRunConfig(js_code=js_commands)
|
||||
result = await crawler.arun(url="https://example.com", config=config)
|
||||
```
|
||||
|
||||
## Wait Conditions
|
||||
|
||||
### CSS-Based Waiting
|
||||
|
||||
Wait for elements to appear:
|
||||
|
||||
```python
|
||||
config = CrawlerRunConfig(wait_for="css:.dynamic-content") # Wait for element with class 'dynamic-content'
|
||||
result = await crawler.arun(url="https://example.com", config=config)
|
||||
```
|
||||
|
||||
### JavaScript-Based Waiting
|
||||
|
||||
Wait for custom conditions:
|
||||
|
||||
```python
|
||||
# Wait for number of elements
|
||||
wait_condition = """() => {
|
||||
return document.querySelectorAll('.item').length > 10;
|
||||
}"""
|
||||
|
||||
config = CrawlerRunConfig(wait_for=f"js:{wait_condition}")
|
||||
result = await crawler.arun(url="https://example.com", config=config)
|
||||
|
||||
# Wait for dynamic content to load
|
||||
wait_for_content = """() => {
|
||||
const content = document.querySelector('.content');
|
||||
return content && content.innerText.length > 100;
|
||||
}"""
|
||||
|
||||
config = CrawlerRunConfig(wait_for=f"js:{wait_for_content}")
|
||||
result = await crawler.arun(url="https://example.com", config=config)
|
||||
```
|
||||
|
||||
## Handling Dynamic Content
|
||||
|
||||
### Load More Content
|
||||
|
||||
Handle infinite scroll or load more buttons:
|
||||
|
||||
```python
|
||||
config = CrawlerRunConfig(
|
||||
js_code=[
|
||||
"window.scrollTo(0, document.body.scrollHeight);", # Scroll to bottom
|
||||
"const loadMore = document.querySelector('.load-more'); if(loadMore) loadMore.click();" # Click load more
|
||||
],
|
||||
wait_for="js:() => document.querySelectorAll('.item').length > previousCount" # Wait for new content
|
||||
)
|
||||
result = await crawler.arun(url="https://example.com", config=config)
|
||||
```
|
||||
|
||||
### Form Interaction
|
||||
|
||||
Handle forms and inputs:
|
||||
|
||||
```python
|
||||
js_form_interaction = """
|
||||
document.querySelector('#search').value = 'search term'; // Fill form fields
|
||||
document.querySelector('form').submit(); // Submit form
|
||||
"""
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
js_code=js_form_interaction,
|
||||
wait_for="css:.results" # Wait for results to load
|
||||
)
|
||||
result = await crawler.arun(url="https://example.com", config=config)
|
||||
```
|
||||
|
||||
## Timing Control
|
||||
|
||||
### Delays and Timeouts
|
||||
|
||||
Control timing of interactions:
|
||||
|
||||
```python
|
||||
config = CrawlerRunConfig(
|
||||
page_timeout=60000, # Page load timeout (ms)
|
||||
delay_before_return_html=2.0 # Wait before capturing content
|
||||
)
|
||||
result = await crawler.arun(url="https://example.com", config=config)
|
||||
```
|
||||
|
||||
## Complex Interactions Example
|
||||
|
||||
Here's an example of handling a dynamic page with multiple interactions:
|
||||
|
||||
```python
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
|
||||
async def crawl_dynamic_content():
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Initial page load
|
||||
config = CrawlerRunConfig(
|
||||
js_code="document.querySelector('.cookie-accept')?.click();", # Handle cookie consent
|
||||
wait_for="css:.main-content"
|
||||
)
|
||||
result = await crawler.arun(url="https://example.com", config=config)
|
||||
|
||||
# Load more content
|
||||
session_id = "dynamic_session" # Keep session for multiple interactions
|
||||
|
||||
for page in range(3): # Load 3 pages of content
|
||||
config = CrawlerRunConfig(
|
||||
session_id=session_id,
|
||||
js_code=[
|
||||
"window.scrollTo(0, document.body.scrollHeight);", # Scroll to bottom
|
||||
"window.previousCount = document.querySelectorAll('.item').length;", # Store item count
|
||||
"document.querySelector('.load-more')?.click();" # Click load more
|
||||
],
|
||||
wait_for="""() => {
|
||||
const currentCount = document.querySelectorAll('.item').length;
|
||||
return currentCount > window.previousCount;
|
||||
}""",
|
||||
js_only=(page > 0) # Execute JS without reloading page for subsequent interactions
|
||||
)
|
||||
result = await crawler.arun(url="https://example.com", config=config)
|
||||
print(f"Page {page + 1} items:", len(result.cleaned_html))
|
||||
|
||||
# Clean up session
|
||||
await crawler.crawler_strategy.kill_session(session_id)
|
||||
```
|
||||
|
||||
## Using with Extraction Strategies
|
||||
|
||||
Combine page interaction with structured extraction:
|
||||
|
||||
```python
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
|
||||
# Pattern-based extraction after interaction
|
||||
schema = {
|
||||
"name": "Dynamic Items",
|
||||
"baseSelector": ".item",
|
||||
"fields": [
|
||||
{"name": "title", "selector": "h2", "type": "text"},
|
||||
{"name": "description", "selector": ".desc", "type": "text"}
|
||||
]
|
||||
}
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
js_code="window.scrollTo(0, document.body.scrollHeight);",
|
||||
wait_for="css:.item:nth-child(10)", # Wait for 10 items
|
||||
extraction_strategy=JsonCssExtractionStrategy(schema)
|
||||
)
|
||||
result = await crawler.arun(url="https://example.com", config=config)
|
||||
|
||||
# Or use LLM to analyze dynamic content
|
||||
class ContentAnalysis(BaseModel):
|
||||
topics: List[str]
|
||||
summary: str
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
js_code="document.querySelector('.show-more').click();",
|
||||
wait_for="css:.full-content",
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
provider="ollama/nemotron",
|
||||
schema=ContentAnalysis.schema(),
|
||||
instruction="Analyze the full content"
|
||||
)
|
||||
)
|
||||
result = await crawler.arun(url="https://example.com", config=config)
|
||||
```
|
||||
@@ -1,158 +0,0 @@
|
||||
# Prefix-Based Input Handling in Crawl4AI
|
||||
|
||||
This guide will walk you through using the Crawl4AI library to crawl web pages, local HTML files, and raw HTML strings. We'll demonstrate these capabilities using a Wikipedia page as an example.
|
||||
|
||||
## Crawling a Web URL
|
||||
|
||||
To crawl a live web page, provide the URL starting with `http://` or `https://`, using a `CrawlerRunConfig` object:
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
|
||||
async def crawl_web():
|
||||
config = CrawlerRunConfig(bypass_cache=True)
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(url="https://en.wikipedia.org/wiki/apple", config=config)
|
||||
if result.success:
|
||||
print("Markdown Content:")
|
||||
print(result.markdown)
|
||||
else:
|
||||
print(f"Failed to crawl: {result.error_message}")
|
||||
|
||||
asyncio.run(crawl_web())
|
||||
```
|
||||
|
||||
## Crawling a Local HTML File
|
||||
|
||||
To crawl a local HTML file, prefix the file path with `file://`.
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
|
||||
async def crawl_local_file():
|
||||
local_file_path = "/path/to/apple.html" # Replace with your file path
|
||||
file_url = f"file://{local_file_path}"
|
||||
config = CrawlerRunConfig(bypass_cache=True)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(url=file_url, config=config)
|
||||
if result.success:
|
||||
print("Markdown Content from Local File:")
|
||||
print(result.markdown)
|
||||
else:
|
||||
print(f"Failed to crawl local file: {result.error_message}")
|
||||
|
||||
asyncio.run(crawl_local_file())
|
||||
```
|
||||
|
||||
## Crawling Raw HTML Content
|
||||
|
||||
To crawl raw HTML content, prefix the HTML string with `raw:`.
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
|
||||
async def crawl_raw_html():
|
||||
raw_html = "<html><body><h1>Hello, World!</h1></body></html>"
|
||||
raw_html_url = f"raw:{raw_html}"
|
||||
config = CrawlerRunConfig(bypass_cache=True)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(url=raw_html_url, config=config)
|
||||
if result.success:
|
||||
print("Markdown Content from Raw HTML:")
|
||||
print(result.markdown)
|
||||
else:
|
||||
print(f"Failed to crawl raw HTML: {result.error_message}")
|
||||
|
||||
asyncio.run(crawl_raw_html())
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
# Complete Example
|
||||
|
||||
Below is a comprehensive script that:
|
||||
|
||||
1. Crawls the Wikipedia page for "Apple."
|
||||
2. Saves the HTML content to a local file (`apple.html`).
|
||||
3. Crawls the local HTML file and verifies the markdown length matches the original crawl.
|
||||
4. Crawls the raw HTML content from the saved file and verifies consistency.
|
||||
|
||||
```python
|
||||
import os
|
||||
import sys
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
|
||||
async def main():
|
||||
wikipedia_url = "https://en.wikipedia.org/wiki/apple"
|
||||
script_dir = Path(__file__).parent
|
||||
html_file_path = script_dir / "apple.html"
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Step 1: Crawl the Web URL
|
||||
print("\n=== Step 1: Crawling the Wikipedia URL ===")
|
||||
web_config = CrawlerRunConfig(bypass_cache=True)
|
||||
result = await crawler.arun(url=wikipedia_url, config=web_config)
|
||||
|
||||
if not result.success:
|
||||
print(f"Failed to crawl {wikipedia_url}: {result.error_message}")
|
||||
return
|
||||
|
||||
with open(html_file_path, 'w', encoding='utf-8') as f:
|
||||
f.write(result.html)
|
||||
web_crawl_length = len(result.markdown)
|
||||
print(f"Length of markdown from web crawl: {web_crawl_length}\n")
|
||||
|
||||
# Step 2: Crawl from the Local HTML File
|
||||
print("=== Step 2: Crawling from the Local HTML File ===")
|
||||
file_url = f"file://{html_file_path.resolve()}"
|
||||
file_config = CrawlerRunConfig(bypass_cache=True)
|
||||
local_result = await crawler.arun(url=file_url, config=file_config)
|
||||
|
||||
if not local_result.success:
|
||||
print(f"Failed to crawl local file {file_url}: {local_result.error_message}")
|
||||
return
|
||||
|
||||
local_crawl_length = len(local_result.markdown)
|
||||
assert web_crawl_length == local_crawl_length, "Markdown length mismatch"
|
||||
print("✅ Markdown length matches between web and local file crawl.\n")
|
||||
|
||||
# Step 3: Crawl Using Raw HTML Content
|
||||
print("=== Step 3: Crawling Using Raw HTML Content ===")
|
||||
with open(html_file_path, 'r', encoding='utf-8') as f:
|
||||
raw_html_content = f.read()
|
||||
raw_html_url = f"raw:{raw_html_content}"
|
||||
raw_config = CrawlerRunConfig(bypass_cache=True)
|
||||
raw_result = await crawler.arun(url=raw_html_url, config=raw_config)
|
||||
|
||||
if not raw_result.success:
|
||||
print(f"Failed to crawl raw HTML content: {raw_result.error_message}")
|
||||
return
|
||||
|
||||
raw_crawl_length = len(raw_result.markdown)
|
||||
assert web_crawl_length == raw_crawl_length, "Markdown length mismatch"
|
||||
print("✅ Markdown length matches between web and raw HTML crawl.\n")
|
||||
|
||||
print("All tests passed successfully!")
|
||||
if html_file_path.exists():
|
||||
os.remove(html_file_path)
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
# Conclusion
|
||||
|
||||
With the unified `url` parameter and prefix-based handling in **Crawl4AI**, you can seamlessly handle web URLs, local HTML files, and raw HTML content. Use `CrawlerRunConfig` for flexible and consistent configuration in all scenarios.
|
||||
@@ -1,172 +0,0 @@
|
||||
# Quick Start Guide 🚀
|
||||
|
||||
Welcome to the Crawl4AI Quickstart Guide! In this tutorial, we'll walk you through the basic usage of Crawl4AI, covering everything from initial setup to advanced features like chunking and extraction strategies, using asynchronous programming. Let's dive in! 🌟
|
||||
|
||||
---
|
||||
|
||||
## Getting Started 🛠️
|
||||
|
||||
Set up your environment with `BrowserConfig` and create an `AsyncWebCrawler` instance.
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.async_configs import BrowserConfig
|
||||
|
||||
async def main():
|
||||
browser_config = BrowserConfig(verbose=True)
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
# Add your crawling logic here
|
||||
pass
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Basic Usage
|
||||
|
||||
Provide a URL and let Crawl4AI do the work!
|
||||
|
||||
```python
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
|
||||
async def main():
|
||||
browser_config = BrowserConfig(verbose=True)
|
||||
crawl_config = CrawlerRunConfig(url="https://www.nbcnews.com/business")
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(config=crawl_config)
|
||||
print(f"Basic crawl result: {result.markdown[:500]}") # Print first 500 characters
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Taking Screenshots 📸
|
||||
|
||||
Capture and save webpage screenshots with `CrawlerRunConfig`:
|
||||
|
||||
```python
|
||||
from crawl4ai.async_configs import CacheMode
|
||||
|
||||
async def capture_and_save_screenshot(url: str, output_path: str):
|
||||
browser_config = BrowserConfig(verbose=True)
|
||||
crawl_config = CrawlerRunConfig(
|
||||
url=url,
|
||||
screenshot=True,
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(config=crawl_config)
|
||||
|
||||
if result.success and result.screenshot:
|
||||
import base64
|
||||
screenshot_data = base64.b64decode(result.screenshot)
|
||||
with open(output_path, 'wb') as f:
|
||||
f.write(screenshot_data)
|
||||
print(f"Screenshot saved successfully to {output_path}")
|
||||
else:
|
||||
print("Failed to capture screenshot")
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Browser Selection 🌐
|
||||
|
||||
Choose from multiple browser engines using `BrowserConfig`:
|
||||
|
||||
```python
|
||||
from crawl4ai.async_configs import BrowserConfig
|
||||
|
||||
# Use Firefox
|
||||
firefox_config = BrowserConfig(browser_type="firefox", verbose=True, headless=True)
|
||||
async with AsyncWebCrawler(config=firefox_config) as crawler:
|
||||
result = await crawler.arun(config=CrawlerRunConfig(url="https://www.example.com"))
|
||||
|
||||
# Use WebKit
|
||||
webkit_config = BrowserConfig(browser_type="webkit", verbose=True, headless=True)
|
||||
async with AsyncWebCrawler(config=webkit_config) as crawler:
|
||||
result = await crawler.arun(config=CrawlerRunConfig(url="https://www.example.com"))
|
||||
|
||||
# Use Chromium (default)
|
||||
chromium_config = BrowserConfig(verbose=True, headless=True)
|
||||
async with AsyncWebCrawler(config=chromium_config) as crawler:
|
||||
result = await crawler.arun(config=CrawlerRunConfig(url="https://www.example.com"))
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### User Simulation 🎭
|
||||
|
||||
Simulate real user behavior to bypass detection:
|
||||
|
||||
```python
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
|
||||
browser_config = BrowserConfig(verbose=True, headless=True)
|
||||
crawl_config = CrawlerRunConfig(
|
||||
url="YOUR-URL-HERE",
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
simulate_user=True, # Random mouse movements and clicks
|
||||
override_navigator=True # Makes the browser appear like a real user
|
||||
)
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(config=crawl_config)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Understanding Parameters 🧠
|
||||
|
||||
Explore caching and forcing fresh crawls:
|
||||
|
||||
```python
|
||||
async def main():
|
||||
browser_config = BrowserConfig(verbose=True)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
# First crawl (uses cache)
|
||||
result1 = await crawler.arun(config=CrawlerRunConfig(url="https://www.nbcnews.com/business"))
|
||||
print(f"First crawl result: {result1.markdown[:100]}...")
|
||||
|
||||
# Force fresh crawl
|
||||
result2 = await crawler.arun(
|
||||
config=CrawlerRunConfig(url="https://www.nbcnews.com/business", cache_mode=CacheMode.BYPASS)
|
||||
)
|
||||
print(f"Second crawl result: {result2.markdown[:100]}...")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Adding a Chunking Strategy 🧩
|
||||
|
||||
Split content into chunks using `RegexChunking`:
|
||||
|
||||
```python
|
||||
from crawl4ai.chunking_strategy import RegexChunking
|
||||
|
||||
async def main():
|
||||
browser_config = BrowserConfig(verbose=True)
|
||||
crawl_config = CrawlerRunConfig(
|
||||
url="https://www.nbcnews.com/business",
|
||||
chunking_strategy=RegexChunking(patterns=["\n\n"])
|
||||
)
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(config=crawl_config)
|
||||
print(f"RegexChunking result: {result.extracted_content[:200]}...")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Advanced Features and Configurations
|
||||
|
||||
For advanced examples (LLM strategies, knowledge graphs, pagination handling), ensure all code aligns with the `BrowserConfig` and `CrawlerRunConfig` pattern shown above.
|
||||
@@ -1,145 +0,0 @@
|
||||
# Simple Crawling
|
||||
|
||||
This guide covers the basics of web crawling with Crawl4AI. You'll learn how to set up a crawler, make your first request, and understand the response.
|
||||
|
||||
## Basic Usage
|
||||
|
||||
Set up a simple crawl using `BrowserConfig` and `CrawlerRunConfig`:
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
|
||||
async def main():
|
||||
browser_config = BrowserConfig() # Default browser configuration
|
||||
run_config = CrawlerRunConfig() # Default crawl run configuration
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
config=run_config
|
||||
)
|
||||
print(result.markdown) # Print clean markdown content
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
## Understanding the Response
|
||||
|
||||
The `arun()` method returns a `CrawlResult` object with several useful properties. Here's a quick overview (see [CrawlResult](../api/crawl-result.md) for complete details):
|
||||
|
||||
```python
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
config=CrawlerRunConfig(fit_markdown=True)
|
||||
)
|
||||
|
||||
# Different content formats
|
||||
print(result.html) # Raw HTML
|
||||
print(result.cleaned_html) # Cleaned HTML
|
||||
print(result.markdown) # Markdown version
|
||||
print(result.fit_markdown) # Most relevant content in markdown
|
||||
|
||||
# Check success status
|
||||
print(result.success) # True if crawl succeeded
|
||||
print(result.status_code) # HTTP status code (e.g., 200, 404)
|
||||
|
||||
# Access extracted media and links
|
||||
print(result.media) # Dictionary of found media (images, videos, audio)
|
||||
print(result.links) # Dictionary of internal and external links
|
||||
```
|
||||
|
||||
## Adding Basic Options
|
||||
|
||||
Customize your crawl using `CrawlerRunConfig`:
|
||||
|
||||
```python
|
||||
run_config = CrawlerRunConfig(
|
||||
word_count_threshold=10, # Minimum words per content block
|
||||
exclude_external_links=True, # Remove external links
|
||||
remove_overlay_elements=True, # Remove popups/modals
|
||||
process_iframes=True # Process iframe content
|
||||
)
|
||||
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
config=run_config
|
||||
)
|
||||
```
|
||||
|
||||
## Handling Errors
|
||||
|
||||
Always check if the crawl was successful:
|
||||
|
||||
```python
|
||||
run_config = CrawlerRunConfig()
|
||||
result = await crawler.arun(url="https://example.com", config=run_config)
|
||||
|
||||
if not result.success:
|
||||
print(f"Crawl failed: {result.error_message}")
|
||||
print(f"Status code: {result.status_code}")
|
||||
```
|
||||
|
||||
## Logging and Debugging
|
||||
|
||||
Enable verbose logging in `BrowserConfig`:
|
||||
|
||||
```python
|
||||
browser_config = BrowserConfig(verbose=True)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
run_config = CrawlerRunConfig()
|
||||
result = await crawler.arun(url="https://example.com", config=run_config)
|
||||
```
|
||||
|
||||
## Complete Example
|
||||
|
||||
Here's a more comprehensive example demonstrating common usage patterns:
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
|
||||
async def main():
|
||||
browser_config = BrowserConfig(verbose=True)
|
||||
run_config = CrawlerRunConfig(
|
||||
# Content filtering
|
||||
word_count_threshold=10,
|
||||
excluded_tags=['form', 'header'],
|
||||
exclude_external_links=True,
|
||||
|
||||
# Content processing
|
||||
process_iframes=True,
|
||||
remove_overlay_elements=True,
|
||||
|
||||
# Cache control
|
||||
cache_mode=CacheMode.ENABLED # Use cache if available
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
config=run_config
|
||||
)
|
||||
|
||||
if result.success:
|
||||
# Print clean content
|
||||
print("Content:", result.markdown[:500]) # First 500 chars
|
||||
|
||||
# Process images
|
||||
for image in result.media["images"]:
|
||||
print(f"Found image: {image['src']}")
|
||||
|
||||
# Process links
|
||||
for link in result.links["internal"]:
|
||||
print(f"Internal link: {link['href']}")
|
||||
|
||||
else:
|
||||
print(f"Crawl failed: {result.error_message}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
Reference in New Issue
Block a user