### AsyncWebCrawler Constructor Parameters
```python
AsyncWebCrawler(
    # Core Browser Settings
    browser_type: str = "chromium",  # Options: "chromium", "firefox", "webkit"
    headless: bool = True,           # Whether to run the browser in headless mode
    verbose: bool = False,           # Enable verbose logging

    # Cache Settings
    always_by_pass_cache: bool = False,      # Always bypass cache regardless of run settings
    base_directory: str = str(Path.home()),  # Base directory for cache storage

    # Network Settings
    proxy: str = None,          # Simple proxy URL (e.g., "http://proxy.example.com:8080")
    proxy_config: Dict = None,  # Advanced proxy settings with auth: {"server": str, "username": str, "password": str}

    # Browser Behavior
    sleep_on_close: bool = False,  # Wait before closing the browser

    # Other settings passed to AsyncPlaywrightCrawlerStrategy
    user_agent: str = None,                 # Custom user agent string
    headers: Dict[str, str] = {},           # Custom HTTP headers
    js_code: Union[str, List[str]] = None,  # Default JavaScript to execute
)
```
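For orientation, a minimal sketch of constructing the crawler with a few of these options. The crawler is used as an async context manager; the URL and user agent below are placeholders.

```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def main():
    # The browser starts on entering the context and shuts down on exit.
    async with AsyncWebCrawler(
        browser_type="chromium",
        headless=True,
        verbose=True,
        user_agent="Mozilla/5.0 (compatible; MyCrawler/1.0)",  # placeholder UA
    ) as crawler:
        result = await crawler.arun(url="https://example.com")
        print(result.markdown)

asyncio.run(main())
```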
### arun() Method Parameters
```python
arun(
    # Core Parameters
    url: str,  # Required: URL to crawl

    # Content Selection
    css_selector: str = None,                        # CSS selector to extract specific content
    word_count_threshold: int = MIN_WORD_THRESHOLD,  # Minimum words for content blocks

    # Cache Control
    bypass_cache: bool = False,  # Bypass cache for this request

    # Session Management
    session_id: str = None,  # Session identifier for persistent browsing

    # Screenshot Options
    screenshot: bool = False,           # Take a page screenshot
    screenshot_wait_for: float = None,  # Wait time before the screenshot

    # Content Processing
    process_iframes: bool = False,          # Process iframe content
    remove_overlay_elements: bool = False,  # Remove popups/modals

    # Anti-Bot/Detection
    simulate_user: bool = False,       # Simulate human-like behavior
    override_navigator: bool = False,  # Override navigator properties
    magic: bool = False,               # Enable all anti-detection features

    # Content Filtering
    excluded_tags: List[str] = None,                 # HTML tags to exclude
    exclude_external_links: bool = False,            # Remove external links
    exclude_social_media_links: bool = False,        # Remove social media links
    exclude_external_images: bool = False,           # Remove external images
    exclude_social_media_domains: List[str] = None,  # Additional social media domains to exclude
    remove_forms: bool = False,                      # Remove all form elements

    # JavaScript Handling
    js_code: Union[str, List[str]] = None,  # JavaScript to execute
    js_only: bool = False,                  # Only execute JavaScript without reloading the page
    wait_for: str = None,                   # Wait condition (CSS selector or JS function)

    # Page Loading
    page_timeout: int = 60000,               # Page load timeout in milliseconds
    delay_before_return_html: float = None,  # Wait before returning HTML

    # Debug Options
    log_console: bool = False,  # Log browser console messages

    # Content Format Control
    only_text: bool = False,             # Extract only text content
    keep_data_attributes: bool = False,  # Keep data-* attributes in HTML

    # Markdown Options
    include_links_on_markdown: bool = False,  # Include links in markdown output
    html2text: Dict = {},                     # HTML-to-text conversion options

    # Extraction Strategy
    extraction_strategy: ExtractionStrategy = None,  # Strategy for structured data extraction

    # Advanced Browser Control
    user_agent: str = None,  # Override the user agent for this request
)
```
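A sketch combining several of these parameters in one call; the URL, selector, wait condition, and JS snippet are illustrative, not required:

```python
result = await crawler.arun(
    url="https://example.com/articles",
    css_selector="main.content",   # illustrative: narrow extraction to this region
    word_count_threshold=10,
    bypass_cache=True,
    js_code="window.scrollTo(0, document.body.scrollHeight);",  # e.g. trigger lazy loading
    wait_for="css:.article-list",  # assumed "css:" prefix form of a selector wait
    remove_overlay_elements=True,
    screenshot=True,
    page_timeout=30000,
)
```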
### Extraction Strategy Parameters
```python
# JsonCssExtractionStrategy
{
    "name": str,          # Name of the extraction schema
    "baseSelector": str,  # Base CSS selector
    "fields": [
        {
            "name": str,             # Field name
            "selector": str,         # CSS selector
            "type": str,             # Data type ("text", etc.)
            "transform": str = None  # Optional transformation
        }
    ]
}

# LLMExtractionStrategy
{
    "provider": str,              # LLM provider (e.g., "openai/gpt-4", "huggingface/...", "ollama/...")
    "api_token": str,             # API token
    "schema": dict,               # Pydantic model schema
    "extraction_type": str,       # Type of extraction ("schema", etc.)
    "instruction": str,           # Extraction instruction
    "extra_args": dict = None,    # Additional provider-specific arguments
    "extra_headers": dict = None  # Additional HTTP headers
}
```
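A sketch of turning these dictionaries into strategy objects, assuming both classes are importable from `crawl4ai.extraction_strategy`; the schema, provider, and instruction below are illustrative:

```python
import os
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy

# CSS-based extraction with an illustrative schema.
css_strategy = JsonCssExtractionStrategy(schema={
    "name": "articles",
    "baseSelector": "article.post",
    "fields": [
        {"name": "title", "selector": "h2", "type": "text"},
        {"name": "date", "selector": "time", "type": "text"},
    ],
})

# LLM-based extraction; the API key is read from the environment.
llm_strategy = LLMExtractionStrategy(
    provider="openai/gpt-4",
    api_token=os.getenv("OPENAI_API_KEY"),
    extraction_type="schema",
    instruction="Extract article titles and publication dates.",
)

# Either strategy is then passed to arun(..., extraction_strategy=...).
```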
### HTML to Text Conversion Options (html2text parameter)
```python
{
    "escape_dot": bool = True,  # Escape dots in text
    # Other html2text library options
}
```
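A sketch of passing these options through `arun()`. `escape_dot` is the only option named above; `body_width` is an assumption based on the html2text library's standard options:

```python
result = await crawler.arun(
    url="https://example.com",
    html2text={
        "escape_dot": False,  # don't escape dots in the markdown output
        "body_width": 0,      # assumed html2text option: disable line wrapping
    },
)
```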
### CrawlResult Fields
```python
class CrawlResult(BaseModel):
    # Basic Information
    url: str                    # The crawled URL
    # Example: "https://example.com"

    success: bool               # Whether the crawl was successful
    # Example: True/False

    status_code: Optional[int]  # HTTP status code
    # Example: 200, 404, 500

    # Content Fields
    html: str                    # Raw HTML content
    # Example: "<html><body>...</body></html>"

    cleaned_html: Optional[str]  # HTML after cleaning and processing
    # Example: "<article><p>Clean content...</p></article>"

    fit_html: Optional[str]      # Most relevant HTML after the content cleaning strategy
    # Example: "<div><p>Most relevant content...</p></div>"

    markdown: Optional[str]      # HTML converted to markdown
    # Example: "# Title\n\nContent paragraph..."

    fit_markdown: Optional[str]  # Most relevant content in markdown
    # Example: "# Main Article\n\nKey content..."

    # Media Content
    media: Dict[str, List[Dict]] = {}  # Extracted media information
    # Example: {
    #     "images": [
    #         {
    #             "src": "https://example.com/image.jpg",
    #             "alt": "Image description",
    #             "desc": "Contextual description",
    #             "score": 5,  # Relevance score
    #             "type": "image"
    #         }
    #     ],
    #     "videos": [
    #         {
    #             "src": "https://example.com/video.mp4",
    #             "alt": "Video title",
    #             "type": "video",
    #             "description": "Video context"
    #         }
    #     ],
    #     "audios": [
    #         {
    #             "src": "https://example.com/audio.mp3",
    #             "alt": "Audio title",
    #             "type": "audio",
    #             "description": "Audio context"
    #         }
    #     ]
    # }

    # Link Information
    links: Dict[str, List[Dict]] = {}  # Extracted links
    # Example: {
    #     "internal": [
    #         {
    #             "href": "https://example.com/page",
    #             "text": "Link text",
    #             "title": "Link title"
    #         }
    #     ],
    #     "external": [
    #         {
    #             "href": "https://external.com",
    #             "text": "External link text",
    #             "title": "External link title"
    #         }
    #     ]
    # }

    # Extraction Results
    extracted_content: Optional[str]  # Content from the extraction strategy
    # Example for JsonCssExtractionStrategy:
    #     '[{"title": "Article 1", "date": "2024-03-20"}, ...]'
    # Example for LLMExtractionStrategy:
    #     '{"entities": [...], "relationships": [...]}'

    # Additional Information
    metadata: Optional[dict] = None  # Page metadata
    # Example: {
    #     "title": "Page Title",
    #     "description": "Meta description",
    #     "keywords": ["keyword1", "keyword2"],
    #     "author": "Author Name",
    #     "published_date": "2024-03-20"
    # }

    screenshot: Optional[str] = None  # Base64-encoded screenshot
    # Example: "iVBORw0KGgoAAAANSUhEUgAA..."

    error_message: Optional[str] = None  # Error message if the crawl failed
    # Example: "Failed to load page: timeout"

    session_id: Optional[str] = None  # Session identifier
    # Example: "session_123456"

    response_headers: Optional[dict] = None  # HTTP response headers
    # Example: {
    #     "content-type": "text/html",
    #     "server": "nginx/1.18.0",
    #     "date": "Wed, 20 Mar 2024 12:00:00 GMT"
    # }
```
### Common Usage Patterns
1. Basic Content Extraction:
```python
result = await crawler.arun(url="https://example.com")
print(result.markdown)      # Clean, readable content
print(result.cleaned_html)  # Cleaned HTML
```

2. Media Analysis:
```python
result = await crawler.arun(url="https://example.com")
for image in result.media["images"]:
    if image["score"] > 3:  # High-relevance images
        print(f"High-relevance image: {image['src']}")
```

3. Link Analysis:
```python
result = await crawler.arun(url="https://example.com")
internal_links = [link["href"] for link in result.links["internal"]]
external_links = [link["href"] for link in result.links["external"]]
```

4. Structured Data Extraction:
```python
import json

result = await crawler.arun(
    url="https://example.com",
    extraction_strategy=my_strategy
)
structured_data = json.loads(result.extracted_content)
```

5. Error Handling:
```python
result = await crawler.arun(url="https://example.com")
if not result.success:
    print(f"Crawl failed: {result.error_message}")
    print(f"Status code: {result.status_code}")
```