Update Documentation

2024-10-27 19:24:46 +08:00
parent 38474bd66a
commit 4239654722
111 changed files with 7680 additions and 53 deletions
--- a/docs/md_v2/basic/browser-config.md
+++ b/docs/md_v2/basic/browser-config.md
@@ -0,0 +1,208 @@
+# Browser Configuration
+
+Crawl4AI supports multiple browser engines and offers extensive configuration options for browser behavior.
+
+## Browser Types
+
+Choose from three browser engines:
+
+```python
+# Chromium (default)
+async with AsyncWebCrawler(browser_type="chromium") as crawler:
+    result = await crawler.arun(url="https://example.com")
+
+# Firefox
+async with AsyncWebCrawler(browser_type="firefox") as crawler:
+    result = await crawler.arun(url="https://example.com")
+
+# WebKit
+async with AsyncWebCrawler(browser_type="webkit") as crawler:
+    result = await crawler.arun(url="https://example.com")
+```
+
+## Basic Configuration
+
+Common browser settings:
+
+```python
+async with AsyncWebCrawler(
+    headless=True,           # Run in headless mode (no GUI)
+    verbose=True,           # Enable detailed logging
+    sleep_on_close=False    # No delay when closing browser
+) as crawler:
+    result = await crawler.arun(url="https://example.com")
+```
+
+## Identity Management
+
+Control how your crawler appears to websites:
+
+```python
+# Custom user agent
+async with AsyncWebCrawler(
+    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
+) as crawler:
+    result = await crawler.arun(url="https://example.com")
+
+# Custom headers
+headers = {
+    "Accept-Language": "en-US,en;q=0.9",
+    "Cache-Control": "no-cache"
+}
+async with AsyncWebCrawler(headers=headers) as crawler:
+    result = await crawler.arun(url="https://example.com")
+```
+
+## Screenshot Capabilities
+
+Capture page screenshots with enhanced error handling:
+
+```python
+result = await crawler.arun(
+    url="https://example.com",
+    screenshot=True,                # Enable screenshot
+    screenshot_wait_for=2.0        # Wait 2 seconds before capture
+)
+
+if result.screenshot:  # Base64 encoded image
+    import base64
+    with open("screenshot.png", "wb") as f:
+        f.write(base64.b64decode(result.screenshot))
+```
+
+## Timeouts and Waiting
+
+Control page loading behavior:
+
+```python
+result = await crawler.arun(
+    url="https://example.com",
+    page_timeout=60000,              # Page load timeout (ms)
+    delay_before_return_html=2.0,    # Wait before content capture
+    wait_for="css:.dynamic-content"  # Wait for specific element
+)
+```
+
+## JavaScript Execution
+
+Execute custom JavaScript on the loaded page before its content is captured:
+
+```python
+# Single JavaScript command
+result = await crawler.arun(
+    url="https://example.com",
+    js_code="window.scrollTo(0, document.body.scrollHeight);"
+)
+
+# Multiple commands
+js_commands = [
+    "window.scrollTo(0, document.body.scrollHeight);",
+    "document.querySelector('.load-more').click();"
+]
+result = await crawler.arun(
+    url="https://example.com",
+    js_code=js_commands
+)
+```
+
+## Proxy Configuration
+
+Route the browser's traffic through a proxy server, with or without authentication:
+
+```python
+# Simple proxy
+async with AsyncWebCrawler(
+    proxy="http://proxy.example.com:8080"
+) as crawler:
+    result = await crawler.arun(url="https://example.com")
+
+# Proxy with authentication
+proxy_config = {
+    "server": "http://proxy.example.com:8080",
+    "username": "user",
+    "password": "pass"
+}
+async with AsyncWebCrawler(proxy_config=proxy_config) as crawler:
+    result = await crawler.arun(url="https://example.com")
+```
+
+## Anti-Detection Features
+
+Enable stealth features to avoid bot detection:
+
+```python
+result = await crawler.arun(
+    url="https://example.com",
+    simulate_user=True,        # Simulate human behavior
+    override_navigator=True,   # Mask automation signals
+    magic=True               # Enable all anti-detection features
+)
+```
+
+## Handling Dynamic Content
+
+Configure browser to handle dynamic content:
+
+```python
+# Wait for dynamic content
+result = await crawler.arun(
+    url="https://example.com",
+    wait_for="js:() => document.querySelector('.content').children.length > 10",
+    process_iframes=True     # Process iframe content
+)
+
+# Handle lazy-loaded images
+result = await crawler.arun(
+    url="https://example.com",
+    js_code="window.scrollTo(0, document.body.scrollHeight);",
+    delay_before_return_html=2.0  # Wait for images to load
+)
+```
+
+## Comprehensive Example
+
+Here's how to combine various browser configurations:
+
+```python
+async def crawl_with_advanced_config(url: str):
+    async with AsyncWebCrawler(
+        # Browser setup
+        browser_type="chromium",
+        headless=True,
+        verbose=True,
+        
+        # Identity
+        user_agent="Custom User Agent",
+        headers={"Accept-Language": "en-US"},
+        
+        # Proxy setup
+        proxy="http://proxy.example.com:8080"
+    ) as crawler:
+        result = await crawler.arun(
+            url=url,
+            # Content handling
+            process_iframes=True,
+            screenshot=True,
+            
+            # Timing
+            page_timeout=60000,
+            delay_before_return_html=2.0,
+            
+            # Anti-detection
+            magic=True,
+            simulate_user=True,
+            
+            # Dynamic content
+            js_code=[
+                "window.scrollTo(0, document.body.scrollHeight);",
+                "document.querySelector('.load-more')?.click();"
+            ],
+            wait_for="css:.dynamic-content"
+        )
+        
+        return {
+            "content": result.markdown,
+            "screenshot": result.screenshot,
+            "success": result.success
+        }
+```
--- a/docs/md_v2/basic/content-selection.md
+++ b/docs/md_v2/basic/content-selection.md
@@ -0,0 +1,199 @@
+# Content Selection
+
+Crawl4AI provides multiple ways to select and filter specific content from webpages. Learn how to precisely target the content you need.
+
+## CSS Selectors
+
+The simplest way to extract specific content:
+
+```python
+# Extract specific content using CSS selector
+result = await crawler.arun(
+    url="https://example.com",
+    css_selector=".main-article"  # Target main article content
+)
+
+# Multiple selectors
+result = await crawler.arun(
+    url="https://example.com",
+    css_selector="article h1, article .content"  # Target heading and content
+)
+```
+
+## Content Filtering
+
+Control what content is included or excluded:
+
+```python
+result = await crawler.arun(
+    url="https://example.com",
+    # Content thresholds
+    word_count_threshold=10,        # Minimum words per block
+    
+    # Tag exclusions
+    excluded_tags=['form', 'header', 'footer', 'nav'],
+    
+    # Link filtering
+    exclude_external_links=True,    # Remove external links
+    exclude_social_media_links=True,  # Remove social media links
+    
+    # Media filtering
+    exclude_external_images=True   # Remove external images
+)
+```
+
+## Iframe Content
+
+Process content inside iframes:
+
+```python
+result = await crawler.arun(
+    url="https://example.com",
+    process_iframes=True,  # Extract iframe content
+    remove_overlay_elements=True  # Remove popups/modals that might block iframes
+)
+```
+
+## Structured Content Selection
+
+### Using LLMs for Smart Selection
+
+Use LLMs to intelligently extract specific types of content:
+
+```python
+from pydantic import BaseModel
+from crawl4ai.extraction_strategy import LLMExtractionStrategy
+
+class ArticleContent(BaseModel):
+    title: str
+    main_points: List[str]
+    conclusion: str
+
+strategy = LLMExtractionStrategy(
+    provider="ollama/nemotron",  # Works with any supported LLM
+    schema=ArticleContent.schema(),
+    instruction="Extract the main article title, key points, and conclusion"
+)
+
+result = await crawler.arun(
+    url="https://example.com",
+    extraction_strategy=strategy
+)
+article = json.loads(result.extracted_content)
+```
+
+### Pattern-Based Selection
+
+For repeated content patterns (like product listings, news feeds):
+
+```python
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+
+schema = {
+    "name": "News Articles",
+    "baseSelector": "article.news-item",  # Repeated element
+    "fields": [
+        {"name": "headline", "selector": "h2", "type": "text"},
+        {"name": "summary", "selector": ".summary", "type": "text"},
+        {"name": "category", "selector": ".category", "type": "text"},
+        {
+            "name": "metadata",
+            "type": "nested",
+            "fields": [
+                {"name": "author", "selector": ".author", "type": "text"},
+                {"name": "date", "selector": ".date", "type": "text"}
+            ]
+        }
+    ]
+}
+
+strategy = JsonCssExtractionStrategy(schema)
+result = await crawler.arun(
+    url="https://example.com",
+    extraction_strategy=strategy
+)
+articles = json.loads(result.extracted_content)
+```
+
+## Domain-Based Filtering
+
+Control content based on domains:
+
+```python
+result = await crawler.arun(
+    url="https://example.com",
+    exclude_domains=["ads.com", "tracker.com"],
+    exclude_social_media_domains=["facebook.com", "twitter.com"],  # Custom social media domains to exclude
+    exclude_social_media_links=True
+)
+```
+
+## Media Selection
+
+Access the media found on a page, grouped by type:
+
+```python
+result = await crawler.arun(url="https://example.com")
+
+# Access different media types
+images = result.media["images"]  # List of image details
+videos = result.media["videos"]  # List of video details
+audios = result.media["audios"]  # List of audio details
+
+# Image with metadata
+for image in images:
+    print(f"URL: {image['src']}")
+    print(f"Alt text: {image['alt']}")
+    print(f"Description: {image['desc']}")
+    print(f"Relevance score: {image['score']}")
+```
+
+## Comprehensive Example
+
+Here's how to combine different selection methods:
+
+```python
+async def extract_article_content(url: str):
+    # Define structured extraction
+    article_schema = {
+        "name": "Article",
+        "baseSelector": "article.main",
+        "fields": [
+            {"name": "title", "selector": "h1", "type": "text"},
+            {"name": "content", "selector": ".content", "type": "text"}
+        ]
+    }
+    
+    # Define LLM extraction
+    class ArticleAnalysis(BaseModel):
+        key_points: List[str]
+        sentiment: str
+        category: str
+
+    async with AsyncWebCrawler() as crawler:
+        # Get structured content
+        pattern_result = await crawler.arun(
+            url=url,
+            extraction_strategy=JsonCssExtractionStrategy(article_schema),
+            word_count_threshold=10,
+            excluded_tags=['nav', 'footer'],
+            exclude_external_links=True
+        )
+        
+        # Get semantic analysis
+        analysis_result = await crawler.arun(
+            url=url,
+            extraction_strategy=LLMExtractionStrategy(
+                provider="ollama/nemotron",
+                schema=ArticleAnalysis.schema(),
+                instruction="Analyze the article content"
+            )
+        )
+        
+        # Combine results
+        return {
+            "article": json.loads(pattern_result.extracted_content),
+            "analysis": json.loads(analysis_result.extracted_content),
+            "media": pattern_result.media
+        }
+```
--- a/docs/md_v2/basic/installation.md
+++ b/docs/md_v2/basic/installation.md
@@ -0,0 +1,92 @@
+# Installation 💻
+
+Crawl4AI offers flexible installation options to suit various use cases. Today you can install it as a Python package; using it with Docker or running it as a local server is planned but not yet available (see Options 2 and 3 below).
+
+## Option 1: Python Package Installation (Recommended)
+
+Crawl4AI is now available on PyPI, making installation easier than ever. Choose the option that best fits your needs:
+
+### Basic Installation
+
+For basic web crawling and scraping tasks:
+
+```bash
+pip install crawl4ai
+playwright install # Download the browser binaries Playwright needs
+```
+
+### Installation with PyTorch
+
+For advanced text clustering (includes CosineSimilarity cluster strategy):
+
+```bash
+pip install crawl4ai[torch]
+```
+
+### Installation with Transformers
+
+For text summarization and Hugging Face models:
+
+```bash
+pip install crawl4ai[transformer]
+```
+
+### Full Installation
+
+For all features:
+
+```bash
+pip install crawl4ai[all]
+```
+
+### Development Installation
+
+For contributors who plan to modify the source code:
+
+```bash
+git clone https://github.com/unclecode/crawl4ai.git
+cd crawl4ai
+pip install -e ".[all]"
+playwright install # Download the browser binaries Playwright needs
+```
+
+💡 After installing with the "torch", "transformer", or "all" option, it's recommended to run the following CLI command to download the required models:
+
+```bash
+crawl4ai-download-models
+```
+
+This is optional but will boost the performance and speed of the crawler. You only need to do this once after installation.
+
+## Option 2: Using Docker (Coming Soon)
+
+Docker support for Crawl4AI is currently in progress and will be available soon. This will allow you to run Crawl4AI in a containerized environment, ensuring consistency across different systems.
+
+## Option 3: Local Server Installation
+
+For those who prefer to run Crawl4AI as a local server, instructions will be provided once the Docker implementation is complete.
+
+## Verifying Your Installation
+
+After installation, you can verify that Crawl4AI is working correctly by running a simple Python script:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+
+async def main():
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        result = await crawler.arun(url="https://www.example.com")
+        print(result.markdown[:500])  # Print first 500 characters
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+This script should successfully crawl the example website and print the first 500 characters of the extracted content.
+
+## Getting Help
+
+If you encounter any issues during installation or usage, please check the [documentation](https://crawl4ai.com/mkdocs/) or raise an issue on the [GitHub repository](https://github.com/unclecode/crawl4ai/issues).
+
+Happy crawling! 🕷️🤖
--- a/docs/md_v2/basic/output-formats.md
+++ b/docs/md_v2/basic/output-formats.md
@@ -0,0 +1,195 @@
+# Output Formats
+
+Crawl4AI provides multiple output formats to suit different needs, from raw HTML to structured data using LLM or pattern-based extraction.
+
+## Basic Formats
+
+```python
+result = await crawler.arun(url="https://example.com")
+
+# Access different formats
+raw_html = result.html           # Original HTML
+clean_html = result.cleaned_html # Sanitized HTML
+markdown = result.markdown       # Standard markdown
+fit_md = result.fit_markdown    # Most relevant content in markdown
+```
+
+## Raw HTML
+
+Original, unmodified HTML from the webpage. Useful when you need to:
+- Preserve the exact page structure
+- Process HTML with your own tools
+- Debug page issues
+
+```python
+result = await crawler.arun(url="https://example.com")
+print(result.html)  # Complete HTML including headers, scripts, etc.
+```
+
+## Cleaned HTML
+
+Sanitized HTML with unnecessary elements removed. Automatically:
+- Removes scripts and styles
+- Cleans up formatting
+- Preserves semantic structure
+
+```python
+result = await crawler.arun(
+    url="https://example.com",
+    excluded_tags=['form', 'header', 'footer'],  # Additional tags to remove
+    keep_data_attributes=False  # Remove data-* attributes
+)
+print(result.cleaned_html)
+```
+
+## Standard Markdown
+
+HTML converted to clean markdown format. Great for:
+- Content analysis
+- Documentation
+- Readability
+
+```python
+result = await crawler.arun(
+    url="https://example.com",
+    include_links_on_markdown=True  # Include links in markdown
+)
+print(result.markdown)
+```
+
+## Fit Markdown
+
+Most relevant content extracted and converted to markdown. Ideal for:
+- Article extraction
+- Main content focus
+- Removing boilerplate
+
+```python
+result = await crawler.arun(url="https://example.com")
+print(result.fit_markdown)  # Only the main content
+```
+
+## Structured Data Extraction
+
+Crawl4AI offers two powerful approaches for structured data extraction:
+
+### 1. LLM-Based Extraction
+
+Use any LLM (OpenAI, HuggingFace, Ollama, etc.) to extract structured data with high accuracy:
+
+```python
+from pydantic import BaseModel
+from crawl4ai.extraction_strategy import LLMExtractionStrategy
+
+class KnowledgeGraph(BaseModel):
+    entities: List[dict]
+    relationships: List[dict]
+
+strategy = LLMExtractionStrategy(
+    provider="ollama/nemotron",  # or "openai/...", "huggingface/..."
+    api_token="your-token",   # not needed for Ollama
+    schema=KnowledgeGraph.schema(),
+    instruction="Extract entities and relationships from the content"
+)
+
+result = await crawler.arun(
+    url="https://example.com",
+    extraction_strategy=strategy
+)
+knowledge_graph = json.loads(result.extracted_content)
+```
+
+### 2. Pattern-Based Extraction
+
+For pages with repetitive patterns (e.g., product listings, article feeds), use JsonCssExtractionStrategy:
+
+```python
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+
+schema = {
+    "name": "Product Listing",
+    "baseSelector": ".product-card",  # Repeated element
+    "fields": [
+        {"name": "title", "selector": "h2", "type": "text"},
+        {"name": "price", "selector": ".price", "type": "text"},
+        {"name": "description", "selector": ".desc", "type": "text"}
+    ]
+}
+
+strategy = JsonCssExtractionStrategy(schema)
+result = await crawler.arun(
+    url="https://example.com",
+    extraction_strategy=strategy
+)
+products = json.loads(result.extracted_content)
+```
+
+## Content Customization
+
+### HTML to Text Options
+
+Configure markdown conversion:
+
+```python
+result = await crawler.arun(
+    url="https://example.com",
+    html2text={
+        "escape_dot": False,
+        "body_width": 0,
+        "protect_links": True,
+        "unicode_snob": True
+    }
+)
+```
+
+### Content Filters
+
+Control what content is included:
+
+```python
+result = await crawler.arun(
+    url="https://example.com",
+    word_count_threshold=10,        # Minimum words per block
+    exclude_external_links=True,    # Remove external links
+    exclude_external_images=True,   # Remove external images
+    excluded_tags=['form', 'nav']   # Remove specific HTML tags
+)
+```
+
+## Comprehensive Example
+
+Here's how to use multiple output formats together:
+
+```python
+async def crawl_content(url: str):
+    async with AsyncWebCrawler() as crawler:
+        # Extract main content with fit markdown
+        result = await crawler.arun(
+            url=url,
+            word_count_threshold=10,
+            exclude_external_links=True
+        )
+        
+        # Get structured data using LLM
+        llm_result = await crawler.arun(
+            url=url,
+            extraction_strategy=LLMExtractionStrategy(
+                provider="ollama/nemotron",
+                schema=YourSchema.schema(),
+                instruction="Extract key information"
+            )
+        )
+        
+        # Get repeated patterns (if any)
+        pattern_result = await crawler.arun(
+            url=url,
+            extraction_strategy=JsonCssExtractionStrategy(your_schema)
+        )
+        
+        return {
+            "main_content": result.fit_markdown,
+            "structured_data": json.loads(llm_result.extracted_content),
+            "pattern_data": json.loads(pattern_result.extracted_content),
+            "media": result.media
+        }
+```
--- a/docs/md_v2/basic/page-interaction.md
+++ b/docs/md_v2/basic/page-interaction.md
@@ -0,0 +1,207 @@
+# Page Interaction
+
+Crawl4AI provides powerful features for interacting with dynamic webpages, handling JavaScript execution, and managing page events.
+
+## JavaScript Execution
+
+### Basic Execution
+
+```python
+# Single JavaScript command
+result = await crawler.arun(
+    url="https://example.com",
+    js_code="window.scrollTo(0, document.body.scrollHeight);"
+)
+
+# Multiple commands
+js_commands = [
+    "window.scrollTo(0, document.body.scrollHeight);",
+    "document.querySelector('.load-more').click();",
+    "document.querySelector('#consent-button').click();"
+]
+result = await crawler.arun(
+    url="https://example.com",
+    js_code=js_commands
+)
+```
+
+## Wait Conditions
+
+### CSS-Based Waiting
+
+Wait for elements to appear:
+
+```python
+result = await crawler.arun(
+    url="https://example.com",
+    wait_for="css:.dynamic-content"  # Wait for element with class 'dynamic-content'
+)
+```
+
+### JavaScript-Based Waiting
+
+Wait for custom conditions:
+
+```python
+# Wait for number of elements
+wait_condition = """() => {
+    return document.querySelectorAll('.item').length > 10;
+}"""
+
+result = await crawler.arun(
+    url="https://example.com",
+    wait_for=f"js:{wait_condition}"
+)
+
+# Wait for dynamic content to load
+wait_for_content = """() => {
+    const content = document.querySelector('.content');
+    return content && content.innerText.length > 100;
+}"""
+
+result = await crawler.arun(
+    url="https://example.com",
+    wait_for=f"js:{wait_for_content}"
+)
+```
+
+## Handling Dynamic Content
+
+### Load More Content
+
+Handle infinite scroll or load more buttons:
+
+```python
+# Scroll and wait pattern
+result = await crawler.arun(
+    url="https://example.com",
+    js_code=[
+        # Scroll to bottom
+        "window.scrollTo(0, document.body.scrollHeight);",
+        # Click load more if exists
+        "const loadMore = document.querySelector('.load-more'); if(loadMore) loadMore.click();"
+    ],
+    # Wait for new content
+    wait_for="js:() => document.querySelectorAll('.item').length > previousCount"
+)
+```
+
+### Form Interaction
+
+Handle forms and inputs:
+
+```python
+js_form_interaction = """
+    // Fill form fields
+    document.querySelector('#search').value = 'search term';
+    // Submit form
+    document.querySelector('form').submit();
+"""
+
+result = await crawler.arun(
+    url="https://example.com",
+    js_code=js_form_interaction,
+    wait_for="css:.results"  # Wait for results to load
+)
+```
+
+## Timing Control
+
+### Delays and Timeouts
+
+Control timing of interactions:
+
+```python
+result = await crawler.arun(
+    url="https://example.com",
+    page_timeout=60000,              # Page load timeout (ms)
+    delay_before_return_html=2.0,    # Wait before capturing content
+)
+```
+
+## Complex Interactions Example
+
+Here's an example of handling a dynamic page with multiple interactions:
+
+```python
+async def crawl_dynamic_content():
+    async with AsyncWebCrawler() as crawler:
+        # Initial page load
+        result = await crawler.arun(
+            url="https://example.com",
+            # Handle cookie consent
+            js_code="document.querySelector('.cookie-accept')?.click();",
+            wait_for="css:.main-content"
+        )
+
+        # Load more content
+        session_id = "dynamic_session"  # Keep session for multiple interactions
+        
+        for page in range(3):  # Load 3 pages of content
+            result = await crawler.arun(
+                url="https://example.com",
+                session_id=session_id,
+                js_code=[
+                    # Scroll to bottom
+                    "window.scrollTo(0, document.body.scrollHeight);",
+                    # Store current item count
+                    "window.previousCount = document.querySelectorAll('.item').length;",
+                    # Click load more
+                    "document.querySelector('.load-more')?.click();"
+                ],
+                # Wait for new items
+                wait_for="""() => {
+                    const currentCount = document.querySelectorAll('.item').length;
+                    return currentCount > window.previousCount;
+                }""",
+                # Only execute JS without reloading page
+                js_only=True if page > 0 else False
+            )
+            
+            # Process content after each load
+            print(f"Page {page + 1} items:", len(result.cleaned_html))
+            
+        # Clean up session
+        await crawler.crawler_strategy.kill_session(session_id)
+```
+
+## Using with Extraction Strategies
+
+Combine page interaction with structured extraction:
+
+```python
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy
+
+# Pattern-based extraction after interaction
+schema = {
+    "name": "Dynamic Items",
+    "baseSelector": ".item",
+    "fields": [
+        {"name": "title", "selector": "h2", "type": "text"},
+        {"name": "description", "selector": ".desc", "type": "text"}
+    ]
+}
+
+result = await crawler.arun(
+    url="https://example.com",
+    js_code="window.scrollTo(0, document.body.scrollHeight);",
+    wait_for="css:.item:nth-child(10)",  # Wait for 10 items
+    extraction_strategy=JsonCssExtractionStrategy(schema)
+)
+
+# Or use LLM to analyze dynamic content
+class ContentAnalysis(BaseModel):
+    topics: List[str]
+    summary: str
+
+result = await crawler.arun(
+    url="https://example.com",
+    js_code="document.querySelector('.show-more').click();",
+    wait_for="css:.full-content",
+    extraction_strategy=LLMExtractionStrategy(
+        provider="ollama/nemotron",
+        schema=ContentAnalysis.schema(),
+        instruction="Analyze the full content"
+    )
+)
+```
--- a/docs/md_v2/basic/quickstart.md
+++ b/docs/md_v2/basic/quickstart.md
@@ -0,0 +1,297 @@
+# Quick Start Guide 🚀
+
+Welcome to the Crawl4AI Quickstart Guide! In this tutorial, we'll walk you through the basic usage of Crawl4AI with a friendly and humorous tone. We'll cover everything from basic usage to advanced features like chunking and extraction strategies, all with the power of asynchronous programming. Let's dive in! 🌟
+
+## Getting Started 🛠️
+
+First, let's import the necessary modules and create an instance of `AsyncWebCrawler`. We'll use an async context manager, which handles the setup and teardown of the crawler for us.
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+
+async def main():
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        # We'll add our crawling code here
+        pass
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+### Basic Usage
+
+Simply provide a URL and let Crawl4AI do the magic!
+
+```python
+async def main():
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        result = await crawler.arun(url="https://www.nbcnews.com/business")
+        print(f"Basic crawl result: {result.markdown[:500]}")  # Print first 500 characters
+
+asyncio.run(main())
+```
+
+### Taking Screenshots 📸
+
+Capture screenshots of web pages easily:
+
+```python
+async def capture_and_save_screenshot(url: str, output_path: str):
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        result = await crawler.arun(
+            url=url,
+            screenshot=True,
+            bypass_cache=True
+        )
+        
+        if result.success and result.screenshot:
+            import base64
+            screenshot_data = base64.b64decode(result.screenshot)
+            with open(output_path, 'wb') as f:
+                f.write(screenshot_data)
+            print(f"Screenshot saved successfully to {output_path}")
+        else:
+            print("Failed to capture screenshot")
+```
+
+### Browser Selection 🌐
+
+Crawl4AI supports multiple browser engines. Here's how to use different browsers:
+
+```python
+# Use Firefox
+async with AsyncWebCrawler(browser_type="firefox", verbose=True, headless=True) as crawler:
+    result = await crawler.arun(url="https://www.example.com", bypass_cache=True)
+
+# Use WebKit
+async with AsyncWebCrawler(browser_type="webkit", verbose=True, headless=True) as crawler:
+    result = await crawler.arun(url="https://www.example.com", bypass_cache=True)
+
+# Use Chromium (default)
+async with AsyncWebCrawler(verbose=True, headless=True) as crawler:
+    result = await crawler.arun(url="https://www.example.com", bypass_cache=True)
+```
+
+### User Simulation 🎭
+
+Simulate real user behavior to avoid detection:
+
+```python
+async with AsyncWebCrawler(verbose=True, headless=True) as crawler:
+    result = await crawler.arun(
+        url="YOUR-URL-HERE",
+        bypass_cache=True,
+        simulate_user=True,  # Causes random mouse movements and clicks
+        override_navigator=True  # Makes the browser appear more like a real user
+    )
+```
+
+### Understanding Parameters 🧠
+
+By default, Crawl4AI caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! To skip the cache and force a fresh crawl, pass `bypass_cache=True`. Let's see this in action.
+
+```python
+async def main():
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        # First crawl (caches the result)
+        result1 = await crawler.arun(url="https://www.nbcnews.com/business")
+        print(f"First crawl result: {result1.markdown[:100]}...")
+
+        # Force to crawl again
+        result2 = await crawler.arun(url="https://www.nbcnews.com/business", bypass_cache=True)
+        print(f"Second crawl result: {result2.markdown[:100]}...")
+
+asyncio.run(main())
+```
+
+### Adding a Chunking Strategy 🧩
+
+Let's add a chunking strategy: `RegexChunking`! This strategy splits the text based on a given regex pattern.
+
+```python
+from crawl4ai.chunking_strategy import RegexChunking
+
+async def main():
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        result = await crawler.arun(
+            url="https://www.nbcnews.com/business",
+            chunking_strategy=RegexChunking(patterns=["\n\n"])
+        )
+        print(f"RegexChunking result: {result.extracted_content[:200]}...")
+
+asyncio.run(main())
+```
+
+### Using LLMExtractionStrategy with Different Providers 🤖
+
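+Crawl4AI supports multiple LLM providers for extraction. The calls below assume a helper coroutine, `extract_structured_data_using_llm`, that is not shown here; it is expected to build an `LLMExtractionStrategy` for the given provider and run the crawl: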
+
+```python
+from crawl4ai.extraction_strategy import LLMExtractionStrategy
+from pydantic import BaseModel, Field
+
+class OpenAIModelFee(BaseModel):
+    model_name: str = Field(..., description="Name of the OpenAI model.")
+    input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
+    output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")
+
+# OpenAI
+await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
+
+# Hugging Face
+await extract_structured_data_using_llm(
+    "huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", 
+    os.getenv("HUGGINGFACE_API_KEY")
+)
+
+# Ollama
+await extract_structured_data_using_llm("ollama/llama3.2")
+
+# With custom headers
+custom_headers = {
+    "Authorization": "Bearer your-custom-token",
+    "X-Custom-Header": "Some-Value"
+}
+await extract_structured_data_using_llm(extra_headers=custom_headers)
+```
+
+### Knowledge Graph Generation 🕸️
+
+Generate knowledge graphs from web content:
+
+```python
+from pydantic import BaseModel
+from typing import List
+
+class Entity(BaseModel):
+    name: str
+    description: str
+    
+class Relationship(BaseModel):
+    entity1: Entity
+    entity2: Entity
+    description: str
+    relation_type: str
+
+class KnowledgeGraph(BaseModel):
+    entities: List[Entity]
+    relationships: List[Relationship]
+
+extraction_strategy = LLMExtractionStrategy(
+    provider='openai/gpt-4o-mini',
+    api_token=os.getenv('OPENAI_API_KEY'),
+    schema=KnowledgeGraph.model_json_schema(),
+    extraction_type="schema",
+    instruction="Extract entities and relationships from the given text."
+)
+
+async with AsyncWebCrawler() as crawler:
+    result = await crawler.arun(
+        url="https://paulgraham.com/love.html",
+        bypass_cache=True,
+        extraction_strategy=extraction_strategy
+    )
+```
+
+### Advanced Session-Based Crawling with Dynamic Content 🔄
+
+For modern web applications with dynamic content loading, here's how to handle pagination and content updates:
+
+```python
+async def crawl_dynamic_content():
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        url = "https://github.com/microsoft/TypeScript/commits/main"
+        session_id = "typescript_commits_session"
+        
+        js_next_page = """
+        const button = document.querySelector('a[data-testid="pagination-next-button"]');
+        if (button) button.click();
+        """
+
+        wait_for = """() => {
+            const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
+            if (commits.length === 0) return false;
+            const firstCommit = commits[0].textContent.trim();
+            return firstCommit !== window.firstCommit;
+        }"""
+        
+        schema = {
+            "name": "Commit Extractor",
+            "baseSelector": "li.Box-sc-g0xbh4-0",
+            "fields": [
+                {
+                    "name": "title",
+                    "selector": "h4.markdown-title",
+                    "type": "text",
+                    "transform": "strip",
+                },
+            ],
+        }
+        extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
+
+        for page in range(3):  # Crawl 3 pages
+            result = await crawler.arun(
+                url=url,
+                session_id=session_id,
+                css_selector="li.Box-sc-g0xbh4-0",
+                extraction_strategy=extraction_strategy,
+                js_code=js_next_page if page > 0 else None,
+                wait_for=wait_for if page > 0 else None,
+                js_only=page > 0,
+                bypass_cache=True,
+                headless=False,
+            )
+
+        await crawler.crawler_strategy.kill_session(session_id)
+```
+
+### Handling Overlays and Fitting Content 📏
+
+Remove overlay elements and fit content appropriately:
+
+```python
+async with AsyncWebCrawler(headless=False) as crawler:
+    result = await crawler.arun(
+        url="your-url-here",
+        bypass_cache=True,
+        word_count_threshold=10,
+        remove_overlay_elements=True,
+        screenshot=True
+    )
+```
+
+## Performance Comparison 🏎️
+
+Crawl4AI offers impressive performance compared to other solutions:
+
+```python
+# Firecrawl comparison
+from firecrawl import FirecrawlApp
+app = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY'])
+start = time.time()
+scrape_status = app.scrape_url(
+    'https://www.nbcnews.com/business',
+    params={'formats': ['markdown', 'html']}
+)
+end = time.time()
+
+# Crawl4AI comparison
+async with AsyncWebCrawler() as crawler:
+    start = time.time()
+    result = await crawler.arun(
+        url="https://www.nbcnews.com/business",
+        word_count_threshold=0,
+        bypass_cache=True,
+        verbose=False,
+    )
+    end = time.time()
+```
+
+Note: Performance comparisons should be conducted in environments with stable and fast internet connections for accurate results.
+
+## Congratulations! 🎉
+
+You've made it through the updated Crawl4AI Quickstart Guide! Now you're equipped with even more powerful features to crawl the web asynchronously like a pro! 🕸️
+
+Happy crawling! 🚀
--- a/docs/md_v2/basic/simple-crawling.md
+++ b/docs/md_v2/basic/simple-crawling.md
@@ -0,0 +1,120 @@
+# Simple Crawling
+
+This guide covers the basics of web crawling with Crawl4AI. You'll learn how to set up a crawler, make your first request, and understand the response.
+
+## Basic Usage
+
+Here's the simplest way to crawl a webpage:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+
+async def main():
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(url="https://example.com")
+        print(result.markdown)  # Print clean markdown content
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+## Understanding the Response
+
+The `arun()` method returns a `CrawlResult` object with several useful properties. Here's a quick overview (see [CrawlResult](../api/crawl-result.md) for complete details):
+
+```python
+result = await crawler.arun(url="https://example.com")
+
+# Different content formats
+print(result.html)         # Raw HTML
+print(result.cleaned_html) # Cleaned HTML
+print(result.markdown)     # Markdown version
+print(result.fit_markdown) # Most relevant content in markdown
+
+# Check success status
+print(result.success)      # True if crawl succeeded
+print(result.status_code)  # HTTP status code (e.g., 200, 404)
+
+# Access extracted media and links
+print(result.media)        # Dictionary of found media (images, videos, audio)
+print(result.links)        # Dictionary of internal and external links
+```
+
+## Adding Basic Options
+
+Customize your crawl with these common options:
+
+```python
+result = await crawler.arun(
+    url="https://example.com",
+    word_count_threshold=10,        # Minimum words per content block
+    exclude_external_links=True,    # Remove external links
+    remove_overlay_elements=True,   # Remove popups/modals
+    process_iframes=True           # Process iframe content
+)
+```
+
+## Handling Errors
+
+Always check whether the crawl succeeded before using the result:
+
+```python
+result = await crawler.arun(url="https://example.com")
+if not result.success:
+    print(f"Crawl failed: {result.error_message}")
+    print(f"Status code: {result.status_code}")
+```
+
+## Logging and Debugging
+
+Enable verbose mode for detailed logging:
+
+```python
+async with AsyncWebCrawler(verbose=True) as crawler:
+    result = await crawler.arun(url="https://example.com")
+```
+
+## Complete Example
+
+Here's a more comprehensive example showing common usage patterns:
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+
+async def main():
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        result = await crawler.arun(
+            url="https://example.com",
+            # Content filtering
+            word_count_threshold=10,
+            excluded_tags=['form', 'header'],
+            exclude_external_links=True,
+            
+            # Content processing
+            process_iframes=True,
+            remove_overlay_elements=True,
+            
+            # Cache control
+            bypass_cache=False  # Use cache if available
+        )
+        
+        if result.success:
+            # Print clean content
+            print("Content:", result.markdown[:500])  # First 500 chars
+            
+            # Process images
+            for image in result.media["images"]:
+                print(f"Found image: {image['src']}")
+            
+            # Process links
+            for link in result.links["internal"]:
+                print(f"Internal link: {link['href']}")
+                
+        else:
+            print(f"Crawl failed: {result.error_message}")
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```