Enhance crawler capabilities and documentation

- Added llm.txt generator.
- Added SSL certificate extraction in AsyncWebCrawler.
- Introduced new content filters and chunking strategies for more robust data extraction.
- Updated documentation.
2024-12-25 21:34:31 +08:00
parent 84b311760f
commit d5ed451299
59 changed files with 2208 additions and 1763 deletions
--- a/docs/examples/browser_optimization_example.py
+++ b/docs/examples/browser_optimization_example.py
@@ -0,0 +1,128 @@
+"""
+This example demonstrates optimal browser usage patterns in Crawl4AI:
+1. Sequential crawling with session reuse
+2. Parallel crawling with browser instance reuse
+3. Performance optimization settings
+"""
+
+import asyncio
+import os
+from typing import List
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
+from crawl4ai.content_filter_strategy import PruningContentFilter
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
+
+async def crawl_sequential(urls: List[str]):
+    """Crawl `urls` one at a time on a single crawler, passing one shared session_id.
+    Prints the raw-markdown length of each successful result; unsuccessful results print nothing.
+    """
+    print("\n=== Sequential Crawling with Session Reuse ===")
+
+    # Browser settings, handed once to the single crawler created below
+    browser_config = BrowserConfig(
+        headless=True,
+        browser_args=[
+            "--disable-gpu",  # Disable GPU acceleration
+            "--disable-dev-shm-usage",  # Disable /dev/shm usage
+            "--no-sandbox",  # Required for Docker
+        ],
+        viewport={
+            "width": 800,
+            "height": 600,
+        },  # Smaller viewport for better performance
+    )
+
+    # Per-request settings; the same config object is passed to every arun() call
+    crawl_config = CrawlerRunConfig(
+        markdown_generator=DefaultMarkdownGenerator(
+            # Pass content_filter=PruningContentFilter() here if you need fit_markdown
+        ),
+    )
+
+    # Create single crawler instance; it is closed in the finally block below
+    crawler = AsyncWebCrawler(config=browser_config)
+    await crawler.start()
+
+    try:
+        session_id = "session1"  # Use same session for all URLs
+        for url in urls:
+            result = await crawler.arun(
+                url=url,
+                config=crawl_config,
+                session_id=session_id,  # Intended to reuse one browser tab; NOTE(review): confirm arun() honors this alongside config
+            )
+            if result.success:
+                print(f"Successfully crawled {url}")
+                print(f"Content length: {len(result.markdown_v2.raw_markdown)}")
+    finally:
+        await crawler.close()
+
+
+async def crawl_parallel(urls: List[str], max_concurrent: int = 3):
+    """Crawl `urls` in batches of `max_concurrent`, awaiting each batch with asyncio.gather on one shared crawler.
+    Exceptions are printed per URL; `max_concurrent` is the range() step, so 0 raises ValueError.
+    """
+    print("\n=== Parallel Crawling with Browser Reuse ===")
+
+    browser_config = BrowserConfig(
+        headless=True,
+        browser_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
+        viewport={"width": 800, "height": 600},
+    )
+
+    crawl_config = CrawlerRunConfig(
+        markdown_generator=DefaultMarkdownGenerator(
+            # Pass content_filter=PruningContentFilter() here if you need fit_markdown
+        ),
+    )
+
+    # Create single crawler instance for all parallel tasks
+    crawler = AsyncWebCrawler(config=browser_config)
+    await crawler.start()
+
+    try:
+        # Walk the URL list in slices of max_concurrent; a batch finishes before the next starts
+        for i in range(0, len(urls), max_concurrent):
+            batch = urls[i : i + max_concurrent]
+            tasks = []
+
+            for j, url in enumerate(batch):
+                session_id = (
+                    f"parallel_session_{j}"  # One id per batch slot; the same ids repeat in later batches
+                )
+                task = crawler.arun(url=url, config=crawl_config, session_id=session_id)
+                tasks.append(task)
+
+            # Run the whole batch; exceptions come back as results instead of being raised
+            results = await asyncio.gather(*tasks, return_exceptions=True)
+
+            # gather() preserves input order, so results line up with batch
+            for url, result in zip(batch, results):
+                if isinstance(result, Exception):
+                    print(f"Error crawling {url}: {str(result)}")
+                elif result.success:
+                    print(f"Successfully crawled {url}")
+                    print(f"Content length: {len(result.markdown_v2.raw_markdown)}")
+    finally:
+        await crawler.close()
+
+
+async def main():
+    """Run the sequential demo, then the parallel demo, over four example URLs."""
+    urls = [
+        "https://example.com/page1",
+        "https://example.com/page2",
+        "https://example.com/page3",
+        "https://example.com/page4",
+    ]
+
+    # Demo sequential crawling
+    await crawl_sequential(urls)
+
+    # Demo parallel crawling: the four URLs are processed two at a time
+    await crawl_parallel(urls, max_concurrent=2)
+
+
+if __name__ == "__main__":  # Run the demos only when executed as a script
+    asyncio.run(main())
--- a/docs/examples/extraction_strategies_example.py
+++ b/docs/examples/extraction_strategies_example.py
@@ -0,0 +1,115 @@
+"""
+Example demonstrating different extraction strategies with various input formats.
+This example shows how to:
+1. Use different input formats (markdown, HTML, fit_markdown)
+2. Work with JSON-based extractors (CSS and XPath)
+3. Use LLM-based extraction with different input formats
+4. Configure browser and crawler settings properly
+"""
+
+import asyncio
+import os
+from typing import Dict, Any
+
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
+from crawl4ai.extraction_strategy import (
+    LLMExtractionStrategy,
+    JsonCssExtractionStrategy,
+    JsonXPathExtractionStrategy
+)
+from crawl4ai.chunking_strategy import RegexChunking, IdentityChunking
+from crawl4ai.content_filter_strategy import PruningContentFilter
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
+async def run_extraction(crawler: AsyncWebCrawler, url: str, strategy, name: str):
+    """Crawl `url` with `strategy` (cache bypassed) and print the extracted content and markdown lengths under `name`."""
+    try:
+        # Build a fresh run config on every call, carrying the given strategy
+        config = CrawlerRunConfig(
+            cache_mode=CacheMode.BYPASS,
+            extraction_strategy=strategy,
+            markdown_generator=DefaultMarkdownGenerator(
+                content_filter=PruningContentFilter()  # For fit_markdown support
+            )
+        )
+        
+        # Run the crawl on the caller-supplied crawler
+        result = await crawler.arun(url=url, config=config)
+        
+        if result.success:
+            print(f"\n=== {name} Results ===")
+            print(f"Extracted Content: {result.extracted_content}")
+            print(f"Raw Markdown Length: {len(result.markdown_v2.raw_markdown)}")
+            print(f"Citations Markdown Length: {len(result.markdown_v2.markdown_with_citations)}")
+        else:
+            print(f"Error in {name}: Crawl failed")
+            
+    except Exception as e:  # Catch-all: the error is printed and swallowed, never re-raised
+        print(f"Error in {name}: {str(e)}")
+
+async def main():
+    """Build three LLM strategies, one CSS and one XPath strategy, then run each against the same URL."""
+    url = "https://example.com/product-page"  # Example URL (replace with actual URL)
+    
+    # Configure browser settings
+    browser_config = BrowserConfig(
+        headless=True,
+        verbose=True
+    )
+    
+    # Initialize extraction strategies
+    
+    # 1. LLM Extraction with different input formats (the first passes no input_format)
+    markdown_strategy = LLMExtractionStrategy(
+        provider="openai/gpt-4o-mini",
+        api_token=os.getenv("OPENAI_API_KEY"),
+        instruction="Extract product information including name, price, and description"
+    )
+    
+    html_strategy = LLMExtractionStrategy(
+        input_format="html",
+        provider="openai/gpt-4o-mini",
+        api_token=os.getenv("OPENAI_API_KEY"),
+        instruction="Extract product information from HTML including structured data"
+    )
+    
+    fit_markdown_strategy = LLMExtractionStrategy(
+        input_format="fit_markdown",
+        provider="openai/gpt-4o-mini",
+        api_token=os.getenv("OPENAI_API_KEY"),
+        instruction="Extract product information from cleaned markdown"
+    )
+    
+    # 2. JSON CSS Extraction (automatically uses HTML input)
+    css_schema = {
+        "baseSelector": ".product",
+        "fields": [
+            {"name": "title", "selector": "h1.product-title", "type": "text"},
+            {"name": "price", "selector": ".price", "type": "text"},
+            {"name": "description", "selector": ".description", "type": "text"}
+        ]
+    }
+    css_strategy = JsonCssExtractionStrategy(schema=css_schema)
+    
+    # 3. JSON XPath Extraction (automatically uses HTML input)
+    xpath_schema = {
+        "baseSelector": "//div[@class='product']",
+        "fields": [
+            {"name": "title", "selector": ".//h1[@class='product-title']/text()", "type": "text"},
+            {"name": "price", "selector": ".//span[@class='price']/text()", "type": "text"},
+            {"name": "description", "selector": ".//div[@class='description']/text()", "type": "text"}
+        ]
+    }
+    xpath_strategy = JsonXPathExtractionStrategy(schema=xpath_schema)
+    
+    # Use context manager for proper resource handling
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        # Run the strategies one after another on the same crawler and URL
+        await run_extraction(crawler, url, markdown_strategy, "Markdown LLM")
+        await run_extraction(crawler, url, html_strategy, "HTML LLM")
+        await run_extraction(crawler, url, fit_markdown_strategy, "Fit Markdown LLM")
+        await run_extraction(crawler, url, css_strategy, "CSS Extraction")
+        await run_extraction(crawler, url, xpath_strategy, "XPath Extraction")
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/docs/examples/full_page_screenshot_and_pdf_export.md
+++ b/docs/examples/full_page_screenshot_and_pdf_export.md
@@ -39,8 +39,8 @@ async def main():
                    f.write(b64decode(result.screenshot))
            
            # Save PDF
-            if result.pdf_data:
-                pdf_bytes = b64decode(result.pdf_data)
+            if result.pdf:
+                pdf_bytes = b64decode(result.pdf)
                with open(os.path.join(__location__, "page.pdf"), "wb") as f:
                    f.write(pdf_bytes)