Apply Ruff Corrections

2025-01-13 19:19:58 +08:00
parent c3370ec5da
commit 8ec12d7d68
84 changed files with 6861 additions and 5076 deletions
--- a/docs/examples/extraction_strategies_example.py
+++ b/docs/examples/extraction_strategies_example.py
@@ -9,18 +9,17 @@ This example shows how to:

 import asyncio
 import os
-from typing import Dict, Any

 from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
 from crawl4ai.extraction_strategy import (
    LLMExtractionStrategy,
    JsonCssExtractionStrategy,
-    JsonXPathExtractionStrategy
+    JsonXPathExtractionStrategy,
 )
-from crawl4ai.chunking_strategy import RegexChunking, IdentityChunking
 from crawl4ai.content_filter_strategy import PruningContentFilter
 from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

+
 async def run_extraction(crawler: AsyncWebCrawler, url: str, strategy, name: str):
    """Helper function to run extraction with proper configuration"""
    try:
@@ -30,78 +29,90 @@ async def run_extraction(crawler: AsyncWebCrawler, url: str, strategy, name: str
            extraction_strategy=strategy,
            markdown_generator=DefaultMarkdownGenerator(
                content_filter=PruningContentFilter()  # For fit_markdown support
-            )
+            ),
        )
-        
+
        # Run the crawler
        result = await crawler.arun(url=url, config=config)
-        
+
        if result.success:
            print(f"\n=== {name} Results ===")
            print(f"Extracted Content: {result.extracted_content}")
            print(f"Raw Markdown Length: {len(result.markdown_v2.raw_markdown)}")
-            print(f"Citations Markdown Length: {len(result.markdown_v2.markdown_with_citations)}")
+            print(
+                f"Citations Markdown Length: {len(result.markdown_v2.markdown_with_citations)}"
+            )
        else:
            print(f"Error in {name}: Crawl failed")
-            
+
    except Exception as e:
        print(f"Error in {name}: {str(e)}")

+
 async def main():
    # Example URL (replace with actual URL)
    url = "https://example.com/product-page"
-    
+
    # Configure browser settings
-    browser_config = BrowserConfig(
-        headless=True,
-        verbose=True
-    )
-    
+    browser_config = BrowserConfig(headless=True, verbose=True)
+
    # Initialize extraction strategies
-    
+
    # 1. LLM Extraction with different input formats
    markdown_strategy = LLMExtractionStrategy(
        provider="openai/gpt-4o-mini",
        api_token=os.getenv("OPENAI_API_KEY"),
-        instruction="Extract product information including name, price, and description"
+        instruction="Extract product information including name, price, and description",
    )
-    
+
    html_strategy = LLMExtractionStrategy(
        input_format="html",
        provider="openai/gpt-4o-mini",
        api_token=os.getenv("OPENAI_API_KEY"),
-        instruction="Extract product information from HTML including structured data"
+        instruction="Extract product information from HTML including structured data",
    )
-    
+
    fit_markdown_strategy = LLMExtractionStrategy(
        input_format="fit_markdown",
        provider="openai/gpt-4o-mini",
        api_token=os.getenv("OPENAI_API_KEY"),
-        instruction="Extract product information from cleaned markdown"
+        instruction="Extract product information from cleaned markdown",
    )
-    
+
    # 2. JSON CSS Extraction (automatically uses HTML input)
    css_schema = {
        "baseSelector": ".product",
        "fields": [
            {"name": "title", "selector": "h1.product-title", "type": "text"},
            {"name": "price", "selector": ".price", "type": "text"},
-            {"name": "description", "selector": ".description", "type": "text"}
-        ]
+            {"name": "description", "selector": ".description", "type": "text"},
+        ],
    }
    css_strategy = JsonCssExtractionStrategy(schema=css_schema)
-    
+
    # 3. JSON XPath Extraction (automatically uses HTML input)
    xpath_schema = {
        "baseSelector": "//div[@class='product']",
        "fields": [
-            {"name": "title", "selector": ".//h1[@class='product-title']/text()", "type": "text"},
-            {"name": "price", "selector": ".//span[@class='price']/text()", "type": "text"},
-            {"name": "description", "selector": ".//div[@class='description']/text()", "type": "text"}
-        ]
+            {
+                "name": "title",
+                "selector": ".//h1[@class='product-title']/text()",
+                "type": "text",
+            },
+            {
+                "name": "price",
+                "selector": ".//span[@class='price']/text()",
+                "type": "text",
+            },
+            {
+                "name": "description",
+                "selector": ".//div[@class='description']/text()",
+                "type": "text",
+            },
+        ],
    }
    xpath_strategy = JsonXPathExtractionStrategy(schema=xpath_schema)
-    
+
    # Use context manager for proper resource handling
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Run all strategies
@@ -111,5 +122,6 @@ async def main():
        await run_extraction(crawler, url, css_strategy, "CSS Extraction")
        await run_extraction(crawler, url, xpath_strategy, "XPath Extraction")

+
 if __name__ == "__main__":
    asyncio.run(main())