Apply Ruff Corrections
@@ -10,18 +10,17 @@ import asyncio
 import os
 import json
 import re
 from typing import List, Optional, Dict, Any
 from pydantic import BaseModel, Field
-from typing import List
 from crawl4ai import (
     AsyncWebCrawler,
     BrowserConfig,
     CrawlerRunConfig,
     CacheMode,
     LLMExtractionStrategy,
-    JsonCssExtractionStrategy
+    JsonCssExtractionStrategy,
 )
 from crawl4ai.content_filter_strategy import RelevantContentFilter
 from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
 from bs4 import BeautifulSoup
 
 # Sample HTML for demonstrations
@@ -52,17 +51,18 @@ SAMPLE_HTML = """
 </div>
 """
 
+
 async def demo_ssl_features():
     """
     Enhanced SSL & Security Features Demo
     -----------------------------------
 
     This example demonstrates the new SSL certificate handling and security features:
     1. Custom certificate paths
     2. SSL verification options
     3. HTTPS error handling
     4. Certificate validation configurations
 
     These features are particularly useful when:
     - Working with self-signed certificates
     - Dealing with corporate proxies
@@ -76,14 +76,11 @@ async def demo_ssl_features():
 
     run_config = CrawlerRunConfig(
         cache_mode=CacheMode.BYPASS,
-        fetch_ssl_certificate=True  # Enable SSL certificate fetching
+        fetch_ssl_certificate=True,  # Enable SSL certificate fetching
     )
 
     async with AsyncWebCrawler(config=browser_config) as crawler:
-        result = await crawler.arun(
-            url="https://example.com",
-            config=run_config
-        )
+        result = await crawler.arun(url="https://example.com", config=run_config)
         print(f"SSL Crawl Success: {result.success}")
         result.ssl_certificate.to_json(
             os.path.join(os.getcwd(), "ssl_certificate.json")
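Editor's note: the hunk above is where the demo both fetches the certificate and writes it to disk. A minimal sketch (not part of the commit) of reading that file back with the standard library; the exact keys depend on what Crawl4AI serializes, so treat them as assumptions:

    import json
    import os

    # Load the certificate JSON written by demo_ssl_features()
    with open(os.path.join(os.getcwd(), "ssl_certificate.json")) as f:
        cert = json.load(f)
    # Inspect which fields were captured (key names vary by serializer)
    print(sorted(cert.keys()))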
@@ -91,11 +88,12 @@ async def demo_ssl_features():
         if not result.success:
             print(f"SSL Error: {result.error_message}")
 
+
 async def demo_content_filtering():
     """
     Smart Content Filtering Demo
     ----------------------
 
     Demonstrates advanced content filtering capabilities:
     1. Custom filter to identify and extract specific content
     2. Integration with markdown generation
@@ -110,87 +108,90 @@ async def demo_content_filtering():
             super().__init__()
             # Add news-specific patterns
             self.negative_patterns = re.compile(
-                r'nav|footer|header|sidebar|ads|comment|share|related|recommended|popular|trending',
-                re.I
+                r"nav|footer|header|sidebar|ads|comment|share|related|recommended|popular|trending",
+                re.I,
             )
             self.min_word_count = 30  # Higher threshold for news content
 
-        def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]:
+        def filter_content(
+            self, html: str, min_word_threshold: int = None
+        ) -> List[str]:
             """
             Implements news-specific content filtering logic.
 
             Args:
                 html (str): HTML content to be filtered
                 min_word_threshold (int, optional): Minimum word count threshold
 
             Returns:
                 List[str]: List of filtered HTML content blocks
             """
             if not html or not isinstance(html, str):
                 return []
 
-            soup = BeautifulSoup(html, 'lxml')
+            soup = BeautifulSoup(html, "lxml")
             if not soup.body:
-                soup = BeautifulSoup(f'<body>{html}</body>', 'lxml')
+                soup = BeautifulSoup(f"<body>{html}</body>", "lxml")
 
-            body = soup.find('body')
+            body = soup.find("body")
 
             # Extract chunks with metadata
-            chunks = self.extract_text_chunks(body, min_word_threshold or self.min_word_count)
+            chunks = self.extract_text_chunks(
+                body, min_word_threshold or self.min_word_count
+            )
 
             # Filter chunks based on news-specific criteria
             filtered_chunks = []
             for _, text, tag_type, element in chunks:
                 # Skip if element has negative class/id
                 if self.is_excluded(element):
                     continue
 
                 # Headers are important in news articles
-                if tag_type == 'header':
+                if tag_type == "header":
                     filtered_chunks.append(self.clean_element(element))
                     continue
 
                 # For content, check word count and link density
                 text = element.get_text(strip=True)
                 if len(text.split()) >= (min_word_threshold or self.min_word_count):
                     # Calculate link density
-                    links_text = ' '.join(a.get_text(strip=True) for a in element.find_all('a'))
+                    links_text = " ".join(
+                        a.get_text(strip=True) for a in element.find_all("a")
+                    )
                     link_density = len(links_text) / len(text) if text else 1
 
                     # Accept if link density is reasonable
                     if link_density < 0.5:
                         filtered_chunks.append(self.clean_element(element))
 
             return filtered_chunks
 
     # Create markdown generator with custom filter
-    markdown_gen = DefaultMarkdownGenerator(
-        content_filter=CustomNewsFilter()
-    )
+    markdown_gen = DefaultMarkdownGenerator(content_filter=CustomNewsFilter())
 
     run_config = CrawlerRunConfig(
-        markdown_generator=markdown_gen,
-        cache_mode=CacheMode.BYPASS
+        markdown_generator=markdown_gen, cache_mode=CacheMode.BYPASS
     )
 
     async with AsyncWebCrawler() as crawler:
         result = await crawler.arun(
-            url="https://news.ycombinator.com",
-            config=run_config
+            url="https://news.ycombinator.com", config=run_config
         )
         print("Filtered Content Sample:")
         print(result.markdown[:500])  # Show first 500 chars
 
+
 async def demo_json_extraction():
     """
     Improved JSON Extraction Demo
     ---------------------------
 
     Demonstrates the enhanced JSON extraction capabilities:
     1. Base element attributes extraction
     2. Complex nested structures
     3. Multiple extraction patterns
 
     Key features shown:
     - Extracting attributes from base elements (href, data-* attributes)
     - Processing repeated patterns
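Editor's note: the acceptance test in filter_content above is the link-density ratio. A standalone sketch of just that calculation (illustrative, not part of the commit; requires bs4 and lxml):

    from bs4 import BeautifulSoup

    html = '<div><p>A long enough paragraph of real article text.</p><a href="#">share</a></div>'
    element = BeautifulSoup(html, "lxml").find("div")
    text = element.get_text(strip=True)
    links_text = " ".join(a.get_text(strip=True) for a in element.find_all("a"))
    link_density = len(links_text) / len(text) if text else 1
    print(f"link density: {link_density:.2f}")  # the filter keeps blocks below 0.5

Blocks that are mostly anchor text (navigation bars, "related" widgets) score near 1 and are dropped; article paragraphs score near 0.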
@@ -206,7 +207,7 @@ async def demo_json_extraction():
             "baseSelector": "div.article-list",
             "baseFields": [
                 {"name": "list_id", "type": "attribute", "attribute": "data-list-id"},
-                {"name": "category", "type": "attribute", "attribute": "data-category"}
+                {"name": "category", "type": "attribute", "attribute": "data-category"},
             ],
             "fields": [
                 {
@@ -214,8 +215,16 @@ async def demo_json_extraction():
                     "selector": "article.post",
                     "type": "nested_list",
                     "baseFields": [
-                        {"name": "post_id", "type": "attribute", "attribute": "data-post-id"},
-                        {"name": "author_id", "type": "attribute", "attribute": "data-author"}
+                        {
+                            "name": "post_id",
+                            "type": "attribute",
+                            "attribute": "data-post-id",
+                        },
+                        {
+                            "name": "author_id",
+                            "type": "attribute",
+                            "attribute": "data-author",
+                        },
                     ],
                     "fields": [
                         {
@@ -223,60 +232,68 @@ async def demo_json_extraction():
                             "selector": "h2.title a",
                             "type": "text",
                             "baseFields": [
-                                {"name": "url", "type": "attribute", "attribute": "href"}
-                            ]
+                                {
+                                    "name": "url",
+                                    "type": "attribute",
+                                    "attribute": "href",
+                                }
+                            ],
                         },
                         {
                             "name": "author",
                             "selector": "div.meta a.author",
                             "type": "text",
                             "baseFields": [
-                                {"name": "profile_url", "type": "attribute", "attribute": "href"}
-                            ]
-                        },
-                        {
-                            "name": "date",
-                            "selector": "span.date",
-                            "type": "text"
-                        },
+                                {
+                                    "name": "profile_url",
+                                    "type": "attribute",
+                                    "attribute": "href",
+                                }
+                            ],
+                        },
+                        {"name": "date", "selector": "span.date", "type": "text"},
                         {
                             "name": "read_more",
                             "selector": "a.read-more",
                             "type": "nested",
                             "fields": [
                                 {"name": "text", "type": "text"},
-                                {"name": "url", "type": "attribute", "attribute": "href"}
-                            ]
-                        }
-                    ]
-                }
-            ]
+                                {
+                                    "name": "url",
+                                    "type": "attribute",
+                                    "attribute": "href",
+                                },
+                            ],
+                        },
+                    ],
+                }
+            ],
         }
     )
 
     # Demonstrate extraction from raw HTML
     run_config = CrawlerRunConfig(
-        extraction_strategy=json_strategy,
-        cache_mode=CacheMode.BYPASS
+        extraction_strategy=json_strategy, cache_mode=CacheMode.BYPASS
     )
 
     async with AsyncWebCrawler() as crawler:
         result = await crawler.arun(
             url="raw:" + SAMPLE_HTML,  # Use raw: prefix for raw HTML
-            config=run_config
+            config=run_config,
         )
         print("Extracted Content:")
         print(result.extracted_content)
 
 
 async def demo_input_formats():
     """
     Input Format Handling Demo
     ----------------------
 
     Demonstrates how LLM extraction can work with different input formats:
     1. Markdown (default) - Good for simple text extraction
     2. HTML - Better when you need structure and attributes
 
     This example shows how HTML input can be beneficial when:
     - You need to understand the DOM structure
     - You want to extract both visible text and HTML attributes
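Editor's note: for orientation, a compact sketch of the same extraction pattern outside the diff (illustrative only; it reuses the raw: prefix and the schema keys exactly as the commit's demo does, with a made-up minimal schema):

    import asyncio
    import json

    from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, JsonCssExtractionStrategy

    # Tiny illustrative schema: one base element, one text field
    strategy = JsonCssExtractionStrategy(
        {
            "name": "links",
            "baseSelector": "ul",
            "fields": [{"name": "label", "selector": "a", "type": "text"}],
        }
    )

    async def run():
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(
                url="raw:<ul><li><a href='/a'>First</a></li></ul>",  # raw: = crawl a string
                config=CrawlerRunConfig(extraction_strategy=strategy),
            )
            print(json.loads(result.extracted_content))

    asyncio.run(run())

The schema's baseFields (seen in the hunks above) read attributes off the matched element itself, while fields descend into child selectors.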
@@ -350,7 +367,7 @@ async def demo_input_formats():
         </footer>
     </div>
     """
 
     # Use raw:// prefix to pass HTML content directly
     url = f"raw://{dummy_html}"
@@ -359,18 +376,30 @@ async def demo_input_formats():
 
     # Define our schema using Pydantic
     class JobRequirement(BaseModel):
-        category: str = Field(description="Category of the requirement (e.g., Technical, Soft Skills)")
-        items: List[str] = Field(description="List of specific requirements in this category")
-        priority: str = Field(description="Priority level (Required/Preferred) based on the HTML class or context")
+        category: str = Field(
+            description="Category of the requirement (e.g., Technical, Soft Skills)"
+        )
+        items: List[str] = Field(
+            description="List of specific requirements in this category"
+        )
+        priority: str = Field(
+            description="Priority level (Required/Preferred) based on the HTML class or context"
+        )
 
     class JobPosting(BaseModel):
         title: str = Field(description="Job title")
         department: str = Field(description="Department or team")
         location: str = Field(description="Job location, including remote options")
         salary_range: Optional[str] = Field(description="Salary range if specified")
-        requirements: List[JobRequirement] = Field(description="Categorized job requirements")
-        application_deadline: Optional[str] = Field(description="Application deadline if specified")
-        contact_info: Optional[dict] = Field(description="Contact information from footer or contact section")
+        requirements: List[JobRequirement] = Field(
+            description="Categorized job requirements"
+        )
+        application_deadline: Optional[str] = Field(
+            description="Application deadline if specified"
+        )
+        contact_info: Optional[dict] = Field(
+            description="Contact information from footer or contact section"
+        )
 
     # First try with markdown (default)
     markdown_strategy = LLMExtractionStrategy(
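Editor's note (not in the commit): inside demo_input_formats(), once the models above are defined, they can also emit the JSON schema that LLM extraction strategies typically consume. A sketch, assuming Pydantic v2:

    # Assumes Pydantic v2; on v1 the call is JobPosting.schema() instead
    schema = JobPosting.model_json_schema()
    print(sorted(schema["properties"]))  # field names: title, department, ...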
@@ -382,7 +411,7 @@ async def demo_input_formats():
         Extract job posting details into structured data. Focus on the visible text content
         and organize requirements into categories.
         """,
-        input_format="markdown"  # default
+        input_format="markdown",  # default
     )
 
     # Then with HTML for better structure understanding
@@ -400,34 +429,25 @@ async def demo_input_formats():
 
         Use HTML attributes and classes to enhance extraction accuracy.
         """,
-        input_format="html"  # explicitly use HTML
+        input_format="html",  # explicitly use HTML
     )
 
     async with AsyncWebCrawler() as crawler:
         # Try with markdown first
-        markdown_config = CrawlerRunConfig(
-            extraction_strategy=markdown_strategy
-        )
-        markdown_result = await crawler.arun(
-            url=url,
-            config=markdown_config
-        )
+        markdown_config = CrawlerRunConfig(extraction_strategy=markdown_strategy)
+        markdown_result = await crawler.arun(url=url, config=markdown_config)
         print("\nMarkdown-based Extraction Result:")
         items = json.loads(markdown_result.extracted_content)
         print(json.dumps(items, indent=2))
 
         # Then with HTML for better structure understanding
-        html_config = CrawlerRunConfig(
-            extraction_strategy=html_strategy
-        )
-        html_result = await crawler.arun(
-            url=url,
-            config=html_config
-        )
+        html_config = CrawlerRunConfig(extraction_strategy=html_strategy)
+        html_result = await crawler.arun(url=url, config=html_config)
         print("\nHTML-based Extraction Result:")
         items = json.loads(html_result.extracted_content)
         print(json.dumps(items, indent=2))
 
 
 # Main execution
 async def main():
     print("Crawl4AI v0.4.24 Feature Walkthrough")
@@ -439,5 +459,6 @@ async def main():
     await demo_json_extraction()
     # await demo_input_formats()
 
+
 if __name__ == "__main__":
     asyncio.run(main())