Apply Ruff Corrections

2025-01-13 19:19:58 +08:00
parent c3370ec5da
commit 8ec12d7d68
84 changed files with 6861 additions and 5076 deletions
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -5,7 +5,6 @@ from .config import (
    PAGE_TIMEOUT,
    IMAGE_SCORE_THRESHOLD,
    SOCIAL_MEDIA_DOMAINS,
-
 )
 from .user_agent_generator import UserAgentGenerator
 from .extraction_strategy import ExtractionStrategy
@@ -14,6 +13,7 @@ from .markdown_generation_strategy import MarkdownGenerationStrategy
 from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
 from typing import Union, List

+
 class BrowserConfig:
    """
    Configuration class for setting up a browser instance and its context in AsyncPlaywrightCrawlerStrategy.
@@ -84,7 +84,7 @@ class BrowserConfig:
        proxy: str = None,
        proxy_config: dict = None,
        viewport_width: int = 1080,
-        viewport_height: int = 600, 
+        viewport_height: int = 600,
        accept_downloads: bool = False,
        downloads_path: str = None,
        storage_state=None,
@@ -103,7 +103,7 @@ class BrowserConfig:
        text_mode: bool = False,
        light_mode: bool = False,
        extra_args: list = None,
-        debugging_port : int = 9222,
+        debugging_port: int = 9222,
    ):
        self.browser_type = browser_type
        self.headless = headless
@@ -142,7 +142,7 @@ class BrowserConfig:
            self.user_agent = user_agenr_generator.generate()
        else:
            pass
-        
+
        self.browser_hint = user_agenr_generator.generate_client_hints(self.user_agent)
        self.headers.setdefault("sec-ch-ua", self.browser_hint)

@@ -313,7 +313,7 @@ class CrawlerRunConfig:
                        Default: True.
        log_console (bool): If True, log console messages from the page.
                            Default: False.
-        
+
        # Optional Parameters
        url: str = None  # This is not a compulsory parameter
    """
@@ -335,10 +335,8 @@ class CrawlerRunConfig:
        prettiify: bool = False,
        parser_type: str = "lxml",
        scraping_strategy: ContentScrapingStrategy = None,
-
        # SSL Parameters
        fetch_ssl_certificate: bool = False,
-
        # Caching Parameters
        cache_mode=None,
        session_id: str = None,
@@ -346,7 +344,6 @@ class CrawlerRunConfig:
        disable_cache: bool = False,
        no_cache_read: bool = False,
        no_cache_write: bool = False,
-
        # Page Navigation and Timing Parameters
        wait_until: str = "domcontentloaded",
        page_timeout: int = PAGE_TIMEOUT,
@@ -356,7 +353,6 @@ class CrawlerRunConfig:
        mean_delay: float = 0.1,
        max_range: float = 0.3,
        semaphore_count: int = 5,
-
        # Page Interaction Parameters
        js_code: Union[str, List[str]] = None,
        js_only: bool = False,
@@ -369,7 +365,6 @@ class CrawlerRunConfig:
        override_navigator: bool = False,
        magic: bool = False,
        adjust_viewport_to_content: bool = False,
-
        # Media Handling Parameters
        screenshot: bool = False,
        screenshot_wait_for: float = None,
@@ -378,21 +373,18 @@ class CrawlerRunConfig:
        image_description_min_word_threshold: int = IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
        image_score_threshold: int = IMAGE_SCORE_THRESHOLD,
        exclude_external_images: bool = False,
-
        # Link and Domain Handling Parameters
        exclude_social_media_domains: list = None,
        exclude_external_links: bool = False,
        exclude_social_media_links: bool = False,
        exclude_domains: list = None,
-
        # Debugging and Logging Parameters
        verbose: bool = True,
        log_console: bool = False,
-        
        url: str = None,
    ):
        self.url = url
-        
+
        # Content Processing Parameters
        self.word_count_threshold = word_count_threshold
        self.extraction_strategy = extraction_strategy
@@ -453,7 +445,9 @@ class CrawlerRunConfig:
        self.exclude_external_images = exclude_external_images

        # Link and Domain Handling Parameters
-        self.exclude_social_media_domains = exclude_social_media_domains or SOCIAL_MEDIA_DOMAINS
+        self.exclude_social_media_domains = (
+            exclude_social_media_domains or SOCIAL_MEDIA_DOMAINS
+        )
        self.exclude_external_links = exclude_external_links
        self.exclude_social_media_links = exclude_social_media_links
        self.exclude_domains = exclude_domains or []
@@ -466,11 +460,15 @@ class CrawlerRunConfig:
        if self.extraction_strategy is not None and not isinstance(
            self.extraction_strategy, ExtractionStrategy
        ):
-            raise ValueError("extraction_strategy must be an instance of ExtractionStrategy")
+            raise ValueError(
+                "extraction_strategy must be an instance of ExtractionStrategy"
+            )
        if self.chunking_strategy is not None and not isinstance(
            self.chunking_strategy, ChunkingStrategy
        ):
-            raise ValueError("chunking_strategy must be an instance of ChunkingStrategy")
+            raise ValueError(
+                "chunking_strategy must be an instance of ChunkingStrategy"
+            )

        # Set default chunking strategy if None
        if self.chunking_strategy is None:
@@ -494,10 +492,8 @@ class CrawlerRunConfig:
            prettiify=kwargs.get("prettiify", False),
            parser_type=kwargs.get("parser_type", "lxml"),
            scraping_strategy=kwargs.get("scraping_strategy"),
-
            # SSL Parameters
            fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
-
            # Caching Parameters
            cache_mode=kwargs.get("cache_mode"),
            session_id=kwargs.get("session_id"),
@@ -505,7 +501,6 @@ class CrawlerRunConfig:
            disable_cache=kwargs.get("disable_cache", False),
            no_cache_read=kwargs.get("no_cache_read", False),
            no_cache_write=kwargs.get("no_cache_write", False),
-
            # Page Navigation and Timing Parameters
            wait_until=kwargs.get("wait_until", "domcontentloaded"),
            page_timeout=kwargs.get("page_timeout", 60000),
@@ -515,7 +510,6 @@ class CrawlerRunConfig:
            mean_delay=kwargs.get("mean_delay", 0.1),
            max_range=kwargs.get("max_range", 0.3),
            semaphore_count=kwargs.get("semaphore_count", 5),
-
            # Page Interaction Parameters
            js_code=kwargs.get("js_code"),
            js_only=kwargs.get("js_only", False),
@@ -528,29 +522,34 @@ class CrawlerRunConfig:
            override_navigator=kwargs.get("override_navigator", False),
            magic=kwargs.get("magic", False),
            adjust_viewport_to_content=kwargs.get("adjust_viewport_to_content", False),
-
            # Media Handling Parameters
            screenshot=kwargs.get("screenshot", False),
            screenshot_wait_for=kwargs.get("screenshot_wait_for"),
-            screenshot_height_threshold=kwargs.get("screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD),
+            screenshot_height_threshold=kwargs.get(
+                "screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD
+            ),
            pdf=kwargs.get("pdf", False),
-            image_description_min_word_threshold=kwargs.get("image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD),
-            image_score_threshold=kwargs.get("image_score_threshold", IMAGE_SCORE_THRESHOLD),
+            image_description_min_word_threshold=kwargs.get(
+                "image_description_min_word_threshold",
+                IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
+            ),
+            image_score_threshold=kwargs.get(
+                "image_score_threshold", IMAGE_SCORE_THRESHOLD
+            ),
            exclude_external_images=kwargs.get("exclude_external_images", False),
-
            # Link and Domain Handling Parameters
-            exclude_social_media_domains=kwargs.get("exclude_social_media_domains", SOCIAL_MEDIA_DOMAINS),
+            exclude_social_media_domains=kwargs.get(
+                "exclude_social_media_domains", SOCIAL_MEDIA_DOMAINS
+            ),
            exclude_external_links=kwargs.get("exclude_external_links", False),
            exclude_social_media_links=kwargs.get("exclude_social_media_links", False),
            exclude_domains=kwargs.get("exclude_domains", []),
-
            # Debugging and Logging Parameters
            verbose=kwargs.get("verbose", True),
            log_console=kwargs.get("log_console", False),
-            
            url=kwargs.get("url"),
        )
-        
+
    # Create a funciton returns dict of the object
    def to_dict(self):
        return {