Apply Ruff Corrections

This commit is contained in:
UncleCode
2025-01-13 19:19:58 +08:00
parent c3370ec5da
commit 8ec12d7d68
84 changed files with 6861 additions and 5076 deletions

View File

@@ -5,7 +5,6 @@ from .config import (
PAGE_TIMEOUT,
IMAGE_SCORE_THRESHOLD,
SOCIAL_MEDIA_DOMAINS,
)
from .user_agent_generator import UserAgentGenerator
from .extraction_strategy import ExtractionStrategy
@@ -14,6 +13,7 @@ from .markdown_generation_strategy import MarkdownGenerationStrategy
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
from typing import Union, List
class BrowserConfig:
"""
Configuration class for setting up a browser instance and its context in AsyncPlaywrightCrawlerStrategy.
@@ -84,7 +84,7 @@ class BrowserConfig:
proxy: str = None,
proxy_config: dict = None,
viewport_width: int = 1080,
viewport_height: int = 600,
viewport_height: int = 600,
accept_downloads: bool = False,
downloads_path: str = None,
storage_state=None,
@@ -103,7 +103,7 @@ class BrowserConfig:
text_mode: bool = False,
light_mode: bool = False,
extra_args: list = None,
debugging_port : int = 9222,
debugging_port: int = 9222,
):
self.browser_type = browser_type
self.headless = headless
@@ -142,7 +142,7 @@ class BrowserConfig:
self.user_agent = user_agenr_generator.generate()
else:
pass
self.browser_hint = user_agenr_generator.generate_client_hints(self.user_agent)
self.headers.setdefault("sec-ch-ua", self.browser_hint)
@@ -313,7 +313,7 @@ class CrawlerRunConfig:
Default: True.
log_console (bool): If True, log console messages from the page.
Default: False.
# Optional Parameters
url: str = None # This is not a compulsory parameter
"""
@@ -335,10 +335,8 @@ class CrawlerRunConfig:
prettiify: bool = False,
parser_type: str = "lxml",
scraping_strategy: ContentScrapingStrategy = None,
# SSL Parameters
fetch_ssl_certificate: bool = False,
# Caching Parameters
cache_mode=None,
session_id: str = None,
@@ -346,7 +344,6 @@ class CrawlerRunConfig:
disable_cache: bool = False,
no_cache_read: bool = False,
no_cache_write: bool = False,
# Page Navigation and Timing Parameters
wait_until: str = "domcontentloaded",
page_timeout: int = PAGE_TIMEOUT,
@@ -356,7 +353,6 @@ class CrawlerRunConfig:
mean_delay: float = 0.1,
max_range: float = 0.3,
semaphore_count: int = 5,
# Page Interaction Parameters
js_code: Union[str, List[str]] = None,
js_only: bool = False,
@@ -369,7 +365,6 @@ class CrawlerRunConfig:
override_navigator: bool = False,
magic: bool = False,
adjust_viewport_to_content: bool = False,
# Media Handling Parameters
screenshot: bool = False,
screenshot_wait_for: float = None,
@@ -378,21 +373,18 @@ class CrawlerRunConfig:
image_description_min_word_threshold: int = IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
image_score_threshold: int = IMAGE_SCORE_THRESHOLD,
exclude_external_images: bool = False,
# Link and Domain Handling Parameters
exclude_social_media_domains: list = None,
exclude_external_links: bool = False,
exclude_social_media_links: bool = False,
exclude_domains: list = None,
# Debugging and Logging Parameters
verbose: bool = True,
log_console: bool = False,
url: str = None,
):
self.url = url
# Content Processing Parameters
self.word_count_threshold = word_count_threshold
self.extraction_strategy = extraction_strategy
@@ -453,7 +445,9 @@ class CrawlerRunConfig:
self.exclude_external_images = exclude_external_images
# Link and Domain Handling Parameters
self.exclude_social_media_domains = exclude_social_media_domains or SOCIAL_MEDIA_DOMAINS
self.exclude_social_media_domains = (
exclude_social_media_domains or SOCIAL_MEDIA_DOMAINS
)
self.exclude_external_links = exclude_external_links
self.exclude_social_media_links = exclude_social_media_links
self.exclude_domains = exclude_domains or []
@@ -466,11 +460,15 @@ class CrawlerRunConfig:
if self.extraction_strategy is not None and not isinstance(
self.extraction_strategy, ExtractionStrategy
):
raise ValueError("extraction_strategy must be an instance of ExtractionStrategy")
raise ValueError(
"extraction_strategy must be an instance of ExtractionStrategy"
)
if self.chunking_strategy is not None and not isinstance(
self.chunking_strategy, ChunkingStrategy
):
raise ValueError("chunking_strategy must be an instance of ChunkingStrategy")
raise ValueError(
"chunking_strategy must be an instance of ChunkingStrategy"
)
# Set default chunking strategy if None
if self.chunking_strategy is None:
@@ -494,10 +492,8 @@ class CrawlerRunConfig:
prettiify=kwargs.get("prettiify", False),
parser_type=kwargs.get("parser_type", "lxml"),
scraping_strategy=kwargs.get("scraping_strategy"),
# SSL Parameters
fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
# Caching Parameters
cache_mode=kwargs.get("cache_mode"),
session_id=kwargs.get("session_id"),
@@ -505,7 +501,6 @@ class CrawlerRunConfig:
disable_cache=kwargs.get("disable_cache", False),
no_cache_read=kwargs.get("no_cache_read", False),
no_cache_write=kwargs.get("no_cache_write", False),
# Page Navigation and Timing Parameters
wait_until=kwargs.get("wait_until", "domcontentloaded"),
page_timeout=kwargs.get("page_timeout", 60000),
@@ -515,7 +510,6 @@ class CrawlerRunConfig:
mean_delay=kwargs.get("mean_delay", 0.1),
max_range=kwargs.get("max_range", 0.3),
semaphore_count=kwargs.get("semaphore_count", 5),
# Page Interaction Parameters
js_code=kwargs.get("js_code"),
js_only=kwargs.get("js_only", False),
@@ -528,29 +522,34 @@ class CrawlerRunConfig:
override_navigator=kwargs.get("override_navigator", False),
magic=kwargs.get("magic", False),
adjust_viewport_to_content=kwargs.get("adjust_viewport_to_content", False),
# Media Handling Parameters
screenshot=kwargs.get("screenshot", False),
screenshot_wait_for=kwargs.get("screenshot_wait_for"),
screenshot_height_threshold=kwargs.get("screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD),
screenshot_height_threshold=kwargs.get(
"screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD
),
pdf=kwargs.get("pdf", False),
image_description_min_word_threshold=kwargs.get("image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD),
image_score_threshold=kwargs.get("image_score_threshold", IMAGE_SCORE_THRESHOLD),
image_description_min_word_threshold=kwargs.get(
"image_description_min_word_threshold",
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
),
image_score_threshold=kwargs.get(
"image_score_threshold", IMAGE_SCORE_THRESHOLD
),
exclude_external_images=kwargs.get("exclude_external_images", False),
# Link and Domain Handling Parameters
exclude_social_media_domains=kwargs.get("exclude_social_media_domains", SOCIAL_MEDIA_DOMAINS),
exclude_social_media_domains=kwargs.get(
"exclude_social_media_domains", SOCIAL_MEDIA_DOMAINS
),
exclude_external_links=kwargs.get("exclude_external_links", False),
exclude_social_media_links=kwargs.get("exclude_social_media_links", False),
exclude_domains=kwargs.get("exclude_domains", []),
# Debugging and Logging Parameters
verbose=kwargs.get("verbose", True),
log_console=kwargs.get("log_console", False),
url=kwargs.get("url"),
)
# Create a funciton returns dict of the object
def to_dict(self):
return {