Enhance crawler features and improve documentation

- Added detailed CrawlerRunConfig parameters documentation.
- Introduced plans for real-time event-driven crawling.
- Updated async logger default level to DEBUG for better insights.
- Improved structure and readability in the configuration file.
- Enhanced documentation on future capabilities in new blog entries.
2024-12-16 18:52:51 +08:00
parent ed7bc1909c
commit a11d9646e3
6 changed files with 439 additions and 125 deletions
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -1,14 +1,18 @@
 from .config import (
-    MIN_WORD_THRESHOLD, 
+    MIN_WORD_THRESHOLD,
    IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
    SCREENSHOT_HEIGHT_TRESHOLD,
-    PAGE_TIMEOUT
+    PAGE_TIMEOUT,
+    IMAGE_SCORE_THRESHOLD,
+    SOCIAL_MEDIA_DOMAINS,
+
 )
 from .user_agent_generator import UserAgentGenerator
 from .extraction_strategy import ExtractionStrategy
 from .chunking_strategy import ChunkingStrategy
 from .markdown_generation_strategy import MarkdownGenerationStrategy

+
 class BrowserConfig:
    """
    Configuration class for setting up a browser instance and its context in AsyncPlaywrightCrawlerStrategy.
@@ -127,7 +131,7 @@ class BrowserConfig:
        self.extra_args = extra_args if extra_args is not None else []
        self.sleep_on_close = sleep_on_close
        self.verbose = verbose
-        
+
        user_agenr_generator = UserAgentGenerator()
        if self.user_agent_mode != "random":
            self.user_agent = user_agenr_generator.generate(
@@ -160,15 +164,16 @@ class BrowserConfig:
            java_script_enabled=kwargs.get("java_script_enabled", True),
            cookies=kwargs.get("cookies", []),
            headers=kwargs.get("headers", {}),
-            user_agent=kwargs.get("user_agent",
+            user_agent=kwargs.get(
+                "user_agent",
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
-                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
+                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
            ),
            user_agent_mode=kwargs.get("user_agent_mode"),
            user_agent_generator_config=kwargs.get("user_agent_generator_config"),
            text_only=kwargs.get("text_only", False),
            light_mode=kwargs.get("light_mode", False),
-            extra_args=kwargs.get("extra_args", [])
+            extra_args=kwargs.get("extra_args", []),
        )


@@ -182,22 +187,37 @@ class CrawlerRunConfig:
    By using this class, you have a single place to understand and adjust the crawling options.

    Attributes:
+        # Content Processing Parameters
        word_count_threshold (int): Minimum word count threshold before processing content.
                                    Default: MIN_WORD_THRESHOLD (typically 200).
        extraction_strategy (ExtractionStrategy or None): Strategy to extract structured data from crawled pages.
                                                          Default: None (NoExtractionStrategy is used if None).
        chunking_strategy (ChunkingStrategy): Strategy to chunk content before extraction.
                                              Default: RegexChunking().
+        markdown_generator (MarkdownGenerationStrategy): Strategy for generating markdown.
+                                                         Default: None.
        content_filter (RelevantContentFilter or None): Optional filter to prune irrelevant content.
                                                        Default: None.
+        only_text (bool): If True, attempt to extract text-only content where applicable.
+                          Default: False.
+        css_selector (str or None): CSS selector to extract a specific portion of the page.
+                                    Default: None.
+        excluded_tags (list of str or None): List of HTML tags to exclude from processing.
+                                             Default: None.
+        keep_data_attributes (bool): If True, retain `data-*` attributes while removing unwanted attributes.
+                                     Default: False.
+        remove_forms (bool): If True, remove all `<form>` elements from the HTML.
+                             Default: False.
+        prettiify (bool): If True, apply `fast_format_html` to produce prettified HTML output.
+                          Default: False.
+
+        # Caching Parameters
        cache_mode (CacheMode or None): Defines how caching is handled.
                                        If None, defaults to CacheMode.ENABLED internally.
                                        Default: None.
-        session_id (str or None):   Optional session ID to persist the browser context and the created 
-                                    page instance. If the ID already exists, the crawler does not 
-                                    create a new page and uses the current page to preserve the state;
-                                    if not, it creates a new page and context then stores it in 
-                                    memory with the given session ID.
+        session_id (str or None): Optional session ID to persist the browser context and the created
+                                  page instance. If the ID already exists, the crawler does not
+                                  create a new page and uses the current page to preserve the state.
        bypass_cache (bool): Legacy parameter, if True acts like CacheMode.BYPASS.
                             Default: False.
        disable_cache (bool): Legacy parameter, if True acts like CacheMode.DISABLED.
@@ -206,36 +226,32 @@ class CrawlerRunConfig:
                              Default: False.
        no_cache_write (bool): Legacy parameter, if True acts like CacheMode.READ_ONLY.
                               Default: False.
-        css_selector (str or None): CSS selector to extract a specific portion of the page.
-                                    Default: None.
-        screenshot (bool): Whether to take a screenshot after crawling.
-                           Default: False.
-        pdf (bool): Whether to generate a PDF of the page.
-                    Default: False.
-        verbose (bool): Enable verbose logging.
-                        Default: True.
-        only_text (bool): If True, attempt to extract text-only content where applicable.
-                          Default: False.
-        image_description_min_word_threshold (int): Minimum words for image description extraction.
-                                                    Default: IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD (e.g., 50).
-        prettiify (bool): If True, apply `fast_format_html` to produce prettified HTML output.
-                          Default: False.
-        js_code (str or list of str or None): JavaScript code/snippets to run on the page.
-                                              Default: None.
-        wait_for (str or None): A CSS selector or JS condition to wait for before extracting content.
-                                Default: None.
-        js_only (bool): If True, indicates subsequent calls are JS-driven updates, not full page loads.
-                        Default: False.
+
+        # Page Navigation and Timing Parameters
        wait_until (str): The condition to wait for when navigating, e.g. "domcontentloaded".
                          Default: "domcontentloaded".
        page_timeout (int): Timeout in ms for page operations like navigation.
                            Default: 60000 (60 seconds).
+        wait_for (str or None): A CSS selector or JS condition to wait for before extracting content.
+                                Default: None.
+        wait_for_images (bool): If True, wait for images to load before extracting content.
+                                Default: True.
+        delay_before_return_html (float): Delay in seconds before retrieving final HTML.
+                                          Default: 0.1.
+        mean_delay (float): Mean base delay between requests when calling arun_many.
+                            Default: 0.1.
+        max_range (float): Max random additional delay range for requests in arun_many.
+                           Default: 0.3.
+        semaphore_count (int): Number of concurrent operations allowed.
+                               Default: 5.
+
+        # Page Interaction Parameters
+        js_code (str or list of str or None): JavaScript code/snippets to run on the page.
+                                              Default: None.
+        js_only (bool): If True, indicates subsequent calls are JS-driven updates, not full page loads.
+                        Default: False.
        ignore_body_visibility (bool): If True, ignore whether the body is visible before proceeding.
                                       Default: True.
-        wait_for_images (bool): If True, wait for images to load before extracting content. 
-                                Default: True.
-        adjust_viewport_to_content (bool): If True, adjust viewport according to the page content dimensions.
-                                           Default: False.
        scan_full_page (bool): If True, scroll through the entire page to load all content.
                               Default: False.
        scroll_delay (float): Delay in seconds between scroll steps if scan_full_page is True.
@@ -244,163 +260,322 @@ class CrawlerRunConfig:
                                Default: False.
        remove_overlay_elements (bool): If True, remove overlays/popups before extracting HTML.
                                        Default: False.
-        delay_before_return_html (float): Delay in seconds before retrieving final HTML.
-                                          Default: 0.1.
-        log_console (bool): If True, log console messages from the page.
-                            Default: False.
        simulate_user (bool): If True, simulate user interactions (mouse moves, clicks) for anti-bot measures.
                              Default: False.
        override_navigator (bool): If True, overrides navigator properties for more human-like behavior.
                                   Default: False.
        magic (bool): If True, attempts automatic handling of overlays/popups.
                      Default: False.
+        adjust_viewport_to_content (bool): If True, adjust viewport according to the page content dimensions.
+                                           Default: False.
+
+        # Media Handling Parameters
+        screenshot (bool): Whether to take a screenshot after crawling.
+                           Default: False.
        screenshot_wait_for (float or None): Additional wait time before taking a screenshot.
                                             Default: None.
        screenshot_height_threshold (int): Threshold for page height to decide screenshot strategy.
                                           Default: SCREENSHOT_HEIGHT_TRESHOLD (from config, e.g. 20000).
-        mean_delay (float): Mean base delay between requests when calling arun_many.
-                            Default: 0.1.
-        max_range (float): Max random additional delay range for requests in arun_many.
-                           Default: 0.3.
-        # session_id and semaphore_count might be set at runtime, not needed as defaults here.
+        pdf (bool): Whether to generate a PDF of the page.
+                    Default: False.
+        image_description_min_word_threshold (int): Minimum words for image description extraction.
+                                                    Default: IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD (e.g., 50).
+        image_score_threshold (int): Minimum score threshold for processing an image.
+                                     Default: IMAGE_SCORE_THRESHOLD (e.g., 3).
+        exclude_external_images (bool): If True, exclude all external images from processing.
+                                         Default: False.
+
+        # Link and Domain Handling Parameters
+        exclude_social_media_domains (list of str): List of domains to exclude for social media links.
+                                                    Default: SOCIAL_MEDIA_DOMAINS (from config).
+        exclude_external_links (bool): If True, exclude all external links from the results.
+                                       Default: False.
+        exclude_social_media_links (bool): If True, exclude links pointing to social media domains.
+                                           Default: False.
+        exclude_domains (list of str): List of specific domains to exclude from results.
+                                       Default: [].
+
+        # Debugging and Logging Parameters
+        verbose (bool): Enable verbose logging.
+                        Default: True.
+        log_console (bool): If True, log console messages from the page.
+                            Default: False.
    """

    def __init__(
        self,
-        word_count_threshold: int =  MIN_WORD_THRESHOLD ,
-        extraction_strategy : ExtractionStrategy=None,  # Will default to NoExtractionStrategy if None
-        chunking_strategy : ChunkingStrategy= None,    # Will default to RegexChunking if None
-        markdown_generator : MarkdownGenerationStrategy = None,
+        # Content Processing Parameters
+        word_count_threshold: int = MIN_WORD_THRESHOLD,
+        extraction_strategy: ExtractionStrategy = None,
+        chunking_strategy: ChunkingStrategy = None,
+        markdown_generator: MarkdownGenerationStrategy = None,
        content_filter=None,
+        only_text: bool = False,
+        css_selector: str = None,
+        excluded_tags: list = None,
+        keep_data_attributes: bool = False,
+        remove_forms: bool = False,
+        prettiify: bool = False,
+
+        # Caching Parameters
        cache_mode=None,
        session_id: str = None,
        bypass_cache: bool = False,
        disable_cache: bool = False,
        no_cache_read: bool = False,
        no_cache_write: bool = False,
-        css_selector: str = None,
-        screenshot: bool = False,
-        pdf: bool = False,
-        verbose: bool = True,
-        only_text: bool = False,
-        image_description_min_word_threshold: int = IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
-        prettiify: bool = False,
-        js_code=None,
-        wait_for: str = None,
-        js_only: bool = False,
+
+        # Page Navigation and Timing Parameters
        wait_until: str = "domcontentloaded",
        page_timeout: int = PAGE_TIMEOUT,
-        ignore_body_visibility: bool = True,
+        wait_for: str = None,
        wait_for_images: bool = True,
-        adjust_viewport_to_content: bool = False,
+        delay_before_return_html: float = 0.1,
+        mean_delay: float = 0.1,
+        max_range: float = 0.3,
+        semaphore_count: int = 5,
+
+        # Page Interaction Parameters
+        js_code=None,
+        js_only: bool = False,
+        ignore_body_visibility: bool = True,
        scan_full_page: bool = False,
        scroll_delay: float = 0.2,
        process_iframes: bool = False,
        remove_overlay_elements: bool = False,
-        delay_before_return_html: float = 0.1,
-        log_console: bool = False,
        simulate_user: bool = False,
        override_navigator: bool = False,
        magic: bool = False,
+        adjust_viewport_to_content: bool = False,
+
+        # Media Handling Parameters
+        screenshot: bool = False,
        screenshot_wait_for: float = None,
        screenshot_height_threshold: int = SCREENSHOT_HEIGHT_TRESHOLD,
-        mean_delay: float = 0.1,
-        max_range: float = 0.3,
-        semaphore_count: int = 5,
+        pdf: bool = False,
+        image_description_min_word_threshold: int = IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
+        image_score_threshold: int = IMAGE_SCORE_THRESHOLD,
+        exclude_external_images: bool = False,
+
+        # Link and Domain Handling Parameters
+        exclude_social_media_domains: list = None,
+        exclude_external_links: bool = False,
+        exclude_social_media_links: bool = False,
+        exclude_domains: list = None,
+
+        # Debugging and Logging Parameters
+        verbose: bool = True,
+        log_console: bool = False,
    ):
+        # Content Processing Parameters
        self.word_count_threshold = word_count_threshold
        self.extraction_strategy = extraction_strategy
        self.chunking_strategy = chunking_strategy
        self.markdown_generator = markdown_generator
        self.content_filter = content_filter
+        self.only_text = only_text
+        self.css_selector = css_selector
+        self.excluded_tags = excluded_tags or []
+        self.keep_data_attributes = keep_data_attributes
+        self.remove_forms = remove_forms
+        self.prettiify = prettiify
+
+        # Caching Parameters
        self.cache_mode = cache_mode
        self.session_id = session_id
        self.bypass_cache = bypass_cache
        self.disable_cache = disable_cache
        self.no_cache_read = no_cache_read
        self.no_cache_write = no_cache_write
-        self.css_selector = css_selector
-        self.screenshot = screenshot
-        self.pdf = pdf
-        self.verbose = verbose
-        self.only_text = only_text
-        self.image_description_min_word_threshold = image_description_min_word_threshold
-        self.prettiify = prettiify
-        self.js_code = js_code
-        self.wait_for = wait_for
-        self.js_only = js_only
+
+        # Page Navigation and Timing Parameters
        self.wait_until = wait_until
        self.page_timeout = page_timeout
-        self.ignore_body_visibility = ignore_body_visibility
+        self.wait_for = wait_for
        self.wait_for_images = wait_for_images
-        self.adjust_viewport_to_content = adjust_viewport_to_content
-        self.scan_full_page = scan_full_page
-        self.scroll_delay = scroll_delay
-        self.process_iframes = process_iframes
-        self.remove_overlay_elements = remove_overlay_elements
        self.delay_before_return_html = delay_before_return_html
-        self.log_console = log_console
-        self.simulate_user = simulate_user
-        self.override_navigator = override_navigator
-        self.magic = magic
-        self.screenshot_wait_for = screenshot_wait_for
-        self.screenshot_height_threshold = screenshot_height_threshold
        self.mean_delay = mean_delay
        self.max_range = max_range
        self.semaphore_count = semaphore_count

+        # Page Interaction Parameters
+        self.js_code = js_code
+        self.js_only = js_only
+        self.ignore_body_visibility = ignore_body_visibility
+        self.scan_full_page = scan_full_page
+        self.scroll_delay = scroll_delay
+        self.process_iframes = process_iframes
+        self.remove_overlay_elements = remove_overlay_elements
+        self.simulate_user = simulate_user
+        self.override_navigator = override_navigator
+        self.magic = magic
+        self.adjust_viewport_to_content = adjust_viewport_to_content
+
+        # Media Handling Parameters
+        self.screenshot = screenshot
+        self.screenshot_wait_for = screenshot_wait_for
+        self.screenshot_height_threshold = screenshot_height_threshold
+        self.pdf = pdf
+        self.image_description_min_word_threshold = image_description_min_word_threshold
+        self.image_score_threshold = image_score_threshold
+        self.exclude_external_images = exclude_external_images
+
+        # Link and Domain Handling Parameters
+        self.exclude_social_media_domains = exclude_social_media_domains or SOCIAL_MEDIA_DOMAINS
+        self.exclude_external_links = exclude_external_links
+        self.exclude_social_media_links = exclude_social_media_links
+        self.exclude_domains = exclude_domains or []
+
+        # Debugging and Logging Parameters
+        self.verbose = verbose
+        self.log_console = log_console
+
        # Validate type of extraction strategy and chunking strategy if they are provided
-        if self.extraction_strategy is not None and not isinstance(self.extraction_strategy, ExtractionStrategy):
+        if self.extraction_strategy is not None and not isinstance(
+            self.extraction_strategy, ExtractionStrategy
+        ):
            raise ValueError("extraction_strategy must be an instance of ExtractionStrategy")
-        if self.chunking_strategy is not None and not isinstance(self.chunking_strategy, ChunkingStrategy):
+        if self.chunking_strategy is not None and not isinstance(
+            self.chunking_strategy, ChunkingStrategy
+        ):
            raise ValueError("chunking_strategy must be an instance of ChunkingStrategy")

        # Set default chunking strategy if None
        if self.chunking_strategy is None:
            from .chunking_strategy import RegexChunking
            self.chunking_strategy = RegexChunking()
-        

    @staticmethod
    def from_kwargs(kwargs: dict) -> "CrawlerRunConfig":
        return CrawlerRunConfig(
+            # Content Processing Parameters
            word_count_threshold=kwargs.get("word_count_threshold", 200),
            extraction_strategy=kwargs.get("extraction_strategy"),
            chunking_strategy=kwargs.get("chunking_strategy"),
            markdown_generator=kwargs.get("markdown_generator"),
            content_filter=kwargs.get("content_filter"),
+            only_text=kwargs.get("only_text", False),
+            css_selector=kwargs.get("css_selector"),
+            excluded_tags=kwargs.get("excluded_tags", []),
+            keep_data_attributes=kwargs.get("keep_data_attributes", False),
+            remove_forms=kwargs.get("remove_forms", False),
+            prettiify=kwargs.get("prettiify", False),
+
+            # Caching Parameters
            cache_mode=kwargs.get("cache_mode"),
            session_id=kwargs.get("session_id"),
            bypass_cache=kwargs.get("bypass_cache", False),
            disable_cache=kwargs.get("disable_cache", False),
            no_cache_read=kwargs.get("no_cache_read", False),
            no_cache_write=kwargs.get("no_cache_write", False),
-            css_selector=kwargs.get("css_selector"),
-            screenshot=kwargs.get("screenshot", False),
-            pdf=kwargs.get("pdf", False),
-            verbose=kwargs.get("verbose", True),
-            only_text=kwargs.get("only_text", False),
-            image_description_min_word_threshold=kwargs.get("image_description_min_word_threshold",  IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD),
-            prettiify=kwargs.get("prettiify", False),
-            js_code=kwargs.get("js_code"), # If not provided here, will default inside constructor
-            wait_for=kwargs.get("wait_for"),
-            js_only=kwargs.get("js_only", False),
+
+            # Page Navigation and Timing Parameters
            wait_until=kwargs.get("wait_until", "domcontentloaded"),
            page_timeout=kwargs.get("page_timeout", 60000),
+            wait_for=kwargs.get("wait_for"),
+            wait_for_images=kwargs.get("wait_for_images", True),
+            delay_before_return_html=kwargs.get("delay_before_return_html", 0.1),
+            mean_delay=kwargs.get("mean_delay", 0.1),
+            max_range=kwargs.get("max_range", 0.3),
+            semaphore_count=kwargs.get("semaphore_count", 5),
+
+            # Page Interaction Parameters
+            js_code=kwargs.get("js_code"),
+            js_only=kwargs.get("js_only", False),
            ignore_body_visibility=kwargs.get("ignore_body_visibility", True),
-            adjust_viewport_to_content=kwargs.get("adjust_viewport_to_content", False),
            scan_full_page=kwargs.get("scan_full_page", False),
            scroll_delay=kwargs.get("scroll_delay", 0.2),
            process_iframes=kwargs.get("process_iframes", False),
            remove_overlay_elements=kwargs.get("remove_overlay_elements", False),
-            delay_before_return_html=kwargs.get("delay_before_return_html", 0.1),
-            log_console=kwargs.get("log_console", False),
            simulate_user=kwargs.get("simulate_user", False),
            override_navigator=kwargs.get("override_navigator", False),
            magic=kwargs.get("magic", False),
+            adjust_viewport_to_content=kwargs.get("adjust_viewport_to_content", False),
+
+            # Media Handling Parameters
+            screenshot=kwargs.get("screenshot", False),
            screenshot_wait_for=kwargs.get("screenshot_wait_for"),
-            screenshot_height_threshold=kwargs.get("screenshot_height_threshold", 20000),
-            mean_delay=kwargs.get("mean_delay", 0.1),
-            max_range=kwargs.get("max_range", 0.3),
-            semaphore_count=kwargs.get("semaphore_count", 5)
+            screenshot_height_threshold=kwargs.get("screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD),
+            pdf=kwargs.get("pdf", False),
+            image_description_min_word_threshold=kwargs.get("image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD),
+            image_score_threshold=kwargs.get("image_score_threshold", IMAGE_SCORE_THRESHOLD),
+            exclude_external_images=kwargs.get("exclude_external_images", False),
+
+            # Link and Domain Handling Parameters
+            exclude_social_media_domains=kwargs.get("exclude_social_media_domains", SOCIAL_MEDIA_DOMAINS),
+            exclude_external_links=kwargs.get("exclude_external_links", False),
+            exclude_social_media_links=kwargs.get("exclude_social_media_links", False),
+            exclude_domains=kwargs.get("exclude_domains", []),
+
+            # Debugging and Logging Parameters
+            verbose=kwargs.get("verbose", True),
+            log_console=kwargs.get("log_console", False),
        )
+        
+        
+        
+        
+    # @staticmethod
+    # def from_kwargs(kwargs: dict) -> "CrawlerRunConfig":
+    #     return CrawlerRunConfig(
+    #         word_count_threshold=kwargs.get("word_count_threshold", 200),
+    #         extraction_strategy=kwargs.get("extraction_strategy"),
+    #         chunking_strategy=kwargs.get("chunking_strategy"),
+    #         markdown_generator=kwargs.get("markdown_generator"),
+    #         content_filter=kwargs.get("content_filter"),
+    #         cache_mode=kwargs.get("cache_mode"),
+    #         session_id=kwargs.get("session_id"),
+    #         bypass_cache=kwargs.get("bypass_cache", False),
+    #         disable_cache=kwargs.get("disable_cache", False),
+    #         no_cache_read=kwargs.get("no_cache_read", False),
+    #         no_cache_write=kwargs.get("no_cache_write", False),
+    #         css_selector=kwargs.get("css_selector"),
+    #         screenshot=kwargs.get("screenshot", False),
+    #         pdf=kwargs.get("pdf", False),
+    #         verbose=kwargs.get("verbose", True),
+    #         only_text=kwargs.get("only_text", False),
+    #         image_description_min_word_threshold=kwargs.get(
+    #             "image_description_min_word_threshold",
+    #             IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
+    #         ),
+    #         prettiify=kwargs.get("prettiify", False),
+    #         js_code=kwargs.get(
+    #             "js_code"
+    #         ),  # If not provided here, will default inside constructor
+    #         wait_for=kwargs.get("wait_for"),
+    #         js_only=kwargs.get("js_only", False),
+    #         wait_until=kwargs.get("wait_until", "domcontentloaded"),
+    #         page_timeout=kwargs.get("page_timeout", 60000),
+    #         ignore_body_visibility=kwargs.get("ignore_body_visibility", True),
+    #         adjust_viewport_to_content=kwargs.get("adjust_viewport_to_content", False),
+    #         scan_full_page=kwargs.get("scan_full_page", False),
+    #         scroll_delay=kwargs.get("scroll_delay", 0.2),
+    #         process_iframes=kwargs.get("process_iframes", False),
+    #         remove_overlay_elements=kwargs.get("remove_overlay_elements", False),
+    #         delay_before_return_html=kwargs.get("delay_before_return_html", 0.1),
+    #         log_console=kwargs.get("log_console", False),
+    #         simulate_user=kwargs.get("simulate_user", False),
+    #         override_navigator=kwargs.get("override_navigator", False),
+    #         magic=kwargs.get("magic", False),
+    #         screenshot_wait_for=kwargs.get("screenshot_wait_for"),
+    #         screenshot_height_threshold=kwargs.get(
+    #             "screenshot_height_threshold", 20000
+    #         ),
+    #         mean_delay=kwargs.get("mean_delay", 0.1),
+    #         max_range=kwargs.get("max_range", 0.3),
+    #         semaphore_count=kwargs.get("semaphore_count", 5),
+    #         image_score_threshold=kwargs.get(
+    #             "image_score_threshold", IMAGE_SCORE_THRESHOLD
+    #         ),
+    #         exclude_social_media_domains=kwargs.get(
+    #             "exclude_social_media_domains", SOCIAL_MEDIA_DOMAINS
+    #         ),
+    #         exclude_external_links=kwargs.get("exclude_external_links", False),
+    #         exclude_social_media_links=kwargs.get("exclude_social_media_links", False),
+    #         exclude_domains=kwargs.get("exclude_domains", []),
+    #         exclude_external_images=kwargs.get("exclude_external_images", False),
+    #         remove_forms=kwargs.get("remove_forms", False),
+    #         keep_data_attributes=kwargs.get("keep_data_attributes", False),
+    #         excluded_tags=kwargs.get("excluded_tags", []),
+    #     )
+