diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index aa0b849e..3d866529 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -1,14 +1,18 @@ from .config import ( - MIN_WORD_THRESHOLD, + MIN_WORD_THRESHOLD, IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, SCREENSHOT_HEIGHT_TRESHOLD, - PAGE_TIMEOUT + PAGE_TIMEOUT, + IMAGE_SCORE_THRESHOLD, + SOCIAL_MEDIA_DOMAINS, + ) from .user_agent_generator import UserAgentGenerator from .extraction_strategy import ExtractionStrategy from .chunking_strategy import ChunkingStrategy from .markdown_generation_strategy import MarkdownGenerationStrategy + class BrowserConfig: """ Configuration class for setting up a browser instance and its context in AsyncPlaywrightCrawlerStrategy. @@ -127,7 +131,7 @@ class BrowserConfig: self.extra_args = extra_args if extra_args is not None else [] self.sleep_on_close = sleep_on_close self.verbose = verbose - + user_agenr_generator = UserAgentGenerator() if self.user_agent_mode != "random": self.user_agent = user_agenr_generator.generate( @@ -160,15 +164,16 @@ class BrowserConfig: java_script_enabled=kwargs.get("java_script_enabled", True), cookies=kwargs.get("cookies", []), headers=kwargs.get("headers", {}), - user_agent=kwargs.get("user_agent", + user_agent=kwargs.get( + "user_agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " - "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36" + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36", ), user_agent_mode=kwargs.get("user_agent_mode"), user_agent_generator_config=kwargs.get("user_agent_generator_config"), text_only=kwargs.get("text_only", False), light_mode=kwargs.get("light_mode", False), - extra_args=kwargs.get("extra_args", []) + extra_args=kwargs.get("extra_args", []), ) @@ -182,22 +187,37 @@ class CrawlerRunConfig: By using this class, you have a single place to understand and adjust the crawling options. Attributes: + # Content Processing Parameters word_count_threshold (int): Minimum word count threshold before processing content. Default: MIN_WORD_THRESHOLD (typically 200). extraction_strategy (ExtractionStrategy or None): Strategy to extract structured data from crawled pages. Default: None (NoExtractionStrategy is used if None). chunking_strategy (ChunkingStrategy): Strategy to chunk content before extraction. Default: RegexChunking(). + markdown_generator (MarkdownGenerationStrategy): Strategy for generating markdown. + Default: None. content_filter (RelevantContentFilter or None): Optional filter to prune irrelevant content. Default: None. + only_text (bool): If True, attempt to extract text-only content where applicable. + Default: False. + css_selector (str or None): CSS selector to extract a specific portion of the page. + Default: None. + excluded_tags (list of str or None): List of HTML tags to exclude from processing. + Default: None. + keep_data_attributes (bool): If True, retain `data-*` attributes while removing unwanted attributes. + Default: False. + remove_forms (bool): If True, remove all `