From a317dc5e1ddaabf927dd8940309abe755659230a Mon Sep 17 00:00:00 2001 From: unclecode Date: Fri, 17 May 2024 15:13:06 +0800 Subject: [PATCH] Load CosineStrategy in the function --- crawl4ai/web_crawler.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py index 56d2465e..0dc6e16c 100644 --- a/crawl4ai/web_crawler.py +++ b/crawl4ai/web_crawler.py @@ -62,14 +62,14 @@ class WebCrawler: extract_blocks_flag: bool = True, word_count_threshold=MIN_WORD_THRESHOLD, use_cached_html: bool = False, - extraction_strategy: ExtractionStrategy = CosineStrategy(), + extraction_strategy: ExtractionStrategy = None, chunking_strategy: ChunkingStrategy = RegexChunking(), **kwargs, ) -> CrawlResult: return self.run( url_model.url, word_count_threshold, - extraction_strategy, + extraction_strategy or CosineStrategy(), chunking_strategy, bypass_cache=url_model.forced, **kwargs, @@ -81,13 +81,14 @@ class WebCrawler: self, url: str, word_count_threshold=MIN_WORD_THRESHOLD, - extraction_strategy: ExtractionStrategy = CosineStrategy(), + extraction_strategy: ExtractionStrategy = None, chunking_strategy: ChunkingStrategy = RegexChunking(), bypass_cache: bool = False, css_selector: str = None, verbose=True, **kwargs, ) -> CrawlResult: + extraction_strategy = extraction_strategy or CosineStrategy() # Check if extraction strategy is an instance of ExtractionStrategy if not raise an error if not isinstance(extraction_strategy, ExtractionStrategy): raise ValueError("Unsupported extraction strategy") @@ -183,11 +184,11 @@ class WebCrawler: extract_blocks_flag: bool = True, word_count_threshold=MIN_WORD_THRESHOLD, use_cached_html: bool = False, - extraction_strategy: ExtractionStrategy = CosineStrategy(), + extraction_strategy: ExtractionStrategy = None, chunking_strategy: ChunkingStrategy = RegexChunking(), **kwargs, ) -> List[CrawlResult]: - + extraction_strategy = extraction_strategy or CosineStrategy() def fetch_page_wrapper(url_model, *args, **kwargs): return self.fetch_page(url_model, *args, **kwargs)