From f52f5260020a6e461aa800a37a2d11768f34de1d Mon Sep 17 00:00:00 2001 From: unclecode Date: Fri, 17 May 2024 16:03:35 +0800 Subject: [PATCH] chore: Update web_crawler.py to use NoExtractionStrategy as default --- crawl4ai/web_crawler.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py index e8437d17..95e79034 100644 --- a/crawl4ai/web_crawler.py +++ b/crawl4ai/web_crawler.py @@ -43,7 +43,7 @@ class WebCrawler: result = self.run( url='https://crawl4ai.uccode.io/', word_count_threshold=5, - extraction_strategy= CosineStrategy(), + extraction_strategy= NoExtractionStrategy(), bypass_cache=False, verbose = False ) @@ -66,7 +66,7 @@ class WebCrawler: return self.run( url_model.url, word_count_threshold, - extraction_strategy or CosineStrategy(), + extraction_strategy or NoExtractionStrategy(), chunking_strategy, bypass_cache=url_model.forced, **kwargs, @@ -85,7 +85,7 @@ class WebCrawler: verbose=True, **kwargs, ) -> CrawlResult: - extraction_strategy = extraction_strategy or CosineStrategy() + extraction_strategy = extraction_strategy or NoExtractionStrategy() # Check if extraction strategy is an instance of ExtractionStrategy if not raise an error if not isinstance(extraction_strategy, ExtractionStrategy): raise ValueError("Unsupported extraction strategy") @@ -185,7 +185,7 @@ class WebCrawler: chunking_strategy: ChunkingStrategy = RegexChunking(), **kwargs, ) -> List[CrawlResult]: - extraction_strategy = extraction_strategy or CosineStrategy() + extraction_strategy = extraction_strategy or NoExtractionStrategy() def fetch_page_wrapper(url_model, *args, **kwargs): return self.fetch_page(url_model, *args, **kwargs)