chore: Update web_crawler.py to use NoExtractionStrategy as default

This commit is contained in:
unclecode
2024-05-17 16:03:35 +08:00
parent 3593f017d7
commit f52f526002

View File

@@ -43,7 +43,7 @@ class WebCrawler:
         result = self.run(
             url='https://crawl4ai.uccode.io/',
             word_count_threshold=5,
-            extraction_strategy= CosineStrategy(),
+            extraction_strategy= NoExtractionStrategy(),
             bypass_cache=False,
             verbose = False
         )
@@ -66,7 +66,7 @@ class WebCrawler:
         return self.run(
             url_model.url,
             word_count_threshold,
-            extraction_strategy or CosineStrategy(),
+            extraction_strategy or NoExtractionStrategy(),
             chunking_strategy,
             bypass_cache=url_model.forced,
             **kwargs,
@@ -85,7 +85,7 @@ class WebCrawler:
             verbose=True,
             **kwargs,
         ) -> CrawlResult:
-        extraction_strategy = extraction_strategy or CosineStrategy()
+        extraction_strategy = extraction_strategy or NoExtractionStrategy()
         # Check if extraction strategy is an instance of ExtractionStrategy if not raise an error
         if not isinstance(extraction_strategy, ExtractionStrategy):
             raise ValueError("Unsupported extraction strategy")
@@ -185,7 +185,7 @@ class WebCrawler:
             chunking_strategy: ChunkingStrategy = RegexChunking(),
             **kwargs,
         ) -> List[CrawlResult]:
-        extraction_strategy = extraction_strategy or CosineStrategy()
+        extraction_strategy = extraction_strategy or NoExtractionStrategy()
         def fetch_page_wrapper(url_model, *args, **kwargs):
             return self.fetch_page(url_model, *args, **kwargs)