Enhance Markdown generation and external content control

- Integrate customized html2text library for flexible Markdown output
- Add options to exclude external links and images
- Improve content scraping efficiency and error handling
- Update AsyncPlaywrightCrawlerStrategy for faster closing
- Enhance CosineStrategy with generic embedding model loading
This commit is contained in:
UncleCode
2024-10-20 18:56:58 +08:00
parent e7cd8a1c2d
commit 6ec4cb33ca
14 changed files with 1981 additions and 21 deletions

View File

@@ -186,7 +186,8 @@ class AsyncWebCrawler:
try:
t1 = time.time()
scrapping_strategy = WebScrappingStrategy()
-result = await scrapping_strategy.ascrap(
+# result = await scrapping_strategy.ascrap(
+result = await scrapping_strategy.scrap(
url,
html,
word_count_threshold=word_count_threshold,