Enhance Markdown generation and external content control

- Integrate customized html2text library for flexible Markdown output
- Add options to exclude external links and images
- Improve content scraping efficiency and error handling
- Update AsyncPlaywrightCrawlerStrategy for faster closing
- Enhance CosineStrategy with generic embedding model loading

This commit is contained in:
UncleCode
2024-10-20 18:56:58 +08:00
parent e7cd8a1c2d
commit 6ec4cb33ca
14 changed files with 1981 additions and 21 deletions

View File

@@ -396,6 +396,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
response_headers = {}
await page.wait_for_selector('body')
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
js_code = kwargs.get("js_code", kwargs.get("js", self.js_code))
@@ -477,7 +478,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
});
// Fallback timeout of 5 seconds
setTimeout(() => resolve(), 5000);
// setTimeout(() => resolve(), 5000);
resolve();
});
}
"""