feat: Enhance crawling control and LLM extraction flexibility
- Add before_retrieve_html hook and delay_before_return_html option - Implement flexible page_timeout for smart_wait function - Support extra_args and custom headers in LLM extraction - Allow arbitrary kwargs in AsyncWebCrawler initialization - Improve perform_completion_with_backoff for custom API calls - Update examples with new features and diverse LLM providers
This commit is contained in:
@@ -63,7 +63,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
'on_execution_started': None,
|
||||
'before_goto': None,
|
||||
'after_goto': None,
|
||||
'before_return_html': None
|
||||
'before_return_html': None,
|
||||
'before_retrieve_html': None
|
||||
}
|
||||
|
||||
async def __aenter__(self):
|
||||
@@ -295,7 +296,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
wait_for = kwargs.get("wait_for")
|
||||
if wait_for:
|
||||
try:
|
||||
await self.smart_wait(page, wait_for, timeout=kwargs.get("timeout", 30000))
|
||||
await self.smart_wait(page, wait_for, timeout=kwargs.get("page_timeout", 60000))
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Wait condition failed: {str(e)}")
|
||||
|
||||
@@ -304,8 +305,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
if kwargs.get("screenshot"):
|
||||
screenshot_data = await self.take_screenshot(url)
|
||||
|
||||
await self.execute_hook('before_retrieve_html', page)
|
||||
# Check if delay_before_return_html is set then wait for that time
|
||||
delay_before_return_html = kwargs.get("delay_before_return_html")
|
||||
if delay_before_return_html:
|
||||
await asyncio.sleep(delay_before_return_html)
|
||||
html = await page.content()
|
||||
page = await self.execute_hook('before_return_html', page, html)
|
||||
await self.execute_hook('before_return_html', page, html)
|
||||
|
||||
if self.verbose:
|
||||
print(f"[LOG] ✅ Crawled {url} successfully!")
|
||||
|
||||
Reference in New Issue
Block a user