Enhance AsyncWebCrawler with smart waiting and screenshot capabilities
- Implement smart_wait function in AsyncPlaywrightCrawlerStrategy - Add screenshot support to AsyncCrawlResponse and AsyncWebCrawler - Improve error handling and timeout management in crawling process - Fix typo in CrawlResult model (responser_headers -> response_headers) - Update .gitignore to exclude additional files - Adjust import path in test_basic_crawling.py
This commit is contained in:
@@ -3,7 +3,7 @@
|
||||
from .async_webcrawler import AsyncWebCrawler
|
||||
from .models import CrawlResult
|
||||
|
||||
__version__ = "0.3.4"
|
||||
__version__ = "0.3.5"
|
||||
|
||||
__all__ = [
|
||||
"AsyncWebCrawler",
|
||||
|
||||
@@ -12,10 +12,12 @@ import hashlib
|
||||
from pathlib import Path
|
||||
from playwright.async_api import ProxySettings
|
||||
from pydantic import BaseModel
|
||||
|
||||
class AsyncCrawlResponse(BaseModel):
|
||||
html: str
|
||||
response_headers: Dict[str, str]
|
||||
status_code: int
|
||||
screenshot: Optional[str] = None
|
||||
|
||||
class AsyncCrawlerStrategy(ABC):
|
||||
@abstractmethod
|
||||
@@ -139,6 +141,45 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
asyncio.create_task(self.kill_session(sid))
|
||||
|
||||
|
||||
async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000):
|
||||
wait_for = wait_for.strip()
|
||||
|
||||
if wait_for.startswith('js:'):
|
||||
# Explicitly specified JavaScript
|
||||
js_code = wait_for[3:].strip()
|
||||
return await self.csp_compliant_wait(page, js_code, timeout)
|
||||
elif wait_for.startswith('css:'):
|
||||
# Explicitly specified CSS selector
|
||||
css_selector = wait_for[4:].strip()
|
||||
try:
|
||||
await page.wait_for_selector(css_selector, timeout=timeout)
|
||||
except Error as e:
|
||||
if 'Timeout' in str(e):
|
||||
raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{css_selector}'")
|
||||
else:
|
||||
raise ValueError(f"Invalid CSS selector: '{css_selector}'")
|
||||
else:
|
||||
# Auto-detect based on content
|
||||
if wait_for.startswith('()') or wait_for.startswith('function'):
|
||||
# It's likely a JavaScript function
|
||||
return await self.csp_compliant_wait(page, wait_for, timeout)
|
||||
else:
|
||||
# Assume it's a CSS selector first
|
||||
try:
|
||||
await page.wait_for_selector(wait_for, timeout=timeout)
|
||||
except Error as e:
|
||||
if 'Timeout' in str(e):
|
||||
raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{wait_for}'")
|
||||
else:
|
||||
# If it's not a timeout error, it might be an invalid selector
|
||||
# Let's try to evaluate it as a JavaScript function as a fallback
|
||||
try:
|
||||
return await self.csp_compliant_wait(page, f"() => {{{wait_for}}}", timeout)
|
||||
except Error:
|
||||
raise ValueError(f"Invalid wait_for parameter: '{wait_for}'. "
|
||||
"It should be either a valid CSS selector, a JavaScript function, "
|
||||
"or explicitly prefixed with 'js:' or 'css:'.")
|
||||
|
||||
async def csp_compliant_wait(self, page: Page, user_wait_function: str, timeout: float = 30000):
|
||||
wrapper_js = f"""
|
||||
async () => {{
|
||||
@@ -250,19 +291,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
wait_for = kwargs.get("wait_for")
|
||||
if wait_for:
|
||||
try:
|
||||
await self.csp_compliant_wait(page, wait_for, timeout=kwargs.get("timeout", 30000))
|
||||
await self.smart_wait(page, wait_for, timeout=kwargs.get("timeout", 30000))
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Custom wait condition failed: {str(e)}")
|
||||
# try:
|
||||
# await page.wait_for_function(wait_for)
|
||||
# # if callable(wait_for):
|
||||
# # await page.wait_for_function(wait_for)
|
||||
# # elif isinstance(wait_for, str):
|
||||
# # await page.wait_for_selector(wait_for)
|
||||
# # else:
|
||||
# # raise ValueError("wait_for must be either a callable or a CSS selector string")
|
||||
# except Error as e:
|
||||
# raise Error(f"Custom wait condition failed: {str(e)}")
|
||||
raise RuntimeError(f"Wait condition failed: {str(e)}")
|
||||
|
||||
html = await page.content()
|
||||
page = await self.execute_hook('before_return_html', page, html)
|
||||
|
||||
@@ -80,7 +80,7 @@ class AsyncWebCrawler:
|
||||
|
||||
word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD)
|
||||
|
||||
async_response : AsyncCrawlResponse = None
|
||||
async_response: AsyncCrawlResponse = None
|
||||
cached = None
|
||||
screenshot_data = None
|
||||
extracted_content = None
|
||||
@@ -102,15 +102,14 @@ class AsyncWebCrawler:
|
||||
t1 = time.time()
|
||||
if user_agent:
|
||||
self.crawler_strategy.update_user_agent(user_agent)
|
||||
async_response : AsyncCrawlResponse = await self.crawler_strategy.crawl(url, **kwargs)
|
||||
async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl(url, screenshot=screenshot, **kwargs)
|
||||
html = sanitize_input_encode(async_response.html)
|
||||
screenshot_data = async_response.screenshot
|
||||
t2 = time.time()
|
||||
if verbose:
|
||||
print(
|
||||
f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds"
|
||||
)
|
||||
if screenshot:
|
||||
screenshot_data = await self.crawler_strategy.take_screenshot(url)
|
||||
|
||||
crawl_result = await self.aprocess_html(
|
||||
url,
|
||||
@@ -127,7 +126,7 @@ class AsyncWebCrawler:
|
||||
**kwargs,
|
||||
)
|
||||
crawl_result.status_code = async_response.status_code if async_response else 200
|
||||
crawl_result.responser_headers = async_response.response_headers if async_response else {}
|
||||
crawl_result.response_headers = async_response.response_headers if async_response else {}
|
||||
crawl_result.success = bool(html)
|
||||
crawl_result.session_id = kwargs.get("session_id", None)
|
||||
return crawl_result
|
||||
|
||||
@@ -18,5 +18,5 @@ class CrawlResult(BaseModel):
|
||||
metadata: Optional[dict] = None
|
||||
error_message: Optional[str] = None
|
||||
session_id: Optional[str] = None
|
||||
responser_headers: Optional[dict] = None
|
||||
response_headers: Optional[dict] = None
|
||||
status_code: Optional[int] = None
|
||||
Reference in New Issue
Block a user