fix(extraction): JsonCss selector and crawler improvements

- Fix JsonCssExtractionStrategy._get_elements to return all matching elements instead of just one
- Add robust error handling to page_need_scroll with default fallback
- Improve JSON extraction strategies documentation
- Refactor content scraping strategy
- Update version to 0.4.247
This commit is contained in:
UncleCode
2025-01-05 19:26:46 +08:00
parent 0857c7b448
commit 72fbdac467
6 changed files with 56 additions and 102 deletions

View File

@@ -2163,7 +2163,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
}
""")
async def page_need_scroll(self, page: Page):
async def page_need_scroll(self, page: Page) -> bool:
"""
Determine whether the page need to scroll
@@ -2171,12 +2171,21 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
page: Playwright page object
Returns:
page should scroll or not
bool: True if page needs scrolling
"""
return await page.evaluate("""
try:
need_scroll = await page.evaluate("""
() => {
const scrollHeight = document.documentElement.scrollHeight;
const viewportHeight = window.innerHeight;
return scrollHeight > viewportHeight;
}
""")
""")
return need_scroll
except Exception as e:
self.logger.warning(
message="Failed to check scroll need: {error}. Defaulting to True for safety.",
tag="SCROLL",
params={"error": str(e)}
)
return True # Default to scrolling if check fails